In this tutorial we'll extract the text from a PDF file and print it to the Windows command prompt (CMD). We'll use Python's pdfminer library. So get your keyboard ready, and let's code.
Here's the code:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
# Extract all text from a single PDF with pdfminer and print it to the console.

# Full path of the PDF to convert. A raw string keeps every backslash literal,
# so sequences such as "\U" or "\r" in a Windows path are not read as escapes.
path = r'C:\Users\KENNETHDEANCICERON\Desktop\USB\SpYder ver1.0\rp.pdf'

codec = 'utf-8'   # encoding TextConverter uses when writing into the byte buffer
password = ""     # empty password: the PDF is expected not to need credentials
maxpages = 0      # per pdfminer's get_pages, 0 is treated as "no page limit"
caching = True
pagenos = set()   # per pdfminer's get_pages, an empty set selects every page

# Resource manager: holds resources shared between pages, such as fonts or images.
rsrcmgr = PDFResourceManager()
# Layout-analysis parameters; the library defaults are used.
laparams = LAParams()

# 'rb' opens the PDF read-only in binary mode. Both the file and the in-memory
# buffer are closed automatically when the with-block exits, even on error.
with open(path, 'rb') as fp, BytesIO() as retstr:
    # The converter device writes the extracted text into retstr.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # The interpreter processes page content and feeds it to the device,
        # using the shared resource manager.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Walk the pages of the PDF and process them one at a time.
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        # Read the buffer before it is closed; it holds bytes encoded with
        # `codec`, so decode them back into text for printing.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()

# Parenthesised single-argument form works on both Python 2 and Python 3.
print(text)
#def main()
# convert_pdf_to_txt(path)
#In this video we'll extract the text from a PDF file and print it to the Windows command prompt (CMD). We'll use Python's
#pdfminer library. So get your keyboard ready, code along with me, and cue the intro.
Below is the video tutorial:
Comments