"""Extract the text blocks of a PDF and save them as rows of a CSV file.

Usage:
    python SCRIPT INPUT.pdf [OUTPUT.csv]

Every text box or text line found directly in a page layout becomes one
single-field row of the output file, which defaults to ``pdftest.csv``.
"""
import csv
import sys

DEFAULT_OUTPUT = 'pdftest.csv'


def iter_text_blocks(pdf_file):
    """Yield the text of each text box / text line in an open PDF.

    Args:
        pdf_file: Binary file object containing the PDF to read.

    Yields:
        The value of ``get_text()`` for every ``LTTextBox`` or
        ``LTTextLine`` that appears directly in a page layout, page by page.
    """
    # The importable package is ``pdfminer``; "python-pdfminer3k" is a
    # distribution name and, because of the hyphen, can never be used in an
    # import statement.  The imports are local so that the CSV helper and
    # ``main`` stay importable on machines without pdfminer installed.
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfparser import PDFDocument, PDFParser

    parser = PDFParser(pdf_file)
    doc = PDFDocument()
    # Parser and document must each be told about the other before use.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')  # NOTE(review): '' is presumably "no password" - confirm

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                yield lt_obj.get_text()


def write_rows(texts, csv_path=DEFAULT_OUTPUT):
    """Write every string in *texts* as a one-field row of *csv_path*.

    Args:
        texts: Iterable of strings; it is consumed exactly once.
        csv_path: Destination file; created or truncated.

    Returns:
        The number of rows written.
    """
    count = 0
    # The file is opened once for the whole run: reopening it in 'w' mode per
    # row would truncate it each time and keep only the last row.
    # newline='' lets the csv module control line endings; UTF-8 avoids
    # encode errors on text outside the platform's default code page.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(
            csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for text in texts:
            # writerow() expects a sequence of fields; a bare string would be
            # split into one field per character.
            csvwriter.writerow([text])
            count += 1
    return count


def main(argv=None):
    """Run the PDF-to-CSV conversion from command-line style arguments.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name first).
            Defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when the input path is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else 'script'
        print('usage: {} INPUT.pdf [OUTPUT.csv]'.format(prog), file=sys.stderr)
        return 2

    csv_path = args[2] if len(args) > 2 else DEFAULT_OUTPUT
    # The generator is fully consumed inside the ``with`` block, so the PDF
    # stays open for exactly as long as pdfminer needs to read from it.
    with open(args[1], 'rb') as fp:
        write_rows(iter_text_blocks(fp), csv_path)
    return 0


if __name__ == '__main__':
    sys.exit(main())