laywerrobot/BGH/readpdf.py

35 lines
1.1 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# Extract the text of a PDF into a CSV file using pdfminer3k.
#
# Usage: python readpdf.py <file.pdf>
#
# Every text box / text line found by pdfminer's layout analysis is written
# as one single-field row to ``pdftest.csv`` in the current directory.
import csv
import sys

# Output path used when the script is run from the command line.
OUTPUT_CSV = 'pdftest.csv'


def iter_text_blocks(pdf_path):
    """Yield the text of every text box or text line in the PDF, page by page.

    Args:
        pdf_path: Path of the PDF file to read.

    Yields:
        The string returned by ``get_text()`` for each ``LTTextBox`` or
        ``LTTextLine`` layout object, in the order pdfminer reports them.
    """
    # The pdfminer3k distribution installs its package under the name
    # ``pdfminer`` (a hyphenated module name is not valid Python).  The
    # imports are deferred to this function so that the CSV helper below can
    # be imported and used without pdfminer3k being installed.
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfparser import PDFDocument, PDFParser

    # The context manager closes the input file once the last page has been
    # processed (or when the generator is discarded early).
    with open(pdf_path, 'rb') as fp:
        # pdfminer3k wires parser and document together explicitly.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty string passed as the password argument

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    yield lt_obj.get_text()


def write_rows(texts, csv_path=OUTPUT_CSV):
    """Write each string in *texts* as one single-field row of a CSV file.

    The file is opened exactly once, so all rows are kept (re-opening it in
    ``'w'`` mode per row would truncate it every time).  Each text is wrapped
    in a list because ``csv.writer.writerow`` iterates its argument: a bare
    string would be split into one field per character.

    Args:
        texts: Iterable of strings to write.
        csv_path: Destination file; it is created or overwritten.

    Returns:
        The number of rows written.
    """
    count = 0
    # newline='' is what the csv module requires for files it writes to;
    # an explicit encoding keeps non-ASCII text independent of the locale.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        for text in texts:
            csvwriter.writerow([text])
            count += 1
    return count


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  ``argv[1]`` is the
            path of the PDF to read.

    Returns:
        Process exit status: 0 on success, 2 when the PDF path is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print('usage: readpdf.py <file.pdf>', file=sys.stderr)
        return 2
    write_rows(iter_text_blocks(argv[1]))
    return 0


if __name__ == '__main__':
    sys.exit(main())