用于将PDF转换为文本的Python模块：哪些是将PDF文件转换为文本的最佳Python模块？
3 回答
有只小跳蛙
TA贡献1824条经验 获得超8个赞
process_pdf
"""Print the text content of a PDF using PDFMiner (Python 2 variant)."""
import sys

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def pdfparser(data):
    """Extract the text of the PDF at path *data* and print it.

    Args:
        data: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the input file even if parsing raises.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after all pages, into its own name so the
        # path argument is never printed by mistake for a page-less PDF.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])
"""Print the text content of a PDF using PDFMiner (Python 3 variant)."""
import sys

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io


def pdfparser(data):
    """Extract the text of the PDF at path *data* and print it.

    Args:
        data: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the input file even if parsing raises.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after all pages, into its own name so the
        # path argument is never printed by mistake for a page-less PDF.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])
添加回答
举报
0/150
提交
取消