Python 读取 PDF 文件的包有 pdfminer、pdfminer3k、pdfplumber 等,其他的包我没有尝试过。

pdfminer

import io
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams


class PDFUtils:
    """Helpers for extracting plain text from PDF files with pdfminer."""

    def pdf2txt(self, path):
        """Return the concatenated text of every page of the PDF at *path*.

        Args:
            path: Filesystem path of the PDF file; it is opened in binary mode.

        Returns:
            A single string made of the text of every layout element that
            exposes ``get_text``, in page order.

        Raises:
            PDFTextExtractionNotAllowed: If the document reports that it is
                not extractable.
            OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        """
        chunks = []
        with open(path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument(parser)

            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed(
                    'Text extraction is not allowed: %r' % (path,)
                )

            resource_manager = PDFResourceManager()
            device = PDFPageAggregator(resource_manager, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(resource_manager, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    # The aggregator holds the layout of the page that was
                    # just processed; keep only elements that carry text.
                    for element in device.get_result():
                        if hasattr(element, "get_text"):
                            chunks.append(element.get_text())
            finally:
                # Release the device even when a page fails to parse.
                device.close()

        return "".join(chunks)


if __name__ == '__main__':
    # Demo: extract the text of the sample PDF and print it.
    print(PDFUtils().pdf2txt('测试.pdf'))

pdfminer3k

from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def read_pdf(pdf):
    """Extract the text of a PDF stream with pdfminer3k.

    Args:
        pdf: An open file object for the PDF; it is passed straight to
            ``process_pdf``.

    Returns:
        Everything the text converter wrote to the in-memory buffer, as one
        string.
    """
    resource_manager = PDFResourceManager()
    with StringIO() as buffer:
        device = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf)
        finally:
            # Close the converter even if processing raises, and always
            # before the buffer is read.
            device.close()
        return buffer.getvalue()


if __name__ == '__main__':
    # Demo: open the sample PDF in binary mode and print its extracted text.
    with open('测试.pdf', "rb") as pdf_stream:
        extracted = read_pdf(pdf_stream)
        print(extracted)

pdfplumber

来源:https://www.jianshu.com/p/c5f474ab5716

import pdfplumber
import pandas as pd

with pdfplumber.open("测试.pdf") as pdf:
    # pdf.pages is indexed from 0, so index 0 is the first page. The earlier
    # index 1 selected the second page and fails on a one-page PDF.
    page = pdf.pages[0]
    text = page.extract_text()
    print(text)
    tables = page.extract_tables()
    for t in tables:
        # Each extracted table is a nested list of rows; the first row is
        # used as the header and the rest as data, so it can be viewed and
        # analysed as a DataFrame.
        df = pd.DataFrame(t[1:], columns=t[0])
        print(df)

 

本文地址:https://blog.csdn.net/q389797999/article/details/110230503