More than 5 years have passed since last update.

PDFMinerでXY座標付きのテキストを得る

Python

Posted at 2018-08-24

initial_listに記述したpdfを読む。initial_listが空の場合は、コマンドライン引数に渡したpdfを読む。
pdfのパスに*が含まれていたらglobとして解釈する
- 例えば.\**\*.pdfなら下位ディレクトリのPDFも検索される
環境
- Windows7
- Python3.6.5(Anaconda)
- pdfminer.six(20170720 conda-forge)

from typing import Union, Tuple, List, Optional, Generator, BinaryIO, Deque, NamedTuple
import collections
import sys
import glob
import io

from pdfminer.pdfdevice import PDFTextDevice
from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager, PDFTextState
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdffont import PDFCIDFont, PDFUnicodeNotDefined


# Re-wrap both standard streams, keeping each stream's current encoding but
# using errors='ignore', so characters that encoding cannot represent are
# dropped instead of raising an encoding error while printing.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding=sys.stdout.encoding, errors='ignore')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding=sys.stderr.encoding, errors='ignore')


class MyPDFText(NamedTuple):
    """One extracted text run together with its page number and position.

    The fields are stored in the order page, x, y, width, height, text.
    """

    page: int
    x: float
    y: float
    width: float
    height: float
    text: str

    def __str__(self) -> str:
        """Return only the text of this run."""
        return self.text

    def tsv(self) -> str:
        """Return all fields, in declaration order, joined by tab characters."""
        columns = map(str, self)
        return '\t'.join(columns)


class MyDevice(PDFTextDevice):
    """Text device that records every rendered string with its coordinates.

    Each ``render_string`` call appends one ``MyPDFText`` to ``self.deque``.
    ``self.pageno`` starts at 0 and is incremented by ``end_page``.
    """

    def __init__(self, rsrcmgr: PDFResourceManager) -> None:
        super().__init__(rsrcmgr)
        self.pageno = 0
        self.deque: Deque[MyPDFText] = collections.deque()

    def render_string(self, textstate: PDFTextState, seq: List[Union[bytes, int, float]]) -> None:
        """Decode one text-showing operand and append it to ``self.deque``.

        Args:
            textstate: Current text state; its font, fontsize, matrix and
                linematrix are read here.
            seq: Operand of the text-showing operator. Besides byte strings it
                may contain numbers (positioning adjustments inside a ``TJ``
                array); those carry no text and are skipped when decoding.

        Any exception raised while rendering or decoding is reported on stderr
        and the record is still appended, with whatever text was recovered
        (possibly the empty string).
        """
        def is_adjustment(obj: object) -> bool:
            # Numeric entries are spacing adjustments, not encoded text.
            return isinstance(obj, (int, float))

        def generate_chars_from(textstate: PDFTextState, seq: List[Union[bytes, int, float]]) -> Generator[str, None, None]:
            font = textstate.font
            for obj in seq:
                if is_adjustment(obj):
                    # Passing a number to font.decode() raises (or, for some
                    # fonts, produces bogus cids), which used to discard the
                    # text of the whole string.
                    continue
                chars = font.decode(obj)
                for cid in chars:
                    try:
                        yield font.to_unichr(cid)
                    except PDFUnicodeNotDefined:
                        print(f'Unicode Not Defined: cid={cid} chars={chars}', file=sys.stderr)
                        print(f'Unicode Not Defined: decode_as_cp932={obj.decode("cp932", errors="replace")}', file=sys.stderr)

        cidtext = ''
        try:
            # Run the base implementation first; the line matrix read further
            # down is presumably updated by it (confirm against pdfminer).
            super().render_string(textstate, seq)
            cidtext = ''.join(generate_chars_from(textstate, seq))
        except Exception as err:
            print(f'Exception: {err}', file=sys.stderr)

        if cidtext == '':
            # When frozen with PyInstaller the cmap files cannot be read, so
            # cidtext is always empty and the cmap repr is '<CMap: None>'
            # where '<CMap: 90ms-RKSJ-H>' / '<CMap: 90msp-RKSJ-H>' is expected.
            # getattr() is used because not every font object is guaranteed to
            # expose ``cmap``; a font without it simply gets no fallback
            # instead of raising AttributeError and aborting the page.
            cmap_repr = str(getattr(textstate.font, 'cmap', '')).upper()
            if 'NONE' in cmap_repr:
                # Try decoding the raw bytes as Shift-JIS (cp932).
                cidtext = ''.join(
                    byte_seq.decode('cp932', errors='replace')
                    for byte_seq in seq
                    if not is_adjustment(byte_seq)
                )

        # Width/height come from the line matrix; a zero component is replaced
        # by the scaled font height. NOTE(review): the exact meaning of these
        # two values is not established by this code.
        font_height = textstate.font.get_height() * textstate.fontsize
        if textstate.linematrix == (0, 0):
            width, height = 0, 0
        else:
            width, height = [(font_height if n == 0.0 else n) for n in textstate.linematrix]

        # Elements 4 and 5 of the text matrix are used as the x / y position.
        mat = textstate.matrix
        l_x, b_y = mat[4], mat[5]
        self.deque.append(MyPDFText(self.pageno, l_x, b_y, width, height, cidtext))

    def render_char(self, matrix: Tuple[Union[int, float], ...], font: PDFCIDFont, fontsize: float, scaling: float, rise: float, cid: int) -> float:
        """Return the advance of one glyph: char width * fontsize * scaling.

        Raises:
            PDFUnicodeNotDefined: propagated from ``font.to_unichr`` when the
                cid has no Unicode mapping (the lookup result itself is not
                used otherwise).
        """
        text: str = font.to_unichr(cid)
        assert isinstance(text, str)
        return font.char_width(cid) * fontsize * scaling

    def begin_page(self, page: PDFPage, ctm: Tuple[int, int, int, int, float, float]) -> None:
        """Do nothing at the start of a page."""
        return

    def end_page(self, page: PDFPage) -> None:
        """Advance the page counter once a page has been finished."""
        self.pageno += 1


def load_pdf(fp: BinaryIO, password: Optional[str] = None) -> PDFDocument:
    """Parse an open PDF stream and return the resulting document.

    Args:
        fp: PDF file object opened in binary mode.
        password: Password for the document. ``None`` (the default) means
            "not supplied": the library's own default password is used.

    Returns:
        The parsed ``PDFDocument``.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    parser = PDFParser(fp)
    if password is None:
        # Do not forward None: that would override PDFDocument's default
        # password argument with a value it may not be able to handle.
        document = PDFDocument(parser)
    else:
        document = PDFDocument(parser, password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed(f'Text extraction is not allowed: {fp!r}')
    return document


def texts_from(pdf_path: str) -> Deque[MyPDFText]:
    """Extract every positioned text run from the PDF at ``pdf_path``.

    Pages are processed in order. A page that raises is reported on stderr
    and skipped (best effort); processing continues with the next page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The deque of ``MyPDFText`` records collected by the device.
    """
    with open(pdf_path, 'rb') as fp:
        doc = load_pdf(fp)
        rsrcmgr = PDFResourceManager()
        device = MyDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_index, page in enumerate(PDFPage.create_pages(doc)):
            # Pin the page number explicitly. The device only advances its
            # counter in end_page(); if process_page() raises before that hook
            # runs, every following page would otherwise be mislabelled.
            device.pageno = page_index
            try:
                interpreter.process_page(page)
            except Exception as err:
                print(f'ファイル処理中にエラーが発生しました：file={pdf_path} page={device.pageno} error={err} sys.exc_info()={sys.exc_info()}', file=sys.stderr)
        return device.deque


def main() -> None:
    """Print the text runs of every requested PDF as tab-separated values.

    The PDF paths come from ``initial_list`` when it is non-empty, otherwise
    from the command-line arguments. A path containing ``*`` is expanded as a
    glob pattern. A header row and the records go to stdout; a per-file
    summary line goes to stderr.
    """
    def iterate(filelist: List[str]) -> Generator[str, None, None]:
        """Yield each path, expanding the ones that contain ``*`` as globs."""
        for filepath in filelist:
            if '*' in filepath:
                # recursive=True is required for '**' to descend into nested
                # directories; without it '**' behaves like a plain '*'.
                yield from glob.glob(filepath, recursive=True)
            else:
                yield filepath

    # Hard-code PDF paths in this list to ignore the command line; while the
    # list is empty the command-line arguments are used instead.
    initial_list = [
        ] or sys.argv[1:]
    print(*MyPDFText._fields, sep='\t')
    for filepath in iterate(initial_list):
        pdftexts = texts_from(filepath)
        count = len(pdftexts)
        print(f'#### {count}件 {filepath}', file=sys.stderr, flush=True)
        for pdftext in pdftexts:
            print(pdftext.tsv())


if __name__ == '__main__':
    main()

メモ

PDFMinerの出力は、定義済みのDeviceオブジェクトを上手く選ぶか、継承して自作することで変更できる。
- width, heightの値の意味は自分でもよく分かっていない
型ヒントは手作業で付けたので不正確かも。
cp932(Shift_JIS)でエンコードされたPDFを扱うことが多いので、そのためのコード(cp932でのデコード処理)を残している。
まとまった量の文書を変換してExcelで散布図(XY)を取るとスクレイピング範囲の目安になる。
Python 3.6のtyping.NamedTuple(3.7以降のdataclass)が便利。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up