from typing import Union, Tuple, List, Optional, Generator, BinaryIO, Deque, NamedTuple
import collections
import sys
import glob
import io
from pdfminer.pdfdevice import PDFTextDevice
from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager, PDFTextState
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdffont import PDFCIDFont, PDFUnicodeNotDefined
# Re-wrap both standard streams with errors='ignore' so that characters the
# stream encoding cannot represent are dropped instead of raising
# UnicodeEncodeError while printing extracted text.
for _stream_name in ('stdout', 'stderr'):
    _stream = getattr(sys, _stream_name)
    setattr(
        sys,
        _stream_name,
        io.TextIOWrapper(_stream.buffer, encoding=_stream.encoding, errors='ignore'),
    )
del _stream_name, _stream
# Characters that would break the one-row-per-record / tab-separated layout
# are written as their backslash escapes instead of being emitted raw.
_TSV_ESCAPES = str.maketrans({'\t': '\\t', '\n': '\\n', '\r': '\\r'})


class MyPDFText(NamedTuple):
    """One extracted text run together with its page number and geometry.

    The field order defines the column order of the TSV output: ``_fields``
    serves as the header row and ``tsv()`` produces the data rows.
    """

    page: int      # page number supplied by whoever builds the record
    x: float       # horizontal position of the text run
    y: float       # vertical position of the text run
    width: float   # width reported for the text run
    height: float  # height reported for the text run
    text: str      # decoded text content

    def __str__(self) -> str:
        """Return only the text content."""
        return self.text

    def tsv(self) -> str:
        """Return all fields joined by tabs, as a single TSV row.

        Tab, line-feed and carriage-return characters inside a field are
        replaced by the two-character escapes ``\\t``, ``\\n`` and ``\\r`` so
        that a record always occupies exactly one line with exactly
        ``len(self) - 1`` separators.  Fields without those characters are
        emitted unchanged.
        """
        return '\t'.join(f'{val}'.translate(_TSV_ESCAPES) for val in self)
class MyDevice(PDFTextDevice):
    """Text device that records every rendered string as a ``MyPDFText`` row.

    Each call to ``render_string`` appends one record to ``self.deque``; the
    record carries the current page counter, a position taken from the text
    matrix, a width/height derived from the line matrix, and the decoded text.
    """

    def __init__(self, rsrcmgr: PDFResourceManager) -> None:
        """Create the device with an empty result queue and page counter 0."""
        super().__init__(rsrcmgr)
        self.pageno = 0  # incremented once per page in end_page()
        self.deque: Deque[MyPDFText] = collections.deque()  # records in rendering order

    @staticmethod
    def _text_chunks(seq: List[Union[bytes, int, float]]) -> Generator[bytes, None, None]:
        """Yield only the string entries of *seq*, dropping numeric entries.

        A text-showing array may interleave numbers (position adjustments)
        with the strings; the numbers carry no text and cannot be decoded.
        """
        for obj in seq:
            if not isinstance(obj, (int, float)):
                yield obj

    @classmethod
    def _iter_chars(cls, font: PDFCIDFont, seq: List[Union[bytes, int, float]]) -> Generator[str, None, None]:
        """Yield the Unicode text for every character code found in *seq*.

        Codes the font cannot map to Unicode are reported on stderr and
        skipped rather than aborting the whole string.
        """
        for obj in cls._text_chunks(seq):
            chars = font.decode(obj)
            for cid in chars:
                try:
                    yield font.to_unichr(cid)
                except PDFUnicodeNotDefined:
                    print(f'Unicode Not Defined: cid={cid} chars={chars}', file=sys.stderr)
                    print(f'Unicode Not Defined: decode_as_cp932={obj.decode("cp932", errors="replace")}', file=sys.stderr)

    def render_string(self, textstate: PDFTextState, seq: List[Union[bytes, int, float]]) -> None:
        """Decode one text-showing operation and append it to ``self.deque``.

        Args:
            textstate: current text state; its font, fontsize, matrix and
                linematrix are read here.
            seq: the operand of the text-showing operator: byte strings,
                possibly interleaved with numeric position adjustments.

        Any exception raised while rendering or decoding is reported on
        stderr and leaves the text empty; the record is still appended.
        """
        cidtext = ''
        try:
            # Let the base class process the string first (it calls
            # render_char() for each code).  NOTE(review): this is presumably
            # also what updates textstate.linematrix, which is read below.
            super().render_string(textstate, seq)
            cidtext = ''.join(self._iter_chars(textstate.font, seq))
        except Exception as err:
            print(f'Exception: {err}', file=sys.stderr)
        font = textstate.font
        # When running under pyinstaller the cmap files cannot be read and
        # cidtext is always empty: the font's cmap then shows as
        # '<CMap: None>' instead of e.g. '<CMap: 90ms-RKSJ-H>' or
        # '<CMap: 90msp-RKSJ-H>'.  In that case try a Shift-JIS (cp932) decode.
        # Fonts that have no ``cmap`` attribute at all skip the fallback
        # instead of raising AttributeError.
        if cidtext == '' and hasattr(font, 'cmap') and 'NONE' in str(font.cmap).upper():
            cidtext = ''.join(
                byte_seq.decode('cp932', errors='replace')
                for byte_seq in self._text_chunks(seq)
            )
        height = font.get_height() * textstate.fontsize
        if textstate.linematrix == (0, 0):
            width, height = 0, 0
        else:
            # Use the line-matrix components as (width, height); a zero
            # component falls back to the font height scaled by the font size.
            width, height = [(height if n == 0.0 else n) for n in textstate.linematrix]
        mat = textstate.matrix
        l_x, b_y = mat[4], mat[5]
        self.deque.append(MyPDFText(self.pageno, l_x, b_y, width, height, cidtext))

    def render_char(self, matrix: Tuple[Union[int, float], ...], font: PDFCIDFont, fontsize: float, scaling: float, rise: float, cid: int) -> float:
        """Return the advance of character *cid*: char width * fontsize * scaling.

        Raises:
            PDFUnicodeNotDefined: propagated unchanged when the font cannot
                map *cid* to Unicode (render_string reports it on stderr).
        """
        # The lookup is performed only for its validation effect; an
        # undefined code deliberately propagates to the caller.
        text: str = font.to_unichr(cid)
        assert isinstance(text, str)
        return font.char_width(cid) * fontsize * scaling

    def begin_page(self, page: PDFPage, ctm: Tuple[int, int, int, int, float, float]) -> None:
        """Do nothing at the start of a page."""
        return

    def end_page(self, page: PDFPage) -> None:
        """Advance the page counter once a page has been finished."""
        self.pageno += 1
def load_pdf(fp: BinaryIO, password: Optional[str] = None) -> PDFDocument:
    """Parse *fp* into a ``PDFDocument`` and check that text may be extracted.

    Args:
        fp: binary file object positioned at the start of a PDF.
        password: password to open the document with, or ``None`` to use
            ``PDFDocument``'s own default.

    Returns:
        The parsed document.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    parser = PDFParser(fp)
    # Forward the password only when one was supplied: passing an explicit
    # None would replace the library's default password value.
    # NOTE(review): PDFDocument's default is expected to be an empty password,
    # which is what encrypted-but-openable files need - confirm against the
    # installed pdfminer version.
    if password is None:
        document = PDFDocument(parser)
    else:
        document = PDFDocument(parser, password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed('text extraction is not allowed for this document')
    return document
def texts_from(pdf_path: str) -> Deque[MyPDFText]:
    """Extract every text run of the PDF at *pdf_path*.

    Pages are processed one by one; an exception raised while processing a
    page is reported on stderr and the remaining pages are still processed.
    Errors raised while opening or loading the document are not caught.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The device's queue of ``MyPDFText`` records, in rendering order.
    """
    with open(pdf_path, 'rb') as fp:
        doc = load_pdf(fp)
        rsrcmgr = PDFResourceManager()
        device = MyDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Pages are pulled from the document inside the ``with`` block so the
        # file is still open while each one is processed.
        for pageno, page in enumerate(PDFPage.create_pages(doc)):
            # The device only advances its counter in end_page(); if a page
            # fails before that point the counter would lag behind and every
            # later page would be mislabelled.  Pin it explicitly instead.
            device.pageno = pageno
            try:
                interpreter.process_page(page)
            except Exception as err:
                print(f'ファイル処理中にエラーが発生しました:file={pdf_path} page={device.pageno} error={err} sys.exc_info()={sys.exc_info()}', file=sys.stderr)
    return device.deque
def main() -> None:
    """Dump the text of every PDF named on the command line as TSV on stdout.

    A header row with the ``MyPDFText`` field names is printed first.  For
    each file, a summary line with the number of extracted records goes to
    stderr, followed by one TSV row per record on stdout.  Arguments that
    contain ``*`` are expanded with ``glob``; others are used as given.
    """
    def expand(patterns: List[str]) -> Generator[str, None, None]:
        # Lazily turn each argument into the path(s) it denotes.
        for pattern in patterns:
            if '*' not in pattern:
                yield pattern
                continue
            yield from glob.glob(pattern)

    # Paths may be hard-coded here during development; when the list is left
    # empty the command-line arguments are used.
    hardcoded: List[str] = []
    targets = hardcoded or sys.argv[1:]

    print('\t'.join(MyPDFText._fields))
    for filepath in expand(targets):
        records = texts_from(filepath)
        count = len(records)
        print(f'#### {count}件 {filepath}', file=sys.stderr, flush=True)
        for record in records:
            print(record.tsv())
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()