0
3

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

pdfの表を抜き出すメモ

Last updated at Posted at 2021-07-04

macOS:BigSur
python:3.7.9
camelot-py:0.8.2
tabula-py:1.4.1

変数はぼかす。
camelotで読んだらprintで確認しつつあたりつける必要がありそう。

pip install opencv-python
pip install tabula-py
pip install camelot-py
あとghostscriptをインストールする(32bit版か64bit版かは、使っているPythonに合わせる)。

get_table_from_pdf.py
import glob
import pandas as pd
import pickle
import tabula
import camelot
import numpy as np
import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import matplotlib as mpl
# Use a Japanese-capable font family for plot text. The name must match an
# installed font exactly; an unknown name makes matplotlib fall back to its
# default font.
mpl.rcParams['font.family'] = 'Hiragino Maru Gothic Pro'
import matplotlib.pyplot as plt
%matplotlib inline

# Glob pattern for the PDFs to process. `path_monn` must already be defined
# (it is not set in the visible part of this script).
dir_monn = path_monn+"*.pdf"
# Paths of the matching PDFs, consumed by the code below as `files`.
# Sorted so the files are always processed in the same order.
files = sorted(glob.glob(dir_monn))

# Remove the commas from an amount string and return it as an int;
# amounts without commas are converted to int as they are.
def conv_camma_to_int(str_num_with_camma):
    """Convert an amount string such as "1,234" to an int.

    Args:
        str_num_with_camma: Amount text, with or without comma separators.

    Returns:
        The amount as an int.

    Raises:
        ValueError: If the text is not a valid integer once commas are removed.
    """
    has_separator = "," in str_num_with_camma
    text = str_num_with_camma.replace(",", "") if has_separator else str_num_with_camma
    return int(text)
def minute_to_hour(minute_time):
    """Convert a duration in minutes to hours using true division by 60."""
    hours = minute_time / 60
    return hours

# For checking the contents read from a file? (the original note is itself unsure)
# Reads the first path in `files` with camelot and prints every extracted table.
# `files` and `pass_pdf` must already be defined when this runs.
table_pdf_test = camelot.read_pdf(files[0], split_text=True, password=pass_pdf)
for t in table_pdf_test:
    df = t.df
    # Drop the row labelled 0 in place before printing.
    # NOTE(review): presumably `t.df` is the table's stored DataFrame, so the
    # row stays removed for the later reads of `table_pdf_test` — confirm.
    df.drop(0, inplace=True)
    print(df)

# For checking the extracted values? (the original note is itself unsure)
# Prints single cells of the tables read above to locate the values of interest.
print(table_pdf_test[2].df.iloc[0,2])
# Show a multi-line cell on one line by turning its newlines into tabs.
print(table_pdf_test[2].df.iloc[1,1].replace("\n","\t"))
wkwk_info = table_pdf_test[0].df.iloc[1,1]

# When the cell holds exactly three newline-separated entries, print the third.
if "\n" in wkwk_info and len(wkwk_info.split("\n"))==3:
    print(wkwk_info.split("\n")[2])

# Processing: parse each matching PDF and accumulate the extracted figures.
# Variable names are deliberately obfuscated by the author.
# Requires `files`, `pass_pdf` and `conv_camma_to_int` to be defined already.


def _report_unexpected_cell(pdf_path, cell_text):
    """Print the diagnostic block used when a cell has an unexpected layout."""
    print("ERROR")
    print("FILE_YEAR:", pdf_path.split("/")[-1].replace(".pdf",""))
    print(cell_text)
    print()


# Per-file record keyed by the integer parsed from the file name.
dic_all = {}
# Parallel lists: index i of each list belongs to the same processed file.
list_all_hanawarikirin = []
list_all_got_allgot = []
list_all_byebye_allgot = []
list_all_sasieikigotgaku = []
list_all_nohuhandtouch = []
list_year_month = []
# other
list_all_m_nohu_time = []
for f in files:
    # Only paths containing "monn" are processed.
    if "monn" not in f:
        continue
    table_pdf_test = camelot.read_pdf(f, split_text=True, password=pass_pdf)

    # Third extracted table, with the row labelled 0 dropped in place.
    tdf = table_pdf_test[2].df
    tdf.drop(0, inplace=True)

    # Cell [1,1] holds one amount, or two or three newline-separated amounts.
    got_detailed = tdf.iloc[1,1]
    if "\n" in got_detailed:
        list_got_detailed = got_detailed.split("\n")
        num_got_detailed = len(list_got_detailed)
        if num_got_detailed == 2:
            base_monn = conv_camma_to_int(list_got_detailed[0].strip())
            nohu_handtouch = 0
            tsukin_handtouch = conv_camma_to_int(list_got_detailed[1].strip())
        elif num_got_detailed == 3:
            base_monn = conv_camma_to_int(list_got_detailed[0].strip())
            nohu_handtouch = conv_camma_to_int(list_got_detailed[1].strip())
            tsukin_handtouch = conv_camma_to_int(list_got_detailed[2].strip())
        else:
            # Any other line count would leave base_monn / nohu_handtouch
            # holding the previous file's values (or undefined on the first
            # file), so report the cell and skip this file.
            _report_unexpected_cell(f, got_detailed)
            continue
    else:
        base_monn = conv_camma_to_int(got_detailed.strip())
        nohu_handtouch = 0
        tsukin_handtouch = 0

    # Cell [2,1] holds one amount, or exactly three newline-separated amounts.
    got_allgotA = tdf.iloc[2,1].strip()
    if "\n" in got_allgotA:
        list_got_allgot = got_allgotA.split("\n")
        if len(list_got_allgot) == 3:
            got_allgot = conv_camma_to_int(list_got_allgot[0].strip())
            omgsrerikin = conv_camma_to_int(list_got_allgot[1].strip())
            tanotagotgaku = conv_camma_to_int(list_got_allgot[2].strip())
        else:
            # got_allgot is recorded below; skip the file instead of storing a
            # value carried over from an earlier file.
            _report_unexpected_cell(f, got_allgotA)
            continue
    else:
        got_allgot = conv_camma_to_int(got_allgotA)

    # Cell [1,3] is expected to hold nine newline-separated amounts.
    byebye_detailed = tdf.iloc[1,3]
    list_byebye_detailed = byebye_detailed.split("\n")
    num_byebye_detailed = len(list_byebye_detailed)
    if num_byebye_detailed == 9:
        sndomg = conv_camma_to_int(list_byebye_detailed[0].strip())
        juminfomgm = conv_camma_to_int(list_byebye_detailed[1].strip())
        tomhuhotomjim = conv_camma_to_int(list_byebye_detailed[2].strip())
        huuseiyrinhotomjim = conv_camma_to_int(list_byebye_detailed[3].strip())
        hureuhotomjim = conv_camma_to_int(list_byebye_detailed[4].strip())
        tangaihotomjim = conv_camma_to_int(list_byebye_detailed[5].strip())
        srekudouei = 0
        kreusaiatukaidaikin = conv_camma_to_int(list_byebye_detailed[6].strip())
        krkumiaiei = conv_camma_to_int(list_byebye_detailed[7].strip())
        zatubyebye = conv_camma_to_int(list_byebye_detailed[8].strip())
    else:
        # None of the per-item amounts above are recorded below, so the file
        # is still processed; the mismatch is only reported.
        _report_unexpected_cell(f, byebye_detailed)
    byebye_allgot = conv_camma_to_int(tdf.iloc[2,3].strip())
    sasieikigotgaku = conv_camma_to_int(tdf.iloc[4,3].strip())

    # First extracted table, with the row labelled 0 dropped in place.
    wkwk_df = table_pdf_test[0].df
    wkwk_df.drop(0, inplace=True)
    # Cell [1,1] is expected to hold four newline-separated entries.
    wkwk_joho = wkwk_df.iloc[1,1]
    list_wkwk_joho = wkwk_joho.split("\n")
    num_wkwk_joho = len(list_wkwk_joho)
    if num_wkwk_joho == 4:
        got_nissu = float(list_wkwk_joho[0].strip())
        yryu_nissu = float(list_wkwk_joho[1].strip())
        krzikan = list_wkwk_joho[2].strip()
        nohu_time = list_wkwk_joho[3].strip()
    else:
        # nohu_time is parsed below; skip the file instead of reusing the
        # value from an earlier file.
        _report_unexpected_cell(f, wkwk_joho)
        continue

    # nohu_time is parsed as "<hours>h<minutes>m" and converted to minutes.
    z_hour = int(nohu_time.split("h")[0])
    z_minute = int(nohu_time.split("h")[1].replace("m",""))
    td = datetime.timedelta(hours=z_hour, minutes=z_minute, seconds=0)
    m_nohu_time = int(td.total_seconds())/60

    # Add to the lists.
    # The key is the integer after the last "-" in the path, minus ".pdf".
    year_month = int(f.split("-")[-1].replace(".pdf",""))
    list_year_month.append(year_month)
    list_all_hanawarikirin.append(base_monn)
    list_all_got_allgot.append(got_allgot)
    list_all_byebye_allgot.append(byebye_allgot)
    list_all_sasieikigotgaku.append(sasieikigotgaku)
    list_all_nohuhandtouch.append(nohu_handtouch)
    list_all_m_nohu_time.append(m_nohu_time)
    dic_all[year_month] = [
        base_monn, got_allgot, byebye_allgot,
        sasieikigotgaku, nohu_handtouch, m_nohu_time,
    ]

0
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
3

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?