macOS:BigSur
python:3.7.9
camelot-py:0.8.2
tabula-py:1.4.1
変数はぼかす。
camelotで読んだらprintで確認しつつあたりつける必要がありそう。
pip install opencv-python
pip install tabula-py
pip install camelot-py
あとGhostscriptをインストールする(macOSなら `brew install ghostscript`。Windowsの場合は32bit版か64bit版かをPythonのビット数に合わせて選ぶ)。
get_table_from_pdf.py
import glob
import pandas as pd
import pickle
import tabula
import camelot
import numpy as np
import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import matplotlib as mpl
# Use a Japanese-capable font family for matplotlib text.
# NOTE(review): the previous value 'eiragino Maru Goteic Pro' does not match
# any standard macOS font family; 'Hiragino Maru Gothic Pro' appears to be the
# intended name -- confirm against the fonts installed on the target machine.
mpl.rcParams['font.family'] = 'Hiragino Maru Gothic Pro'
import matplotlib.pyplot as plt
%matplotlib inline
# Glob pattern matching every PDF directly under path_monn.
# path_monn is not defined in this file's visible code; it must be set
# (with a trailing path separator) before this line runs.
dir_monn = path_monn+"*.pdf"
# The code below iterates over `files`, which was never assigned anywhere in
# the script. Build it from the pattern; sorted() makes the processing order
# (and therefore the order of the result lists) deterministic.
files = sorted(glob.glob(dir_monn))
def conv_camma_to_int(str_num_with_camma):
    """Convert an amount string such as ``"1,234"`` to an ``int``.

    Comma separators are removed before conversion; a string without
    commas is converted unchanged.

    Args:
        str_num_with_camma: Text of an integer amount, with or without
            comma separators.

    Returns:
        The amount as an ``int``.

    Raises:
        ValueError: If the text left after removing commas is not a valid
            integer literal (for example an empty string).
    """
    # str.replace is a no-op when no comma is present, so a separate
    # "has comma" branch is unnecessary.
    return int(str_num_with_camma.replace(",", ""))
def minute_to_hour(minute_time):
    """Return ``minute_time`` divided by 60 (minutes expressed as hours).

    Args:
        minute_time: A numeric value; per the function name, a duration in
            minutes.

    Returns:
        ``minute_time / 60`` (true division, so ints yield a float).
    """
    return minute_time / 60
# Exploratory check (1): read the first PDF and print every table camelot
# extracted from it, to see where the values of interest live.
# `files` and `pass_pdf` must already be defined; files[0] raises IndexError
# when no PDF matched.
table_pdf_test = camelot.read_pdf(files[0], split_text=True, password=pass_pdf)
for t in table_pdf_test:
    df = t.df
    # Remove the row labelled 0, in place.
    # NOTE(review): the positional lookups below presumably rely on this drop
    # persisting on the table's DataFrame (i.e. t.df returning the same
    # object on each access) -- confirm against camelot's Table API.
    df.drop(0, inplace=True)
    print(df)

# Exploratory check (2): print individual cells to confirm their positions.
print(table_pdf_test[2].df.iloc[0,2])
# Multi-line cells are newline-separated; show them on one line with tabs.
print(table_pdf_test[2].df.iloc[1,1].replace("\n","\t"))
wkwk_info = table_pdf_test[0].df.iloc[1,1]
# NOTE(review): this check expects 3 lines in the cell, while the main
# processing loop expects 4 lines in the same cell -- confirm which is right.
if "\n" in wkwk_info and len(wkwk_info.split("\n"))==3:
    print(wkwk_info.split("\n")[2])
# Main processing: extract the figures from every matching PDF and collect
# them per file. Variable names are intentionally obfuscated by the author,
# so the comments below describe only what the code does with each cell.


def _report_unexpected_cell(pdf_path, cell_text):
    """Print a diagnostic for a cell whose line layout is not recognised."""
    print("ERROR")
    print("FILE_YEAR:", pdf_path.split("/")[-1].replace(".pdf",""))
    print(cell_text)
    print()


# year_month -> [base_monn, got_allgot, byebye_allgot,
#                sasieikigotgaku, nohu_handtouch, m_nohu_time]
dic_all = {}
# Parallel lists: index i of every list belongs to list_year_month[i].
list_all_hanawarikirin = []
list_all_got_allgot = []
list_all_byebye_allgot = []
list_all_sasieikigotgaku = []
list_all_nohuhandtouch = []
list_year_month = []
# other
list_all_m_nohu_time = []

for f in files:
    # Only files whose path contains "monn" are processed.
    if "monn" not in f:
        continue

    table_pdf_test = camelot.read_pdf(f, split_text=True, password=pass_pdf)

    # --- third table: amounts ------------------------------------------
    tdf = table_pdf_test[2].df
    # Remove the row labelled 0 so the positional indices below line up.
    tdf.drop(0, inplace=True)

    # Cell (1, 1): one, two or three newline-separated amounts.
    got_detailed = tdf.iloc[1,1]
    if "\n" in got_detailed:
        list_got_detailed = got_detailed.split("\n")
        num_got_detailed = len(list_got_detailed)
        if num_got_detailed == 2:
            # Two lines: the middle amount is absent and counted as 0.
            base_monn = conv_camma_to_int(list_got_detailed[0].strip())
            nohu_handtouch = 0
            tsukin_handtouch = conv_camma_to_int(list_got_detailed[1].strip())
        elif num_got_detailed == 3:
            base_monn = conv_camma_to_int(list_got_detailed[0].strip())
            nohu_handtouch = conv_camma_to_int(list_got_detailed[1].strip())
            tsukin_handtouch = conv_camma_to_int(list_got_detailed[2].strip())
        else:
            # Previously unhandled: the values of the previous file would
            # have been reused silently. Report and skip this file instead.
            _report_unexpected_cell(f, got_detailed)
            continue
    else:
        base_monn = conv_camma_to_int(got_detailed.strip())
        nohu_handtouch = 0
        tsukin_handtouch = 0

    # Cell (2, 1): either a single amount or exactly three amounts.
    got_allgotA = tdf.iloc[2,1].strip()
    if "\n" in got_allgotA:
        list_got_allgot = got_allgotA.split("\n")
        if len(list_got_allgot) == 3:
            got_allgot = conv_camma_to_int(list_got_allgot[0].strip())
            omgsrerikin = conv_camma_to_int(list_got_allgot[1].strip())
            tanotagotgaku = conv_camma_to_int(list_got_allgot[2].strip())
        else:
            # Skip the file so stale values are never appended.
            _report_unexpected_cell(f, got_allgotA)
            continue
    else:
        got_allgot = conv_camma_to_int(got_allgotA)

    # Cell (1, 3): exactly nine newline-separated amounts are expected.
    byebye_detailed = tdf.iloc[1,3]
    list_byebye_detailed = byebye_detailed.split("\n")
    num_byebye_detailed = len(list_byebye_detailed)
    if num_byebye_detailed == 9:
        sndomg = conv_camma_to_int(list_byebye_detailed[0].strip())
        juminfomgm = conv_camma_to_int(list_byebye_detailed[1].strip())
        tomhuhotomjim = conv_camma_to_int(list_byebye_detailed[2].strip())
        huuseiyrinhotomjim = conv_camma_to_int(list_byebye_detailed[3].strip())
        hureuhotomjim = conv_camma_to_int(list_byebye_detailed[4].strip())
        tangaihotomjim = conv_camma_to_int(list_byebye_detailed[5].strip())
        # Not read from the PDF; fixed at 0.
        srekudouei = 0
        kreusaiatukaidaikin = conv_camma_to_int(list_byebye_detailed[6].strip())
        krkumiaiei = conv_camma_to_int(list_byebye_detailed[7].strip())
        zatubyebye = conv_camma_to_int(list_byebye_detailed[8].strip())
    else:
        # Skip the file so stale values are never appended.
        _report_unexpected_cell(f, byebye_detailed)
        continue

    byebye_allgot = conv_camma_to_int(tdf.iloc[2,3].strip())
    sasieikigotgaku = conv_camma_to_int(tdf.iloc[4,3].strip())

    # --- first table: day counts and times -----------------------------
    wkwk_df = table_pdf_test[0].df
    wkwk_df.drop(0, inplace=True)
    # Cell (1, 1): exactly four newline-separated values are expected.
    wkwk_joho = wkwk_df.iloc[1,1]
    list_wkwk_joho = wkwk_joho.split("\n")
    num_wkwk_joho = len(list_wkwk_joho)
    if num_wkwk_joho != 4:
        # Previously m_nohu_time would have been left over from the prior
        # file (or undefined on the first one). Report and skip instead.
        _report_unexpected_cell(f, wkwk_joho)
        continue
    got_nissu = float(list_wkwk_joho[0].strip())
    yryu_nissu = float(list_wkwk_joho[1].strip())
    krzikan = list_wkwk_joho[2].strip()
    nohu_time = list_wkwk_joho[3].strip()
    # The fourth value is parsed as "<hours>h<minutes>m".
    z_hour = int(nohu_time.split("h")[0])
    z_minute = int(nohu_time.split("h")[1].replace("m",""))
    td = datetime.timedelta(hours=z_hour, minutes=z_minute, seconds=0)
    # Total duration in minutes (float).
    m_nohu_time = int(td.total_seconds())/60

    # --- store the results ----------------------------------------------
    # The key is the text after the last "-" in the path, minus ".pdf",
    # converted to int.
    year_month = int(f.split("-")[-1].replace(".pdf",""))
    list_year_month.append(year_month)
    list_all_hanawarikirin.append(base_monn)
    list_all_got_allgot.append(got_allgot)
    list_all_byebye_allgot.append(byebye_allgot)
    list_all_sasieikigotgaku.append(sasieikigotgaku)
    list_all_nohuhandtouch.append(nohu_handtouch)
    list_all_m_nohu_time.append(m_nohu_time)
    dic_all[year_month] = [base_monn, got_allgot, byebye_allgot,
                           sasieikigotgaku, nohu_handtouch, m_nohu_time]