プログラム
import json
import pathlib
import re
import pandas as pd
import pdfplumber
import requests
def get_pdf_file(url, dir=".", timeout=30):
    """Download *url* and save the response body under *dir*.

    The file name is the final component of *url* as split by
    ``pathlib.PurePath``. The destination directory is created (including
    parents) when it does not exist yet, and an existing file of the same
    name is overwritten.

    Args:
        url: Address of the file to download.
        dir: Destination directory. (The name shadows the ``dir`` builtin but
            is kept so existing keyword callers keep working.)
        timeout: Value forwarded to ``requests.get(timeout=...)``, in seconds.

    Returns:
        ``pathlib.Path`` pointing at the saved file.

    Raises:
        requests.HTTPError: The server answered with an error status
            (raised by ``raise_for_status``).
    """
    # Without an explicit timeout requests waits indefinitely on a server
    # that accepts the connection but never answers.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_bytes(r.content)
    return p
# Settings handed to ``page.extract_tables`` in ``get_dict_from_pdf``:
# both the vertical and the horizontal axis use the "lines_strict" strategy.
table_settings = dict(
    vertical_strategy="lines_strict",
    horizontal_strategy="lines_strict",
)
def _to_int(text):
    """Parse an integer from *text*, ignoring "," thousands separators."""
    return int(text.replace(",", ""))


def _to_int_dict(values, index):
    """Return ``{label: int(value)}`` for the paired *index* / *values* cells.

    "," thousands separators are stripped before the integer conversion.
    """
    return (
        pd.Series(values, index=index)
        .str.replace(",", "", regex=False)
        .astype(int)
        .to_dict()
    )


def _parse_municipalities(table):
    """Turn the municipality table into two ``{name: count}`` mappings.

    Even rows of *table* are read as names and odd rows as cells of the form
    ``"<patients>(<second figure>)"``; missing cells are dropped. The result
    has the keys "患者数" and "都内発生分", each mapping names to integers.
    """
    frame = pd.DataFrame(table)
    names = frame.iloc[0::2].stack().dropna().values
    cells = frame.iloc[1::2].stack().dropna().values

    parsed = (
        pd.Series(cells, index=names)
        .str.extract(r"(.+)\((.+)\)")
        .rename(columns={0: "患者数", 1: "都内発生分"})
    )
    for column in ("患者数", "都内発生分"):
        parsed[column] = (
            parsed[column].str.replace(",", "", regex=False).astype(int)
        )
    return parsed.to_dict(orient="dict")


def get_dict_from_pdf(fpath):
    """Extract the tables on the first page of a PDF into a nested dict.

    Only ``pdf.pages[0]`` is read. Each table found with ``table_settings``
    is recognised by its header row (usually its first cell) and stored
    under a fixed key; tables that match none of the known headers are
    ignored. Numeric cells are converted to ``int`` after removing ","
    separators.

    Args:
        fpath: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A dict that may contain the keys "患者の発生状況", "年代", "性別",
        "都内発生数", "区市町村別患者数", "重症者の年代", "重症者の性別" and
        "治療中", depending on which tables were found. The single-value
        tables ("65歳以上高齢者", "重症者", "検査実施件数") are merged into
        "患者の発生状況".

    Raises:
        ValueError: A matched cell does not hold an integer.
    """
    result = {}
    patients = {}

    with pdfplumber.open(fpath) as pdf:
        page = pdf.pages[0]

        for table in page.extract_tables(table_settings):
            # Skip degenerate tables instead of failing on the header lookup.
            if not table or not table[0]:
                continue

            header = table[0]
            # An empty first cell may come back as None; treat it as "" so
            # the string tests below cannot raise AttributeError.
            head = header[0] or ""

            # The order of the tests matters: exact matches come before the
            # looser startswith/endswith checks.
            if head == "総数":
                result["患者の発生状況"] = _to_int_dict(
                    table[2],
                    ["総数", "濃厚接触者", "海外渡航歴", "調査中"],
                )
            elif head.startswith("65歳以上"):
                patients["65歳以上高齢者"] = _to_int(table[1][0])
            elif head == "重症者":
                patients["重症者"] = _to_int(table[1][0])
            elif head.startswith("検査実施件数"):
                patients["検査実施件数"] = _to_int(table[1][0])
            elif head == "10歳未満":
                result["年代"] = _to_int_dict(table[1], header)
            elif header == ["男性", "女性", "不明"]:
                result["性別"] = _to_int_dict(table[1], header)
            elif head == "総数(累計)":
                result["都内発生数"] = _to_int_dict(
                    table[2],
                    [
                        "総数(累計)",
                        "入院中",
                        "重症者",
                        "宿泊療養",
                        "自宅療養",
                        "入院・療養等調整中",
                        "死亡(累計)",
                        "退院(累計)",
                    ],
                )
            elif head == "千代田":
                result["区市町村別患者数"] = _parse_municipalities(table)
            elif head.endswith("0代"):
                result["重症者の年代"] = _to_int_dict(table[1], header)
            elif header == ["男", "女", "確認中"]:
                result["重症者の性別"] = _to_int_dict(table[1], header)
            elif head == "入院":
                # Header cells can contain whitespace (e.g. line breaks);
                # remove it so the keys are clean labels.
                result["治療中"] = _to_int_dict(
                    table[1],
                    [re.sub(r"\s", "", label) for label in header],
                )

    # Merge the single-value tables into the outbreak summary. setdefault
    # keeps this from raising KeyError when the "総数" table was not found.
    if patients:
        result.setdefault("患者の発生状況", {}).update(patients)

    return result
# Download the press-release PDF into the current directory.
p_pdf = get_pdf_file(
    "https://www.metro.tokyo.lg.jp/tosei/hodohappyo/press/2020/11/30/documents/20201129.pdf"
)

# Extract the tables from the PDF as a nested dict.
data = get_dict_from_pdf(p_pdf)

# Same location and stem as the PDF, with the suffix switched to .json.
p_json = p_pdf.with_suffix(".json")

# Save as UTF-8 JSON, keeping the Japanese keys unescaped.
p_json.write_text(
    json.dumps(data, ensure_ascii=False, indent=4),
    encoding="utf-8",
)
結果
{
"患者の発生状況": {
"総数": 449,
"濃厚接触者": 228,
"海外渡航歴": 0,
"調査中": 221,
"65歳以上高齢者": 63,
"重症者": 0,
"検査実施件数": 7612
},
"年代": {
"10歳未満": 16,
"10代": 20,
"20代": 107,
"30代": 91,
"40代": 80,
"50代": 50,
"60代": 37,
"70代": 22,
"80代": 16,
"90代": 10,
"100歳以上": 0,
"不明": 0
},
"性別": {
"男性": 241,
"女性": 208,
"不明": 0
},
"都内発生数": {
"総数(累計)": 42793,
"入院中": 1721,
"重症者": 53,
"宿泊療養": 727,
"自宅療養": 1159,
"入院・療養等調整中": 633,
"死亡(累計)": 509,
"退院(累計)": 38044
},
"区市町村別患者数": {
"患者数": {
"千代田": 232,
"中央": 880,
"港": 1882,
"新宿": 3597,
"文京": 720,
"台東": 889,
"墨田": 880,
"江東": 1429,
"品川": 1463,
"目黒": 1261,
"大田": 2119,
"世田谷": 3347,
"渋谷": 1430,
"中野": 1639,
"杉並": 1704,
"豊島": 1172,
"北": 852,
"荒川": 664,
"板橋": 1478,
"練馬": 1755,
"足立": 1951,
"葛飾": 1176,
"江戸川": 1630,
"八王子": 734,
"立川": 209,
"武蔵野": 309,
"三鷹": 351,
"青梅": 154,
"府中": 388,
"昭島": 114,
"調布": 416,
"町田": 482,
"小金井": 177,
"小平": 247,
"日野": 241,
"東村山": 141,
"国分寺": 148,
"国立": 71,
"福生": 77,
"狛江": 137,
"東大和": 58,
"清瀬": 95,
"東久留米": 114,
"武蔵村山": 48,
"多摩": 181,
"稲城": 118,
"羽村": 44,
"あきる野": 68,
"西東京": 392,
"瑞穂": 23,
"日の出": 30,
"檜原": 2,
"奥多摩": 3,
"大島": 6,
"利島": 0,
"新島": 0,
"神津島": 0,
"三宅": 3,
"御蔵島": 1,
"八丈": 6,
"青ヶ島": 0,
"小笠原": 2,
"都外": 2570,
"調査中": 34
},
"都内発生分": {
"千代田": 214,
"中央": 757,
"港": 1751,
"新宿": 3411,
"文京": 618,
"台東": 820,
"墨田": 765,
"江東": 1222,
"品川": 1332,
"目黒": 1163,
"大田": 1899,
"世田谷": 3116,
"渋谷": 1323,
"中野": 1520,
"杉並": 1521,
"豊島": 1071,
"北": 757,
"荒川": 599,
"板橋": 1286,
"練馬": 1542,
"足立": 1704,
"葛飾": 988,
"江戸川": 1478,
"八王子": 620,
"立川": 184,
"武蔵野": 271,
"三鷹": 297,
"青梅": 130,
"府中": 363,
"昭島": 89,
"調布": 369,
"町田": 415,
"小金井": 165,
"小平": 213,
"日野": 192,
"東村山": 117,
"国分寺": 132,
"国立": 64,
"福生": 67,
"狛江": 119,
"東大和": 49,
"清瀬": 78,
"東久留米": 105,
"武蔵村山": 38,
"多摩": 156,
"稲城": 103,
"羽村": 36,
"あきる野": 51,
"西東京": 345,
"瑞穂": 19,
"日の出": 11,
"檜原": 2,
"奥多摩": 3,
"大島": 6,
"利島": 0,
"新島": 0,
"神津島": 0,
"三宅": 3,
"御蔵島": 1,
"八丈": 4,
"青ヶ島": 0,
"小笠原": 2,
"都外": 2460,
"調査中": 34
}
},
"重症者の年代": {
"40代": 2,
"50代": 4,
"60代": 12,
"70代": 22,
"80代": 13,
"確認中": 0
},
"治療中": {
"入院": 13,
"宿泊療養": 9,
"自宅療養": 35,
"退院・療養等終了": 13,
"他県転送": 1,
"移管手続中": 110,
"不明のため調査中": 0
},
"重症者の性別": {
"男": 40,
"女": 13,
"確認中": 0
}
}