LoginSignup
0
3

More than 3 years have passed since the last update.

東京都の新型コロナウイルスに関連した患者の発生についての別紙PDFから表を抽出しJSONに変換

Posted at 2020-12-05

PDF

02_00-1.png

プログラム

import json
import pathlib
import re

import pandas as pd
import pdfplumber
import requests

def get_pdf_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``.

    The saved file is named after the last component of the URL's path, so
    a query string or fragment does not leak into the file name.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) when missing.
            The name shadows the builtin but is kept so existing keyword
            callers continue to work.
        timeout: Seconds handed to ``requests.get``. Without a timeout the
            request could wait on an unresponsive server indefinitely.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlsplit

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Prefer the path component of the URL; fall back to the original
    # derivation when the URL has no path (e.g. a bare host).
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        name = pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_bytes(r.content)

    return p

# Settings handed to pdfplumber's extract_tables(): both the vertical and
# the horizontal cell borders use the "lines_strict" strategy.
table_settings = dict(
    vertical_strategy="lines_strict",
    horizontal_strategy="lines_strict",
)

def _to_int_dict(values, index):
    """Return ``{label: int}`` for ``values`` labelled by ``index``.

    Thousands separators (",") are removed before the integer conversion.
    """
    return (
        pd.Series(values, index=index)
        .str.replace(",", "")
        .astype(int)
        .to_dict()
    )


def _second_row_first_int(table):
    """Return the first cell of the table's second row as an int."""
    return int(table[1][0].replace(",", ""))


def get_dict_from_pdf(fpath):
    """Extract the tables on the first page of the PDF into a nested dict.

    Each table returned by ``page.extract_tables(table_settings)`` is
    recognised by its header row and stored under a fixed key. Tables that
    are not found are simply absent from the result.

    Args:
        fpath: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        dict that may contain the keys "患者の発生状況", "年代", "性別",
        "都内発生数", "区市町村別患者数", "重症者の年代", "重症者の性別" and
        "治療中". The single-value tables ("65歳以上高齢者", "重症者",
        "検査実施件数") are merged into "患者の発生状況".
    """
    result = {}
    patients = {}

    with pdfplumber.open(fpath) as pdf:

        page = pdf.pages[0]

        for table in page.extract_tables(table_settings):
            # An empty table or header row cannot be classified; skip it
            # instead of failing on table[0][0].
            if not table or not table[0]:
                continue

            header = table[0]

            # Progress output kept from the original script: shows the
            # first header cell of every table that is inspected.
            print(header[0])

            # The first cell may be None (an empty cell); normalise it so
            # the string tests below cannot raise AttributeError.
            first = header[0] or ""

            if first == "総数":
                result["患者の発生状況"] = _to_int_dict(
                    table[2], ["総数", "濃厚接触者", "海外渡航歴", "調査中"]
                )

            elif first.startswith("65歳以上"):
                patients["65歳以上高齢者"] = _second_row_first_int(table)

            elif first == "重症者":
                patients["重症者"] = _second_row_first_int(table)

            elif first.startswith("検査実施件数"):
                patients["検査実施件数"] = _second_row_first_int(table)

            elif first == "10歳未満":
                result["年代"] = _to_int_dict(table[1], header)

            elif header == ["男性", "女性", "不明"]:
                result["性別"] = _to_int_dict(table[1], header)

            elif first == "総数(累計)":
                result["都内発生数"] = _to_int_dict(
                    table[2],
                    [
                        "総数(累計)",
                        "入院中",
                        "重症者",
                        "宿泊療養",
                        "自宅療養",
                        "入院・療養等調整中",
                        "死亡(累計)",
                        "退院(累計)",
                    ],
                )

            elif first == "千代田":
                df_tmp = pd.DataFrame(table)

                # Rows alternate: even rows hold the names, odd rows hold
                # the matching values; flatten each set and drop the
                # padding cells.
                idx = df_tmp.iloc[0::2].stack().dropna().values
                dat = df_tmp.iloc[1::2].stack().dropna().values

                # Each value cell has the form "total(subset)"; split it
                # into two columns. Raw string avoids invalid-escape
                # warnings for "\(".
                df = (
                    pd.Series(dat, index=idx)
                    .str.extract(r"(.+)\((.+)\)")
                    .rename(columns={0: "患者数", 1: "都内発生分"})
                )

                df["患者数"] = df["患者数"].str.replace(",", "").astype(int)
                df["都内発生分"] = df["都内発生分"].str.replace(",", "").astype(int)

                result["区市町村別患者数"] = df.to_dict(orient="dict")

            elif first.endswith("0代"):
                result["重症者の年代"] = _to_int_dict(table[1], header)

            elif header == ["男", "女", "確認中"]:
                result["重症者の性別"] = _to_int_dict(table[1], header)

            elif first == "入院":
                # Header cells can contain whitespace (e.g. line breaks);
                # strip it so the keys are clean labels.
                result["治療中"] = _to_int_dict(
                    table[1], [re.sub(r"\s", "", i) for i in header]
                )

    # setdefault keeps the single-value tables even when the "総数" table
    # was not detected, instead of raising KeyError.
    result.setdefault("患者の発生状況", {}).update(patients)

    return result
# Download the PDF attached to the press release.
p_pdf = get_pdf_file(
    "https://www.metro.tokyo.lg.jp/tosei/hodohappyo/press/2020/11/30/documents/20201129.pdf"
)

# Extract the tables from the PDF.
data = get_dict_from_pdf(p_pdf)

# Same path as the PDF, with the extension changed to .json.
p_json = p_pdf.with_suffix(".json")

# Serialise to JSON (non-ASCII kept as-is, 4-space indent) and save.
with p_json.open(mode="w", encoding="utf-8") as fw:
    fw.write(json.dumps(data, ensure_ascii=False, indent=4))

結果

{
    "患者の発生状況": {
        "総数": 449,
        "濃厚接触者": 228,
        "海外渡航歴": 0,
        "調査中": 221,
        "65歳以上高齢者": 63,
        "重症者": 0,
        "検査実施件数": 7612
    },
    "年代": {
        "10歳未満": 16,
        "10代": 20,
        "20代": 107,
        "30代": 91,
        "40代": 80,
        "50代": 50,
        "60代": 37,
        "70代": 22,
        "80代": 16,
        "90代": 10,
        "100歳以上": 0,
        "不明": 0
    },
    "性別": {
        "男性": 241,
        "女性": 208,
        "不明": 0
    },
    "都内発生数": {
        "総数(累計)": 42793,
        "入院中": 1721,
        "重症者": 53,
        "宿泊療養": 727,
        "自宅療養": 1159,
        "入院・療養等調整中": 633,
        "死亡(累計)": 509,
        "退院(累計)": 38044
    },
    "区市町村別患者数": {
        "患者数": {
            "千代田": 232,
            "中央": 880,
            "港": 1882,
            "新宿": 3597,
            "文京": 720,
            "台東": 889,
            "墨田": 880,
            "江東": 1429,
            "品川": 1463,
            "目黒": 1261,
            "大田": 2119,
            "世田谷": 3347,
            "渋谷": 1430,
            "中野": 1639,
            "杉並": 1704,
            "豊島": 1172,
            "北": 852,
            "荒川": 664,
            "板橋": 1478,
            "練馬": 1755,
            "足立": 1951,
            "葛飾": 1176,
            "江戸川": 1630,
            "八王子": 734,
            "立川": 209,
            "武蔵野": 309,
            "三鷹": 351,
            "青梅": 154,
            "府中": 388,
            "昭島": 114,
            "調布": 416,
            "町田": 482,
            "小金井": 177,
            "小平": 247,
            "日野": 241,
            "東村山": 141,
            "国分寺": 148,
            "国立": 71,
            "福生": 77,
            "狛江": 137,
            "東大和": 58,
            "清瀬": 95,
            "東久留米": 114,
            "武蔵村山": 48,
            "多摩": 181,
            "稲城": 118,
            "羽村": 44,
            "あきる野": 68,
            "西東京": 392,
            "瑞穂": 23,
            "日の出": 30,
            "檜原": 2,
            "奥多摩": 3,
            "大島": 6,
            "利島": 0,
            "新島": 0,
            "神津島": 0,
            "三宅": 3,
            "御蔵島": 1,
            "八丈": 6,
            "青ヶ島": 0,
            "小笠原": 2,
            "都外": 2570,
            "調査中": 34
        },
        "都内発生分": {
            "千代田": 214,
            "中央": 757,
            "港": 1751,
            "新宿": 3411,
            "文京": 618,
            "台東": 820,
            "墨田": 765,
            "江東": 1222,
            "品川": 1332,
            "目黒": 1163,
            "大田": 1899,
            "世田谷": 3116,
            "渋谷": 1323,
            "中野": 1520,
            "杉並": 1521,
            "豊島": 1071,
            "北": 757,
            "荒川": 599,
            "板橋": 1286,
            "練馬": 1542,
            "足立": 1704,
            "葛飾": 988,
            "江戸川": 1478,
            "八王子": 620,
            "立川": 184,
            "武蔵野": 271,
            "三鷹": 297,
            "青梅": 130,
            "府中": 363,
            "昭島": 89,
            "調布": 369,
            "町田": 415,
            "小金井": 165,
            "小平": 213,
            "日野": 192,
            "東村山": 117,
            "国分寺": 132,
            "国立": 64,
            "福生": 67,
            "狛江": 119,
            "東大和": 49,
            "清瀬": 78,
            "東久留米": 105,
            "武蔵村山": 38,
            "多摩": 156,
            "稲城": 103,
            "羽村": 36,
            "あきる野": 51,
            "西東京": 345,
            "瑞穂": 19,
            "日の出": 11,
            "檜原": 2,
            "奥多摩": 3,
            "大島": 6,
            "利島": 0,
            "新島": 0,
            "神津島": 0,
            "三宅": 3,
            "御蔵島": 1,
            "八丈": 4,
            "青ヶ島": 0,
            "小笠原": 2,
            "都外": 2460,
            "調査中": 34
        }
    },
    "重症者の年代": {
        "40代": 2,
        "50代": 4,
        "60代": 12,
        "70代": 22,
        "80代": 13,
        "確認中": 0
    },
    "治療中": {
        "入院": 13,
        "宿泊療養": 9,
        "自宅療養": 35,
        "退院・療養等終了": 13,
        "他県転送": 1,
        "移管手続中": 110,
        "不明のため調査中": 0
    },
    "重症者の性別": {
        "男": 40,
        "女": 13,
        "確認中": 0
    }
}
0
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
3