More than 3 years have passed since the last update.


Posted at 2020-12-05




import json
import pathlib
import re

import pandas as pd
import pdfplumber
import requests

def get_pdf_file(url, dir="."):
    """Download the file at ``url`` into ``dir`` and return its local path.

    Args:
        url: Address of the file to fetch. Its final path component is
            reused as the local file name.
        dir: Directory to save into; it is created (including parents)
            when it does not exist yet.

    Returns:
        pathlib.Path pointing at the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url)
    # Fail here rather than saving an HTML error page under a .pdf name.
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p

# Table-detection options handed to pdfplumber's ``extract_tables``.
# NOTE(review): the closing brace was missing from this literal; further
# settings may have been lost with it - confirm against the original script.
table_settings = {
    "vertical_strategy": "lines_strict",
    "horizontal_strategy": "lines_strict",
}

def _to_int_dict(values, labels):
    """Pair each label with its cell parsed as an int.

    Thousands separators (",") are removed before the conversion.
    """
    return (
        pd.Series(values, index=labels)
        .str.replace(",", "")
        .astype(int)
        .to_dict()
    )


def get_dict_from_pdf(fpath):
    """Extract the tables on the first page of a PDF into a nested dict.

    Every table returned by ``page.extract_tables(table_settings)`` is
    recognised by its header row and stored under a fixed section key.
    Tables whose header matches none of the known patterns are ignored.

    Args:
        fpath: Path of the PDF to open with pdfplumber.

    Returns:
        dict mapping a section name to ``{label: int}``. The section
        "区市町村別患者数" instead holds two such mappings, keyed
        "患者数" and "都内発生分".
    """
    result = {}
    patients = {}

    with pdfplumber.open(fpath) as pdf:

        page = pdf.pages[0]

        for table in page.extract_tables(table_settings):

            # Nothing to classify without a header row.
            if not table or not table[0]:
                continue

            header = table[0]
            # Guard the str methods below against a missing first cell.
            first = header[0] or ""

            if first == "総数":
                # The figures sit on the third row; labels are fixed here.
                result["患者の発生状況"] = _to_int_dict(
                    table[2], ["総数", "濃厚接触者", "海外渡航歴", "調査中"]
                )

            elif first.startswith("65歳以上"):
                patients["65歳以上高齢者"] = int(table[1][0].replace(",", ""))

            elif first == "重症者":
                patients["重症者"] = int(table[1][0].replace(",", ""))

            elif first.startswith("検査実施件数"):
                patients["検査実施件数"] = int(table[1][0].replace(",", ""))

            elif first == "10歳未満":
                result["年代"] = _to_int_dict(table[1], header)

            elif header == ["男性", "女性", "不明"]:
                result["性別"] = _to_int_dict(table[1], header)

            elif first == "総数(累計)":
                # NOTE(review): the Series constructor for this branch was
                # missing; header/value rows are assumed to be rows 0 and 1
                # like the neighbouring branches - confirm.
                result["都内発生数"] = _to_int_dict(table[1], header)

            elif first == "千代田":
                df_tmp = pd.DataFrame(table)

                # Even rows carry the labels, odd rows the matching cells.
                idx = df_tmp.iloc[0::2].stack().dropna().values
                dat = df_tmp.iloc[1::2].stack().dropna().values

                # NOTE(review): the step that turned each cell into two
                # columns was missing. This assumes a cell holds two
                # numbers separated by non-digits, e.g. "232 (214)" -
                # confirm against the PDF.
                df = (
                    pd.Series(dat, index=idx)
                    .str.extract(r"([\d,]+)\D+([\d,]+)")
                    .rename(columns={0: "患者数", 1: "都内発生分"})
                )

                df["患者数"] = df["患者数"].str.replace(",", "").astype(int)
                df["都内発生分"] = df["都内発生分"].str.replace(",", "").astype(int)

                result["区市町村別患者数"] = df.to_dict(orient="dict")

            elif first.endswith("0代"):
                result["重症者の年代"] = _to_int_dict(table[1], header)

            elif header == ["", "", "確認中"]:
                # The first two header cells are empty, so using the header
                # as the index would collide on "". Label them explicitly.
                # NOTE(review): "男"/"女" presumed from sample output -
                # confirm the column order against the PDF.
                result["重症者の性別"] = _to_int_dict(
                    table[1], ["男", "女", "確認中"]
                )

            elif first == "入院":
                # Header cells may contain whitespace; strip it from labels.
                result["治療中"] = _to_int_dict(
                    table[1], [re.sub(r"\s", "", i) for i in header]
                )

    # Fold the single-value tables into the outbreak summary section.
    if patients:
        result.setdefault("患者の発生状況", {}).update(patients)

    return result
# Fetch the press-release PDF into the current directory.
p_pdf = get_pdf_file(
    "https://www.metro.tokyo.lg.jp/tosei/hodohappyo/press/2020/11/30/documents/20201129.pdf"
)

# Pull the tables out of the PDF as a nested dict.
data = get_dict_from_pdf(p_pdf)

# Same location and stem as the PDF, but with a .json extension.
p_json = p_pdf.with_suffix(".json")

# Save as indented UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(p_json, mode="w", encoding="utf-8") as fw:
    fw.write(json.dumps(data, ensure_ascii=False, indent=4))


{
    "患者の発生状況": {
        "総数": 449,
        "濃厚接触者": 228,
        "海外渡航歴": 0,
        "調査中": 221,
        "65歳以上高齢者": 63,
        "重症者": 0,
        "検査実施件数": 7612
    },
    "年代": {
        "10歳未満": 16,
        "10代": 20,
        "20代": 107,
        "30代": 91,
        "40代": 80,
        "50代": 50,
        "60代": 37,
        "70代": 22,
        "80代": 16,
        "90代": 10,
        "100歳以上": 0,
        "不明": 0
    },
    "性別": {
        "男性": 241,
        "女性": 208,
        "不明": 0
    },
    "都内発生数": {
        "総数(累計)": 42793,
        "入院中": 1721,
        "重症者": 53,
        "宿泊療養": 727,
        "自宅療養": 1159,
        "入院・療養等調整中": 633,
        "死亡(累計)": 509,
        "退院(累計)": 38044
    },
    "区市町村別患者数": {
        "患者数": {
            "千代田": 232,
            "中央": 880,
            "港": 1882,
            "新宿": 3597,
            "文京": 720,
            "台東": 889,
            "墨田": 880,
            "江東": 1429,
            "品川": 1463,
            "目黒": 1261,
            "大田": 2119,
            "世田谷": 3347,
            "渋谷": 1430,
            "中野": 1639,
            "杉並": 1704,
            "豊島": 1172,
            "北": 852,
            "荒川": 664,
            "板橋": 1478,
            "練馬": 1755,
            "足立": 1951,
            "葛飾": 1176,
            "江戸川": 1630,
            "八王子": 734,
            "立川": 209,
            "武蔵野": 309,
            "三鷹": 351,
            "青梅": 154,
            "府中": 388,
            "昭島": 114,
            "調布": 416,
            "町田": 482,
            "小金井": 177,
            "小平": 247,
            "日野": 241,
            "東村山": 141,
            "国分寺": 148,
            "国立": 71,
            "福生": 77,
            "狛江": 137,
            "東大和": 58,
            "清瀬": 95,
            "東久留米": 114,
            "武蔵村山": 48,
            "多摩": 181,
            "稲城": 118,
            "羽村": 44,
            "あきる野": 68,
            "西東京": 392,
            "瑞穂": 23,
            "日の出": 30,
            "檜原": 2,
            "奥多摩": 3,
            "大島": 6,
            "利島": 0,
            "新島": 0,
            "神津島": 0,
            "三宅": 3,
            "御蔵島": 1,
            "八丈": 6,
            "青ヶ島": 0,
            "小笠原": 2,
            "都外": 2570,
            "調査中": 34
        },
        "都内発生分": {
            "千代田": 214,
            "中央": 757,
            "港": 1751,
            "新宿": 3411,
            "文京": 618,
            "台東": 820,
            "墨田": 765,
            "江東": 1222,
            "品川": 1332,
            "目黒": 1163,
            "大田": 1899,
            "世田谷": 3116,
            "渋谷": 1323,
            "中野": 1520,
            "杉並": 1521,
            "豊島": 1071,
            "北": 757,
            "荒川": 599,
            "板橋": 1286,
            "練馬": 1542,
            "足立": 1704,
            "葛飾": 988,
            "江戸川": 1478,
            "八王子": 620,
            "立川": 184,
            "武蔵野": 271,
            "三鷹": 297,
            "青梅": 130,
            "府中": 363,
            "昭島": 89,
            "調布": 369,
            "町田": 415,
            "小金井": 165,
            "小平": 213,
            "日野": 192,
            "東村山": 117,
            "国分寺": 132,
            "国立": 64,
            "福生": 67,
            "狛江": 119,
            "東大和": 49,
            "清瀬": 78,
            "東久留米": 105,
            "武蔵村山": 38,
            "多摩": 156,
            "稲城": 103,
            "羽村": 36,
            "あきる野": 51,
            "西東京": 345,
            "瑞穂": 19,
            "日の出": 11,
            "檜原": 2,
            "奥多摩": 3,
            "大島": 6,
            "利島": 0,
            "新島": 0,
            "神津島": 0,
            "三宅": 3,
            "御蔵島": 1,
            "八丈": 4,
            "青ヶ島": 0,
            "小笠原": 2,
            "都外": 2460,
            "調査中": 34
        }
    },
    "重症者の年代": {
        "40代": 2,
        "50代": 4,
        "60代": 12,
        "70代": 22,
        "80代": 13,
        "確認中": 0
    },
    "治療中": {
        "入院": 13,
        "宿泊療養": 9,
        "自宅療養": 35,
        "退院・療養等終了": 13,
        "他県転送": 1,
        "移管手続中": 110,
        "不明のため調査中": 0
    },
    "重症者の性別": {
        "男": 40,
        "女": 13,
        "確認中": 0
    }
}

