More than 1 year has passed since last update.

災害拠点病院一覧のPDFをCSVに変換

Posted at 2022-01-31

import pathlib

import camelot
import pandas as pd
import requests


def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` and return the path of the saved file.

    The file name is the last segment of the URL *path*, so a query string
    or fragment (``...file.pdf?v=2#top``) never ends up in the name on disk.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; it is created (with parents) if missing.
            The name shadows the builtin but is kept for caller compatibility.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    from urllib.parse import urlsplit

    # Prefer the URL path's last segment; fall back to the previous
    # derivation (last component of the raw URL) when the path has none,
    # e.g. for a bare host such as "https://example.com/".
    name = (
        pathlib.PurePosixPath(urlsplit(url).path).name
        or pathlib.PurePath(url).name
    )

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an error page is never saved.
    r.raise_for_status()

    p.write_bytes(r.content)
    return p

# Download

url = "https://www.mhlw.go.jp/content/10800000/000773371.pdf"
p = fetch_file(url)


# PDF conversion


def _table_to_frame(table):
    """Build a DataFrame from one extracted table, using row 0 as header."""
    return pd.DataFrame(table.data[1:], columns=table.data[0])


# strip_text is given a half-width and a full-width space; per the camelot
# docs those characters are removed from each cell's text.
tables = camelot.read_pdf(
    str(p),
    pages="all",
    split_text=True,
    strip_text=" 　",
    line_scale=40,
)

dfs = list(map(_table_to_frame, tables))

# Stack the per-table frames and key the rows by the "No" column.
df0 = pd.concat(dfs)
df0 = df0.set_index("No")

# Work on a copy: df0 keeps the raw cell text, which is parsed again below.
df1 = df0.copy()

# Pull out the text ending in 都/道/府/県 (the prefecture name). Cells with
# no match become NaN and inherit the value from the nearest row above.
# expand=False yields a Series, and .ffill() replaces the deprecated
# fillna(method="ffill") spelling.
df1["都道府県"] = (
    df1["都道府県"].str.extract("(.+[都道府県])", expand=False).ffill()
)

# Bare expression: shows the frame when run as a notebook cell.
df1

# utf_8_sig writes UTF-8 prefixed with a byte-order mark.
df1.to_csv("災害拠点病院一覧.csv", encoding="utf_8_sig")

# Core / regional: tally rows per prefecture and category
df2 = pd.crosstab(df1["都道府県"], df1["区分"])

df2

# Core / regional: extraction

# Split each raw prefecture cell on whitespace into separate columns and
# discard the rows where the split produced nothing at all.
_pref_cells = df0["都道府県"].str.split(expand=True)
_pref_cells = _pref_cells.dropna(how="all")

df3 = _pref_cells.rename(
    columns={0: "都道府県", 1: "基幹", 2: "地域"}
).set_index("都道府県")

# Each count column carries its own label as text; remove the label and
# convert what is left to an integer, treating anything unparsable as 0.
for _label in ("基幹", "地域"):
    _digits = df3[_label].str.replace(_label, "")
    df3[_label] = pd.to_numeric(_digits, errors="coerce").fillna(0).astype(int)

df3

# Core / regional: check
# Difference between the counts parsed from the cells and the tallied rows,
# shown in df3's row order.
_diff = df3 - df2
_diff.reindex(df3.index)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up