import pathlib
import camelot
import pandas as pd
import requests
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into directory ``dir`` and return the saved path.

    The local file name is the last path component of ``url``. The target
    directory is created when missing, and an existing file with the same
    name is overwritten.

    Args:
        url: Address of the file to download.
        dir: Destination directory. The name shadows the builtin but is kept
            so existing keyword callers continue to work.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        ``pathlib.Path`` pointing at the downloaded file.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Without an explicit timeout, requests can block forever on a stalled
    # connection, so always pass one.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p.write_bytes(r.content)
    return p
# Download the source PDF.
url = "https://www.mhlw.go.jp/content/10800000/000773371.pdf"
p = fetch_file(url)

# Convert the PDF: extract the tables from every page.
tables = camelot.read_pdf(
    str(p), pages="all", split_text=True, strip_text=" ", line_scale=40
)

# The first row of each extracted table serves as its column header.
dfs = [pd.DataFrame(table.data[1:], columns=table.data[0]) for table in tables]
df0 = pd.concat(dfs).set_index("No")

# df0 keeps the raw prefecture cells; df1 is the cleaned copy.
df1 = df0.copy()

# Keep only the text ending in 都/道/府/県 and carry it forward onto rows where
# nothing matched. expand=False makes str.extract return a Series (a single
# column is being assigned), and .ffill() replaces fillna(method="ffill"),
# which pandas has deprecated since 2.1.
df1["都道府県"] = (
    df1["都道府県"].str.extract("(.+[都道府県])", expand=False).ffill()
)
df1  # bare expression: value is discarded unless shown by an interactive cell

# Save the cleaned table as CSV, encoded as UTF-8 with a BOM.
df1.to_csv("災害拠点病院一覧.csv", encoding="utf_8_sig")
# Core / regional tally: count rows per prefecture (都道府県) and
# category (区分) in the cleaned table.
df2 = pd.crosstab(index=df1["都道府県"], columns=df1["区分"])
df2  # bare expression: value is discarded unless shown by an interactive cell
# Core / regional extraction: split the raw prefecture cells on whitespace
# into name / 基幹 / 地域 parts, drop rows where the split produced nothing,
# and index the result by prefecture name.
df3 = (
    df0["都道府県"]
    .str.split(expand=True)
    .dropna(how="all")
    .rename(columns={0: "都道府県", 1: "基幹", 2: "地域"})
    .set_index("都道府県")
)

# Each count cell carries its own label as a prefix; remove the label and
# convert what is left to an integer. Anything non-numeric or missing
# becomes 0.
df3 = df3.assign(
    **{
        label: pd.to_numeric(
            df3[label].str.replace(label, ""), errors="coerce"
        )
        .fillna(0)
        .astype(int)
        for label in ("基幹", "地域")
    }
)
df3  # bare expression: value is discarded unless shown by an interactive cell
# Core / regional check: subtract the tallied row counts (df2) from the
# numbers parsed out of the prefecture cells (df3), shown in df3's row order.
# A zero means the two sources agree for that prefecture and category.
df3.sub(df2).reindex(df3.index)