# Spreadsheet conversion script.
# camelot was slow and failed to convert some prefectures' PDFs, so pdfplumber is used instead.
# For now this only performs the conversion.
import pathlib
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pdfplumber
import pandas as pd
# Browser-like User-Agent so the ministry site serves the normal HTML pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    ),
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Page URL to fetch.
        parser: BeautifulSoup parser backend (default ``"html.parser"``).
        timeout: Seconds to wait for the server before giving up.
            ``requests`` has no default timeout and would otherwise hang
            forever on a stalled connection.

    Returns:
        BeautifulSoup: The parsed document.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, name, dir=".", timeout=30):
    """Download *url* and save it as ``<dir>/<name>.pdf``.

    Args:
        url: File URL to download.
        name: Base filename (without extension) for the saved PDF.
        dir: Destination directory; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path: Path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
    """
    p = pathlib.Path(dir, f"{name}.pdf")
    p.parent.mkdir(parents=True, exist_ok=True)
    # Send the same browser-like User-Agent as fetch_soup (the original
    # omitted it here, inconsistently) and bound the wait with a timeout.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    p.write_bytes(r.content)
    return p
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000186912_00002.html"

# Canonical column labels of the per-prefecture PDF tables.
COLUMNS = [
    "公表の希望の有無",
    "施設名",
    "郵便番号",
    "住所",
    "電話番号",
    "ウェブサイトURL",
    "産科、婦人科又は産婦人科の標榜の有無",
    "医療機関における緊急避妊にかかる対面診療への対応可能時間帯",
    "常時の緊急避妊薬の在庫の有無",
]

soup = fetch_soup(url)

dfs = []
# Each <a> in the two-column link list points at one prefecture's PDF.
for i in soup.select("ul.m-listLink--hCol2 > li > a"):
    pref = i.get_text()
    print(pref)
    link = urljoin(url, i.get("href"))
    p = fetch_file(link, pref)
    with pdfplumber.open(p) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if table is None:
                # Pages without a detectable table would otherwise become an
                # empty DataFrame and crash the column assignment below.
                continue
            df_tmp = pd.DataFrame(table)
            row, col = df_tmp.shape
            if col == 8:
                # Some prefectures' tables lack the leading
                # "公表の希望の有無" column; pad it so all tables align.
                df_tmp.insert(0, "有無", "")
            # set_axis(..., inplace=True) was removed in pandas 2.0;
            # assign the column labels directly instead.
            df_tmp.columns = COLUMNS
            dfs.append(df_tmp)

df0 = pd.concat(dfs).reset_index(drop=True)
df1 = df0.copy()

# Rows whose 施設名 is "基本情報" or "施設名" are repeated PDF table
# headers, not facility data; compute the mask once and reuse it.
header_mask = df1["施設名"].isin(["基本情報", "施設名"])

# Dump the header rows to a separate CSV so they can be eyeballed.
df1[header_mask].to_csv("kakunin.csv", encoding="utf_8_sig")

# Keep only the real data rows.
df2 = df1[~header_mask].copy().reset_index(drop=True)

# Strip line breaks that pdfplumber carried over from wrapped PDF cells.
for text_col in ("施設名", "住所", "ウェブサイトURL"):
    df2[text_col] = df2[text_col].str.replace("\n", "")

df2.to_csv("list.csv", encoding="utf_8_sig")