# Spreadsheet conversion script.
# camelot was slow and failed to convert some prefectures' PDFs, so pdfplumber is used instead.
# For now this only performs the conversion.
import pathlib
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pdfplumber
import pandas as pd
# Browser-like User-Agent so the ministry site serves the normal HTML pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    ),
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Page URL to fetch.
        parser: BeautifulSoup parser backend (default ``"html.parser"``).
        timeout: Seconds to wait for the server before giving up.
            ``requests`` has no default timeout and would otherwise hang
            forever on a stalled connection.

    Returns:
        BeautifulSoup: The parsed document.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, name, dir=".", timeout=30):
    """Download *url* and save it as ``<dir>/<name>.pdf``.

    Args:
        url: File URL to download.
        name: Base filename (without extension) for the saved PDF.
        dir: Destination directory; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path: Path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status.
    """
    p = pathlib.Path(dir, f"{name}.pdf")
    p.parent.mkdir(parents=True, exist_ok=True)
    # Send the same browser-like User-Agent as fetch_soup (the original
    # omitted it here, inconsistently) and bound the wait with a timeout.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    p.write_bytes(r.content)
    return p
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000186912_00002.html"

# Canonical column labels of the per-prefecture PDF tables.
COLUMNS = [
    "公表の希望の有無",
    "施設名",
    "郵便番号",
    "住所",
    "電話番号",
    "ウェブサイトURL",
    "産科、婦人科又は産婦人科の標榜の有無",
    "医療機関における緊急避妊にかかる対面診療への対応可能時間帯",
    "常時の緊急避妊薬の在庫の有無",
]

soup = fetch_soup(url)

dfs = []
# Each <a> in the two-column link list points at one prefecture's PDF.
for i in soup.select("ul.m-listLink--hCol2 > li > a"):
    pref = i.get_text()
    print(pref)
    link = urljoin(url, i.get("href"))
    p = fetch_file(link, pref)
    with pdfplumber.open(p) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if table is None:
                # Pages without a detectable table would otherwise become an
                # empty DataFrame and crash the column assignment below.
                continue
            df_tmp = pd.DataFrame(table)
            row, col = df_tmp.shape
            if col == 8:
                # Some prefectures' tables lack the leading
                # "公表の希望の有無" column; pad it so all tables align.
                df_tmp.insert(0, "有無", "")
            # set_axis(..., inplace=True) was removed in pandas 2.0;
            # assign the column labels directly instead.
            df_tmp.columns = COLUMNS
            dfs.append(df_tmp)

df0 = pd.concat(dfs).reset_index(drop=True)
df1 = df0.copy()

# Rows whose 施設名 is "基本情報" or "施設名" are repeated PDF table
# headers, not facility data; compute the mask once and reuse it.
header_mask = df1["施設名"].isin(["基本情報", "施設名"])

# Dump the header rows to a separate CSV so they can be eyeballed.
df1[header_mask].to_csv("kakunin.csv", encoding="utf_8_sig")

# Keep only the real data rows.
df2 = df1[~header_mask].copy().reset_index(drop=True)

# Strip line breaks that pdfplumber carried over from wrapped PDF cells.
for text_col in ("施設名", "住所", "ウェブサイトURL"):
    df2[text_col] = df2[text_col].str.replace("\n", "")

df2.to_csv("list.csv", encoding="utf_8_sig")