Converting the list of obstetrics and gynecology medical institutions that can provide in-person care for emergency contraception to CSV

Posted at 2021-09-06

Spreadsheet

camelot was slow and there were prefectures it could not convert, so I converted with pdfplumber instead.

For now, this is just the conversion.
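
For reference, the camelot version looks roughly like this (a minimal sketch, assuming a prefecture PDF has already been downloaded to pref.pdf; this is the approach that turned out slow and failed on some prefectures):

import camelot
import pandas as pd

# Pull every table from every page (lattice mode is camelot's default)
tables = camelot.read_pdf("pref.pdf", pages="all")

# Each detected table exposes its contents as a DataFrame via .df
df = pd.concat([t.df for t in tables]).reset_index(drop=True)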

import pathlib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import pdfplumber
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):

    # Fetch a page and parse it into a BeautifulSoup tree
    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, parser)

    return soup


def fetch_file(url, name, dir="."):

    # Save the linked PDF as <dir>/<name>.pdf, creating the directory if needed
    p = pathlib.Path(dir, f"{name}.pdf")
    p.parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(url, headers=headers)
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000186912_00002.html"

soup = fetch_soup(url)

dfs = []

for i in soup.select("ul.m-listLink--hCol2 > li > a"):

    pref = i.get_text()

    print(pref)

    link = urljoin(url, i.get("href"))

    # Download each prefecture's PDF
    p = fetch_file(link, pref)

    with pdfplumber.open(p) as pdf:

        for page in pdf.pages:

            table = page.extract_table()

            # Skip pages where no table is detected
            if table is None:
                continue

            df_tmp = pd.DataFrame(table)

            row, col = df_tmp.shape

            # Some prefectures' PDFs lack the disclosure-consent column;
            # pad them to the common nine-column layout
            if col == 8:

                df_tmp.insert(0, "有無", "")

            # set_axis(..., inplace=True) was removed in pandas 2.0,
            # so assign the result instead
            df_tmp = df_tmp.set_axis(
                [
                    "公表の希望の有無",
                    "施設名",
                    "郵便番号",
                    "住所",
                    "電話番号",
                    "ウェブサイトURL",
                    "産科、婦人科又は産婦人科の標榜の有無",
                    "医療機関における緊急避妊にかかる対面診療への対応可能時間帯",
                    "常時の緊急避妊薬の在庫の有無",
                ],
                axis=1,
            )

            dfs.append(df_tmp)

df0 = pd.concat(dfs).reset_index(drop=True)
df0
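
Concatenating the page-by-page tables also sweeps in each page's header row, which is why the next steps filter on 施設名. A quick count of how many slipped in:

df0["施設名"].isin(["基本情報", "施設名"]).sum()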

df1 = df0.copy()

# Each PDF page repeats its header row ("基本情報" / "施設名");
# write those rows to kakunin.csv so they can be checked by hand
df1[df1["施設名"].isin(["基本情報", "施設名"])].to_csv("kakunin.csv", encoding="utf_8_sig")

df1.shape

# Drop the repeated header rows and renumber
df2 = df1[~df1["施設名"].isin(["基本情報", "施設名"])].copy().reset_index(drop=True)

df2.shape


df2["施設名"] = df2["施設名"].str.replace("\n", "")
df2["住所"] = df2["住所"].str.replace("\n", "")
df2["ウェブサイトURL"] = df2["ウェブサイトURL"].str.replace("\n", "")

df2

df2.to_csv("list.csv", encoding="utf_8_sig")
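
A quick sanity check that the output reads back cleanly (a sketch; index_col=0 restores the index column that to_csv wrote):

import pandas as pd

# Reload the generated CSV and confirm its shape and first rows
df_check = pd.read_csv("list.csv", index_col=0)
print(df_check.shape)
print(df_check.head())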