More than 3 years have passed since last update.

公益財団法人日本薬剤師会の医薬分業進捗状況（保険調剤の動向）のPDFをCSVに変換する

Posted at 2021-01-21

import pathlib
import time
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Request headers passed to requests.get() by fetch_soup.
headers = {
    "User-Agent": (
        "Mozilla/5.0 "
        "(Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
        "like Gecko"
    ),
}

def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Pass the raw bytes so BeautifulSoup detects the page encoding itself.
    return BeautifulSoup(r.content, parser)

def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` unless a file of that name already exists.

    The local file name is the last path component of ``url``. An existing
    file with that name is treated as already downloaded and is returned
    without any network access.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pathlib.Path`` of the local file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written in that case, so an error page is never
            cached under the target file name.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Do not download again when a file with the same name is already present.
    if not p.exists():

        # Wait 3 seconds to keep the load on the server low.
        time.sleep(3)

        r = requests.get(url, timeout=timeout)
        # Check the status BEFORE touching the disk: a saved error body would
        # otherwise be reused forever by the exists() shortcut above.
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

# Scraping

url = "https://www.nichiyaku.or.jp/activities/division/faqShinchoku.html"

soup = fetch_soup(url)

# Collect the PDF button anchors, then resolve each href against the page URL
pdf_anchors = soup.select("section.section a.btn-pdf")
links = [urljoin(url, anchor.get("href")) for anchor in pdf_anchors]

# PDF conversion

# Options handed to pdfplumber's page.extract_table() in the loop below.
table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
    intersection_tolerance=5,
)

for link in tqdm(links):

    # Download the PDF (fetch_file returns the existing copy if present)
    path_pdf = fetch_file(link, "pdf")

    with pdfplumber.open(path_pdf) as pdf:

        # Only the first page is converted. Guard against a PDF without
        # pages so one odd file does not abort the whole batch.
        table = pdf.pages[0].extract_table(table_settings) if pdf.pages else None

    # extract_table() yields None when no table is detected; building a
    # DataFrame from that and indexing row 0 would raise IndexError.
    if not table:
        print(f"No table found on the first page of {path_pdf}; skipping")
        continue

    # Load into pandas temporarily to tidy the rows before writing the CSV
    df = pd.DataFrame(table)

    # Fill merged cells in the first header row from the cell to their left.
    # Series.ffill() replaces the deprecated fillna(method="ffill").
    df.iloc[0] = df.iloc[0].ffill()

    # Save under csv/ using the PDF file name with a .csv extension
    path_csv = pathlib.Path("csv", path_pdf.with_suffix(".csv").name)
    path_csv.parent.mkdir(parents=True, exist_ok=True)

    # utf_8_sig writes a BOM at the start of the file
    df.to_csv(path_csv, encoding="utf_8_sig", index=False, header=False)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up