Converting the 公益財団法人日本薬剤師会 (Japan Pharmaceutical Association) PDFs on the progress of the separation of prescribing and dispensing (trends in insurance-covered dispensing) to CSV

The script below scrapes the association's page for links to the PDFs, downloads each file, extracts the table on the first page with pdfplumber, fills in the merged header cells, and saves the result as CSV.

import pathlib
import time
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

def fetch_soup(url, parser="html.parser"):

    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, parser)

    return soup

def fetch_file(url, dir="."):

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Skip the download if a file with the same name already exists
    if not p.exists():

        # Wait 3 seconds to reduce load on the server
        time.sleep(3)

        r = requests.get(url, headers=headers)
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

# Scraping

url = "https://www.nichiyaku.or.jp/activities/division/faqShinchoku.html"

soup = fetch_soup(url)

# Extract the PDF URLs from the page
links = [urljoin(url, i.get("href")) for i in soup.select("section.section a.btn-pdf")]

# Convert the first-page table of each PDF to CSV

# pdfplumber table-extraction settings: take column boundaries from the
# ruled lines and row boundaries from text alignment, with a 5 pt
# tolerance when matching intersections
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
    "intersection_tolerance": 5,
}

for link in tqdm(links):

    # Download the PDF file
    path_pdf = fetch_file(link, "pdf")

    with pdfplumber.open(path_pdf) as pdf:

        page = pdf.pages[0]

        # Extract the table on the page as text
        table = page.extract_table(table_settings)

        # Load into pandas temporarily for CSV processing
        df = pd.DataFrame(table)

        # Forward-fill the merged cells in the first header row
        df.iloc[0] = df.iloc[0].ffill()

        # Change the PDF file name's extension to .csv for saving
        path_csv = pathlib.Path("csv", path_pdf.with_suffix(".csv").name)
        path_csv.parent.mkdir(parents=True, exist_ok=True)

        df.to_csv(path_csv, encoding="utf_8_sig", index=False, header=False)
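
As a quick check, one of the generated CSVs can be read back with pandas. This is only a sketch: the file name under csv/ depends on which PDFs the page links to, so "example.csv" is a placeholder, and it assumes the first two rows of the table form the header.

import pandas as pd

# "csv/example.csv" is a placeholder; point this at one of the files written to the csv/ directory
df = pd.read_csv("csv/example.csv", header=[0, 1], encoding="utf_8_sig")

print(df.head())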