LoginSignup
1
2

More than 3 years have passed since last update.

厚生労働省のインフルエンザの発生状況についてのPDFをデータラングリング(pdfplumber)

Last updated at Posted at 2020-11-07

厚生労働省のインフルエンザの発生状況についてのPDFを、pdfplumberでデータラングリングするプログラムを作成

charsで文字の位置を確認してcropの範囲を指定できるので、簡単ですごく楽

# Minimal example: open a PDF, look at character positions, then read the
# text inside a rectangular region of one page.
with pdfplumber.open("data.pdf") as pdf:

    # Page at index 1 of the document.
    p1 = pdf.pages[1]

    # Check the position of the text (character-level objects on the page)
    p1.chars

    # Get the text from a cropped region: full page width, between the
    # coordinates 90 and 105 (presumably measured from the top of the page —
    # confirm against the pdfplumber bounding-box convention).
    week_crop = p1.within_bbox((0, 90, p1.width, 105))
    s = week_crop.extract_text()

プログラム

import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin

import pdfplumber
import pandas as pd
import requests
from bs4 import BeautifulSoup

def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``.

    The local file name is the last segment of the URL's path component, so a
    query string or fragment never ends up in the file name.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) if missing.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the script indefinitely.

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Local import: only ``urljoin`` is imported from urllib.parse at file level.
    from urllib.parse import urlsplit

    # Derive the name before touching the network so a bad URL fails fast.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        raise ValueError("cannot derive a file name from URL: {!r}".format(url))

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

# Page whose PDF links are processed further down; also the base URL used to
# resolve relative links.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# The timeout keeps the script from waiting forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Series collected from every parsed PDF, each named after its press-release
# date: d1 receives the "報告数" column, d2 the "定点当たり" column.
d1 = []
d2 = []

# Raw-string patterns (so "\d" and "\(" are not invalid string escapes),
# compiled once outside the loop.
release_date_re = re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
week_re = re.compile(
    r"(\d{4})年(\d{1,2})週\((\d{1,2})月(\d{1,2})日~(\d{1,2})月(\d{1,2})日\)"
)

# The matching links are walked in reverse document order.
for i in soup.select('ul.m-listLink > li > a[href$=".pdf"]')[::-1]:

    text = i.get_text(strip=True)

    # Press-release date, read from the start of the link text; today's date
    # is used when the text does not begin with a date.
    t = release_date_re.match(text)

    if t:
        year, month, day = map(int, t.groups())
        dt_date = datetime.date(year, month, day)
    else:
        dt_date = datetime.date.today()

    # PDF file: resolve the link against the page URL and download it.
    link = urljoin(url, i.get("href"))

    p = fetch_file(link)

    with pdfplumber.open(p) as pdf:

        # The data is read from the page at index 1; skip PDFs without one,
        # in the same way PDFs without a week heading are skipped below.
        if len(pdf.pages) < 2:
            continue

        p1 = pdf.pages[1]

        # Get the heading text from a cropped region of the page.
        week_crop = p1.within_bbox((0, 90, p1.width, 105))
        # extract_text() may yield None when the region holds no text.
        s = week_crop.extract_text() or ""

        m = week_re.search(s)

        if not m:
            continue

        s_year, s_week, s_month, s_day, e_month, e_day = map(int, m.groups())

        dt_start = datetime.date(s_year, s_month, s_day)
        dt_end = datetime.date(s_year, e_month, e_day)

        if dt_start > dt_end:
            # The week straddles New Year. Assumes the heading year is the
            # year the week number belongs to: week 1 then begins in December
            # of the previous year, while a last week of the year ends in
            # January of the next one.
            if s_week == 1:
                dt_start = datetime.date(s_year - 1, s_month, s_day)
            else:
                dt_end = datetime.date(s_year + 1, e_month, e_day)

        table = p1.extract_table()

        # No table found on the page: nothing to collect from this PDF.
        if not table:
            continue

        # The first two extracted rows are dropped; the rest are data rows
        # with three cells each.
        df_tmp = pd.DataFrame(
            table[2:], columns=["都道府県", "報告数", "定点当たり"]
        ).set_index("都道府県")

        # Remove all whitespace inside the row labels.
        df_tmp.index = df_tmp.index.map(lambda label: "".join(label.split()))

        # "-" cells become missing values; counts lose their thousands
        # separators and become nullable integers, rates become floats.
        df_tmp = df_tmp.mask(df_tmp == "-")
        df_tmp["報告数"] = df_tmp["報告数"].str.replace(",", "").astype(float).astype("Int64")
        df_tmp["定点当たり"] = df_tmp["定点当たり"].astype(float)

        # Week metadata is appended as extra rows so that it travels with
        # both columns.
        df_tmp.loc["年"] = s_year
        df_tmp.loc["週"] = s_week
        df_tmp.loc["開始日"] = dt_start
        df_tmp.loc["終了日"] = dt_end

        s1 = df_tmp["報告数"]
        s1.name = dt_date
        d1.append(s1)

        s2 = df_tmp["定点当たり"]
        s2.name = dt_date
        d2.append(s2)

# Year and week number are stored as plain integers in both tables.
int_columns = {"年": int, "週": int}

# Place the collected series side by side, then transpose so that every
# report becomes one row.
df1 = pd.concat(d1, axis=1, sort=False).transpose().astype(int_columns)
df2 = pd.concat(d2, axis=1, sort=False).transpose().astype(int_columns)

# Combine both tables row by row; column names present in both get the
# suffix on the df2 side.
df3 = df1.join(df2, rsuffix="(定点当たり)")

# Area labels in output order: the 47 prefectures followed by the two
# summary entries.
area_names = [
    "北海道",
    "青森県",
    "岩手県",
    "宮城県",
    "秋田県",
    "山形県",
    "福島県",
    "茨城県",
    "栃木県",
    "群馬県",
    "埼玉県",
    "千葉県",
    "東京都",
    "神奈川県",
    "新潟県",
    "富山県",
    "石川県",
    "福井県",
    "山梨県",
    "長野県",
    "岐阜県",
    "静岡県",
    "愛知県",
    "三重県",
    "滋賀県",
    "京都府",
    "大阪府",
    "兵庫県",
    "奈良県",
    "和歌山県",
    "鳥取県",
    "島根県",
    "岡山県",
    "広島県",
    "山口県",
    "徳島県",
    "香川県",
    "愛媛県",
    "高知県",
    "福岡県",
    "佐賀県",
    "長崎県",
    "熊本県",
    "大分県",
    "宮崎県",
    "鹿児島県",
    "沖縄県",
    "総数",
    "昨年同期(総数)",
]

# Week metadata first, then each area's column immediately followed by its
# "(定点当たり)" counterpart.
ordered_columns = ["年", "週", "開始日", "終了日"]
for area in area_names:
    ordered_columns.append(area)
    ordered_columns.append(area + "(定点当たり)")

df = df3.reindex(columns=ordered_columns)

# Options shared by all three output files: no index column, every
# non-numeric field quoted, UTF-8 with a byte-order mark.
csv_options = {
    "index": False,
    "quoting": csv.QUOTE_NONNUMERIC,
    "encoding": "utf_8_sig",
}

df1.to_csv("influ_count.csv", **csv_options)

df2.to_csv("influ_point.csv", **csv_options)

# The combined file additionally writes missing values as "-".
df.to_csv("influ_all.csv", na_rep="-", **csv_options)
1
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
2