1
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

埼玉県の最近の県政ニュースから新型コロナウイルスに関連した患者等の発生についての別紙PDFから陽性者情報を抽出する

Last updated at Posted at 2020-11-29

埼玉県の最近の県政ニュースから新型コロナウイルスに関連した患者等の発生についての別紙PDFから陽性者情報を抽出する

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

import sys
import pdfplumber

# PDFファイルのリンクを抽出
def get_pdf_link(url):
    """Return the absolute URL of the "別紙" (attachment) PDF linked from *url*.

    Fetches the page using the module-level *headers*, finds the first PDF
    anchor under ``div#tmp_contents``, and returns its absolute URL only when
    the link text starts with "別紙". Returns ``None`` when no such link exists.
    """
    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    # Extract the link to the PDF file.
    # NOTE: the attribute value must be quoted — an unquoted `a[href$=.pdf]`
    # is invalid CSS and is rejected by newer soupsieve versions.
    tag = soup.select_one('div#tmp_contents > p > a[href$=".pdf"]')

    # Only accept links whose text starts with "別紙" (attachment).
    if tag and tag.get_text(strip=True).startswith("別紙"):
        return urljoin(url, tag.get("href"))

    return None


# PDFファイルをダウンロード
def get_pdf_file(url, dir="."):
    """Download the PDF at *url* into directory *dir* and return its Path."""
    resp = requests.get(url)
    resp.raise_for_status()

    # Keep the remote file name; create the target directory if needed.
    dest = pathlib.Path(dir) / pathlib.PurePath(url).name
    dest.parent.mkdir(parents=True, exist_ok=True)

    dest.write_bytes(resp.content)
    return dest


# Convert a month/day string to a Timestamp
def days2date(s, now=None):
    """Convert a Japanese month/day string (e.g. "11月2日") to a Timestamp.

    The year is inferred from *now* (defaults to the module-level ``dt_now``):
    if the month is later than the current month, the date is assumed to be
    from the previous year (the source lists only month and day).

    Returns ``pd.NaT`` when the string does not contain exactly two numbers.
    """
    if now is None:
        now = dt_now

    # Raw string avoids invalid-escape issues with regex patterns.
    days = re.findall(r"[0-9]{1,2}", s)

    if len(days) != 2:
        return pd.NaT

    m, d = map(int, days)

    y = now.year
    # A month in the "future" must belong to the previous year.
    if now.month < m:
        y -= 1

    return pd.Timestamp(year=y, month=m, day=d)


url = "https://www.pref.saitama.lg.jp/news/index.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Current date in Japan Standard Time (UTC+9)
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)

# Column names of the table embedded in the attachment PDF
# (case no., age, sex, nationality, occupation, residence, onset date,
# symptoms at onset, positive-test date, suspected contact, household,
# close contacts).
COLUMNS = [
    "例目",
    "年代",
    "性別",
    "国籍",
    "職業",
    "居住地",
    "発症日",
    "発症時症状",
    "陽性判明日",
    "感染源と疑われる接触",
    "同居家族",
    "濃厚接触者",
]

dfs = []

for i in soup.select("div.box_news > ul > li > a"):

    # Pick out the "new coronavirus patient" announcements from recent news
    if re.match("新型コロナウイルスに関連した患者等の発生について", i.get_text(strip=True)):

        # Download the attached ("別紙") PDF
        link = get_pdf_link(urljoin(url, i.get("href")))

        if link:

            with pdfplumber.open(get_pdf_file(link)) as pdf:

                # Each page holds one table; the first two transposed rows
                # are header rows, hence iloc[2:].
                dfs.extend(
                    pd.DataFrame(page.extract_table(), index=COLUMNS).T.iloc[2:]
                    for page in pdf.pages
                )

# pd.concat([]) raises ValueError — exit cleanly when nothing matched.
if not dfs:
    sys.exit("No matching announcement PDF found")

# Concatenate, strip all whitespace, use 例目 (case number) as the index.
# NOTE: the regex must be a raw string — "\s" in a plain literal is an
# invalid escape sequence (DeprecationWarning since Python 3.6).
df = pd.concat(dfs).replace(r"\s", "", regex=True).set_index("例目")

# Sort by case number
df.sort_index(inplace=True)

# Convert month/day strings to datetime
df["発症日"] = df["発症日"].apply(days2date)
df["陽性判明日"] = df["陽性判明日"].apply(days2date)

df.to_csv(sys.stdout)
1
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?