import datetime
import pathlib
import re
from urllib.parse import urljoin
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the page request made by fetch_soup.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
        "like Gecko"
    ),
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        parser: Parser name handed to BeautifulSoup.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The BeautifulSoup object built from the raw response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download *url* into directory *dir* and return the local path.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    from urllib.parse import urlsplit

    # Name the file after the URL path only, so a query string or fragment
    # never ends up in the local file name. Fall back to the last component
    # of the raw URL when the path itself carries no name.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        name = pathlib.PurePath(url).name
    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    p.write_bytes(r.content)
    return p
def str2date(s: pd.Series) -> pd.Series:
    """Parse Japanese-style dates ("YYYY年M月D日") into datetimes.

    Args:
        s: Series of strings; each may contain one date in the form
            "<4-digit year>年<month>月<day>日" anywhere in the text.

    Returns:
        A datetime Series aligned with *s*. Entries without a recognizable
        date (including missing values) become NaT.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    parts = s.str.extract(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
    parts.columns = ["year", "month", "day"]
    # Work on object dtype so the integer fill value is accepted even when
    # pandas uses a dedicated string dtype for the extracted columns.
    # Rows without a match become 0/0/0, which to_datetime coerces to NaT.
    ymd = parts.astype(object).fillna(0).astype(int)
    return pd.to_datetime(ymd, errors="coerce")
# Scraping: find the link whose text starts with "陽性者一覧" and whose
# target ends in ".csv" on the press-release page.
url = "https://www.pref.okinawa.lg.jp/site/hoken/kansen/soumu/press/20200214_covid19_pr1.html"
soup = fetch_soup(url)
tag = soup.find(
    "a",
    class_=None,
    text=re.compile("^陽性者一覧"),
    # Escaped dot: match a real ".csv" extension, not any character + "csv".
    href=re.compile(r"\.csv$"),
)
if tag is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f"CSV link not found on {url}")
link = urljoin(url, tag.get("href"))
# Data wrangling: download the CSV and derive parsed date columns.
p = fetch_file(link)
# Read the case id as text. The row filters that follow use the .str
# accessor on this column, which fails when pandas infers a numeric dtype
# (e.g. when every id in the file happens to be a plain number).
df = pd.read_csv(p, encoding="cp932", dtype={"確定陽性者": str})
df["確定日YMD"] = str2date(df["確定日"])
df["発病日YMD"] = str2date(df["発病日"])
# Keep the original onset text wherever it could not be parsed as a date.
df["状況"] = df["発病日"].where(df["発病日YMD"].isnull())
# Flag the rows to exclude from the cleaned output.
df["drop"] = False
# Case id is not a plain decimal number. isdecimal() is used rather than
# isnumeric() so that every id kept here can be converted with int();
# isnumeric() also accepts characters such as "①" or "三" that int() rejects.
# Missing ids evaluate to NaN, which where() treats as "condition not met".
df["drop"] = df["drop"].where(df["確定陽性者"].str.isdecimal(), True)
# Sex is none of 男性 / 女性 / 非公表.
df["drop"] = df["drop"].where(df["性別"].isin(["男性", "女性", "非公表"]), True)
# Age column holds the marker "欠番".
df["drop"] = df["drop"].mask(df["年齢"] == "欠番", True)
# Cleaned data: keep the rows that were not flagged, index them by the
# integer case id in ascending order, and export as UTF-8 with BOM.
df1 = (
    df.loc[~df["drop"]]
    .drop(columns="drop")
    .astype({"確定陽性者": int})
    .set_index("確定陽性者")
    .sort_index()
)
df1.to_csv("output.csv", encoding="utf_8_sig")
# Rows that were flagged for removal.
df2 = df[df["drop"]].copy()
# Case ids of the removed rows. Only decimal strings are kept; the
# isinstance check skips missing ids, which are float NaN and have no
# string methods.
missing_num = sorted(
    int(i) for i in df2["確定陽性者"] if isinstance(i, str) and i.isdecimal()
)
# Current wall-clock time in JST (UTC+9), stored as a naive datetime.
JST = datetime.timezone(datetime.timedelta(hours=+9))
dt_now = datetime.datetime.now(JST).replace(tzinfo=None)
# Explicit encoding so the report does not depend on the platform default.
with open("report.txt", "w", encoding="utf-8") as fw:
    print(f'Report created at: {dt_now.strftime("%H:%M:%S")} JST', file=fw)
    print(f"Total cases: {len(df)}", file=fw)
    print(f"Missing cases: {len(missing_num)}", file=fw)
    print(f"Missing case id: {missing_num}", file=fw)