気象庁の気象警報・注意報のページから
全国の気象警報・注意報をスクレイピングして集計する
※2021/02/20現在利用できません
import pathlib
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
# Prefecture-code lookup (JIS X 0401 order), used to sort the final summary
# table into the conventional north-to-south prefecture order.
pref_code = {
    "01": "北海道",
    "02": "青森県",
    "03": "岩手県",
    "04": "宮城県",
    "05": "秋田県",
    "06": "山形県",
    "07": "福島県",
    "08": "茨城県",
    "09": "栃木県",
    "10": "群馬県",
    "11": "埼玉県",
    "12": "千葉県",
    "13": "東京都",
    "14": "神奈川県",
    "15": "新潟県",
    "16": "富山県",
    "17": "石川県",
    "18": "福井県",
    "19": "山梨県",
    "20": "長野県",
    "21": "岐阜県",
    "22": "静岡県",
    "23": "愛知県",
    "24": "三重県",
    "25": "滋賀県",
    "26": "京都府",
    "27": "大阪府",
    "28": "兵庫県",
    "29": "奈良県",
    "30": "和歌山県",
    "31": "鳥取県",
    "32": "島根県",
    "33": "岡山県",
    "34": "広島県",
    "35": "山口県",
    "36": "徳島県",
    "37": "香川県",
    "38": "愛媛県",
    "39": "高知県",
    "40": "福岡県",
    "41": "佐賀県",
    "42": "長崎県",
    "43": "熊本県",
    "44": "大分県",
    "45": "宮崎県",
    "46": "鹿児島県",
    "47": "沖縄県",
}

# Prefecture names in code order (dicts preserve insertion order).
# IDIOM FIX: a plain list() call replaces the identity comprehension.
pref = list(pref_code.values())
スクレイピング
# --- Scrape every regional warning page linked from the JMA index page ---
url = "https://www.jma.go.jp/jp/warn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# One Session so the TCP connection is reused across the many area pages.
with requests.Session() as s:
    r = s.get(url, headers=headers)
    r.raise_for_status()
    base = BeautifulSoup(r.content, "html5lib")

    htmls = []
    # The per-area links live in the <noscript> fallback table of the index.
    for tag in tqdm(base.select("div#title > noscript > table > tbody > tr > td > a")):
        area = tag.get_text(strip=True)
        link = urljoin(url, tag.get("href"))

        r = s.get(link, headers=headers)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html5lib")

        # Cache each page under ./html/ so parsing can be re-run offline.
        p = pathlib.Path("html", pathlib.PurePath(link).name)
        p.parent.mkdir(parents=True, exist_ok=True)
        # BUG FIX: write UTF-8 explicitly — the platform default encoding
        # (e.g. cp932 on Windows) can fail to encode characters in the page.
        with p.open(mode="w", encoding="utf-8") as fw:
            fw.write(soup.prettify())

        htmls.append({"area": area, "url": link, "path": p})
        time.sleep(3)  # be polite to the server between requests
import pandas as pd
def fetch_warn(p, area):
    """Parse one cached warning page into a tidy (long-format) DataFrame.

    Parameters
    ----------
    p : pathlib.Path
        Path to a cached HTML page containing the ``WarnTableTable`` table.
    area : str
        Area name from the index page; stored in the ``pref`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: area1, area2, city, level, alert, value, pref.
    """
    # BUG FIX: the original passed p.open() directly to read_html and leaked
    # the file handle; read as UTF-8 to match how the cache was written.
    with p.open(mode="r", encoding="utf-8") as f:
        tmp = pd.read_html(f, attrs={"id": "WarnTableTable"})[0]

    # The first three columns have no header, so pandas auto-names them
    # ("Unnamed: N_level_M", ...) in the two-level column MultiIndex.
    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)  # keep rows with at least 5 non-NA values

    # BUG FIX: set_axis lost its ``inplace`` keyword in pandas 2.0;
    # assign the returned frame instead of mutating in place.
    df = df.set_axis(["area1", "area2", "city", "level", "alert", "value"], axis=1)
    df["pref"] = area
    return df
# Parse every cached page and stack the per-area tables into one frame
# with a fresh 0..n-1 index.
df = pd.concat(
    (fetch_warn(page["path"], page["area"]) for page in htmls),
    ignore_index=True,
)
# Normalize every text column: NFKC-fold full-width characters, then strip
# all whitespace.
for col in df.select_dtypes(include=object).columns:
    # BUG FIX: pandas >= 2.0 defaults str.replace to regex=False, so the
    # old pattern "\s" was matched literally and whitespace was never
    # removed; request regex matching explicitly, with a raw-string pattern.
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s+", "", regex=True)
# Fold the Hokkaido sub-regions and Okinawa island groups back into their
# parent prefecture so counts line up with the 47-prefecture list.
# BUG FIX: the original used .replace(..., inplace=True) on the column
# selection, which mutates a possibly-temporary Series and silently stops
# updating df under pandas Copy-on-Write; assign the result back instead.
df["pref"] = df["pref"].replace(
    {
        "宗谷地方": "北海道",
        "上川・留萌地方": "北海道",
        "網走・北見・紋別地方": "北海道",
        "釧路・根室・十勝地方": "北海道",
        "胆振・日高地方": "北海道",
        "石狩・空知・後志地方": "北海道",
        "渡島・檜山地方": "北海道",
        "沖縄本島地方": "沖縄県",
        "大東島地方": "沖縄県",
        "宮古島地方": "沖縄県",
        "八重山地方": "沖縄県",
    }
)
# "●"を0と1に変換
df["value"] = (df["value"] == "●").astype(int)
# 集計
df_alert = df.pivot_table(
index="pref", columns="level", values="value", aggfunc=sum
).reindex(index=pref, columns=["警報", "注意報"])
df_alert
df.to_csv("alert.csv", encoding="utf_8_sig")