気象庁の気象警報・注意報のページから
全国の気象警報・注意報をスクレイピングして集計する
※2021/02/20現在利用できません
import pathlib
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
# Prefecture-code lookup (JIS X 0401 order), used to sort the final summary
# table into the conventional north-to-south prefecture order.
pref_code = {
    "01": "北海道",
    "02": "青森県",
    "03": "岩手県",
    "04": "宮城県",
    "05": "秋田県",
    "06": "山形県",
    "07": "福島県",
    "08": "茨城県",
    "09": "栃木県",
    "10": "群馬県",
    "11": "埼玉県",
    "12": "千葉県",
    "13": "東京都",
    "14": "神奈川県",
    "15": "新潟県",
    "16": "富山県",
    "17": "石川県",
    "18": "福井県",
    "19": "山梨県",
    "20": "長野県",
    "21": "岐阜県",
    "22": "静岡県",
    "23": "愛知県",
    "24": "三重県",
    "25": "滋賀県",
    "26": "京都府",
    "27": "大阪府",
    "28": "兵庫県",
    "29": "奈良県",
    "30": "和歌山県",
    "31": "鳥取県",
    "32": "島根県",
    "33": "岡山県",
    "34": "広島県",
    "35": "山口県",
    "36": "徳島県",
    "37": "香川県",
    "38": "愛媛県",
    "39": "高知県",
    "40": "福岡県",
    "41": "佐賀県",
    "42": "長崎県",
    "43": "熊本県",
    "44": "大分県",
    "45": "宮崎県",
    "46": "鹿児島県",
    "47": "沖縄県",
}

# Prefecture names in code order (dicts preserve insertion order).
# IDIOM FIX: a plain list() call replaces the identity comprehension.
pref = list(pref_code.values())
スクレイピング
# --- Scrape every regional warning page linked from the JMA index page ---
url = "https://www.jma.go.jp/jp/warn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# One Session so the TCP connection is reused across the many area pages.
with requests.Session() as s:
    r = s.get(url, headers=headers)
    r.raise_for_status()
    base = BeautifulSoup(r.content, "html5lib")

    htmls = []
    # The per-area links live in the <noscript> fallback table of the index.
    for tag in tqdm(base.select("div#title > noscript > table > tbody > tr > td > a")):
        area = tag.get_text(strip=True)
        link = urljoin(url, tag.get("href"))

        r = s.get(link, headers=headers)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html5lib")

        # Cache each page under ./html/ so parsing can be re-run offline.
        p = pathlib.Path("html", pathlib.PurePath(link).name)
        p.parent.mkdir(parents=True, exist_ok=True)
        # BUG FIX: write UTF-8 explicitly — the platform default encoding
        # (e.g. cp932 on Windows) can fail to encode characters in the page.
        with p.open(mode="w", encoding="utf-8") as fw:
            fw.write(soup.prettify())

        htmls.append({"area": area, "url": link, "path": p})
        time.sleep(3)  # be polite to the server between requests
import pandas as pd
def fetch_warn(p, area):
    """Parse one cached warning page into a tidy (long-format) DataFrame.

    Parameters
    ----------
    p : pathlib.Path
        Path to a cached HTML page containing the ``WarnTableTable`` table.
    area : str
        Area name from the index page; stored in the ``pref`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: area1, area2, city, level, alert, value, pref.
    """
    # BUG FIX: the original passed p.open() directly to read_html and leaked
    # the file handle; read as UTF-8 to match how the cache was written.
    with p.open(mode="r", encoding="utf-8") as f:
        tmp = pd.read_html(f, attrs={"id": "WarnTableTable"})[0]

    # The first three columns have no header, so pandas auto-names them
    # ("Unnamed: N_level_M", ...) in the two-level column MultiIndex.
    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)  # keep rows with at least 5 non-NA values

    # BUG FIX: set_axis lost its ``inplace`` keyword in pandas 2.0;
    # assign the returned frame instead of mutating in place.
    df = df.set_axis(["area1", "area2", "city", "level", "alert", "value"], axis=1)
    df["pref"] = area
    return df
# Parse every cached page and stack the per-area tables into one frame
# with a fresh 0..n-1 index.
df = pd.concat(
    (fetch_warn(page["path"], page["area"]) for page in htmls),
    ignore_index=True,
)
# Normalize every text column: NFKC-fold full-width characters, then strip
# all whitespace.
for col in df.select_dtypes(include=object).columns:
    # BUG FIX: pandas >= 2.0 defaults str.replace to regex=False, so the
    # old pattern "\s" was matched literally and whitespace was never
    # removed; request regex matching explicitly, with a raw-string pattern.
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s+", "", regex=True)
# Fold the Hokkaido sub-regions and Okinawa island groups back into their
# parent prefecture so counts line up with the 47-prefecture list.
# BUG FIX: the original used .replace(..., inplace=True) on the column
# selection, which mutates a possibly-temporary Series and silently stops
# updating df under pandas Copy-on-Write; assign the result back instead.
df["pref"] = df["pref"].replace(
    {
        "宗谷地方": "北海道",
        "上川・留萌地方": "北海道",
        "網走・北見・紋別地方": "北海道",
        "釧路・根室・十勝地方": "北海道",
        "胆振・日高地方": "北海道",
        "石狩・空知・後志地方": "北海道",
        "渡島・檜山地方": "北海道",
        "沖縄本島地方": "沖縄県",
        "大東島地方": "沖縄県",
        "宮古島地方": "沖縄県",
        "八重山地方": "沖縄県",
    }
)
# "●"を0と1に変換
df["value"] = (df["value"] == "●").astype(int)
# 集計
df_alert = df.pivot_table(
index="pref", columns="level", values="value", aggfunc=sum
).reindex(index=pref, columns=["警報", "注意報"])
df_alert
df.to_csv("alert.csv", encoding="utf_8_sig")