Pandasでスクレイピング
その他の内容は上記の記事を参照してください。
日ごと
import time
import pandas as pd
# Fetch the daily observation tables for every month in a period, one page per
# month, and write them out as a single CSV.

# Sapporo
prec_no = 14
block_no = 47412

# Period; only the year and month of each bound are used
start = "2022/01/01"
end = "2022/04/30"

# One entry per calendar month touched by the period. A monthly period range
# (rather than a month-start date range) keeps the first month even when
# `start` is not the 1st of the month.
dt_range = pd.period_range(start, end, freq="M")

dfs = []

for dt in dt_range:
    # The year/month query parameters must come from the loop variable;
    # bare `year` / `month` names are not defined anywhere in this script.
    url = f"https://www.data.jma.go.jp/obd/stats/etrn/view/daily_a1.php?prec_no={prec_no}&block_no={block_no}&year={dt.year}&month={dt.month}&day=&view="

    # Take the first table found on the page
    tmp = pd.read_html(url)[0]

    # Collapse each multi-level column label into one string, dropping
    # repeated level names while keeping their first-seen order
    tmp.columns = ["_".join(dict.fromkeys(col)) for col in tmp.columns.values]

    # Tag every row with the month it was fetched for
    tmp["年"] = dt.year
    tmp["月"] = dt.month

    dfs.append(tmp)

    # Wait one second between requests
    time.sleep(1)

# Concatenate the monthly tables
df = pd.concat(dfs).reset_index(drop=True)

# Write to CSV (UTF-8 with BOM)
df.to_csv("result.csv", encoding="utf_8_sig")
10分ごと
import time
import pandas as pd
# Fetch the 10-minute observation tables for every day in a period, one page
# per day, merge them, and replace the "時分" clock column with a full
# timestamp column named "日時" before writing a CSV.

# Yokohama
prec_no = 46
block_no = 47670

# Period (inclusive)
start = "2021/06/22"
end = "2021/06/30"

# One timestamp per day in the period
dt_range = pd.date_range(start, end, freq="D")

dfs = []

for dt in dt_range:
    url = f"https://www.data.jma.go.jp/stats/etrn/view/10min_a1.php?prec_no={prec_no}&block_no={block_no}&year={dt.year}&month={dt.month}&day={dt.day}&view="

    # Take the first table found on the page
    tmp = pd.read_html(url)[0]

    # Collapse each multi-level column label into one string, dropping
    # repeated level names while keeping their first-seen order
    tmp.columns = ["_".join(dict.fromkeys(col)) for col in tmp.columns.values]

    # Tag every row with the day it was fetched for
    tmp["year"] = dt.year
    tmp["month"] = dt.month
    tmp["day"] = dt.day

    dfs.append(tmp)

    # Wait one second between requests
    time.sleep(1)

# Concatenate the daily tables
df = pd.concat(dfs).reset_index(drop=True)

# Split "H:M" strings into integer hour (column 0) and minute (column 1)
hour_minute = df["時分"].str.split(":", expand=True).astype(int)

# Build the timestamp as date + elapsed time, vectorized over the whole column.
# Adding a timedelta (instead of parsing a clock time) lets an hour value past
# 23 carry over into the following day.
df["時分"] = (
    pd.to_datetime(df[["year", "month", "day"]])
    + pd.to_timedelta(hour_minute[0], unit="h")
    + pd.to_timedelta(hour_minute[1], unit="m")
)

# The date parts are now folded into the timestamp column
df = df.drop(columns=["year", "month", "day"]).rename(columns={"時分": "日時"})

# Write to CSV (UTF-8 with BOM)
df.to_csv("result.csv", encoding="utf_8_sig")