ベタな方法
ykishi@dezembro issues-90 % cat get-data.txt
wget https://opendata.corona.go.jp/api/Covid19JapanAll -O Covid19JapanAll.json
ykishi@dezembro issues-90 % cat covid19.py
#!/usr/bin/env python3
import requests
import json
import pandas as pd
csv_filename = "the_latest_covid19.csv"
# ===========================================================
# 空のdataframeを作成
# ===========================================================
colnames = ["ymd", "pref", "count"]
df = pd.DataFrame([], columns=colnames)
url = requests.get("https://opendata.corona.go.jp/api/Covid19JapanAll")
text = url.text
data = json.loads(text)
print(type(data))
print(list(enumerate(data)))
print(type(data['itemList']))
print(len(data['itemList']))
i = 0
for item in data['itemList']:
i += 1
if i % 1000 == 0:
print('{n:>16,}'.format(n=i) + " records processed ...")
temp = pd.Series([
item['date'],
item['name_jp'],
item['npatients']
],
index=df.columns)
# print(temp)
df = df.append(temp, ignore_index=True)
# ---------------------------------
# Output into a CSV file
# ---------------------------------
print(df)
df.to_csv(csv_filename)
ykishi@dezembro issues-90 % ./covid19.py
<class 'dict'>
[(0, 'errorInfo'), (1, 'itemList')]
<class 'list'>
20680
1,000 records processed ...
2,000 records processed ...
3,000 records processed ...
4,000 records processed ...
5,000 records processed ...
6,000 records processed ...
7,000 records processed ...
8,000 records processed ...
9,000 records processed ...
10,000 records processed ...
11,000 records processed ...
12,000 records processed ...
13,000 records processed ...
14,000 records processed ...
15,000 records processed ...
16,000 records processed ...
17,000 records processed ...
18,000 records processed ...
19,000 records processed ...
20,000 records processed ...
ymd pref count
0 2021-07-05 北海道 41495
1 2021-07-05 青森県 2521
2 2021-07-05 岩手県 1730
3 2021-07-05 宮城県 9191
4 2021-07-05 秋田県 908
... ... ... ...
20675 2020-04-22 熊本県 41
20676 2020-04-22 大分県 60
20677 2020-04-22 宮崎県 17
20678 2020-04-22 鹿児島県 10
20679 2020-04-22 沖縄県 130
[20680 rows x 3 columns]
ykishi@dezembro issues-90 %
json_normalize() を使う方法
こちらのほうが簡潔:
#!/usr/bin/env python3
import requests
import json
import pandas as pd
from pandas import json_normalize
url = requests.get("https://opendata.corona.go.jp/api/Covid19JapanAll")
text = url.text
data = json.loads(text)
print(len(data['itemList']))
df = json_normalize(data['itemList'])
print(df)
結果:
24769
date name_jp npatients
0 2021-09-30 北海道 60238
1 2021-09-30 青森県 5693
2 2021-09-30 岩手県 3479
3 2021-09-30 宮城県 16204
4 2021-09-30 秋田県 1862
... ... ... ...
24764 2020-04-22 熊本県 41
24765 2020-04-22 大分県 60
24766 2020-04-22 宮崎県 17
24767 2020-04-22 鹿児島県 10
24768 2020-04-22 沖縄県 130
[24769 rows x 3 columns]
EXCELあるいはSQLiteデータベースに保存する
% cat covid19_test.py
#!/usr/bin/env python3
import requests
import json
import pandas as pd
from pandas import json_normalize
from sqlalchemy import create_engine
import sys
import os
import datetime
dbname = "covid19.db"
xlsx_filename = "covid19.xlsx"
url = "https://opendata.corona.go.jp/api/Covid19JapanAll"
start_time = datetime.datetime.today()
response = requests.get(url)
end_time = datetime.datetime.today()
print(
"It took",
str(datetime.datetime.today() - start_time),
"to retrieve data from " + url,
)
text = response.text
data = json.loads(text)
print(len(data["itemList"]))
df = json_normalize(data["itemList"])
print(df)
print(df.columns)
# EXCELに保存
df.to_excel(xlsx_filename, index=False)
# SQLiteに保存
engine = create_engine("sqlite:///" + dbname, echo=False)
df.to_sql("japan_all", engine, if_exists="replace")
内容確認:
$ echo ".schema" | sqlite3 covid19.db
CREATE TABLE df (
"index" BIGINT,
date TEXT,
name_jp TEXT,
npatients TEXT
);
CREATE INDEX ix_df_index ON df ("index");
$ sqlite3 covid19.db << EOF
> .header on
> select max(date), min(date) from df;
> EOF
max(date)|min(date)
2021-10-01|2020-04-22
DBの中身を確認する
% sqlite3 covid19.db
SQLite version 3.32.3 2020-06-18 14:16:19
Enter ".help" for usage hints.
sqlite> .header on
sqlite> .tables
japan_all
sqlite> vacuum;
sqlite> select * from japan_all where name_jp = '北海道' order by date limit 10;
index|date|name_jp|npatients
24769|2020-04-22|北海道|495
24722|2020-04-23|北海道|540
24675|2020-04-24|北海道|562
24628|2020-04-25|北海道|601
24581|2020-04-26|北海道|615
24534|2020-04-27|北海道|650
24487|2020-04-28|北海道|688
24440|2020-04-29|北海道|726
24393|2020-04-30|北海道|767
24346|2020-05-01|北海道|790
sqlite> ^D