Login / Signup
0
0

More than 3 years have passed since last update.

tsukuba-kdbのcsvからjson作成

Posted at
import pathlib
import json

import pandas as pd
import requests

# Download
def fetch_file(url, dir=".", *, timeout=30):
    """Download ``url`` and save it under ``dir``, returning the saved path.

    The local file name is the final component of ``url`` as parsed by
    ``pathlib.PurePath``; the whole response body is written in binary mode.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; it is created (with parents) if it
            does not exist yet.
        timeout: Seconds to wait for the server, passed to
            ``requests.get``. Without a timeout a stalled server would
            block the script indefinitely.

    Returns:
        The ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(url, timeout=timeout)
    # Fail before writing anything so an error page is never saved as data.
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p


url = "https://github.com/Make-IT-TSUKUBA/alternative-tsukuba-kdb/raw/master/kdb_20210404.csv"

csv_path = fetch_file(url)

# Skip the first three rows and read without a header row, so the columns
# keep their positional integer labels; only the listed positions are loaded.
df = pd.read_csv(
    csv_path,
    skiprows=3,
    header=None,
    usecols=[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 19],
)

# Drop rows whose first column (column A) is empty
df.dropna(subset=[0], inplace=True)

# The row above each "科目番号" header row holds a title: shift column 0 down
# by one, keep the shifted value only on header rows, then forward-fill it
# into the rows that follow. Series.ffill() is used instead of
# fillna(method="ffill"), which newer pandas versions deprecate.
df[20] = df[0].shift().where(df[0] == "科目番号").ffill()

# Count the non-missing cells per row (sanity check for the threshold below).
# The result is discarded here; it is only shown in an interactive session.
df.notnull().sum(axis=1).value_counts()

# Remove the "科目番号" header rows and rows with fewer than 5 non-missing
# values, then replace the remaining missing values with empty strings.
# Reassigning (rather than filling in place) avoids mutating a frame that was
# derived from a filtered selection of df.
df1 = df[df[0] != "科目番号"].dropna(thresh=5)
df1 = df1.fillna("")


def _strip_lines(text):
    """Strip ``text``, then strip each of its lines and rejoin with newlines."""
    return "\n".join(line.strip() for line in text.strip().splitlines())


# Remove the whitespace around line breaks in columns 10 and 11
for col in (10, 11):
    df1[col] = df1[col].apply(_strip_lines)

# Each row list holds the 14 loaded CSV columns followed by the title column
# (label 20). Position 13 is the last loaded CSV column (label 19); it is
# dropped from a row when its value is empty/falsy.
result = [
    row if row[13] else row[:13] + row[14:]
    for row in df1.values.tolist()
]

# Convert to JSON
with open("kdb.json", "w", encoding="utf-8") as fp:
    json.dump(result, fp, indent="\t", ensure_ascii=False)
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
0