Scraping the 2022 House of Councillors Election Candidate Survey

Posted at 2022-07-10

Scrape the candidate survey that NHK published for the 2022 House of Councillors (sangiin) election from its JSON endpoints, and reshape the answers into one wide table with one row per candidate and one column per question.
import requests
import pandas as pd

# Browser-like User-Agent sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def get_json(url):
    """Fetch a URL and return the parsed JSON, raising on HTTP errors."""
    r = requests.get(url, headers=headers)
    r.raise_for_status()

    return r.json()
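
Not in the original article: if one of the many requests made later (one per electoral district) fails on a transient error, a small retry wrapper around get_json avoids restarting the whole scrape. The retry count and wait time below are arbitrary assumptions.

import time


def get_json_with_retry(url, retries=3, wait=2.0):
    # Retry transient network/HTTP errors a few times before giving up
    for attempt in range(retries):
        try:
            return get_json(url)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(wait)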


def get_questions():
    """Fetch the question master (questions.json) as a DataFrame indexed by question id."""
    url = (
        "https://www.nhk.or.jp/senkyo-data/database/sangiin/2022/survey/questions.json"
    )

    data = get_json(url)

    dfs = []

    # Flatten each question entry into tabular form
    for _, v in data.items():

        tmp = pd.json_normalize(v)
        dfs.append(tmp)

    df = pd.concat(dfs).set_index("id")

    # Drop columns that are not needed for the final table
    df.drop(["type", "selects"], axis=1, inplace=True)

    return df


df_questions = get_questions()
df_questions


# Electoral districts
def get_senkyoku():
    """Fetch every electoral-district candidate's answers, one row per candidate and question."""
    # The district index (sindex.csv) lists the electoral districts; each senkyokuID
    # is used below to build the per-district survey URL
    df_senkyoku = pd.read_csv(
        "https://www.nhk.or.jp/senkyo-data/database/sangiin/2022/00/search/sindex.csv",
        names=[
            "typeID",
            "senkyoID",
            "senkyo_name",
            "senkyokuID",
            "senkyoku_name",
            "todoufukenID",
        ],
        storage_options={"User-Agent": "Mozilla/5.0"},
        header=None,
    )

    dfs = []

    for _, row in df_senkyoku.iterrows():

        url = f'https://www.nhk.or.jp/senkyo-data/database/sangiin/2022/survey/{row["senkyokuID"]}.json'

        data = get_json(url)

        # One row per entry in the candidate's "qa" list, with the candidate
        # attributes repeated as meta columns
        tmp = pd.json_normalize(
            data,
            meta=[
                "candidateID",
                "lastname",
                "firstname",
                "sex",
                "touha",
                "touha_name",
                "age",
                "senkyokuID",
                "senkyoku_name",
                "todoufukenID",
                "answerDatetime",
            ],
            record_path="qa",
        )

        dfs.append(tmp)

    df = pd.concat(dfs).reset_index(drop=True)

    return df


# Proportional representation
def get_hirei():
    """Fetch the proportional-representation candidates' answers, one row per candidate and question."""
    url = "https://www.nhk.or.jp/senkyo-data/database/sangiin/2022/survey/hirei.json"

    data = get_json(url)

    # Same flattening as the district data: one row per "qa" entry
    df = pd.json_normalize(
        data,
        meta=[
            "candidateID",
            "lastname",
            "firstname",
            "sex",
            "touha",
            "touha_name",
            "age",
            "senkyokuID",
            "senkyoku_name",
            "todoufukenID",
            "answerDatetime",
        ],
        record_path="qa",
    )

    return df


# Fetch the electoral-district answers
df_senkyoku = get_senkyoku()
df_senkyoku

# Fetch the proportional-representation answers
df_hirei = get_hirei()
df_hirei

# Combine the electoral-district and proportional-representation answers
df_sangiin = pd.concat([df_senkyoku, df_hirei])
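
Added sanity check (not in the original): confirm how many candidates and questions were captured, and that no candidate/question pair appears twice; duplicates here (for example from a district listed twice in sindex.csv) would make the pivot below fail. The column names are the ones produced by json_normalize above.

print(df_sangiin["candidateID"].nunique(), "candidates")
print(df_sangiin["id"].nunique(), "questions")
print(df_sangiin.duplicated(subset=["candidateID", "id"]).sum(), "duplicate candidate/question rows")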

# Attach the question text by joining on the question id
df0 = df_sangiin.join(df_questions, on="id")

# "answers" is list-like; keep only the first element
df0["answers"] = df0["answers"].str[0]

df0
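
Before pivoting, the long format of df0 is convenient for quick tallies, for example how many candidates chose each answer to each question. A small sketch using the columns built above; the values in "answers" are whatever codes the survey JSON uses.

# Count candidates per question and answer (long format)
answer_counts = (
    df0.groupby(["id", "question", "answers"])
    .size()
    .rename("count")
    .reset_index()
)
answer_counts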

# Wide format: one row per candidate, one column per question
df1 = df0.pivot(
    index=["todoufukenID", "candidateID", "lastname", "firstname"],
    columns=["id", "question"],
    values="answers",
)

df1.to_csv("result.csv")
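
If the CSV is meant to be opened in Excel, writing UTF-8 with a BOM keeps the Japanese text from being garbled; this is an optional tweak, not in the original.

# Excel-friendly output: UTF-8 with BOM
df1.to_csv("result.csv", encoding="utf_8_sig")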