More than 3 years have passed since last update.

【Python】Altair で様々な散布図を作成する

Posted at 2021-12-05

概要

本稿ではグラフ可視化ライブラリ Altair で作成可能な散布図をいくつか紹介する。この作成例を参考に、様々な散布図を作成してみよう。

参考

散布図以外にも Altair では様々な Figure を作成できる。まずはこちらの記事を参考にしよう。

データの作成

前稿と同様、Figure 作成用のテストデータとして架空の学校で行われた期末試験の得点を使用する。この学校には学生が 300 人在籍し、普通、特進、理数の 3 クラスが存在する。期末試験の科目は国語、数学、理科、社会、英語で各教科 100 点満点とする。

デモデータ作成

import numpy as np
import pandas as pd

np.random.seed(1)  # fix the seed so the demo data is reproducible

n = 300  # number of students

# Label tables; the integer codes drawn below are used as indices into these.
sex = ['男', '女']  # gender
cl = ['普通', '理数', '特進']  # class
sub = ['国語', '数学', '理科', '社会', '英語']  # subjects

# NOTE: the generated values depend on the order of the np.random calls, so
# the draws stay in this order: ability, class, gender, then one uniform
# draw per subject.
s = np.random.normal(55, 10, n)  # academic ability (score) of each student
c = np.random.randint(0, 3, n)  # class code
s = s * (1 + 0.015 * c)  # add an ability gap between the classes
g = np.random.randint(0, 2, n)  # gender code

# Score generation: per subject, the (lower, upper) bounds of the random
# multiplier and the signed gender effect, listed in the same order as `sub`.
subject_params = [
    (0.75, 1.1, 0.02),
    (0.9, 1.1, -0.05),
    (0.9, 1.05, 0.03),
    (0.9, 1.2, -0.02),
    (0.8, 1.1, 0.01),
]
s1, s2, s3, s4, s5 = [
    np.random.uniform(low, high, n) * s * (1 + g * effect)
    for low, high, effect in subject_params
]

df = pd.DataFrame()
df['学生番号'] = [f'ID{number:03d}' for number in range(1, n + 1)]
# Scores are stored as integers rounded from the raw float values.
for subject, raw_scores in zip(sub, (s1, s2, s3, s4, s5)):
    df[subject] = [round(value) for value in raw_scores]
df['合計'] = df[sub].sum(axis=1)  # total over the five subjects
df['クラス'] = [cl[code] for code in c]
df['性別'] = [sex[code] for code in g]
print(df.head(10))

df.head(10)

学生番号  国語  数学  理科  社会  英語   合計 クラス 性別
0  ID001  65  68  68  72  76  349  普通  男
1  ID002  48  52  49  56  47  252  普通  男
2  ID003  52  45  50  49  45  241  普通  女
3  ID004  48  39  46  45  39  217  普通  女
4  ID005  52  62  71  68  63  316  特進  女
5  ID006  27  31  32  32  33  155  特進  女
6  ID007  74  63  77  80  78  372  普通  女
7  ID008  53  48  48  52  50  251  特進  男
8  ID009  58  55  60  58  55  286  特進  女
9  ID010  58  53  48  63  48  270  理数  男

各種設定

import altair as alt
from altair_saver import save

# Figure size (the plotting area is square)
width = height = 300

# Colors used in the figures
color_lst = ['steelblue', 'darkorange']

# Values shown on the axes: 0 to 100 in steps of 20
values_lst = list(range(0, 101, 20))

周辺分布をヒストグラムで表示させた散布図

ヒストグラムと散布図

# Build the figure
# Radio buttons listing the class names, bound to a single-value selection on
# the class column; the first class is selected initially.
class_radio = alt.binding_radio(options=cl)
class_select = alt.selection_single(
    fields=['クラス'],
    bind=class_radio,
    name="class",
    init={'クラス': cl[0]},
)

# All three panels derive from this base chart, so each one carries the
# selection and is filtered by it.
base = alt.Chart(df).add_selection(class_select).transform_filter(class_select)

# Keyword arguments shared by several encodings below
axis_font = dict(labelFontSize=15, titleFontSize=18)
gender_scale = dict(domain=sex, range=color_lst)
score_bins = dict(step=10, extent=[0, 100])

# Main panel: scatter plot of the two subject scores
scatter = base.mark_circle(size=30).encode(
    x=alt.X(
        '国語',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='国語の得点', **axis_font),
    ),
    y=alt.Y(
        '数学',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='数学の得点', **axis_font),
    ),
    color=alt.Color('性別', scale=alt.Scale(**gender_scale)),
    tooltip=['国語', '数学'],
).properties(width=width, height=height)

# Top panel: histogram of the x variable; its own x axis is hidden and its
# height is one third of the scatter plot.
hist_x = base.mark_bar().encode(
    x=alt.X("国語", bin=alt.Bin(**score_bins), axis=None),
    y=alt.Y('count(国語)', axis=alt.Axis(title='人数', **axis_font)),
    color=alt.Color('性別', scale=alt.Scale(**gender_scale)),
).properties(width=width, height=height // 3)

# Right panel: histogram of the y variable, drawn with horizontal bars; its
# width is one third of the scatter plot.
hist_y = base.mark_bar().encode(
    y=alt.Y("数学", bin=alt.Bin(**score_bins), axis=None),
    x=alt.X('count(数学)', axis=alt.Axis(title='人数', **axis_font)),
    color=alt.Color('性別', scale=alt.Scale(**gender_scale)),
).properties(width=width // 3, height=height)

# Save the figure: the x histogram on top, scatter and y histogram side by
# side underneath.
save(
    alt.vconcat(hist_x, alt.hconcat(scatter, hist_y)),
    'histgram_scatter.html',
    embed_options={'actions':True},
)

【注意点1】 今回はヒストグラムの厚みが散布図のサイズの 1/3 になるように指定している。properties から変更可能。
【注意点2】 ヒストグラムの軸は、散布図にあわせて指定すること。X 軸と Y 軸を混同しやすい。
【ポイント】 ラジオボタンで普通・理数・特進の切り替えができる。

単回帰分析と散布図

Altair ではパラメトリックモデルおよびノンパラメトリックモデルの単回帰分析が可能である。

パラメトリック回帰

# Keyword arguments shared by the axis and legend definitions below
label_font = dict(labelFontSize=15, titleFontSize=18)

# Scatter plot of the two subject scores, colored by gender
scatter = alt.Chart(df).mark_circle(size=30).encode(
    x=alt.X(
        '国語',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='国語の得点', **label_font),
    ),
    y=alt.Y(
        '数学',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='数学の得点', **label_font),
    ),
    color=alt.Color(
        '性別',
        scale=alt.Scale(domain=sex, range=color_lst),
        legend=alt.Legend(**label_font),
    ),
).properties(width=width, height=height)

# Parametric regression: a polynomial fit of order 1, computed separately
# for each gender and drawn as a line.
regression_line = scatter.transform_regression(
    '国語', '数学', method="poly", order=1, groupby=['性別']
).mark_line(shape='mark')

# Layer the fitted lines over the points, then split into one column per class.
regress = alt.layer(scatter, regression_line).facet(
    column=alt.Column('クラス', header=alt.Header(**label_font))
)

save(regress, 'scatter_regression.html', embed_options={'actions':True})

【ポイント1】 transform_regression を用いることで様々なパラメトリックモデルの単回帰分析ができる。ただし poly を用いるときは多項式の次数を order で指定する。order=1 の場合 `linear` と同じ一次関数の単回帰分析が行われる。

method="XXXXXXX"	Model
linear	`y = a + b * x`
log	`y = a + b * log(x)`
exp	`y = a + exp(b * x)`
pow	`y = a * x^b`
quad	`y = a + b * x + c * x^2`
poly	`y = a + b * x + … + k * x^order`

【ポイント2】 transform_regression では groupby を用いて男女別々に回帰している。

【ポイント3】 ノンパラメトリック回帰には transform_loess を用いる。

ノンパラメトリック回帰

# Non-parametric regression: a LOESS smoothing line computed separately for
# each gender, layered on top of the scatter plot.
loess_line = scatter.transform_loess(
    '国語', '数学', groupby=['性別']
).mark_line(shape='mark')
regress = alt.layer(scatter, loess_line)

ヒストグラムと連動した散布図

合計得点の分布と国語・数学の得点分布を対応させた Figure を作成する。

ヒストグラムと連動した散布図

# Interval selection restricted to the x encoding; it is attached to both the
# scatter plot and the histogram background below.
brush = alt.selection(type="interval", encodings=['x'])

# Keyword arguments shared by the axis and legend definitions below
label_font = dict(labelFontSize=15, titleFontSize=18)
gender_scale = dict(domain=sex, range=color_lst)

# Left panel: scatter plot. Points inside the brush are colored by gender,
# the others are drawn in light gray.
points = alt.Chart().mark_point(filled=True, color="black").encode(
    x=alt.X(
        '国語',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='国語の得点', **label_font),
    ),
    y=alt.Y(
        '数学',
        scale=alt.Scale(domain=[0, 100]),
        axis=alt.Axis(values=values_lst, title='数学の得点', **label_font),
    ),
    color=alt.condition(
        brush,
        alt.Color(
            '性別:N',
            scale=alt.Scale(**gender_scale),
            legend=alt.Legend(**label_font),
        ),
        alt.value("#ddd"),
    ),
).add_selection(brush).properties(width=300, height=300)

# Right panel: histogram of the total score in bins of 50 points
base = alt.Chart().mark_bar().encode(
    x=alt.X(
        '合計:Q',
        bin=alt.Bin(step=50, extent=[0, 500]),
        scale=alt.Scale(domain=[0, 500]),
        axis=alt.Axis(title='合計', **label_font),
    ),
    y=alt.Y('count()', axis=alt.Axis(title='人数', **label_font)),
).properties(width=300, height=300)

# Gray background bars showing all records; also carries the selection
background = base.encode(color=alt.value('#ddd')).add_selection(brush)

# Gender-colored bars computed only from the records inside the brush
highlight = base.transform_filter(brush).encode(
    color=alt.Color(
        '性別:N',
        scale=alt.Scale(**gender_scale),
        legend=alt.Legend(**label_font),
    )
)

hist = alt.layer(background, highlight)

# Build the chart: both panels side by side; the data is supplied here
# because the panels were created without any.
chart = alt.hconcat(points, hist, data=df)

save(chart, 'histgram_scatter_interactive.html', embed_options={'actions':True})

【ポイント1】 例えば下図では,ヒストグラムで合計得点 300~350 点を選択して、その得点帯にいる人のプロットを散布図でハイライトさせている。

【ポイント2】 一方で下図では,散布図で国語の得点 40~60 点を選択して、その得点帯にいる人の合計得点をヒストグラムでハイライトさせている。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up