第24回
Seaborn
import seaborn as sns
%matplotlib inline
import pandas as pd
df = pd.read_csv('train.csv')
df = df.dropna(subset=['Age'])
sns.distplot(df['Age'])
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcdd7e3d0>
sns.set()
sns.distplot(df['Age'],bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcdd05850>
sns.jointplot()で2変数の分布を見る
sns.jointplot(x='Age', y='Fare', data=df)
<seaborn.axisgrid.JointGrid at 0x7fcbcdbe31d0>
sns.jointplot(x='Age', y='Fare', data=df, kind='hex')
<seaborn.axisgrid.JointGrid at 0x7fcbcdb2b890>
(重要)sns.pairplot()で複数のカラムの分布を一発で表示
sns.pairplot(df[['Age', 'Fare', 'Pclass', 'Survived']], hue='Survived', kind='scatter', plot_kws={'alpha': 0.5})
/opt/anaconda3/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:487: RuntimeWarning: invalid value encountered in true_divide
binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
/opt/anaconda3/lib/python3.7/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
FAC1 = 2*(np.pi*bw/RANGE)**2
<seaborn.axisgrid.PairGrid at 0x7fcbcda34990>
第25回
sns.barplot() 「棒グラフ」を作る
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
df = pd.read_csv('train.csv')
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
sns.barplot(x='Survived', y='Age', data=df) #平均値
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcdd7e390>
sns.barplot(x='Survived', y='Age', data=df, estimator=np.median) #中央値
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcce3a190>
sns.countplot() データ数を比較
sns.countplot(x='Sex', data=df, hue='Survived')
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbccfb0c10>
sns.boxplot() カテゴリ別に値を比較する
sns.boxplot(x='Pclass', y='Age', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbccd82750>
sns.boxplot(x='Pclass', y='Age', data=df, hue='Survived')
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcc2a4ed0>
sns.violonplot() データの分析を可視化できる
sns.violinplot(x='Pclass', y='Age', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcc1e3ad0>
sns.violinplot(x='Pclass', y='Age', data=df, hue=('Survived'))
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcc107210>
sns.swarmplot() 本当の分布を確認できる
sns.swarmplot(x='Pclass', y='Age', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcc19a710>
sns.swarmplot(x='Pclass', y='Age', data=df, size=4, hue='Survived')
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc6e9fe50>
第26回
Heatmapを描写する
df.corr()で相関関係を作る
import pandas as pd
df = pd.read_csv('train.csv')
corr = df.corr()
corr
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
PassengerId | 1.000000 | -0.005007 | -0.035144 | 0.036847 | -0.057527 | -0.001652 | 0.012658 |
Survived | -0.005007 | 1.000000 | -0.338481 | -0.077221 | -0.035322 | 0.081629 | 0.257307 |
Pclass | -0.035144 | -0.338481 | 1.000000 | -0.369226 | 0.083081 | 0.018443 | -0.549500 |
Age | 0.036847 | -0.077221 | -0.369226 | 1.000000 | -0.308247 | -0.189119 | 0.096067 |
SibSp | -0.057527 | -0.035322 | 0.083081 | -0.308247 | 1.000000 | 0.414838 | 0.159651 |
Parch | -0.001652 | 0.081629 | 0.018443 | -0.189119 | 0.414838 | 1.000000 | 0.216225 |
Fare | 0.012658 | 0.257307 | -0.549500 | 0.096067 | 0.159651 | 0.216225 | 1.000000 |
sns.heatmap()でHeatmapをplot
sns.heatmap(corr)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc6df9850>
sns.heatmap(corr, cmap='coolwarm', annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbcd0b3290>
データを俯瞰するときに役立つsns.heatmap()
flights = sns.load_dataset('flights')
print(len(flights))
flights.head()
144
year | month | passengers | |
---|---|---|---|
0 | 1949 | January | 112 |
1 | 1949 | February | 118 |
2 | 1949 | March | 132 |
3 | 1949 | April | 129 |
4 | 1949 | May | 121 |
# pivot_tableを作成
flights_pivot = flights.pivot_table(index='month', columns='year', values='passengers')
flights_pivot
year | 1949 | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | 1958 | 1959 | 1960 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
month | ||||||||||||
January | 112 | 115 | 145 | 171 | 196 | 204 | 242 | 284 | 315 | 340 | 360 | 417 |
February | 118 | 126 | 150 | 180 | 196 | 188 | 233 | 277 | 301 | 318 | 342 | 391 |
March | 132 | 141 | 178 | 193 | 236 | 235 | 267 | 317 | 356 | 362 | 406 | 419 |
April | 129 | 135 | 163 | 181 | 235 | 227 | 269 | 313 | 348 | 348 | 396 | 461 |
May | 121 | 125 | 172 | 183 | 229 | 234 | 270 | 318 | 355 | 363 | 420 | 472 |
June | 135 | 149 | 178 | 218 | 243 | 264 | 315 | 374 | 422 | 435 | 472 | 535 |
July | 148 | 170 | 199 | 230 | 264 | 302 | 364 | 413 | 465 | 491 | 548 | 622 |
August | 148 | 170 | 199 | 242 | 272 | 293 | 347 | 405 | 467 | 505 | 559 | 606 |
September | 136 | 158 | 184 | 209 | 237 | 259 | 312 | 355 | 404 | 404 | 463 | 508 |
October | 119 | 133 | 162 | 191 | 211 | 229 | 274 | 306 | 347 | 359 | 407 | 461 |
November | 104 | 114 | 146 | 172 | 180 | 203 | 237 | 271 | 305 | 310 | 362 | 390 |
December | 118 | 140 | 166 | 194 | 201 | 229 | 278 | 306 | 336 | 337 | 405 | 432 |
sns.heatmap(flights_pivot)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc5baabd0>
第27回
sns.set()で基本styleを変更
context引数で用途を指定
import pandas as pd
import seaborn as sns
%matplotlib inline
df = pd.read_csv('train.csv')
sns.set(context=('poster'))
df = df.dropna(subset=['Age'])
sns.distplot(df['Age'])
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc568f690>
style引数でグラフ全体のstyleを指定
sns.set_style(style='whitegrid') #背景色を変更
sns.distplot(df['Age'])
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc563d850>
palette引数で色を指定
sns.set(palette='bright')
sns.violinplot(x='Pclass', y='Age', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc5472850>
sns.despine()で軸や枠を落とす
sns.set(palette='bright' ,style='ticks') ##style='ticks'でplotに軸
sns.violinplot(x='Pclass', y='Age', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc53ed810>
sns.set(palette='bright' )
sns.violinplot(x='Pclass', y='Age', data=df)
sns.despine()
matplotlibと同様pltモジュールでいろいろなことができる
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
sns.distplot(df['Age'])
<matplotlib.axes._subplots.AxesSubplot at 0x7fcbc52d8750>
sns.distplot(df['Age'])
plt.savefig('seaborn_sample.png')