# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the numeric (major, minor) components: comparing the raw version
# strings is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
assert tuple(int(part) for part in sklearn.__version__.split(".")[:2]) >= (0, 20)
# Common imports
import numpy as np
import os
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    The file is named ``<fig_id>.<fig_extension>``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            the figure is written.
        fig_extension: File extension, also passed to ``savefig`` as the
            output format.
        resolution: Value forwarded to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    The archive is stored as ``housing.tgz`` inside ``housing_path`` and then
    extracted into that same directory.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created if it does not exist yet.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point this function at a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
データ構造を見てみる
housing = load_housing_data()
housing.head()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
housing['ocean_proximity'].value_counts()
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: ocean_proximity, dtype: int64
housing.describe()
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20,15))
plt.show()
- 単位は何か気にすること
- 値に上限・下限が設定されている(キャップされている)属性は、ヒストグラムの端のビンだけが極端に大きくなっている → 今回は住宅価格の中央値と築年数の中央値がそれに該当する
テストセットを作る
import numpy as np
def split_tarin_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Rows are selected by position from a random permutation drawn with
    ``np.random.permutation``, so the result depends on NumPy's global
    random state.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (called with a pandas DataFrame in this notebook).
        test_ratio: Fraction of the rows assigned to the test part; the
            resulting row count is truncated to an integer.

    Returns:
        A ``(train, test)`` tuple of ``data.iloc[...]`` selections.
    """
    n_rows = len(data)
    # Random ordering of the row positions 0 .. n_rows - 1.
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_rows = data.iloc[order[:n_test]]
    train_rows = data.iloc[order[n_test:]]
    return train_rows, test_rows
train_set, test_set = split_tarin_test(housing, 0.2)
len(train_set)
16512
len(test_set)
4128
train_set
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
14196 | -117.03 | 32.71 | 33.0 | 3126.0 | 627.0 | 2300.0 | 623.0 | 3.2596 | 103000.0 | NEAR OCEAN |
8267 | -118.16 | 33.77 | 49.0 | 3382.0 | 787.0 | 1314.0 | 756.0 | 3.8125 | 382100.0 | NEAR OCEAN |
17445 | -120.48 | 34.66 | 4.0 | 1897.0 | 331.0 | 915.0 | 336.0 | 4.1563 | 172600.0 | NEAR OCEAN |
14265 | -117.11 | 32.69 | 36.0 | 1421.0 | 367.0 | 1418.0 | 355.0 | 1.9425 | 93400.0 | NEAR OCEAN |
2271 | -119.80 | 36.78 | 43.0 | 2382.0 | 431.0 | 874.0 | 380.0 | 3.5542 | 96500.0 | INLAND |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11284 | -117.96 | 33.78 | 35.0 | 1330.0 | 201.0 | 658.0 | 217.0 | 6.3700 | 229200.0 | <1H OCEAN |
11964 | -117.43 | 34.02 | 33.0 | 3084.0 | 570.0 | 1753.0 | 449.0 | 3.0500 | 97800.0 | INLAND |
5390 | -118.38 | 34.03 | 36.0 | 2101.0 | 569.0 | 1756.0 | 527.0 | 2.9344 | 222100.0 | <1H OCEAN |
860 | -121.96 | 37.58 | 15.0 | 3575.0 | 597.0 | 1777.0 | 559.0 | 5.7192 | 283500.0 | <1H OCEAN |
15795 | -122.42 | 37.77 | 52.0 | 4226.0 | 1315.0 | 2619.0 | 1242.0 | 2.5755 | 325000.0 | NEAR BAY |
16512 rows × 10 columns
この状態だと、関数を実行するたびに異なるテストセットが作成されてしまい、繰り返し実行するうちに(自分も機械学習アルゴリズムも)データセット全体を見てしまうことになる。
これではテストセットを分けた意味がなくなるので、乱数シードを固定するなどして分割を再現可能にする必要がある。
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
len(train_set)
16512
len(test_set)
4128
train_set
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
---|---|---|---|---|---|---|---|---|---|---|
14196 | -117.03 | 32.71 | 33.0 | 3126.0 | 627.0 | 2300.0 | 623.0 | 3.2596 | 103000.0 | NEAR OCEAN |
8267 | -118.16 | 33.77 | 49.0 | 3382.0 | 787.0 | 1314.0 | 756.0 | 3.8125 | 382100.0 | NEAR OCEAN |
17445 | -120.48 | 34.66 | 4.0 | 1897.0 | 331.0 | 915.0 | 336.0 | 4.1563 | 172600.0 | NEAR OCEAN |
14265 | -117.11 | 32.69 | 36.0 | 1421.0 | 367.0 | 1418.0 | 355.0 | 1.9425 | 93400.0 | NEAR OCEAN |
2271 | -119.80 | 36.78 | 43.0 | 2382.0 | 431.0 | 874.0 | 380.0 | 3.5542 | 96500.0 | INLAND |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11284 | -117.96 | 33.78 | 35.0 | 1330.0 | 201.0 | 658.0 | 217.0 | 6.3700 | 229200.0 | <1H OCEAN |
11964 | -117.43 | 34.02 | 33.0 | 3084.0 | 570.0 | 1753.0 | 449.0 | 3.0500 | 97800.0 | INLAND |
5390 | -118.38 | 34.03 | 36.0 | 2101.0 | 569.0 | 1756.0 | 527.0 | 2.9344 | 222100.0 | <1H OCEAN |
860 | -121.96 | 37.58 | 15.0 | 3575.0 | 597.0 | 1777.0 | 559.0 | 5.7192 | 283500.0 | <1H OCEAN |
15795 | -122.42 | 37.77 | 52.0 | 4226.0 | 1315.0 | 2619.0 | 1242.0 | 2.5755 | 325000.0 | NEAR BAY |
16512 rows × 10 columns
層化抽出法
housing['income_cat'] = pd.cut(housing['median_income'],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1,2,3,4,5])
housing['income_cat'].hist()
<AxesSubplot:>
from sklearn.model_selection import StratifiedShuffleSplit
# Single shuffled 80/20 split, stratified on the income_cat column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    # split() yields positional row indices, so select with .iloc; .loc only
    # returns the same rows while the index labels happen to equal positions.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
strat_test_set['income_cat'].value_counts() / len(strat_test_set)
3 0.350533
2 0.318798
4 0.176357
5 0.114583
1 0.039729
Name: income_cat, dtype: float64
len(strat_test_set)
4128
len(strat_test_set['income_cat'])
4128
strat_test_set
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | income_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|
5241 | -118.39 | 34.12 | 29.0 | 6447.0 | 1012.0 | 2184.0 | 960.0 | 8.2816 | 500001.0 | <1H OCEAN | 5 |
10970 | -117.86 | 33.77 | 39.0 | 4159.0 | 655.0 | 1669.0 | 651.0 | 4.6111 | 240300.0 | <1H OCEAN | 4 |
20351 | -119.05 | 34.21 | 27.0 | 4357.0 | 926.0 | 2110.0 | 876.0 | 3.0119 | 218200.0 | <1H OCEAN | 3 |
6568 | -118.15 | 34.20 | 52.0 | 1786.0 | 306.0 | 1018.0 | 322.0 | 4.1518 | 182100.0 | INLAND | 3 |
13285 | -117.68 | 34.07 | 32.0 | 1775.0 | 314.0 | 1067.0 | 302.0 | 4.0375 | 121300.0 | INLAND | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20519 | -121.53 | 38.58 | 33.0 | 4988.0 | 1169.0 | 2414.0 | 1075.0 | 1.9728 | 76400.0 | INLAND | 2 |
17430 | -120.44 | 34.65 | 30.0 | 2265.0 | 512.0 | 1402.0 | 471.0 | 1.9750 | 134000.0 | NEAR OCEAN | 2 |
4019 | -118.49 | 34.18 | 31.0 | 3073.0 | 674.0 | 1486.0 | 684.0 | 4.8984 | 311700.0 | <1H OCEAN | 4 |
12107 | -117.32 | 33.99 | 27.0 | 5464.0 | 850.0 | 2400.0 | 836.0 | 4.7110 | 133500.0 | INLAND | 4 |
2398 | -118.91 | 36.79 | 19.0 | 1616.0 | 324.0 | 187.0 | 80.0 | 3.7857 | 78600.0 | INLAND | 3 |
4128 rows × 11 columns
strat_test_set['income_cat'].value_counts()
3 1447
2 1316
4 728
5 473
1 164
Name: income_cat, dtype: int64
strat_test_set['income_cat']
5241 5
10970 4
20351 3
6568 3
13285 3
..
20519 2
17430 2
4019 4
12107 4
2398 3
Name: income_cat, Length: 4128, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
housing['income_cat']
0 5
1 5
2 5
3 4
4 3
..
20635 2
20636 2
20637 2
20638 2
20639 2
Name: income_cat, Length: 20640, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
strat_train_set
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | income_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|
17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | 286600.0 | <1H OCEAN | 2 |
18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | 340600.0 | <1H OCEAN | 5 |
14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | 196900.0 | NEAR OCEAN | 2 |
3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | 46300.0 | INLAND | 2 |
3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | 254500.0 | <1H OCEAN | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6563 | -118.13 | 34.20 | 46.0 | 1271.0 | 236.0 | 573.0 | 210.0 | 4.9312 | 240200.0 | INLAND | 4 |
12053 | -117.56 | 33.88 | 40.0 | 1196.0 | 294.0 | 1052.0 | 258.0 | 2.0682 | 113000.0 | INLAND | 2 |
13908 | -116.40 | 34.09 | 9.0 | 4855.0 | 872.0 | 2098.0 | 765.0 | 3.2723 | 97800.0 | INLAND | 3 |
11159 | -118.01 | 33.82 | 31.0 | 1960.0 | 380.0 | 1356.0 | 356.0 | 4.0625 | 225900.0 | <1H OCEAN | 3 |
15775 | -122.45 | 37.77 | 52.0 | 3095.0 | 682.0 | 1269.0 | 639.0 | 3.5750 | 500001.0 | NEAR BAY | 3 |
16512 rows × 11 columns
データを研究、可視化して理解を深める
地理データの可視化
housing = strat_train_set.copy()
# len(housing)
# type(housing) #DataFrame
housing.plot(kind='scatter', x ='longitude', y='latitude', alpha=0.1)
<AxesSubplot:xlabel='longitude', ylabel='latitude'>
# Marker size encodes population; marker colour encodes median house value.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population']/100, label='population', figsize=(10, 7),
             c=housing['median_house_value'], cmap=plt.get_cmap('jet'),
             colorbar=True,  # boolean flag; the string 'True' only worked by being truthy
             sharex=False)
plt.legend()
<matplotlib.legend.Legend at 0x7f185a1b5820>
相関を探す
# Restrict to numeric columns before computing correlations: housing also
# holds the text column ocean_proximity and the categorical income_cat, and
# recent pandas versions no longer skip non-numeric columns silently.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
---|---|---|---|---|---|---|---|---|---|
longitude | 1.000000 | -0.924478 | -0.105848 | 0.048871 | 0.076598 | 0.108030 | 0.063070 | -0.019583 | -0.047432 |
latitude | -0.924478 | 1.000000 | 0.005766 | -0.039184 | -0.072419 | -0.115222 | -0.077647 | -0.075205 | -0.142724 |
housing_median_age | -0.105848 | 0.005766 | 1.000000 | -0.364509 | -0.325047 | -0.298710 | -0.306428 | -0.111360 | 0.114110 |
total_rooms | 0.048871 | -0.039184 | -0.364509 | 1.000000 | 0.929379 | 0.855109 | 0.918392 | 0.200087 | 0.135097 |
total_bedrooms | 0.076598 | -0.072419 | -0.325047 | 0.929379 | 1.000000 | 0.876320 | 0.980170 | -0.009740 | 0.047689 |
population | 0.108030 | -0.115222 | -0.298710 | 0.855109 | 0.876320 | 1.000000 | 0.904637 | 0.002380 | -0.026920 |
households | 0.063070 | -0.077647 | -0.306428 | 0.918392 | 0.980170 | 0.904637 | 1.000000 | 0.010781 | 0.064506 |
median_income | -0.019583 | -0.075205 | -0.111360 | 0.200087 | -0.009740 | 0.002380 | 0.010781 | 1.000000 | 0.687160 |
median_house_value | -0.047432 | -0.142724 | 0.114110 | 0.135097 | 0.047689 | -0.026920 | 0.064506 | 0.687160 | 1.000000 |
corr_matrix['median_house_value'].sort_values(ascending=False)
median_house_value 1.000000
median_income 0.687160
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population -0.026920
longitude -0.047432
latitude -0.142724
Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(frame=housing[attributes], figsize=(12, 8))
array([[<AxesSubplot:xlabel='median_house_value', ylabel='median_house_value'>,
<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>,
<AxesSubplot:xlabel='total_rooms', ylabel='median_house_value'>,
<AxesSubplot:xlabel='housing_median_age', ylabel='median_house_value'>],
[<AxesSubplot:xlabel='median_house_value', ylabel='median_income'>,
<AxesSubplot:xlabel='median_income', ylabel='median_income'>,
<AxesSubplot:xlabel='total_rooms', ylabel='median_income'>,
<AxesSubplot:xlabel='housing_median_age', ylabel='median_income'>],
[<AxesSubplot:xlabel='median_house_value', ylabel='total_rooms'>,
<AxesSubplot:xlabel='median_income', ylabel='total_rooms'>,
<AxesSubplot:xlabel='total_rooms', ylabel='total_rooms'>,
<AxesSubplot:xlabel='housing_median_age', ylabel='total_rooms'>],
[<AxesSubplot:xlabel='median_house_value', ylabel='housing_median_age'>,
<AxesSubplot:xlabel='median_income', ylabel='housing_median_age'>,
<AxesSubplot:xlabel='total_rooms', ylabel='housing_median_age'>,
<AxesSubplot:xlabel='housing_median_age', ylabel='housing_median_age'>]],
dtype=object)
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)
<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>
属性の組み合わせを試してみる 特徴量エンジニアリング
# Try out a few derived attributes
housing['room_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']
# Compute the correlation coefficients on the numeric columns only:
# housing also holds the text column ocean_proximity and the categorical
# income_cat, and recent pandas versions no longer skip them silently.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
median_house_value 1.000000
median_income 0.687160
room_per_household 0.146285
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population_per_household -0.021985
population -0.026920
longitude -0.047432
latitude -0.142724
bedrooms_per_room -0.259984
Name: median_house_value, dtype: float64
機械学習アルゴリズムが処理しやすいようにデータを準備する
データのクリーニング
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()
housing
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | income_cat | |
---|---|---|---|---|---|---|---|---|---|---|
17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | <1H OCEAN | 2 |
18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | <1H OCEAN | 5 |
14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | NEAR OCEAN | 2 |
3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | INLAND | 2 |
3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | <1H OCEAN | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6563 | -118.13 | 34.20 | 46.0 | 1271.0 | 236.0 | 573.0 | 210.0 | 4.9312 | INLAND | 4 |
12053 | -117.56 | 33.88 | 40.0 | 1196.0 | 294.0 | 1052.0 | 258.0 | 2.0682 | INLAND | 2 |
13908 | -116.40 | 34.09 | 9.0 | 4855.0 | 872.0 | 2098.0 | 765.0 | 3.2723 | INLAND | 3 |
11159 | -118.01 | 33.82 | 31.0 | 1960.0 | 380.0 | 1356.0 | 356.0 | 4.0625 | <1H OCEAN | 3 |
15775 | -122.45 | 37.77 | 52.0 | 3095.0 | 682.0 | 1269.0 | 639.0 | 3.5750 | NEAR BAY | 3 |
16512 rows × 10 columns
housing_labels
17606 286600.0
18632 340600.0
14650 196900.0
3230 46300.0
3555 254500.0
...
6563 240200.0
12053 113000.0
13908 97800.0
11159 225900.0
15775 500001.0
Name: median_house_value, Length: 16512, dtype: float64
housing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 16512 non-null float64
1 latitude 16512 non-null float64
2 housing_median_age 16512 non-null float64
3 total_rooms 16512 non-null float64
4 total_bedrooms 16354 non-null float64
5 population 16512 non-null float64
6 households 16512 non-null float64
7 median_income 16512 non-null float64
8 ocean_proximity 16512 non-null object
9 income_cat 16512 non-null category
dtypes: category(1), float64(8), object(1)
memory usage: 1.3+ MB
housing.isnull().any()
longitude False
latitude False
housing_median_age False
total_rooms False
total_bedrooms True
population False
households False
median_income False
ocean_proximity False
income_cat False
dtype: bool
median = housing['total_bedrooms'].median()
median
# housing['total_bedrooms'].head()
433.0
housing['ocean_proximity']#object
17606 <1H OCEAN
18632 <1H OCEAN
14650 NEAR OCEAN
3230 INLAND
3555 <1H OCEAN
...
6563 INLAND
12053 INLAND
13908 INLAND
11159 <1H OCEAN
15775 NEAR BAY
Name: ocean_proximity, Length: 16512, dtype: object
# Fill missing values with a median-based SimpleImputer fitted on the training data.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')# create an imputer that fills missing values with each attribute's median
housing_num = housing.drop('ocean_proximity', axis=1)# drop the text attribute to build the data handed to the imputer
# NOTE(review): only ocean_proximity is dropped here, so the categorical
# income_cat column is still part of housing_num and is fitted/imputed as
# well — confirm whether it should be removed first.
imputer.fit(housing_num) # fit the imputer instance on the training data housing_num
SimpleImputer(strategy='median')
# 中央値が正しく格納されているか確認
imputer.statistics_ # statisticsインスタンス変数に中央値が格納している
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. ,
408. , 3.5409, 3. ])
housing_num.median().values
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. ,
408. , 3.5409])
X = imputer.transform(housing_num) # housing_numの欠損値をすべて中央値で補完
type(X)
print(X)
[[-121.89 37.29 38. ... 339. 2.7042 2. ]
[-121.93 37.05 14. ... 113. 6.4214 5. ]
[-117.2 32.77 31. ... 462. 2.8621 2. ]
...
[-116.4 34.09 9. ... 765. 3.2723 3. ]
[-118.01 33.82 31. ... 356. 4.0625 3. ]
[-122.45 37.77 52. ... 639. 3.575 3. ]]
housing_tr = pd.DataFrame(data=X, columns=housing_num.columns, index=housing_num.index)
housing_tr
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | income_cat | |
---|---|---|---|---|---|---|---|---|---|
17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | 2.0 |
18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | 5.0 |
14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | 2.0 |
3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | 2.0 |
3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | 3.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6563 | -118.13 | 34.20 | 46.0 | 1271.0 | 236.0 | 573.0 | 210.0 | 4.9312 | 4.0 |
12053 | -117.56 | 33.88 | 40.0 | 1196.0 | 294.0 | 1052.0 | 258.0 | 2.0682 | 2.0 |
13908 | -116.40 | 34.09 | 9.0 | 4855.0 | 872.0 | 2098.0 | 765.0 | 3.2723 | 3.0 |
11159 | -118.01 | 33.82 | 31.0 | 1960.0 | 380.0 | 1356.0 | 356.0 | 4.0625 | 3.0 |
15775 | -122.45 | 37.77 | 52.0 | 3095.0 | 682.0 | 1269.0 | 639.0 | 3.5750 | 3.0 |
16512 rows × 9 columns
housing_tr.isnull().any() # 欠損値が補完されていることが分かる
longitude False
latitude False
housing_median_age False
total_rooms False
total_bedrooms False
population False
households False
median_income False
income_cat False
dtype: bool
テキスト/カテゴリ属性の処理
housing
longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | income_cat | |
---|---|---|---|---|---|---|---|---|---|---|
17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | <1H OCEAN | 2 |
18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | <1H OCEAN | 5 |
14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | NEAR OCEAN | 2 |
3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | INLAND | 2 |
3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | <1H OCEAN | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6563 | -118.13 | 34.20 | 46.0 | 1271.0 | 236.0 | 573.0 | 210.0 | 4.9312 | INLAND | 4 |
12053 | -117.56 | 33.88 | 40.0 | 1196.0 | 294.0 | 1052.0 | 258.0 | 2.0682 | INLAND | 2 |
13908 | -116.40 | 34.09 | 9.0 | 4855.0 | 872.0 | 2098.0 | 765.0 | 3.2723 | INLAND | 3 |
11159 | -118.01 | 33.82 | 31.0 | 1960.0 | 380.0 | 1356.0 | 356.0 | 4.0625 | <1H OCEAN | 3 |
15775 | -122.45 | 37.77 | 52.0 | 3095.0 | 682.0 | 1269.0 | 639.0 | 3.5750 | NEAR BAY | 3 |
16512 rows × 10 columns
housing.isnull().any()
longitude False
latitude False
housing_median_age False
total_rooms False
total_bedrooms True
population False
households False
median_income False
ocean_proximity False
income_cat False
dtype: bool
housing_cat = housing['ocean_proximity']
housing_cat.head(10)
17606 <1H OCEAN
18632 <1H OCEAN
14650 NEAR OCEAN
3230 INLAND
3555 <1H OCEAN
19480 INLAND
8879 <1H OCEAN
13685 INLAND
4937 <1H OCEAN
4861 <1H OCEAN
Name: ocean_proximity, dtype: object
# Convert the text labels to numbers, since machine learning algorithms are
# easier to apply to numeric attributes.
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
# The encoder expects 2-D input (n_samples, n_features); housing_cat is a 1-D
# Series, so wrap it in a single-column DataFrame to avoid
# "ValueError: Expected 2D array, got 1D array instead".
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat.to_frame())
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-162-bf255ca0d523> in <module>
4
5 ordinal_encoder = OrdinalEncoder()
----> 6 housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
~/anaconda3/lib/python3.8/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in fit(self, X, y)
759 f"got {self.unknown_value}.")
760
--> 761 self._fit(X)
762
763 if self.handle_unknown == 'use_encoded_value':
~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in _fit(self, X, handle_unknown, force_all_finite)
75
76 def _fit(self, X, handle_unknown='error', force_all_finite=True):
---> 77 X_list, n_samples, n_features = self._check_X(
78 X, force_all_finite=force_all_finite)
79
~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in _check_X(self, X, force_all_finite)
42 if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
43 # if not a dataframe, do normal check_array validation
---> 44 X_temp = check_array(X, dtype=None,
45 force_all_finite=force_all_finite)
46 if (not hasattr(X, 'dtype')
~/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
635 # If input is 1D raise error
636 if array.ndim == 1:
--> 637 raise ValueError(
638 "Expected 2D array, got 1D array instead:\narray={}.\n"
639 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: Expected 2D array, got 1D array instead:
array=['<1H OCEAN' '<1H OCEAN' 'NEAR OCEAN' ... 'INLAND' '<1H OCEAN' 'NEAR BAY'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.