1
4

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

書籍「scikit-learn、Keras、TensorFlowによる実践機械学習 第2版」写経&自分用メモ

Last updated at Posted at 2021-09-02
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the IMAGES_PATH directory.

    Args:
        fig_id: Base file name (without extension) for the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created if it does not exist yet.
    """
    # A bare `import urllib` does not guarantee that the `urllib.request`
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with a trusted download source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
fetch_housing_data()
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

データ構造を見てみる

housing = load_housing_data()
housing.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
housing['ocean_proximity'].value_counts()
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
housing.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20,15))
plt.show()

png

  • 単位は何か気にすること
  • 上限下限を切っている場合はその部分だけ極端に大きくなっている → 住宅価格の中央値や築年数の中央値が今回のそれに該当する

テストセットを作る

import numpy as np

def split_tarin_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame).
        test_ratio: Fraction of the rows assigned to the test part; the test
            size is ``int(len(data) * test_ratio)``.

    Returns:
        A ``(train, test)`` tuple of row selections from ``data``.
    """
    # Random ordering of the row positions 0 .. len(data) - 1.
    order = np.random.permutation(len(data))
    cut = int(len(data) * test_ratio)
    # The first `cut` shuffled positions form the test part, the rest train.
    return data.iloc[order[cut:]], data.iloc[order[:cut]]
train_set, test_set = split_tarin_test(housing, 0.2)
len(train_set)
16512
len(test_set)
4128
train_set
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
14196 -117.03 32.71 33.0 3126.0 627.0 2300.0 623.0 3.2596 103000.0 NEAR OCEAN
8267 -118.16 33.77 49.0 3382.0 787.0 1314.0 756.0 3.8125 382100.0 NEAR OCEAN
17445 -120.48 34.66 4.0 1897.0 331.0 915.0 336.0 4.1563 172600.0 NEAR OCEAN
14265 -117.11 32.69 36.0 1421.0 367.0 1418.0 355.0 1.9425 93400.0 NEAR OCEAN
2271 -119.80 36.78 43.0 2382.0 431.0 874.0 380.0 3.5542 96500.0 INLAND
... ... ... ... ... ... ... ... ... ... ...
11284 -117.96 33.78 35.0 1330.0 201.0 658.0 217.0 6.3700 229200.0 <1H OCEAN
11964 -117.43 34.02 33.0 3084.0 570.0 1753.0 449.0 3.0500 97800.0 INLAND
5390 -118.38 34.03 36.0 2101.0 569.0 1756.0 527.0 2.9344 222100.0 <1H OCEAN
860 -121.96 37.58 15.0 3575.0 597.0 1777.0 559.0 5.7192 283500.0 <1H OCEAN
15795 -122.42 37.77 52.0 4226.0 1315.0 2619.0 1242.0 2.5755 325000.0 NEAR BAY

16512 rows × 10 columns

この状態だと、プログラムを実行するたびに異なるテストセットが生成されてしまい、実行を繰り返すうちに結局データセット全体を見ることになる
これではテストセットを分けた意味がなくなってしまう

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

len(train_set)
16512
len(test_set)
4128
train_set
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
14196 -117.03 32.71 33.0 3126.0 627.0 2300.0 623.0 3.2596 103000.0 NEAR OCEAN
8267 -118.16 33.77 49.0 3382.0 787.0 1314.0 756.0 3.8125 382100.0 NEAR OCEAN
17445 -120.48 34.66 4.0 1897.0 331.0 915.0 336.0 4.1563 172600.0 NEAR OCEAN
14265 -117.11 32.69 36.0 1421.0 367.0 1418.0 355.0 1.9425 93400.0 NEAR OCEAN
2271 -119.80 36.78 43.0 2382.0 431.0 874.0 380.0 3.5542 96500.0 INLAND
... ... ... ... ... ... ... ... ... ... ...
11284 -117.96 33.78 35.0 1330.0 201.0 658.0 217.0 6.3700 229200.0 <1H OCEAN
11964 -117.43 34.02 33.0 3084.0 570.0 1753.0 449.0 3.0500 97800.0 INLAND
5390 -118.38 34.03 36.0 2101.0 569.0 1756.0 527.0 2.9344 222100.0 <1H OCEAN
860 -121.96 37.58 15.0 3575.0 597.0 1777.0 559.0 5.7192 283500.0 <1H OCEAN
15795 -122.42 37.77 52.0 4226.0 1315.0 2619.0 1242.0 2.5755 325000.0 NEAR BAY

16512 rows × 10 columns

層化抽出法

housing['income_cat'] = pd.cut(housing['median_income'], 
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1,2,3,4,5])
housing['income_cat'].hist()
<AxesSubplot:>

png

from sklearn.model_selection import  StratifiedShuffleSplit
# A single stratified shuffle split (n_splits=1), stratified on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Only one split is generated, so take it directly from the generator.
train_index, test_index = next(split.split(housing, housing['income_cat']))
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

strat_test_set['income_cat'].value_counts() / len(strat_test_set)
3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64
len(strat_test_set)
4128
len(strat_test_set['income_cat'])
4128
strat_test_set
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity income_cat
5241 -118.39 34.12 29.0 6447.0 1012.0 2184.0 960.0 8.2816 500001.0 <1H OCEAN 5
10970 -117.86 33.77 39.0 4159.0 655.0 1669.0 651.0 4.6111 240300.0 <1H OCEAN 4
20351 -119.05 34.21 27.0 4357.0 926.0 2110.0 876.0 3.0119 218200.0 <1H OCEAN 3
6568 -118.15 34.20 52.0 1786.0 306.0 1018.0 322.0 4.1518 182100.0 INLAND 3
13285 -117.68 34.07 32.0 1775.0 314.0 1067.0 302.0 4.0375 121300.0 INLAND 3
... ... ... ... ... ... ... ... ... ... ... ...
20519 -121.53 38.58 33.0 4988.0 1169.0 2414.0 1075.0 1.9728 76400.0 INLAND 2
17430 -120.44 34.65 30.0 2265.0 512.0 1402.0 471.0 1.9750 134000.0 NEAR OCEAN 2
4019 -118.49 34.18 31.0 3073.0 674.0 1486.0 684.0 4.8984 311700.0 <1H OCEAN 4
12107 -117.32 33.99 27.0 5464.0 850.0 2400.0 836.0 4.7110 133500.0 INLAND 4
2398 -118.91 36.79 19.0 1616.0 324.0 187.0 80.0 3.7857 78600.0 INLAND 3

4128 rows × 11 columns

strat_test_set['income_cat'].value_counts()
3    1447
2    1316
4     728
5     473
1     164
Name: income_cat, dtype: int64
strat_test_set['income_cat']
5241     5
10970    4
20351    3
6568     3
13285    3
        ..
20519    2
17430    2
4019     4
12107    4
2398     3
Name: income_cat, Length: 4128, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
housing['income_cat']
0        5
1        5
2        5
3        4
4        3
        ..
20635    2
20636    2
20637    2
20638    2
20639    2
Name: income_cat, Length: 20640, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]

strat_train_set
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity income_cat
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 286600.0 <1H OCEAN 2
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 340600.0 <1H OCEAN 5
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 196900.0 NEAR OCEAN 2
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 46300.0 INLAND 2
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 254500.0 <1H OCEAN 3
... ... ... ... ... ... ... ... ... ... ... ...
6563 -118.13 34.20 46.0 1271.0 236.0 573.0 210.0 4.9312 240200.0 INLAND 4
12053 -117.56 33.88 40.0 1196.0 294.0 1052.0 258.0 2.0682 113000.0 INLAND 2
13908 -116.40 34.09 9.0 4855.0 872.0 2098.0 765.0 3.2723 97800.0 INLAND 3
11159 -118.01 33.82 31.0 1960.0 380.0 1356.0 356.0 4.0625 225900.0 <1H OCEAN 3
15775 -122.45 37.77 52.0 3095.0 682.0 1269.0 639.0 3.5750 500001.0 NEAR BAY 3

16512 rows × 11 columns


データを研究、可視化して理解を深める

地理データの可視化

housing = strat_train_set.copy()
# len(housing)
# type(housing) #DataFrame
housing.plot(kind='scatter', x ='longitude', y='latitude', alpha=0.1)
<AxesSubplot:xlabel='longitude', ylabel='latitude'>

png

# Geographic scatter plot: marker size follows population (scaled by 1/100)
# and marker colour follows median_house_value using the 'jet' colormap.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population']/100, label='population', figsize=(10, 7),
             c=housing['median_house_value'], cmap=plt.get_cmap('jet'),
             colorbar=True,  # boolean flag; the string 'True' only worked because it is truthy
             sharex=False)
plt.legend()

<matplotlib.legend.Legend at 0x7f185a1b5820>

png

相関を探す

corr_matrix = housing.corr()
corr_matrix
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
longitude 1.000000 -0.924478 -0.105848 0.048871 0.076598 0.108030 0.063070 -0.019583 -0.047432
latitude -0.924478 1.000000 0.005766 -0.039184 -0.072419 -0.115222 -0.077647 -0.075205 -0.142724
housing_median_age -0.105848 0.005766 1.000000 -0.364509 -0.325047 -0.298710 -0.306428 -0.111360 0.114110
total_rooms 0.048871 -0.039184 -0.364509 1.000000 0.929379 0.855109 0.918392 0.200087 0.135097
total_bedrooms 0.076598 -0.072419 -0.325047 0.929379 1.000000 0.876320 0.980170 -0.009740 0.047689
population 0.108030 -0.115222 -0.298710 0.855109 0.876320 1.000000 0.904637 0.002380 -0.026920
households 0.063070 -0.077647 -0.306428 0.918392 0.980170 0.904637 1.000000 0.010781 0.064506
median_income -0.019583 -0.075205 -0.111360 0.200087 -0.009740 0.002380 0.010781 1.000000 0.687160
median_house_value -0.047432 -0.142724 0.114110 0.135097 0.047689 -0.026920 0.064506 0.687160 1.000000
corr_matrix['median_house_value'].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(frame=housing[attributes], figsize=(12, 8))
array([[<AxesSubplot:xlabel='median_house_value', ylabel='median_house_value'>,
        <AxesSubplot:xlabel='median_income', ylabel='median_house_value'>,
        <AxesSubplot:xlabel='total_rooms', ylabel='median_house_value'>,
        <AxesSubplot:xlabel='housing_median_age', ylabel='median_house_value'>],
       [<AxesSubplot:xlabel='median_house_value', ylabel='median_income'>,
        <AxesSubplot:xlabel='median_income', ylabel='median_income'>,
        <AxesSubplot:xlabel='total_rooms', ylabel='median_income'>,
        <AxesSubplot:xlabel='housing_median_age', ylabel='median_income'>],
       [<AxesSubplot:xlabel='median_house_value', ylabel='total_rooms'>,
        <AxesSubplot:xlabel='median_income', ylabel='total_rooms'>,
        <AxesSubplot:xlabel='total_rooms', ylabel='total_rooms'>,
        <AxesSubplot:xlabel='housing_median_age', ylabel='total_rooms'>],
       [<AxesSubplot:xlabel='median_house_value', ylabel='housing_median_age'>,
        <AxesSubplot:xlabel='median_income', ylabel='housing_median_age'>,
        <AxesSubplot:xlabel='total_rooms', ylabel='housing_median_age'>,
        <AxesSubplot:xlabel='housing_median_age', ylabel='housing_median_age'>]],
      dtype=object)

png

housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)
<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>

png

属性の組み合わせを試してみる 特徴量エンジニアリング

# 新しい属性を作ってみる
housing['room_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']
# 相関係数を計算
corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
median_house_value          1.000000
median_income               0.687160
room_per_household          0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

機械学習アルゴリズムが処理しやすいようにデータを準備する

データのクリーニング

housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()
housing
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 <1H OCEAN 2
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 <1H OCEAN 5
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 NEAR OCEAN 2
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 INLAND 2
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 <1H OCEAN 3
... ... ... ... ... ... ... ... ... ... ...
6563 -118.13 34.20 46.0 1271.0 236.0 573.0 210.0 4.9312 INLAND 4
12053 -117.56 33.88 40.0 1196.0 294.0 1052.0 258.0 2.0682 INLAND 2
13908 -116.40 34.09 9.0 4855.0 872.0 2098.0 765.0 3.2723 INLAND 3
11159 -118.01 33.82 31.0 1960.0 380.0 1356.0 356.0 4.0625 <1H OCEAN 3
15775 -122.45 37.77 52.0 3095.0 682.0 1269.0 639.0 3.5750 NEAR BAY 3

16512 rows × 10 columns

housing_labels
17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
           ...   
6563     240200.0
12053    113000.0
13908     97800.0
11159    225900.0
15775    500001.0
Name: median_house_value, Length: 16512, dtype: float64
housing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           16512 non-null  float64 
 1   latitude            16512 non-null  float64 
 2   housing_median_age  16512 non-null  float64 
 3   total_rooms         16512 non-null  float64 
 4   total_bedrooms      16354 non-null  float64 
 5   population          16512 non-null  float64 
 6   households          16512 non-null  float64 
 7   median_income       16512 non-null  float64 
 8   ocean_proximity     16512 non-null  object  
 9   income_cat          16512 non-null  category
dtypes: category(1), float64(8), object(1)
memory usage: 1.3+ MB
housing.isnull().any()
longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
ocean_proximity       False
income_cat            False
dtype: bool
median = housing['total_bedrooms'].median()
median
# housing['total_bedrooms'].head()
433.0
housing['ocean_proximity']#object
17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
            ...    
6563         INLAND
12053        INLAND
13908        INLAND
11159     <1H OCEAN
15775      NEAR BAY
Name: ocean_proximity, Length: 16512, dtype: object
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')# imputer that fills missing values with each attribute's median
housing_num = housing.drop('ocean_proximity', axis=1)# drop the text attribute; NOTE(review): income_cat is not dropped here, so it is fitted/imputed as well
imputer.fit(housing_num) # fit the imputer instance to the training data housing_num
SimpleImputer(strategy='median')
# 中央値が正しく格納されているか確認
imputer.statistics_ # statisticsインスタンス変数に中央値が格納している
array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409,    3.    ])
housing_num.median().values
array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])
X = imputer.transform(housing_num) # housing_numの欠損値をすべて中央値で補完
type(X)
print(X)
[[-121.89     37.29     38.     ...  339.        2.7042    2.    ]
 [-121.93     37.05     14.     ...  113.        6.4214    5.    ]
 [-117.2      32.77     31.     ...  462.        2.8621    2.    ]
 ...
 [-116.4      34.09      9.     ...  765.        3.2723    3.    ]
 [-118.01     33.82     31.     ...  356.        4.0625    3.    ]
 [-122.45     37.77     52.     ...  639.        3.575     3.    ]]
housing_tr = pd.DataFrame(data=X, columns=housing_num.columns, index=housing_num.index)
housing_tr
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income income_cat
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 2.0
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 5.0
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 2.0
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 2.0
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 3.0
... ... ... ... ... ... ... ... ... ...
6563 -118.13 34.20 46.0 1271.0 236.0 573.0 210.0 4.9312 4.0
12053 -117.56 33.88 40.0 1196.0 294.0 1052.0 258.0 2.0682 2.0
13908 -116.40 34.09 9.0 4855.0 872.0 2098.0 765.0 3.2723 3.0
11159 -118.01 33.82 31.0 1960.0 380.0 1356.0 356.0 4.0625 3.0
15775 -122.45 37.77 52.0 3095.0 682.0 1269.0 639.0 3.5750 3.0

16512 rows × 9 columns

housing_tr.isnull().any() # 欠損値が補完されていることが分かる
longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
income_cat            False
dtype: bool

テキスト/カテゴリ属性の処理

housing
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity income_cat
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.7042 <1H OCEAN 2
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.4214 <1H OCEAN 5
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.8621 NEAR OCEAN 2
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.8839 INLAND 2
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.0347 <1H OCEAN 3
... ... ... ... ... ... ... ... ... ... ...
6563 -118.13 34.20 46.0 1271.0 236.0 573.0 210.0 4.9312 INLAND 4
12053 -117.56 33.88 40.0 1196.0 294.0 1052.0 258.0 2.0682 INLAND 2
13908 -116.40 34.09 9.0 4855.0 872.0 2098.0 765.0 3.2723 INLAND 3
11159 -118.01 33.82 31.0 1960.0 380.0 1356.0 356.0 4.0625 <1H OCEAN 3
15775 -122.45 37.77 52.0 3095.0 682.0 1269.0 639.0 3.5750 NEAR BAY 3

16512 rows × 10 columns

housing.isnull().any()
longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
ocean_proximity       False
income_cat            False
dtype: bool
housing_cat = housing['ocean_proximity']
housing_cat.head(10)
17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object
# Convert the text labels to numbers, because machine learning algorithms
# generally handle numeric attributes more easily.

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# OrdinalEncoder expects 2-D input. housing_cat is a 1-D Series, and passing
# it directly raises "ValueError: Expected 2D array, got 1D array instead",
# so convert it to a single-column DataFrame first.
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat.to_frame())
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-162-bf255ca0d523> in <module>
      4 
      5 ordinal_encoder = OrdinalEncoder()
----> 6 housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)


~/anaconda3/lib/python3.8/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    697         if y is None:
    698             # fit method of arity 1 (unsupervised transformation)
--> 699             return self.fit(X, **fit_params).transform(X)
    700         else:
    701             # fit method of arity 2 (supervised transformation)


~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in fit(self, X, y)
    759                             f"got {self.unknown_value}.")
    760 
--> 761         self._fit(X)
    762 
    763         if self.handle_unknown == 'use_encoded_value':


~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in _fit(self, X, handle_unknown, force_all_finite)
     75 
     76     def _fit(self, X, handle_unknown='error', force_all_finite=True):
---> 77         X_list, n_samples, n_features = self._check_X(
     78             X, force_all_finite=force_all_finite)
     79 


~/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py in _check_X(self, X, force_all_finite)
     42         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
     43             # if not a dataframe, do normal check_array validation
---> 44             X_temp = check_array(X, dtype=None,
     45                                  force_all_finite=force_all_finite)
     46             if (not hasattr(X, 'dtype')


~/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0


~/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    635             # If input is 1D raise error
    636             if array.ndim == 1:
--> 637                 raise ValueError(
    638                     "Expected 2D array, got 1D array instead:\narray={}.\n"
    639                     "Reshape your data either using array.reshape(-1, 1) if "


ValueError: Expected 2D array, got 1D array instead:
array=['<1H OCEAN' '<1H OCEAN' 'NEAR OCEAN' ... 'INLAND' '<1H OCEAN' 'NEAR BAY'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

1
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
4

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?