データの前処理に関するコードメモ
#データフレーム内の欠損値数の確認
df_train.isnull().sum()
ages 18
list_price 0
num_reviews 0
piece_count 0
review_difficulty 0
star_rating 405
country 255
dtype: int64
#null値を持つ行を削除
df = df.dropna(subset=['charges'])
#欠損値の補完(平均値or最頻値を補完)
df = df.fillna({'bmi': df['bmi'].mean()})
#カテゴリカル変数の確認
df_obj = df.select_dtypes(include='object')
gender smoker region
0 female yes southwest
3 male no northwest
9 female no northwest
df_uni = df_obj.nunique()
gender 2
smoker 2
region 4
dtype: int64
for uni in df_obj.columns:
    print(uni)
    print(df_obj[uni].unique())
gender
['female' 'male']
smoker
['yes' 'no']
region
['southwest' 'northwest' 'southeast' 'northeast']
#カテゴリカル変数をダミー変数へ
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['gender'])
le.transform(df['gender'])
age gender bmi children smoker region charges
0 19 0 27.900000 0 yes southwest 16884.92400
3 33 1 22.705000 0 no northwest 21984.47061
9 60 0 30.716434 0 no northwest 28923.13692
#One-Hot Encoding (カテゴリカル変数をすべてダミー変数へ)
df = pd.get_dummies(df, drop_first=True)
One-Hot Encodingがどちゃくそ便利!