データ分析でよく使うスニペット集【前処理編】

Last updated at 2024-08-21 / Posted at 2024-02-24

はじめに

データ分析のコンペに参加していると、繰り返し使うスクリプトが出てきます。
こうした短いコード片を「スニペット」と呼ぶそうなので、自分でもまとめてみます。

可視化・グラフ描画

データをもらったら、まずはデータを眺める
以下のような出力を見ながら、前処理の仕方を考える

sns.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib.inline
sns.set()

# 日本語化
# !pip install japanese_matplotlib
import japanese_matplotlib

# 数値とオブジェクト型のカラムをリストで取得
numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df_train.select_dtypes(include=['object']).columns

# Draw one violin plot per column on a shared grid.
def plot_violinplot(df, numeric_cols, n_rows, n_cols):
    """Draw a violin plot of every column in ``numeric_cols`` on one figure.

    Args:
        df: DataFrame holding the columns to plot.
        numeric_cols: Iterable of column names; one subplot is drawn per name.
        n_rows: Number of rows in the subplot grid.
        n_cols: Number of columns in the subplot grid.
    """
    plt.figure(figsize=(15, 8))

    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(numeric_cols, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.violinplot(x=df[column])

    plt.tight_layout()
    plt.show()

# Draw one box plot per column on a shared grid.
def plot_boxplot(df, numeric_cols, n_rows, n_cols):
    """Draw a box plot of every column in ``numeric_cols`` on one figure.

    Args:
        df: DataFrame holding the columns to plot.
        numeric_cols: Iterable of column names; one subplot is drawn per name.
        n_rows: Number of rows in the subplot grid.
        n_cols: Number of columns in the subplot grid.
    """
    plt.figure(figsize=(15, 8))

    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(numeric_cols, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x=df[column])
        # Rotate the x tick labels of the subplot that was just drawn.
        plt.xticks(rotation=90)

    plt.tight_layout()
    plt.show()

# Draw one histogram per column on a shared grid.
def plot_histograms(df, categorical_cols, n_rows, n_cols):
    """Draw a histogram (with a KDE curve requested) for every listed column.

    Args:
        df: DataFrame holding the columns to plot.
        categorical_cols: Iterable of column names; one subplot per name.
        n_rows: Number of rows in the subplot grid.
        n_cols: Number of columns in the subplot grid.

    NOTE(review): the parameter is named ``categorical_cols`` while
    ``kde=True`` is passed to seaborn; confirm which column types this is
    meant to be called with.
    """
    plt.figure(figsize=(15, 8))

    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(categorical_cols, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.histplot(data=df[column], kde=True)

    plt.tight_layout()
    plt.show()

欠損値の数と一意の数を集計

def basic_details(df):
    """Summarise each column of ``df``: missing count, unique count, dtype.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with the columns
        'Missing value', 'N unique value' and 'dtype'.
    """
    summary = {
        'Missing value': df.isna().sum(),
        'N unique value': df.nunique(),
        'dtype': df.dtypes,
    }
    return pd.DataFrame(summary)
# Build the per-column summary for the training data; as the last expression
# of a notebook cell the resulting table is displayed.
basic_details(df_train)

前処理の前後の結合と分割

欠損値

# Combine df_train and df_test so the same preprocessing is applied to both.

## Flag where each row came from (train = 1, test = 0) so the combined frame
## can be split again afterwards.
df_train['is_train'] = 1
df_test['is_train'] = 0

# The test data has no SalePrice, so add a placeholder column for the concat.
# Use NaN rather than None: a None-filled column is object dtype and would
# turn the concatenated SalePrice column into object dtype as well.
df_test['SalePrice'] = np.nan

# Stack the training and test data into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([df_train, df_test], ignore_index=True)

# Once preprocessing is finished, split the combined frame back apart.

# Training rows. The helper flag and SalePrice are both dropped here, so the
# target has to be taken from df_train['SalePrice'] separately.
df_train_processed = combined_df[combined_df['is_train'] == 1].drop(columns=['is_train', 'SalePrice'])

# Test rows (helper flag and placeholder SalePrice removed). The index is
# reset because these rows sit after the training rows in combined_df and
# would otherwise keep labels starting at len(df_train) instead of 0.
df_test_processed = (
    combined_df[combined_df['is_train'] == 0]
    .drop(columns=['is_train', 'SalePrice'])
    .reset_index(drop=True)
)

メモリの削減

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast
    to the smallest of int8/int16/int32/int64 whose range contains the column
    minimum and maximum (bounds inclusive). Float columns are cast to float16
    or float32 when their range fits, otherwise float64.

    NOTE: the float check only looks at the value range, so casting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit the narrower floats (or min/max is NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Report the size of the combined frame in Mb, then downcast its columns.
print('start size: {:5.2f} Mb'.format(combined_df.memory_usage().sum() / (1024 * 1024)))
combined_df = reduce_mem_usage(df=combined_df)

カテゴリ変数の数値化

# Label encoding
from sklearn.preprocessing import LabelEncoder

# Names of the object-typed (categorical) columns of the training data.
categorical_cols = df_train.select_dtypes(include=['object']).columns

# Fit one LabelEncoder per categorical column on df_train and encode it.
label_encoders = {}
encoded_columns_train = {}
for cat_col in categorical_cols:
    le = LabelEncoder()
    le.fit(df_train[cat_col])
    encoded_columns_train[cat_col + '_le'] = le.transform(df_train[cat_col])  # encoded column, keyed '<col>_le'
    label_encoders[cat_col] = le  # keep the fitted encoder for the test data
# Build a frame from the encoded columns and append it to df_train.
# index=df_train.index keeps the rows aligned even when df_train does not
# carry a default 0..n-1 index (concat with axis=1 aligns on index labels).
encoded_df = pd.DataFrame(encoded_columns_train, index=df_train.index)
df_train = pd.concat([df_train, encoded_df], axis=1)

# Encode df_test with the encoders fitted on df_train.
# NOTE: LabelEncoder.transform raises ValueError for labels that were not
# present when the encoder was fitted.
encoded_columns_test = {}
for cat_col in categorical_cols:
    # Optional: fill missing test values with the training mode first.
    #df_test[cat_col].fillna(df_train[cat_col].mode()[0], inplace=True)
    le = label_encoders[cat_col]  # encoder fitted on df_train
    encoded_columns_test[cat_col + '_le'] = le.transform(df_test[cat_col])
# Build a frame from the encoded columns and append it to df_test, again
# reusing the original index so the rows stay aligned.
encoded_df_test = pd.DataFrame(encoded_columns_test, index=df_test.index)
df_test = pd.concat([df_test, encoded_df_test], axis=1)


# If needed, drop the original categorical columns afterwards.
# df_train.drop(categorical_cols, axis=1, inplace=True)
# df_test.drop(categorical_cols, axis=1, inplace=True)

外れ値を2つ以上持つ行を削除する

from collections import Counter

def detect_outliers(df, n, features):
    """Return index labels of rows that are Tukey outliers in ``n`` or more features.

    For every column in ``features`` a value is an outlier when it lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values are ignored when
    the quartiles are computed and are never counted as outliers.

    Args:
        df: DataFrame to inspect.
        n: Minimum number of features in which a row must be an outlier.
        features: Iterable of column names to check.

    Returns:
        A list of index labels of ``df``, in the order they were first seen.
    """
    outlier_counts = Counter()

    for col in features:
        # nanpercentile: a plain percentile would return NaN as soon as the
        # column contains one missing value, and then nothing is flagged.
        q1 = np.nanpercentile(df[col], 25)
        q3 = np.nanpercentile(df[col], 75)
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    # Keep rows flagged in at least n features ("n or more", as documented).
    return [label for label, count in outlier_counts.items() if count >= n]

# Index labels of the training rows that detect_outliers flags with n=2.
# (The frame is named df_train throughout; `train_df` was an undefined name.)
Outliers_to_drop = detect_outliers(df_train, 2, numeric_cols)
#df_train.loc[Outliers_to_drop] # uncomment to display only the flagged rows

# Drop the flagged rows and renumber the index from 0.
df_train = df_train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

欠損値をとりあえず埋める

# Numeric and categorical columns of combined_df that contain missing values.
missing_numeric_cols = [col for col in numeric_cols if combined_df[col].isnull().any()]
missing_categorical_cols = [col for col in categorical_cols if combined_df[col].isnull().any()]

# Fill numeric columns with 0 and categorical columns with "N".
fill_plan = [(name, 0) for name in missing_numeric_cols]
fill_plan += [(name, "N") for name in missing_categorical_cols]
for col, filler in fill_plan:
    combined_df[col] = combined_df[col].fillna(filler)

欠損値を指定した列ごとの平均・最頻値・中央値で埋める

def _first_mode(values):
    """Return the first mode of ``values``, or NaN when there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def fill_missing_values(df, group_col, target_col, method='mean', rounding=None):
    """Fill missing values of ``target_col`` with a per-group statistic.

    The statistic is computed for each value of ``group_col`` and written
    into the rows of that group whose ``target_col`` is missing. Rows whose
    group key is missing, or whose group has no non-missing value at all,
    are left missing instead of raising.

    Parameters:
        df (pd.DataFrame): Frame to process; ``target_col`` is replaced in place.
        group_col (str): Name of the column to group by.
        target_col (str): Name of the column whose missing values are filled.
        method (str): 'mean', 'mode' or 'median'. Defaults to 'mean'.
        rounding (int): Number of decimals the fill values are rounded to
            (negative values round to tens, hundreds, ...). None disables
            rounding.

    Returns:
        pd.DataFrame: The same frame that was passed in.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('mean', 'mode', 'median'):
        raise ValueError("Method must be 'mean', 'mode', or 'median'")

    grouped = df.groupby(group_col)[target_col]
    if method == 'mean':
        fill_value = grouped.mean()
    elif method == 'mode':
        fill_value = grouped.agg(_first_mode)
    else:
        fill_value = grouped.median()

    if rounding is not None:
        # No integer cast here: it would truncate the decimals that a
        # positive ``rounding`` asks for and would fail on NaN fill values.
        fill_value = fill_value.round(rounding)

    # Look up each row's group statistic in one vectorised pass and use it
    # only where the target is missing.
    per_row_fill = df[group_col].map(fill_value)
    df[target_col] = df[target_col].fillna(per_row_fill)
    return df

# 実際の使用例
# combined_df = fill_missing_values(combined_df, 'Designation', 'MonthlyIncome', method='median', rounding=-4)
# 結果を確認
# print(combined_df['MonthlyIncome'].isna().sum())

特徴量の追加

# Add a "mean of y per x" column and a "y minus that mean" column to df.
def add_grouped_feature(df, group_col, value_col, mean_col_name, diff_col_name):
    """Add a per-group mean column and a deviation-from-mean column to ``df``.

    Args:
        df: DataFrame to extend; the new columns are written into it.
        group_col: Column(s) to group by.
        value_col: Column whose group mean is computed.
        mean_col_name: Name of the new column holding the group mean.
        diff_col_name: Name of the new column holding value minus group mean.

    Returns:
        The same DataFrame that was passed in.
    """
    per_group = df.groupby(group_col)[value_col]
    # transform broadcasts each group's mean back onto its member rows.
    df[mean_col_name] = per_group.transform('mean')
    df[diff_col_name] = df[value_col].sub(df[mean_col_name])

    return df

# Usage example: mean GrLivArea per Neighborhood and each row's deviation from it.
combined_df = add_grouped_feature(
    combined_df,
    'Neighborhood',
    'GrLivArea',
    'Neighborhood_GrLivArea_Mean',
    'GrLivArea_Diff_From_Neighborhood_Mean',
)

記述統計に基づく特徴量エンジニアリング

# Build binary features from the descriptive statistics of each column.
def descrictive_stat_feat(df):
    """Add 0/1 indicator columns derived from per-column descriptive statistics.

    Only numeric columns with at least 10 distinct values are used. For each
    such column ``c`` four int8 columns are added:

    * ``c + '_median_range'``: 1 when the value is greater than the column median.
    * ``c + '_mean_range'``: 1 when the value is greater than the column mean.
    * ``c + '_q1'``: 1 when the value is below the first quartile.
    * ``c + '_q3'``: 1 when the value is above the third quartile.

    Missing values compare as false and therefore yield 0.

    Args:
        df: DataFrame (or anything ``pd.DataFrame`` accepts) to extend.

    Returns:
        The DataFrame with the indicator columns appended.
    """
    df = pd.DataFrame(df)
    # Skip non-numeric columns: median/mean/quantile are undefined for them.
    dcol = [
        c for c in df.columns
        if pd.api.types.is_numeric_dtype(df[c]) and df[c].nunique() >= 10
    ]
    if not dcol:
        return df

    # Compute every statistic and every comparison on the same float64 view,
    # so a value equal to a statistic is never reported as greater because
    # the two sides were rounded to different precisions.
    values = df[dcol].astype(np.float64)
    d_median = values.median(axis=0)
    d_mean = values.mean(axis=0)
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    for c in dcol:
        column = values[c].to_numpy()
        df[f'{c}_median_range'] = (column > d_median[c]).astype(np.int8)
        df[f'{c}_mean_range'] = (column > d_mean[c]).astype(np.int8)
        df[f'{c}_q1'] = (column < q1[c]).astype(np.int8)
        df[f'{c}_q3'] = (column > q3[c]).astype(np.int8)
    return df

# Replace df with the version extended by the statistic-based indicator columns.
# NOTE(review): `df` is not defined in the visible snippets — presumably the frame being prepared; confirm.
df = descrictive_stat_feat(df)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up