Getting an overview of the data
import pandas as pd
import numpy as np
print(df.shape)  # rows x columns
print(df.ndim)   # number of dimensions
print(df.size)   # total number of elements
# What dtypes the table contains, and how many columns of each
df.dtypes.value_counts()
# How many unique values each object-type column has
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
# Which values appear in a column, and how many of each
df["CODE_GENDER"].value_counts()
# Use subset to specify which column(s) to check for NaN when dropping rows
ans = df.dropna(subset=['Age'])
# Show rows where Age is null
df[df['Age'].isnull()]
# Show rows that contain any null
df_null = df[df.isnull().any(axis=1)]
# Set a value in a specific cell (row label 88, column 'Age')
df_test.loc[88, 'Age'] = 26
# Show rows matching a condition on a specific column
df[df['CODE_GENDER']=="XNA"]
# Drop rows matching a condition
df = df[df['CODE_GENDER']!="XNA"]
# Drop a specific column
df = df.drop("WEEKDAY_APPR_PROCESS_START", axis=1)
# Set NaN in a specific column for rows matching a condition
df.loc[df['dwllsize'] != 'A', 'dwllsize'] = np.nan
Encoding
df['CODE_GENDER'] = df['CODE_GENDER'].map({'F':1, 'M':0}).astype(int)
# Encoding using a library (scikit-learn's LabelEncoder)
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['Cabin'] = enc.fit_transform(df['Cabin'])
Batch encoding
Label-encode all object columns that have only two unique values in one pass.
def encode_categorical_variables(df):
    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0
    # Iterate through the columns
    for col in df:
        if df[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(df[col].unique())) <= 2:
                print(col)
                # Train on the training data
                le.fit(df[col])
                # Transform both training and testing data
                df[col] = le.transform(df[col])
                # Keep track of how many columns were label encoded
                le_count += 1
    print('%d columns were label encoded.' % le_count)
    return df
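A minimal usage sketch (assuming df is the DataFrame loaded in the snippets above):
df = encode_categorical_variables(df)
df.dtypes.value_counts()  # the two-category object columns are now numeric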
Type conversion
df['CNT_FAM_MEMBERS'] = df['CNT_FAM_MEMBERS'].astype(int)
Compressing numeric data ①
def to_numeric(df):
    cols_int = df.select_dtypes(include=['int64']).columns
    cols_float = df.select_dtypes(include=['float64']).columns
    for col in cols_int:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in cols_float:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df
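A minimal usage sketch (assuming df is already loaded), comparing memory usage before and after downcasting:
print(df.memory_usage(deep=True).sum() / 1024**2, 'MB')  # before
df = to_numeric(df)
print(df.memory_usage(deep=True).sum() / 1024**2, 'MB')  # after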
Compressing numeric data ②
# Memory Reduction
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
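A minimal usage sketch (assuming df is already loaded). Note that downcasting to float16 reduces precision, so check that the affected columns tolerate it:
df = reduce_mem_usage(df, verbose=True)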
Missing-value check function ①
def missing_values_summary(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'mis_val_count', 1: 'mis_val_percent'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        'mis_val_percent', ascending=False).round(1)
    print("Number of columns: " + str(df.shape[1]) + "\n" + "Columns with missing values: " + str(mis_val_table_ren_columns.shape[0]))
    return mis_val_table_ren_columns
missing_values_summary(df)
Missing-value check function ②
def null_values(df, rate=0):
    """a function to show null values with percentage"""
    nv = pd.concat([df.isnull().sum(), 100 * df.isnull().sum() / df.shape[0]], axis=1).rename(
        columns={0: 'Missing_Records', 1: 'Percentage (%)'})
    return nv[nv['Percentage (%)'] > rate].sort_values('Percentage (%)', ascending=False)
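For example, to list only the columns with more than 50% missing values (a sketch, assuming df is already loaded):
null_values(df, rate=50)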
Function to drop rows that have missing values
def drop_null_rows(df, threshold=0.05):
    # Ratio of missing values per column
    null_rows = df.isnull().sum() / len(df)
    # Keep only columns whose missing ratio is non-zero and below the threshold
    null_rows = null_rows[(null_rows > 0) & (null_rows < threshold)]
    print(null_rows)
    for col in null_rows.index:
        df = df[df[col].notnull()]
    return df
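A minimal usage sketch (assuming df is already loaded): rows are dropped only for columns missing less than 5% of their values.
df = drop_null_rows(df, threshold=0.05)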
Correlation check function
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
# Function to look at the relationship between the target (TARGET) and a given column
def correlation_survived(df, column):
    tmp_df = pd.DataFrame()
    tmp_df["survived ratio"] = df[[column, 'TARGET']].groupby(column)['TARGET'].mean()
    tmp_df["count"] = df[[column, 'TARGET']].groupby(column)['TARGET'].count()
    tmp_df["std"] = df[[column, 'TARGET']].groupby(column)['TARGET'].std()
    tmp_df[column] = tmp_df.index
    tmp_df = tmp_df.reset_index(drop=True)
    tmp_df = tmp_df.sort_values(by="survived ratio", ascending=False)
    display(tmp_df)
    sns.barplot(x=column, y='TARGET', data=df, palette='Set3', order=df.groupby(column)['TARGET'].mean().sort_values().index)
    plt.show()
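A minimal usage sketch (assuming df is the table with the binary TARGET column used above):
correlation_survived(df, 'CODE_GENDER')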
Correlation check function ②
def labels(ax, df, xytext=(0, 0)):
    # Annotate each bar with its share of the data (%) and its count
    for bar in ax.patches:
        ax.annotate('%{:.2f}\n{:.0f}'.format(100 * bar.get_height() / len(df), bar.get_height()),
                    (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    ha='center', va='center', size=11, xytext=xytext,
                    textcoords='offset points')

def plot_col(col, df, target='Churn', figsize=(20, 6)):
    fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
    plt.subplot(121)
    # Churn rate (%) per category of the column
    tmp = pd.crosstab(df[col], df[target], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0: 'NotChurn', 1: 'Churn'}, inplace=True)
    ax[0] = sns.countplot(x=col, data=df, hue=target,
                          order=np.sort(df[col].dropna().unique()))
    ax[0].tick_params(axis='x', rotation=90)
    labels(ax[0], df[col].dropna(), (0, 0))
    ax_twin = ax[0].twinx()
    # sns.set(rc={"lines.linewidth": 0.7})
    ax_twin = sns.pointplot(x=col, y='Churn', data=tmp, color='black', legend=False,
                            order=np.sort(df[col].dropna().unique()),
                            linewidth=0.1)
    ax[0].grid()
    plt.subplot(122)
    ax[1] = sns.countplot(x=df[col].dropna(),
                          order=np.sort(df[col].dropna().unique()))
    ax[1].tick_params(axis='x', rotation=90)
    labels(ax[1], df[col].dropna())
    plt.show()
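A minimal usage sketch (dwllsize and the Churn target come from the telecom-churn snippets above; adjust the names to your own data):
plot_col('dwllsize', df, target='Churn')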
Drop collinear columns whose correlation coefficient exceeds a threshold (numeric data)
cat_cols = [col for col in df.columns if df[col].dtype=='object']
num_cols = [col for col in df.columns if df[col].dtype!='object']
# Remove the highly collinear features from data
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.
    Inputs:
        x: features dataframe
        threshold: features with correlations greater than this value are removed
    Output:
        the set of column names to drop (one column of each highly correlated pair)
    '''
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i + 1):
            item = corr_matrix.iloc[j:(j + 1), (i + 1):(i + 2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])
    # One column of each pair of correlated columns
    drops = set(drop_cols)
    return drops
drop_col = remove_collinear_features(df[num_cols], 0.9)
df.drop(drop_col, axis=1, inplace=True)
Drop collinear columns whose correlation coefficient exceeds a threshold (categorical data)
import scipy.stats as sts
# References:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
def cramers_v(x, y):
    """ Calculate Cramér's V statistic for categorical-categorical association.
    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.
    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = sts.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
def remove_cramers_v(df):
    drop_cols = []
    # Check each pair of categorical columns once and mark one column of each
    # highly associated pair (Cramér's V > 0.9) for removal
    for i, col in enumerate(cat_cols):
        for col2 in cat_cols[i + 1:]:
            rate = cramers_v(df[col], df[col2])
            if rate > 0.9:
                print(col, col2, rate)
                drop_cols.append(col2)
    return list(set(drop_cols))
drop_cols = remove_cramers_v(df)
df.drop(drop_cols, axis=1, inplace=True)
churn_telecom_project | Kaggle
References
pandas.DataFrame.dropna — pandas 1.4.2 documentation
kaggle1位の解析手法 「Home Credit Default Risk 債務不履行の予測」①データ理解 – S-Analysis