2
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

pandas 前処理メモ

Last updated at Posted at 2022-06-06

データの外観を知る

print(df.shape) # (number of rows, number of columns)
print(df.ndim) # number of dimensions
print(df.size) # total number of cells (rows x columns)
# How many columns there are of each dtype
df.dtypes.value_counts()

# For each object-dtype column, how many distinct values it holds
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

# Which values occur in the column, and how often
df["CODE_GENDER"].value_counts()

# Drop the rows whose Age is missing; `subset` limits the null check to the listed columns.
ans = df.dropna(subset=['Age'])

# Show the rows where Age is null
df[df['Age'].isnull()]

# Keep the rows that contain at least one null value
df_null = df[df.isnull().any(axis=1)]

# Write a value into a single cell (row label 88, column 'Age')
df_test.loc[88, 'Age'] = 26

# Show the rows where CODE_GENDER is "XNA"
df[df['CODE_GENDER']=="XNA"]

# Remove the rows that match a condition (here: CODE_GENDER is "XNA")
df = df[df['CODE_GENDER']!="XNA"]

# Remove a specific column
df = df.drop("WEEKDAY_APPR_PROCESS_START", axis=1)

# Set every column to NaN in the rows where dwllsize is not 'A'.
# NOTE(review): this blanks the whole row; if only dwllsize itself should
# become NaN, use df.loc[df['dwllsize'] != 'A', 'dwllsize'] = np.nan — confirm intent.
df[df['dwllsize'] != 'A'] = np.nan

エンコード

# Encode CODE_GENDER by hand: 'F' -> 1, 'M' -> 0.
# NOTE(review): any value other than 'F'/'M' (or a missing value) maps to NaN,
# and the astype(int) cast then fails — remove such rows beforehand.
df['CODE_GENDER'] = df['CODE_GENDER'].map({'F':1, 'M':0}).astype(int)

# Encode with a library: LabelEncoder assigns an integer code to each distinct value
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['Cabin'] = enc.fit_transform(df['Cabin'])

一括エンコード

値が2個しかないものは一括でエンコードする

def encode_categorical_variables(df):
    """Label-encode every object column that has at most two distinct values.

    The frame is modified in place and also returned. The name of each
    encoded column is printed as it is processed, followed by a final count.
    Distinct values are counted with ``Series.unique()``, which counts NaN
    as a value of its own.

    Args:
        df: DataFrame whose object-dtype columns are candidates for encoding.

    Returns:
        The same DataFrame object, with the qualifying columns replaced by
        their integer codes.
    """
    encoder = LabelEncoder()
    encoded = 0

    for name in df:
        series = df[name]
        # Only text-like (object) columns are candidates.
        if series.dtype != 'object':
            continue
        # Columns with more than two categories are left untouched.
        if len(series.unique()) > 2:
            continue

        print(name)
        # Fit on the column and replace it with its codes in one step.
        df[name] = encoder.fit_transform(series)
        encoded += 1

    print('%d columns were label encoded.' % encoded)
    return df

型変換

# Cast the column to int.
# NOTE(review): astype(int) fails if the column still contains NaN — fill or drop those rows first.
df['CNT_FAM_MEMBERS'] = df['CNT_FAM_MEMBERS'].astype(int)

数値型データの圧縮①

def to_numeric(df):
    """Downcast int64 and float64 columns to smaller numeric dtypes.

    Each int64 column is passed through ``pd.to_numeric(downcast='integer')``
    and each float64 column through ``pd.to_numeric(downcast='float')``.
    Columns of any other dtype are left alone. The frame is modified in
    place and also returned.

    Args:
        df: DataFrame to shrink.

    Returns:
        The same DataFrame object with the downcast columns.
    """
    # (source dtype, downcast mode) pairs, handled in the same order as before:
    # integers first, then floats.
    plan = (('int64', 'integer'), ('float64', 'float'))
    for source_dtype, mode in plan:
        for name in df.select_dtypes(include=[source_dtype]).columns:
            df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

数値型データの圧縮②

# Memory Reduction
def reduce_mem_usage(df, verbose=True):
    """Shrink numeric columns to the narrowest dtype that covers their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits contain the column's min and max. Float columns are cast to
    float16 or float32 when their range fits, otherwise float64. The bounds
    are inclusive, so a value exactly at a dtype limit (e.g. 127 for int8)
    still selects that dtype. Columns whose dtype is not one of the listed
    numeric dtypes are left untouched.

    The float16/float32 choice is made from the value range only; precision
    is not considered, so float downcasts can round the stored values.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to shrink.
        verbose: When true, print the final memory usage and the percentage
            saved.

    Returns:
        The same DataFrame object with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max is NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

欠損値確認関数①

def missing_values_summary(df):
    """Summarise the missing values of each column.

    Prints the total number of columns and the number of columns that have
    missing values, then returns a table of those columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with ``mis_val_count`` (number of
        nulls) and ``mis_val_percent`` (share of rows, in percent), limited
        to columns whose percentage is non-zero, sorted by percentage in
        descending order and rounded to one decimal.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Name the two columns directly while putting the series side by side.
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['mis_val_count', 'mis_val_percent'],
    )

    has_missing = summary['mis_val_percent'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('mis_val_percent', ascending=False)
    summary = summary.round(1)

    print("カラム数:" + str(df.shape[1]) + "\n" + "欠損値のカラム数: " + str(summary.shape[0]))
    return summary

# Example call: summarise the missing values of df.
missing_values_summary(df)

欠損値確認関数②

def null_values(df, rate=0):
    """Return the per-column null counts and percentages above a cut-off.

    Args:
        df: DataFrame to inspect.
        rate: Only columns whose null percentage is strictly greater than
            this value are returned.

    Returns:
        DataFrame indexed by column name with ``Missing_Records`` and
        ``Percentage (%)``, sorted by percentage in descending order.
    """
    missing = df.isnull().sum()
    share = 100 * missing / df.shape[0]

    table = pd.concat([missing, share], axis=1)
    table = table.rename(columns={0: 'Missing_Records', 1: 'Percentage (%)'})

    above_rate = table['Percentage (%)'] > rate
    return table[above_rate].sort_values('Percentage (%)', ascending=False)

欠損値のある行を消す関数

def drop_null_rows(df, threshold=0.05):
    """Drop rows that are null in a column with only a few missing values.

    A column qualifies when its share of nulls is below ``threshold``. Rows
    that are null in any qualifying column are removed. Columns with a
    larger share of nulls are left as they are, so their rows are kept.

    The share of nulls per qualifying column is printed before dropping.

    Args:
        df: DataFrame to clean. It is not modified.
        threshold: Upper bound (exclusive) on the null ratio, between 0 and 1,
            for a column to have its null rows dropped.

    Returns:
        A DataFrame without the affected rows, or ``df`` itself when no
        column qualifies.
    """
    null_rows = df.isnull().sum() / len(df)
    null_rows = null_rows[null_rows < threshold]
    print(null_rows)

    # Columns without any nulls cannot remove a row, so skip them.
    sparse_cols = null_rows[null_rows > 0].index.tolist()
    if not sparse_cols:
        return df
    return df.dropna(subset=sparse_cols)

相関関係チェック関数

from IPython.display import display, HTML
# Show how the mean of TARGET varies across the values of the given column.
def correlation_survived(df, column):
    """Plot the mean of ``TARGET`` per value of ``column`` and return the stats.

    A bar plot of ``TARGET`` against ``column`` is drawn, with the bars
    ordered by ascending group mean. The per-group statistics, which were
    previously computed and then thrown away, are now returned.

    Args:
        df: DataFrame containing ``column`` and a ``TARGET`` column.
        column: Name of the column to group by.

    Returns:
        DataFrame with one row per value of ``column`` and the columns
        ``survived ratio`` (group mean of TARGET), ``count``, ``std`` and
        ``column`` itself, sorted by ``survived ratio`` in descending order.
    """
    grouped = df.groupby(column)['TARGET']

    # One aggregation pass instead of three separate groupby calls.
    summary = grouped.agg(['mean', 'count', 'std']).rename(columns={'mean': 'survived ratio'})
    summary[column] = summary.index
    summary = summary.reset_index(drop=True)
    summary = summary.sort_values(by="survived ratio", ascending=False)

    bar_order = grouped.mean().sort_values().index
    sns.barplot(x=column, y='TARGET', data=df, palette='Set3', order=bar_order)
    plt.show()
    return summary

相関関係チェック関数②

def labels(ax, df, xytext=(0, 0)):
    """Annotate every bar of ``ax`` with its percentage and its height.

    The text has two lines: the bar height as a percentage of ``len(df)``
    (two decimals, prefixed with ``%``) and the height itself with no
    decimals. It is anchored at the top centre of the bar.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate.
        df: Object whose length is the denominator for the percentage.
        xytext: Offset of the text from the anchor, in points.
    """
    for patch in ax.patches:
        height = patch.get_height()
        caption = '%{:.2f}\n{:.0f}'.format(100 * height / len(df), height)
        top_centre = (patch.get_x() + patch.get_width() / 2, height)
        ax.annotate(
            caption,
            top_centre,
            ha='center',
            va='center',
            size=11,
            xytext=xytext,
            textcoords='offset points',
        )

def plot_col(col, df, target='Churn', figsize=(20,6)):
    """Draw two count plots of ``col`` side by side and show the figure.

    Left panel: counts of ``col`` split by ``target``, with a black point
    plot of the 'Churn' percentage per value of ``col`` on a twin y-axis.
    Right panel: plain counts of ``col``. Bars in both panels are annotated
    through ``labels``.

    Args:
        col: Name of the column to plot.
        df: DataFrame containing ``col`` and ``target``.
        target: Name of the target column. The percentage table renames the
            target values 0 and 1 to 'NotChurn' and 'Churn', so the target
            is presumably expected to be coded 0/1 — confirm with callers.
        figsize: Figure size passed to ``plt.subplots``.
    """

    fig, ax = plt.subplots(1,2,figsize=figsize, sharey=True)

    # Select the left panel; the seaborn calls below do not pass ax=,
    # so they rely on pyplot's current axes.
    plt.subplot(121)
    # Per value of col: percentage of rows in each target class (row-normalised crosstab x 100).
    tmp = pd.crosstab(df[col], df[target], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NotChurn', 1:'Churn'}, inplace=True)

    # Counts per value of col, split by target, with the x values in sorted order.
    ax[0] = sns.countplot(x=col, data=df, hue=target, 
                  order=np.sort(df[col].dropna().unique()),
                  )
    ax[0].tick_params(axis='x', rotation=90)
    # Percentages in the bar labels are relative to the number of non-null values of col.
    labels(ax[0],df[col].dropna(),(0, 0))
    
    # Twin y-axis for the percentage line.
    # NOTE(review): pointplot is called without ax=, so it draws on whatever
    # pyplot considers the current axes — presumably the twin; confirm.
    ax_twin = ax[0].twinx()
    # sns.set(rc={"lines.linewidth": 0.7})
    # The 'Churn' column name is fixed here, independent of the `target` argument.
    ax_twin = sns.pointplot(x=col, y='Churn', data=tmp, color='black', legend=False, 
                  order = np.sort(df[col].dropna().unique()), 
                  linewidth=0.1)
    

    ax[0].grid()

    # Right panel: overall counts of col (nulls removed), same x ordering.
    plt.subplot(122)
    ax[1] = sns.countplot(x=df[col].dropna(),
                  order= np.sort(df[col].dropna().unique()),
                  )
    ax[1].tick_params(axis='x', rotation=90)
    labels(ax[1],df[col].dropna())
    plt.show()

相関係数がしきい値よりも大きい(共線性の高い)カラムを消去 (数値データ)

# Split the column names by dtype: object columns go to cat_cols, all others to num_cols.
cat_cols = [col for col in df.columns if df[col].dtype=='object']
num_cols = [col for col in df.columns if df[col].dtype!='object']

# Remove the highly collinear features from data
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Find collinear features in a dataframe, i.e. pairs of columns whose
        absolute correlation coefficient is at least the threshold. For each
        such pair the later column (in column order) is marked for removal.
        Each detected pair is printed as "later | earlier | correlation".
        The input dataframe itself is not modified.

    Inputs:
        x: features dataframe
        threshold: pairs with an absolute correlation greater than or equal
            to this value are reported

    Output:
        set with the names of the columns to drop; the caller removes them,
        e.g. with df.drop(columns=list(result))
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    names = corr_matrix.columns
    drop_cols = set()

    # Walk the upper triangle: names[i] is the later column, names[j] the earlier one
    for i in range(1, len(names)):
        for j in range(i):
            val = abs(corr_matrix.iloc[j, i])

            # NaN correlations (e.g. constant columns) compare as False and are skipped
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(names[i], "|", names[j], "|", round(val, 2))
                drop_cols.add(names[i])

    return drop_cols

# Find the numeric columns with |correlation| >= 0.9 to an earlier column, then drop them from df in place.
drop_col = remove_collinear_features(df[num_cols], 0.9)
df.drop(drop_col, axis=1, inplace=True)

クラメールの連関係数がしきい値よりも大きいカラムを消去 (カテゴリーデータ)

import scipy.stats as sts

# References:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

def cramers_v(x, y):
    """Calculate the bias-corrected Cramér's V for two categorical series.

    Uses the correction from Wicher Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    The chi-squared statistic comes from ``scipy.stats.chi2_contingency``
    with its default settings, so 2x2 tables get Yates' continuity
    correction.

    Args:
        x: First categorical series.
        y: Second categorical series, aligned with ``x``.

    Returns:
        The association strength as a float. ``0.0`` is returned when the
        statistic is undefined: fewer than two categories in either input,
        fewer than two observations, or a non-positive corrected
        denominator. These cases previously produced NaN with a warning.
    """
    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    n = confusion_matrix.sum().sum()

    # A constant (or empty) variable carries no association information.
    if min(r, k) < 2 or n < 2:
        return 0.0

    chi2 = sts.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected denominator reaches zero when every observation is its own category.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

def remove_cramers_v(df, threshold=0.9, columns=None):
    """List the categorical columns that are redundant by Cramér's V.

    Every unordered pair of columns is compared once. When the Cramér's V
    of a pair exceeds ``threshold``, the pair and its value are printed and
    the later column of the pair is marked for removal, so one member of
    each associated pair is kept. Previously both orderings of a pair were
    visited, which marked both members.

    Args:
        df: DataFrame containing the columns to compare. It is not modified.
        threshold: Pairs with a Cramér's V strictly greater than this value
            are treated as redundant.
        columns: Column names to compare. Defaults to the module-level
            ``cat_cols`` list.

    Returns:
        List of column names to drop, without duplicates, in the order in
        which they were detected.
    """
    candidates = list(cat_cols if columns is None else columns)
    drop_cols = []
    for position, col in enumerate(candidates):
        # Only look at later columns so each pair is evaluated once.
        for col2 in candidates[position + 1:]:
            rate = cramers_v(df[col], df[col2])
            if rate > threshold:
                print(col, col2, rate)
                if col2 not in drop_cols:
                    drop_cols.append(col2)
    return drop_cols

# Collect the categorical columns flagged by remove_cramers_v, then drop them from df in place.
drop_cols = remove_cramers_v(df)
df.drop(drop_cols, axis=1, inplace=True)

churn_telecom_project | Kaggle

参考

pandas.DataFrame.dropna — pandas 1.4.2 documentation

kaggle1位の解析手法 「Home Credit Default Risk 債務不履行の予測」①データ理解 – S-Analysis

churn_telecom_project | Kaggle

2
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?