Getting an overview of the data
import pandas as pd
import numpy as np
print(df.shape)  # rows x columns
print(df.ndim)   # number of dimensions
print(df.size)   # total number of elements
# What dtypes the table contains, and how many columns of each
df.dtypes.value_counts()
# How many unique values each object-type column has
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
# Which values appear in a column, and how many of each
df["CODE_GENDER"].value_counts()
# Use subset to specify which column(s) to check for NaN when dropping rows
ans = df.dropna(subset=['Age'])
# Show rows where Age is null
df[df['Age'].isnull()]
# Show rows that contain any null
df_null = df[df.isnull().any(axis=1)]
# Set a value in a specific cell (row label 88, column 'Age')
df_test.loc[88, 'Age'] = 26
# Show rows matching a condition on a specific column
df[df['CODE_GENDER']=="XNA"]
# Drop rows matching a condition
df = df[df['CODE_GENDER']!="XNA"]
# Drop a specific column
df = df.drop("WEEKDAY_APPR_PROCESS_START", axis=1)
# Set NaN in a specific column for rows matching a condition
df.loc[df['dwllsize'] != 'A', 'dwllsize'] = np.nan
Encoding
df['CODE_GENDER'] = df['CODE_GENDER'].map({'F':1, 'M':0}).astype(int)
# Encoding using a library (scikit-learn's LabelEncoder)
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['Cabin'] = enc.fit_transform(df['Cabin'])
Batch encoding
Label-encode all object columns that have only two unique values in one pass.
def encode_categorical_variables(df):
    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0
    # Iterate through the columns
    for col in df:
        if df[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(df[col].unique())) <= 2:
                print(col)
                # Train on the training data
                le.fit(df[col])
                # Transform both training and testing data
                df[col] = le.transform(df[col])
                # Keep track of how many columns were label encoded
                le_count += 1
    print('%d columns were label encoded.' % le_count)
    return df
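A minimal usage sketch (assuming df is the DataFrame loaded in the snippets above):
df = encode_categorical_variables(df)
df.dtypes.value_counts()  # the two-category object columns are now numeric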
Type conversion
df['CNT_FAM_MEMBERS'] = df['CNT_FAM_MEMBERS'].astype(int)
Compressing numeric data ①
def to_numeric(df):
    cols_int = df.select_dtypes(include=['int64']).columns
    cols_float = df.select_dtypes(include=['float64']).columns
    for col in cols_int:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in cols_float:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df
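A minimal usage sketch (assuming df is already loaded), comparing memory usage before and after downcasting:
print(df.memory_usage(deep=True).sum() / 1024**2, 'MB')  # before
df = to_numeric(df)
print(df.memory_usage(deep=True).sum() / 1024**2, 'MB')  # after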
Compressing numeric data ②
# Memory Reduction
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
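A minimal usage sketch (assuming df is already loaded). Note that downcasting to float16 reduces precision, so check that the affected columns tolerate it:
df = reduce_mem_usage(df, verbose=True)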
Missing-value check function ①
def missing_values_summary(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'mis_val_count', 1: 'mis_val_percent'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        'mis_val_percent', ascending=False).round(1)
    print("Number of columns: " + str(df.shape[1]) + "\n" + "Columns with missing values: " + str(mis_val_table_ren_columns.shape[0]))
    return mis_val_table_ren_columns
missing_values_summary(df)
Missing-value check function ②
def null_values(df, rate=0):
    """a function to show null values with percentage"""
    nv = pd.concat([df.isnull().sum(), 100 * df.isnull().sum() / df.shape[0]], axis=1).rename(
        columns={0: 'Missing_Records', 1: 'Percentage (%)'})
    return nv[nv['Percentage (%)'] > rate].sort_values('Percentage (%)', ascending=False)
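For example, to list only the columns with more than 50% missing values (a sketch, assuming df is already loaded):
null_values(df, rate=50)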
Function to drop rows that have missing values
def drop_null_rows(df, threshold=0.05):
    # Ratio of missing values per column
    null_rows = df.isnull().sum() / len(df)
    # Keep only columns whose missing ratio is non-zero and below the threshold
    null_rows = null_rows[(null_rows > 0) & (null_rows < threshold)]
    print(null_rows)
    for col in null_rows.index:
        df = df[df[col].notnull()]
    return df
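A minimal usage sketch (assuming df is already loaded): rows are dropped only for columns missing less than 5% of their values.
df = drop_null_rows(df, threshold=0.05)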
Correlation check function
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
# Function to look at the relationship between the target (TARGET) and a given column
def correlation_survived(df, column):
    tmp_df = pd.DataFrame()
    tmp_df["survived ratio"] = df[[column, 'TARGET']].groupby(column)['TARGET'].mean()
    tmp_df["count"] = df[[column, 'TARGET']].groupby(column)['TARGET'].count()
    tmp_df["std"] = df[[column, 'TARGET']].groupby(column)['TARGET'].std()
    tmp_df[column] = tmp_df.index
    tmp_df = tmp_df.reset_index(drop=True)
    tmp_df = tmp_df.sort_values(by="survived ratio", ascending=False)
    display(tmp_df)
    sns.barplot(x=column, y='TARGET', data=df, palette='Set3', order=df.groupby(column)['TARGET'].mean().sort_values().index)
    plt.show()
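A minimal usage sketch (assuming df is the table with the binary TARGET column used above):
correlation_survived(df, 'CODE_GENDER')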
Correlation check function ②
def labels(ax, df, xytext=(0, 0)):
    # Annotate each bar with its share of the data (%) and its count
    for bar in ax.patches:
        ax.annotate('%{:.2f}\n{:.0f}'.format(100 * bar.get_height() / len(df), bar.get_height()),
                    (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    ha='center', va='center', size=11, xytext=xytext,
                    textcoords='offset points')

def plot_col(col, df, target='Churn', figsize=(20, 6)):
    fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
    plt.subplot(121)
    # Churn rate (%) per category of the column
    tmp = pd.crosstab(df[col], df[target], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0: 'NotChurn', 1: 'Churn'}, inplace=True)
    ax[0] = sns.countplot(x=col, data=df, hue=target,
                          order=np.sort(df[col].dropna().unique()))
    ax[0].tick_params(axis='x', rotation=90)
    labels(ax[0], df[col].dropna(), (0, 0))
    ax_twin = ax[0].twinx()
    # sns.set(rc={"lines.linewidth": 0.7})
    ax_twin = sns.pointplot(x=col, y='Churn', data=tmp, color='black', legend=False,
                            order=np.sort(df[col].dropna().unique()),
                            linewidth=0.1)
    ax[0].grid()
    plt.subplot(122)
    ax[1] = sns.countplot(x=df[col].dropna(),
                          order=np.sort(df[col].dropna().unique()))
    ax[1].tick_params(axis='x', rotation=90)
    labels(ax[1], df[col].dropna())
    plt.show()
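A minimal usage sketch (dwllsize and the Churn target come from the telecom-churn snippets above; adjust the names to your own data):
plot_col('dwllsize', df, target='Churn')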
Drop collinear columns whose correlation coefficient exceeds a threshold (numeric data)
cat_cols = [col for col in df.columns if df[col].dtype=='object']
num_cols = [col for col in df.columns if df[col].dtype!='object']
# Remove the highly collinear features from data
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.
    Inputs:
        x: features dataframe
        threshold: features with correlations greater than this value are removed
    Output:
        the set of column names to drop (one column of each highly correlated pair)
    '''
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i + 1):
            item = corr_matrix.iloc[j:(j + 1), (i + 1):(i + 2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])
    # One column of each pair of correlated columns
    drops = set(drop_cols)
    return drops
drop_col = remove_collinear_features(df[num_cols], 0.9)
df.drop(drop_col, axis=1, inplace=True)
Drop collinear columns whose correlation coefficient exceeds a threshold (categorical data)
import scipy.stats as sts
# References:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
def cramers_v(x, y):
    """ Calculate Cramér's V statistic for categorical-categorical association.
    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.
    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = sts.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
def remove_cramers_v(df):
    drop_cols = []
    # Check each pair of categorical columns once and mark one column of each
    # highly associated pair (Cramér's V > 0.9) for removal
    for i, col in enumerate(cat_cols):
        for col2 in cat_cols[i + 1:]:
            rate = cramers_v(df[col], df[col2])
            if rate > 0.9:
                print(col, col2, rate)
                drop_cols.append(col2)
    return list(set(drop_cols))
drop_cols = remove_cramers_v(df)
df.drop(drop_cols, axis=1, inplace=True)
churn_telecom_project | Kaggle
References
pandas.DataFrame.dropna — pandas 1.4.2 documentation
kaggle1位の解析手法 「Home Credit Default Risk 債務不履行の予測」①データ理解 – S-Analysis