この記事の目的
データ分析を行う際の完全な自分用のメモです.
コピペが捗るように作成しているので, df.head()
などの短いものは載せていません.
ライブラリ読み込み
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
データ読み込み
パターン1
# Pattern 1: CSV located in the current working directory.
filename = "hoge.csv"
df = pd.read_csv(filename, encoding='utf-8')
パターン2
# Pattern 2: build the full path from a directory and a file name.
dirname = "/foo/bar/.../"
filename = "hoge.csv"
filepath = os.path.join(dirname, filename)
df = pd.read_csv(filepath, encoding='utf-8')
パターン3 (ディレクトリ内のcsvすべて)
# Pattern 3: read every CSV file found in a directory.
dirname = "/foo/bar/.../"
files = os.listdir(dirname)
# str.endswith is clearer than slicing the last 4 chars, and also correct
# for file names shorter than 4 characters.
csv_files = [fi for fi in files if fi.endswith(".csv")]
for csv_fi in csv_files:
    filepath = os.path.join(dirname, csv_fi)
    # NOTE(review): tmp_df is overwritten on each iteration; collect the
    # frames (e.g. into a list) if you need them all.
    tmp_df = pd.read_csv(filepath, encoding='utf-8')
データ書き出し
# Write a DataFrame to CSV (keep the header row, drop the index).
filename = "huga.csv"
df.to_csv(filename, header=True, index=False)
データフレームの操作
カラムの名前変更
# Rename columns (old name -> new name).
df = df.rename(columns={"before01":"after01", "before02":"after02"})
カラムのデータ型変更
# Change a column's dtype (here to pandas' memory-efficient "category").
df = df.astype({"col": "category"})
データフレームの単純結合
# Simple concatenation: stack vertically (rows) ...
df = pd.concat([upper,lower])
# ... or side by side (columns).
df = pd.concat([left,right], axis=1)
LEFT JOIN
# LEFT JOIN: same key name on both sides.
df = pd.merge(left, right, on="key", how='left')
# Different key names on each side.
df = pd.merge(left, right, left_on="lkey", right_on="rkey", how='left')
# Composite (multi-column) join keys.
df = pd.merge(left, right, left_on=["lkey01", "lkey02"], right_on=["rkey01", "rkey02"], how='left')
GROUP BY
# GROUP BY: sum within each group; as_index=False keeps the keys as columns.
df = df.groupby(by="col01", as_index=False).sum()
# Multiple keys with per-column aggregation functions.
df = df.groupby(by=["col01", "col02"], as_index=False).agg({"col01": ['mean', 'count'], "col02":['std', 'var']})
# Re-number the index (usually used right after the above).
df.reset_index(drop=True, inplace=True)
csvに書き出し
# Write the result out to CSV.
filename = "hoge.csv"
df.to_csv(filename, header=True, index=False)
その他
基本統計量などまとめ
# One-shot EDA report (basic statistics, distributions, correlations, ...).
!pip install pandas-profiling
import pandas_profiling as pdp
profile = pdp.ProfileReport(df)
profile.to_file(outputfile="myoutputfile.html")
# When running on Google Colab
!pip install pandas-profiling==2.8.0
import pandas_profiling as pdp
profile = pdp.ProfileReport(df)
profile.to_file("data_profile.html")
データ読み込んだら, まずこれやればいい.
個数カウント
import collections
# Count how many times each element occurs and show the top 3.
lis = ["Alice"] * 2 + ["Bob"] * 3 + ["Carol"]
c = collections.Counter(lis)
c.most_common(3)
プログレスバー
# Progress bar around a loop (original was missing the `in` keyword).
for i in tqdm(range(n)):
    pass  # placeholder: do the per-iteration work here
# Comprehension form.
[foo for i in tqdm(range(n))]
計算時間を計測
%%timeit
foo bar  # placeholder: the statement(s) to be timed by the cell magic
ガベージコレクション
# Force a garbage-collection pass to release unreferenced memory.
import gc
gc.collect()
リストのflatten
import collections
def flatten(l):
    """Recursively yield the leaf elements of an arbitrarily nested iterable.

    Strings and bytes are treated as atoms (yielded whole) rather than being
    iterated character by character.
    """
    for el in l:
        if isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
よく使うテンプレ
# Template: compute column values in a loop, then assemble a DataFrame.
# (Original was missing `in` and used `???` placeholders, which is not
# valid Python; `...` keeps the placeholder while staying parseable.)
list01 = []
list02 = []
for i in tqdm(range(n)):
    v01 = ...  # compute the first value here
    list01.append(v01)
    v02 = ...  # compute the second value here
    list02.append(v02)
df = pd.DataFrame({"col01":list01, "col02":list02})
モデルの精度評価
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.calibration import calibration_curve
def several_score(y, y_proba):
    """Print confusion matrix, accuracy, F1 and AUC for a binary classifier.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted positive-class probabilities; hard labels use a 0.5 cutoff.
    """
    y_hat = [1 if yi > 0.5 else 0 for yi in y_proba]
    cm = confusion_matrix(y, y_hat)
    print("confusion matrix : \n", cm)
    print("accuracy\t : ", accuracy_score(y, y_hat))
    print("f1\t\t : ", f1_score(y, y_hat))
    # AUC is computed from the probabilities, not the thresholded labels.
    print("auc\t\t : ", roc_auc_score(y, y_proba))
def roc_auc(y, y_proba, filename=""):
    """Plot the ROC curve with its AUC; optionally dump (fpr, tpr) to CSV.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted positive-class probabilities.
    filename : str, optional
        When non-empty, the curve points are also written to this CSV path.
    """
    # Compute micro-average ROC curve and ROC area
    fpr, tpr, _ = roc_curve(y, y_proba)
    roc_auc = auc(fpr, tpr)  # local name intentionally shadows the function
    if filename != "":
        df = pd.DataFrame({"fpr": fpr, "tpr": tpr})
        df.to_csv(filename, index=None, encoding="utf_8_sig")
    plt.figure(figsize=(5, 5))
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='gray', lw=lw, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("ROC and AUC")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()
def deciles_plot(y, y_proba, bins=10, filename=""):
    """Plot the observed positive rate within each predicted-probability bin.

    Rows are sorted by predicted probability, cut into ``bins`` quantile
    bins, and the mean of ``y`` per bin is plotted against the overall mean.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted positive-class probabilities.
    bins : int, default 10
        Number of quantile bins (10 gives deciles).
    filename : str, optional
        When non-empty, the per-bin table is also written to this CSV path.
    """
    df = pd.DataFrame({"y": y, "y_proba": y_proba}).sort_values("y_proba")
    labels = [i + 1 for i in range(bins)]
    df["class"] = pd.qcut(df["y_proba"], bins, labels=labels)
    tmp = df.groupby("class").mean()
    tmp = pd.DataFrame(tmp).reset_index()
    tmp["class"] = tmp["class"].astype("int8")
    if filename != "":
        tmp.to_csv(filename, index=None, encoding="utf_8_sig")
    plt.figure(figsize=(6, 4))
    lw = 2
    plt.plot(tmp["class"].values, tmp["y"].values, color='darkorange', lw=lw, marker='o', label="Positive Rate")
    # Horizontal reference line at the overall positive rate.
    overall_mean = df["y"].mean()
    plt.plot([1, bins], [overall_mean, overall_mean], color='gray', lw=lw, linestyle='--', label="Overall Mean")
    #plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Each bins')
    plt.ylabel('Positive Rate')
    plt.title("Deciles Plot")
    plt.legend(loc="upper right")
    plt.xticks(np.arange(1, bins + 1, 1))
    plt.grid()
    plt.show()
def calibration_plot(y, y_proba, bins=10, filename=""):
    """Plot a reliability (calibration) curve for predicted probabilities.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted positive-class probabilities.
    bins : int, default 10
        ``n_bins`` passed to sklearn's ``calibration_curve``.
    filename : str, optional
        When non-empty, the curve points are also written to this CSV path.
    """
    fraction_of_positives, mean_predicted_value = calibration_curve(y, y_proba, n_bins=bins)
    if filename != "":
        df = pd.DataFrame({"mean_predicted_value": mean_predicted_value, "fraction_of_positives": fraction_of_positives})
        df.to_csv(filename, index=None, encoding="utf_8_sig")
    plt.figure(figsize=(5, 5))
    lw = 2
    plt.plot(mean_predicted_value, fraction_of_positives, color='darkorange',
             lw=lw, label='Calibration curve')
    # Diagonal = perfectly calibrated predictions.
    plt.plot([0, 1], [0, 1], color='gray', label="Best", lw=lw, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Mean Predicted value')
    plt.ylabel('Fraction of Positives')
    plt.title("Calibration Plot")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()
def check_metrics(y, y_proba, deciles_bins=10, calibration_bins=10):
    """Run the full evaluation suite: scores, ROC, deciles and calibration plots.

    Parameters
    ----------
    y : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted positive-class probabilities.
    deciles_bins, calibration_bins : int, default 10
        Bin counts forwarded to deciles_plot / calibration_plot.
    """
    several_score(y, y_proba)
    roc_auc(y, y_proba)
    deciles_plot(y, y_proba, bins=deciles_bins)
    calibration_plot(y, y_proba, bins=calibration_bins)
メモ
def make_features(df_train, df_test):
    """Build features on the concatenated train/test frames, then split back.

    Relies on module-level globals: ``target``, ``cate_list``,
    ``ordered_map_dict``, ``orderd_cate_list`` and ``num_list``.
    NOTE(review): ``cate_list`` is mutated in place (is_nan flag columns are
    appended), so repeated calls accumulate entries — confirm intended.

    Returns
    -------
    (df_train, df_test) : tuple of pd.DataFrame
        Feature frames restricted to the selected columns plus the target.
    """
    # Concatenate train and test so all features are built in one pass.
    # If the test data lacks the target column, fill it with 0 for now.
    if target not in df_test.columns:
        df_test[target] = 0
    df_train["train_data"] = True
    df_test["train_data"] = False
    df = pd.concat([df_train, df_test])
    del df_train, df_test
    # Normalise every NaN spelling to np.nan (add others, e.g. "-", if needed).
    df = df.replace("nan", np.nan).replace("NaN", np.nan).replace(float("nan"), np.nan)
    # Categorical columns: turn np.nan into the literal string "NaN" so that
    # missingness becomes its own category.
    for col in cate_list:
        df = df.replace({col: {np.nan: "NaN"}})
        df[col] = df[col].astype('category')
    # Ordered categoricals: map category -> numeric code, then cast to float.
    for col in ordered_map_dict.keys():
        df[col] = [ordered_map_dict[col][di] for di in df[col]]
        df[col] = df[col].astype(np.float64)
    # Numeric columns.
    for col in num_list:
        df[col] = df[col].astype(np.float64)
    # Fill NaNs in numeric / ordered-categorical columns with the median and
    # add an is_nan_<col> indicator for each column that had missing values.
    for col in num_list + orderd_cate_list:
        if df[col].isnull().any():
            new_col = "is_nan_{0}".format(col)
            df[new_col] = df[col].isnull()
            df[col] = df[col].fillna(np.nanmedian(df[col]))
            # Register the new indicator as a categorical feature.
            cate_list.append(new_col)
            df[new_col] = df[new_col].astype('category')
    # date features: not implemented yet.
    # Whole-frame postprocessing: keep only the selected columns, split back.
    all_list = cate_list + orderd_cate_list + num_list
    df = df.loc[:, all_list + [target] + ["train_data"]]
    df_train = df[df["train_data"]].drop("train_data", axis=1)
    df_test = df[~df["train_data"]].drop("train_data", axis=1)
    del df
    print("cate_list : ", cate_list)
    print("orderd_cate_list : ", orderd_cate_list)
    print("num_list : ", num_list)
    print("")
    # Warn about columns whose category sets differ between train and test.
    for col in cate_list:
        if not sorted(list(df_train[col].unique())) == sorted(list(df_test[col].unique())):
            print("[warning] {0} の種類がtrainとtestで違います.".format(col))
            print("train : ", df_train[col].unique())
            print("test : ", df_test[col].unique())
            print("")
    return df_train, df_test
メモ2
def get_evaluate(y_test, predict):
    """Return (AUC, precision, recall) for binary predictions.

    NOTE(review): if callers pass hard 0/1 labels (as the CV loop below
    does), the AUC is computed from those labels, not from probabilities —
    confirm that is intended.
    """
    fpr, tpr, thr_arr = metrics.roc_curve(y_test, predict)  # thr_arr unused
    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    return auc, precision, recall
# 5-fold CV; each held-out part is split in half again:
#   train : model fitting
#   valid : early stopping
#   eval  : score computation
n_splits = 5
fold = KFold(n_splits=n_splits)
auc_list = []
precision_list = []
recall_list = []
for idx_train, idx_valid_eval in fold.split(X_train):
    idx_valid, idx_eval = train_test_split(idx_valid_eval, train_size=0.5, random_state=2020)
    print(len(idx_train), len(idx_valid), len(idx_eval))
    lgb_train = lgb.Dataset(X_train.iloc[idx_train], y_train.iloc[idx_train])
    lgb_valid = lgb.Dataset(X_train.iloc[idx_valid], y_train.iloc[idx_valid], reference=lgb_train)
    X_eval = X_train.iloc[idx_eval]
    y_eval = y_train.iloc[idx_eval]
    model = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=100000, early_stopping_rounds=10)
    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]
    auc, precision, recall = get_evaluate(y_eval, predict)
    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))
    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)
# Average scores over the folds.
print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                        np.mean(precision_list),
                                                        np.mean(recall_list)))
プロット系
# Figure size
plt.figure(figsize=(20,5)) # width 20 x height 5
# Rotate x-axis labels so they do not overlap
plt.xticks(rotation=90)
# Guide line
plt.plot([x0,x1],[y0,y1], color="gray", linestyle="dashed")
# Get the default matplotlib color cycle
colors = plt.get_cmap("tab10").colors
# Plot range
plt.xlim((x0,x1))
plt.ylim((y0,y1))
plt.show()
メモ3
########## datetime ##########
# Date columns
date_list = []
########## category ##########
# Nominal scale
nominal_list = []
# Ordinal scale
ordinal_list = []
########## number ##########
# Interval scale
interval_list = []
# Ratio scale
ratio_list = []
# Columns to exclude from every feature list
drop_list = []
nominal_list = [col for col in nominal_list if col not in drop_list]
ordinal_list = [col for col in ordinal_list if col not in drop_list]
interval_list = [col for col in interval_list if col not in drop_list]
ratio_list = [col for col in ratio_list if col not in drop_list]
print("nominal_list : ", nominal_list)
print("ordinal_list : ", ordinal_list)
print("interval_list : ", interval_list)
print("ratio_list : ", ratio_list)
# Variant 1: one-hot encode nominal columns (suits linear / NN models).
# Uses module-level names: df_origine, target, seed, train_augmentation,
# and the feature-type lists defined above.
df = df_origine.copy()
# NaN dealing
if len(nominal_list) != 0:
    df.loc[:, nominal_list] = df.loc[:, nominal_list].fillna("unknown")
    tmp = pd.get_dummies(df.loc[:, nominal_list])
    nominal_dummy_list = list(tmp.columns)
    df = pd.concat([df, tmp], axis=1)
else:
    nominal_dummy_list = []
df.loc[:, ordinal_list] = df.loc[:, ordinal_list].fillna(-1)
df.loc[:, interval_list] = df.loc[:, interval_list].fillna(df.loc[:, interval_list].median())
df.loc[:, ratio_list] = df.loc[:, ratio_list].fillna(df.loc[:, ratio_list].median())
# dtype set
#for col in nominal_list:
#    df[col] = df[col].astype("category")
for col in nominal_dummy_list + ordinal_list + interval_list + ratio_list:
    df[col] = df[col].astype("float")
# sep train/test data
train = df[df["tt_flag"] == "train"]
train = train_augmentation(train, power=10, seed=seed)
x_train = train.loc[:, nominal_dummy_list + ordinal_list + interval_list + ratio_list]
y_train = train.loc[:, target]
test = df[df["tt_flag"] == "test"]
x_test = test.loc[:, nominal_dummy_list + ordinal_list + interval_list + ratio_list]
del df
# Variant 2: keep nominal columns as pandas categoricals (suits LightGBM).
df = df_origine.copy()
# NaN dealing
df.loc[:, nominal_list] = df.loc[:, nominal_list].fillna("unknown")
df.loc[:, ordinal_list] = df.loc[:, ordinal_list].fillna(-999)
df.loc[:, interval_list] = df.loc[:, interval_list].fillna(-999)
df.loc[:, ratio_list] = df.loc[:, ratio_list].fillna(-999)
# dtype set
for col in nominal_list:
    df[col] = df[col].astype("category")
for col in ordinal_list + interval_list + ratio_list:
    df[col] = df[col].astype("float")
# sep train/test data
train = df[df["tt_flag"] == "train"]
train = train_augmentation(train, power=10, seed=seed)
x_train = train.loc[:, nominal_list + ordinal_list + interval_list + ratio_list]
y_train = train.loc[:, target]
test = df[df["tt_flag"] == "test"]
x_test = test.loc[:, nominal_list + ordinal_list + interval_list + ratio_list]
del df
import optuna.integration.lightgbm as lgb
# LightGBM parameters: GBDT regression optimised for RMSE.
params = {"boosting_type":"gbdt",
"objective":"regression",
"metric":"rmse",
"random_state":seed,
'verbose': -1
}
# K-fold training with out-of-fold predictions and averaged test predictions.
N_SPLITS = 5
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
f_imp = pd.DataFrame()
i = 0
# Accumulators: in-fold train preds, out-of-fold preds, averaged test preds.
y_train_pred = np.array([0.0] * x_train.shape[0])
y_val_pred = np.array([0.0] * x_train.shape[0])
y_test_pred_lgbm = np.array([0.0] * x_test.shape[0])
for tra_index, val_index in kfold.split(x_train, y_train):
    x_tra = x_train.iloc[tra_index]
    y_tra = y_train.iloc[tra_index]
    x_val = x_train.iloc[val_index]
    y_val = y_train.iloc[val_index]
    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(x_tra, y_tra)
    lgb_valid = lgb.Dataset(x_val, y_val, reference=lgb_train)
    model = lgb.train(params,
                      train_set=lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      categorical_feature=nominal_list,
                      early_stopping_rounds=10,
                      verbose_eval=False,
                      show_progress_bar=False
                      )
    # Each training row appears in N_SPLITS-1 training folds, hence the divisor.
    y_train_pred[tra_index] += model.predict(x_tra, num_iteration=model.best_iteration) / (N_SPLITS - 1)
    y_val_pred[val_index] += model.predict(x_val, num_iteration=model.best_iteration)
    y_test_pred_lgbm += model.predict(x_test, num_iteration=model.best_iteration) / N_SPLITS
    tmp = pd.DataFrame(model.feature_importance("gain"), index=model.feature_name(), columns=["importance{0}".format(i)])
    f_imp = pd.concat([f_imp, tmp], axis=1)
    i += 1
# Show the tuned parameters of the last fold's model.
print("=" * 30)
for key, value in model.params.items():
    print("    {}: {}".format(key, value))
# Average the per-fold gain importances and show the top 30 features.
f_imp["importance"] = f_imp.apply(np.mean, axis=1)
f_imp.sort_values(by="importance", ascending=False).iloc[:30]