【Python】Notes for Data Analysis


Purpose of this article

These are purely personal notes for data analysis work.
They are written to make copy-and-paste quick, so short one-liners such as df.head() are not included.

Loading libraries

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib

import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

Loading data

Pattern 1

filename = "hoge.csv"
df = pd.read_csv(filename, encoding='utf-8')

Pattern 2

dirname = "/foo/bar/.../"
filename = "hoge.csv"
filepath = os.path.join(dirname, filename)
df = pd.read_csv(filepath, encoding='utf-8')

Pattern 3 (all CSVs in a directory)

dirname = "/foo/bar/.../"
files = os.listdir(dirname)
csv_files = [fi for fi in files if fi.endswith(".csv")]

df_list = []
for csv_fi in csv_files:
    filepath = os.path.join(dirname, csv_fi)
    df_list.append(pd.read_csv(filepath, encoding='utf-8'))
df = pd.concat(df_list, ignore_index=True)
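
An alternative sketch using pathlib's glob (the directory path is a placeholder) that reads and concatenates in one expression:

from pathlib import Path

dirname = Path("/foo/bar")  # placeholder path
df = pd.concat(
    (pd.read_csv(p, encoding='utf-8') for p in sorted(dirname.glob("*.csv"))),
    ignore_index=True,
)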

Writing out data

filename = "huga.csv"
df.to_csv(filename, header=True, index=False)

DataFrame operations

Renaming columns

df = df.rename(columns={"before01":"after01", "before02":"after02"})

Changing a column's dtype

df = df.astype({"col": "category"})
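
For numeric and datetime conversions, pandas' dedicated converters are often safer than astype, because errors="coerce" turns unparsable values into NaN/NaT instead of raising. A minimal sketch with placeholder column names:

df["col_num"] = pd.to_numeric(df["col_num"], errors="coerce")
df["col_date"] = pd.to_datetime(df["col_date"], errors="coerce")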

Simple DataFrame concatenation

df = pd.concat([upper, lower])           # stack rows vertically

df = pd.concat([left, right], axis=1)    # align columns horizontally

LEFT JOIN

df = pd.merge(left, right, on="key", how='left')

df = pd.merge(left, right, left_on="lkey", right_on="rkey", how='left')

df = pd.merge(left, right, left_on=["lkey01", "lkey02"], right_on=["rkey01", "rkey02"], how='left')
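
When a key might not be unique, merge's validate and indicator options catch silent row duplication early. A minimal sketch assuming the same left/right frames as above:

df = pd.merge(left, right, on="key", how='left',
              validate="many_to_one",   # raise if "key" is not unique in right
              indicator=True)           # adds a _merge column: both / left_only
print(df["_merge"].value_counts())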

GROUP BY

df = df.groupby(by="col01", as_index=False).sum()

df = df.groupby(by=["col01", "col02"], as_index=False).agg({"col01": ['mean', 'count'], "col02":['std', 'var']})

#indexの振り直し(大体上とセットで使う)
df.reset_index(drop=True, inplace=True)
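
Named aggregation gives flat column names instead of the MultiIndex that the dict form produces; a minimal sketch with the same placeholder columns:

df = df.groupby(["col01", "col02"], as_index=False).agg(
    col03_mean=("col03", "mean"),
    col03_count=("col03", "count"),
    col04_std=("col04", "std"),
)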

Writing out to CSV

filename = "hoge.csv"
df.to_csv(filename, header=True, index=False)

Miscellaneous

Summary of basic statistics

!pip install pandas-profiling
import pandas_profiling as pdp

profile = pdp.ProfileReport(df)
profile.to_file(outputfile="myoutputfile.html")

# When running on Google Colab, pin the version
!pip install pandas-profiling==2.8.0
import pandas_profiling as pdp

profile = pdp.ProfileReport(df)
profile.to_file("data_profile.html")

Once the data is loaded, just run this first.

Counting occurrences

import collections

lis = ["Alice", "Alice", "Bob", "Bob", "Bob", "Carol"]
c = collections.Counter(lis)
c.most_common(3)  # [('Bob', 3), ('Alice', 2), ('Carol', 1)]
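
If the values are already in a pandas Series, value_counts does the same thing:

s = pd.Series(["Alice", "Alice", "Bob", "Bob", "Bob", "Carol"])
print(s.value_counts())                 # counts, descending
print(s.value_counts(normalize=True))   # proportions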

Progress bar

for i in tqdm(range(n)):
    pass  # do work here

# list-comprehension form
[foo for i in tqdm(range(n))]

Measuring execution time

%%timeit
# Jupyter cell magic: times the whole cell
pass  # code to time goes here

Garbage collection

import gc

gc.collect()

Flattening a list

from collections.abc import Iterable

def flatten(l):
    # Recursively yield atoms from an arbitrarily nested iterable,
    # treating str/bytes as atoms rather than iterables
    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el
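
The generator is lazy, so wrap it in list() to materialize the result:

nested = [1, [2, [3, 4]], [5]]
print(list(flatten(nested)))  # [1, 2, 3, 4, 5]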

Frequently used template

list01 = []
list02 = []

for i in tqdm(range(n)):
    v01 = ???
    list01.append(v01)
    v02 = ???
    list02.append(v02)

df = pd.DataFrame({"col01":list01, "col02":list02})

Evaluating model performance

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.calibration import calibration_curve

def several_score(y, y_proba):
    # Binarize probabilities at a fixed 0.5 threshold
    y_hat = [1 if yi > 0.5 else 0 for yi in y_proba]
    cm = confusion_matrix(y, y_hat)
    print("confusion matrix : \n", cm)
    print("accuracy\t : ", accuracy_score(y, y_hat))
    print("f1\t\t : ", f1_score(y, y_hat))
    print("auc\t\t : ", roc_auc_score(y, y_proba))

def roc_auc(y, y_proba, filename=""):
    # Compute micro-average ROC curve and ROC area
    fpr, tpr, _ = roc_curve(y, y_proba)
    roc_auc = auc(fpr, tpr)

    if filename!="":
        df = pd.DataFrame({"fpr":fpr,"tpr":tpr})
        df.to_csv(filename, index=None, encoding="utf_8_sig")

    plt.figure(figsize=(5,5))
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='gray', lw=lw, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("ROC and AUC")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

def deciles_plot(y, y_proba, bins=10, filename=""):
    df = pd.DataFrame({"y":y,"y_proba":y_proba}).sort_values("y_proba")
    labels = [i+1 for i in range(bins)]
    df["class"] = pd.qcut(df["y_proba"], bins, labels=labels)

    tmp = df.groupby("class").mean()
    tmp = pd.DataFrame(tmp).reset_index()
    tmp["class"] = tmp["class"].astype("int8")

    if filename!="":
        tmp.to_csv(filename, index=None, encoding="utf_8_sig")

    plt.figure(figsize=(6,4))
    lw = 2
    plt.plot(tmp["class"].values, tmp["y"].values, color='darkorange', lw=lw, marker='o', label="Positive Rate")
    overall_mean=df["y"].mean()
    plt.plot([1, bins], [overall_mean, overall_mean], color='gray', lw=lw, linestyle='--', label="Overall Mean")
    #plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Bin')
    plt.ylabel('Positive Rate')
    plt.title("Deciles Plot")
    plt.legend(loc="upper right")
    plt.xticks(np.arange(1,bins+1,1))
    plt.grid()
    plt.show()

def calibration_plot(y, y_proba, bins=10, filename=""):
    fraction_of_positives, mean_predicted_value = calibration_curve(y, y_proba, n_bins=bins)

    if filename!="":
        df = pd.DataFrame({"mean_predicted_value":mean_predicted_value,"fraction_of_positives":fraction_of_positives})
        df.to_csv(filename, index=None, encoding="utf_8_sig")

    plt.figure(figsize=(5,5))
    lw = 2
    plt.plot(mean_predicted_value, fraction_of_positives, color='darkorange',
            lw=lw, label='Calibration curve')
    plt.plot([0, 1], [0, 1], color='gray', label="Best", lw=lw, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Mean Predicted value')
    plt.ylabel('Fraction of Positives')
    plt.title("Calibration Plot")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

def check_metrics(y, y_proba, deciles_bins=10, calibration_bins=10):
    several_score(y, y_proba)
    roc_auc(y, y_proba)
    deciles_plot(y, y_proba, bins=deciles_bins)
    calibration_plot(y, y_proba, bins=calibration_bins)
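
A minimal usage sketch; y_true and y_pred_proba are placeholder names for the 0/1 labels and the predicted positive-class probabilities of some fitted classifier:

# y_true: 0/1 labels, y_pred_proba: predicted probabilities (placeholders)
check_metrics(y_true, y_pred_proba, deciles_bins=10, calibration_bins=10)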


Memo

def make_features(df_train, df_test):
    # Assumes module-level variables: target, cate_list, orderd_cate_list,
    # num_list, ordered_map_dict

    # Concatenate train and test and build features in one pass.
    # If the test data has no target column, fill it with 0 for now.
    if target not in df_test.columns:
        df_test[target] = 0

    df_train["train_data"] = True
    df_test["train_data"] = False
    df = pd.concat([df_train, df_test])
    del df_train, df_test

    # Normalize every NaN representation to np.nan
    # (add others such as "-" if they also mean NaN)
    df = df.replace("nan", np.nan).replace("NaN", np.nan).replace(float("nan"), np.nan)


    # categorical columns
    for col in cate_list:
        df = df.replace({col: {np.nan: "NaN"}})  # turn np.nan into the string "NaN" and treat it as its own category
        df[col] = df[col].astype('category')

    # ordered categorical columns
    for col in ordered_map_dict.keys():
        df[col] = [ordered_map_dict[col][di] for di in df[col]]
        df[col] = df[col].astype(np.float64)

    # numeric columns
    for col in num_list:
        df[col] = df[col].astype(np.float64)

    # Fill NaNs in numeric / ordered categorical columns and create is_nan indicator features
    for col in num_list + orderd_cate_list:
        if df[col].isnull().any():  # if the column contains NaN
            new_col = "is_nan_{0}".format(col)
            df[new_col] = df[col].isnull()
            df[col] = df[col].fillna(np.nanmedian(df[col]))

            # register new_col as a categorical feature
            cate_list.append(new_col)
            df[new_col] = df[new_col].astype('category')


    # date columns


    # Whole-DataFrame processing
    all_list = cate_list + orderd_cate_list + num_list
    df = df.loc[:, all_list + [target] + ["train_data"]]
    df_train = df[df["train_data"]].drop("train_data", axis=1)
    df_test = df[~df["train_data"]].drop("train_data", axis=1)
    del df

    print("cate_list : ", cate_list)
    print("orderd_cate_list : ", orderd_cate_list)
    print("num_list : ", num_list)
    print("")

    # Warn about columns whose categories differ between train and test
    for col in cate_list:
        if not sorted(list(df_train[col].unique())) == sorted(list(df_test[col].unique())):
            print("[warning] categories of {0} differ between train and test.".format(col))
            print("train : ", df_train[col].unique())
            print("test  : ", df_test[col].unique())
            print("")

    return df_train, df_test
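
A minimal usage sketch; the target, column names, and the ordered-category mapping are placeholders, since make_features reads them from module-level variables:

target = "y"
cate_list = ["gender"]                  # nominal columns (placeholder)
orderd_cate_list = ["size"]             # ordered columns, spelled as in the function above
num_list = ["age"]
ordered_map_dict = {"size": {"S": 0.0, "M": 1.0, "L": 2.0}}

df_train, df_test = make_features(df_train, df_test)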

Memo 2

from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
import lightgbm as lgb

def get_evaluate(y_test, predict):

    fpr, tpr, thr_arr = metrics.roc_curve(y_test, predict)

    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)

    return auc, precision, recall

n_splits=5
fold = KFold(n_splits=n_splits)

auc_list = []
precision_list = []
recall_list = []

# Split train further into three parts:
# train: for fitting
# valid: for early stopping during training
# eval:  for score computation

for idx_train, idx_valid_eval in fold.split(X_train):
    idx_valid, idx_eval = train_test_split(idx_valid_eval, train_size=0.5, random_state=2020)
    print(len(idx_train), len(idx_valid), len(idx_eval))

    lgb_train = lgb.Dataset(X_train.iloc[idx_train], y_train.iloc[idx_train])
    lgb_valid = lgb.Dataset(X_train.iloc[idx_valid], y_train.iloc[idx_valid], reference=lgb_train)
    X_eval = X_train.iloc[idx_eval]
    y_eval = y_train.iloc[idx_eval]


    model = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=100000, early_stopping_rounds=10)

    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]

    auc, precision, recall = get_evaluate(y_eval, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

print('KFold average AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                             np.mean(precision_list),
                                                             np.mean(recall_list)))

Plotting

# Figure size
plt.figure(figsize=(20,5))  # width 20 x height 5

# Rotate x-axis labels vertically so they do not overlap
plt.xticks(rotation=90)

# Guide line
plt.plot([x0,x1],[y0,y1], color="gray", linestyle="dashed")

# Get the default color cycle that plt uses
colors = plt.get_cmap("tab10").colors

# Axis ranges
plt.xlim((x0,x1))
plt.ylim((y0,y1))

plt.show()
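
A minimal sketch tying these snippets together, with dummy data purely for illustration:

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(12)
y = np.random.rand(12)

plt.figure(figsize=(20, 5))                        # wide figure
plt.bar(x, y, color=plt.get_cmap("tab10").colors[0])
plt.plot([x[0], x[-1]], [y.mean(), y.mean()],      # guide line at the mean
         color="gray", linestyle="dashed")
plt.xticks(x, ["label_{0}".format(i) for i in x], rotation=90)
plt.ylim((0, 1))
plt.show()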

Memo 3

########## datetime ##########
# date
date_list = []

########## category ##########
# nominal scale
nominal_list = []
# ordinal scale
ordinal_list = []

########## number ##########
# interval scale
interval_list = []
# ratio scale
ratio_list = []

drop_list = []
nominal_list = [col for col in nominal_list if col not in drop_list]
ordinal_list = [col for col in ordinal_list if col not in drop_list]
interval_list = [col for col in interval_list if col not in drop_list]
ratio_list = [col for col in ratio_list if col not in drop_list]

print("nominal_list : ", nominal_list)
print("ordinal_list : ", ordinal_list)
print("interval_list : ", interval_list)
print("ratio_list : ", ratio_list)

df = df_origine.copy()  # df_origine: the raw, untouched DataFrame

# NaN handling
if len(nominal_list)!=0:
    df.loc[:,nominal_list] = df.loc[:,nominal_list].fillna("unknown")
    tmp = pd.get_dummies(df.loc[:,nominal_list])
    nominal_dummy_list = list(tmp.columns)
    df = pd.concat([df, tmp], axis=1)
else:
    nominal_dummy_list=[]

df.loc[:,ordinal_list] = df.loc[:,ordinal_list].fillna(-1)
df.loc[:,interval_list] = df.loc[:,interval_list].fillna(df.loc[:,interval_list].median())
df.loc[:,ratio_list] = df.loc[:,ratio_list].fillna(df.loc[:,ratio_list].median())

# dtype set
#for col in nominal_list:
#    df[col] = df[col].astype("category")

for col in nominal_dummy_list+ordinal_list+interval_list+ratio_list:
    df[col] = df[col].astype("float")


# sep train/test data
train = df[df["tt_flag"]=="train"]
train = train_augmentation(train, power=10, seed=seed)  # user-defined helper (not shown here)
x_train = train.loc[:,nominal_dummy_list+ordinal_list+interval_list+ratio_list]
y_train = train.loc[:,target]

test = df[df["tt_flag"]=="test"]
x_test = test.loc[:,nominal_dummy_list+ordinal_list+interval_list+ratio_list]

del df

# Second variant: keep categoricals as category dtype for LightGBM instead of one-hot encoding
df = df_origine.copy()

# NaN handling
df.loc[:,nominal_list] = df.loc[:,nominal_list].fillna("unknown")
df.loc[:,ordinal_list] = df.loc[:,ordinal_list].fillna(-999)
df.loc[:,interval_list] = df.loc[:,interval_list].fillna(-999)
df.loc[:,ratio_list] = df.loc[:,ratio_list].fillna(-999)

# dtype set
for col in nominal_list:
    df[col] = df[col].astype("category")

for col in ordinal_list+interval_list+ratio_list:
    df[col] = df[col].astype("float")

# sep train/test data
train = df[df["tt_flag"]=="train"]
train = train_augmentation(train, power=10, seed=seed)
x_train = train.loc[:,nominal_list+ordinal_list+interval_list+ratio_list]
y_train = train.loc[:,target]

test = df[df["tt_flag"]=="test"]
x_test = test.loc[:,nominal_list+ordinal_list+interval_list+ratio_list]

del df

import optuna.integration.lightgbm as lgb

params = {"boosting_type":"gbdt",
          "objective":"regression",
          "metric":"rmse",
          "random_state":seed,
          'verbose': -1
         }

N_SPLITS = 5
kfold = KFold(n_splits=N_SPLITS, shuffle=True,  random_state=seed)

f_imp = pd.DataFrame()
i=0
y_train_pred = np.array([0.0]*x_train.shape[0])
y_val_pred = np.array([0.0]*x_train.shape[0])
y_test_pred_lgbm = np.array([0.0]*x_test.shape[0])

for tra_index, val_index in kfold.split(x_train, y_train):

    x_tra = x_train.iloc[tra_index]
    y_tra = y_train.iloc[tra_index]
    x_val = x_train.iloc[val_index]
    y_val = y_train.iloc[val_index]

    # Build the LightGBM datasets
    lgb_train = lgb.Dataset(x_tra, y_tra)
    lgb_valid = lgb.Dataset(x_val, y_val, reference=lgb_train)

    model = lgb.train(params,
                      train_set=lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      categorical_feature=nominal_list,
                      early_stopping_rounds=10,
                      verbose_eval=False,
                      show_progress_bar=False
                      )

    # Each row appears in the training split N_SPLITS-1 times, so average over those
    y_train_pred[tra_index] += model.predict(x_tra, num_iteration=model.best_iteration)/(N_SPLITS-1)
    # Each row appears in the validation split exactly once (out-of-fold prediction)
    y_val_pred[val_index] += model.predict(x_val, num_iteration=model.best_iteration)
    # Average the test prediction over all folds
    y_test_pred_lgbm += model.predict(x_test, num_iteration=model.best_iteration)/N_SPLITS

    tmp = pd.DataFrame(model.feature_importance("gain"), index=model.feature_name(), columns=["importance{0}".format(i)])

    f_imp = pd.concat([f_imp, tmp], axis=1)
    i+=1

    print("="*30)
    for key, value in model.params.items():
        print("    {}: {}".format(key, value))

f_imp["importance"] = f_imp.apply(np.mean,axis=1)

f_imp.sort_values(by="importance", ascending=False).iloc[:30]
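
To eyeball the result, a minimal plotting sketch of the averaged importances (purely illustrative):

top = f_imp.sort_values(by="importance", ascending=False).head(30)
plt.figure(figsize=(6, 8))
plt.barh(top.index[::-1], top["importance"][::-1])  # largest at the top
plt.xlabel("mean gain importance")
plt.title("LightGBM feature importance (KFold average)")
plt.tight_layout()
plt.show()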
