前略

あるコンペで練習した際、使用した操作について五月雨投稿です。

DataFrameの操作

空のデータフレームを作る

df = pd.DataFrame(index=range(行数), columns=['a'], dtype='float64')

最初の列をインデックス列として読み込む

pd.read_csv('XXX.csv',index_col=0)

カラム名の変更

print(df.columns) # ['a','b','c']
df.columns = ['aa','bb','cc']
print(df.columns) # ['aa','bb','cc']

抽出

df[df.loc[:,'a']==0]

in条件の抽出

df[df['a'].isin(values)]  # values: list-like of the values to match in column 'a'

関数とセットでapplyを使う

# Select the two input columns first, then hand each row to impute_func.
df[['a', 'b']].apply(impute_func, axis=1)

def impute_func(cols):
    """Template for a row-wise function passed to ``DataFrame.apply(axis=1)``.

    Args:
        cols: One row of the selected columns (a pandas Series), or any
            plain sequence; the first two entries are read by position.

    Returns:
        None for now -- this is a skeleton; return the computed value here.
    """
    # Rows from apply(axis=1) are labelled by column name, so plain cols[0]
    # depends on a deprecated positional fallback. Use .iloc when available.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    col0 = values[0]
    col1 = values[1]

    # Compute the result from col0 and col1 here.

    return None  # placeholder: return the computed result

applyでif文（例えば正負判定）

# DataFrame.apply passes whole columns, so the scalar test must run on a single column (Series).
df['a'].apply(lambda x: 1 if x > 0 else 0)

pandasで1行ずつ処理

def func(df):
    """Replace column 'a' of ``df`` in place with its running total, row by row.

    Demonstrates explicit per-row processing: each output row is the current
    input value plus the previous output value.

    Args:
        df: DataFrame with a numeric column 'a' and labels 0..len(df)-1.

    Returns:
        None. ``df`` is modified in place.
    """
    n_rows = len(df)

    # Float work frame that collects the per-row results.
    acc = pd.DataFrame(index=range(n_rows), columns=['a'], dtype='float64')

    # The first row has no predecessor, so it is copied as is.
    acc.loc[0, 'a'] = df.loc[0, 'a']

    # From the second row on: current input + previous accumulated value.
    for cur in range(1, n_rows):
        prev = cur - 1
        acc.loc[cur, 'a'] = df.loc[cur, 'a'] + acc.loc[prev, 'a']

    df.loc[:, 'a'] = acc['a']

日付処理

日付と時刻をくっ付けて、時間を丸める

df['年月日時'] = pd.to_datetime(df['年月日']+' '+df['時刻']) # joins date and time strings, e.g. "2016-01-19" and "6:13"
df['年月日時'] = pd.DatetimeIndex(df['年月日時']).round('60min') # rounds to the NEAREST 60 minutes (use .floor('60min') to truncate instead)

数値の処理

四捨五入に近い丸め (注意: np.round は小数部分がちょうど .5 のとき最も近い偶数に丸める「偶数丸め」で、厳密な四捨五入ではない。例: np.round(2.5) は 2.0)

np.round(a)

切り捨て (小数部分を取り除く)

np.trunc(a)

切り捨て (小さい側の整数に丸める)

np.floor(a)

切り上げ (大きい側の整数に丸める)

np.ceil(a)

ゼロに近い側の整数に丸める

np.fix(a)

銀行丸め

np.round(df, decimals=n) # rounds to n decimal places; exact halves go to the nearest even digit (round-half-to-even)

pandasで描画

散布図

# scatter() takes a single y column, so draw 'b' and 'c' onto the same axes.
ax = df.plot.scatter(x='a', y='b', color='C0', label='b', alpha=1.0, figsize=(20, 10))
df.plot.scatter(x='a', y='c', color='C1', label='c', alpha=1.0, ax=ax)
plt.legend(loc='best')
plt.show()

棒グラフ

# One group of bars per value of 'a', with 'b' and 'c' side by side.
df.plot.bar(x='a', y=['b', 'c'], alpha=1.0, figsize=(20, 10))
plt.legend(loc='best')
plt.show()  # display the figure (plt.plot() with no data draws nothing)

箱ひげ図

# Box plot of 'b' per category 'a', split by hue 'c'.
plt.figure(figsize=(20, 10))
sns.boxplot(x='a', y='b', hue='c', data=df)
plt.show()  # display the figure (plt.plot() with no data draws nothing)

統計処理

要約統計量

df.groupby('a').describe()

平均値でクロス集計

pd.crosstab(index=df['a'], columns=df['b'],values=df['c'], aggfunc='mean')

クロス表変換

df.columns  # expected columns: 年月, カテゴリ, 値
df = df.set_index(['カテゴリ', '年月'])
df = df.unstack(0).reset_index()

# After unstack the columns are a two-level MultiIndex. Read the per-column
# value of each level directly (MultiIndex.labels no longer exists in pandas).
col_level_1 = df.columns.get_level_values(0)
col_level_2 = df.columns.get_level_values(1)

# Flatten to single-level names of the form "<level 0>_<level 1>".
recolnames = [x + "_" + y for x, y in zip(col_level_1, col_level_2)]
df.columns = recolnames
df.rename(columns={'年月_': '年月'}, inplace=True)
df.fillna(0, inplace=True)  # fill the NaNs created by the reshape with 0

機械学習

データのスプリット

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

# K-fold splitter: 5 shuffled folds with a fixed seed.
skf = KFold(n_splits=5, shuffle=True, random_state=5)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Each iteration yields positional row indices for one train/validation split.
for train_index, test_index in skf.split(X_train, Y_train):
    X_cv_train = X_train.iloc[train_index]
    X_cv_test = X_train.iloc[test_index]  
    y_cv_train = Y_train.iloc[train_index]
    y_cv_test = Y_train.iloc[test_index]

    # Per-fold processing goes here.

submission作成

submission = pd.DataFrame({
    'a':X_train['a'],    
    'b':Y_prediction
})

ランダムフォレスト（回帰）

from sklearn.ensemble import RandomForestRegressor

# criterion='mse', max_features='auto' and min_impurity_split=None only
# restated old defaults and were later renamed/removed in scikit-learn,
# so they are omitted; the fitted model is unchanged.
forest = RandomForestRegressor(
    bootstrap=True,
    max_depth=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=10,
    n_jobs=1,
    oob_score=False,
    random_state=2525,
    verbose=0,
    warm_start=False,
)

forest.fit(X_train, Y_train)

# Coefficient of determination (R^2) on the training data, as a percentage.
acc_log = round(forest.score(X_train, Y_train) * 100, 2)
print('R2:', acc_log, '%')

ランダムフォレスト（分類）

from sklearn.ensemble import RandomForestClassifier

# max_features='auto' and min_impurity_split=None only restated old defaults
# and were later removed from scikit-learn, so they are omitted.
forest = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=2,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

forest.fit(X_train, Y_train)

predicted_labels = forest.predict(X_train)

# Accuracy on the training data.
print('正解率:', round(forest.score(X_train, Y_train.values), 2))

グリッドサーチ

# GridSearchCV lives in sklearn.model_selection (sklearn.grid_search was removed).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base model. Values listed in param_grid below override the ones given here.
# max_features='auto' and min_impurity_split=None only restated old defaults
# and were later removed from scikit-learn, so they are omitted.
mod = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=2,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

# NOTE: the integer max_features candidates require at least 12 feature columns.
param_grid = {
    'n_estimators': [1, 5, 10, 20],
    'max_features': [3, 5, 10, 12],
    'random_state': [2525],
    'n_jobs': [1],
    'min_samples_split': [2, 3, 5],
    'max_depth': [3, 5, 10, 15],
}

# The removed fit_params / iid arguments are no longer passed.
gs = GridSearchCV(
    cv=5,  # number of CV folds (StratifiedKFold is used for classifiers)
    error_score='raise',  # re-raise errors that occur while fitting
    estimator=mod,  # model to tune
    n_jobs=1,  # parallel jobs
    param_grid=param_grid,  # parameter grid to search
    pre_dispatch='2*n_jobs',  # jobs dispatched during parallel execution (library default)
    refit=True,  # refit on the whole data with the best parameters
    return_train_score=False,  # do not keep training scores
    scoring=None,  # use the estimator's default score
    verbose=0,  # logging level
)

gs.fit(X_train, Y_train)

print(gs.best_estimator_)

ランダムフォレストの重要度可視化

def feature_viz(df, forest):
    """Print and plot the feature importances of a fitted forest, highest first.

    Args:
        df: DataFrame whose columns supply the feature names; they are
            matched to the importances by position.
        forest: Fitted estimator exposing ``feature_importances_``.

    Returns:
        None. Prints one ranked line per feature and shows a bar chart.
    """
    # Importance of each feature, in column order.
    feature = forest.feature_importances_

    # Feature names.
    label = df.columns

    # Positions of the features sorted by importance, descending.
    indices = np.argsort(feature)[::-1]

    for rank, idx in enumerate(indices, start=1):
        print(rank, label[idx], feature[idx], sep="   ")

    positions = range(len(feature))
    plt.title('Feature Importance')
    plt.bar(positions, feature[indices], color='lightblue', align='center')
    plt.xticks(positions, label[indices], rotation=90)
    plt.xlim([-1, len(feature)])
    plt.tight_layout()
    plt.show()

クロスバリデーション

from sklearn.model_selection import cross_val_score
# fit_params=None only restated the default and was later renamed in
# scikit-learn, so it is omitted.
scores = cross_val_score(
    estimator=mod,
    X=X_train,
    y=Y_train,
    cv=3,
    groups=None,  # group labels for the samples (only used by group-aware splitters)
    scoring=None,  # use the estimator's default score
    n_jobs=1,  # parallel jobs
    verbose=0,  # logging level
    pre_dispatch='2*n_jobs',  # jobs dispatched during parallel execution (library default)
    error_score='raise',  # re-raise errors that occur while fitting
)
print(scores)

ホールドアウト

from sklearn.model_selection import StratifiedKFold
# Hold-out split: use only the first fold of a shuffled 2-fold stratified split.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
# skf = KFold(n_splits=2,shuffle=True,random_state=0)

# Split the same X_train / Y_train that are indexed below, so the positional
# indices line up; only the first (train, test) pair is needed.
train_index, test_index = next(skf.split(X_train, Y_train))
x_train = X_train.iloc[train_index]
x_test = X_train.iloc[test_index]
y_train = Y_train.iloc[train_index]
y_test = Y_train.iloc[test_index]

XGBoost（分類）

import xgboost as xgb
# The undefined **kwargs expansion and the deprecated silent / nthread / seed /
# missing arguments (all left at their defaults) are no longer passed.
mod = xgb.XGBClassifier(
    max_depth=6,  # maximum tree depth
    learning_rate=0.1,  # learning rate
    n_estimators=100,  # number of trees
    objective='binary:logistic',  # learning task
    booster='gbtree',  # booster type
    n_jobs=1,  # parallel jobs
    gamma=0,  # minimum loss reduction required to split further
    min_child_weight=1,  # minimum sum of instance weight (hessian) needed in a child
    max_delta_step=0,  # maximum delta step allowed for each tree's weight estimate
    subsample=1,  # subsample ratio of the training instances
    colsample_bytree=1,  # column subsample ratio when building each tree
    colsample_bylevel=1,  # column subsample ratio for each level
    reg_alpha=0,  # L1 regularization
    reg_lambda=1,  # L2 regularization
    scale_pos_weight=1,  # balance of positive and negative weights
    base_score=0.5,  # initial prediction score of all instances (global bias)
    random_state=0,
)

mod.fit(X_train, Y_train)
predicted_labels = mod.predict(X_train)

statsmodels OLS

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Select several columns with a list (double brackets); a bare 'x1','x2' is a tuple key.
# add_constant adds the intercept column to the design matrix.
X = sm.add_constant(X_train[['x1', 'x2']])

mod = sm.OLS(Y_train['target'], X)
result = mod.fit()
print(result.summary())

Y_prediction = result.predict(X)  # predictions for the training rows

GLM

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Put features and target side by side so the formula can refer to both by column name.
data = pd.concat([X_train, Y_train],axis=1)

'''
print(dir(sm.genmod.families.links))
'cauchy', 'cloglog', 'identity', 'inverse_power', 'inverse_squared', 'log', 'logit', 'nbinom', 'np', 'probit', 'scipy', 'sqrt'
'''

# Link function for the GLM (log link here; identity kept as an alternative below).
# NOTE(review): newer statsmodels releases appear to expect a link *instance* with a
# capitalised class name (e.g. links.Log()) -- confirm against the installed version.
link = sm.genmod.families.links.log
# link = sm.genmod.families.links.identity

# Poisson family using the link chosen above.
family = sm.families.Poisson(link=link)

# Intercept plus x1..x4 with all of their interaction terms.
formula = 'target ~  1 + x1 * x2 * x3 * x4'

mod = smf.glm(formula = formula, data = data, family = family)
result = mod.fit() 
# print(result.summary())

# Predict on the training features, passed as a dict of column name -> Series.
X = X_train.to_dict(orient='series')
Y_prediction = result.predict(X)

# AIC
result.aic

評価関数作成

評価関数作成（WMAE）

from sklearn.metrics import make_scorer
def my_custom_loss_func(y_true, y_pred):
    """Return the weighted mean absolute error (WMAE) as a single scalar.

    Each sample is weighted by ``1 + 10**4 * y_true``; the weighted absolute
    errors are summed and divided by the sum of the weights.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The weighted mean absolute error as a float scalar.
    """
    # Convert up front so plain lists work too (10**4 * list would repeat the list).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    weights = 1 + 10**4 * y_true
    weighted_abs_error = weights * np.abs(y_true - y_pred)

    # Normalise by the total weight, not by the weighted errors themselves,
    # so that a scorer receives one number rather than an array.
    return np.sum(weighted_abs_error) / np.sum(weights)

WMAE_loss = make_scorer(my_custom_loss_func, greater_is_better=False)

WMAE_loss(forest, X_train, Y_train.values.reshape(-1))

混同行列

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)

予測値のグラフ化

# グラフ作成
plt.figure(figsize=(20, 6))

plt.plot(x, y, ".")
plt.plot(x, Y_prediction, ".")
plt.legend(['target','prediction'])

データサイエンスライブラリ基本形

前略

DataFrameの操作

空のデータフレームを作る

最初の列をインデックス列として読み込む

カラム名の変更

抽出

in条件の抽出

関数とセットでapplyを使う

applyでif文 例えば正負判定

pandasで1行ずつ処理

日付処理

日付と時刻をくっ付けて、時間を丸める

数値の処理

四捨五入に近い丸め (注意: np.round は小数部分がちょうど .5 のとき最も近い偶数に丸める「偶数丸め」で、厳密な四捨五入ではない。例: np.round(2.5) は 2.0)

切り捨て (小数部分を取り除く)

切り捨て (小さい側の整数に丸める)

切り上げ (大きい側の整数に丸める)

ゼロに近い側の整数に丸める

銀行丸め

pandasで描画

散布図

棒グラフ

箱ひげ図

統計処理

要約統計量

平均値でクロス集計

クロス表変換

機械学習

データのスプリット

submission作成

ランダムフォレスト（回帰）

ランダムフォレスト（分類）

グリッドサーチ

ランダムフォレストの重要度可視化

クロスバリデーション

ホールドアウト

XGBoost（分類）

statsmodels OLS

GLM

評価関数作成

評価関数作成（WMAE）

混同行列

予測値のグラフ化

草々

applyでif文（例えば正負判定）