Target Encodingのsmoothingの実装

Target Encodingのsmoothingなるものを自分で使うために実装してみました。


[Target Mean Encoding](https://qiita.com/suaaa7/items/cfe9a9e516b5b784570f)

[TargetEncodingのスムーシング](https://mikebird28.hatenablog.jp/entry/2018/06/14/172132)



import numpy as np
import pandas as pd

class TargetEncoding_ws(object):
    """Target encoding with sigmoid-based smoothing.

    Each category of the selected columns is replaced by a blend of the
    overall target mean and that category's target mean, both computed on
    the data passed to ``fit``.

    Parameters
    ----------
    list_cols : list[str]
        Names of the columns to encode.
    k : int, default 100
        Smoothing hyperparameter; each category count is divided by ``k``
        before being passed through the sigmoid.
    impute : bool, default True
        If true, categories that were not seen during ``fit`` are encoded
        as the mean of the training target; otherwise they stay NaN.
    """

    def __init__(self, list_cols, k=100, impute=True):
        self.df = None           # copy of the training frame, set by fit()
        self.target = None       # training target, set by fit()
        self.list_cols = list_cols
        self.k = k
        self.impute = impute
        self.target_map = {}     # column name -> Series (category -> encoded value)
        self.target_mean = None  # training-target mean, only set when impute is true

    def sigmoid(self, x, k):
        """Return ``1 / (1 + exp(-x / k))``, element-wise for array-likes."""
        return 1 / (1 + np.exp(-x / k))

    def fit_univariate(self, target, col):
        """Build the category -> encoded value mapping for a single column.

        Reads the training frame from ``self.df``, so ``fit`` must have
        stored it beforehand.

        Parameters
        ----------
        target : pd.Series or array-like
            Target values. A Series is aligned to the training frame by
            index; anything else is matched by position.
        col : str
            Name of the column to encode.

        Returns
        -------
        pd.Series
            Encoded value per category, indexed by the categories of ``col``.
        """
        keys = self.df[col]
        if isinstance(target, pd.Series):
            values = target.reindex(keys.index)
        else:
            values = pd.Series(np.ravel(target), index=keys.index)

        # Aggregate only the target, so unrelated (possibly non-numeric)
        # columns of the frame are never counted or averaged, and an existing
        # column called "target" cannot be clobbered.
        grouped = values.groupby(keys)
        n_i = grouped.count()
        lambda_n_i = self.sigmoid(n_i, self.k)
        category_mean = grouped.mean()

        # NOTE(review): the count-dependent weight multiplies the global mean,
        # so categories with more rows are pulled further towards the global
        # mean. Smoothing formulas in the literature usually put this weight
        # on the category mean instead. Kept as-is so that existing results
        # do not change -- confirm the intended direction.
        return lambda_n_i * values.mean() + (1 - lambda_n_i) * category_mean

    def fit(self, data, target):
        """Learn the encoding of every column in ``list_cols``.

        Parameters
        ----------
        data : pd.DataFrame
            Training features; must contain every column in ``list_cols``.
        target : pd.Series or np.ndarray
            Target values for the rows of ``data``.

        Returns
        -------
        TargetEncoding_ws
            The fitted encoder itself, so calls can be chained.
        """
        self.df = data.copy()
        self.target = target
        if self.impute:
            self.target_mean = target.mean()
        # Iterate the instance's own column list (not a module-level name)
        # and start from an empty map so a refit leaves no stale entries.
        self.target_map = {
            col: self.fit_univariate(target, col) for col in self.list_cols
        }
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the ``list_cols`` columns encoded.

        Parameters
        ----------
        x : pd.DataFrame
            Frame containing every column in ``list_cols``.

        Returns
        -------
        pd.DataFrame
            Copy of ``x``; the input frame is left untouched. Categories
            unseen during ``fit`` become the training-target mean when
            ``impute`` is true and NaN otherwise.
        """
        x_d = x.copy()
        for col in self.list_cols:
            encoded = x_d[col].map(self.target_map[col]).astype(float)
            if self.impute:
                encoded = encoded.fillna(self.target_mean)
            # Replace the whole column instead of writing through
            # ``.loc[:, col]``: the encoded values are floats while the
            # original column usually has a different dtype.
            x_d[col] = encoded
        return x_d


from sklearn.datasets import load_iris

# Build a demo frame from the iris dataset: the class label is used as the
# categorical feature and sepal length as the regression target.
data = load_iris()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df["cate1"] = data.target
df["cate2"] = df["cate1"] + 1  # second categorical column, to confirm that two or more columns work

y = df["sepal length (cm)"]
data = df.drop(columns="sepal length (cm)")

# The first 100 rows are used for fitting, the remaining rows for testing.
X_train, X_test = data.iloc[:100], data.iloc[100:]
y_train, y_test = y.iloc[:100], y.iloc[100:]


# First run: impute=False.
list_cols = ["cate1", "cate2"]
te = TargetEncoding_ws(list_cols=list_cols, k=200, impute=False)
te.fit(X_train, y_train)
sepal width (cm) petal length (cm) petal width (cm) cate1 cate2
0 3.5 1.4 0.2 5.267412 5.267412
1 3.0 1.4 0.2 5.267412 5.267412
2 3.2 1.3 0.2 5.267412 5.267412
3 3.1 1.5 0.2 5.267412 5.267412
4 3.6 1.4 0.2 5.267412 5.267412

| | sepal width (cm) | petal length (cm) | petal width (cm) | cate1 | cate2 |
|---|---|---|---|---|---|
| 100 | 3.3 | 6.0 | 2.5 | NaN | NaN |
| 101 | 2.7 | 5.1 | 1.9 | NaN | NaN |
| 102 | 3.0 | 5.9 | 2.1 | NaN | NaN |
| 103 | 2.9 | 5.6 | 1.8 | NaN | NaN |
| 104 | 3.0 | 5.8 | 2.2 | NaN | NaN |

# Second run: same columns and k, but with impute=True.
list_cols = ["cate1", "cate2"]
te = TargetEncoding_ws(list_cols=list_cols, k=200, impute=True)
te.fit(X_train, y_train)
sepal width (cm) petal length (cm) petal width (cm) cate1 cate2
0 3.5 1.4 0.2 5.267412 5.267412
1 3.0 1.4 0.2 5.267412 5.267412
2 3.2 1.3 0.2 5.267412 5.267412
3 3.1 1.5 0.2 5.267412 5.267412
4 3.6 1.4 0.2 5.267412 5.267412
sepal width (cm) petal length (cm) petal width (cm) cate1 cate2
100 3.3 6.0 2.5 5.471 5.471
101 2.7 5.1 1.9 5.471 5.471
102 3.0 5.9 2.1 5.471 5.471
103 2.9 5.6 1.8 5.471 5.471
104 3.0 5.8 2.2 5.471 5.471



