More than 5 years have passed since last update.

DataLiner用前処理パイプラインテンプレート

Last updated at 2020-05-12Posted at 2020-05-12

はじめに

いちいちReference見ながら処理足すのメンドクサイ！という人向けの全部入りテンプレートです。
用途に合わせてコメントアウトや入れ替え等どうぞ。

インストール

! pip install -U dataliner

テンプレ

import dataliner as dl
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

X, X_test, y = dl.load_titanic()

process = make_pipeline(
    dl.DropColumns(drop_columns=['PassengerId']),
    dl.DropNoVariance(),
    dl.GroupRareCategory(threshold=0.01),
    dl.DropHighCardinality(max_categories=50),
    dl.DropLowAUC(threshold=0.51),
    dl.ClipData(threshold=0.99),
    dl.BinarizeNaN(),
    dl.CountRowNaN(),
    dl.ImputeNaN(cat_strategy='mode', num_strategy='mean'),
    dl.AppendEncoder(dl.TargetMeanEncoding(k=0, f=1, smoothing=True)),
    dl.OneHotEncoding(drop_first=True),
#     dl.TargetMeanEncoding(k=0, f=1, smoothing=True),
#     dl.CountEncoding(),
#     dl.RankedCountEncoding(),
#     dl.FrequencyEncoding(),
#     dl.RankedTargetMeanEncoding(k=0, f=1, smoothing=True),
#     dl.RankedEvaluationMetricEncoding(metric='roc_auc'),
    dl.StandardScaling(),
#     dl.MinMaxScaling(),
    dl.UnionAppend([
        dl.AppendCluster(n_clusters=8, random_state=1234),
        dl.AppendAnomalyScore(n_estimators=100, random_state=1234),
        dl.AppendPrincipalComponent(n_components=5, random_state=1234),
        dl.AppendClusterTargetMean(n_clusters=8, random_state=1234),
        dl.AppendClusterDistance(n_clusters=8, random_state=1234),
        dl.AppendArithmeticFeatures(max_features=50, metric='roc_auc', operation='add', replace_zero=0.001),
        dl.AppendArithmeticFeatures(max_features=50, metric='roc_auc', operation='subtract', replace_zero=0.001),
        dl.AppendArithmeticFeatures(max_features=50, metric='roc_auc', operation='multiply', replace_zero=0.001),
        dl.AppendArithmeticFeatures(max_features=50, metric='roc_auc', operation='divide', replace_zero=0.001),
#        dl.AppendClassificationModel(model=RandomForestClassifier(), probability=False)
    ]),
#     dl.DropLowAUC(),
    dl.PermutationImportanceTest(threshold=0.0001),
    dl.DropHighCorrelation(threshold=0.95),
)

Xt = process.fit_transform(X, y)
Xt_test = process.transform(X)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up