0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

Kaggle Masterに学ぶ機械学習実践アプローチ 写経 06

Last updated at Posted at 2024-08-13
# %%
import pandas as pd 
import numpy as np

# %%
# Timestamps every 10 hours between the two dates, as a Series so that the
# ``.dt`` accessor is available.
s = pd.date_range(start='2020-01-06', end='2020-01-10', freq='10h').to_series()

# %%
s

# %%
# Pull a handful of calendar attributes off the ``.dt`` accessor and keep
# each one as a plain NumPy array keyed by the attribute name.
features = {
    attribute: getattr(s.dt, attribute).values
    for attribute in (
        "dayofweek",
        "dayofyear",
        "hour",
        "is_leap_year",
        "quarter",
    )
}

# %%
features

# %%
def generate_features(df):
    """Add calendar columns to ``df`` and aggregate them per customer.

    The frame is modified in place: ``year``, ``month``, ``dayofweek`` and
    ``weekend`` columns are written from the ``date`` column before the
    aggregation runs.

    Args:
        df: DataFrame with a datetime-like ``date`` column, a numeric
            ``num1`` column and a ``customer_id`` column.

    Returns:
        A DataFrame with one row per ``customer_id`` (restored as a regular
        column by ``reset_index``) holding the aggregated statistics listed
        in ``aggs`` below.
    """
    dates = df["date"].dt
    df.loc[:, 'year'] = dates.year
    df.loc[:, 'month'] = dates.month
    df.loc[:, 'dayofweek'] = dates.dayofweek
    # weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df.loc[:, 'weekend'] = (dates.weekday >= 5).astype(int)
    # NOTE: a week-of-year feature used to be built from ``.dt.weekofyear``;
    # that accessor is gone in recent pandas (``.dt.isocalendar().week`` is
    # the replacement), so the feature is intentionally left out here.

    # Statistics to compute for each column within every customer group.
    # Both customer_id statistics must live in the same list: assigning the
    # key twice keeps only the last assignment and silently drops 'size'.
    aggs = {
        'month': ['nunique', 'mean'],
        'num1': ['sum', 'max', 'min', 'mean'],
        'customer_id': ['size', 'nunique'],
    }
    print(aggs)

    agg_df = df.groupby('customer_id').agg(aggs)
    return agg_df.reset_index()

# %%
# Toy transaction table: one timestamp every 10 hours, plus a random dummy
# amount (num1) and a random dummy customer id for each row.
df = pd.DataFrame(
    {'date': pd.date_range('2020-01-06', '2020-01-10', freq='10h')}
).assign(
    num1=lambda frame: np.random.randint(0, 100, size=len(frame)),
    customer_id=lambda frame: np.random.randint(1, 10, size=len(frame)),
)

df_w_features = generate_features(df)

# %%
df_w_features

# %%
import numpy as np 

# %%
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# %%
feature_dict = {
    # Basic descriptive statistics
    'mean': np.mean(x),
    'max': np.max(x),
    'min': np.min(x),
    'std': np.std(x),
    'var': np.var(x),
    # Peak-to-peak: difference between the maximum and the minimum
    'ptp': np.ptp(x),
    # Percentiles (argument on a 0-100 scale)
    'percentile_10': np.percentile(x, 10),
    'percentile_60': np.percentile(x, 60),
    'percentile_90': np.percentile(x, 90),
    # Quantiles (argument on a 0-1 scale)
    'quantile_5': np.quantile(x, 0.05),
    'quantile_95': np.quantile(x, 0.95),
    'quantile_99': np.quantile(x, 0.99),
}

feature_dict

# %%
print(feature_dict)

# %%
from tsfresh.feature_extraction import feature_calculators as fc

# %%
# Extend the statistics dictionary with tsfresh calculators; every entry is
# stored under the name of the calculator that produced it.
for calculator_name in (
    'abs_energy',
    'count_above_mean',
    'count_below_mean',
    'mean_abs_change',
    'mean_change',
):
    feature_dict[calculator_name] = getattr(fc, calculator_name)(x)


# %%
feature_dict

# %%
# 100 rows of uniform random values in two columns named f_1 and f_2.
column_names = [f"f_{i}" for i in range(1,3)]
df = pd.DataFrame(np.random.rand(100, 2), columns=column_names)

# %%
df

# %%
from sklearn import preprocessing

# %%
# Degree-2 polynomial expansion: squares and pairwise products are kept
# (interaction_only=False) and no constant bias column is added.
pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)

# %%
pf.fit(df)

# %%
poly_feats = pf.transform(df)

# %%
poly_feats

# %%
# Number of generated feature columns (last axis of the transformed array).
num_feats = poly_feats.shape[-1]
num_feats

# %%
# Wrap the transformed array in a frame with columns numbered from f_1.
poly_columns = [f"f_{i}" for i in range(1, num_feats + 1)]
df_transformed = pd.DataFrame(poly_feats, columns=poly_columns)
df_transformed

# %%
# Hand-computed product of two literal values.
# NOTE(review): presumably copied from one row of a previous run to check an
# interaction column by eye; the data is random, so it will not match a rerun.
print(0.489843 * 0.084497)

# %%
# Split the numeric column f_1 into 10 bins and into 100 bins; labels=False
# stores the integer bin index instead of an interval label.
for bin_column, bin_count in (("f_bin_10", 10), ("f_bin_100", 100)):
    df[bin_column] = pd.cut(df["f_1"], bins=bin_count, labels=False)

# %%
df

# %%
# Add a column of uniform random values in [0, 10000), one per existing row.
# Drawing exactly len(df) values avoids building a far larger frame whose
# extra rows would be silently discarded by index alignment on assignment.
df["f_3"] = np.random.rand(len(df)) * 10000

# %%
df

# %%
# Log transform to compress the wide range of f_3. log1p computes log(1 + x),
# so a value of 0 maps to 0 instead of -inf.
df["f_3_log"] = np.log1p(df["f_3"])

# %%
df

# %%
import numpy as np
from sklearn import impute

# %%
# 10 x 6 matrix of random integers drawn from 1..14.
X = np.random.randint(low=1, high=15, size=(10, 6))

# %%
X

# %%
# NaN can only be stored in a floating-point array, so convert first.
X = X.astype(float)

# %%
X

# %%
# Blank out 10 distinct cells, chosen at random over the flattened matrix.
nan_positions = np.random.choice(X.size, size=10, replace=False)
X.flat[nan_positions] = np.nan

# %%
X

# %%
# Impute the missing cells of X with a k-nearest-neighbours imputer that
# uses 2 neighbours; the filled matrix is the value of the last expression.
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit(X).transform(X)

# %%

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?