# %%
import pandas as pd
import numpy as np
# %%
# 10-hour-spaced timestamps between 2020-01-06 and 2020-01-10, as a Series
# (to_series() gives access to the .dt datetime accessor below).
s = pd.date_range('2020-01-06', '2020-01-10', freq='10h').to_series()
# %%
s
# %%
# Pull several calendar attributes off the datetime accessor in one pass;
# each entry is the raw numpy array of that attribute.
calendar_attrs = ["dayofweek", "dayofyear", "hour", "is_leap_year", "quarter"]
features = {name: getattr(s.dt, name).values for name in calendar_attrs}
# %%
features
# %%
def generate_features(df):
    """Add date-derived columns and return per-customer aggregate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column (datetime64), a numeric 'num1'
        column and a 'customer_id' column.

    Returns
    -------
    pandas.DataFrame
        One row per customer_id; aggregate columns use a MultiIndex
        like ('num1', 'sum'), ('customer_id', 'size'), ...
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()
    df['year'] = df["date"].dt.year
    df['month'] = df["date"].dt.month
    df['dayofweek'] = df["date"].dt.dayofweek
    df['weekend'] = (df["date"].dt.weekday >= 5).astype(int)

    # Per-customer aggregations.
    # BUG FIX: the original assigned aggs['customer_id'] twice, so the
    # first value (['size']) was silently overwritten by ['nunique'];
    # both aggregates are kept here.
    aggs = {
        'month': ['nunique', 'mean'],
        'num1': ['sum', 'max', 'min', 'mean'],
        'customer_id': ['size', 'nunique'],
    }
    agg_df = df.groupby('customer_id').agg(aggs)
    return agg_df.reset_index()
# %%
# Dummy demo frame: 10-hourly timestamps plus random numeric / id columns.
dates = pd.date_range('2020-01-06', '2020-01-10', freq='10h')
df = pd.DataFrame({'date': dates})
df['num1'] = np.random.randint(0, 100, size=len(df))        # dummy numeric data
df['customer_id'] = np.random.randint(1, 10, size=len(df))  # dummy customer ids
df_w_features = generate_features(df)
# %%
df_w_features
# %%
import numpy as np
# %%
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# %%
# Collect simple summary statistics of x into one dictionary.
feature_dict = {
    'mean': np.mean(x),
    'max': np.max(x),
    'min': np.min(x),
    'std': np.std(x),
    'var': np.var(x),
    # peak-to-peak: difference between the max and the min
    'ptp': np.ptp(x),
}
# Selected percentiles (0-100 scale)
for p in (10, 60, 90):
    feature_dict[f'percentile_{p}'] = np.percentile(x, p)
# Tail quantiles (0-1 scale; note these are extremes, not the median)
for q in (5, 95, 99):
    feature_dict[f'quantile_{q}'] = np.quantile(x, q / 100)
feature_dict
# %%
print(feature_dict)
# %%
from tsfresh.feature_extraction import feature_calculators as fc
# %%
# Apply a handful of tsfresh time-series feature calculators to x,
# storing each result under the calculator's own name.
for calc_name in (
    'abs_energy',
    'count_above_mean',
    'count_below_mean',
    'mean_abs_change',
    'mean_change',
):
    feature_dict[calc_name] = getattr(fc, calc_name)(x)
# %%
feature_dict
# %%
# 100 rows of two uniform-random features, named f_1 and f_2.
df = pd.DataFrame(np.random.rand(100, 2), columns=["f_1", "f_2"])
# %%
df
# %%
from sklearn import preprocessing
# %%
# Degree-2 polynomial expansion of the two features; with include_bias=False
# the output columns are f_1, f_2, f_1^2, f_1*f_2, f_2^2.
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
# %%
# fit + transform in one call (equivalent to fit followed by transform)
poly_feats = pf.fit_transform(df)
# %%
poly_feats
# %%
num_feats = poly_feats.shape[1]
num_feats
# %%
# Wrap the expanded matrix back into a labelled DataFrame.
new_columns = [f"f_{i}" for i in range(1, num_feats + 1)]
df_transformed = pd.DataFrame(poly_feats, columns=new_columns)
df_transformed
# %%
# Spot-check one interaction term (f_1 * f_2 for a sample row) by hand.
print(0.489843*0.084497)
# %%
# Discretize the continuous column f_1 into equal-width bins;
# labels=False returns the integer bin index instead of an Interval.
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)
# %%
df
# %%
df["f_3"] = pd.DataFrame(np.random.rand(10000,1)*10000, columns=["f_3"])
# %%
df
# %%
df["f_3_log"] = np.log(df["f_3"] + 1)
# %%
df
# %%
import numpy as np
from sklearn import impute
# %%
# 10x6 matrix of random integers in [1, 15).
X = np.random.randint(1, 15, (10, 6))
# %%
X
# %%
# Cast to float so the array can represent missing values (NaN).
X = X.astype(float)
# %%
X
# %%
# Knock out 10 distinct entries at random to simulate missing data.
# X.ravel() is a view of the contiguous array, so assigning through it
# writes NaN back into X itself.
missing_positions = np.random.choice(X.size, 10, replace=False)
X.ravel()[missing_positions] = np.nan
# %%
X
# %%
# Fill each missing value using its 2 nearest-neighbor rows.
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)
# %%
# NOTE(review): the lines below are scraped Qiita page boilerplate, not part
# of the notebook; commented out so the file remains valid Python.
# Register as a new user and use Qiita more conveniently
# - You get articles that match your needs
# - You can efficiently read back useful information
# - You can use dark theme