# %%
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np
# %%
data = fetch_california_housing()
# %%
data
# %%
# Unpack the feature matrix, target vector and column names from the dataset.
X, y = data["data"], data["target"]
col_names = data["feature_names"]
# %%
# Convert the feature matrix to a pandas DataFrame.
df = pd.DataFrame(data=X, columns=col_names)
# %%
df
# %%
# Derived feature: element-wise square root of the MedInc column.
df["MedInc_Sqrt"] = np.sqrt(df["MedInc"])
# %%
# Correlation matrix of all columns, including the derived one.
df.corr()
# %%
from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression, SelectKBest, SelectPercentile
# %%
class UnivariateFeatureSelection:
    """Wrapper for univariate feature selection with scikit-learn.

    Chooses between ``SelectKBest`` and ``SelectPercentile`` based on the
    type of ``n_features`` and resolves the scoring function by name.
    """

    def __init__(self, n_features, problem_type, scoring):
        """
        Build the underlying scikit-learn selector.

        :param n_features: an int selects that many features with
            SelectKBest; a float is treated as a fraction and selects
            ``n_features * 100`` percent of the features with
            SelectPercentile
        :param problem_type: "classification" or "regression"
        :param scoring: name of the scoring function; one of "f_classif",
            "chi2", "mutual_info_classif" for classification, or
            "f_regression", "mutual_info_regression" for regression
        :raises ValueError: if problem_type, scoring or the type of
            n_features is not supported
        """
        # Scoring functions available for the requested problem type.
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        elif problem_type == "regression":
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }
        else:
            raise ValueError("problem_typeは'classification' or 'regression'でなければなりません")

        if scoring not in valid_scoring:
            # Report only the accepted names, not the function objects.
            raise ValueError("scoringは{}でなければなりません".format(list(valid_scoring)))
        score_func = valid_scoring[scoring]

        # bool is a subclass of int, so exclude it explicitly; True/False is
        # never a meaningful feature count.
        if isinstance(n_features, int) and not isinstance(n_features, bool):
            self.selection = SelectKBest(score_func=score_func, k=n_features)
        elif isinstance(n_features, float):
            # round() rather than int(): 0.29 * 100 evaluates to
            # 28.999999999999996, which int() would truncate to 28.
            self.selection = SelectPercentile(
                score_func=score_func, percentile=round(n_features * 100)
            )
        else:
            raise ValueError("n_featuresはint型かfloat型でなければなりません")

    def fit(self, X, y):
        """Fit the selector on (X, y), expose its p-values, and return self."""
        self.selection.fit(X, y)
        self.pvalues_ = self.selection.pvalues_
        return self

    def transform(self, X):
        """Return X reduced to the features chosen by the fitted selector."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the reduced X.

        Goes through ``fit`` so that ``pvalues_`` is populated as well.
        """
        return self.fit(X, y).transform(X)
# %%
# Build the wrapper for a regression problem scored with f_regression.
# n_features is a float, so the wrapper takes its SelectPercentile path
# (0.1 -> percentile 10).
ufs = UnivariateFeatureSelection(n_features=0.1, problem_type="regression", scoring="f_regression")
# %%
# Fit the selector on the California housing features and target loaded above.
ufs.fit(X, y)
# %%
# Keep only the columns chosen by the fitted selector.
x_transformed = ufs.transform(X)
# %%
x_transformed
# %%
from sklearn.datasets import fetch_california_housing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Reload the California housing data so this cell can run on its own.
data = fetch_california_housing()
X, y = data["data"], data["target"]

# Recursive feature elimination driven by a linear regression,
# asking for 4 features to be kept.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=4)
X_transformed = rfe.fit(X, y).transform(X)
# %%
X_transformed
# %%
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the diabetes dataset to obtain feature importances.
data = load_diabetes()
X, y = data["data"], data["target"]
col_names = data["feature_names"]
model = RandomForestRegressor()
model.fit(X, y)
# %%
# Indices that order the features from least to most important.
importances = model.feature_importances_
idxs = importances.argsort()
# %%
import matplotlib.pyplot as plt
# %%
# Horizontal bar chart of the importances in ascending order.
bar_positions = range(len(idxs))
bar_labels = [col_names[idx] for idx in idxs]
plt.title("Feature Importances")
plt.barh(bar_positions, importances[idxs], align="center")
plt.yticks(bar_positions, bar_labels)
plt.xlabel("Importance")
plt.show()
# %%
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Select features of the diabetes dataset using a random forest as the
# estimator inside SelectFromModel.
data = load_diabetes()
X = data["data"]
y = data["target"]
col_names = data["feature_names"]
model = RandomForestRegressor()
sfm = SelectFromModel(estimator=model)
X_transformed = sfm.fit_transform(X, y)

# One flag per input column; print the names of the columns that were kept.
# Descriptive loop names avoid reusing `y` (the target) inside the
# comprehension, and the flag is tested directly instead of `== True`.
support = sfm.get_support()
print([name for name, selected in zip(col_names, support) if selected])