Kaggle Masterに学ぶ機械学習実践アプローチ: Code-Along 07

# %%
import pandas as pd 
from sklearn.datasets import fetch_california_housing
import numpy as np

# %%
# Load the California housing dataset as a Bunch object
data = fetch_california_housing()

# %%
data

# %%
X = data["data"]
col_names = data["feature_names"]
y = data["target"]

# %%
# Convert to a pandas DataFrame
df = pd.DataFrame(X, columns=col_names)

# %%
df

# %%
df["MedInc_Sqrt"] = df.MedInc.apply(np.sqrt)

# %%
# Pearson correlation matrix; MedInc and MedInc_Sqrt are,
# as expected, highly correlated with each other
df.corr()
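
# %%
# (Supplementary, not in the original) Appending the target as a column
# makes it easy to see how each feature, including MedInc_Sqrt, correlates
# with the prediction target.
df_target = df.copy()
df_target["target"] = y
df_target.corr()["target"].sort_values(ascending=False)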

# %%
from sklearn.feature_selection import (
    SelectKBest,
    SelectPercentile,
    chi2,
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
)

# %%
class UnivariateFeatureSelection:
    def __init__(self, n_features, problem_type, scoring):
        """
        Wrapper class around scikit-learn's univariate
        feature selection methods.
        :param n_features: uses SelectPercentile if float,
                           otherwise SelectKBest
        :param problem_type: "classification" or "regression"
        :param scoring: name of the scoring function
        """
        # Scoring functions valid for the given problem type
        # (note: chi2 requires non-negative feature values)
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        elif problem_type == "regression":
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }
        else:
            raise ValueError("problem_type must be 'classification' or 'regression'")

        if scoring not in valid_scoring:
            raise ValueError("scoring must be one of {}".format(list(valid_scoring)))

        # Use SelectKBest for an int n_features,
        # SelectPercentile for a float
        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(
                valid_scoring[scoring], percentile=int(n_features * 100)
            )
        else:
            raise ValueError("n_features must be an int or a float")

    def fit(self, X, y):
        self.selection.fit(X, y)
        self.pvalues_ = self.selection.pvalues_
        return self

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)


# %%
# Keep the top 10% of features scored by f_regression
ufs = UnivariateFeatureSelection(
    n_features=0.1, problem_type="regression", scoring="f_regression"
)

# %%
ufs.fit(X, y)

# %%
x_transformed = ufs.transform(X)

# %%
x_transformed
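
# %%
# (Supplementary) With n_features=0.1 the wrapper uses SelectPercentile
# with percentile=10, so only roughly the top 10% of the 8 features
# should survive; compare the shapes to confirm.
print(X.shape, x_transformed.shape)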

# %%
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing 

data = fetch_california_housing()
X = data["data"]
y = data["target"]

model = LinearRegression()
# Recursively refit the model and drop the weakest feature
# until only 4 features remain
rfe = RFE(estimator=model, n_features_to_select=4)
rfe.fit(X, y)
X_transformed = rfe.transform(X)

# %%
X_transformed
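
# %%
# (Supplementary) rfe.support_ is a boolean mask over the original
# columns; mapping it back to feature_names shows which 4 features
# RFE kept.
print([name for name, kept in zip(data["feature_names"], rfe.support_) if kept])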

# %%
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

data = load_diabetes()
X = data["data"]
y = data["target"]
col_names = data["feature_names"]

# Fit a random forest; its impurity-based feature importances
# are used for ranking below
model = RandomForestRegressor()
model.fit(X, y)


# %%
# Feature importances from the fitted forest, with indices
# sorted ascending for the horizontal bar plot
importances = model.feature_importances_
idxs = np.argsort(importances)


# %%
import matplotlib.pyplot as plt

# %%
plt.title("Feature Importances")
plt.barh(range(len(idxs)), importances[idxs], align="center")
plt.yticks(range(len(idxs)), [col_names[i] for i in idxs])
plt.xlabel("Importance")
plt.show()
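
# %%
# (Supplementary) The same ranking as a sorted table, for readers
# viewing the script without the rendered plot.
pd.Series(importances, index=col_names).sort_values(ascending=False)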

# %%
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel


data = load_diabetes()
X = data["data"]
y = data["target"]
col_names = data["feature_names"]

model = RandomForestRegressor()

# Select the features whose importance exceeds SelectFromModel's
# threshold (the mean importance, by default)
sfm = SelectFromModel(estimator=model)
X_transformed = sfm.fit_transform(X, y)

# Boolean mask of the selected features
support = sfm.get_support()

print([name for name, kept in zip(col_names, support) if kept])
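
# %%
# (Supplementary sketch) SelectFromModel defaults to the mean importance
# as its threshold; threshold="median" keeps roughly half the features
# instead.
sfm_median = SelectFromModel(estimator=RandomForestRegressor(), threshold="median")
print(sfm_median.fit_transform(X, y).shape)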

# %%
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn.datasets import make_classification

class GreedyFeatureSelection:
    """
    Simple greedy forward feature selection.
    Note: the model is fit and scored on the same data,
    so the reported AUC is optimistic.
    """
    def evaluate_score(self, X, y):
        # Fit a logistic regression and compute AUC
        # on the training data itself
        model = linear_model.LogisticRegression()
        model.fit(X, y)
        predictions = model.predict_proba(X)[:, 1]
        auc = metrics.roc_auc_score(y, predictions)
        return auc

    def _feature_selection(self, X, y):
        good_features = []
        best_scores = []
        num_features = X.shape[1]
        while True:
            this_feature = None
            best_score = 0
            # Try each remaining feature and keep the one
            # that improves the score the most
            for feature in range(num_features):
                if feature in good_features:
                    continue
                selected_features = good_features + [feature]
                xtrain = X[:, selected_features]
                score = self.evaluate_score(xtrain, y)
                if score > best_score:
                    best_score = score
                    this_feature = feature
            if this_feature is not None:
                good_features.append(this_feature)
                best_scores.append(best_score)
            # Exit once the newest score is worse than the previous one
            if len(best_scores) > 2:
                if best_scores[-1] < best_scores[-2]:
                    break
        # Drop the last entry: it is the one that made the score worse
        return best_scores[:-1], good_features[:-1]

    def __call__(self, X, y):
        scores, features = self._feature_selection(X, y)
        return X[:, features], scores
    

if __name__ == "__main__":
    # Binary classification data with 100 candidate features
    X, y = make_classification(n_samples=1000, n_features=100)
    X_transformed, scores = GreedyFeatureSelection()(X, y)
    print(X_transformed.shape)
    print(scores)
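
# %%
# (Supplementary sketch) evaluate_score fits and scores on the same data,
# so its AUC is optimistic. A cross-validated scorer is one common fix;
# this assumes the default 5-fold split is acceptable here.
from sklearn.model_selection import cross_val_score

def evaluate_score_cv(X, y):
    model = linear_model.LogisticRegression()
    return cross_val_score(model, X, y, scoring="roc_auc", cv=5).mean()

print(evaluate_score_cv(X, y))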


