0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

Kaggle Masterに学ぶ機械学習実践アプローチ 写経 02

Last updated at Posted at 2024-07-20
# %%
# Quick check that the kernel executes cells.
print("hello")

# %% [markdown]
# # About overfitting

# %%
import pandas as pd
import duckdb

# %%
# Load winequality-red.csv from the current working directory.
df = pd.read_csv("winequality-red.csv")

# %%
# Re-label the quality scores 3..8 as consecutive class ids 0..5.
quality_mapping = {score: score - 3 for score in range(3, 9)}

# %%
df.loc[:, "quality"] = df["quality"].map(quality_mapping)

# %%
df

# %%
# Shuffle all rows, then renumber the index from zero.
df = (
    df.sample(frac=1)
    .reset_index(drop=True)
)

# %%
df

# %%
# First 1000 shuffled rows are used for training, the last 500 for testing.
df_train = df.iloc[:1000]
df_test = df.iloc[-500:]

# %%
df.columns

# %%
from sklearn import tree
from sklearn import metrics

# %%
# Fit one decision tree (max_depth=7) on the feature columns listed in
# `cols`, using `quality` as the class label.
clf = tree.DecisionTreeClassifier(max_depth=7)
cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']
clf.fit(df_train[cols], df_train.quality)

# %%
# Predictions on the rows the tree was fitted on.
train_predictions = clf.predict(df_train[cols])

# %%
# Predictions on the held-out rows.
test_predictions = clf.predict(df_test[cols])

# %%
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)

# %%
# Spelled `test_accuracy` (previously `test_accyracy`) so the name matches
# the one used by the depth-sweep loop later in the notebook.
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)

# %%
train_accuracy

# %%
test_accuracy

# %%
from sklearn import tree 
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 


# %%
matplotlib.rc("xtick", labelsize=20)

# %%
matplotlib.rc("ytick", labelsize=20)

# %%
%matplotlib inline

# %%
train_accuracies = [0.5]
test_accuracies = [0.5]

# %%
cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol']
for depth in range(1,25):
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf.fit(df_train[cols], df_train.quality)
    train_predictions = clf.predict(df_train[cols])
    test_predictions = clf.predict(df_test[cols])    
    train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)
    test_accuracy= metrics.accuracy_score(df_test.quality, test_predictions)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# %%
plt.figure(figsize=(10,5))

# %%
sns.set_style("whitegrid")
plt.plot(train_accuracies,label = "train_accuracy")
plt.plot(test_accuracies, label="test_accuracy")
plt.legend(loc="upper left", prop={'size':15})
plt.xticks(range(0,26,5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()

# %%
# Small demo: walk two lists in lockstep with zip() while enumerate()
# supplies a running counter.
names = ['Alice', 'Bob', 'Charlie']
ages = [1,2,3]
for i, pair in enumerate(zip(names, ages)):
    name, age = pair
    print(i, name, age)

# %%
df

# %% [markdown]
# # The k-fold method

# %%
# Add a fold-id column; every row starts at -1 until a fold is assigned.
df = df.assign(kfold=-1)

# %%
# Shuffle the rows and renumber the index from zero so that the positional
# indices produced by the splitter can be used as row labels.
df = (
    df.sample(frac=1)
    .reset_index(drop=True)
)

# %%
df

# %%
from sklearn import model_selection

# %%
kf = model_selection.KFold(n_splits=5)

# %%
type(df)

# %%
df

# %%
# Stamp each row with the number of the fold in which it is a validation row.
for fold, split in enumerate(kf.split(X=df)):
    trn_, val_ = split
    df.loc[val_, 'kfold'] = fold

# %%
df.to_csv("train_folds.csv", index=False)

# %% [markdown]
# # stratifiedKfold

# %%
# Reset the fold-id column; -1 marks rows that have no fold yet.
df["kfold"] = -1

# %%
# Shuffle the rows and renumber the index from zero so that the positional
# indices produced by the splitter can be used as row labels.
df = df.sample(frac=1).reset_index(drop=True)

# %%
df

# %%
# Class labels used to stratify the folds.
y = df.quality.values

# %%
kf = model_selection.StratifiedKFold(n_splits=5)

# %%
type(df)

# %%
df

# %%
type(y)

# %%
y

# %%
# Stamp each row with the number of the fold in which it is a validation row.
for i, (trn_, val_) in enumerate(kf.split(X=df, y=y)):
    df.loc[val_, "kfold"] = i

# %%
# index=False matches the train_folds.csv export: the index was just reset,
# so writing it would only add a meaningless unnamed first column.
df.to_csv("train_StratifiedKFold.csv", index=False)

# %% [markdown]
# 

# %% [markdown]
# # 回帰データセットに対するkfold

# %%
import numpy as np
import pandas as pd 
from sklearn import datasets
from sklearn import model_selection 

# %%
def create_folds(data, n_splits=5, random_state=None):
    """Assign stratified fold ids to a regression dataset.

    The continuous ``target`` column is discretised into equal-width bins
    and those bin labels are used as the stratification classes.

    Args:
        data: DataFrame that contains a numeric ``"target"`` column.
        n_splits: Number of folds to create (default 5).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the shuffle unseeded.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and two added
        columns: ``"kfold"`` (fold number in which the row is a validation
        row) and ``"bins"`` (the bin label used for stratification). The
        frame passed in is not modified.
    """
    # sample()/reset_index() produce a new frame, so every column added below
    # lands on the copy rather than on the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # Sturges' rule for the number of bins: floor(1 + log2(n)).
    num_bins = int(np.floor(1 + np.log2(len(data))))
    data["bins"] = pd.cut(data["target"], num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)
    # The index was reset above, so the positional validation indices from
    # split() are also valid row labels for .loc.
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[val_idx, "kfold"] = fold
    return data

# %%
X, y = datasets.make_regression(
    n_samples = 15000, n_features=100, n_targets=1 
)


# %%
type(X)

# %%
X.shape[0]

# %%
df = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

# %%
df

# %%
df["target"] = y

# %%
df

# %%
df = create_folds(df)

# %%
df.to_csv("train_bin_StratifiedKFold.csv")

# %%


# %%




0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?