# %%
# Print a fixed greeting (presumably a quick check that the kernel runs).
print("hello")
# %% [markdown]
# # 過学習について
# %%
import pandas as pd
import duckdb
# %%
# Load winequality-red.csv from the current working directory.
df = pd.read_csv("winequality-red.csv")
# %%
# Re-index the quality scores 3..8 onto consecutive class labels 0..5.
quality_mapping = {score: label for label, score in enumerate(range(3, 9))}
# %%
# Replace each quality score with its class label; a score that is not a key
# of the mapping would come back as NaN from Series.map.
df.loc[:, "quality"] = df.quality.map(quality_mapping)
# %%
df
# %%
# Shuffle every row (frac=1) and rebuild a clean 0..n-1 index.
df = df.sample(frac=1).reset_index(drop=True)
# %%
df
# %%
# The first 1000 shuffled rows are the training split, the last 500 the test split.
df_train = df.iloc[:1000]
df_test = df.iloc[-500:]
# %%
df.columns
# %%
from sklearn import tree
from sklearn import metrics
# %%
# Feature columns passed to the classifier; "quality" is the label.
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
# Fit a single decision tree capped at depth 7 on the training split.
clf = tree.DecisionTreeClassifier(max_depth=7)
clf.fit(df_train[cols], df_train.quality)
# %%
train_predictions = clf.predict(df_train[cols])
# %%
test_predictions = clf.predict(df_test[cols])
# %%
# Accuracy on the rows the tree was fitted on.
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)
# %%
# Accuracy on the held-out rows (name spelled consistently with the depth
# sweep later in the notebook).
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)
# %%
train_accuracy
# %%
test_accuracy
# %%
from sklearn import tree
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# %%
# Set the tick-label size to 20 on both axes.
for tick_group in ("xtick", "ytick"):
    matplotlib.rc(tick_group, labelsize=20)
# %%
%matplotlib inline
# %%
# Each list is seeded with 0.5 at index 0, so after the sweep below entry i
# holds the accuracy measured at max_depth=i.
train_accuracies = [0.5]
test_accuracies = [0.5]
# %%
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
# Refit a tree at every depth from 1 to 24 and record the accuracy on both
# the training and the test split.
for depth in range(1, 25):
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf.fit(df_train[cols], df_train.quality)

    train_predictions = clf.predict(df_train[cols])
    test_predictions = clf.predict(df_test[cols])

    train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)
    test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
# %%
# The style and the figure are set up in the same cell as the plot calls.
# With the inline backend a figure opened in an earlier cell is closed when
# that cell finishes, so a separate plt.figure(figsize=...) cell would not
# size this plot.
sns.set_style("whitegrid")
plt.figure(figsize=(10, 5))
# Without explicit x values, plt.plot uses the list index as x.
plt.plot(train_accuracies, label="train_accuracy")
plt.plot(test_accuracies, label="test_accuracy")
plt.legend(loc="upper left", prop={"size": 15})
plt.xticks(range(0, 26, 5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()
# %%
# enumerate + zip demo: walk two parallel lists in lockstep while also
# receiving a running counter.
names = ['Alice', 'Bob', 'Charlie']
ages = [1, 2, 3]
for i, (name, age) in enumerate(zip(names, ages)):
    print(i, name, age)
# %%
# Display the current DataFrame.
df
# %% [markdown]
# # kfold法
# %%
# Initialise the fold column with a -1 placeholder; the loop below overwrites
# it for every row that appears in a validation split.
df["kfold"] = -1
# %%
# Shuffle the rows and rebuild a 0..n-1 index, so the positional indices
# yielded by kf.split can be used directly as .loc labels.
df = df.sample(frac=1).reset_index(drop=True)
# %%
df
# %%
from sklearn import model_selection
# %%
kf = model_selection.KFold(n_splits=5)
# %%
type(df)
# %%
df
# %%
# Tag each row with the number of the fold in which it is a validation row.
for fold, (trn_, val_) in enumerate(kf.split(X=df)):
    df.loc[val_, "kfold"] = fold
# %%
df.to_csv("train_folds.csv", index=False)
# %% [markdown]
# # stratifiedKfold
# %%
# Reset the fold column to the -1 placeholder before assigning new folds.
df["kfold"] = -1
# %%
# Shuffle the rows and rebuild a 0..n-1 index, so the positional indices
# yielded by kf.split can be used directly as .loc labels.
df = df.sample(frac=1).reset_index(drop=True)
# %%
df
# %%
# Labels passed to StratifiedKFold as the stratification target.
y = df.quality.values
# %%
kf = model_selection.StratifiedKFold(n_splits=5)
# %%
type(df)
# %%
df
# %%
type(y)
# %%
y
# %%
# Tag each row with the number of the fold in which it is a validation row.
for i, (trn_, val_) in enumerate(kf.split(X=df, y=y)):
    df.loc[val_, "kfold"] = i
# %%
# index=False keeps the row index out of the file, matching the
# train_folds.csv export.
df.to_csv("train_StratifiedKFold.csv", index=False)
# %% [markdown]
#
# %% [markdown]
# # 回帰データセットに対するkfold
# %%
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
# %%
def create_folds(data, n_splits=5):
    """Assign stratified k-fold labels to a dataset with a continuous target.

    The ``target`` column is cut into equal-width bins and the bin index is
    used as the stratification label for ``StratifiedKFold``.

    Args:
        data: DataFrame containing a numeric ``target`` column. The frame
            passed in is not modified.
        n_splits: Number of folds. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and two added
        columns: ``kfold`` (the fold in which the row is a validation row)
        and ``bins`` (the bin index used for stratification).
    """
    # Shuffle first: sample() returns a new frame, so the columns added below
    # never leak into the caller's DataFrame.
    data = data.sample(frac=1).reset_index(drop=True)
    data["kfold"] = -1

    # Number of bins: floor(1 + log2(n)), i.e. Sturges' rule.
    num_bins = int(np.floor(1 + np.log2(len(data))))
    # labels=False makes pd.cut return integer bin indices.
    data["bins"] = pd.cut(data["target"], num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)
    for f, (trn, val) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[val, "kfold"] = f

    # The helper "bins" column is kept in the returned frame.
    return data
# %%
# Synthetic regression problem: 15000 samples, 100 features, one target.
X, y = datasets.make_regression(n_samples=15000, n_features=100, n_targets=1)
# %%
type(X)
# %%
X.shape[0]
# %%
# One column per feature, named f_0, f_1, ...
df = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
# %%
df
# %%
df["target"] = y
# %%
df
# %%
df = create_folds(df)
# %%
# index=False keeps the row index out of the file, matching the
# train_folds.csv export.
df.to_csv("train_bin_StratifiedKFold.csv", index=False)
# %%
# %%
# Register as a new user and use Qiita more conveniently
# - You get articles that match your needs
# - You can efficiently read back useful information
# - You can use dark theme