
Kaggle Masterに学ぶ機械学習実践アプローチ — Transcription Practice 03

# %%
import numpy as np
import matplotlib.pyplot as plt

# %%
import pandas as pd

# %%
def accuracy(y_true, y_pred):
    """
    Calculate the accuracy of a model's predictions
    :param y_true: list of true labels
    :param y_pred: list of predicted labels
    :return: fraction of predictions that match the true labels
    """
    # count predictions that match the true label
    correct_counter = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == yp:
            correct_counter += 1
    return correct_counter / len(y_true)

# %%
l1 = [0, 1, 1, 1, 0, 0, 0, 1]
l2 = [0, 1, 0, 1, 0, 1, 0, 0]



# %%
print(accuracy(l1, l2))
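As a quick cross-check that is not part of the transcribed code, scikit-learn's accuracy_score should agree with the hand-rolled function:

# %%
# Sanity check (my addition): compare with scikit-learn's accuracy_score.
# Both should print 0.625 for these lists (5 of 8 labels match).
from sklearn import metrics
print(metrics.accuracy_score(l1, l2))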

# %%
def true_positive(y_true, y_pred):
    tp = 0
    for yt, yp in zip(y_true, y_pred):
        if (yt == 1 and yp == 1):
            tp += 1
    return tp

def true_negative(y_true, y_pred):
    tn = 0
    for yt, yp in zip(y_true, y_pred):
        if (yt == 0 and yp == 0):
            tn += 1
    return tn

def false_positive(y_true, y_pred):
    fp = 0
    for yt, yp in zip(y_true, y_pred):
        if (yt == 0 and yp == 1):
            fp += 1
    return fp

def false_negative(y_true, y_pred):
    fn = 0
    for yt, yp in zip(y_true, y_pred):
        if (yt == 1 and yp == 0):
            fn += 1
    return fn


# %%
print(true_positive(l1, l2))
print(true_negative(l1, l2))
print(false_positive(l1, l2))
print(false_negative(l1, l2))

# %%
def accuracy_v2(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    tn = true_negative(y_true, y_pred)
    fp = false_positive(y_true, y_pred)
    fn = false_negative(y_true, y_pred)
    return (tp + tn) / (tp + tn + fp + fn)

print(accuracy_v2(l1, l2))

# %%
def precision(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    fp = false_positive(y_true, y_pred)
    if (tp + fp) == 0:
        return 0
    return tp / (tp + fp)

print(precision(l1, l2))

# %%
def recall(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    fn = false_negative(y_true, y_pred)
    # guard against division by zero, mirroring precision() above
    if (tp + fn) == 0:
        return 0
    return tp / (tp + fn)

print(recall(l1, l2))
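For reference (my addition, not from the book), the scikit-learn equivalents should return the same precision (2/3) and recall (0.5):

# %%
# Cross-check the hand-rolled precision and recall against scikit-learn.
from sklearn import metrics
print(metrics.precision_score(l1, l2))
print(metrics.recall_score(l1, l2))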

# %%
y_true = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
y_pred = [0.02638412, 0.11114267, 0.31620708,
          0.0490937, 0.0191491, 0.17554844,
          0.15952202, 0.03819563, 0.11639273,
          0.079377, 0.08584789, 0.39095342,
          0.27259048, 0.03447096, 0.04644807,
          0.03543574, 0.18521942, 0.05934905,
          0.61977213, 0.33056815]

# %%
thresholds = [0.0490937, 0.05934905, 0.079377,
              0.08584789, 0.11114267, 0.11639273,
              0.15952202, 0.17554844, 0.18521942,
              0.27259048, 0.31620708, 0.33056815,
              0.39095342, 0.61977213]

# %%
precisions = []
recalls = []
for thresh in thresholds:
    # binarize the predicted probabilities at the current threshold
    tmp_prediction = [1 if x >= thresh else 0 for x in y_pred]
    p = precision(y_true, tmp_prediction)
    r = recall(y_true, tmp_prediction)
    precisions.append(p)
    recalls.append(r)


# %%
plt.figure(figsize=(7,7))
plt.plot(recalls, precisions)
plt.xlabel("Recall", fontsize=14)
plt.ylabel("Precision", fontsize=14)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.title("Precision-Recall Curve", fontsize=14)
plt.show()
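As an aside (my addition), scikit-learn can build the same curve without a hand-picked threshold list; precision_recall_curve enumerates every distinct score itself:

# %%
# Sketch using scikit-learn's built-in curve; it chooses the thresholds
# from the distinct predicted scores automatically.
from sklearn import metrics
p_sk, r_sk, t_sk = metrics.precision_recall_curve(y_true, y_pred)
plt.figure(figsize=(7, 7))
plt.plot(r_sk, p_sk)
plt.xlabel("Recall", fontsize=14)
plt.ylabel("Precision", fontsize=14)
plt.title("Precision-Recall Curve (scikit-learn)", fontsize=14)
plt.show()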

# %%
def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    if (p + r) == 0:
        return 0
    return 2 * p * r / (p + r)

# %%
y_true = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
y_pred = [0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0, 1, 0]

# %%
print(f1_score(y_true, y_pred))

# %%
from sklearn import metrics 

# %%
metrics.f1_score(y_true, y_pred)

# %%
def tpr(y_true, y_pred):
    return recall(y_true, y_pred)

# %%
def fpr(y_true, y_pred):
    fp = false_positive(y_true, y_pred)
    tn = true_negative(y_true, y_pred)
    return fp / (fp + tn)

# %%
y_true = [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]

# %%
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05, 0.9, 0.5, 0.3, 0.66, 0.3,
          0.2, 0.85, 0.15, 0.99]
# y_pred = y_true

# %%
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5,
              0.6, 0.7, 0.8, 0.85, 0.9, 0.99, 1.0]

# %%
tpr_list = []
fpr_list = []
for thresh in thresholds:
    tmp_prediction = [1 if x >= thresh else 0 for x in y_pred]
    tmp_tpr = tpr(y_true, tmp_prediction)
    tmp_fpr = fpr(y_true, tmp_prediction)
    tpr_list.append(tmp_tpr)
    fpr_list.append(tmp_fpr)
plt.figure(figsize=(7,7))
plt.plot(fpr_list, tpr_list)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()  

# %%
df = pd.DataFrame({"thresholds": thresholds, "tpr": tpr_list, "fpr": fpr_list})
print(df)

# %%
metrics.roc_auc_score(y_true, y_pred)
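As a cross-check (my addition): metrics.roc_curve plus metrics.auc integrates the ROC curve with the trapezoidal rule and should reproduce roc_auc_score; integrating the hand-built lists from the cells above only approximates it, because the hand-picked threshold grid skips a few of the distinct scores.

# %%
# Exact: let scikit-learn pick the thresholds, then integrate the curve.
fpr_sk, tpr_sk, _ = metrics.roc_curve(y_true, y_pred)
print(metrics.auc(fpr_sk, tpr_sk))
# Approximate: integrate the hand-built curve from the cells above.
print(metrics.auc(fpr_list, tpr_list))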

# %%
# lists to store the counts of true positives and false positives
tp_list = []
fp_list = []

# true labels
y_true = [0, 0, 0, 0, 1, 0, 1,
          0, 0, 1, 0, 1, 0, 0, 1]
# predicted probabilities
y_pred = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05,
          0.9, 0.5, 0.3, 0.66, 0.3, 0.2,
          0.85, 0.15, 0.99]
# list of thresholds
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5,
              0.6, 0.7, 0.8, 0.85, 0.9, 0.99, 1.0]

# for each threshold, compute tp and fp and store them
for thresh in thresholds:
    tmp_prediction = [1 if x >= thresh else 0 for x in y_pred]
    # count true positives
    tmp_tp = true_positive(y_true, tmp_prediction)
    # count false positives
    tmp_fp = false_positive(y_true, tmp_prediction)
    # append both counts to the lists
    tp_list.append(tmp_tp)
    fp_list.append(tmp_fp)


# %%
df = pd.DataFrame({"thresholds": thresholds, "tp": tp_list, "fp": fp_list})
print(df)

# %%
def log_loss(y_true, y_proba):
    # clip probabilities away from exactly 0 and 1 so the logs stay finite
    epsilon = 1e-15
    loss = []
    for yt, yp in zip(y_true, y_proba):
        yp = np.clip(yp, epsilon, 1 - epsilon)
        tmp_loss = -1 * (yt * np.log(yp) + (1 - yt) * np.log(1 - yp))
        loss.append(tmp_loss)
    return np.mean(loss)
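A quick intuition check (my addition): a confident correct prediction costs almost nothing, while a confident wrong one is punished heavily:

# %%
# Log loss on two single-sample extremes.
print(log_loss([1], [0.9]))  # ~0.105
print(log_loss([1], [0.1]))  # ~2.303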

# %%
# true labels
y_true = [0, 0, 0, 0, 1, 0, 1,
          0, 0, 1, 0, 1, 0, 0, 1]
# predicted probabilities
y_proba = [0.1, 0.3, 0.2, 0.6, 0.8, 0.05,
           0.9, 0.5, 0.3, 0.66, 0.3, 0.2,
           0.85, 0.15, 0.99]
print(log_loss(y_true, y_proba))

# %%
metrics.log_loss(y_true, y_proba)

# %%
def macro_precision(y_true, y_pred):
    # number of distinct classes
    num_classes = len(np.unique(y_true))
    precision = 0
    for class_ in range(num_classes):
        # binarize: treat the current class as 1, everything else as 0
        temp_true = [1 if p == class_ else 0 for p in y_true]
        temp_pred = [1 if p == class_ else 0 for p in y_pred]
        tp = true_positive(temp_true, temp_pred)
        fp = false_positive(temp_true, temp_pred)
        temp_precision = tp / (tp + fp)
        precision += temp_precision
    # average of the per-class precisions
    return precision / num_classes

# %%
def micro_precision(y_true, y_pred):
    num_classes = len(np.unique(y_true))
    tp = 0
    fp = 0
    for class_ in range(num_classes):
        temp_true = [1 if p == class_ else 0 for p in y_true]
        temp_pred = [1 if p == class_ else 0 for p in y_pred]
        tp += true_positive(temp_true, temp_pred)
        fp += false_positive(temp_true, temp_pred)
    # pool tp and fp across all classes, then take a single precision
    return tp / (tp + fp)

# %%
from collections import Counter

# %%
def weighted_precision(y_true, y_pred):
    num_classes = len(np.unique(y_true))
    class_counts = Counter(y_true)
    precision = 0
    for class_ in range(num_classes):
        temp_true = [1 if p == class_ else 0 for p in y_true]
        temp_pred = [1 if p == class_ else 0 for p in y_pred]
        tp = true_positive(temp_true, temp_pred)
        fp = false_positive(temp_true, temp_pred)
        temp_precision = tp / (tp + fp)
        # weight each class's precision by how often the class occurs
        precision += temp_precision * class_counts[class_]
    return precision / len(y_true)

# %%
y_true = [0, 1, 2, 0, 1, 2, 0, 2, 2]
y_pred = [0, 2, 1, 0, 2, 1, 0, 0, 2]

# %%
macro_precision(y_true, y_pred)


# %%
metrics.precision_score(y_true, y_pred, average="macro")


# %%
micro_precision(y_true, y_pred)


# %%
metrics.precision_score(y_true, y_pred, average="micro")

# %%
weighted_precision(y_true, y_pred)


# %%
metrics.precision_score(y_true, y_pred, average="weighted")

# %%
def weighted_f1(y_true, y_pred):
    num_classes = len(np.unique(y_true))
    class_counts = Counter(y_true)
    f1 = 0
    for class_ in range(num_classes):
        temp_true = [1 if p == class_ else 0 for p in y_true]
        temp_pred = [1 if p == class_ else 0 for p in y_pred]
        p = precision(temp_true, temp_pred)
        r = recall(temp_true, temp_pred)
        if p + r != 0:
            temp_f1 = 2 * p * r / (p + r)
        else:
            temp_f1 = 0
        # weight each class's f1 by how often the class occurs
        f1 += temp_f1 * class_counts[class_]
    overall_f1 = f1 / len(y_true)
    return overall_f1


# %%
y_true = [0, 1, 2, 0, 1, 2, 0, 2, 2]
y_pred = [0, 2, 1, 0, 2, 1, 0, 0, 2]
print(weighted_f1(y_true, y_pred))
print(metrics.f1_score(y_true, y_pred, average="weighted"))


# %%
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn import metrics 

# %%
y_true = [0, 1, 2, 0, 1, 2, 0, 2, 2]
y_pred = [0, 2, 1, 0, 2, 1, 0, 0, 2]

# %%
cm = metrics.confusion_matrix(y_true, y_pred)

# %%
plt.figure(figsize=(10,10))

# %%
cmap = sns.cubehelix_palette(30, hue=0.05, rot=0,
                             light=0.9, dark=0, as_cmap=True)
sns.set(font_scale=2.5)
sns.heatmap(cm, annot=True, cmap=cmap, cbar=False)
# sklearn's confusion_matrix puts true labels on the rows (y-axis)
# and predicted labels on the columns (x-axis)
plt.xlabel('Predicted Labels')
plt.ylabel('Actual Labels')
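Recent scikit-learn versions (0.22 and later) also ship a ready-made confusion-matrix plot; a minimal sketch (my addition) that renders the same matrix without seaborn:

# %%
# Alternative plot via scikit-learn's ConfusionMatrixDisplay.
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()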

# %%
def pk(y_true, y_pred, k):
    """
    This function calculates precision at k
    for a single sample
    :param y_true: list of values, actual classes
    :param y_pred: list of values, predicted classes
    :param k: the value for k
    :return: precision at a given value k
    """
    # if k is 0, return 0. we should never have this
    # as k is always >= 1
    if k == 0:
        return 0
    # we are interested only in top-k predictions
    y_pred = y_pred[:k]
    # convert predictions to set
    pred_set = set(y_pred)
    # convert actual values to set
    true_set = set(y_true)
    # find common values
    common_values = pred_set.intersection(true_set)
    # return length of common values over k
    return len(common_values) / len(y_pred[:k])
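A worked example (my addition) to make the definition concrete:

# %%
# Of the top-2 predictions [0, 1], only 1 appears in the true set
# {1, 2, 3}, so precision at k=2 is 1/2.
print(pk([1, 2, 3], [0, 1, 2], k=2))  # 0.5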

# %%
def apk(y_true, y_pred, k):
    """
    This function calculates average precision at k
    for a single sample
    :param y_true: list of values, actual classes
    :param y_pred: list of values, predicted classes
    :param k: the value for k
    :return: average precision at a given value k
    """
    pk_values = []
    for i in range(1, k+1):
        pk_values.append(pk(y_true, y_pred, i))
    if len(pk_values) == 0:
        return 0
    return sum(pk_values) / len(pk_values)

# %%
y_true = [
    [1, 2, 3],
    [0, 2],
    [1],
    [2, 3],
    [1, 0],
    []
]

y_pred = [
    [0, 1, 2],
    [1],
    [0, 2, 3],
    [2, 3, 4, 0],
    [0, 1, 2],
    [0]
]

# %%
for i in range(len(y_true)):
    for j in range(1, 4):
        print(
            f"""
            y_true = {y_true[i]}
            y_pred = {y_pred[i]}
            AP@{j} = {apk(y_true[i], y_pred[i], k=j)}
            """
        )

# %%
def mapk(y_true, y_pred, k):
    apk_values = []
    for i in range(len(y_true)):
        apk_values.append(apk(y_true[i], y_pred[i], k=k))
    return sum(apk_values) / len(apk_values)

# %%
y_true

# %%
y_pred

# %%
mapk(y_true, y_pred, k=1)

# %%
mapk(y_true, y_pred, k=2)

# %%
mapk(y_true, y_pred, k=3)

# %%
mapk(y_true, y_pred, k=4)

# %%
def mean_absolute_error(y_true, y_pred):
    # convert to arrays so the arithmetic also works on plain lists
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs(y_true - y_pred))

# %%
def mean_squared_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.square(y_true - y_pred))


# %%
def mean_squared_log_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.square(np.log(y_true + 1) - np.log(y_pred + 1)))


# %%
def mean_percentage_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean((y_true - y_pred) / y_true) * 100

# %%
def mean_abs_percentage_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs(y_true - y_pred) / y_true) * 100

# %%
def r2(y_true, y_pred):
    mean_true_value = np.mean(y_true)
    numerator = 0
    denominator = 0
    for yt, yp in zip(y_true, y_pred):
        numerator += (yt - yp) ** 2
        denominator += (yt - mean_true_value) ** 2
    return 1 - (numerator / denominator)
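To close the loop (my addition), the hand-rolled regression metrics can be checked against their scikit-learn counterparts on a small made-up example:

# %%
# Cross-check on made-up numbers; each pair of printed values should match.
yt = [3.0, 5.0, 2.5, 7.0]
yp = [2.5, 5.0, 4.0, 8.0]
print(mean_absolute_error(yt, yp), metrics.mean_absolute_error(yt, yp))
print(mean_squared_error(yt, yp), metrics.mean_squared_error(yt, yp))
print(mean_squared_log_error(yt, yp), metrics.mean_squared_log_error(yt, yp))
print(r2(yt, yp), metrics.r2_score(yt, yp))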
