Kaggle Masterに学ぶ機械学習実践アプローチ写経 04

Python

Posted at 2024-08-10

# src/train.py
import argparse
import model_dispatcher
import config
import joblib
import pandas as pd
from sklearn import metrics
from sklearn import tree

def run(fold, model):
    """Train one registered model with ``fold`` held out and save it.

    Reads the fold-annotated CSV at ``config.TRAINING_FILE``, fits the
    estimator stored under ``model`` in ``model_dispatcher.models`` on every
    row whose ``kfold`` differs from ``fold``, prints the accuracy on the
    held-out rows and dumps the fitted estimator into ``config.MODEL_OUTPUT``.

    Args:
        fold: Value of the ``kfold`` column to use as the validation split.
        model: Key into ``model_dispatcher.models``.

    Raises:
        KeyError: If ``model`` is not a key of ``model_dispatcher.models``.
    """
    # Function-scope import: only the output path construction needs it.
    import os

    # Resolve the estimator first so a mistyped model name fails before the
    # training CSV is loaded.
    try:
        clf = model_dispatcher.models[model]
    except KeyError:
        available = ", ".join(sorted(model_dispatcher.models))
        raise KeyError(
            f"unknown model {model!r}; available models: {available}"
        ) from None

    # read the training data with folds
    df = pd.read_csv(config.TRAINING_FILE)

    # training rows: every fold except the requested one (index reset)
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: only the requested fold (index reset)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # "label" is the target and "kfold" is only the split assignment, so
    # neither belongs in the feature matrix. Leaving "kfold" in would train
    # the model on a column whose validation value never occurs in training.
    non_feature_cols = ["label", "kfold"]
    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.label.values
    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid.label.values

    # fit on the training split and predict the held-out split
    clf.fit(x_train, y_train)
    preds = clf.predict(x_valid)

    # calculate & print accuracy
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold={fold}, Accuracy={accuracy}")

    # Save the fitted model. os.path.join works whether or not
    # config.MODEL_OUTPUT ends with a path separator.
    # NOTE(review): the file name keeps the "dt_" prefix for every model, so
    # training different models on the same fold overwrites the same file.
    joblib.dump(clf, os.path.join(config.MODEL_OUTPUT, f"dt_{fold}.bin"))
if __name__ == "__main__":
    # Command-line entry point: train one (fold, model) combination.
    parser = argparse.ArgumentParser()
    # Both options are required. argparse would otherwise default a missing
    # option to None and hand that to run() instead of reporting a usage error.
    parser.add_argument(
        "--fold",
        type=int,
        required=True,
        help="value of the kfold column to hold out for validation",
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="key of the estimator in model_dispatcher.models",
    )
    args = parser.parse_args()
    run(fold=args.fold, model=args.model)

from sklearn import tree 
from sklearn import ensemble

# Registry of estimators keyed by name; train.py looks entries up as
# model_dispatcher.models[<name>].
models = dict(
    decision_tree_gini=tree.DecisionTreeClassifier(criterion="gini"),
    decision_tree_entropy=tree.DecisionTreeClassifier(criterion="entropy"),
    rf=ensemble.RandomForestClassifier(),
)

# Directory prefix for saved models (train.py reads it as config.MODEL_OUTPUT).
MODEL_OUTPUT = "../models/"
# CSV with the training rows and their fold assignment
# (train.py reads it as config.TRAINING_FILE).
TRAINING_FILE = "../input/mnist_train_folds.csv"

#! /bin/bash

# Run train.py with the "rf" model once per fold, for folds 0 through 4.
for ((fold = 0; fold <= 4; fold++)); do
    python train.py --fold "$fold" --model rf
done

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up

Kaggle Masterに学ぶ機械学習実践アプローチ 写経 04

Kaggle Masterに学ぶ機械学習実践アプローチ写経 04