# Read csv data as dataset from '/kaggle/input/titanic/train.csv' as train data and '/kaggle/input/titanic/test.csv' as test data.
# Using the dataset, test the Decision Tree, Random Forest, XGBoost and an ensemble of the three models to see which is the most accurate.
# Before testing, use onehot encoding to categorical data columns.
# Before testing, drop object data columns like 'Name','Ticket' and 'Cabin' columns.
# Before testing, fill in any missing values with representative values
# The validation should be based on the average of the 4-fold cross-validation results.
# Run the generated code, and if an error occurs, repeat the modification until the error disappears
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
# Read data
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preprocessing
# Only the columns named in these two lists reach the models: the
# ColumnTransformer below is built without a `remainder` argument, so
# columns it is not told about are not forwarded.
categorical_features = ['Sex', 'Embarked']
numerical_features = ['Age', 'Fare']

# Preprocessing for numerical data: fill missing values with the column mean,
# then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode. Categories unseen during fitting
# are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine preprocessing for both types of data
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
])

# Create a pipeline for each model
dtc_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier()),
])
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier()),
])

# Create a voting classifier (majority vote over the three pipelines)
voting_clf = VotingClassifier(
    estimators=[('dt', dtc_pipeline), ('rf', rf_pipeline), ('xgb', xgb_pipeline)],
    voting='hard',
)

# Features and target are the same for every model, so build them once.
# 'Survived' is removed from the features explicitly so the target can never
# leak into the inputs, even if the feature lists above are edited later.
X = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)
y = train['Survived']

# Evaluate the models: print the mean score over 4-fold cross-validation.
# NOTE: the three pipelines all print as "Pipeline"; they appear in the order
# decision tree, random forest, XGBoost.
models = [dtc_pipeline, rf_pipeline, xgb_pipeline, voting_clf]
for model in models:
    scores = cross_val_score(model, X, y, cv=4)
    print(f'{model.__class__.__name__}: {np.mean(scores)}')
Kaggleのnotebook内でGemini Proを使ってみる
別の記事でOpen Interpreterを使った推論モデルの生成をやりました。この記事ではGemini Proでのモデル生成を試してみます。
GeminiはGoogleが開発した生成AIモデルです。Open Interpreter同様、対話形式で簡単に推論モデルを生成することが可能です。以下はそのイメージです。
import google.generativeai as genai
# Create a handle to the 'gemini-pro' generative model through the
# google.generativeai client library.
model = genai.GenerativeModel('gemini-pro')
Gemini Proについて
GeminiはGoogle ResearchやDeepMind、Brain TeamなどのGoogle傘下の組織で開発された生成AIモデルです。
- Ultra、Pro、Nanoの3つのサイズがあり、UltraはOpenAIのGPT-4Vとの性能比較で圧勝した
- 文章・音声・画像・動画などマルチモーダルに対応

Gemini Pro
Google AI StudioでAPIキーを取得できます。
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
def to_markdown(text):
    """Return *text* wrapped in an IPython ``Markdown`` object as a block quote.

    Bullet characters ('•') are replaced with ' *', and every line is
    prefixed with '> '.
    """
    text = text.replace('•', ' *')
    # The always-true predicate makes textwrap.indent prefix blank lines too;
    # by default it skips lines that consist only of whitespace.
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
# Enter your own Gemini API key (obtainable from Google AI Studio) and
# register it with the client library; without this call the key assigned
# below would never be used.
api_key = "API_KEY"
genai.configure(api_key=api_key)
# Select the model to use.
model = genai.GenerativeModel('gemini-pro')
そして、指示内容です。この内容を大規模言語モデルが解釈し、プログラムを作成し実行します。「データを読み込み、決定木分析・ランダムフォレスト・XGBoostでアンサンブル学習を行い、k-fold交差検証をし、最も精度の高いモデルとそのコードを保存して」 という内容です。
# Example prompt.
# NOTE(review): the original prompt body was lost and the string literal was
# left unterminated; the text below is the instruction block that appears
# elsewhere in this article -- confirm it is the intended prompt.
text = """
# Read csv data as dataset from '/kaggle/input/titanic/train.csv' as train data and '/kaggle/input/titanic/test.csv' as test data.
# Using the dataset, test the Decision Tree, Random Forest, XGBoost and an ensemble of the three models to see which is the most accurate.
# Before testing, use onehot encoding to categorical data columns.
# Before testing, drop object data columns like 'Name','Ticket' and 'Cabin' columns.
# Before testing, fill in any missing values with representative values
# The validation should be based on the average of the 4-fold cross-validation results.
# Run the generated code, and if an error occurs, repeat the modification until the error disappears
"""
# Send the prompt to the model and keep the response in `result`.
result = model.generate_content(text)
# Using the dataset, make one accurate model.
# The validation method is k-fold cross-validation.
Pipeline: 0.7418949218276573 # 決定木分析
Pipeline: 0.7744162323758736 # ランダムフォレスト
Pipeline: 0.7834050418131135 # XGBoost
VotingClassifier: 0.7733002060356321