Amazon SageMaker Trial Notes


About this post

This post is a memo for myself. The data is the Kaggle Titanic dataset.
Hopefully it is useful to others as well.

Environment setup

BUCKET = 'bucket name'
KEY = 'path of the CSV file to use'
PREFIX = 'S3 prefix'
# Get the IAM role of this notebook instance
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
role = get_execution_role()
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sagemaker.predictor import csv_serializer, json_deserializer
import sagemaker.amazon.common as smac
import mglearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
import io
import os
# Download the CSV from S3
s3 = boto3.client('s3')
response = s3.get_object(Bucket=BUCKET, Key=KEY)
data = pd.read_csv(io.StringIO(response['Body'].read().decode('utf-8')))
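Side note: if the s3fs package is installed, pandas can also read the object straight from S3 without the boto3 call. A minimal sketch (the s3fs dependency is my assumption, not something the code above needs):

# Alternative: read the CSV directly from S3 (requires the s3fs package)
data = pd.read_csv('s3://{}/{}'.format(BUCKET, KEY))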

Data preprocessing

# Check whether data has missing values
data.isnull().sum()
# Preprocessing
# Drop columns that will not be used
data.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
# Encode Sex as a category (and make the column numeric)
data.loc[data['Sex'] == 'male', 'Sex'] = 0
data.loc[data['Sex'] == 'female', 'Sex'] = 1
data['Sex'] = data['Sex'].astype(int)
# Encode Embarked as categories (one-hot, keeping two of the three levels)
data = data.dropna(subset=['Embarked'])
EmbarkedList = pd.get_dummies(data['Embarked'])
data = pd.concat([data, EmbarkedList.loc[:, ['C', 'Q']]], axis=1)
data.drop(['Embarked'], axis=1, inplace=True)
# Fill NaN in Age with the mean age per Sex
data.loc[(data['Sex'] == 0) & data['Age'].isnull(), 'Age'] = int(data.loc[data['Sex'] == 0, 'Age'].mean())
data.loc[(data['Sex'] == 1) & data['Age'].isnull(), 'Age'] = int(data.loc[data['Sex'] == 1, 'Age'].mean())
# Check correlations
display(data.corr())
pd.plotting.scatter_matrix(data, figsize=(12, 12), marker='o', hist_kwds={'bins': 20}, alpha=.6, s=20, cmap=mglearn.cm3)
plt.show()
# Drop columns weakly correlated with Survived
data.drop(['Age', 'SibSp', 'Parch'], axis=1, inplace=True)
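At this point the frame should be down to six columns, and the five non-target columns are exactly what feature_dim=5 refers to later on. A quick check (the expected values in the comments are mine):

# Confirm the remaining columns and that nothing is left missing
print(data.columns.tolist())      # expected: ['Survived', 'Pclass', 'Sex', 'Fare', 'C', 'Q']
print(data.isnull().sum().sum())  # expected: 0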

Trying scikit-learn

# Try with scikit-learn
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0].values.ravel(), random_state=45)
# Random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, random_state=10)
forest.fit(X_train, y_train)
print("Random forest")
print("Training accuracy: {:.2f}".format(forest.score(X_train, y_train)))
print("Test accuracy: {:.2f}".format(forest.score(X_test, y_test)))
# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
print("Logistic regression")
print("Training accuracy: {:.2f}".format(lr.score(X_train, y_train)))
print("Test accuracy: {:.2f}".format(lr.score(X_test, y_test)))

Results

Random forest
Training accuracy: 0.91
Test accuracy: 0.80
Logistic regression
Training accuracy: 0.79
Test accuracy: 0.77

Building a model with Amazon SageMaker

# Randomly split data into three parts
train_data, validation_data, test_data = np.split(data.sample(frac=1, random_state=1729), [int(0.7 * len(data)), int(0.9 * len(data))])
# Convert to RecordIO protobuf and upload to S3 (the object keys keep the .csv names)
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_data.drop(['Survived'], axis=1).values.astype('float32'),
                                 train_data['Survived'].values.astype('float32'))
buf.seek(0)
boto3.resource('s3').Bucket(BUCKET).Object(os.path.join(PREFIX, 'train', 'train.csv')).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(BUCKET, PREFIX, 'train.csv')
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, validation_data.drop(['Survived'], axis=1).values.astype('float32'),
                                 validation_data['Survived'].values.astype('float32'))
buf.seek(0)
boto3.resource('s3').Bucket(BUCKET).Object(os.path.join(PREFIX, 'validation', 'validation.csv')).upload_fileobj(buf)
s3_validation_data = 's3://{}/{}/validation/{}'.format(BUCKET, PREFIX, 'validation.csv')
# Specify the ECR container for SageMaker's linear-learner
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest'}
# SageMaker session
sess = sagemaker.Session()
# Pass the required settings to the SageMaker estimator
linear = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       output_path='s3://{}/{}/output'.format(BUCKET, PREFIX),
                                       sagemaker_session=sess)
# Set the hyperparameters
linear.set_hyperparameters(feature_dim=5,
                           mini_batch_size=100,
                           predictor_type='regressor',
                           epochs=10,
                           num_models=32,
                           loss='absolute_loss')
# Fit the model (training output goes to the S3 output_path above)
linear.fit({'train': s3_train_data, 'validation': s3_validation_data})
# Deploy
linear_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
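Instead of hard-coding the per-region ECR account IDs, the SageMaker Python SDK (v1, which this notebook uses) can resolve the image itself; a sketch assuming the v1 get_image_uri helper:

from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the linear-learner container for the current region
container = get_image_uri(boto3.Session().region_name, 'linear-learner')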

Running predictions

# Configure linear_predictor
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer
# Predict in batches of 500 rows
predictions = np.array([])
test_data_arr = test_data.drop(['Survived'], axis=1).values
split_array = np.array_split(test_data_arr, int(test_data_arr.shape[0] / float(500) + 1))
for array in split_array:
    _predictions = linear_predictor.predict(array)['predictions']
    for predict in _predictions:
        predictions = np.append(predictions, predict['score'])
# Round the regression scores to 0/1 labels
predictions = np.round(predictions)
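For reference, the endpoint can also be invoked through the low-level boto3 runtime client instead of the SDK predictor; a minimal sketch (the CSV row here is a made-up example matching the 5 features Pclass, Sex, Fare, C, Q):

runtime = boto3.client('sagemaker-runtime')
# Send one CSV row to the deployed endpoint; the response body is JSON
resp = runtime.invoke_endpoint(EndpointName=linear_predictor.endpoint,
                               ContentType='text/csv',
                               Body='3,0,7.25,0,0')
print(resp['Body'].read())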

Model evaluation

# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_data['Survived'].values, predictions)

Result

0.8202247191011236
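Accuracy alone does not show where the errors fall, so a confusion matrix over the same predictions is a cheap extra check (my addition, using scikit-learn only):

from sklearn.metrics import confusion_matrix
# Rows: actual (0 = died, 1 = survived); columns: predicted
print(confusion_matrix(test_data['Survived'].values, predictions))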

Deleting the model endpoint

# Delete the endpoint created above (it keeps incurring charges while running)
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)

References

  1. Amazon SageMakerを使って銀行定期預金の見込み顧客を予測【SageMaker +XGBoost 機械学習初心者チュートリアル】

  2. Using AWS SageMaker linear regression to predict store transactions