In [1]:
import boto3
import re
import pandas as pd
import numpy as np
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

In [2]:
# Replace these with your own S3 bucket and key prefix.
prefix = "windturbine/xgboost"
bucket = "sagemaker-cn-northwest-1-876820548815"

# Obtain the IAM role via sagemaker's get_execution_role(); it is passed to the Estimator later.
role = get_execution_role()

In [3]:
# Download the training dataset (CSV) into the notebook's current working directory.
!wget https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv

--2021-03-08 08:58:02--  https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv
Resolving samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)... 52.216.244.116
Connecting to samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)|52.216.244.116|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30337871 (29M) [text/csv]
Saving to: ‘wind_turbine_training_data.csv’


2021-03-08 08:58:10 (4.96 MB/s) - ‘wind_turbine_training_data.csv’ saved [30337871/30337871]



In [4]:
# Browse the dataset: cap rendered DataFrame output at 6 rows, then load the
# downloaded CSV and display it as the cell result.
pd.set_option("display.max_rows", 6)
dataset = pd.read_csv("wind_turbine_training_data.csv")
dataset

Unnamed: 0,turbine_id,wind_speed,RPM_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction,breakdown
0,3,80,61,39,34,33,26,1,77,3,0
1,10,85,78,36,28,35,43,15,62,2,1
2,7,47,31,31,23,46,62,15,32,1,0
...,...,...,...,...,...,...,...,...,...,...,...
999997,4,42,75,25,31,42,35,5,67,2,0
999998,3,48,75,47,10,85,63,7,72,2,1
999999,10,45,60,37,8,39,35,12,64,4,1


In [5]:
# Clean the data into the layout expected for XGBoost CSV training input:
#   * drop the `turbine_id` column;
#   * move the `breakdown` label from the last column to the first column.
# (The header row is omitted later, when the CSV files are written with header=False.)
#
# errors='ignore' keeps this cell re-runnable: on a second execution `turbine_id`
# is already gone and a plain drop() would raise KeyError. Moving `breakdown`
# to the front is naturally idempotent.
dataset = dataset.drop(columns='turbine_id', errors='ignore')
dataset = pd.concat([dataset['breakdown'], dataset.drop(columns='breakdown')], axis=1)
dataset

Unnamed: 0,breakdown,wind_speed,RPM_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction
0,0,80,61,39,34,33,26,1,77,3
1,1,85,78,36,28,35,43,15,62,2
2,0,47,31,31,23,46,62,15,32,1
...,...,...,...,...,...,...,...,...,...,...
999997,0,42,75,25,31,42,35,5,67,2
999998,1,48,75,47,10,85,63,7,72,2
999999,1,45,60,37,8,39,35,12,64,4


In [6]:
# Shuffle all rows with a fixed seed, then split them 70% / 20% / 10% into
# train / validation / test sets.
# Positional .iloc slices are used instead of np.split(DataFrame, ...): np.split
# goes through DataFrame.swapaxes, which is deprecated since pandas 2.1. The
# resulting three frames are the same positional slices np.split produced.
shuffled = dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(dataset))
validation_end = int(0.9 * len(dataset))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]  # kept in memory only; not written to disk in this cell

# Save the train and validation sets locally, without header row or index column.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [7]:
# Upload the local train/validation CSV files to S3.
# Object keys are built with explicit '/' separators: S3 keys are not filesystem
# paths, and os.path.join would insert the OS-specific separator (e.g. '\' on Windows).
# A single Bucket resource is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/data/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/data/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

# Tell the SageMaker training job where the data lives.
# Both URIs end with '/' so each channel matches only objects under its own
# folder (the train URI previously lacked the trailing slash).
s3_input_train = TrainingInput(s3_data='s3://{}/{}/data/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/data/validation/'.format(bucket, prefix), content_type='csv')

In [10]:
# Location (ECR image URI) of the container used as the model-training
# environment, keyed by AWS region name.
containers = {
    'cn-northwest-1': '387376663083.dkr.ecr.cn-northwest-1.amazonaws.com.cn/xgboost:latest',
    'cn-north-1': '390948362332.dkr.ecr.cn-north-1.amazonaws.com.cn/xgboost:latest',
}

# Create the SageMaker session.
sess = sagemaker.Session()

In [11]:
# Create the SageMaker estimator: the training image is looked up by the current
# boto3 session's region, and the training job is configured with one
# ml.m5.xlarge instance; model artifacts are written under the `model` prefix.
xgb = sagemaker.estimator.Estimator(
    image_uri=containers[boto3.Session().region_name],
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/{}/model'.format(bucket, prefix),
    sagemaker_session=sess,
)

In [12]:
# Set the hyperparameters, then start the training job with the train and
# validation channels.
xgb.set_hyperparameters(
    eta=0.1,
    objective='binary:logistic',
    num_round=25,
)
xgb.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_validation,
    }
)

2021-03-08 09:57:20 Starting - Starting the training job...
2021-03-08 09:57:23 Starting - Launching requested ML instances......
2021-03-08 09:58:25 Starting - Preparing the instances for training......
2021-03-08 09:59:23 Downloading - Downloading input data...
2021-03-08 10:00:12 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2021-03-08:10:00:12:INFO] Running standalone xgboost training.[0m
[34m[2021-03-08:10:00:12:INFO] File size need to be processed in the node: 23.38mb. Available memory size in the node: 8114.96mb[0m
[34m[2021-03-08:10:00:12:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:00:12] S3DistributionType set as FullyReplicated[0m
[34m[10:00:12] 700000x9 matrix with 6300000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-03-08:10:00:12:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:00:12] S3DistributionType set as FullyReplicated[0m
[34m[

In [13]:
# Deploy the trained model and create an endpoint for inference; requests are
# serialized as CSV.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
)

-----------!

In [14]:
# Show the name of the endpoint that was deployed on SageMaker.
print(xgb_predictor.endpoint_name)

xgboost-2021-03-08-10-43-34-693
