{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "dimensional-township", "metadata": {}, "outputs": [], "source": [ "import boto3\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import sagemaker\n", "from sagemaker import get_execution_role\n", "from sagemaker.inputs import TrainingInput\n", "from sagemaker.serializers import CSVSerializer" ] }, { "cell_type": "code", "execution_count": 2, "id": "collect-albuquerque", "metadata": {}, "outputs": [], "source": [ "# Replace these with your own S3 bucket and prefix\n", "bucket = 'sagemaker-cn-northwest-1-876820548815'\n", "prefix = 'windturbine/xgboost'\n", "\n", "# Retrieve the IAM execution role attached to this notebook instance\n", "role = get_execution_role()" ] }, { "cell_type": "code", "execution_count": 3, "id": "decimal-judges", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-03-08 08:58:02-- https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv\n", "Resolving samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)... 52.216.244.116\n", "Connecting to samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)|52.216.244.116|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 30337871 (29M) [text/csv]\n", "Saving to: ‘wind_turbine_training_data.csv’\n", "\n", "wind_turbine_traini 100%[===================>] 28.93M 6.26MB/s in 5.8s \n", "\n", "2021-03-08 08:58:10 (4.96 MB/s) - ‘wind_turbine_training_data.csv’ saved [30337871/30337871]\n", "\n" ] } ], "source": [ "# Download the training dataset to the local filesystem\n", "!wget https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv" ] }, { "cell_type": "code", "execution_count": 4, "id": "together-weekend", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
turbine_idwind_speedRPM_bladeoil_temperatureoil_leveltemperaturehumidityvibrations_frequencypressurewind_directionbreakdown
0380613934332617730
110857836283543156221
27473131234662153210
....................................
999997442752531423556720
999998348754710856377221
9999991045603783935126441
\n", "

1000000 rows × 11 columns

\n", "
" ], "text/plain": [ " turbine_id wind_speed RPM_blade oil_temperature oil_level \\\n", "0 3 80 61 39 34 \n", "1 10 85 78 36 28 \n", "2 7 47 31 31 23 \n", "... ... ... ... ... ... \n", "999997 4 42 75 25 31 \n", "999998 3 48 75 47 10 \n", "999999 10 45 60 37 8 \n", "\n", " temperature humidity vibrations_frequency pressure wind_direction \\\n", "0 33 26 1 77 3 \n", "1 35 43 15 62 2 \n", "2 46 62 15 32 1 \n", "... ... ... ... ... ... \n", "999997 42 35 5 67 2 \n", "999998 85 63 7 72 2 \n", "999999 39 35 12 64 4 \n", "\n", " breakdown \n", "0 0 \n", "1 1 \n", "2 0 \n", "... ... \n", "999997 0 \n", "999998 1 \n", "999999 1 \n", "\n", "[1000000 rows x 11 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 浏览数据集\n", "dataset = pd.read_csv('wind_turbine_training_data.csv')\n", "pd.set_option('display.max_rows', 6)\n", "dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "thirty-temperature", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
breakdownwind_speedRPM_bladeoil_temperatureoil_leveltemperaturehumidityvibrations_frequencypressurewind_direction
008061393433261773
1185783628354315622
2047313123466215321
.................................
99999704275253142355672
99999814875471085637722
99999914560378393512644
\n", "

1000000 rows × 10 columns

\n", "
" ], "text/plain": [ " breakdown wind_speed RPM_blade oil_temperature oil_level \\\n", "0 0 80 61 39 34 \n", "1 1 85 78 36 28 \n", "2 0 47 31 31 23 \n", "... ... ... ... ... ... \n", "999997 0 42 75 25 31 \n", "999998 1 48 75 47 10 \n", "999999 1 45 60 37 8 \n", "\n", " temperature humidity vibrations_frequency pressure wind_direction \n", "0 33 26 1 77 3 \n", "1 35 43 15 62 2 \n", "2 46 62 15 32 1 \n", "... ... ... ... ... ... \n", "999997 42 35 5 67 2 \n", "999998 85 63 7 72 2 \n", "999999 39 35 12 64 4 \n", "\n", "[1000000 rows x 10 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 清洗数据 (删除turbine_id列,按照XGBoost的训练数据格式要求,删除表头并且将最后一列的推理结果数据挪到第一列)\n", "dataset = dataset.drop('turbine_id', axis=1)\n", "dataset = pd.concat([dataset['breakdown'], dataset.drop(['breakdown'], axis=1)], axis=1)\n", "dataset" ] }, { "cell_type": "code", "execution_count": 6, "id": "qualified-discount", "metadata": {}, "outputs": [], "source": [ "# 将数据拆分为训练数据集和验证数据集并保存到本地\n", "train_data, validation_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset)), int(0.9 * len(dataset))])\n", "train_data.to_csv('train.csv', header=False, index=False)\n", "validation_data.to_csv('validation.csv', header=False, index=False)" ] }, { "cell_type": "code", "execution_count": 7, "id": "caring-airfare", "metadata": {}, "outputs": [], "source": [ "# 上传数据到 S3\n", "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/train/train.csv')).upload_file('train.csv')\n", "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/validation/validation.csv')).upload_file('validation.csv')\n", "# 为 Sagemaker 训练任务指定数据位置 \n", "s3_input_train = TrainingInput(s3_data='s3://{}/{}/data/train'.format(bucket, prefix), content_type='csv')\n", "s3_input_validation = TrainingInput(s3_data='s3://{}/{}/data/validation/'.format(bucket, prefix), content_type='csv')" ] }, { "cell_type": 
"code", "execution_count": 10, "id": "growing-salem", "metadata": {}, "outputs": [], "source": [ "# ECR image URIs of the built-in XGBoost training container (AWS China regions)\n", "containers = {\n", " 'cn-northwest-1':'387376663083.dkr.ecr.cn-northwest-1.amazonaws.com.cn/xgboost:latest',\n", " 'cn-north-1':'390948362332.dkr.ecr.cn-north-1.amazonaws.com.cn/xgboost:latest'\n", " }\n", "\n", "# Create a SageMaker session\n", "sess = sagemaker.Session()" ] }, { "cell_type": "code", "execution_count": 11, "id": "demographic-istanbul", "metadata": {}, "outputs": [], "source": [ "# Create the SageMaker estimator, specifying instance type/count for the training job\n", "xgb = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],\n", " role, \n", " instance_count=1, \n", " instance_type='ml.m5.xlarge',\n", " output_path='s3://{}/{}/model'.format(bucket, prefix),\n", " sagemaker_session=sess)" ] }, { "cell_type": "code", "execution_count": 12, "id": "integrated-clerk", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2021-03-08 09:57:20 Starting - Starting the training job...\n", "2021-03-08 09:57:23 Starting - Launching requested ML instances......\n", "2021-03-08 09:58:25 Starting - Preparing the instances for training......\n", "2021-03-08 09:59:23 Downloading - Downloading input data...\n", "2021-03-08 10:00:12 Training - Training image download completed. Training in progress.\u001b[34mArguments: train\u001b[0m\n", "\u001b[34m[2021-03-08:10:00:12:INFO] Running standalone xgboost training.\u001b[0m\n", "\u001b[34m[2021-03-08:10:00:12:INFO] File size need to be processed in the node: 23.38mb. 
Available memory size in the node: 8114.96mb\u001b[0m\n", "\u001b[34m[2021-03-08:10:00:12:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", "\u001b[34m[10:00:12] S3DistributionType set as FullyReplicated\u001b[0m\n", "\u001b[34m[10:00:12] 700000x9 matrix with 6300000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", "\u001b[34m[2021-03-08:10:00:12:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", "\u001b[34m[10:00:12] S3DistributionType set as FullyReplicated\u001b[0m\n", "\u001b[34m[10:00:13] 200000x9 matrix with 1800000 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", "\u001b[34m[10:00:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[0]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[1]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[2]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[3]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[4]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[5]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 
extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[6]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[7]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[8]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[9]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[10]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[11]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[12]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[13]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[14]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[15]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:18] 
src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[16]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[17]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[18]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[19]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[20]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[21]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[22]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[23]#011train-error:0#011validation-error:0\u001b[0m\n", "\u001b[34m[10:00:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5\u001b[0m\n", "\u001b[34m[24]#011train-error:0#011validation-error:0\u001b[0m\n", "\n", "2021-03-08 10:00:29 Uploading - Uploading generated training model\n", "2021-03-08 10:00:29 Completed - Training job completed\n", "Training seconds: 66\n", "Billable seconds: 
66\n" ] } ], "source": [ "# Set hyperparameters and launch the training job\n", "xgb.set_hyperparameters(eta=0.1, objective='binary:logistic', num_round=25)\n", "xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})" ] }, { "cell_type": "code", "execution_count": 13, "id": "structural-ceiling", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-----------!" ] } ], "source": [ "# Deploy the trained model and create an endpoint for inference\n", "xgb_predictor = xgb.deploy(\n", "\tinitial_instance_count = 1,\n", "\tinstance_type = 'ml.m5.xlarge',\n", "\tserializer = CSVSerializer())" ] }, { "cell_type": "code", "execution_count": 14, "id": "advisory-posting", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "xgboost-2021-03-08-10-43-34-693\n" ] } ], "source": [ "# Show the name of the endpoint deployed on SageMaker\n", "print(xgb_predictor.endpoint_name)" ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 5 }