{ "cells": [ { "cell_type": "markdown", "id": "d53c7616", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Datawhale AI 夏令营 第二期\n", "## NLP 赛道深度学习 Baseline 代码精读\n", "---\n", "\n", "
骆秀韬 epsilon_luoo@outlook.com
\n", "\n", "聪明办法学 Python 教学团队" ] }, { "cell_type": "markdown", "id": "6bd58403", "metadata": { "ExecuteTime": { "end_time": "2023-08-07T11:18:02.536644Z", "start_time": "2023-08-07T11:18:02.521635Z" }, "slideshow": { "slide_type": "slide" } }, "source": [ "# 导入所需工具" ] }, { "cell_type": "code", "execution_count": 2, "id": "ed933971-0c45-4329-8793-13499b5bac48", "metadata": { "ExecuteTime": { "end_time": "2023-08-08T11:38:51.620645Z", "start_time": "2023-08-08T11:38:51.611628Z" }, "init_cell": true, "papermill": { "duration": 3.820548, "end_time": "2023-03-24T04:35:44.025675", "exception": false, "start_time": "2023-03-24T04:35:40.205127", "status": "completed" }, "run_control": { "marked": false }, "scrolled": true, "slideshow": { "slide_type": "-" }, "tags": [] }, "outputs": [], "source": [ "# 忽略所有 Python 警告(全局生效,主要为屏蔽 Paddle 的冗余警告)\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "import numpy as np # 数值计算\n", "import pandas as pd # 数据分析\n", "from tqdm import tqdm # 进度条显示\n", "import paddle # PaddlePaddle 深度学习框架\n", "from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer # 飞桨自然语言处理工具包(模型、分词器)\n", "from paddle.io import DataLoader # 数据加载器\n", "from paddlenlp.datasets import MapDataset # 数据集转换\n", "from sklearn.model_selection import train_test_split # 训练集与验证集拆分\n", "import matplotlib.pyplot as plt # 绘图" ] }, { "cell_type": "markdown", "id": "87c20a3c", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 导入数据集" ] }, { "cell_type": "code", "execution_count": 3, "id": "cbb5b0bc-845c-47c3-bee0-ae22c28a53cf", "metadata": { "ExecuteTime": { "end_time": "2023-08-08T12:07:07.091033Z", "start_time": "2023-08-08T12:07:06.840598Z" }, "execution": { "iopub.execute_input": "2023-08-03T02:30:01.308871Z", "iopub.status.busy": "2023-08-03T02:30:01.308239Z", "iopub.status.idle": "2023-08-03T02:30:01.541078Z", "shell.execute_reply": "2023-08-03T02:30:01.540239Z", "shell.execute_reply.started": "2023-08-03T02:30:01.308837Z" }, 
"run_control": { "marked": false }, "scrolled": true, "slideshow": { "slide_type": "-" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", " | name | \n", "label | \n", "content | \n", "
---|---|---|---|
2850 | \n", "2851 | \n", "1 | \n", "[ 593 1296 148 242 3747 1107 4242 1759 266 ... | \n", "
13636 | \n", "13637 | \n", "0 | \n", "[5169 3125 1106 169 5212 2044 3974 3670 4889 ... | \n", "
5566 | \n", "5567 | \n", "0 | \n", "[ 0 0 0 1539 1759 266 292 1581 3125 ... | \n", "
13427 | \n", "13428 | \n", "0 | \n", "[ 998 1759 3125 3037 2575 4683 2177 4253 1105 ... | \n", "
5976 | \n", "5977 | \n", "1 | \n", "[2187 2206 2214 3938 677 3967 455 123 148 ... | \n", "
\n", " | name | \n", "content | \n", "label | \n", "
---|---|---|---|
0 | \n", "14001 | \n", "[3125 2196 286 123 1539 1759 266 3549 649 ... | \n", "0.0 | \n", "
1 | \n", "14002 | \n", "[1109 2113 3122 213 3125 1294 5212 2338 2233 ... | \n", "0.0 | \n", "
2 | \n", "14003 | \n", "[ 236 3125 139 3037 5212 4294 1600 4550 3169 ... | \n", "0.0 | \n", "
3 | \n", "14004 | \n", "[ 13 13 13 0 0 0 245 1472 3125 ... | \n", "0.0 | \n", "
4 | \n", "14005 | \n", "[2113 2444 139 1109 4648 4626 181 3635 1145 ... | \n", "0.0 | \n", "
5 | \n", "14006 | \n", "[ 664 139 220 1759 248 2188 664 4544 3125 ... | \n", "0.0 | \n", "
6 | \n", "14007 | \n", "[1472 2214 5212 1759 3125 1294 199 675 3037 ... | \n", "0.0 | \n", "
7 | \n", "14008 | \n", "[ 13 0 0 0 216 2176 3125 526 1100 ... | \n", "0.0 | \n", "
8 | \n", "14009 | \n", "[5212 1339 3747 4242 1759 266 2101 3122 1115 ... | \n", "0.0 | \n", "
9 | \n", "14010 | \n", "[3125 2278 983 4982 2138 284 3635 5212 2113 ... | \n", "0.0 | \n", "