{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook [3]: Training the reader on the SQuAD v1.1 dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook shows how to fine-tune a pre-trained BERT model on the SQuAD v1.1 dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***Note:*** *To run this notebook you will need access to a GPU. The fine-tuning of the Reader was done on an AWS EC2 p3.2xlarge machine (GPU Tesla V100 16GB). It took about 2 hours to complete (2 epochs on the SQuAD 1.1 train set were enough to achieve SOTA results on the SQuAD 1.1 dev set).*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:45:44.624084Z",
     "start_time": "2019-07-20T13:45:41.394789Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import joblib\n",
    "from cdqa.reader import BertProcessor, BertQA\n",
    "from cdqa.utils.download import download_squad"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download SQuAD datasets\n",
    "\n",
    "The SQuAD v1.1 and v2.0 files are downloaded into `./data`; files that are already present are not downloaded again (see the output below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:46:13.505754Z",
     "start_time": "2019-07-20T13:46:00.589821Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading SQuAD v1.1 data...\n",
      "train-v1.1.json already downloaded\n",
      "dev-v1.1.json already downloaded\n",
      "\n",
      "Downloading SQuAD v2.0 data...\n",
      "train-v2.0.json already downloaded\n",
      "dev-v2.0.json already downloaded\n"
     ]
    }
   ],
   "source": [
    "download_squad(dir='./data')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocess SQuAD 1.1 examples\n",
    "\n",
    "The training file is converted into the examples and features that the reader is fitted on. In the recorded run this cell took about 12 minutes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:58:36.512980Z",
     "start_time": "2019-07-20T13:46:44.792080Z"
    }
   },
   "outputs": [],
   "source": [
    "train_processor = BertProcessor(do_lower_case=True, is_training=True, n_jobs=-1)\n",
    "train_examples, train_features = train_processor.fit_transform(X='./data/SQuAD_1.1/train-v1.1.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model\n",
    "\n",
    "The reader is fine-tuned for 2 epochs with a batch size of 12 and a learning rate of 3e-5. This is the long-running, GPU-bound step of the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reader = BertQA(train_batch_size=12,\n",
    "                learning_rate=3e-5,\n",
    "                num_train_epochs=2,\n",
    "                do_lower_case=True,\n",
    "                output_dir='models')\n",
    "\n",
    "reader.fit(X=(train_examples, train_features))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Send model to CPU\n",
    "\n",
    "The model is moved to the CPU before it is serialised, so that the saved reader can later be loaded on a machine that has no GPU."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reader.model.to('cpu')\n",
    "reader.device = torch.device('cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save model locally\n",
    "\n",
    "The fitted reader is written to `bert_qa.joblib` inside the reader's output directory (`models`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure the output directory exists so that a long training run is not lost to a missing path.\n",
    "os.makedirs(reader.output_dir, exist_ok=True)\n",
    "joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa.joblib'))"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}