{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Huấn luyện một mô hình ngôn ngữ nhân quả từ đầu (TensorFlow)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]\n",
    "!apt install git-lfs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You will need to setup git, adapt your email and name in the following cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!git config --global user.email \"you@example.com\"\n",
    "!git config --global user.name \"Your Name\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def any_keyword_in_string(string, keywords):\n",
    "    for keyword in keywords:\n",
    "        if keyword in string:\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False True"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filters = [\"pandas\", \"sklearn\", \"matplotlib\", \"seaborn\"]\n",
    "example_1 = \"import numpy as np\"\n",
    "example_2 = \"import pandas as pd\"\n",
    "\n",
    "print(\n",
    "    any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from tqdm import tqdm\n",
    "from datasets import Dataset\n",
    "\n",
    "\n",
    "def filter_streaming_dataset(dataset, filters):\n",
    "    filtered_dict = defaultdict(list)\n",
    "    total = 0\n",
    "    for sample in tqdm(iter(dataset)):\n",
    "        total += 1\n",
    "        if any_keyword_in_string(sample[\"content\"], filters):\n",
    "            for k, v in sample.items():\n",
    "                filtered_dict[k].append(v)\n",
    "    print(f\"{len(filtered_dict['content'])/total:.2%} of data after filtering.\")\n",
    "    return Dataset.from_dict(filtered_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.26% of data after filtering."
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ô này sẽ mất rất nhiều thời gian để thực thi, vì vậy bạn nên bỏ qua và chuyển đến\n",
    "# cái tiếp theo!\n",
    "from datasets import load_dataset\n",
    "\n",
    "split = \"train\"  # \"valid\"\n",
    "filters = [\"pandas\", \"sklearn\", \"matplotlib\", \"seaborn\"]\n",
    "\n",
    "data = load_dataset(f\"transformersbook/codeparrot-{split}\", split=split, streaming=True)\n",
    "filtered_data = filter_streaming_dataset(data, filters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],\n",
       "        num_rows: 606720\n",
       "    })\n",
       "    valid: Dataset({\n",
       "        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],\n",
       "        num_rows: 3322\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset, DatasetDict\n",
    "\n",
    "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n",
    "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"validation\")\n",
    "\n",
    "raw_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": ds_train,  # .shuffle().select(range(50000)),\n",
    "        \"valid\": ds_valid,  # .shuffle().select(range(500))\n",
    "    }\n",
    ")\n",
    "\n",
    "raw_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'REPO_NAME: kmike/scikit-learn'\n",
       "'PATH: sklearn/utils/__init__.py'\n",
       "'COPIES: 3'\n",
       "'SIZE: 10094'\n",
       "'''CONTENT: \"\"\"\n",
       "The :mod:`sklearn.utils` module includes various utilites.\n",
       "\"\"\"\n",
       "\n",
       "from collections import Sequence\n",
       "\n",
       "import numpy as np\n",
       "from scipy.sparse import issparse\n",
       "import warnings\n",
       "\n",
       "from .murmurhash import murm\n",
       "LICENSE: bsd-3-clause'''"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for key in raw_datasets[\"train\"][0]:\n",
    "    print(f\"{key.upper()}: {raw_datasets['train'][0][key][:200]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Input IDs length: 34\n",
       "Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]\n",
       "Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "context_length = 128\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
    "\n",
    "outputs = tokenizer(\n",
    "    raw_datasets[\"train\"][:2][\"content\"],\n",
    "    truncation=True,\n",
    "    max_length=context_length,\n",
    "    return_overflowing_tokens=True,\n",
    "    return_length=True,\n",
    ")\n",
    "\n",
    "print(f\"Input IDs length: {len(outputs['input_ids'])}\")\n",
    "print(f\"Input chunk lengths: {(outputs['length'])}\")\n",
    "print(f\"Chunk mapping: {outputs['overflow_to_sample_mapping']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids'],\n",
       "        num_rows: 16702061\n",
       "    })\n",
       "    valid: Dataset({\n",
       "        features: ['input_ids'],\n",
       "        num_rows: 93164\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def tokenize(element):\n",
    "    outputs = tokenizer(\n",
    "        element[\"content\"],\n",
    "        truncation=True,\n",
    "        max_length=context_length,\n",
    "        return_overflowing_tokens=True,\n",
    "        return_length=True,\n",
    "    )\n",
    "    input_batch = []\n",
    "    for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n",
    "        if length == context_length:\n",
    "            input_batch.append(input_ids)\n",
    "    return {\"input_ids\": input_batch}\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(\n",
    "    tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n",
    ")\n",
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig\n",
    "\n",
    "config = AutoConfig.from_pretrained(\n",
    "    \"gpt2\",\n",
    "    vocab_size=len(tokenizer),\n",
    "    n_ctx=context_length,\n",
    "    bos_token_id=tokenizer.bos_token_id,\n",
    "    eos_token_id=tokenizer.eos_token_id,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "_________________________________________________________________\n",
       "Layer (type)                 Output Shape              Param #\n",
       "=================================================================\n",
       "transformer (TFGPT2MainLayer multiple                  124242432\n",
       "=================================================================\n",
       "Total params: 124,242,432\n",
       "Trainable params: 124,242,432\n",
       "Non-trainable params: 0\n",
       "_________________________________________________________________"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = TFGPT2LMHeadModel(config)\n",
    "model(model.dummy_inputs)  # Xây mô hình\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForLanguageModeling\n",
    "\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors=\"tf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "input_ids shape: (5, 128)\n",
       "attention_mask shape: (5, 128)\n",
       "labels shape: (5, 128)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out = data_collator([tokenized_datasets[\"train\"][i] for i in range(5)])\n",
    "for key in out:\n",
    "    print(f\"{key} shape: {out[key].shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_train_dataset = tokenized_dataset[\"train\"].to_tf_dataset(\n",
    "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n",
    "    collate_fn=data_collator,\n",
    "    shuffle=True,\n",
    "    batch_size=32,\n",
    ")\n",
    "tf_eval_dataset = tokenized_dataset[\"valid\"].to_tf_dataset(\n",
    "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n",
    "    collate_fn=data_collator,\n",
    "    shuffle=False,\n",
    "    batch_size=32,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import create_optimizer\n",
    "import tensorflow as tf\n",
    "\n",
    "num_train_steps = len(tf_train_dataset)\n",
    "optimizer, schedule = create_optimizer(\n",
    "    init_lr=5e-5,\n",
    "    num_warmup_steps=1_000,\n",
    "    num_train_steps=num_train_steps,\n",
    "    weight_decay_rate=0.01,\n",
    ")\n",
    "model.compile(optimizer=optimizer)\n",
    "\n",
    "# Huấn luyện trong mixed-precision float16\n",
    "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers.keras_callbacks import PushToHubCallback\n",
    "\n",
    "callback = PushToHubCallback(output_dir=\"codeparrot-ds\", tokenizer=tokenizer)\n",
    "\n",
    "model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "course_model = TFGPT2LMHeadModel.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
    "course_tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
    "pipe = pipeline(\n",
    "    \"text-generation\", model=course_model, tokenizer=course_tokenizer, device=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "# create some data\n",
       "x = np.random.randn(100)\n",
       "y = np.random.randn(100)\n",
       "\n",
       "# create scatter plot with x, y\n",
       "plt.scatter(x, y)\n",
       "\n",
       "# create scatter"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = \"\"\"\\\n",
    "# create some data\n",
    "x = np.random.randn(100)\n",
    "y = np.random.randn(100)\n",
    "\n",
    "# create scatter plot with x, y\n",
    "\"\"\"\n",
    "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "# create some data\n",
       "x = np.random.randn(100)\n",
       "y = np.random.randn(100)\n",
       "\n",
       "# create dataframe from x and y\n",
       "df = pd.DataFrame({'x': x, 'y': y})\n",
       "df.insert(0,'x', x)\n",
       "for"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = \"\"\"\\\n",
    "# create some data\n",
    "x = np.random.randn(100)\n",
    "y = np.random.randn(100)\n",
    "\n",
    "# create dataframe from x and y\n",
    "\"\"\"\n",
    "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "# dataframe with profession, income and name\n",
       "df = pd.DataFrame({'profession': x, 'income':y, 'name': z})\n",
       "\n",
       "# calculate the mean income per profession\n",
       "profession = df.groupby(['profession']).mean()\n",
       "\n",
       "# compute the"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = \"\"\"\\\n",
    "# dataframe with profession, income and name\n",
    "df = pd.DataFrame({'profession': x, 'income':y, 'name': z})\n",
    "\n",
    "# calculate the mean income per profession\n",
    "\"\"\"\n",
    "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "# import random forest regressor from scikit-learn\n",
       "from sklearn.ensemble import RandomForestRegressor\n",
       "\n",
       "# fit random forest model with 300 estimators on X, y:\n",
       "rf = RandomForestRegressor(n_estimators=300, random_state=random_state, max_depth=3)\n",
       "rf.fit(X, y)\n",
       "rf"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = \"\"\"\n",
    "# import random forest regressor from scikit-learn\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "# fit random forest model with 300 estimators on X, y:\n",
    "\"\"\"\n",
    "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Huấn luyện một mô hình ngôn ngữ nhân quả từ đầu (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}