{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Huấn luyện một mô hình ngôn ngữ nhân quả từ đầu (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def any_keyword_in_string(string, keywords):\n", " for keyword in keywords:\n", " if keyword in string:\n", " return True\n", " return False" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filters = [\"pandas\", \"sklearn\", \"matplotlib\", \"seaborn\"]\n", "example_1 = \"import numpy as np\"\n", "example_2 = \"import pandas as pd\"\n", "\n", "print(\n", " any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "from tqdm import tqdm\n", "from datasets import Dataset\n", "\n", "\n", "def filter_streaming_dataset(dataset, filters):\n", " filtered_dict = defaultdict(list)\n", " total = 0\n", " for sample in tqdm(iter(dataset)):\n", " total += 1\n", " if any_keyword_in_string(sample[\"content\"], filters):\n", " for k, v in sample.items():\n", " filtered_dict[k].append(v)\n", " print(f\"{len(filtered_dict['content'])/total:.2%} of data after filtering.\")\n", " return Dataset.from_dict(filtered_dict)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.26% of data after filtering." 
] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ô này sẽ mất rất nhiều thời gian để thực thi, vì vậy bạn nên bỏ qua và chuyển đến\n", "# cái tiếp theo!\n", "from datasets import load_dataset\n", "\n", "split = \"train\" # \"valid\"\n", "filters = [\"pandas\", \"sklearn\", \"matplotlib\", \"seaborn\"]\n", "\n", "data = load_dataset(f\"transformersbook/codeparrot-{split}\", split=split, streaming=True)\n", "filtered_data = filter_streaming_dataset(data, filters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],\n", " num_rows: 606720\n", " })\n", " valid: Dataset({\n", " features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],\n", " num_rows: 3322\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset, DatasetDict\n", "\n", "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n", "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"validation\")\n", "\n", "raw_datasets = DatasetDict(\n", " {\n", " \"train\": ds_train, # .shuffle().select(range(50000)),\n", " \"valid\": ds_valid, # .shuffle().select(range(500))\n", " }\n", ")\n", "\n", "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'REPO_NAME: kmike/scikit-learn'\n", "'PATH: sklearn/utils/__init__.py'\n", "'COPIES: 3'\n", "'SIZE: 10094'\n", "'''CONTENT: \"\"\"\n", "The :mod:`sklearn.utils` module includes various utilites.\n", "\"\"\"\n", "\n", "from collections import Sequence\n", "\n", "import numpy as np\n", "from scipy.sparse import issparse\n", "import warnings\n", "\n", "from .murmurhash import murm\n", "LICENSE: bsd-3-clause'''" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for key in raw_datasets[\"train\"][0]:\n", " print(f\"{key.upper()}: {raw_datasets['train'][0][key][:200]}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Input IDs length: 34\n", "Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]\n", "Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "context_length = 128\n", "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n", "\n", "outputs = tokenizer(\n", " raw_datasets[\"train\"][:2][\"content\"],\n", " truncation=True,\n", " max_length=context_length,\n", " return_overflowing_tokens=True,\n", " return_length=True,\n", ")\n", "\n", "print(f\"Input IDs length: {len(outputs['input_ids'])}\")\n", "print(f\"Input chunk lengths: {(outputs['length'])}\")\n", "print(f\"Chunk mapping: {outputs['overflow_to_sample_mapping']}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['input_ids'],\n", " num_rows: 16702061\n", " })\n", " valid: Dataset({\n", " 
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", "    train: Dataset({\n", "        features: ['input_ids'],\n", "        num_rows: 16702061\n", "    })\n", "    valid: Dataset({\n", "        features: ['input_ids'],\n", "        num_rows: 93164\n", "    })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def tokenize(element):\n", "    outputs = tokenizer(\n", "        element[\"content\"],\n", "        truncation=True,\n", "        max_length=context_length,\n", "        return_overflowing_tokens=True,\n", "        return_length=True,\n", "    )\n", "    input_batch = []\n", "    for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n", "        if length == context_length:\n", "            input_batch.append(input_ids)\n", "    return {\"input_ids\": input_batch}\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(\n", "    tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n", ")\n", "tokenized_datasets" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig\n", "\n", "config = AutoConfig.from_pretrained(\n", "    \"gpt2\",\n", "    vocab_size=len(tokenizer),\n", "    n_ctx=context_length,\n", "    bos_token_id=tokenizer.bos_token_id,\n", "    eos_token_id=tokenizer.eos_token_id,\n", ")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "_________________________________________________________________\n", "Layer (type)                 Output Shape              Param #\n", "=================================================================\n", "transformer (TFGPT2MainLayer multiple                  124242432\n", "=================================================================\n", "Total params: 124,242,432\n", "Trainable params: 124,242,432\n", "Non-trainable params: 0\n", "_________________________________________________________________" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = TFGPT2LMHeadModel(config)\n", "model(model.dummy_inputs)  # Build the model\n", "model.summary()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForLanguageModeling\n", "\n", "tokenizer.pad_token = tokenizer.eos_token\n", "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors=\"tf\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "input_ids shape: (5, 128)\n", "attention_mask shape: (5, 128)\n", "labels shape: (5, 128)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out = data_collator([tokenized_datasets[\"train\"][i] for i in range(5)])\n", "for key in out:\n", "    print(f\"{key} shape: {out[key].shape}\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n", "    collate_fn=data_collator,\n", "    shuffle=True,\n", "    batch_size=32,\n", ")\n", "tf_eval_dataset = tokenized_datasets[\"valid\"].to_tf_dataset(\n", "    columns=[\"input_ids\", \"attention_mask\", \"labels\"],\n", "    collate_fn=data_collator,\n", "    shuffle=False,\n", "    batch_size=32,\n", ")" ] },
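 { "cell_type": "markdown", "metadata": {}, "source": [ "Before training, it can be reassuring to pull a single batch out of the `tf.data` pipeline and check the shapes the model will see. This is an illustrative check, not part of the original notebook:" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative only: inspect one batch from the tf.data pipeline;\n", "# each tensor should have shape (batch_size, context_length) = (32, 128)\n", "batch = next(iter(tf_train_dataset))\n", "for key, value in batch.items():\n", "    print(key, value.shape)" ] },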
len(tf_train_dataset)\n", "optimizer, schedule = create_optimizer(\n", " init_lr=5e-5,\n", " num_warmup_steps=1_000,\n", " num_train_steps=num_train_steps,\n", " weight_decay_rate=0.01,\n", ")\n", "model.compile(optimizer=optimizer)\n", "\n", "# Huấn luyện trong mixed-precision float16\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers.keras_callbacks import PushToHubCallback\n", "\n", "callback = PushToHubCallback(output_dir=\"codeparrot-ds\", tokenizer=tokenizer)\n", "\n", "model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "course_model = TFGPT2LMHeadModel.from_pretrained(\"huggingface-course/codeparrot-ds\")\n", "course_tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/codeparrot-ds\")\n", "pipe = pipeline(\n", " \"text-generation\", model=course_model, tokenizer=course_tokenizer, device=0\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "# create some data\n", "x = np.random.randn(100)\n", "y = np.random.randn(100)\n", "\n", "# create scatter plot with x, y\n", "plt.scatter(x, y)\n", "\n", "# create scatter" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "txt = \"\"\"\\\n", "# create some data\n", "x = np.random.randn(100)\n", "y = np.random.randn(100)\n", "\n", "# create scatter plot with x, y\n", "\"\"\"\n", "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "# create some data\n", "x = np.random.randn(100)\n", "y = np.random.randn(100)\n", "\n", "# create dataframe from x and y\n", "df = pd.DataFrame({'x': x, 'y': y})\n", "df.insert(0,'x', x)\n", "for" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "txt = \"\"\"\\\n", "# create some data\n", "x = np.random.randn(100)\n", "y = np.random.randn(100)\n", "\n", "# create dataframe from x and y\n", "\"\"\"\n", "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "# dataframe with profession, income and name\n", "df = pd.DataFrame({'profession': x, 'income':y, 'name': z})\n", "\n", "# calculate the mean income per profession\n", "profession = df.groupby(['profession']).mean()\n", "\n", "# compute the" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "txt = \"\"\"\\\n", "# dataframe with profession, income and name\n", "df = pd.DataFrame({'profession': x, 'income':y, 'name': z})\n", "\n", "# calculate the mean income per profession\n", "\"\"\"\n", "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "# import random forest regressor from scikit-learn\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", "# fit random forest model with 300 estimators on X, y:\n", "rf = RandomForestRegressor(n_estimators=300, random_state=random_state, max_depth=3)\n", "rf.fit(X, y)\n", "rf" ] }, "execution_count": null, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "txt = \"\"\"\n", "# import random forest regressor from scikit-learn\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", "# fit random forest model with 300 estimators on X, y:\n", "\"\"\"\n", "print(pipe(txt, num_return_sequences=1)[0][\"generated_text\"])" ] } ], "metadata": { "colab": { "name": "Huấn luyện một mô hình ngôn ngữ nhân quả từ đầu (TensorFlow)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }