{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Training a new tokenizer from an old one" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "# This can take a few minutes to load, so grab a coffee or tea while you wait!\n", "raw_datasets = load_dataset(\"code_search_net\", \"python\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', \n", " 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', \n", " 'func_code_url'\n", " ],\n", " num_rows: 412178\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(raw_datasets[\"train\"][123456][\"whole_func_string\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Don't uncomment the following line unless your dataset is small!\n", "# training_corpus = [raw_datasets[\"train\"][i: i + 1000][\"whole_func_string\"] for i in range(0, len(raw_datasets[\"train\"]), 1000)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "training_corpus = (\n", " raw_datasets[\"train\"][i : i + 1000][\"whole_func_string\"]\n", " for i in range(0, len(raw_datasets[\"train\"]), 1000)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", "[]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen = (i for i in range(10))\n", "print(list(gen))\n", "print(list(gen))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_training_corpus():\n", " return (\n", " raw_datasets[\"train\"][i : i + 1000][\"whole_func_string\"]\n", " for i in range(0, len(raw_datasets[\"train\"]), 1000)\n", " )\n", "\n", "\n", "training_corpus = get_training_corpus()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_training_corpus():\n", " dataset = raw_datasets[\"train\"]\n", " for start_idx in range(0, len(dataset), 1000):\n", " samples = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "old_tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ\"\"\"', 'Add', 'Ġthe', 'Ġtwo',\n", " 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '.\"', '\"\"', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example = '''def add_numbers(a, b):\n", "    \"\"\"Add the two numbers `a` and `b`.\"\"\"\n", "    return a + b'''\n", "\n", "tokens = old_tokenizer.tokenize(example)\n", "tokens" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ\"\"\"', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`',\n", " 'a', '`', 'Ġand', 'Ġ`', 'b', '`.\"\"\"', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens = tokenizer.tokenize(example)\n", "tokens" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "27\n", "36" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(tokens))\n", "print(len(old_tokenizer.tokenize(example)))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['class', 'ĠLinear', 'Layer', '():', 'ĊĠĠĠ', 'Ġdef', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',',\n", " 'Ġoutput', '_', 'size', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_',\n", " 'size', ',', 'Ġoutput', '_', 'size', ')', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(',\n", " 'output', '_', 'size', ')', 'ĊĊĠĠĠ', 'Ġdef', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'ĊĠĠĠĠĠĠĠ',\n", " 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'ĊĠĠĠĠ']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example = \"\"\"class LinearLayer():\n", "    def __init__(self, input_size, output_size):\n", "        self.weight = torch.randn(input_size, output_size)\n", "        self.bias = torch.zeros(output_size)\n", "\n", "    def __call__(self, x):\n", "        return x @ self.weights + self.bias\n", "    \"\"\"\n", "tokenizer.tokenize(example)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.save_pretrained(\"code-search-net-tokenizer\")" ] },
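{ "cell_type": "markdown", "metadata": {}, "source": [ "As an optional check (an illustrative sketch, not part of the original notebook), you can reload the tokenizer from the folder you just saved and confirm it produces exactly the same tokens as the one in memory." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative check: a tokenizer reloaded from disk should behave\n", "# identically to the freshly trained one\n", "local_tokenizer = AutoTokenizer.from_pretrained(\"code-search-net-tokenizer\")\n", "assert local_tokenizer.tokenize(example) == tokenizer.tokenize(example)" ] },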
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.push_to_hub(\"code-search-net-tokenizer\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Replace \"huggingface-course\" below with your actual namespace to use your own tokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")" ] } ], "metadata": { "colab": { "name": "Training a new tokenizer from an old one", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }