{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Time to slice and dice" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wget \"https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip\"\n", "!unzip drugsCom_raw.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "data_files = {\"train\": \"drugsComTrain_raw.tsv\", \"test\": \"drugsComTest_raw.tsv\"}\n", "# \\t is the tab character in Python\n", "drug_dataset = load_dataset(\"csv\", data_files=data_files, delimiter=\"\\t\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Unnamed: 0': [87571, 178045, 80482],\n", " 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],\n", " 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],\n", " 'review': ['\"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!\"',\n", " '\"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\\r\\nas a pain reducer and an anti-depressant, however, the side effects outweighed \\r\\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\\r\\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\\r\\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\\r\\noff completely in about a week. 
The fibro pain is coming back, but I would rather deal with it than the side effects.\"',\n", " '\"I have been taking Mobic for over a year with no side effects other than an elevated blood pressure. I had severe knee and ankle pain which completely went away after taking Mobic. I attempted to stop the medication however pain returned after a few days.\"'],\n", " 'rating': [9.0, 3.0, 10.0],\n", " 'date': ['September 2, 2015', 'November 7, 2011', 'June 5, 2013'],\n", " 'usefulCount': [36, 13, 128]}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_sample = drug_dataset[\"train\"].shuffle(seed=42).select(range(1000))\n", "# Peek at the first few examples\n", "drug_sample[:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for split in drug_dataset.keys():\n", " assert len(drug_dataset[split]) == len(drug_dataset[split].unique(\"Unnamed: 0\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],\n", " num_rows: 161297\n", " })\n", " test: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],\n", " num_rows: 53766\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_dataset = drug_dataset.rename_column(\n", " original_column_name=\"Unnamed: 0\", new_column_name=\"patient_id\"\n", ")\n", "drug_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AttributeError: 'NoneType' object has no attribute 'lower'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def lowercase_condition(example):\n", " return {\"condition\": 
example[\"condition\"].lower()}\n", "\n", "\n", "drug_dataset.map(lowercase_condition)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_nones(x):\n", " return x[\"condition\"] is not None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(lambda x: x * x)(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16.0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(lambda base, height: 0.5 * base * height)(4, 8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset = drug_dataset.filter(lambda x: x[\"condition\"] is not None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['left ventricular dysfunction', 'adhd', 'birth control']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_dataset = drug_dataset.map(lowercase_condition)\n", "# Check that lowercasing worked\n", "drug_dataset[\"train\"][\"condition\"][:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def compute_review_length(example):\n", " return {\"review_length\": len(example[\"review\"].split())}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'patient_id': 206461,\n", " 'drugName': 'Valsartan',\n", " 'condition': 'left ventricular dysfunction',\n", " 'review': '\"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil\"',\n", " 'rating': 9.0,\n", " 'date': 'May 20, 2012',\n", " 'usefulCount': 27,\n", " 'review_length': 17}" ] }, "execution_count": null, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "drug_dataset = drug_dataset.map(compute_review_length)\n", "# Inspect the first training example\n", "drug_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'patient_id': [103488, 23627, 20558],\n", " 'drugName': ['Loestrin 21 1 / 20', 'Chlorzoxazone', 'Nucynta'],\n", " 'condition': ['birth control', 'muscle spasm', 'pain'],\n", " 'review': ['\"Excellent.\"', '\"useless\"', '\"ok\"'],\n", " 'rating': [10.0, 1.0, 6.0],\n", " 'date': ['November 4, 2008', 'March 24, 2017', 'August 20, 2016'],\n", " 'usefulCount': [5, 2, 10],\n", " 'review_length': [1, 1, 1]}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_dataset[\"train\"].sort(\"review_length\")[:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'train': 138514, 'test': 46108}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_dataset = drug_dataset.filter(lambda x: x[\"review_length\"] > 30)\n", "print(drug_dataset.num_rows)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"I'm a transformer called BERT\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import html\n", "\n", "text = \"I'm a transformer called BERT\"\n", "html.unescape(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset = drug_dataset.map(lambda x: {\"review\": html.unescape(x[\"review\"])})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_drug_dataset = drug_dataset.map(\n", " lambda x: {\"review\": [html.unescape(o) for o in x[\"review\"]]}, batched=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "\n", "\n", "def tokenize_function(examples):\n", " return tokenizer(examples[\"review\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n", "\n", "\n", "def slow_tokenize_function(examples):\n", " return slow_tokenizer(examples[\"review\"], truncation=True)\n", "\n", "\n", "tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_split(examples):\n", " return tokenizer(\n", " examples[\"review\"],\n", " truncation=True,\n", " max_length=128,\n", " return_overflowing_tokens=True,\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[128, 49]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = tokenize_and_split(drug_dataset[\"train\"][0])\n", "[len(inp) for inp in result[\"input_ids\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ArrowInvalid: Column 1 named condition expected length 1463 but got length 1000" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_dataset = drug_dataset.map(\n", " tokenize_and_split, batched=True, 
remove_columns=drug_dataset[\"train\"].column_names\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(206772, 138514)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokenized_dataset[\"train\"]), len(drug_dataset[\"train\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_split(examples):\n", " result = tokenizer(\n", " examples[\"review\"],\n", " truncation=True,\n", " max_length=128,\n", " return_overflowing_tokens=True,\n", " )\n", " # Extract mapping between new and old indices\n", " sample_map = result.pop(\"overflow_to_sample_mapping\")\n", " for key, values in examples.items():\n", " result[key] = [values[i] for i in sample_map]\n", " return result" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],\n", " num_rows: 206772\n", " })\n", " test: Dataset({\n", " features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],\n", " num_rows: 68876\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)\n", "tokenized_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset.set_format(\"pandas\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset[\"train\"][:3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = 
drug_dataset[\"train\"][:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# value_counts() names its result differently across pandas versions (the counts\n", "# column is \"count\" from pandas 2.0 on), so name the axis and the counts explicitly.\n", "frequencies = (\n", "    train_df[\"condition\"]\n", "    .value_counts()\n", "    .rename_axis(\"condition\")\n", "    .reset_index(name=\"frequency\")\n", ")\n", "frequencies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['condition', 'frequency'],\n", " num_rows: 819\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import Dataset\n", "\n", "freq_dataset = Dataset.from_pandas(frequencies)\n", "freq_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset.reset_format()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 110811\n", " })\n", " validation: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 27703\n", " })\n", " test: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 46108\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drug_dataset_clean = drug_dataset[\"train\"].train_test_split(train_size=0.8, seed=42)\n", "# Rename the default \"test\" split to \"validation\"\n", "drug_dataset_clean[\"validation\"] = drug_dataset_clean.pop(\"test\")\n", "# Add the \"test\" set to our `DatasetDict`\n", "drug_dataset_clean[\"test\"] = drug_dataset[\"test\"]\n", 
"drug_dataset_clean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drug_dataset_clean.save_to_disk(\"drug-reviews\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 110811\n", " })\n", " validation: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 27703\n", " })\n", " test: Dataset({\n", " features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],\n", " num_rows: 46108\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_from_disk\n", "\n", "drug_dataset_reloaded = load_from_disk(\"drug-reviews\")\n", "drug_dataset_reloaded" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for split, dataset in drug_dataset_clean.items():\n", " dataset.to_json(f\"drug-reviews-{split}.jsonl\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{\"patient_id\":141780,\"drugName\":\"Escitalopram\",\"condition\":\"depression\",\"review\":\"\\\"I seemed to experience the regular side effects of LEXAPRO, insomnia, low sex drive, sleepiness during the day. I am taking it at night because my doctor said if it made me tired to take it at night. I assumed it would and started out taking it at night. Strange dreams, some pleasant. I was diagnosed with fibromyalgia. Seems to be helping with the pain. Have had anxiety and depression in my family, and have tried quite a few other medications that haven't worked. 
Only have been on it for two weeks but feel more positive in my mind, want to accomplish more in my life. Hopefully the side effects will dwindle away, worth it to stick with it from hearing others responses. Great medication.\\\"\",\"rating\":9.0,\"date\":\"May 29, 2011\",\"usefulCount\":10,\"review_length\":125}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!head -n 1 drug-reviews-train.jsonl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_files = {\n", " \"train\": \"drug-reviews-train.jsonl\",\n", " \"validation\": \"drug-reviews-validation.jsonl\",\n", " \"test\": \"drug-reviews-test.jsonl\",\n", "}\n", "drug_dataset_reloaded = load_dataset(\"json\", data_files=data_files)" ] } ], "metadata": { "colab": { "name": "Time to slice and dice", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }