{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shapes: train_X (4838, 5), test_X (53766, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugNameconditionreviewdateusefulCount
3969CitalopramDepression\"Celexa is a literally a magic pill, for me. 2...December 8, 201448
4926PrevacidGERD\"I have tried almost every acid reducer made i...January 16, 201633
80152AubraBirth Control\"I hate this pill, extreme depression, breast ...December 4, 20165
79507Microgestin Fe 1.5 / 30Abnormal Uterine Bleeding\"I had been on my period for 3months. The micr...June 27, 20166
130035AzuretteBirth Control\"Effective at preventing pregnancy. My skin wa...October 14, 201319
\n", "
" ], "text/plain": [ " drugName condition \\\n", "3969 Citalopram Depression \n", "4926 Prevacid GERD \n", "80152 Aubra Birth Control \n", "79507 Microgestin Fe 1.5 / 30 Abnormal Uterine Bleeding \n", "130035 Azurette Birth Control \n", "\n", " review date \\\n", "3969 \"Celexa is a literally a magic pill, for me. 2... December 8, 2014 \n", "4926 \"I have tried almost every acid reducer made i... January 16, 2016 \n", "80152 \"I hate this pill, extreme depression, breast ... December 4, 2016 \n", "79507 \"I had been on my period for 3months. The micr... June 27, 2016 \n", "130035 \"Effective at preventing pregnancy. My skin wa... October 14, 2013 \n", "\n", " usefulCount \n", "3969 48 \n", "4926 33 \n", "80152 5 \n", "79507 6 \n", "130035 19 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from lale.datasets.uci.uci_datasets import fetch_drugscom\n", "from sklearn.model_selection import train_test_split\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "import lale.schema2enums\n", "train_X_all, train_y_all, test_X, test_y = fetch_drugscom()\n", "#subset 3% = 4,838 rows to speed up experimentation\n", "train_X, train_X_ignore, train_y, train_y_ignore = train_test_split(\n", " train_X_all, train_y_all, train_size=0.03, random_state=42)\n", "print(f'shapes: train_X {train_X.shape}, test_X {test_X.shape}')\n", "train_X.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_0->tfidf\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "tfidf->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "lin_r\n", "\n", "\n", "Lin-\n", "R\n", "\n", "\n", "\n", "\n", "cat->lin_r\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.lale import Project\n", "from lale.lib.lale import ConcatFeatures as Cat\n", "from lale.lib.sklearn import TfidfVectorizer as Tfidf\n", "from lale.lib.sklearn import LinearRegression as LinR\n", "trainable = (\n", " Project(columns=['review']) >> Tfidf(max_features=100)\n", " & Project(columns={'type': 'number'})\n", " ) >> Cat >> LinR()\n", "trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_0->tfidf\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "tfidf->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "lin_r\n", "\n", "\n", "Lin-\n", "R\n", "\n", "\n", "\n", "\n", "cat->lin_r\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trained = trainable.fit(train_X, train_y)\n", "predicted = trained.predict(test_X)\n", "trained.visualize()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "comparisons [10:7.8, 8:9.1, 9:5.7, 9:7.4, 9:5.8, 4:7.6, 6:5.4, 9:8.4, 7:5.9, 2:6.5]\n", "RMSE 3.00\n" ] } ], "source": [ "import math\n", "from sklearn.metrics import mean_squared_error\n", "comparisons = [f'{int(test_y[i])}:{predicted[i]:.1f}' for i in range(10)]\n", "print(f'comparisons [{\", \".join(comparisons)}]')\n", "rmse = math.sqrt(mean_squared_error(predicted, test_y))\n", "print(f'RMSE {rmse:.2f}')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_0->tfidf\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "tfidf->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "xgb\n", "\n", "\n", "XGB\n", "\n", "\n", "\n", "\n", "cat->xgb\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.xgboost import XGBRegressor as XGB\n", "trainable = (\n", " Project(columns=['review']) >> Tfidf(max_features=100)\n", " & Project(columns={'type': 'number'})\n", " ) >> Cat >> XGB()\n", "trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "comparisons [10:8.2, 8:7.8, 9:4.8, 9:7.4, 9:4.7, 4:7.1, 6:4.1, 9:7.4, 7:6.8, 2:6.2]\n", "RMSE 2.98\n" ] } ], "source": [ "trained = trainable.fit(train_X, train_y)\n", "predicted = trained.predict(test_X)\n", "comparisons = [f'{int(test_y[i])}:{predicted[i]:.1f}' for i in range(10)]\n", "print(f'comparisons [{\", \".join(comparisons)}]')\n", "rmse = math.sqrt(mean_squared_error(predicted, test_y))\n", "print(f'RMSE {rmse:.2f}')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_0->tfidf\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "tfidf->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "forest\n", "\n", "\n", "Forest\n", "\n", "\n", "\n", "\n", "cat->forest\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.sklearn import RandomForestRegressor as Forest\n", "tfidf_hps = {**Tfidf.get_defaults(), 'max_features': 100, 'ngram_range': (1,1)}\n", "planned = (\n", " Project(columns=['review']) >> Tfidf(**tfidf_hps)\n", " & Project(columns={'type': 'number'})\n", " ) >> Cat >> Forest\n", "planned.visualize()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|███████████| 3/3 [01:32<00:00, 29.61s/it, best loss: -0.10521892976978495]\n", "RMSE 3.12\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_0->tfidf\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "tfidf->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "forest\n", "\n", "\n", "Forest\n", "\n", "\n", "\n", "\n", "cat->forest\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.lale import Hyperopt\n", "best_estimator = planned.auto_configure(train_X, train_y, optimizer=Hyperopt, max_evals=3, scoring='r2')\n", "predicted = best_estimator.predict(test_X)\n", "rmse = math.sqrt(mean_squared_error(predicted, test_y))\n", "print(f'RMSE {rmse:.2f}')\n", "best_estimator.visualize()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "bert\n", "\n", "\n", "Bert\n", "\n", "\n", "\n", "\n", "project_0->bert\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "bert->cat\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "lin_r\n", "\n", "\n", "Lin-\n", "R\n", "\n", "\n", "\n", "\n", "cat->lin_r\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.lale import Project\n", "from lale.lib.pytorch import BertPretrainedEncoder as Bert\n", "from lale.lib.lale import ConcatFeatures as Cat\n", "from lale.lib.sklearn import LinearRegression as LinR\n", "trainable = (\n", " Project(columns=['review']) >> Bert(batch_size=126)\n", " & Project(columns={'type': 'number'})\n", " ) >> Cat >> LinR()\n", "trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "bert\n", "\n", "\n", "Bert\n", "\n", "\n", "\n", "\n", "project_0->bert\n", "\n", "\n", "\n", "\n", "cat\n", "\n", "\n", "Cat\n", "\n", "\n", "\n", "\n", "bert->cat\n", "\n", "\n", "\n", "\n", "tfidf\n", "\n", "\n", "Tfidf\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "project_1->cat\n", "\n", "\n", "\n", "\n", "lin_r\n", "\n", "\n", "Lin-\n", "R\n", "\n", "\n", "\n", "\n", "cat->lin_r\n", "\n", "\n", "\n", "\n", "xgb\n", "\n", "\n", "XGB\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.lale import Project\n", "from lale.lib.sklearn import TfidfVectorizer as Tfidf\n", "from lale.lib.pytorch import BertPretrainedEncoder as Bert\n", "from lale.lib.lale import ConcatFeatures as Cat\n", "from lale.lib.sklearn import LinearRegression as LinR\n", "from lale.lib.xgboost import XGBRegressor as XGB\n", "planned = (\n", " Project(columns=['review']) >> (Bert | Tfidf)\n", " & Project(columns={'type': 'number'})\n", " ) >> Cat >> (LinR | XGB)\n", "planned.visualize()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }