{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This IPython notebook illustrates how to sample and label a table (candidate set).\n",
    "First, we need to import py_entitymatching package and other libraries as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Import py_entitymatching package\n",
    "import py_entitymatching as em\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Get the datasets directory\n",
    "datasets_dir = em.get_install_path() + os.sep + 'datasets'\n",
    "\n",
    "path_A = datasets_dir + os.sep + 'DBLP.csv'\n",
    "path_B = datasets_dir + os.sep + 'ACM.csv'\n",
    "path_C = datasets_dir + os.sep + 'tableC.csv'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Metadata file is not present in the given path; proceeding to read the csv file.\n",
      "Metadata file is not present in the given path; proceeding to read the csv file.\n",
      "Metadata file is not present in the given path; proceeding to read the csv file.\n"
     ]
    }
   ],
   "source": [
    "A = em.read_csv_metadata(path_A, key='id')\n",
    "B = em.read_csv_metadata(path_B, key='id')\n",
    "C = em.read_csv_metadata(path_C, key='_id', \n",
    "                         fk_ltable='ltable_id', fk_rtable='rtable_id',\n",
    "                         ltable=A, rtable=B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>_id</th>\n",
       "      <th>ltable_id</th>\n",
       "      <th>rtable_id</th>\n",
       "      <th>ltable_authors</th>\n",
       "      <th>ltable_title</th>\n",
       "      <th>rtable_authors</th>\n",
       "      <th>rtable_title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>conf/sigmod/AbadiC02</td>\n",
       "      <td>191915</td>\n",
       "      <td>Daniel J. Abadi, Mitch Cherniack</td>\n",
       "      <td>Visual COKO: a debugger for query optimizer development</td>\n",
       "      <td>Michael J. Carey, David J. DeWitt, Michael J. Franklin, Nancy E. Hall, Mark L. McAuliffe, Jeffre...</td>\n",
       "      <td>Shoring up persistent applications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>conf/sigmod/AbadiC02</td>\n",
       "      <td>191931</td>\n",
       "      <td>Daniel J. Abadi, Mitch Cherniack</td>\n",
       "      <td>Visual COKO: a debugger for query optimizer development</td>\n",
       "      <td>Daniel J. Dietterich</td>\n",
       "      <td>DEC data distributor: for data replication and data warehousing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>conf/sigmod/AbadiC02</td>\n",
       "      <td>233356</td>\n",
       "      <td>Daniel J. Abadi, Mitch Cherniack</td>\n",
       "      <td>Visual COKO: a debugger for query optimizer development</td>\n",
       "      <td>Mitch Cherniack, Stanley B. Zdonik</td>\n",
       "      <td>Rule languages and internal algebras for rule-based optimizers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>conf/sigmod/AbadiC02</td>\n",
       "      <td>276311</td>\n",
       "      <td>Daniel J. Abadi, Mitch Cherniack</td>\n",
       "      <td>Visual COKO: a debugger for query optimizer development</td>\n",
       "      <td>Mitch Cherniack, Stan Zdonik</td>\n",
       "      <td>Changing the rules: transformations for rule-based optimizers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>conf/sigmod/AbadiC02</td>\n",
       "      <td>335432</td>\n",
       "      <td>Daniel J. Abadi, Mitch Cherniack</td>\n",
       "      <td>Visual COKO: a debugger for query optimizer development</td>\n",
       "      <td>Jianjun Chen, David J. DeWitt, Feng Tian, Yuan Wang</td>\n",
       "      <td>NiagaraCQ: a scalable continuous query system for Internet databases</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   _id             ltable_id  rtable_id                    ltable_authors  \\\n",
       "0    0  conf/sigmod/AbadiC02     191915  Daniel J. Abadi, Mitch Cherniack   \n",
       "1    1  conf/sigmod/AbadiC02     191931  Daniel J. Abadi, Mitch Cherniack   \n",
       "2    2  conf/sigmod/AbadiC02     233356  Daniel J. Abadi, Mitch Cherniack   \n",
       "3    3  conf/sigmod/AbadiC02     276311  Daniel J. Abadi, Mitch Cherniack   \n",
       "4    4  conf/sigmod/AbadiC02     335432  Daniel J. Abadi, Mitch Cherniack   \n",
       "\n",
       "                                              ltable_title  \\\n",
       "0  Visual COKO: a debugger for query optimizer development   \n",
       "1  Visual COKO: a debugger for query optimizer development   \n",
       "2  Visual COKO: a debugger for query optimizer development   \n",
       "3  Visual COKO: a debugger for query optimizer development   \n",
       "4  Visual COKO: a debugger for query optimizer development   \n",
       "\n",
       "                                                                                        rtable_authors  \\\n",
       "0  Michael J. Carey, David J. DeWitt, Michael J. Franklin, Nancy E. Hall, Mark L. McAuliffe, Jeffre...   \n",
       "1                                                                                 Daniel J. Dietterich   \n",
       "2                                                                   Mitch Cherniack, Stanley B. Zdonik   \n",
       "3                                                                         Mitch Cherniack, Stan Zdonik   \n",
       "4                                                  Jianjun Chen, David J. DeWitt, Feng Tian, Yuan Wang   \n",
       "\n",
       "                                                           rtable_title  \n",
       "0                                    Shoring up persistent applications  \n",
       "1       DEC data distributor: for data replication and data warehousing  \n",
       "2        Rule languages and internal algebras for rule-based optimizers  \n",
       "3         Changing the rules: transformations for rule-based optimizers  \n",
       "4  NiagaraCQ: a scalable continuous query system for Internet databases  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14673"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(C)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample Candidate Set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From the candidate set, a sample (for labeling purposes) can be obtained like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "S = em.sample_table(C, 450)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Label the Sampled Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Column name (gold_label) is not present in dataframe\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "PyQt4 is not installed. Please install PyQt4 to use GUI related functions in py_entitymatching.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m/Users/pradap/miniconda3/lib/python3.5/site-packages/py_entitymatching-0.1.0-py3.5.egg/py_entitymatching/labeler/labeler.py\u001b[0m in \u001b[0;36mlabel_table\u001b[0;34m(table, label_column_name, verbose)\u001b[0m\n\u001b[1;32m     64\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 65\u001b[0;31m         \u001b[0;32mfrom\u001b[0m \u001b[0mPyQt4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mQtGui\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     66\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mImportError\u001b[0m: No module named 'PyQt4'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-7-a863790c9d34>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Label the sampled set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m# Specify the name for the label column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabel_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'gold_label'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/Users/pradap/miniconda3/lib/python3.5/site-packages/py_entitymatching-0.1.0-py3.5.egg/py_entitymatching/labeler/labeler.py\u001b[0m in \u001b[0;36mlabel_table\u001b[0;34m(table, label_column_name, verbose)\u001b[0m\n\u001b[1;32m     65\u001b[0m         \u001b[0;32mfrom\u001b[0m \u001b[0mPyQt4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mQtGui\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     66\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m         raise ImportError('PyQt4 is not installed. Please install PyQt4 to use '\n\u001b[0m\u001b[1;32m     68\u001b[0m                       'GUI related functions in py_entitymatching.')\n\u001b[1;32m     69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mImportError\u001b[0m: PyQt4 is not installed. Please install PyQt4 to use GUI related functions in py_entitymatching."
     ]
    }
   ],
   "source": [
    "# Label the sampled set\n",
    "# Specify the name for the label column\n",
    "G = em.label_table(S, 'gold_label')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The user must specify 0 for non-match and 1 for match. Typically, the sampling and the labeling step is done in iterations (till we get sufficient density of matches). Once labeled, the labeled data set will look like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Assume that we have labeled the data and stored it in \n",
    "# labeled_data_demo.csv\n",
    "\n",
    "path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'\n",
    "G = em.read_csv_metadata(path_labeled_data, key='_id', \n",
    "                         fk_ltable='ltable_id', fk_rtable='rtable_id',\n",
    "                         ltable=A, rtable=B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "G.head()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}