{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "%store -r page" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "page_name = 'The_Camp_of_the_Saints'\n", "from external.wikipedia import WikipediaDV, WikipediaAPI\n", "wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))\n", "page = wikipedia_dv.get_page(page_name)\n", "page.to_frame('value')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.4 Editions per page with Wikiwho" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWho\n", "import pandas as pd\n", "wikiwho = WikiWho(lng='en')\n", "editions = wikiwho.dv.editions(page.page_id)\n", "editions.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = editions\n", "df['year_month'] = pd.to_datetime(df['year_month'])\n", "\n", "# Group the data by year month and page (drop the editor information)\n", "df.drop('editor_id', axis=1).groupby(['year_month','page_id']).sum()\n", "\n", "# add columns with the total actions\n", "df = df.join(pd.DataFrame(\n", " df.loc[:,'adds':'adds_stopword_count'].values +\\\n", " df.loc[:,'dels':'dels_stopword_count'].values +\\\n", " df.loc[:,'reins':'reins_stopword_count'].values, \n", " index=df.index, \n", " columns=['actions', \n", " 'actions_surv_48h', \n", " 'actions_persistent', \n", " 'actions_stopword_count']\n", "))\n", "\n", "# Visualization\n", "from visualization.editions_listener import DFListener\n", "from ipywidgets import interact\n", "listener = DFListener(df)\n", "# interact(listener.editions_per_month, \n", "# begin=df.year_month,\n", "# end=df.year_month.sort_values(ascending=False),\n", "# actions=['All Actions', 
'Additions', 'Reinsertions', 'Deletions'])\n", "actions = df.loc[:,'actions':'actions_stopword_count'].columns.append(\n", " df.loc[:,'adds':'reins_stopword_count'].columns)\n", "interact(listener.editions_per_month, \n", " begin=df.year_month,\n", " end=df.year_month.sort_values(ascending=False),\n", " granularity=['Yearly', 'Monthly'],\n", " first_action=actions,\n", " second_action=actions)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conflict Score of Editors of a Page" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "editors = editions['editor_id'].unique()\n", "editors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV\n", "from metrics.Conflict_Score import conflictScore_token_list\n", "\n", "api = WikiWhoAPI(lng='en')\n", "wikiwho_dv = WikiWhoDV(api)\n", "revisions = wikiwho_dv.rev_ids_of_article(page.page_id)\n", "tokens = wikiwho_dv.all_content(page.page_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from metrics.Conflict_Score import conflictScore_token_list\n", "conflictScore_token_list(tokens, 162969, revisions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']\n", "tokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tokens[tokens['o_editor']=='25450560']\n", "\n", "rev_ins = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={'rev_time': 'in_rev_time',\n", " 'rev_id': 'in',\n", " 'o_editor': 'in_editor'}\n", ")\n", "\n", "rev_outs = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={\n", " 
'rev_time': 'out_rev_time', \n", " 'rev_id': 'out',\n", " 'o_editor': 'out_editor'\n", " }\n", ")\n", "\n", "fulltokens = pd.merge(tokens, rev_ins, how='left', on='in')\n", "fulltokens = pd.merge(fulltokens, rev_outs, how='left', on='out')\n", "\n", "fulltokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.set_index('token_id')\n", "\n", "possible_conflicts = tokens.groupby('token_id').size() > 4\n", "possible_conflicts[possible_conflicts].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tmp = tokens[tokens['token_id'].isin(possible_conflicts[possible_conflicts].index)]\n", "\n", "tmp.groupby('token_id').shift(1)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for name, tgroup in tmp.groupby('token_id'):\n", " pass\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tgroup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tgroup.shift(-1).reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rgrouprn = tgroup.rename(\n", " columns={'in': 'rev_id_in', 'out':'rev_id_out'}).reset_index()\n", "rgrouprn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.wide_to_long(rgrouprn,\n", " 'rev_id', \n", " 'index',\n", " 'test', sep='_', suffix='.+')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV\n", "from metrics.Conflict_Score import conflictScore_token_list\n", "\n", "api = WikiWhoAPI(lng='en')\n", "wikiwho_dv = WikiWhoDV(api)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = wikiwho_dv.rev_ids_of_article('Evolution')" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = revisions.rename(columns={'o_editor': 'editor'})\n", "revisions['Evolution'] = pd.to_datetime(revisions['rev_time'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#revisions['rev_time__'] = pd.to_datetime(dups_sorted['rev_time'],format='%Y-%m-%dT%H:%M:%SZ')\n", "\n", "#revisions['rev_time__'] = dups_sorted['rev_time'].str.,format='%Y-%m-%dT%H:%M:%SZ')\n", "\n", "\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#rev_id = revisions[revisions['rev_time'] < pd.Timestamp(2016,11,1)].sort_values(\n", "# 'rev_time', ascending=False).iloc[0,:]['rev_id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spec_revision = wikiwho.dv.specific_rev_content_by_article_title(the_page, rev_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spec_tokens = pd.merge(\n", " spec_revision.drop(columns=['rev_id', 'rev_time', 'rev_editor']),\n", " tokens[['token_id', 'in', 'out']], \n", " how='left', on='token_id')\n", "spec_tokens.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tokens_alt = tokens.copy()\n", "#tokens_alt.loc[tokens_alt['in'] == -1,'in'] = tokens_alt.loc[tokens_alt['in'] == -1,'o_rev_id']\n", "#tokens_alt.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Starts here" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWho\n", "the_page = 'Chicago' # \n", "#the_page = page.page_id\n", "wikiwho = WikiWho(lng='en')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = wikiwho.dv.rev_ids_of_article(the_page)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "revisions = revisions.rename(columns={'o_editor': 'editor'})\n", "revisions['rev_time'] = pd.to_datetime(revisions['rev_time'])\n", "revisions.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens = wikiwho.dv.all_content(the_page)\n", "tokens.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']\n", "the_tokens = tokens #spec_tokens\n", "dups = the_tokens[the_tokens.duplicated(subset=['token_id'], keep=False)]\n", "#dups = the_tokens#[the_tokens['in'] != -1]\n", "dups.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stop_words = open('data/stopword_list.txt', 'r').read().split()\n", "dups = dups[~dups['token'].isin(stop_words)]\n", "dups.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_long = pd.wide_to_long(\n", " dups.rename(columns={'in': 'rev_id_in',\n", " 'out':'rev_id_out'}).reset_index(),\n", " 'rev_id','index','action', sep='_', suffix='.+'\n", ").reset_index().drop(columns='index').sort_values('token_id')\n", "dups_long.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_merged = pd.merge(dups_long, revisions[['rev_time', 'rev_id', 'editor']], \n", " how='left', on='rev_id')\n", "dups_sorted = dups_merged.sort_values(['token_id', 'rev_time'])\n", "\n", "\n", "dups_sorted['time_diff'] = dups_sorted['rev_time'] - dups_sorted.shift(2)['rev_time']\n", "to_remove = ((dups_sorted['o_rev_id'] == dups_sorted['rev_id']) | \n", " (dups_sorted.shift(1)['o_rev_id'] == dups_sorted.shift(1)['rev_id']))\n", "dups_sorted.loc[to_remove,'time_diff'] = np.nan\n", "\n", "# for testing (the bottom line is equivalent to the above 3 but slow)\n", 
"#dups_sorted['time_diff2'] = dups_sorted.groupby('token_id').apply(lambda group: group['rev_time'] - group.shift(2)['rev_time']).values\n", "#(dups_sorted['time_diff'].fillna(-1) == dups_sorted['time_diff2'].fillna(-1)).all()\n", "\n", "dups_sorted.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated = dups_sorted\n", "#dups_dated = dups_sorted[dups_sorted['rev_time'] < pd.Timestamp(2016,10,10)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# removes the last out\n", "#dups_dated = dups_dated[dups_dated['rev_id'] != -1]\n", "#dups_not_minus.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conflicts = ((dups_dated['token_id'] == dups_dated.shift(1)['token_id']) &\n", " (dups_dated['token_id'] == dups_dated.shift(2)['token_id']) &\n", " (dups_dated['editor'] != dups_dated.shift(1)['editor']) &\n", " (dups_dated['editor'] == dups_dated.shift(2)['editor']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated[conflicts].shape\n", "\n", "#dups_dated[dups_dated['token_id'] == 1760]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "c_t = 1 / (\n", " np.log(\n", " dups_dated.loc[conflicts,['token_id','time_diff']].groupby(\n", " 'token_id').sum().astype('timedelta64[s]') + 2\n", " ) / np.log(3600))\n", "\n", "c_t.sum()# / dups_not_minus.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "c_t = np.log(3600) / (\n", " np.log(\n", " dups_dated.loc[conflicts,['token_id','time_diff']].groupby(\n", " 'token_id').sum().astype('timedelta64[s]') + 2\n", " ))\n", "\n", "c_t.sum()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated['conflict'] = 0\n", "\n", 
"dups_dated.loc[conflicts, 'conflict'] = np.log(3600) / np.log(\n", " dups_dated.loc[conflicts,'time_diff'].astype('timedelta64[s]')+2)\n", "\n", "# editor = '25450560'\n", "# actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])\n", "\n", "# dups_dated.loc[conflicts & (dups_dated['editor'] == editor),'conflict'].sum() / actions\n", "\n", "dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated[dups_dated['time_diff'].notnull()])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated['rev_id'] == dups_dated['o_rev_id'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "confs_ed = dups_dated.loc[conflicts, ['editor', 'conflict']].groupby('editor').sum()\n", "\n", "actions = dups_dated.loc[dups_dated['time_diff'].notnull(),['editor','action']].groupby('editor').count()\n", "\n", "joined = confs_ed.join(actions)\n", "\n", "joined['conflict_score'] = joined['conflict'] / joined['action']\n", "\n", "joined.sort_values(\n", " 'conflict_score', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "editor = '25450560'\n", "conflictse = conflicts & (dups_dated['editor'] == editor)\n", "actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])\n", "\n", "c_t = 1 / (\n", " np.log(\n", " dups_dated.loc[conflictse,['token_id','time_diff']].groupby(\n", " 
'token_id').sum().astype('timedelta64[s]') + 2\n", " ) / np.log(3600))\n", "\n", "c_t.sum() / actions # / dups_not_minus.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "c_t = dups_dated.loc[conflicts,:].groupby('token_id').size()\n", "\n", "c_t.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conflicts = (\n", " (dups_sorted['token_id'] == dups_sorted.shift(1)['token_id']) &\n", " (dups_sorted['token_id'] == dups_sorted.shift(2)['token_id']) &\n", " (dups_sorted['editor'] != dups_sorted.shift(1)['editor']) &\n", " (dups_sorted['editor'] == dups_sorted.shift(2)['editor']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rev_ins = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={'rev_time': 'in_rev_time',\n", " 'rev_id': 'in',\n", " 'o_editor': 'in_editor'}\n", ")\n", "\n", "rev_outs = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={\n", " 'rev_time': 'out_rev_time', \n", " 'rev_id': 'out',\n", " 'o_editor': 'out_editor'\n", " }\n", ")\n", "\n", "fulltokens = pd.merge(tokens, rev_ins, how='left', on='in')\n", "fulltokens = pd.merge(fulltokens, rev_outs, how='left', on='out')\n", "\n", "fulltokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", " return ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor'])).sum()\n", "\n", "res = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x)) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", " conflicts = ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor']))\n", " \n", " return tkn_group.loc[conflicts, 'time_diff'].sum()\n", "\n", "res2 = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#from datetime import timedelta\n", "#res2[res2 > timedelta(0)]\n", "\n", "res2\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", "\n", " return ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor'])).sum()\n", "\n", "dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# (dups_sorted['action'] == dups_sorted.shift(1)['action']).sum()\n", "\n", "\n", "\n", "# for tkn, tkn_group in dups_sorted.groupby('token_id'):\n", "# pass\n", "\n", "\n", "dups_sorted[dups_sorted['token_id'] == 7162]" ] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }