{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import sqlite3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['content-prefs.sqlite',\n",
       " 'places.sqlite',\n",
       " 'kinto.sqlite',\n",
       " 'permissions.sqlite',\n",
       " 'formhistory.sqlite',\n",
       " 'storage-sync.sqlite',\n",
       " 'favicons.sqlite',\n",
       " 'cookies.sqlite',\n",
       " 'storage.sqlite',\n",
       " 'webappsstore.sqlite']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "firefox_profile_dir = '/home/bird/.mozilla/firefox/old_profiles/iadzfbcv.default/'\n",
    "[x for x in os.listdir(firefox_profile_dir) if x.endswith('sqlite')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "storage_file = '{}/webappsstore.sqlite'.format(firefox_profile_dir)\n",
    "storage_db = sqlite3.connect(storage_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('webappsstore2',)]\n"
     ]
    }
   ],
   "source": [
    "# %load '/home/bird/Documents/tracking technologies/notebooks/get_sqlite_tables.py'\n",
    "def list_tables_in_db(db):\n",
    "    print(db.cursor().execute(\"SELECT name FROM sqlite_master WHERE type='table';\").fetchall())\n",
    "\n",
    "list_tables_in_db(storage_db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "storage_df = pd.read_sql('SELECT * FROM webappsstore2', storage_db)\n",
    "#storage_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "storage_df['origin'] = storage_df.originKey.apply(lambda x: x[::-1].split(':.')[1])\n",
    "#storage_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "cookies_file = '{}/cookies.sqlite'.format(firefox_profile_dir)\n",
    "cookies_db = sqlite3.connect(cookies_file)\n",
    "cookied_df = pd.read_sql('SELECT * FROM moz_cookies', cookies_db)\n",
    "shared_values = []\n",
    "for v in cookied_df.value.unique():\n",
    "    matches = cookied_df[cookied_df.value.str.contains(v, regex=False)]\n",
    "    if len(matches.baseDomain.unique()) > 5:\n",
    "        shared_values.append(v)\n",
    "potential_ids = [x for x in shared_values if(len(x) > 10) & ('com' not in x)] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We have 25 potential ids. Things like:\n"
     ]
    }
   ],
   "source": [
    "# From cookie table\n",
    "print('We have', len(potential_ids), 'potential ids. Things like:')\n",
    "# potential_ids[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "67936421072632709762729202117726060613 found in local storage\n",
      "7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc found in local storage\n"
     ]
    }
   ],
   "source": [
    "for potential_id in potential_ids:\n",
    "    if len(storage_df[storage_df.value.str.contains(potential_id)]) > 0:\n",
    "        print(potential_id, 'found in local storage')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>origin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>291</th>\n",
       "      <td>{\"email\":null,\"timeIncId\":\"53fdd635-5007-4f53-...</td>\n",
       "      <td>people.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 value      origin\n",
       "291  {\"email\":null,\"timeIncId\":\"53fdd635-5007-4f53-...  people.com"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "repeated_id = '67936421072632709762729202117726060613'\n",
    "storage_df[storage_df.value.str.contains(repeated_id)][['value','origin']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>origin</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc</td>\n",
       "      <td>www.latimes.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227</th>\n",
       "      <td>7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc</td>\n",
       "      <td>www.merriam-webster.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229</th>\n",
       "      <td>v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7...</td>\n",
       "      <td>www.merriam-webster.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>232</th>\n",
       "      <td>v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7...</td>\n",
       "      <td>tpc.googlesyndication.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>236</th>\n",
       "      <td>7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc</td>\n",
       "      <td>tpc.googlesyndication.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>357</th>\n",
       "      <td>v2_9a6c86f52b3f9239067936a17472df9f_7620423c-7...</td>\n",
       "      <td>www.huffingtonpost.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>359</th>\n",
       "      <td>7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc</td>\n",
       "      <td>www.huffingtonpost.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>407</th>\n",
       "      <td>v2_66138b9f6bd0ec8912d19cb714efd912_7620423c-7...</td>\n",
       "      <td>www.latimes.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>984</th>\n",
       "      <td>7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc</td>\n",
       "      <td>www.bloomberg.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>989</th>\n",
       "      <td>v2_96ea2f0533c59e2312c1f1112ced8f46_7620423c-7...</td>\n",
       "      <td>www.bloomberg.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 value  \\\n",
       "27    7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc   \n",
       "227   7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc   \n",
       "229  v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7...   \n",
       "232  v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7...   \n",
       "236   7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc   \n",
       "357  v2_9a6c86f52b3f9239067936a17472df9f_7620423c-7...   \n",
       "359   7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc   \n",
       "407  v2_66138b9f6bd0ec8912d19cb714efd912_7620423c-7...   \n",
       "984   7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc   \n",
       "989  v2_96ea2f0533c59e2312c1f1112ced8f46_7620423c-7...   \n",
       "\n",
       "                        origin  \n",
       "27             www.latimes.com  \n",
       "227    www.merriam-webster.com  \n",
       "229    www.merriam-webster.com  \n",
       "232  tpc.googlesyndication.com  \n",
       "236  tpc.googlesyndication.com  \n",
       "357     www.huffingtonpost.com  \n",
       "359     www.huffingtonpost.com  \n",
       "407            www.latimes.com  \n",
       "984          www.bloomberg.com  \n",
       "989          www.bloomberg.com  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "repeated_id = '7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc'\n",
    "storage_df[storage_df.value.str.contains(repeated_id)][['value', 'origin']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see this id being shared across multiple origins in local storage as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}