{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "import inspect\n", "import os\n", "import re\n", "import subprocess\n", "import time\n", "\n", "from bs4 import BeautifulSoup\n", "import numpy as np\n", "import pandas as pd\n", "import ray\n", "import requests\n", "from selenium import webdriver\n", "from selenium.webdriver.common.keys import Keys" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2019-05-31 00:12:44,231\tWARNING worker.py:1341 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.\n", "2019-05-31 00:12:44,232\tINFO node.py:497 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs.\n", "2019-05-31 00:12:44,340\tINFO services.py:409 -- Waiting for redis server at 127.0.0.1:46401 to respond...\n", "2019-05-31 00:12:44,448\tINFO services.py:409 -- Waiting for redis server at 127.0.0.1:23890 to respond...\n", "2019-05-31 00:12:44,450\tINFO services.py:806 -- Starting Redis shard with 6.58 GB max memory.\n", "2019-05-31 00:12:44,460\tINFO node.py:511 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs.\n", "2019-05-31 00:12:44,462\tINFO services.py:1441 -- Starting the Plasma object store with 9.87 GB memory using /dev/shm.\n" ] }, { "data": { "text/plain": [ "{'node_ip_address': '172.31.31.63',\n", " 'redis_address': '172.31.31.63:46401',\n", " 'object_store_address': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/plasma_store',\n", " 'raylet_socket_name': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/raylet',\n", " 'webui_url': None,\n", " 'session_dir': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160'}" ] }, 
"execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ray.init()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "# Download code\n",
 "We go through the top Python Kaggle entries and download the code used." ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
 "# Run Chrome without a visible window.\n",
 "chrome_options = webdriver.chrome.options.Options()\n",
 "chrome_options.add_argument(\"--headless\")" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "# Collect the links to the most-upvoted Python kernels from the listing page.\n",
 "driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)\n",
 "try:\n",
 "    driver.get(\"https://www.kaggle.com/kernels?sortBy=voteCount&language=Python\")\n",
 "    time.sleep(1)\n",
 "\n",
 "    elem = driver.find_element_by_tag_name(\"body\")\n",
 "\n",
 "    # number per page varies depending on browser driver\n",
 "    no_of_pagedowns = 300\n",
 "\n",
 "    # Page down repeatedly, pausing a second each time, before reading the page source.\n",
 "    for _ in range(no_of_pagedowns):\n",
 "        elem.send_keys(Keys.PAGE_DOWN)\n",
 "        time.sleep(1)\n",
 "\n",
 "    soup = BeautifulSoup(driver.page_source, \"html.parser\")\n",
 "finally:\n",
 "    # Always shut the browser down, even if scraping fails part-way through.\n",
 "    driver.quit()\n",
 "\n",
 "# hrefs on the page are site-relative, so prefix them with the host.\n",
 "kernel_links = ['https://www.kaggle.com' + a['href']\n",
 "                for a in soup.find_all(\"a\", class_=\"block-link__anchor\")]" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2800\n" ] }, { "data": { "text/plain": [ "['https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python',\n", " 'https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python',\n", " 'https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard',\n", " 'https://www.kaggle.com/kanncaa1/data-sciencetutorial-for-beginners',\n", " 'https://www.kaggle.com/dansbecker/how-models-work']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "print(len(kernel_links))\n",
 "kernel_links[:5]" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs":
[], "source": [
 "@ray.remote\n",
 "def get_download_link(kernel_link):\n",
 "    \"\"\"Find the code download link for a single kernel page.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    kernel_link: str\n",
 "        URL of the kernel page.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    str or None\n",
 "        The scriptcontent download URL, or None if anything went wrong\n",
 "        (the error is printed).\n",
 "    \"\"\"\n",
 "    driver = None\n",
 "    try:\n",
 "        driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)\n",
 "        driver.get(kernel_link)\n",
 "        innerHTML = driver.execute_script(\"return document.body.innerHTML\")\n",
 "        soup = BeautifulSoup(innerHTML, 'html.parser')\n",
 "        # NOTE(review): this class string looks auto-generated by the site; confirm it still matches.\n",
 "        link = soup.find_all(\"a\", class_=\"sc-hwNDZK gNPGxN\")[0]['href']\n",
 "        # Only the last path component of the href is reused in the download URL.\n",
 "        return \"https://www.kaggle.com/kernels/scriptcontent/{}/download\".format(link.split(\"/\")[-1])\n",
 "    except Exception as e:\n",
 "        print(\"Error: {}\".format(e))\n",
 "        return None\n",
 "    finally:\n",
 "        # Close the browser on every path so failed pages do not leak Chrome processes.\n",
 "        if driver is not None:\n",
 "            driver.quit()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Launch one Ray task per kernel page, then block until all of them have finished.\n",
 "link_futures = [get_download_link.remote(link) for link in kernel_links]\n",
 "# Failed pages come back as None and are dropped.\n",
 "code_links = [link for link in ray.get(link_futures) if link is not None]\n",
 "print(len(code_links))" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2795\n" ] } ], "source": [
 "code_links = [link for link in code_links if link is not None]\n",
 "print(len(code_links))" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [
 "# Save the download links, one per line.\n",
 "with open(\"code_links.txt\", \"w\") as links_file:\n",
 "    links_file.writelines(\"{}\\n\".format(link) for link in code_links)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [
 "@ray.remote\n",
 "def download_files(link):\n",
 "    \"\"\"Download the data at the link.\n",
 "\n",
 "    Runs wget to completion, so the Ray task only finishes once the\n",
 "    download attempt has finished. A non-zero wget exit status is ignored.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    link: str\n",
 "        Link to the data.\n",
 "    \"\"\"\n",
 "    # Argument list (no shell), so characters in the link are never interpreted by a shell.\n",
 "    subprocess.run([\"wget\", \"--content-disposition\", link, \"-P\", \"data\"])\n",
 "\n",
 "# We block on downloading files\n",
"ray.get([download_files.remote(code_link) for code_link in code_links])" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [
 "# Clean up directory so that only the code remains.\n",
 "for filename in os.listdir(\"data\"):\n",
 "    path = os.path.join(\"data\", filename)\n",
 "\n",
 "    name_parts = filename.split(\".\")\n",
 "    if len(name_parts) == 2:\n",
 "        # \"name.ext\" has no third part; use \"0\" so every kept file is renamed the same way.\n",
 "        name_parts.append(\"0\")\n",
 "\n",
 "    if len(name_parts) >= 3 and name_parts[1] in ['ipynb', 'py']:\n",
 "        # \"name.ext.N\" becomes \"nameN.ext\" (presumably N is wget's duplicate-name suffix; confirm).\n",
 "        new_name = name_parts[0] + name_parts[2] + \".\" + name_parts[1]\n",
 "        # NOTE(review): .py files get an 'r' prefix; the reason is not visible in this notebook.\n",
 "        new_name = 'r' + new_name if name_parts[1] == 'py' else new_name\n",
 "        os.rename(path, os.path.join(\"data\", new_name))\n",
 "    else:\n",
 "        # Everything else is not code: this covers .DS_Store and names without any dot.\n",
 "        os.remove(path)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "# Mine code\n",
 "We go through each Kaggle entry and mine it for the relevant pandas functions." ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "# Get all the possible functions from base pandas class, dataframes, and series.\n",
 "search_tokens = set()\n",
 "\n",
 "objects = [pd, pd.DataFrame, pd.Series]\n",
 "indexers = ['iloc', 'iat', 'ix', 'loc', 'at']\n",
 "for obj in objects:\n",
 "    for token in dir(obj):\n",
 "        # Skip private (single-underscore) names; double-underscore names are kept.\n",
 "        if token[0] == \"_\" and token[:2] != \"__\":\n",
 "            continue\n",
 "        elif inspect.isfunction(getattr(obj, token)):\n",
 "            # For functions, we search for \".function_name(\"\n",
 "            search_tokens.add(r\"\\.{}\\(\".format(token))\n",
 "        elif token in indexers:\n",
 "            # For indexing functions, we search for \".indexing_function[\"\n",
 "            search_tokens.add(r\"\\.{}\\[\".format(token))\n",
 "        else:\n",
 "            # For properties, we add only a period in front\n",
 "            search_tokens.add(r\"\\.{}\".format(token))\n",
 "\n",
 "# Store the tokens in the Ray object store once; the parsing tasks receive this reference.\n",
 "search_tokens = ray.put(search_tokens)" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [],
"source": [
 "@ray.remote\n",
 "def parse_script(counter, script_name, search_tokens):\n",
 "    \"\"\"Parse the script and collect every match of the search tokens.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    counter: Ray actor\n",
 "        Not used by this function; kept so existing calls keep working.\n",
 "    script_name: str\n",
 "        File name of the script.\n",
 "    search_tokens: collection of str\n",
 "        Regex fragments, one per token. They are passed as strings and\n",
 "        compiled here because ray cannot pickle compiled regex expressions.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    list of str\n",
 "        Every match found in the script, line by line. Empty when the\n",
 "        script has no lines or no matches.\n",
 "    \"\"\"\n",
 "    with open(script_name) as script_file:\n",
 "        code = script_file.read().splitlines()\n",
 "\n",
 "    # Longest fragments first: regex alternation takes the first alternative that matches,\n",
 "    # so this keeps a short token from shadowing a longer one and makes the result\n",
 "    # independent of set iteration order.\n",
 "    ordered_tokens = sorted(search_tokens, key=lambda token: (-len(token), token))\n",
 "    pattern = re.compile(\"|\".join(ordered_tokens))\n",
 "\n",
 "    # Search every line of the script, not just the first one.\n",
 "    found_tokens = []\n",
 "    for line in code:\n",
 "        found_tokens.extend(pattern.findall(line))\n",
 "    return found_tokens\n",
 "\n",
 "@ray.remote\n",
 "class TokenCounter(object):\n",
 "    \"\"\"Ray actor class that tracks the times each token has appeared.\n",
 "    \"\"\"\n",
 "    def __init__(self):\n",
 "        # Maps token text to the number of times it has been counted.\n",
 "        self.token_counts = defaultdict(int)\n",
 "\n",
 "    def count(self, found_token):\n",
 "        \"\"\"Adds to the count of each token in list.\n",
 "\n",
 "        Parameters\n",
 "        ----------\n",
 "        found_token: list\n",
 "            List of tokens. An empty list or None adds nothing.\n",
 "        \"\"\"\n",
 "        for func in found_token or []:\n",
 "            self.token_counts[func] += 1\n",
 "\n",
 "    def get_counts(self):\n",
 "        \"\"\"Return the token counts as dict.\"\"\"\n",
 "        return dict(self.token_counts)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [], "source": [
 "counter = TokenCounter.remote()\n",
 "\n",
 "# Block until every script has been parsed, then pass each script's matches to the actor.\n",
 "results = ray.get([parse_script.remote(counter, \"data/{}\".format(filename), search_tokens)\n",
 "                   for filename in os.listdir('data')])\n",
 "# Wait for all count calls to finish before reading the totals back.\n",
 "ray.get([counter.count.remote(result) for result in results])\n",
 "method_counts = ray.get(counter.get_counts.remote())" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
functioncount
258searchsorted2
259ExcelWriter2
260read_html2
261read_clipboard2
262le2
\n", "
" ], "text/plain": [ " function count\n", "258 searchsorted 2\n", "259 ExcelWriter 2\n", "260 read_html 2\n", "261 read_clipboard 2\n", "262 le 2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# One row per matched token, most frequent first.\n",
 "mFreq = pd.DataFrame(list(method_counts.items()), columns=[\"function\", \"count\"]).sort_values('count', ascending=False)\n",
 "# Matched text looks like \".name(\", \".name[\" or \".name\"; keep only the bare name.\n",
 "mFreq[\"function\"] = mFreq[\"function\"].str.lstrip(\".\").str.rstrip(\"([\")\n",
 "mFreq.to_csv(\"results.csv\")\n",
 "mFreq.tail()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "224.391px" }, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }