{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "plt.style.use('ggplot')\n", "plt.rcParams['figure.figsize'] = 16, 9" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data analytics and machine learning with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# I - Acquiring data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A simple HTTP request" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import requests\n", "\n", "print(requests.get(\"http://example.com\").text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Communicating with APIs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "response = requests.get(\"https://www.googleapis.com/books/v1/volumes\", params={\"q\":\"machine learning\"})\n", "raw_data = response.json()\n", "titles = [item['volumeInfo']['title'] for item in raw_data['items']]\n", "titles" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parsing websites" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import lxml.html\n", "\n", "page = lxml.html.parse(\"http://www.blocket.se/stockholm?q=apple\")\n", "# ^ This is probably illegal. Blocket, please don't sue me!\n", "items_data = []\n", "for el in page.getroot().find_class(\"item_row\"):\n", " links = el.find_class(\"item_link\")\n", " images = el.find_class(\"item_image\")\n", " prices = el.find_class(\"list_price\")\n", " if links and images and prices and prices[0].text:\n", " items_data.append({\"name\": links[0].text,\n", " \"image\": images[0].attrib['src'],\n", " \"price\": int(prices[0].text.split(\":\")[0].replace(\" \", \"\"))})\n", "items_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading local files (CSV/JSON)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas\n", "\n", "df = pandas.read_csv('sample.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Display the DataFrame\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# DataFrame's columns\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Values of a given column\n", "df.Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing the dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Any missing values?\n", "df['Price']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['Description']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fill missing prices by a linear interpolation\n", "df['Description'] = df['Description'].fillna(\"No description is available.\")\n", "df['Price'] = df['Price'].interpolate()\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# II - Exploring data" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "df = pandas.read_csv('sample2.csv')\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# This table has 3 columns: Office, Year, Sales\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# It's really easy to query data with Pandas:\n", "df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# It's also easy to do aggregations...\n", "aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()\n", "aggregated_stockholm_sales" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()\n", "# ... and generate plots\n", "aggregated_stockholm_sales.plot(kind='bar')\n", "aggregated_ny_sales.plot(kind='bar', color='g')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature extraction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import feature_extraction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extracting features from text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "corpus = ['Cats? I love cats!',\n", " 'I love dogs.',\n", " 'I hate cats :(',\n", " 'I love trains',\n", " ]\n", "\n", "tfidf = feature_extraction.text.TfidfVectorizer()\n", "\n", "print(tfidf.fit_transform(corpus).toarray())\n", "print(tfidf.get_feature_names())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dict vectorizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import json\n", "\n", "\n", "data = [json.loads(\"\"\"{\"weight\": 194.0, \"sex\": \"female\", \"student\": true}\"\"\"),\n", " {\"weight\": 60., \"sex\": 'female', \"student\": True},\n", " {\"weight\": 80.1, \"sex\": 'male', \"student\": False},\n", " {\"weight\": 65.3, \"sex\": 'male', \"student\": True},\n", " {\"weight\": 58.5, \"sex\": 'female', \"student\": False}]\n", "\n", "vectorizer = feature_extraction.DictVectorizer(sparse=False)\n", "\n", "vectors = vectorizer.fit_transform(data)\n", "print(vectors)\n", "print(vectorizer.get_feature_names())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pre-processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Scaling" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import preprocessing\n", "\n", "data = [[10., 2345., 0., 2.],\n", " [3., -3490., 0.1, 1.99],\n", " [13., 3903., -0.2, 2.11]]\n", "\n", "preprocessing.normalize(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Dimensionality reduction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import decomposition\n", "\n", "data = [[0.3, 0.2, 0.4, 0.32],\n", " [0.3, 0.5, 1.0, 0.19],\n", " [0.3, 
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine learning models" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification (SVM)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import datasets\n", "from sklearn import svm" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "iris = datasets.load_iris()\n", "\n", "# Keep only the first two features so we can plot the data in 2D\n", "X = iris.data[:, :2]\n", "y = iris.target\n", "\n", "# One color per class: 0 -> 'r', 1 -> 'g', 2 -> 'b'\n", "plt.scatter(X[:, 0], X[:, 1], color=['rgb'[v] for v in y])\n", "\n", "to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])\n", "plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Training the model\n", "clf = svm.SVC(kernel='rbf')\n", "clf.fit(X, y)\n", "\n", "# Doing predictions\n", "print(clf.predict(to_predict))" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Regression (linear regression)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn import linear_model\n", "import matplotlib.pyplot as plt\n", "\n", "# A linear function with some random noise\n", "def f(x):\n", "    return x + np.random.random() * 3.\n", "\n", "X = np.arange(0, 5, 0.5)\n", "X = X.reshape((len(X), 1))\n", "y = list(map(f, X))\n", "\n", "clf = linear_model.LinearRegression()\n", "clf.fit(X, y)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "new_X = np.arange(0.2, 5.2, 0.3)\n", "new_X = new_X.reshape((len(new_X), 1))\n", "new_y = clf.predict(new_X)\n", "\n", "plt.scatter(X, y, color='g', label='Training data')\n", "\n", "plt.plot(new_X, new_y, '.-', label='Predicted')\n", "plt.legend()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Clustering (DBSCAN)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.cluster import DBSCAN\n", "from sklearn.datasets import make_blobs\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Generate sample data\n", "centers = [[1, 1], [-1, -1], [1, -1]]\n", "X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,\n", "                            random_state=0)\n", "plt.scatter(X[:, 0], X[:, 1])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Compute DBSCAN\n", "db = DBSCAN(eps=0.3, min_samples=10).fit(X)\n", "db.labels_" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# One color per cluster; noise points (label -1) end up white ('rgbw'[-1])\n", "plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Cross-validation" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Note: in scikit-learn >= 0.18, cross_validation was replaced by model_selection,\n", "# and 'mean_squared_error' was renamed 'neg_mean_squared_error'.\n", "from sklearn import svm, cross_validation, datasets\n", "\n", "iris = datasets.load_iris()\n", "X, y = iris.data, iris.target\n", "\n", "model = svm.SVC()\n", "print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted'))\n", "print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error'))" ] },
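 { "cell_type": "markdown", "metadata": {}, "source": [ "Besides k-fold cross-validation, a quick sanity check is a single train/test split. A minimal sketch using the same era's API (in scikit-learn >= 0.18, `train_test_split` lives in `sklearn.model_selection`):" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Hold out 25% of the data for testing.\n", "X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n", "    X, y, test_size=0.25, random_state=0)\n", "\n", "model = svm.SVC()\n", "model.fit(X_train, y_train)\n", "print(model.score(X_test, y_test))  # mean accuracy on the held-out set" ] },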
 { "cell_type": "markdown", "metadata": {}, "source": [ "# A more complex machine learning pipeline: \"What's cooking?\"\n", "This is a basic solution I wrote for the Kaggle competition \"What's cooking?\", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.\n", "\n", "You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there." ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from collections import Counter\n", "import json\n", "\n", "import pandas as pd\n", "import scipy.sparse\n", "import sklearn.pipeline\n", "import sklearn.cross_validation\n", "import sklearn.feature_extraction\n", "import sklearn.naive_bayes\n", "# Note: in scikit-learn >= 0.18, use sklearn.model_selection instead of sklearn.cross_validation\n", "\n", "def open_dataset(path):\n", "    with open(path) as file:\n", "        data = json.load(file)\n", "    df = pd.DataFrame(data).set_index('id')\n", "    return df\n", "\n", "df = open_dataset('train.json')\n", "\n", "# Two identical pipelines: one fed with whole-ingredient counts, one with individual-word counts\n", "pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", "pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", "\n", "def map_term_count(ingredients):\n", "    # Count the individual words appearing in a recipe's ingredients\n", "    return Counter(sum((i.split(' ') for i in ingredients), []))\n", "\n", "# hstack returns a COO matrix; convert to CSR so rows can be indexed during cross-validation\n", "X = pipeline.fit_transform(df.ingredients.apply(Counter))\n", "X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))]).tocsr()\n", "y = df.cuisine.values\n", "\n", "model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)\n", "\n", "# Cross-validation\n", "score = sklearn.cross_validation.cross_val_score(model, X, y, cv=2)\n", "print(score)\n", "\n", "# Running on the test dataset\n", "t_df = open_dataset('test.json')\n", "X_test = pipeline.transform(t_df.ingredients.apply(Counter))\n", "X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))]).tocsr()\n", "\n", "model.fit(X, y)\n", "\n", "predictions = model.predict(X_test)\n", "result_df = pd.DataFrame(index=t_df.index)\n", "result_df['cuisine'] = pd.Series(predictions, index=result_df.index)\n", "\n", "result_df['ingredients'] = t_df['ingredients']\n", "result_df" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Thanks for following! I hope you learned a thing or two :-)\n", "\n", "Feel free to ask any questions, or contact me on [kachkach.com](http://www.kachkach.com) / [@halflings](http://github.com/halflings)" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }