{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "plt.style.use('ggplot')\n", "plt.rcParams['figure.figsize'] = 16, 9" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data analytics and machine learning with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# I - Acquiring data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A simple HTTP request" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import requests\n", "\n", "print(requests.get(\"http://example.com\").text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Communicating with APIs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "response = requests.get(\"https://www.googleapis.com/books/v1/volumes\", params={\"q\":\"machine learning\"})\n", "raw_data = response.json()\n", "titles = [item['volumeInfo']['title'] for item in raw_data['items']]\n", "titles" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parsing websites" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import lxml.html\n", "\n", "page = lxml.html.parse(\"http://www.blocket.se/stockholm?q=apple\")\n", "# ^ This is probably illegal. Blocket, please don't sue me!\n", "items_data = []\n", "for el in page.getroot().find_class(\"item_row\"):\n", " links = el.find_class(\"item_link\")\n", " images = el.find_class(\"item_image\")\n", " prices = el.find_class(\"list_price\")\n", " if links and images and prices and prices[0].text:\n", " items_data.append({\"name\": links[0].text,\n", " \"image\": images[0].attrib['src'],\n", " \"price\": int(prices[0].text.split(\":\")[0].replace(\" \", \"\"))})\n", "items_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading local files (CSV/JSON)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas\n", "\n", "df = pandas.read_csv('sample.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Display the DataFrame\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# DataFrame's columns\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Values of a given column\n", "df.Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing the dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Any missing values?\n", "df['Price']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['Description']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fill missing prices by a linear interpolation\n", "df['Description'] = df['Description'].fillna(\"No description is available.\")\n", "df['Price'] = df['Price'].interpolate()\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# II - Exploring data" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "df = pandas.read_csv('sample2.csv')\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# This table has 3 columns: Office, Year, Sales\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# It's really easy to query data with Pandas:\n", "df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# It's also easy to do aggregations...\n", "aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()\n", "aggregated_stockholm_sales" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()\n", "# ... and generate plots\n", "aggregated_stockholm_sales.plot(kind='bar')\n", "aggregated_ny_sales.plot(kind='bar', color='g')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature extraction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import feature_extraction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extracting features from text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "corpus = ['Cats? I love cats!',\n", " 'I love dogs.',\n", " 'I hate cats :(',\n", " 'I love trains',\n", " ]\n", "\n", "tfidf = feature_extraction.text.TfidfVectorizer()\n", "\n", "print(tfidf.fit_transform(corpus).toarray())\n", "print(tfidf.get_feature_names())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dict vectorizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import json\n", "\n", "\n", "data = [json.loads(\"\"\"{\"weight\": 194.0, \"sex\": \"female\", \"student\": true}\"\"\"),\n", " {\"weight\": 60., \"sex\": 'female', \"student\": True},\n", " {\"weight\": 80.1, \"sex\": 'male', \"student\": False},\n", " {\"weight\": 65.3, \"sex\": 'male', \"student\": True},\n", " {\"weight\": 58.5, \"sex\": 'female', \"student\": False}]\n", "\n", "vectorizer = feature_extraction.DictVectorizer(sparse=False)\n", "\n", "vectors = vectorizer.fit_transform(data)\n", "print(vectors)\n", "print(vectorizer.get_feature_names())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pre-processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Scaling" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import preprocessing\n", "\n", "data = [[10., 2345., 0., 2.],\n", " [3., -3490., 0.1, 1.99],\n", " [13., 3903., -0.2, 2.11]]\n", "\n", "preprocessing.normalize(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Dimensionality reduction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import decomposition\n", "\n", "data = [[0.3, 0.2, 0.4, 0.32],\n", " [0.3, 0.5, 1.0, 0.19],\n", " [0.3, 
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine learning models" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification (SVM)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import datasets\n", "from sklearn import svm" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "iris = datasets.load_iris()\n", "\n", "# Keep only the first two features so we can plot the data in 2D\n", "X = iris.data[:, :2]\n", "y = iris.target\n", "\n", "# One color per class: 0 -> 'r', 1 -> 'g', 2 -> 'b'\n", "plt.scatter(X[:, 0], X[:, 1], color=['rgb'[v] for v in y])\n", "\n", "to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])\n", "plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Training the model\n", "clf = svm.SVC(kernel='rbf')\n", "clf.fit(X, y)\n", "\n", "# Doing predictions\n", "print(clf.predict(to_predict))" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Regression (linear regression)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn import linear_model\n", "import matplotlib.pyplot as plt\n", "\n", "# A linear function with some random noise\n", "def f(x):\n", "    return x + np.random.random() * 3.\n", "\n", "X = np.arange(0, 5, 0.5)\n", "X = X.reshape((len(X), 1))\n", "y = list(map(f, X))\n", "\n", "clf = linear_model.LinearRegression()\n", "clf.fit(X, y)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "new_X = np.arange(0.2, 5.2, 0.3)\n", "new_X = new_X.reshape((len(new_X), 1))\n", "new_y = clf.predict(new_X)\n", "\n", "plt.scatter(X, y, color='g', label='Training data')\n", "\n", "plt.plot(new_X, new_y, '.-', label='Predicted')\n", "plt.legend()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Clustering (DBSCAN)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.cluster import DBSCAN\n", "from sklearn.datasets import make_blobs\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Generate sample data\n", "centers = [[1, 1], [-1, -1], [1, -1]]\n", "X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,\n", "                            random_state=0)\n", "plt.scatter(X[:, 0], X[:, 1])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Compute DBSCAN\n", "db = DBSCAN(eps=0.3, min_samples=10).fit(X)\n", "db.labels_" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# One color per cluster; noise points (label -1) end up white ('rgbw'[-1])\n", "plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Cross-validation" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Note: in scikit-learn >= 0.18, cross_validation was replaced by model_selection,\n", "# and 'mean_squared_error' was renamed 'neg_mean_squared_error'.\n", "from sklearn import svm, cross_validation, datasets\n", "\n", "iris = datasets.load_iris()\n", "X, y = iris.data, iris.target\n", "\n", "model = svm.SVC()\n", "print(cross_validation.cross_val_score(model, X, y, scoring='precision_weighted'))\n", "print(cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error'))" ] },
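 { "cell_type": "markdown", "metadata": {}, "source": [ "Besides k-fold cross-validation, a quick sanity check is a single train/test split. A minimal sketch using the same era's API (in scikit-learn >= 0.18, `train_test_split` lives in `sklearn.model_selection`):" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Hold out 25% of the data for testing.\n", "X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n", "    X, y, test_size=0.25, random_state=0)\n", "\n", "model = svm.SVC()\n", "model.fit(X_train, y_train)\n", "print(model.score(X_test, y_test))  # mean accuracy on the held-out set" ] },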
 { "cell_type": "markdown", "metadata": {}, "source": [ "# A more complex machine learning pipeline: \"What's cooking?\"\n", "This is a basic solution I wrote for the Kaggle competition \"What's cooking?\", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.\n", "\n", "You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there." ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from collections import Counter\n", "import json\n", "\n", "import pandas as pd\n", "import scipy.sparse\n", "import sklearn.pipeline\n", "import sklearn.cross_validation\n", "import sklearn.feature_extraction\n", "import sklearn.naive_bayes\n", "# Note: in scikit-learn >= 0.18, use sklearn.model_selection instead of sklearn.cross_validation\n", "\n", "def open_dataset(path):\n", "    with open(path) as file:\n", "        data = json.load(file)\n", "    df = pd.DataFrame(data).set_index('id')\n", "    return df\n", "\n", "df = open_dataset('train.json')\n", "\n", "# Two identical pipelines: one fed with whole-ingredient counts, one with individual-word counts\n", "pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", "pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(), sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))\n", "\n", "def map_term_count(ingredients):\n", "    # Count the individual words appearing in a recipe's ingredients\n", "    return Counter(sum((i.split(' ') for i in ingredients), []))\n", "\n", "# hstack returns a COO matrix; convert to CSR so rows can be indexed during cross-validation\n", "X = pipeline.fit_transform(df.ingredients.apply(Counter))\n", "X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))]).tocsr()\n", "y = df.cuisine.values\n", "\n", "model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)\n", "\n", "# Cross-validation\n", "score = sklearn.cross_validation.cross_val_score(model, X, y, cv=2)\n", "print(score)\n", "\n", "# Running on the test dataset\n", "t_df = open_dataset('test.json')\n", "X_test = pipeline.transform(t_df.ingredients.apply(Counter))\n", "X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))]).tocsr()\n", "\n", "model.fit(X, y)\n", "\n", "predictions = model.predict(X_test)\n", "result_df = pd.DataFrame(index=t_df.index)\n", "result_df['cuisine'] = pd.Series(predictions, index=result_df.index)\n", "\n", "result_df['ingredients'] = t_df['ingredients']\n", "result_df" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Thanks for following! I hope you learned a thing or two :-)\n", "\n", "Feel free to ask any questions, or contact me on [kachkach.com](http://www.kachkach.com) / [@halflings](http://github.com/halflings)" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }