{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "rF2trPuyzm9C" }, "source": [ "# In-Class Basics\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ipcsUFDUzm9C" }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": { "id": "MCJe_ITJzm9G" }, "source": [ "**Linear Regression**\n", "\n", "The goal of this week's exercise is to explore a simple linear regression problem based on Portuguese white wine.\n", "\n", "The dataset is based on \n", "P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. **Modeling wine preferences by data mining from physicochemical properties**. Published in Decision Support Systems, Elsevier, 47(4):547-553, 2009. \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NopU99AT9G7s", "outputId": "d7e8848e-b9c0-4eb4-8f18-5acda9d8c343", "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-05-10 08:16:34-- https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:17:07-- (try: 2) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:17:41-- (try: 3) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... 
failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:18:16-- (try: 4) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:18:52-- (try: 5) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:19:28-- (try: 6) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:20:06-- (try: 7) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:20:45-- (try: 8) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:21:25-- (try: 9) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:22:06-- (try:10) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... 
failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:22:48-- (try:11) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:23:30-- (try:12) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:24:12-- (try:13) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:24:54-- (try:14) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:25:36-- (try:15) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:26:18-- (try:16) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:27:00-- (try:17) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... 
failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:27:42-- (try:18) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:28:24-- (try:19) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Retrying.\n", "\n", "--2021-05-10 08:29:06-- (try:20) https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... failed: Connection timed out.\n", "Giving up.\n", "\n" ] } ], "source": [ "# The code snippet below is responsible for downloading the dataset to\n", "# Google Colab. You can directly download the file using the link\n", "# if you work with a local anaconda setup\n", "\n", "# Temporarily replaced link as the ML dataset archive seems to be down\n", "#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv\n", "!wget https://raw.githubusercontent.com/zygmuntz/wine-quality/master/winequality/winequality-white.csv" ] }, { "cell_type": "markdown", "metadata": { "id": "zEiZ19s5zm9G" }, "source": [ "**Before we start**\n", "\n", "The downloaded file contains data on 4898 wines. For each wine 11 features are recorded (columns 0 to 10). The final column contains the quality of the wine. This is what we want to predict.\n", "\n", "List of columns/features: \n", "0. fixed acidity\n", "1. volatile acidity\n", "2. citric acid\n", "3. residual sugar\n", "4. chlorides\n", "5. free sulfur dioxide\n", "6. total sulfur dioxide\n", "7. density\n", "8. pH\n", "9. sulphates\n", "10. alcohol\n", "11. 
quality\n", "\n", "\n", "\n", "[file]: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5ONqeI5Uzm9H", "outputId": "d31ba8d4-cf0a-4f25-8a93-9091c0dd041a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('data:', (4898, 12))\n", "First example:\n", "('Features:', array([7.600e+00, 3.800e-01, 2.800e-01, 4.200e+00, 2.900e-02, 7.000e+00,\n", " 1.120e+02, 9.906e-01, 3.000e+00, 4.100e-01, 1.260e+01]))\n", "('Quality:', 6.0)\n" ] } ], "source": [ "# load all examples from the file\n", "data = np.genfromtxt('winequality-white.csv',delimiter=\";\",skip_header=1)\n", "\n", "print(\"data:\", data.shape)\n", "\n", "# Prepare for proper training\n", "np.random.shuffle(data) # randomly sort examples\n", "\n", "# take the first 3000 examples for training\n", "# (remember array slicing from last week)\n", "X_train = data[:3000,:11] # all features except last column\n", "y_train = data[:3000,11] # quality column\n", "\n", "# and the remaining examples for testing\n", "X_test = data[3000:,:11] # all features except last column\n", "y_test = data[3000:,11] # quality column\n", "\n", "print(\"First example:\")\n", "print(\"Features:\", X_train[0])\n", "print(\"Quality:\", y_train[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "jiwnyNHpzm9L" }, "source": [ "# Homework\n", "\n", "1. First we want to understand the data better. Plot (`plt.hist`) the distribution of each of the features for the training data as well as the 2D distribution (either `plt.scatter` or `plt.hist2d`) of each feature versus quality. Also calculate the correlation coefficient (`np.corrcoef`) for each feature with quality. Which feature by itself seems most\n", " predictive for the quality?\n", "\n", "2. Calculate the linear regression weights as derived in the lecture. 
Numpy provides functions for matrix multiplication (`np.matmul`), matrix transposition (`.T`) and matrix inversion (`np.linalg.inv`).\n", "\n", "3. Use the weights to predict the quality for the test dataset. How does your predicted quality compare with the true quality of the test data? Calculate the correlation coefficient between predicted and true quality and draw the scatter plot. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MzzCP2ST898a" }, "outputs": [], "source": [ "x = np.random.uniform(size=(3,4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MlbmmHoA9BJQ", "outputId": "98e65963-3173-46c7-f47b-6b9a6714c187" }, "outputs": [ { "data": { "text/plain": [ "array([[0.27061972, 0.85093187, 0.06038869, 0.6430975 ],\n", " [0.05802941, 0.1492127 , 0.93073299, 0.70555297],\n", " [0.4806267 , 0.27201085, 0.75607278, 0.88637951]])" ] }, "execution_count": 27, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MqRyPzzN-ar0" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HiYXwCle9Fow", "outputId": "949146bf-7184-488f-a347-4af589917bf6" }, "outputs": [ { "data": { "text/plain": [ "0.14921269768865764" ] }, "execution_count": 28, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "x[1,1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0M56hD2R9VYo", "outputId": "7199e2d2-4d5d-4500-ea8a-37c9c12046b5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.93073299 0.70555297]\n", " [0.75607278 0.88637951]]\n" ] } ], "source": [ "f = x[1:,2:]\n", "print(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" }, "id": "J3f29BC99cDK", "outputId": "91dd6002-c95f-47ea-de6f-ba4279175b8d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[9.99000000e+02 7.05552973e-01]\n", " [7.56072781e-01 8.86379512e-01]]\n" ] } ], "source": [ "f[0,0] = 999\n", "print(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZnvjBmbs9hdq", "outputId": "c30f3f02-c9a9-42ae-da07-af13151e6596" }, "outputs": [ { "data": { "text/plain": [ "array([[2.70619720e-01, 8.50931871e-01, 6.03886907e-02, 6.43097505e-01],\n", " [5.80294054e-02, 1.49212698e-01, 9.99000000e+02, 7.05552973e-01],\n", " [4.80626701e-01, 2.72010854e-01, 7.56072781e-01, 8.86379512e-01]])" ] }, "execution_count": 35, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "x" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c4kcHKQP-tTp" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "Exercise 4", "provenance": [] }, "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 1 }