{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Aerospike Connect for Spark - SparkML Tutorial for Python\n",
    "## Tested with Java 8, Spark 2.4.0, Python 3.7, and Aerospike Spark Connector 2.5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup\n",
    "\n",
    "Below, a seed address for your Aerospike database cluster is required\n",
    "\n",
    "Check the given namespace is available, and your feature key is located as per AS_FEATURE_KEY_PATH\n",
    "\n",
    "Finally, review https://www.aerospike.com/enterprise/download/connectors/ to ensure AEROSPIKE_SPARK_JAR_VERSION is correct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# IP Address or DNS name for one host in your Aerospike cluster\n",
    "AS_HOST =\"127.0.0.1\"\n",
    "# Name of one of your namespaces. Type 'show namespaces' at the aql prompt if you are not sure\n",
    "AS_NAMESPACE = \"test\" \n",
    "AS_FEATURE_KEY_PATH = \"/etc/aerospike/features.conf\"\n",
    "AEROSPIKE_SPARK_JAR_VERSION=\"2.5.0\"\n",
    "\n",
    "AS_PORT = 3000 # Usually 3000, but change here if not\n",
    "AS_CONNECTION_STRING = AS_HOST + \":\"+ str(AS_PORT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Next we locate the Spark installation - this will be found using the SPARK_HOME environment variable that you will have set \n",
    "# if you followed the repository README\n",
    "\n",
    "import findspark\n",
    "findspark.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Here we download the Aerospike Spark jar\n",
    "import os\n",
    "# Import the submodule explicitly: a bare `import urllib` does not guarantee that\n",
    "# `urllib.request` has been loaded\n",
    "import urllib.request\n",
    "\n",
    "def aerospike_spark_jar_download_url(version=AEROSPIKE_SPARK_JAR_VERSION):\n",
    "    \"\"\"Return the download URL of the Aerospike Spark connector jar for `version` (a string).\"\"\"\n",
    "    DOWNLOAD_PREFIX=\"https://www.aerospike.com/enterprise/download/connectors/aerospike-spark/\"\n",
    "    DOWNLOAD_SUFFIX=\"/artifact/jar\"\n",
    "    # Build the URL from the argument, not the module-level constant, so that a\n",
    "    # non-default version is honoured\n",
    "    return DOWNLOAD_PREFIX+version+DOWNLOAD_SUFFIX\n",
    "\n",
    "def download_aerospike_spark_jar(version=AEROSPIKE_SPARK_JAR_VERSION):\n",
    "    \"\"\"Fetch the connector jar for `version` into the working directory unless it is already there.\n",
    "\n",
    "    Returns the jar's file name joined onto the current working directory.\n",
    "    \"\"\"\n",
    "    JAR_NAME=\"aerospike-spark-assembly-\"+version+\".jar\"\n",
    "    if not os.path.exists(JAR_NAME):\n",
    "        # Forward `version` so the URL and the local file name always agree\n",
    "        urllib.request.urlretrieve(aerospike_spark_jar_download_url(version),JAR_NAME)\n",
    "    else:\n",
    "        print(JAR_NAME+\" already downloaded\")\n",
    "    return os.path.join(os.getcwd(),JAR_NAME)\n",
    "\n",
    "AEROSPIKE_JAR_PATH=download_aerospike_spark_jar()\n",
    "os.environ[\"PYSPARK_SUBMIT_ARGS\"] = '--jars ' + AEROSPIKE_JAR_PATH + ' pyspark-shell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark\n",
    "from pyspark.context import SparkContext\n",
    "from pyspark.sql.context import SQLContext\n",
    "from pyspark.sql.session import SparkSession\n",
    "from pyspark.sql.types import StringType, StructField, StructType, ArrayType, IntegerType, MapType, LongType, DoubleType"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get a spark session object and set required Aerospike configuration properties"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up spark and point aerospike db to AS_HOST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc = SparkContext.getOrCreate()\n",
    "spark = SparkSession(sc)\n",
    "sqlContext = SQLContext(sc)\n",
    "spark.conf.set(\"aerospike.namespace\",AS_NAMESPACE)\n",
    "spark.conf.set(\"aerospike.seedhost\",AS_CONNECTION_STRING)\n",
    "spark.conf.set(\"aerospike.keyPath\",AS_FEATURE_KEY_PATH )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Sample Data and load it into Aerospike"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data created\n"
     ]
    }
   ],
   "source": [
    "# We create age vs salary data, using three different Gaussian distributions\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import math\n",
    "\n",
    "# Fix the seed so the sample data, and every result derived from it, is the same on each run\n",
    "RANDOM_SEED = 42\n",
    "np.random.seed(RANDOM_SEED)\n",
    "\n",
    "# Create covariance matrix from std devs + correlation\n",
    "def covariance_matrix(std_dev_1,std_dev_2,correlation):\n",
    "    \"\"\"Return the 2x2 covariance matrix (nested lists) for two std devs and their correlation.\"\"\"\n",
    "    covariance = correlation * std_dev_1 * std_dev_2\n",
    "    return [[std_dev_1 ** 2, covariance],\n",
    "            [covariance, std_dev_2 ** 2]]\n",
    "\n",
    "# Return a bivariate sample given means/std dev/correlation\n",
    "def age_salary_sample(distribution_params,sample_size):\n",
    "    \"\"\"Draw `sample_size` (age, salary) pairs from the bivariate normal described by `distribution_params`.\n",
    "\n",
    "    `distribution_params` is a dict with the keys age_mean, salary_mean, age_std_dev,\n",
    "    salary_std_dev and age_salary_correlation.\n",
    "    The sample is transposed before it is returned, so it unpacks as `ages, salaries`.\n",
    "    \"\"\"\n",
    "    mean = [distribution_params[\"age_mean\"], distribution_params[\"salary_mean\"]]\n",
    "    cov = covariance_matrix(distribution_params[\"age_std_dev\"],distribution_params[\"salary_std_dev\"],\n",
    "                            distribution_params[\"age_salary_correlation\"])\n",
    "    return np.random.multivariate_normal(mean, cov, sample_size).T\n",
    "\n",
    "# Define the characteristics of our age/salary distribution\n",
    "age_salary_distribution_1 = {\"age_mean\":25,\"salary_mean\":50000,\n",
    "                             \"age_std_dev\":1,\"salary_std_dev\":5000,\"age_salary_correlation\":0.3}\n",
    "\n",
    "age_salary_distribution_2 = {\"age_mean\":45,\"salary_mean\":80000,\n",
    "                             \"age_std_dev\":4,\"salary_std_dev\":10000,\"age_salary_correlation\":0.7}\n",
    "\n",
    "age_salary_distribution_3 = {\"age_mean\":35,\"salary_mean\":70000,\n",
    "                             \"age_std_dev\":2,\"salary_std_dev\":9000,\"age_salary_correlation\":0.1}\n",
    "\n",
    "distribution_data = [age_salary_distribution_1,age_salary_distribution_2,age_salary_distribution_3]\n",
    "\n",
    "# Sample age/salary data for each distribution\n",
    "group_1_ages,group_1_salaries = age_salary_sample(age_salary_distribution_1,sample_size=100)\n",
    "group_2_ages,group_2_salaries = age_salary_sample(age_salary_distribution_2,sample_size=120)\n",
    "group_3_ages,group_3_salaries = age_salary_sample(age_salary_distribution_3,sample_size=80)\n",
    "\n",
    "ages=np.concatenate([group_1_ages,group_2_ages,group_3_ages])\n",
    "salaries=np.concatenate([group_1_salaries,group_2_salaries,group_3_salaries])\n",
    "\n",
    "print(\"Data created\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the above records into a Data Frame\n",
    "# First of all, create a list of (id, name, age, salary) tuples\n",
    "# Note we need to make sure values are typed correctly\n",
    "# salary will have type numpy.float64 - if it is not cast as below, an error will be thrown\n",
    "# Ids count from 1 rather than zero. A comprehension is used so that no global named `id`\n",
    "# is left behind to shadow the Python builtin for the rest of the notebook\n",
    "inputBuf = [\n",
    "    (person_id, \"Individual: {:03d}\".format(person_id), float(age), int(salary))\n",
    "    for person_id, (age, salary) in enumerate(zip(ages, salaries), start=1)\n",
    "]\n",
    "\n",
    "# Convert to an RDD \n",
    "inputRDD = spark.sparkContext.parallelize(inputBuf)\n",
    "\n",
    "# Convert to a data frame using a schema\n",
    "schema = StructType([\n",
    "    StructField(\"id\", IntegerType(), True),\n",
    "    StructField(\"name\", StringType(), True),\n",
    "    StructField(\"age\", DoubleType(), True),\n",
    "    StructField(\"salary\",IntegerType(), True)\n",
    "])\n",
    "\n",
    "inputDF=spark.createDataFrame(inputRDD,schema)\n",
    "\n",
    "#Write the data frame to Aerospike, the id field is used as the primary key\n",
    "inputDF \\\n",
    ".write \\\n",
    ".mode('overwrite') \\\n",
    ".format(\"com.aerospike.spark.sql\") \\\n",
    ".option(\"aerospike.set\", \"salary_data\")\\\n",
    ".option(\"aerospike.updateByKey\", \"id\") \\\n",
    ".save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Load data into a DataFrame using user specified schema "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---------------+------------------+------+\n",
      "| id| name| age|salary|\n",
      "+---+---------------+------------------+------+\n",
      "|239|Individual: 239| 34.17676930587447| 74530|\n",
      "|101|Individual: 101| 42.74153431470734| 66879|\n",
      "|194|Individual: 194| 47.76512911028296| 95551|\n",
      "| 31|Individual: 031| 25.01729346533023| 63919|\n",
      "|139|Individual: 139|48.790294708607526| 85122|\n",
"+---+---------------+------------------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Handing the schema object built earlier to the reader means the rows of the\n",
    "# resulting Data Frame come back explicitly typed\n",
    "\n",
    "loadedDFWithSchema = (\n",
    "    spark.read\n",
    "    .format(\"com.aerospike.spark.sql\")\n",
    "    .schema(schema)\n",
    "    .option(\"aerospike.set\", \"salary_data\")\n",
    "    .load()\n",
    ")\n",
    "\n",
    "loadedDFWithSchema.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Explore your data"
   ]
  },
  { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idagesalary
count300.000000300.000000300.000000
mean150.50000035.55990466791.943333
std86.7467588.91247314869.353449
min1.00000022.61888138791.000000
25%75.75000025.34556552976.000000
50%150.50000035.67344268783.000000
75%225.25000043.72786878328.000000
max300.00000055.06345798322.000000
\n", "
" ], "text/plain": [ " id age salary\n", "count 300.000000 300.000000 300.000000\n", "mean 150.500000 35.559904 66791.943333\n", "std 86.746758 8.912473 14869.353449\n", "min 1.000000 22.618881 38791.000000\n", "25% 75.750000 25.345565 52976.000000\n", "50% 150.500000 35.673442 68783.000000\n", "75% 225.250000 43.727868 78328.000000\n", "max 300.000000 55.063457 98322.000000" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "\n", "#convert spark df to pandas df\n", "pdf = loadedDFWithSchema.toPandas()\n", "\n", "# Describe the data\n", "\n", "pdf.describe()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "22 56\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEGCAYAAACKB4k+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUQklEQVR4nO3df7BfdX3n8ecLEgwRFBKyKZugN22RiCgIScSFooWqVKyEiq5M67AtJd0p7KJ2tkRmZ2XHdQZnrIi0lcaCjehWEGShgraAwE4dAUNMRRIUivy4yI+U8kMoCJH3/vE9OXsLCXzvzf3ec388HzN3vuec7/l+z/szJ7mvez7nnM9JVSFJEsBOXRcgSZo8DAVJUstQkCS1DAVJUstQkCS1ZnVdwI7Ya6+9amhoqOsyJGlKueWWW/65qhZs670pHQpDQ0OsW7eu6zIkaUpJcs/23rP7SJLUMhQkSS1DQZLUGtg5hSQXAO8BHq6qA5pl84CLgCHgbuADVfVokgDnAO8G/hX4T1W1flC1SdJIzz33HMPDwzzzzDNdlzKu5syZw+LFi5k9e3bfnxnkiea/Bv4M+NKIZauBa6vqrCSrm/nTgd8E9m1+3gJ8vnmVpIEbHh5m9913Z2hoiN7fqFNfVfHII48wPDzMkiVL+v7cwLqPqur/Av/ygsXHAmub6bXAyhHLv1Q9NwJ7JNl7ULVJ0kjPPPMM8+fPnzaBAJCE+fPnj/roZ6LPKSysqgea6QeBhc30IuC+EesNN8skaUJMp0DYaixt6uxEc/XG7B71uN1JViVZl2Td5s2bB1CZJM1cE33z2kNJ9q6qB5ruoYeb5fcD+4xYb3Gz7EWqag2wBmDZsmU+DELSuBtafeW4ft/dZx0zrt83SBMdClcAJwJnNa+Xj1h+apKv0jvB/PiIbqaBGO+dPihT6R+TpKlvYN1HSf4G+C6wX5LhJCfRC4N3JLkD+I1mHuAq4C7gTuALwB8Nqi5JmqxWrlzJIYccwhve8AbWrFkDwPnnn8/rXvc6VqxYwcknn8ypp54KwObNm3nf+97H8uXLWb58Od/5znfGpYaBHSlU1QnbeeuobaxbwCmDqkWSpoILLriAefPm8fTTT7N8+XKOOeYYPvGJT7B+/Xp2
3313jjzySA488EAATjvtND7ykY9w+OGHc++99/Kud72LTZs27XANU3pAPEmaTj73uc9x2WWXAXDfffdx4YUX8ra3vY158+YB8P73v58f//jHAFxzzTVs3Lix/ewTTzzBk08+yW677bZDNRgKkjQJXH/99VxzzTV897vfZe7cubz97W9n6dKl2/3r//nnn+fGG29kzpw541qHYx9J0iTw+OOPs+eeezJ37lxuv/12brzxRp566iluuOEGHn30UbZs2cKll17arv/Od76Tc889t53fsGHDuNThkYIkvUAXV/0dffTRnHfeebz+9a9nv/3249BDD2XRokWcccYZrFixgnnz5rF06VJe/epXA72uplNOOYU3velNbNmyhSOOOILzzjtvh+swFCRpEnjFK17BN7/5zRctX7ZsGatWrWLLli0cd9xxrFy5EoC99tqLiy66aNzrsPtIkiaxM888k4MOOogDDjiAJUuWtKEwKB4pSNIk9ulPf3pCt+eRgiTRG2p6uhlLmwwFSTPenDlzeOSRR6ZVMGx9nsJoL1m1+0jSjLd48WKGh4eZbiMvb33y2mgYCpJmvNmzZ4/q6WTTmd1HkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJahkKkqSWoSBJanUSCkk+kuS2JD9M8jdJ5iRZkuSmJHcmuSjJLl3UJkkz2YSHQpJFwH8FllXVAcDOwAeBTwFnV9WvAo8CJ010bZI003XVfTQL2DXJLGAu8ABwJHBJ8/5aYGU3pUnSzDXhoVBV9wOfBu6lFwaPA7cAj1XVlma1YWDRtj6fZFWSdUnWbd68eSJKlqQZo4vuoz2BY4ElwL8HXgkc3e/nq2pNVS2rqmULFiwYUJWSNDN10X30G8BPqmpzVT0HfB04DNij6U4CWAzc30FtkjSjdREK9wKHJpmbJMBRwEbgOuD4Zp0Tgcs7qE2SZrQuzincRO+E8nrg1qaGNcDpwEeT3AnMB86f6Nokaaab9fKrjL+q+jjw8RcsvgtY0UE5kqSGdzRLklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSp1VcoJHnjoAuRJHWv3yOFv0hyc5I/SvLqgVYkSerMrH5WqqpfS7Iv8PvALUluBr5YVVcPtDrNaEOrr+y6hL7cfdYxXZcgjZu+zylU1R3AfwdOB94GfC7J7Ul+e7QbTbJHkkuaz29K8tYk85JcneSO5nXP0X6vJGnH9HtO4U1JzgY2AUcCv1VVr2+mzx7Dds8BvlVVS4EDm+9dDVxbVfsC1zbzkqQJ1O+RwrnAeuDAqjqlqtYDVNVP6R099K05J3EEcH7zHc9W1WPAscDaZrW1wMrRfK8kacf1dU4BOAZ4uqp+AZBkJ2BOVf1rVV04ym0uATYDX0xyIHALcBqwsKoeaNZ5EFi4rQ8nWQWsAnjNa14zyk1Lkl5Kv0cK1wC7jpif2ywbi1nAwcDnq+rNwFO8oKuoqgqobX24qtZU1bKqWrZgwYIxliBJ2pZ+Q2FOVT25daaZnjvGbQ4Dw1V1UzN/Cb2QeCjJ3gDN68Nj/H5J0hj1GwpPJTl460ySQ4Cnx7LBqnoQuC/Jfs2io4CNwBXAic2yE4HLx/L9kqSx6/ecwoeBryX5KRDgl4D/uAPb/S/AV5LsAtwF/B69gLo4yUnAPcAHduD7JUlj0O/Na99LshTY+tf9j6rqubFutKo2AMu28dZRY/1OSdKO6/dIAWA5MNR85uAkVNWXBlKVJKkTfYVCkguBXwE2AL9oFhdgKEjSNNLvkcIyYP/m
UlFJ0jTV79VHP6R3clmSNI31e6SwF7CxGR3151sXVtV7B1KVJKkT/YbCmYMsQpI0OfR7SeoNSV4L7FtV1ySZC+w82NIkSROt36uPTqY3CN08elchLQLOw/sKpBnNByFNP/2eaD4FOAx4AtoH7vy7QRUlSepGv6Hw86p6dutMkllsZxRTSdLU1W8o3JDkDGDXJO8Avgb87eDKkiR1od9QWE3vwTi3An8IXMUon7gmSZr8+r366HngC82PJGma6vfqo5+wjXMIVfXL416RJKkzoxn7aKs5wPvpXZ4qSZpG+jqnUFWPjPi5v6o+C3jhryRNM/12Hx08YnYnekcOo3kWgyRpCuj3F/ufjpjeAtyNj8uUpGmn36uPfn3QhUiSutdv99FHX+r9qvrM+JQjSerSaK4+Wg5c0cz/FnAzcMcgipIkdaPfUFgMHFxVPwNIciZwZVX97qAKkyRNvH6HuVgIPDti/tlmmSRpGun3SOFLwM1JLmvmVwJrB1KRJKkz/V599Mkk3wR+rVn0e1X1/cGVJU1PPpRGk12/3UcAc4EnquocYDjJkgHVJEnqSF+hkOTjwOnAx5pFs4EvD6ooSVI3+j1SOA54L/AUQFX9FNh9UEVJkrrRbyg8W1VFM3x2klcOriRJUlf6DYWLk/wlsEeSk4Fr8IE7kjTtvOzVR0kCXAQsBZ4A9gP+R1VdPeDaJEkT7GVDoaoqyVVV9UbAIJCkaazf7qP1SZYPtBJJUuf6DYW3ADcm+ackP0hya5If7MiGk+yc5PtJvtHML0lyU5I7k1yUZJcd+X5J0ui9ZPdRktdU1b3Auwaw7dOATcCrmvlPAWdX1VeTnAecBHx+ANuVJG3Hyx0p/B+AqroH+ExV3TPyZ6wbTbKY3jOe/6qZD3AkcEmzylp64ytJkibQy4VCRkz/8jhu97PAnwDPN/PzgceqakszPwws2mZByaok65Ks27x58ziWJEl6uVCo7UyPWZL3AA9X1S1j+XxVramqZVW1bMGCBeNRkiSp8XKXpB6Y5Al6Rwy7NtM081VVr9r+R7frMOC9Sd4NzKF3TuEcejfGzWqOFhYD94/huyVJO+AljxSqaueqelVV7V5Vs5rprfNjCQSq6mNVtbiqhoAPAt+uqt8BrgOOb1Y7Ebh8LN8vSRq70QydPWinAx9Ncie9cwznd1yPJM04/T55bSCq6nrg+mb6LmBFl/VI0kw3mY4UJEkdMxQkSS1DQZLUMhQkSS1DQZLUMhQkSS1DQZLUMhQkSS1DQZLU6vSOZkmabIZWX9l1CX25+6xjBvK9HilIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIkloTHgpJ9klyXZKNSW5LclqzfF6Sq5Pc0bzuOdG1SdJM18WRwhbgj6tqf+BQ4JQk+wOrgWural/g2mZekjSBJjwUquqBqlrfTP8M2AQsAo4F1jarrQVWTnRtkjTTdXpOIckQ8GbgJmBhVT3QvPUgsHA7n1mVZF2SdZs3b56YQiVphugsFJLsBlwKfLiqnhj5XlUVUNv6XFWtqaplVbVswYIFE1CpJM0cnYRCktn0AuErVfX1ZvFDSfZu3t8beLiL2iRpJuvi6qMA5wObquozI966AjixmT4RuHyia5OkmW5WB9s8DPgQcGuSDc2yM4CzgIuTnATcA3ygg9okaUab8FCoqn8Asp23j5rIWiRJ/5Z3NEuSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWrO6LkDjZ2j1lV2X0Je7zzqm6xIkbYdHCpKklqEgSWoZ
CpKklqEgSWoZCpKk1qQKhSRHJ/lRkjuTrO66HkmaaSZNKCTZGfhz4DeB/YETkuzfbVWSNLNMmlAAVgB3VtVdVfUs8FXg2I5rkqQZJVXVdQ0AJDkeOLqq/qCZ/xDwlqo69QXrrQJWNbP7AT+a0ELHZi/gn7suYhzZnslvurVpurUHum3Ta6tqwbbemHJ3NFfVGmBN13WMRpJ1VbWs6zrGi+2Z/KZbm6Zbe2DytmkydR/dD+wzYn5xs0ySNEEmUyh8D9g3yZIkuwAfBK7ouCZJmlEmTfdRVW1Jcirwd8DOwAVVdVvHZY2XKdXd1QfbM/lNtzZNt/bAJG3TpDnRLEnq3mTqPpIkdcxQkCS1DIVxlGSfJNcl2ZjktiSnNcvPTHJ/kg3Nz7u7rrUfSeYkuTnJPzbt+Z/N8iVJbmqGI7mouTBgSniJNv11kp+M2EcHdVzqqCTZOcn3k3yjmZ+y+2irbbRpyu6jJHcnubWpe12zbF6Sq5Pc0bzu2XWdYCiMty3AH1fV/sChwCkjhuo4u6oOan6u6q7EUfk5cGRVHQgcBByd5FDgU/Ta86vAo8BJ3ZU4attrE8B/G7GPNnRV4BidBmwaMT+V99FWL2wTTO199OtN3VvvTVgNXFtV+wLXNvOdMxTGUVU9UFXrm+mf0fsHvajbqsauep5sZmc3PwUcCVzSLF8LrJz46sbmJdo0ZSVZDBwD/FUzH6bwPoIXt2maOpbevoFJtI8MhQFJMgS8GbipWXRqkh8kuWCyHCb2ozmE3wA8DFwN/BPwWFVtaVYZZooF3wvbVFVb99Enm310dpJXdFfhqH0W+BPg+WZ+PlN8H/HiNm01VfdRAX+f5JZmqB6AhVX1QDP9ILCwm9L+LUNhAJLsBlwKfLiqngA+D/wKve6KB4A/7a660amqX1TVQfTuMF8BLO22oh33wjYlOQD4GL22LQfmAad3V2H/krwHeLiqbum6lvHyEm2akvuocXhVHUxvFOhTkhwx8s3q3RswKY5YDYVxlmQ2vUD4SlV9HaCqHmp+ET0PfIHeL9cppaoeA64D3grskWTrjY9TdjiSEW06uun6q6r6OfBFps4+Ogx4b5K76Y0sfCRwDlN7H72oTUm+PIX3EVV1f/P6MHAZvdofSrI3QPP6cHcV/n+Gwjhq+nLPBzZV1WdGLN97xGrHAT+c6NrGIsmCJHs007sC76B3nuQ64PhmtROByzspcAy206bbR/znDL2+3Smxj6rqY1W1uKqG6A0N8+2q+h2m8D7aTpt+d6ruoySvTLL71mngnfRqv4LevoFJtI8mzTAX08RhwIeAW5s+a4Az6D0w6CB6h4d3A3/YRXFjsDewNr0HIO0EXFxV30iyEfhqkv8FfJ9eEE4V22vTt5MsAAJsAP5zhzWOh9OZuvtoe74yRffRQuCyXpYxC/jfVfWtJN8DLk5yEnAP8IEOa2w5zIUkqWX3kSSpZShIklqGgiSpZShIklqGgiSpZShIY5RkZZJKMuXv8pa2MhSksTsB+IfmVZoWDAVpDJrxrQ6nNyT1B5tlOyX5iyS3N+PjX5Xk+Oa9Q5Lc0AyI9ncvuMtdmjQMBWlsjgW+VVU/Bh5Jcgjw28AQsD+9O9vfCu14WOcCx1fVIcAFwCe7KFp6OQ5zIY3NCfQGnoPeoG0n0Pv/9LVm4MMHk1zXvL8fcABwdTPUwc70RsuVJh1DQRqlJPPojUb6xiRF75d80Rv9cpsfAW6rqrdOUInSmNl9JI3e8cCFVfXaqhqqqn2AnwD/AryvObewEHh7s/6PgAVJ2u6kJG/oonDp5RgK0uidwIuPCi4FfoneU842Al8G1gOPV9Wz9ILkU0n+kd4In/9hwqqVRsFRUqVxlGS3qnoyyXzgZuCwqnqw67qkfnlOQRpf32ge4rML8AkDQVONRwqSpJbnFCRJLUNBktQyFCRJLUNBktQyFCRJrf8Ha73WtRmpywkAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU8UlEQVR4nO3df7RdZX3n8feXJHgJIoEkRSTgDVNFMYjGGymLaZtCHdEw4ji0gqDACOkUcck4ixKEqegqXdj5oZYqQqUUmVaCqQULqI0KLe3MAm4iSGJEIsT28kMCI5PCECD4nT/2E3Jz8+uE3H1O7nner7Xuus9+zt77PM8993zuvs/e59mRmUiS6rFHrxsgSeoug1+SKmPwS1JlDH5JqozBL0mVmdzrBnRixowZOTg42OtmSNKEsmzZsicyc+bY+gkR/IODgwwPD/e6GZI0oUTET7dW71CPJFXG4Jekyhj8klSZCTHGL0lb88ILLzAyMsL69et73ZSeGhgYYNasWUyZMqWj9Q1+SRPWyMgI++yzD4ODg0REr5vTE5nJk08+ycjICLNnz+5oG4d6JE1Y69evZ/r06dWGPkBEMH369J36r8fglzSh1Rz6G+3sz8Dgl6TKOMYvqW8MLrplXPe35rIF47q/M844gxNOOIGTTjppXPe7swx+qTLjHY6jjXdQ1m7Dhg1Mnjz+Me1QjyTtgmeeeYYFCxZw5JFHMmfOHBYvXsynP/1p5s2bx5w5c1i4cCFbu9PhttaZP38+5513HkNDQ1x66aXMnj2bF154AYB169ZttvxyGfyStAu+9a1v8ZrXvIZ7772XFStWcPzxx3Puuedy9913s2LFCp599lluvvnmLbbb3jrPP/88w8PDfPKTn2T+/PncckvzX9r111/P+973vo6v198Wg1+SdsERRxzB0qVLueCCC7jjjjvYd999ue222zjqqKM44ogj+N73vsfKlSu32G5767z//e9/qXzWWWdxzTXXAHDNNddw5pln7nKbHeOXpF3w+te/nuXLl3Prrbdy8cUXc9xxx/GFL3yB4eFhDj74YC655JItrrFfv34955xzzjbX2XvvvV8qH3PMMaxZs4bbb7+dF198kTlz5uxymz3il6Rd8MgjjzB16lROO+00zj//fJYvXw7AjBkzePrpp1myZMkW22wM+e2tM9qHPvQhPvCBD4zL0T54xC+pj/TiqqL77ruP888/nz322IMpU6ZwxRVXcOONNzJnzhxe/epXM2/evC22mTZtGmefffZ21xnt1FNP5eKLL+aUU04ZlzbH1s42726GhobSG7FI46OfLudctWoVb3zjG7v6nL2wZMkSbrrpJq677rptrrO1n0VELMvMobHresQvSbuxj370o3zzm9/k1ltvHbd9GvyStBu7/PLLx32fntyVNKFNhOHqtu3sz6D14I+ISRHx/Yi4uSzPjog7I2J1RCyOiD3bboOk/jQwMMCTTz5ZdfhvnI9/YGCg4226MdTzMWAV8Kqy/Bngs5l5fUR8CfgwcEUX2iGpz8yaNYuRkRHWrl3b66b01MY7cHWq1eCPiFnAAuBS4OPRTBp9LPCBssq1wCUY/JJehilTpnR81ylt0vZQz+eA3wN+UZanA09l5oayPAIctLUNI2JhRAxHxHDtf80laTy1FvwRcQLweGYueznbZ+ZVmTmUmUMzZ84c59ZJUr3aHOo5BnhPRLwbGKAZ4/88MC0iJpej/lnAwy22QZI0RmtH/Jl5YWbOysxB4GTge5l5KnAbsPH2M6cDN7XVBknSlnpxHf8FNCd6V9OM+V/dgzZIUrW68sndzLwduL2UHwTe3o3nlSRtyU/uSlJlDH5JqozBL0mVMfglqTIGvyRVxuCXpMoY/JJUGYNfkipj8EtSZ
Qx+SaqMwS9JlTH4JakyBr8kVaYrs3OqPYOLbmlt32suW9DaviX1jkf8klQZg1+SKmPwS1JlDH5Jqownd8eZJ1sl7e484pekyhj8klQZg1+SKmPwS1JlPLkr9ZgXBKjbPOKXpMoY/JJUGYNfkipj8EtSZTy5K6lVnrze/XjEL0mVMfglqTIGvyRVxuCXpMoY/JJUGYNfkipj8EtSZQx+SaqMwS9JlTH4JakyrQV/RAxExF0RcW9ErIyIT5X62RFxZ0SsjojFEbFnW22QJG2pzSP+54BjM/NI4C3A8RHxK8BngM9m5i8DPwc+3GIbJEljtBb82Xi6LE4pXwkcCywp9dcC722rDZKkLbU6xh8RkyLiHuBxYCnwE+CpzNxQVhkBDtrGtgsjYjgihteuXdtmMyWpKq0Gf2a+mJlvAWYBbwfesBPbXpWZQ5k5NHPmzLaaKEnV6cpVPZn5FHAbcDQwLSI23gdgFvBwN9ogSWq0eVXPzIiYVsp7Ae8AVtH8ATiprHY6cFNbbZAkbanNO3AdCFwbEZNo/sDckJk3R8QPgesj4g+A7wNXt9gGSdIYrQV/Zv4AeOtW6h+kGe/XBNXWrfS8jZ7UHX5yV5IqY/BLUmUMfkmqjMEvSZUx+CWpMga/JFXG4JekynQU/BFxRNsNkSR1R6dH/F8sN1U5JyL2bbVFkqRWdRT8mfmrwKnAwcCyiPjLiHhHqy2TJLWi4zH+zHwAuBi4APh14I8j4kcR8b62GidJGn+djvG/OSI+SzO75rHAv83MN5byZ1tsnyRpnHU6SdvlwJeBT2TmsxsrM/ORiLi4lZZJklrRafAvAJ7NzBcBImIPYCAz/19mXtda6yRJ467TMf7vAHuNWp5a6iRJE0ynwT+QmU9vXCjlqe00SZLUpk6D/5mImLtxISLeBjy7nfUlSbupTsf4zwO+FhGPAAG8Gnh/W42SJLWno+DPzLsj4g3AYaXq/sx8ob1mSZLasjP33J0HDJZt5kYEmfmVVlolSWpNR8EfEdcB/wq4B3ixVCdg8EvSBNPpEf8QcHhmZpuNkSS1r9OrelbQnNCVJE1wnR7xzwB+GBF3Ac9trMzM97TSKklSazoN/kvabIQkqXs6vZzz7yLitcDrMvM7ETEVmNRu0yRJbeh0WuazgSXAlaXqIODGltokSWpRpyd3PwIcA6yDl27K8kttNUqS1J5Ox/ify8znIwKAiJhMcx2/1HcGF93Syn7XXLaglf1KO6vTI/6/i4hPAHuVe+1+Dfib9polSWpLp8G/CFgL3Af8DnArzf13JUkTTKdX9fwC+NPyJUmawDqdq+chtjKmn5mHjnuLJEmt2pm5ejYaAH4L2H/8myNJaltHY/yZ+eSor4cz83M0N2CXJE0wnQ71zB21uAfNfwA7M5e/JGk30Wl4//dR5Q3AGuC3x701kqTWdXpVz2+03RBJUnd0OtTz8e09npn/Y3yaI0lqW6cf4BoCfpdmcraDgP8IzAX2KV9biIiDI+K2iPhhRKyMiI+V+v0jYmlEPFC+77fr3ZAkdarTMf5ZwNzM/BeAiLgEuCUzT9vONhuA/5yZyyNiH2BZRCwFzgC+m5mXRcQimk8FX/ByOyBJ2jmdHvEfADw/avn5UrdNmfloZi4v5X8BVtH8t3AicG1Z7VrgvTvRXknSLur0iP8rwF0R8ddl+b1sCu8diohB4K3AncABmfloeegxtvEHJCIWAgsBDjnkkE6fSpKcYXUHOv0A16XAmcDPy9eZmfmHnWwbEa8E/go4LzPXjdlvso3pnTPzqswcysyhmTNndvJUkqQOdDrUAzAVWJeZnwdGImL2jjaIiCk0of8Xmfn1Uv2ziDiwPH4g8PhOtlmStAs6vfXiJ2lOwF5YqqYA/3MH2wRwNbBqzOWe3wBOL+XTgZt2psGSpF3T6Rj/v6MZo994svaRcqXO9hwDfBC4LyLuKXWfAC4DboiIDwM/xU8AS1JXdRr8z2dmRkQCRMTeO9ogM/8BiG08f
FyHzytJGmedjvHfEBFXAtMi4mzgO3hTFkmakHZ4xF/G6hcDbwDWAYcBv5+ZS1tumySpBTsM/jLEc2tmHgEY9pI0wXU61LM8Iua12hJJUld0enL3KOC0iFgDPENz0jYz881tNUzayE9hSuNru8EfEYdk5j8B7+xSeyRJLdvREf+NNLNy/jQi/ioz/30X2iRJatGOxvhHX4d/aJsNkSR1x46CP7dRliRNUDsa6jkyItbRHPnvVcqw6eTuq1ptnSRp3G03+DNzUrcaIknqjp2ZllmS1AcMfkmqjMEvSZUx+CWpMga/JFXG4Jekyhj8klQZg1+SKmPwS1JlDH5JqozBL0mVMfglqTIGvyRVxuCXpMoY/JJUGYNfkipj8EtSZQx+SaqMwS9JlTH4JakyBr8kVcbgl6TKTO51AyRpohtcdEsr+11z2YJW9usRvyRVxuCXpMoY/JJUGYNfkipj8EtSZVoL/oj4s4h4PCJWjKrbPyKWRsQD5ft+bT2/JGnr2jzi/3Pg+DF1i4DvZubrgO+WZUlSF7UW/Jn598D/GVN9InBtKV8LvLet55ckbV23x/gPyMxHS/kx4IAuP78kVa9nJ3czM4Hc1uMRsTAihiNieO3atV1smST1t24H/88i4kCA8v3xba2YmVdl5lBmDs2cObNrDZSkftft4P8GcHopnw7c1OXnl6TqtXk551eB/w0cFhEjEfFh4DLgHRHxAPCbZVmS1EWtzc6Zmads46Hj2nrOrWlr1jxob+Y8SWqTn9yVpMoY/JJUGYNfkipj8EtSZQx+SaqMwS9JlTH4JakyBr8kVcbgl6TKGPySVBmDX5IqY/BLUmUMfkmqjMEvSZUx+CWpMga/JFXG4Jekyhj8klQZg1+SKmPwS1JlDH5JqozBL0mVMfglqTIGvyRVxuCXpMoY/JJUGYNfkipj8EtSZQx+SaqMwS9JlTH4JakyBr8kVcbgl6TKGPySVBmDX5IqY/BLUmUMfkmqjMEvSZUx+CWpMga/JFWmJ8EfEcdHxP0RsToiFvWiDZJUq64Hf0RMAr4AvAs4HDglIg7vdjskqVa9OOJ/O7A6Mx/MzOeB64ETe9AOSapSZGZ3nzDiJOD4zDyrLH8QOCozzx2z3kJgYVk8DLh/nJowA3hinPa1O+r3/kH/99H+TXy7Sx9fm5kzx1ZO7kVLOpGZVwFXjfd+I2I4M4fGe7+7i37vH/R/H+3fxLe797EXQz0PAwePWp5V6iRJXdCL4L8beF1EzI6IPYGTgW/0oB2SVKWuD/Vk5oaIOBf4NjAJ+LPMXNnFJoz78NFupt/7B/3fR/s38e3Wfez6yV1JUm/5yV1JqozBL0mVmdDBHxGTIuL7EXFzWZ4dEXeWqSAWl5PHRMQryvLq8vjgqH1cWOrvj4h3jqrv+bQSEbEmIu6LiHsiYrjU7R8RSyPigfJ9v1IfEfHHpb0/iIi5o/Zzeln/gYg4fVT928r+V5dto8v9mxYRSyLiRxGxKiKO7pf+RcRh5XXb+LUuIs7rl/6V5/9PEbEyIlZExFcjYqAP34MfK/1bGRHnlbqJ/xpm5oT9Aj4O/CVwc1m+ATi5lL8E/G4pnwN8qZRPBhaX8uHAvcArgNnAT2hOOE8q5UOBPcs6h/egf2uAGWPq/ghYVMqLgM+U8ruBbwIB/ApwZ6nfH3iwfN+vlPcrj91V1o2y7bu63L9rgbNKeU9gWj/1b1Q/JwGPAa/tl/4BBwEPAXuV5RuAM/rpPQjMAVYAU2kuhPkO8Mv98Bp2/U0wji/KLOC7wLHAzeUH9wQwuTx+NPDtUv42cHQpTy7rBXAhcOGofX67bPfStqV+s/W62Mc1bBn89wMHlvKBwP2lfCVwytj1gFOAK0fVX1nqDgR+NKp+s/W60Ld9S3BEP/ZvTJ/+DfCP/dQ/muD/5xJmk8t78J399B4Efgu4etTyfwF+rx9ew4k81PM5mhfhF2V5OvBUZm4oyyM0v5yw6ZeU8vj/Leu/VD9mm23Vd1sCf
xsRy6KZwgLggMx8tJQfAw4o5Z3ty0GlPLa+W2YDa4Frohmu+3JE7E3/9G+0k4GvlnJf9C8zHwb+G/BPwKM076ll9Nd7cAXwqxExPSKm0hzRH0wfvIYTMvgj4gTg8cxc1uu2tOxfZ+ZcmplMPxIRvzb6wWwOEybq9biTgbnAFZn5VuAZmn+bXzLB+wdAGeN+D/C1sY9N5P6Vce0Taf6AvwbYGzi+p40aZ5m5CvgM8LfAt4B7gBfHrDMhX8MJGfzAMcB7ImINzeyexwKfB6ZFxMYPpY2eCuKlaSLK4/sCT7Lt6SN2i2klylEVmfk48Nc0M5v+LCIOBCjfHy+r72xfHi7lsfXdMgKMZOadZXkJzR+CfunfRu8Clmfmz8pyv/TvN4GHMnNtZr4AfJ3mfdlv78GrM/NtmflrwM+BH9MPr2E3x8xaGoebz6aTu19j8xNL55TyR9j8xNINpfwmNj+x9CDNSaXJpTybTSeW3tTlfu0N7DOq/L9ojqj+K5ufWPqjUl7A5ieW7ir1+9OMpe9Xvh4C9i+PjT2x9O4u9/EO4LBSvqT0rW/6V9pwPXDmqOW+6B9wFLCS5sRn0Jyo/2g/vQdL+36pfD8E+BHNBQgT/jXs6g+xpRdmPpuC/9Dyg1xdfgFfUeoHyvLq8viho7a/iObqgfsZdUadZjzvx+Wxi3rQr0PLL/u95Q12UamfTnNS+wGaqww2/gIFzQ1ufgLcBwyN2td/KH1fzeYhNEQzjvkT4E8Yc6K1C318CzAM/AC4sbwp+ql/e9Mc1e47qq6f+vcpmjBcAVxHE9598x4sbbgD+GF5Hx7XL6+hUzZIUmUm6hi/JOllMvglqTIGvyRVxuCXpMoY/JJUGYNfAiLiojID4w/KbJpHbWfdP4+Ik7rZPmk8df3Wi9LuJiKOBk4A5mbmcxExg+ZDQ+O1/8m5af4aqec84peaWRKfyMznADLzicx8JCJ+PyLuLvOxX7W1udK3tU5E3B4Rn4vmPgoXRcRDETGlPPaq0ctStxn8UjMJ18ER8eOI+GJE/Hqp/5PMnJeZc4C9aP4rGGt76+yZmUOZ+SngdpqP9EMzZcHXs5njRuo6g1/Vy8yngbcBC2mmil4cEWcAv1HuFnUfzUSAb9rK5ttbZ/Go8peBM0v5TOCa8e2F1DnH+CUgM1+kOSq/vYT47wBvpplv5Z8j4hKa+WZeEhEDwBe3s84zo/b/jxExGBHzgUmZuaK93kjb5xG/qhfN/XFfN6rqLTQThgE8ERGvBLZ2Fc9AB+uM9hWaW4V6tK+e8ohfglcCl0fENGADzQyKC4GnaGZOfAy4e+xGmflURPzp9tYZ4y+AP2DT3biknnB2TqlLyrX/J2bmB3vdFtXNI36pCyLicpq7cb27122RPOKXpMp4cleSKmPwS1JlDH5JqozBL0mVMfglqTL/H3Eu+XXGItckAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ],
   "source": [
    "def bucket_edges(lower, upper, bucket_size):\n",
    "    \"\"\"Return evenly spaced bin edges that start at `lower` and reach at least `upper`.\n",
    "\n",
    "    range(lower, upper, bucket_size) stops short of `upper`, so using it as the bin\n",
    "    edges leaves every value above the last generated edge out of the plot.\n",
    "    \"\"\"\n",
    "    bucket_count = max(1, math.ceil((upper - lower) / bucket_size))\n",
    "    return [lower + k * bucket_size for k in range(bucket_count + 1)]\n",
    "\n",
    "#Histogram - Age\n",
    "age_min, age_max = int(np.amin(pdf['age'])), math.ceil(np.amax(pdf['age']))\n",
    "age_bucket_size = 5\n",
    "age_edges = bucket_edges(age_min, age_max, age_bucket_size)\n",
    "print(age_min,age_max)\n",
    "pdf[['age']].plot(kind='hist',bins=age_edges,rwidth=0.8)\n",
    "plt.xlabel('Age',fontsize=10)\n",
    "plt.legend(loc=None)\n",
    "plt.show()\n",
    "\n",
    "#Histogram - Salary\n",
    "salary_min, salary_max = int(np.amin(pdf['salary'])), math.ceil(np.amax(pdf['salary']))\n",
    "salary_bucket_size = 5000\n",
    "salary_edges = bucket_edges(salary_min, salary_max, salary_bucket_size)\n",
    "pdf[['salary']].plot(kind='hist',bins=salary_edges,rwidth=0.8)\n",
    "plt.xlabel('Salary',fontsize=10)\n",
    "plt.legend(loc=None)\n",
    "plt.show()\n",
    "\n",
    "# Heatmap\n",
    "# Count individuals per (age bucket, salary bucket) on the same edges as the histograms.\n",
    "# histogram2d puts its first argument on the rows, so rows are age buckets and columns are\n",
    "# salary buckets; a value equal to the top edge is counted in the last bucket instead of\n",
    "# indexing past the end of the grid\n",
    "heatmap_counts = np.histogram2d(pdf['age'], pdf['salary'], bins=[age_edges, salary_edges])[0]\n",
    "\n",
    "plt.title(\"Salary/Age distribution heatmap\")\n",
    "plt.xlabel(\"Salary in '000s\")\n",
    "plt.ylabel(\"Age\")\n",
    "\n",
    "# The extent follows the bucket edges so that the axis values line up with the cells\n",
    "plt.imshow(heatmap_counts, cmap='YlOrRd', interpolation='nearest',\n",
    "           extent=[salary_edges[0]/1000,salary_edges[-1]/1000,age_edges[0],age_edges[-1]],\n",
    "           origin=\"lower\")\n",
    "plt.colorbar(orientation=\"horizontal\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Create a Segmentation model using SparkML\n",
    "\n",
    "#### A K-Means clustering model is used to create several segments based on age and salary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"+---+---------------+------------------+------+--------------------+\n", "| id| name| age|salary| features|\n", "+---+---------------+------------------+------+--------------------+\n", "|239|Individual: 239| 34.17676930587447| 74530|[239.0,34.1767693...|\n", "|101|Individual: 101| 42.74153431470734| 66879|[101.0,42.7415343...|\n", "|194|Individual: 194| 47.76512911028296| 95551|[194.0,47.7651291...|\n", "| 31|Individual: 031| 25.01729346533023| 63919|[31.0,25.01729346...|\n", "|139|Individual: 139|48.790294708607526| 85122|[139.0,48.7902947...|\n", "| 14|Individual: 014|23.821849367819627| 50039|[14.0,23.82184936...|\n", "|142|Individual: 142| 41.900346308163| 77074|[142.0,41.9003463...|\n", "|272|Individual: 272| 35.51515262675664| 78120|[272.0,35.5151526...|\n", "| 76|Individual: 076|24.501398941664295| 53245|[76.0,24.50139894...|\n", "|147|Individual: 147| 52.86059009219467| 95568|[147.0,52.8605900...|\n", "| 79|Individual: 079|22.733322137801498| 42284|[79.0,22.73332213...|\n", "| 96|Individual: 096|25.012542214617536| 45102|[96.0,25.01254221...|\n", "|132|Individual: 132| 50.86264730832292| 89061|[132.0,50.8626473...|\n", "| 10|Individual: 010|26.032627396210984| 50665|[10.0,26.03262739...|\n", "|141|Individual: 141| 44.09868665142268| 72602|[141.0,44.0986866...|\n", "|140|Individual: 140| 47.34764914670569| 88577|[140.0,47.3476491...|\n", "|160|Individual: 160|41.622564290221064| 81777|[160.0,41.6225642...|\n", "|112|Individual: 112| 41.46591037852322| 64056|[112.0,41.4659103...|\n", "|120|Individual: 120| 48.28801723706617| 87174|[120.0,48.2880172...|\n", "| 34|Individual: 034| 24.87827400567121| 53419|[34.0,24.87827400...|\n", "+---+---------------+------------------+------+--------------------+\n", "only showing top 20 rows\n", "\n", "Silhouette with squared euclidean distance = 0.8232499809050619\n", "Cluster Centers: \n", "[9.53206107e+01 2.75790774e+01 5.19690840e+04]\n", "[1.93272189e+02 4.17462245e+01 7.82818521e+04]\n" ] } ], "source": [ "from 
pyspark.ml.clustering import KMeans\n",
    "from pyspark.ml.evaluation import ClusteringEvaluator\n",
    "from pyspark.ml.feature import VectorAssembler\n",
    "\n",
    "\n",
    "#All machine learning algorithms in Spark take as input a Vector type, which must be a set of numerical values.\n",
    "# `id` is only the record key, so it is left out of the feature vector: the segments are\n",
    "# meant to depend on age and salary alone\n",
    "# NOTE(review): age and salary are on very different scales, so the unscaled distances are\n",
    "# presumably dominated by salary - consider adding a StandardScaler stage\n",
    "assembler = VectorAssembler(\n",
    "    inputCols=[\"age\", \"salary\"],\n",
    "    outputCol=\"features\")\n",
    "data_2 = assembler.transform(loadedDFWithSchema)\n",
    "data_2.show()\n",
    "\n",
    "# Trains a k-means model.\n",
    "kmeans = KMeans().setK(2).setSeed(1)\n",
    "model = kmeans.fit(data_2)\n",
    "\n",
    "# Make predictions\n",
    "predictions = model.transform(data_2)\n",
    "\n",
    "# Evaluate clustering by computing Silhouette score\n",
    "evaluator = ClusteringEvaluator()\n",
    "\n",
    "silhouette = evaluator.evaluate(predictions)\n",
    "print(\"Silhouette with squared euclidean distance = \" + str(silhouette))\n",
    "\n",
    "# Shows the result.\n",
    "centers = model.clusterCenters()\n",
    "print(\"Cluster Centers: \")\n",
    "for center in centers:\n",
    "    print(center)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}