{"cells":[{"cell_type":"markdown","source":["This project aims to help a marketing agency to predict customer churn (stop buying their survice) by using a machine learning classification model, so that the company can test this against incoming data for future customers to perdict which customers will churn and assign them an account manager. The data description is as follows:\n\n Name : Name of the latest contact at Company\n Age: Customer Age\n Total_Purchase: Total Ads Purchased\n Account_Manager: Binary 0=No manager, 1= Account manager assigned\n Years: Totaly Years as a customer\n Num_sites: Number of websites that use the service.\n Onboard_date: Date that the name of the latest contact was onboarded\n Location: Client HQ Address\n Company: Name of Client Company"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"14455984-bfb6-4d9b-9013-78722160c095"}}},{"cell_type":"code","source":["from pyspark.sql import SparkSession\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.ml.evaluation import BinaryClassificationEvaluator"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1a1f1212-399f-4e8c-b359-6b5ab705e868"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data = spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/customer_churn.csv\",inferSchema=True, header=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"863f3d92-f705-4e79-83c2-37999f1a7d5b"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"43bf2bc1-905e-4b59-b1ad-6565fd3120f0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n |-- Churn: integer (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n |-- Churn: integer (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["data.describe().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d9a0a86d-1270-4d0d-a1c6-447b4ec2e67c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n|summary| Names| Age| Total_Purchase| Account_Manager| Years| Num_Sites| Location| Company| Churn|\n+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n| count| 900| 900| 900| 900| 900| 900| 900| 900| 900|\n| mean| null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777| null| null|0.16666666666666666|\n| stddev| null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| null| null| 0.3728852122772358|\n| min| Aaron King| 22.0| 100.0| 0| 1.0| 3.0|00103 Jeffrey Cre...| Abbott-Thompson| 0|\n| max|Zachary Walsh| 65.0| 18026.01| 1| 9.15| 14.0|Unit 9800 Box 287...|Zuniga, Clark and...| 1|\n+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n|summary| Names| Age| Total_Purchase| Account_Manager| Years| Num_Sites| Location| Company| Churn|\n+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n| count| 900| 900| 900| 900| 900| 900| 900| 900| 900|\n| mean| null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777| null| null|0.16666666666666666|\n| stddev| null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| null| null| 0.3728852122772358|\n| min| Aaron King| 22.0| 100.0| 0| 1.0| 3.0|00103 Jeffrey Cre...| Abbott-Thompson| 0|\n| max|Zachary Walsh| 65.0| 18026.01| 1| 9.15| 14.0|Unit 9800 Box 287...|Zuniga, Clark and...| 1|\n+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["data.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b704e482-fdb6-462c-83df-a798d1988e6a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[5]: ['Names',\n 'Age',\n 'Total_Purchase',\n 'Account_Manager',\n 'Years',\n 'Num_Sites',\n 'Onboard_date',\n 'Location',\n 'Company',\n 'Churn']","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[5]: ['Names',\n 'Age',\n 'Total_Purchase',\n 'Account_Manager',\n 'Years',\n 'Num_Sites',\n 'Onboard_date',\n 'Location',\n 'Company',\n 'Churn']"]}}],"execution_count":0},{"cell_type":"code","source":["assembler = VectorAssembler(inputCols=['Age',\n 'Total_Purchase',\n 'Account_Manager',\n 'Years',\n 'Num_Sites'],outputCol='features')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4464158d-f32f-43ba-9d2f-b9242e2f9c8a"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["output = assembler.transform(data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"41184427-2328-44b6-9393-0850ee05e36f"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["final_data = output.select('features','churn')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"743ae0b9-d687-4a28-91c8-66d491358afe"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["train_churn,test_churn = final_data.randomSplit([0.7,0.3])"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"93f1bd24-6e99-4181-b178-e48883c4eba4"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["lr_churn = LogisticRegression(labelCol='churn')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"40bc32ba-a860-49c3-88ec-4f451128719c"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["fitted_churn_model = lr_churn.fit(train_churn)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dd231fd3-e2a2-44c8-a0df-7814b286e6a2"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["training_sum = fitted_churn_model.summary"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"279b805c-3679-4eb0-a132-d6a95eb00175"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["training_sum.predictions.describe().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"86b1914a-5007-43c7-b492-a058d0e103e2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"/databricks/spark/python/pyspark/sql/context.py:134: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n warnings.warn(\n+-------+------------------+------------------+\n|summary| churn| prediction|\n+-------+------------------+------------------+\n| count| 621| 621|\n| mean|0.1642512077294686|0.1143317230273752|\n| stddev|0.3708020444222667| 0.318470254004318|\n| min| 0.0| 0.0|\n| max| 1.0| 1.0|\n+-------+------------------+------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["/databricks/spark/python/pyspark/sql/context.py:134: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n warnings.warn(\n+-------+------------------+------------------+\n|summary| churn| prediction|\n+-------+------------------+------------------+\n| count| 621| 621|\n| mean|0.1642512077294686|0.1143317230273752|\n| stddev|0.3708020444222667| 0.318470254004318|\n| min| 0.0| 0.0|\n| max| 1.0| 1.0|\n+-------+------------------+------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["pred_and_labels = fitted_churn_model.evaluate(test_churn)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6dadccd4-2b6c-4492-a608-e4557e5cba1b"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["pred_and_labels.predictions.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"accb1e69-c64e-448c-9cab-bacebccb8f94"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+-----+--------------------+--------------------+----------+\n| features|churn| rawPrediction| probability|prediction|\n+--------------------+-----+--------------------+--------------------+----------+\n|[22.0,11254.38,1....| 0|[4.42610676267903...|[0.98818040771998...| 0.0|\n|[25.0,9672.03,0.0...| 0|[4.38649810533113...|[0.98770872408802...| 0.0|\n|[27.0,8628.8,1.0,...| 0|[4.99547660560702...|[0.99327701009202...| 0.0|\n|[28.0,11128.95,1....| 0|[3.98417417702396...|[0.98173212040432...| 0.0|\n|[29.0,8688.17,1.0...| 1|[2.44899083414891...|[0.92048762114072...| 0.0|\n|[30.0,8403.78,1.0...| 0|[5.58181352981596...|[0.99624839746366...| 0.0|\n|[30.0,10183.98,1....| 0|[2.76467297450614...|[0.94073669499285...| 0.0|\n|[30.0,12788.37,0....| 0|[2.71301102813230...|[0.93779004336492...| 0.0|\n|[30.0,13473.35,0....| 0|[3.02144798568538...|[0.95353372376920...| 0.0|\n|[31.0,8829.83,1.0...| 0|[4.18829744900902...|[0.98505465797648...| 0.0|\n|[31.0,10058.87,1....| 0|[3.97785919321561...|[0.98161852124201...| 0.0|\n|[31.0,11743.24,0....| 0|[6.36096097807910...|[0.99827527419186...| 0.0|\n|[31.0,12264.68,1....| 0|[3.33131327531714...|[0.96548755656129...| 0.0|\n|[32.0,6367.22,1.0...| 0|[3.02667356283644...|[0.95376470631089...| 0.0|\n|[32.0,8011.38,0.0...| 0|[1.89658424424186...|[0.86950444082524...| 0.0|\n|[32.0,8575.71,0.0...| 0|[3.69300591894818...|[0.97570775434429...| 0.0|\n|[32.0,8617.98,1.0...| 1|[0.86853858529779...|[0.70444151703483...| 0.0|\n|[32.0,11540.86,0....| 0|[6.70293126734090...|[0.99877419538724...| 0.0|\n|[32.0,11715.72,0....| 0|[3.40964625291369...|[0.96800464819343...| 0.0|\n|[32.0,12403.6,0.0...| 0|[5.42166677112224...|[0.99559967468850...| 0.0|\n+--------------------+-----+--------------------+--------------------+----------+\nonly showing top 20 rows\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+--------------------+-----+--------------------+--------------------+----------+\n| features|churn| rawPrediction| probability|prediction|\n+--------------------+-----+--------------------+--------------------+----------+\n|[22.0,11254.38,1....| 0|[4.42610676267903...|[0.98818040771998...| 0.0|\n|[25.0,9672.03,0.0...| 0|[4.38649810533113...|[0.98770872408802...| 0.0|\n|[27.0,8628.8,1.0,...| 0|[4.99547660560702...|[0.99327701009202...| 0.0|\n|[28.0,11128.95,1....| 0|[3.98417417702396...|[0.98173212040432...| 0.0|\n|[29.0,8688.17,1.0...| 1|[2.44899083414891...|[0.92048762114072...| 0.0|\n|[30.0,8403.78,1.0...| 0|[5.58181352981596...|[0.99624839746366...| 0.0|\n|[30.0,10183.98,1....| 0|[2.76467297450614...|[0.94073669499285...| 0.0|\n|[30.0,12788.37,0....| 0|[2.71301102813230...|[0.93779004336492...| 0.0|\n|[30.0,13473.35,0....| 0|[3.02144798568538...|[0.95353372376920...| 0.0|\n|[31.0,8829.83,1.0...| 0|[4.18829744900902...|[0.98505465797648...| 0.0|\n|[31.0,10058.87,1....| 0|[3.97785919321561...|[0.98161852124201...| 0.0|\n|[31.0,11743.24,0....| 0|[6.36096097807910...|[0.99827527419186...| 0.0|\n|[31.0,12264.68,1....| 0|[3.33131327531714...|[0.96548755656129...| 0.0|\n|[32.0,6367.22,1.0...| 0|[3.02667356283644...|[0.95376470631089...| 0.0|\n|[32.0,8011.38,0.0...| 0|[1.89658424424186...|[0.86950444082524...| 0.0|\n|[32.0,8575.71,0.0...| 0|[3.69300591894818...|[0.97570775434429...| 0.0|\n|[32.0,8617.98,1.0...| 1|[0.86853858529779...|[0.70444151703483...| 0.0|\n|[32.0,11540.86,0....| 0|[6.70293126734090...|[0.99877419538724...| 0.0|\n|[32.0,11715.72,0....| 0|[3.40964625291369...|[0.96800464819343...| 0.0|\n|[32.0,12403.6,0.0...| 0|[5.42166677112224...|[0.99559967468850...| 0.0|\n+--------------------+-----+--------------------+--------------------+----------+\nonly showing top 20 rows\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n labelCol='churn')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"83b76174-1352-4e97-9ae6-c0a33d0580f0"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["auc = churn_eval.evaluate(pred_and_labels.predictions)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0e562c56-4b46-4ec8-8802-bc90394e21f5"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["auc"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"89b2bc5b-25d4-4c83-945e-fc41c6cd7473"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[24]: 0.7804383116883116","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[24]: 0.7804383116883116"]}}],"execution_count":0},{"cell_type":"markdown","source":["Predict on new data"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"fdd7adad-9003-4ca8-83a0-1af695a70c86"}}},{"cell_type":"code","source":["final_lr_model = lr_churn.fit(final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ebb87dbc-5d29-465f-81d0-70284b069b0f"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["new_customers = spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/new_customers.csv\",inferSchema=True, header=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2286367c-7b98-4261-844a-56857d4ecb4f"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["new_customers.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"46ee32df-7498-463b-b50a-1518ce74fe50"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["test_new_customers = assembler.transform(new_customers)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"08dbdfe9-d308-4075-b29f-a65d2596aaae"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["test_new_customers.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"06fc5ff9-e25a-4574-a63e-6508d115500e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n |-- features: vector (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- Names: string (nullable = true)\n |-- Age: double (nullable = true)\n |-- Total_Purchase: double (nullable = true)\n |-- Account_Manager: integer (nullable = true)\n |-- Years: double (nullable = true)\n |-- Num_Sites: double (nullable = true)\n |-- Onboard_date: timestamp (nullable = true)\n |-- Location: string (nullable = true)\n |-- Company: string (nullable = true)\n |-- features: vector (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["final_results = final_lr_model.transform(test_new_customers)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a8ed5fb5-a680-4b62-9536-19a716557260"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["final_results.select('Company','prediction').show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4c8b01ea-3740-4f7b-9337-8111e3be8b55"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------------+----------+\n| Company|prediction|\n+----------------+----------+\n| King Ltd| 0.0|\n| Cannon-Benson| 1.0|\n|Barron-Robertson| 1.0|\n| Sexton-Golden| 1.0|\n| Wood LLC| 0.0|\n| Parks-Robbins| 1.0|\n+----------------+----------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+----------------+----------+\n| Company|prediction|\n+----------------+----------+\n| King Ltd| 0.0|\n| Cannon-Benson| 1.0|\n|Barron-Robertson| 1.0|\n| Sexton-Golden| 1.0|\n| Wood LLC| 0.0|\n| Parks-Robbins| 1.0|\n+----------------+----------+\n\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"ml-logistic-regression-case","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":3598965581908901}},"nbformat":4,"nbformat_minor":0}