{"cells":[{"cell_type":"markdown","source":["In this project we will build a spam detection filter. The dataset consists of volunteered text messages from a study in Singapore and some spam texts from a UK reporting site. The data comes from UCI Repository SMS Spam Detection: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3971be17-18aa-4ceb-9a00-b2a5651f46eb"}}},
{"cell_type":"code","source":["from pyspark.sql import SparkSession"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"066ad7a9-30be-45f7-a01b-0cab96eaa5e1"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Attach to the running Spark session, or start one named 'nlp' if none exists.\nspark = (\n    SparkSession.builder\n    .appName('nlp')\n    .getOrCreate()\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"945363c9-5a5b-4585-801b-ed2acc397c69"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# No header option is given, so the two tab-separated columns load as _c0 and _c1.\ndata = spark.read.csv(\n    \"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/SMSSpamCollection\",\n    sep='\\t',\n    inferSchema=True,\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b79bbeea-75e3-418c-ad9b-e99b2ca022c7"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Swap Spark's positional column names for descriptive ones.\ndata = (\n    data\n    .withColumnRenamed('_c0', 'class')\n    .withColumnRenamed('_c1', 'text')\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"faa3143f-9bfe-4ad9-b172-ce449c780d0d"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["data.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"53d73d81-f993-4633-93b1-9ab1bc6455de"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----+--------------------+\n|class| 
text|\n+-----+--------------------+\n| ham|Go until jurong p...|\n| ham|Ok lar... Joking ...|\n| spam|Free entry in 2 a...|\n| ham|U dun say so earl...|\n| ham|Nah I don't think...|\n| spam|FreeMsg Hey there...|\n| ham|Even my brother i...|\n| ham|As per your reque...|\n| spam|WINNER!! As a val...|\n| spam|Had your mobile 1...|\n| ham|I'm gonna be home...|\n| spam|SIX chances to wi...|\n| spam|URGENT! You have ...|\n| ham|I've been searchi...|\n| ham|I HAVE A DATE ON ...|\n| spam|XXXMobileMovieClu...|\n| ham|Oh k...i'm watchi...|\n| ham|Eh u remember how...|\n| ham|Fine if that’s th...|\n| spam|England v Macedon...|\n+-----+--------------------+\nonly showing top 20 rows\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----+--------------------+\n|class| text|\n+-----+--------------------+\n| ham|Go until jurong p...|\n| ham|Ok lar... Joking ...|\n| spam|Free entry in 2 a...|\n| ham|U dun say so earl...|\n| ham|Nah I don't think...|\n| spam|FreeMsg Hey there...|\n| ham|Even my brother i...|\n| ham|As per your reque...|\n| spam|WINNER!! As a val...|\n| spam|Had your mobile 1...|\n| ham|I'm gonna be home...|\n| spam|SIX chances to wi...|\n| spam|URGENT! 
You have ...|\n| ham|I've been searchi...|\n| ham|I HAVE A DATE ON ...|\n| spam|XXXMobileMovieClu...|\n| ham|Oh k...i'm watchi...|\n| ham|Eh u remember how...|\n| ham|Fine if that’s th...|\n| spam|England v Macedon...|\n+-----+--------------------+\nonly showing top 20 rows\n\n"]}}],"execution_count":0},
{"cell_type":"markdown","source":["Clean the data"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1043b1f1-cc25-41d0-a762-c9d410ed3141"}}},
{"cell_type":"code","source":["from pyspark.sql.functions import length"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3db56346-b105-4cfa-ac01-c75fcdc1c5b7"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Add each message's character count as a numeric column.\ndata = data.withColumn('length', length(data.text))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"61369798-0a54-4269-8ed3-c0ecd01c66bd"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["data.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a19c18f3-08ce-40b6-acc9-dde3c1e1d166"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----+--------------------+------+\n|class| text|length|\n+-----+--------------------+------+\n| ham|Go until jurong p...| 111|\n| ham|Ok lar... Joking ...| 29|\n| spam|Free entry in 2 a...| 155|\n| ham|U dun say so earl...| 49|\n| ham|Nah I don't think...| 61|\n| spam|FreeMsg Hey there...| 147|\n| ham|Even my brother i...| 77|\n| ham|As per your reque...| 160|\n| spam|WINNER!! As a val...| 157|\n| spam|Had your mobile 1...| 154|\n| ham|I'm gonna be home...| 109|\n| spam|SIX chances to wi...| 136|\n| spam|URGENT! You have ...| 155|\n| ham|I've been searchi...| 196|\n| ham|I HAVE A DATE ON ...| 35|\n| spam|XXXMobileMovieClu...| 149|\n| ham|Oh k...i'm watchi...| 26|\n| ham|Eh u remember how...| 81|\n| ham|Fine if that’s th...| 56|\n| spam|England v Macedon...| 155|\n+-----+--------------------+------+\nonly showing top 20 rows\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----+--------------------+------+\n|class| text|length|\n+-----+--------------------+------+\n| ham|Go until jurong p...| 111|\n| ham|Ok lar... Joking ...| 29|\n| spam|Free entry in 2 a...| 155|\n| ham|U dun say so earl...| 49|\n| ham|Nah I don't think...| 61|\n| spam|FreeMsg Hey there...| 147|\n| ham|Even my brother i...| 77|\n| ham|As per your reque...| 160|\n| spam|WINNER!! As a val...| 157|\n| spam|Had your mobile 1...| 154|\n| ham|I'm gonna be home...| 109|\n| spam|SIX chances to wi...| 136|\n| spam|URGENT! You have ...| 155|\n| ham|I've been searchi...| 196|\n| ham|I HAVE A DATE ON ...| 35|\n| spam|XXXMobileMovieClu...| 149|\n| ham|Oh k...i'm watchi...| 26|\n| ham|Eh u remember how...| 81|\n| ham|Fine if that’s th...| 56|\n| spam|England v Macedon...| 155|\n+-----+--------------------+------+\nonly showing top 20 rows\n\n"]}}],"execution_count":0},
{"cell_type":"code","source":["# Per-class average of the numeric columns (only length here).\ndata.groupBy('class').avg().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"54ebb39d-a639-4f41-a15c-acf4ed9b87df"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----+-----------------+\n|class| avg(length)|\n+-----+-----------------+\n| ham| 71.4545266210897|\n| spam|138.6706827309237|\n+-----+-----------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----+-----------------+\n|class| 
avg(length)|\n+-----+-----------------+\n| ham| 71.4545266210897|\n| spam|138.6706827309237|\n+-----+-----------------+\n\n"]}}],"execution_count":0},
{"cell_type":"markdown","source":["Feature Transformations"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9c094676-23b0-4131-9fb3-d30d8c6f8506"}}},
{"cell_type":"code","source":["from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.linalg import Vector"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"59063a10-2dd4-4a79-ad79-d2215c68b6f4"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Text stages, in order: split into tokens, drop stop words, count terms, weight by IDF.\ntokenizer = Tokenizer(inputCol='text', outputCol='token_text')\nstopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')\ncount_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')\nidf = IDF(inputCol='c_vec', outputCol='tf_idf')\n\n# Index the string class column into a numeric label column.\nham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b2ad6e8d-679e-4fe7-a32a-7db82d95ed66"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Feature vector = TF-IDF terms followed by the message length.\nclean_up = VectorAssembler(\n    inputCols=['tf_idf', 'length'],\n    outputCol='features',\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d471046b-541f-4d5d-906e-cb9ad0af58f4"}},"outputs":[],"execution_count":0},
{"cell_type":"markdown","source":["Modeling"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a94f485a-a093-4b3b-ab01-ab960b958281"}}},
{"cell_type":"code","source":["from pyspark.ml.classification import NaiveBayes"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dae69634-8b3c-4e9a-bcb3-4d3f919ded01"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["nb = NaiveBayes()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cabe84ca-140e-49ff-ac56-0bc93d077b98"}},"outputs":[],"execution_count":0},
{"cell_type":"markdown","source":["Pipeline"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b6a2cb29-b316-455f-9a2e-0da8d93a0b1c"}}},
{"cell_type":"code","source":["from pyspark.ml import Pipeline"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3da04b6c-dc57-45fb-9c69-cf7f257f1d8a"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Label indexing first, then the text stages, then feature assembly.\ndata_prep_pipe = Pipeline(\n    stages=[\n        ham_spam_to_num,\n        tokenizer,\n        stopremove,\n        count_vec,\n        idf,\n        clean_up,\n    ]\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a2bb4f89-7f40-4561-9dcb-c23333387c67"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# NOTE(review): fitted on the full dataset, before the train/test split made\n# later in the notebook, so the vocabulary and IDF weights also see test rows.\ncleaner = data_prep_pipe.fit(data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"120ec1ff-198f-4b87-83a6-31e787cc7cc7"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["clean_data = cleaner.transform(data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4551199a-aea9-445f-bd77-661bc009bb71"}},"outputs":[],"execution_count":0},
{"cell_type":"markdown","source":["Training and Evaluation"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b33a4903-bc8c-4d68-9636-7e3d33c81b7b"}}},
{"cell_type":"code","source":["from pyspark.ml.evaluation import MulticlassClassificationEvaluator"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"87ed2f30-2ff7-49e3-ab56-4327d52d5c45"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Keep only the label and feature columns.\nclean_data = clean_data.select('label', 'features')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"40604f0c-3b1a-41ed-a5fd-771f8f8b0a87"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["clean_data.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f7eb1f82-300c-4bb0-91b2-9d0fabe450e2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----+--------------------+\n|label| features|\n+-----+--------------------+\n| 0.0|(13424,[7,11,31,6...|\n| 0.0|(13424,[0,24,297,...|\n| 1.0|(13424,[2,13,19,3...|\n| 0.0|(13424,[0,70,80,1...|\n| 0.0|(13424,[36,134,31...|\n| 1.0|(13424,[10,60,139...|\n| 0.0|(13424,[10,53,103...|\n| 0.0|(13424,[125,184,4...|\n| 1.0|(13424,[1,47,118,...|\n| 1.0|(13424,[0,1,13,27...|\n| 0.0|(13424,[18,43,120...|\n| 1.0|(13424,[8,17,37,8...|\n| 1.0|(13424,[13,30,47,...|\n| 0.0|(13424,[39,96,217...|\n| 0.0|(13424,[552,1697,...|\n| 1.0|(13424,[30,109,11...|\n| 0.0|(13424,[82,214,47...|\n| 0.0|(13424,[0,2,49,13...|\n| 0.0|(13424,[0,74,105,...|\n| 1.0|(13424,[4,30,33,5...|\n+-----+--------------------+\nonly showing top 20 rows\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----+--------------------+\n|label| features|\n+-----+--------------------+\n| 0.0|(13424,[7,11,31,6...|\n| 0.0|(13424,[0,24,297,...|\n| 1.0|(13424,[2,13,19,3...|\n| 0.0|(13424,[0,70,80,1...|\n| 0.0|(13424,[36,134,31...|\n| 1.0|(13424,[10,60,139...|\n| 0.0|(13424,[10,53,103...|\n| 0.0|(13424,[125,184,4...|\n| 1.0|(13424,[1,47,118,...|\n| 
1.0|(13424,[0,1,13,27...|\n| 0.0|(13424,[18,43,120...|\n| 1.0|(13424,[8,17,37,8...|\n| 1.0|(13424,[13,30,47,...|\n| 0.0|(13424,[39,96,217...|\n| 0.0|(13424,[552,1697,...|\n| 1.0|(13424,[30,109,11...|\n| 0.0|(13424,[82,214,47...|\n| 0.0|(13424,[0,2,49,13...|\n| 0.0|(13424,[0,74,105,...|\n| 1.0|(13424,[4,30,33,5...|\n+-----+--------------------+\nonly showing top 20 rows\n\n"]}}],"execution_count":0},
{"cell_type":"code","source":["# Fixed seed: without one, every run draws a different 70/30 split, so the\n# fitted model and its test metrics are not repeatable.\n(training, testing) = clean_data.randomSplit([0.7, 0.3], seed=42)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"84a385ed-6987-441c-bdbf-a09878b693c1"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Train the Naive Bayes classifier on the training split only.\nspam_predictor = nb.fit(training)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"780ee44b-bddd-4bc8-b09d-88b7aab4452b"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["# Schema of the raw frame (before the feature pipeline), for reference.\ndata.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"46c494af-eb8e-4b7e-bd2a-88a164c158d6"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- class: string (nullable = true)\n |-- text: string (nullable = true)\n |-- length: integer (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- class: string (nullable = true)\n |-- text: string (nullable = true)\n |-- length: integer (nullable = true)\n\n"]}}],"execution_count":0},
{"cell_type":"code","source":["# Score the held-out split; adds rawPrediction, probability and prediction columns.\ntest_results = spam_predictor.transform(testing)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ead1df76-653c-4052-8283-64c07251424a"}},"outputs":[],"execution_count":0},
{"cell_type":"code","source":["test_results.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a100cdc1-de44-4838-b0f2-9441dc05e6b7"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----+--------------------+--------------------+--------------------+----------+\n|label| features| rawPrediction| probability|prediction|\n+-----+--------------------+--------------------+--------------------+----------+\n| 0.0|(13424,[0,1,2,41,...|[-1058.4236651267...|[1.0,2.0218993518...| 0.0|\n| 0.0|(13424,[0,1,14,31...|[-216.37757434172...|[1.0,4.6921539302...| 0.0|\n| 0.0|(13424,[0,1,15,20...|[-686.30933948850...|[1.0,4.7480435449...| 0.0|\n| 0.0|(13424,[0,1,21,27...|[-1009.2354637663...|[1.0,3.4420307844...| 0.0|\n| 0.0|(13424,[0,1,24,31...|[-341.12632144915...|[1.0,2.9801386551...| 0.0|\n| 0.0|(13424,[0,1,27,88...|[-1545.8052318883...|[0.99674540942030...| 0.0|\n| 0.0|(13424,[0,1,30,12...|[-612.31523960048...|[1.0,1.7729020150...| 0.0|\n| 0.0|(13424,[0,1,146,1...|[-250.28159301013...|[0.94419859225993...| 0.0|\n| 0.0|(13424,[0,1,874,1...|[-96.083169031673...|[0.99999996864916...| 0.0|\n| 0.0|(13424,[0,1,874,1...|[-97.770226263077...|[0.99999997561199...| 0.0|\n| 0.0|(13424,[0,2,3,6,9...|[-3296.5916245644...|[1.0,3.8692072210...| 0.0|\n| 0.0|(13424,[0,2,4,5,1...|[-2484.9794043685...|[1.0,3.2695873089...| 0.0|\n| 0.0|(13424,[0,2,4,5,1...|[-1610.4821878915...|[1.0,3.9135371465...| 0.0|\n| 0.0|(13424,[0,2,4,8,1...|[-1313.1520641704...|[1.0,5.4727792597...| 0.0|\n| 0.0|(13424,[0,2,4,8,2...|[-563.23303848389...|[1.0,9.1756624825...| 0.0|\n| 0.0|(13424,[0,2,4,25,...|[-425.86125539239...|[1.0,2.9319383384...| 0.0|\n| 
0.0|(13424,[0,2,4,44,...|[-1892.9037879435...|[1.0,5.2089698504...| 0.0|\n| 0.0|(13424,[0,2,7,8,1...|[-469.57902691008...|[0.99999999433492...| 0.0|\n| 0.0|(13424,[0,2,7,11,...|[-727.83890019230...|[1.0,2.9949190670...| 0.0|\n| 0.0|(13424,[0,2,7,11,...|[-1411.8150184110...|[1.0,6.3381546067...| 0.0|\n+-----+--------------------+--------------------+--------------------+----------+\nonly showing top 20 rows\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----+--------------------+--------------------+--------------------+----------+\n|label| features| rawPrediction| probability|prediction|\n+-----+--------------------+--------------------+--------------------+----------+\n| 0.0|(13424,[0,1,2,41,...|[-1058.4236651267...|[1.0,2.0218993518...| 0.0|\n| 0.0|(13424,[0,1,14,31...|[-216.37757434172...|[1.0,4.6921539302...| 0.0|\n| 0.0|(13424,[0,1,15,20...|[-686.30933948850...|[1.0,4.7480435449...| 0.0|\n| 0.0|(13424,[0,1,21,27...|[-1009.2354637663...|[1.0,3.4420307844...| 0.0|\n| 0.0|(13424,[0,1,24,31...|[-341.12632144915...|[1.0,2.9801386551...| 0.0|\n| 0.0|(13424,[0,1,27,88...|[-1545.8052318883...|[0.99674540942030...| 0.0|\n| 0.0|(13424,[0,1,30,12...|[-612.31523960048...|[1.0,1.7729020150...| 0.0|\n| 0.0|(13424,[0,1,146,1...|[-250.28159301013...|[0.94419859225993...| 0.0|\n| 0.0|(13424,[0,1,874,1...|[-96.083169031673...|[0.99999996864916...| 0.0|\n| 0.0|(13424,[0,1,874,1...|[-97.770226263077...|[0.99999997561199...| 0.0|\n| 0.0|(13424,[0,2,3,6,9...|[-3296.5916245644...|[1.0,3.8692072210...| 0.0|\n| 0.0|(13424,[0,2,4,5,1...|[-2484.9794043685...|[1.0,3.2695873089...| 0.0|\n| 0.0|(13424,[0,2,4,5,1...|[-1610.4821878915...|[1.0,3.9135371465...| 0.0|\n| 0.0|(13424,[0,2,4,8,1...|[-1313.1520641704...|[1.0,5.4727792597...| 0.0|\n| 0.0|(13424,[0,2,4,8,2...|[-563.23303848389...|[1.0,9.1756624825...| 0.0|\n| 0.0|(13424,[0,2,4,25,...|[-425.86125539239...|[1.0,2.9319383384...| 0.0|\n| 
0.0|(13424,[0,2,4,44,...|[-1892.9037879435...|[1.0,5.2089698504...| 0.0|\n| 0.0|(13424,[0,2,7,8,1...|[-469.57902691008...|[0.99999999433492...| 0.0|\n| 0.0|(13424,[0,2,7,11,...|[-727.83890019230...|[1.0,2.9949190670...| 0.0|\n| 0.0|(13424,[0,2,7,11,...|[-1411.8150184110...|[1.0,6.3381546067...| 0.0|\n+-----+--------------------+--------------------+--------------------+----------+\nonly showing top 20 rows\n\n"]}}],"execution_count":0},
{"cell_type":"code","source":["# MulticlassClassificationEvaluator defaults to metricName='f1'; request accuracy\n# explicitly so the value matches the label printed below.\nacc_eval = MulticlassClassificationEvaluator(metricName='accuracy')\nacc = acc_eval.evaluate(test_results)\nprint(\"Accuracy of model at predicting spam was: {}\".format(acc))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e09f0600-15ac-4f9a-9a2c-581d90dddaee"}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"ml-nlp-case","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":1680652028931063}},"nbformat":4,"nbformat_minor":0}