<!DOCTYPE html> <html> <head> <meta name="databricks-html-version" content="1"> <title>022_TweetCollector - Databricks</title> <meta charset="utf-8"> <meta name="google" content="notranslate"> <meta http-equiv="Content-Language" content="en"> <meta http-equiv="Content-Type" content="text/html; charset=UTF8"> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/jquery-ui-bundle/jquery-ui.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/main.css"> <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/print.css" media="print"> <link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/img/favicon.ico"/> <script>window.settings = {"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-sydney.s3.amazonaws.com","enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":false,"nameAndEmail":"Raazesh Sainudiin 
(r.sainudiin@math.canterbury.ac.nz)","enablePresentationTimerConfig":true,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). 
Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. 
Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0","packageLabel":"spark-1.3-jenkins-ip-10-30-9-162-U0c2673ac85-Sa2ee4664b2-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1","packageLabel":"spark-1.4-jenkins-ip-10-30-9-162-U0c2673ac85-S33a1e4b9c6-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.0","packageLabel":"spark-1.6-jenkins-ip-10-30-9-162-U0c2673ac85-Scabba801f3-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"master","displayName":"Spark master (dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"defaultNumWorkers":8,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.12.3","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":false,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},"clusterPublisherRootId":5,"enableLatestJobRunResultPermalink":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"3f4d8617-8d0d-47dd-a072-38dbe25947da","useFixedStaticNotebookVersionForDevelopment":false,"enableBasicReactDialogBoxes":true,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableWorkspaceAclService":true,"someName":"Raazesh Sainudiin","enableWorkspaceAcls":true,"gitHash":"0c2673ac858e227cad536fdb45d140aeded238db","userFullname":"Raazesh Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var 
__DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":89432,"name":"022_TweetCollector","language":"scala","commands":[{"version":"CommandV1","origId":89434,"guid":"864e6246-5187-45ab-b740-063e63aa7f67","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a9e8898a-9e09-4ed2-9dd4-cf34ddbb5534"},{"version":"CommandV1","origId":129732,"guid":"9605a4c4-1877-4fbd-9b7a-b9dff4cee340","subtype":"command","commandType":"auto","position":0.75,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/week6/12_SparkStreaming/022_TweetCollector.html) of this databricks notebook and its recorded Uji 
:\n\n[](https://www.youtube.com/v/jqLcr2eS-Vs?rel=0&autoplay=1&modestbranding=1&start=2112&end=3535)\n","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f85caca0-c3f0-4c17-a3e4-2b8221079e7c"},{"version":"CommandV1","origId":89435,"guid":"701a2b71-3b70-4eae-abd8-b9913eff248e","subtype":"command","commandType":"auto","position":1.0,"command":"%md\n# Tweet Collector - capture live tweets\n\n### First let's take the twitter stream and write to DBFS as json files\n\n#### See the notebook 022_TweetGenericCollector (this notebook is not robust and it is only for demo)!!!","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d923b7cf-f5ce-495b-a298-1b533dd2203d"},{"version":"CommandV1","origId":89436,"guid":"de85f793-cfa0-4818-a2ed-32447344609f","subtype":"command","commandType":"auto","position":2.0,"command":"import org.apache.spark._\nimport org.apache.spark.storage._\nimport org.apache.spark.streaming._\nimport 
org.apache.spark.streaming.twitter.TwitterUtils\n\nimport twitter4j.auth.OAuthAuthorization\nimport twitter4j.conf.ConfigurationBuilder","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import org.apache.spark._\nimport org.apache.spark.storage._\nimport org.apache.spark.streaming._\nimport org.apache.spark.streaming.twitter.TwitterUtils\nimport twitter4j.auth.OAuthAuthorization\nimport twitter4j.conf.ConfigurationBuilder\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.45939350079E12,"submitTime":1.459393428534E12,"finishTime":1.459393501619E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"7693af39-c996-4062-b7c4-c38414d88925"},{"version":"CommandV1","origId":89437,"guid":"be4679b9-be15-455a-8e1a-ef232d594b48","subtype":"command","commandType":"auto","position":3.0,"command":"%md\nLet's create a directory in dbfs for storing tweets in the cluster's distributed file 
system.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5676bc5c-1c8f-4140-8e63-2a255cc60ae9"},{"version":"CommandV1","origId":89438,"guid":"0e7fff28-ad5a-43ed-9480-6b6527b7a388","subtype":"command","commandType":"auto","position":4.0,"command":"val rawTweetsDirectory=\"/rawTweets\"","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rawTweetsDirectory: String = /rawTweets\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392308054E12,"submitTime":1.459392310027E12,"finishTime":1.459392308288E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"ae7fa5c3-e9f9-4bd2-9f39-2137d324707f"},{"version":"CommandV1","origId":89439,"guid":"f61efec4-ddaf-4fa1-8b34-80d3fd4a9011","subtype":"command","commandType":"auto","position":5.0,"command":"dbutils.fs.rm(rawTweetsDirectory, true) // to remove a pre-existing directory and start from scratch uncomment and evaluate this 
cell","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.45939215638E12,"submitTime":1.459392158303E12,"finishTime":1.459392156469E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8f6e00b4-d978-4a0f-a4e7-4c107bb84a48"},{"version":"CommandV1","origId":89440,"guid":"f7db217b-3699-4e0c-adb6-ade0abf6f135","subtype":"command","commandType":"auto","position":5.5,"command":"%md\nCapture tweets in every sliding window of `slideInterval` many milliseconds.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"cc4f941d-a502-4952-b789-18bf1cbc426c"},{"version":"CommandV1","origId":89441,"guid":"7bda4f06-8650-4d09-afd2-11ab3c6d06af","subtype":"command","commandType":"auto","position":6.0,"command":"val slideInterval = new Duration(1 * 1000) // 1 * 1000 = 1000 milli-seconds = 1 sec","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div 
class=\"ansiout\">slideInterval: org.apache.spark.streaming.Duration = 1000 ms\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392311417E12,"submitTime":1.45939231339E12,"finishTime":1.459392311523E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fddd7c6a-6ce2-4934-9a13-5fd97af418e7"},{"version":"CommandV1","origId":89442,"guid":"08192b59-694b-49c1-bda1-86f973c7a2cf","subtype":"command","commandType":"auto","position":6.375,"command":"%md\nRecall that **Discretized Stream** or **DStream** is the basic abstraction provided\nby Spark Streaming. It represents a continuous stream of data, either\nthe input data stream received from source, or the processed data stream\ngenerated by transforming the input stream. Internally, a DStream is\nrepresented by a continuous series of RDDs, which is Spark's abstraction\nof an immutable, distributed dataset (see [Spark Programming\nGuide](http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds)\nfor more details). 
Each RDD in a DStream contains data from a certain\ninterval, as shown in the following figure.\n\n","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"74bbb11d-bcbb-43eb-bca8-2ea813e2baae"},{"version":"CommandV1","origId":89443,"guid":"6fafafe2-73a1-4374-bb43-65fa9602f09e","subtype":"command","commandType":"auto","position":6.5,"command":"%md\nLet's import Google's JSON library next.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5195b0e0-faf1-4d68-9070-329fdf615810"},{"version":"CommandV1","origId":89444,"guid":"ee052611-dfc2-49b5-ace7-cb80cf0e0739","subtype":"command","commandType":"auto","position":7.0,"command":"import com.google.gson.Gson // the Library has already been attached to this cluster (show live how to do this from scratch?)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import 
com.google.gson.Gson\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\"><console>:42: error: object gson is not a member of package com.google\n import com.google.gson.Gson // the Library has already been attached to this cluster\n ^\n</div>","error":null,"startTime":1.459392317419E12,"submitTime":1.459392319391E12,"finishTime":1.459392317493E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b738e0de-0f85-4556-9092-2110a77b69f9"},{"version":"CommandV1","origId":89445,"guid":"b54aa638-eb31-432b-bd95-f38dfb8250cb","subtype":"command","commandType":"auto","position":8.0,"command":"%md\nOur goal is to take each RDD in the twitter DStream and write it as a json file in our dbfs.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6ece55e7-166d-4dac-8deb-06a382252481"},{"version":"CommandV1","origId":89446,"guid":"f9fdd9d4-f63f-4a91-b3e0-9bd45f9174f9","subtype":"command","commandType":"auto","position":8.5,"command":"// Create a Spark Streaming Context.\nval ssc = 
new StreamingContext(sc, slideInterval)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">ssc: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@38a79766\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392344688E12,"submitTime":1.459392346659E12,"finishTime":1.45939234484E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"7b4b3c3e-9214-40e2-9826-84625c39d4b2"},{"version":"CommandV1","origId":89447,"guid":"8bdb127e-f14e-4d82-8f65-e50b8d36613c","subtype":"command","commandType":"auto","position":8.75,"command":"%md\n\nCAUTION: Extracting knowledge from tweets is \"easy\" using techniques shown here, but one has to take responsibility for the use of this knowledge and conform to the rules and policies linked below.\n\nRemember that the use of twitter itself comes with various strings attached. Read:\n\n- [Twitter Rules](https://twitter.com/rules)\n\n\nCrucially, the use of the content from twitter by you (as done in this worksheet) comes with some strings. 
Read:\n- [Developer Agreement & Policy Twitter Developer Agreement](https://dev.twitter.com/overview/terms/agreement-and-policy)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3f9e6bd4-5414-4d18-91fa-dd12734e4027"},{"version":"CommandV1","origId":89448,"guid":"246e768d-c550-4744-8616-8377d078b4f2","subtype":"command","commandType":"auto","position":8.875,"command":"%md\n\n### Enter your own Twitter API Credentials.\n* Go to https://apps.twitter.com and look up your Twitter API Credentials, or create an app to create them.\n* Run this cell for the input cells to appear.\n* Enter your credentials.\n* Run the cell again to pick up your defaults.\n\nThe cell-below is hidden to not expose the Twitter API Credentials: `consumerKey`, `consumerSecret`, `accessToken` and 
`accessTokenSecret`.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b5b2df02-eda2-4e7a-9051-b3f433b0e314"},{"version":"CommandV1","origId":89449,"guid":"78cfee13-8751-4fa5-85f6-bf14a1876271","subtype":"command","commandType":"auto","position":8.9375,"command":"System.setProperty(\"twitter4j.oauth.consumerKey\", getArgument(\"1. Consumer Key (API Key)\", \"\"))\nSystem.setProperty(\"twitter4j.oauth.consumerSecret\", getArgument(\"2. Consumer Secret (API Secret)\", \"\"))\nSystem.setProperty(\"twitter4j.oauth.accessToken\", getArgument(\"3. Access Token\", \"\"))\nSystem.setProperty(\"twitter4j.oauth.accessTokenSecret\", getArgument(\"4. Access Token Secret\", \"\"))","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"><console>:44: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n System.setProperty("twitter4j.oauth.consumerKey", getArgument("1. Consumer Key (API Key)", ""))\n ^\n<console>:45: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n System.setProperty("twitter4j.oauth.consumerSecret", getArgument("2. 
Consumer Secret (API Secret)", ""))\n ^\n<console>:46: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n System.setProperty("twitter4j.oauth.accessToken", getArgument("3. Access Token", ""))\n ^\n<console>:49: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n System.setProperty("twitter4j.oauth.accessTokenSecret", getArgument("4. Access Token Secret", ""))\n ^\nres0: String = ""\n</div>","arguments":{"1. Consumer Key (API Key)":"","2. Consumer Secret (API Secret)":"","3. Access Token":"","4. Access Token Secret":""},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459393508329E12,"submitTime":1.459393436074E12,"finishTime":1.459393508488E12,"collapsed":false,"bindings":{"1. Consumer Key (API Key)":"","2. Consumer Secret (API Secret)":"","3. Access Token":"","4. 
Access Token Secret":""},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":true,"iPythonMetadata":null,"nuid":"a4116ce4-0265-4ba0-a2a6-0f1fb032461b"},{"version":"CommandV1","origId":89450,"guid":"11bdc1c6-5f20-4afa-88b0-b68e14602f8f","subtype":"command","commandType":"auto","position":8.96875,"command":"%md\nIf you see warnings then ignore for now:\n[https://forums.databricks.com/questions/6941/change-in-getargument-for-notebook-input.html](https://forums.databricks.com/questions/6941/change-in-getargument-for-notebook-input.html).","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"05a3e971-c3df-4a14-9654-12c283f46c3b"},{"version":"CommandV1","origId":89451,"guid":"48257bf7-80d5-4d29-a430-6f6aa1ff547e","subtype":"command","commandType":"auto","position":9.0,"command":"// Create a Twitter Stream for the input source. 
\nval auth = Some(new OAuthAuthorization(new ConfigurationBuilder().build()))\nval twitterStream = TwitterUtils.createStream(ssc, auth)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">auth: Some[twitter4j.auth.OAuthAuthorization] = Some(OAuthAuthorization{consumerKey='9wfO7Yb2EABN519Cmlu7MpFcW', consumerSecret='******************************************', oauthToken=AccessToken{screenName='null', userId=4173723312}})\ntwitterStream: org.apache.spark.streaming.dstream.ReceiverInputDStream[twitter4j.Status] = org.apache.spark.streaming.twitter.TwitterInputDStream@4c3a146d\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392372149E12,"submitTime":1.459392374119E12,"finishTime":1.459392372324E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"81e29548-bab5-4a7d-8a83-a341b9c3b50c"},{"version":"CommandV1","origId":89452,"guid":"dc0a5d55-f616-4b07-b9e2-bf6c81ea86e0","subtype":"command","commandType":"auto","position":9.5,"command":"%md\nLet's map the tweets into json formatted string (one tweet per 
line).","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5650440b-98dc-43a2-bf46-03cb08284ff7"},{"version":"CommandV1","origId":89453,"guid":"7f0bfb41-30bd-40f8-8a97-13287c4dd520","subtype":"command","commandType":"auto","position":10.0,"command":"val twitterStreamJson = twitterStream.map(x => { val gson = new Gson();\n val xJson = gson.toJson(x)\n xJson\n }) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">twitterStreamJson: org.apache.spark.streaming.dstream.DStream[String] = org.apache.spark.streaming.dstream.MappedDStream@73ce2cd6\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392378257E12,"submitTime":1.459392380224E12,"finishTime":1.459392378415E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d7d8a538-899b-44c2-8dd6-a6c0b4b4852b"},{"version":"CommandV1","origId":89454,"guid":"e44daea1-99c6-464a-8578-52507c22fd2c","subtype":"command","commandType":"auto","position":11.0,"command":"var 
numTweetsCollected = 0L // track number of tweets collected\nval partitionsEachInterval = 1 // This tells the number of partitions in each RDD of tweets in the DStream.\n\ntwitterStreamJson.foreachRDD((rdd, time) => { // for each RDD in the DStream\n val count = rdd.count()\n if (count > 0) {\n val outputRDD = rdd.repartition(partitionsEachInterval) // repartition as desired\n outputRDD.saveAsTextFile(rawTweetsDirectory + \"/tweets_\" + time.milliseconds.toString) // save as textfile\n numTweetsCollected += count // update with the latest count\n }\n })","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">numTweetsCollected: Long = 0\npartitionsEachInterval: Int = 1\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\"><console>:66: error: not found: value partitionsEachInterval\n val outputRDD = rdd.repartition(partitionsEachInterval) // repartition as desired\n ^\n</div>","error":null,"startTime":1.459392391094E12,"submitTime":1.459392393069E12,"finishTime":1.45939239143E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"09c31682-d026-40e5-afa2-3f2643c1e802"},{"version":"CommandV1","origId":89455,"guid":"307a6d4f-7b52-4163-bc21-d05dbf7e928e","subtype":"command","commandType":"auto","position":12.0,"command":"%md \nNothing has actually happened yet.\n\nLet's start the spark streaming context we have created 
next.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"654ac060-6f01-469a-a2fe-0aed1e84f9d7"},{"version":"CommandV1","origId":89456,"guid":"753ad9af-723d-4f4e-b9f4-6a32b7f2636f","subtype":"command","commandType":"auto","position":13.0,"command":"ssc.start()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392397013E12,"submitTime":1.459392398985E12,"finishTime":1.459392397503E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5568e9fc-b6ef-4e2a-9066-9dcfc24578f1"},{"version":"CommandV1","origId":89457,"guid":"55ab75a4-9351-4c25-8355-fc97e1571562","subtype":"command","commandType":"auto","position":14.0,"command":"%md\nLet's look at the spark UI now and monitor the streaming job in action! 
Go to `Clusters` on the left and click on `UI` and then `Streaming`.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"11d90d42-a2d9-40f7-b7f0-7587a21d70bc"},{"version":"CommandV1","origId":89458,"guid":"a23671c5-b025-44b4-ac47-40a757671a2a","subtype":"command","commandType":"auto","position":15.0,"command":"numTweetsCollected // number of tweets collected so far","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res3: Long = 2478\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392454098E12,"submitTime":1.459392456069E12,"finishTime":1.459392454177E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b5d040a6-a192-4bb5-901e-a3d68421f0ae"},{"version":"CommandV1","origId":89459,"guid":"003aa033-b37e-41cf-929d-cd4e41430795","subtype":"command","commandType":"auto","position":15.5,"command":"%md\nNote that you could easily fill up disk space!!!\n\nSo let's stop the streaming job 
next.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"56f5b862-efd8-48c2-87ff-5f39284e2f7c"},{"version":"CommandV1","origId":89460,"guid":"0286ea9f-47b3-4f79-b3bf-1b1c147126b1","subtype":"command","commandType":"auto","position":16.0,"command":"ssc.stop(stopSparkContext = false) // gotto stop soon!!!","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392459676E12,"submitTime":1.459392461658E12,"finishTime":1.459392462221E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"72d5e3f6-0069-4773-86c8-3151797a2eb0"},{"version":"CommandV1","origId":89461,"guid":"b71c7128-8982-460c-80fa-6c9bb2540fdc","subtype":"command","commandType":"auto","position":17.0,"command":"%md\nLet's make sure that the `Streaming` UI is not active in the `Clusters` 
`UI`.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3ebf47c9-29ae-4484-b9a4-a1f0836174ad"},{"version":"CommandV1","origId":89462,"guid":"8cff10f1-66fb-4655-9603-f6395968ac7f","subtype":"command","commandType":"auto","position":18.0,"command":"StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) } // extra cautious stopping of all active streaming contexts","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392489543E12,"submitTime":1.45939249152E12,"finishTime":1.459392489668E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3299eef7-d7fb-4d15-abc7-88a81825243d"},{"version":"CommandV1","origId":89463,"guid":"562c141b-fed3-442b-b854-2216c313e428","subtype":"command","commandType":"auto","position":19.0,"command":"%md\n## Let's examine what was saved in 
dbfs","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6380e21e-9b0f-495f-afa2-4e2ccccb08b7"},{"version":"CommandV1","origId":89464,"guid":"04cc93b2-549d-419b-afc1-0e9a6b99567e","subtype":"command","commandType":"auto","position":20.0,"command":"display(dbutils.fs.ls(\"/rawTweets/\"))","commandVersion":0,"state":"finished","results":{"type":"table","data":[["dbfs:/rawTweets/tweets_1459392400000/","tweets_1459392400000/",0.0],["dbfs:/rawTweets/tweets_1459392401000/","tweets_1459392401000/",0.0],["dbfs:/rawTweets/tweets_1459392402000/","tweets_1459392402000/",0.0],["dbfs:/rawTweets/tweets_1459392403000/","tweets_1459392403000/",0.0],["dbfs:/rawTweets/tweets_1459392404000/","tweets_1459392404000/",0.0],["dbfs:/rawTweets/tweets_1459392405000/","tweets_1459392405000/",0.0],["dbfs:/rawTweets/tweets_1459392406000/","tweets_1459392406000/",0.0],["dbfs:/rawTweets/tweets_1459392407000/","tweets_1459392407000/",0.0],["dbfs:/rawTweets/tweets_1459392408000/","tweets_1459392408000/",0.0],["dbfs:/rawTweets/tweets_1459392409000/","tweets_1459392409000/",0.0],["dbfs:/rawTweets/tweets_1459392410000/","tweets_1459392410000/",0.0],["dbfs:/rawTweets/tweets_1459392411000/","tweets_1459392411000/",0.0],["dbfs:/rawTweets/tweets_1459392412000/","tweets_1459392412000/",0.0],["dbfs:/rawTweets/tweets_1459392413000/","tweets_1459392413000/",0.0],["dbfs:/rawTweets/tweets_1459392414000/","tweets_1459392414000/",0.0],["dbfs:/rawTweets/tweets_1459392415000/","tweets_14
59392415000/",0.0],["dbfs:/rawTweets/tweets_1459392416000/","tweets_1459392416000/",0.0],["dbfs:/rawTweets/tweets_1459392417000/","tweets_1459392417000/",0.0],["dbfs:/rawTweets/tweets_1459392418000/","tweets_1459392418000/",0.0],["dbfs:/rawTweets/tweets_1459392419000/","tweets_1459392419000/",0.0],["dbfs:/rawTweets/tweets_1459392420000/","tweets_1459392420000/",0.0],["dbfs:/rawTweets/tweets_1459392421000/","tweets_1459392421000/",0.0],["dbfs:/rawTweets/tweets_1459392422000/","tweets_1459392422000/",0.0],["dbfs:/rawTweets/tweets_1459392423000/","tweets_1459392423000/",0.0],["dbfs:/rawTweets/tweets_1459392424000/","tweets_1459392424000/",0.0],["dbfs:/rawTweets/tweets_1459392425000/","tweets_1459392425000/",0.0],["dbfs:/rawTweets/tweets_1459392426000/","tweets_1459392426000/",0.0],["dbfs:/rawTweets/tweets_1459392427000/","tweets_1459392427000/",0.0],["dbfs:/rawTweets/tweets_1459392428000/","tweets_1459392428000/",0.0],["dbfs:/rawTweets/tweets_1459392429000/","tweets_1459392429000/",0.0],["dbfs:/rawTweets/tweets_1459392430000/","tweets_1459392430000/",0.0],["dbfs:/rawTweets/tweets_1459392431000/","tweets_1459392431000/",0.0],["dbfs:/rawTweets/tweets_1459392432000/","tweets_1459392432000/",0.0],["dbfs:/rawTweets/tweets_1459392433000/","tweets_1459392433000/",0.0],["dbfs:/rawTweets/tweets_1459392434000/","tweets_1459392434000/",0.0],["dbfs:/rawTweets/tweets_1459392435000/","tweets_1459392435000/",0.0],["dbfs:/rawTweets/tweets_1459392436000/","tweets_1459392436000/",0.0],["dbfs:/rawTweets/tweets_1459392437000/","tweets_1459392437000/",0.0],["dbfs:/rawTweets/tweets_1459392438000/","tweets_1459392438000/",0.0],["dbfs:/rawTweets/tweets_1459392439000/","tweets_1459392439000/",0.0],["dbfs:/rawTweets/tweets_1459392440000/","tweets_1459392440000/",0.0],["dbfs:/rawTweets/tweets_1459392441000/","tweets_1459392441000/",0.0],["dbfs:/rawTweets/tweets_1459392442000/","tweets_1459392442000/",0.0],["dbfs:/rawTweets/tweets_1459392443000/","tweets_1459392443000/",0.0],["dbfs:/rawTweets/twe
ets_1459392444000/","tweets_1459392444000/",0.0],["dbfs:/rawTweets/tweets_1459392445000/","tweets_1459392445000/",0.0],["dbfs:/rawTweets/tweets_1459392446000/","tweets_1459392446000/",0.0],["dbfs:/rawTweets/tweets_1459392447000/","tweets_1459392447000/",0.0],["dbfs:/rawTweets/tweets_1459392448000/","tweets_1459392448000/",0.0],["dbfs:/rawTweets/tweets_1459392449000/","tweets_1459392449000/",0.0],["dbfs:/rawTweets/tweets_1459392450000/","tweets_1459392450000/",0.0],["dbfs:/rawTweets/tweets_1459392451000/","tweets_1459392451000/",0.0],["dbfs:/rawTweets/tweets_1459392452000/","tweets_1459392452000/",0.0],["dbfs:/rawTweets/tweets_1459392453000/","tweets_1459392453000/",0.0],["dbfs:/rawTweets/tweets_1459392454000/","tweets_1459392454000/",0.0],["dbfs:/rawTweets/tweets_1459392455000/","tweets_1459392455000/",0.0],["dbfs:/rawTweets/tweets_1459392456000/","tweets_1459392456000/",0.0],["dbfs:/rawTweets/tweets_1459392457000/","tweets_1459392457000/",0.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"path","type":"\"string\""},{"name":"name","type":"\"string\""},{"name":"size","type":"\"long\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":null,"error":null,"startTime":1.459392494866E12,"submitTime":1.459392496845E12,"finishTime":1.459392495389E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d80a25c8-7f53-4c89-9d80-db4a1033c74d"},{"version":"CommandV1","origId":89465,"
guid":"07bc0e4a-313d-49bc-8c1f-26ad5f871827","subtype":"command","commandType":"auto","position":20.5,"command":"val tweetsDir = \"/rawTweets/tweets_1459392400000/\" // use an existing file, may have to rename folder based on output above!","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">tweetsDir: String = /rawTweets/tweets_1459392400000/\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392670813E12,"submitTime":1.459392672783E12,"finishTime":1.459392670883E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"9cff61f1-2801-49ad-9991-ed8906ff709b"},{"version":"CommandV1","origId":89466,"guid":"675087b2-68e3-4e8b-bb07-aa89c1ad44a1","subtype":"command","commandType":"auto","position":21.0,"command":"display(dbutils.fs.ls(tweetsDir)) ","commandVersion":0,"state":"finished","results":{"type":"table","data":[["dbfs:/rawTweets/tweets_1459392400000/_SUCCESS","_SUCCESS",0.0],["dbfs:/rawTweets/tweets_1459392400000/part-00000","part-00000",140984.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"path","type":"\"string\""},{"name":"name","type":"\"string\""},{"name":"size","type":"\"long\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":"java.io.FileNotFoundException: /rawTweets/tweets_145939240000","error":"<div class=\"ansiout\">\tat 
com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:63)\n\tat com.databricks.backend.daemon.data.client.DbfsClient.sendIdempotent(DbfsClient.scala:40)\n\tat com.databricks.backend.daemon.data.client.DatabricksFileSystem.listStatus(DatabricksFileSystem.scala:184)\n\tat com.databricks.backend.daemon.dbutils.FSUtils$.ls(DBUtilsCore.scala:60)\n\tat com.databricks.dbutils_v1.impl.DbfsUtilsImpl.ls(DbfsUtilsImpl.scala:29)</div>","startTime":1.45939270062E12,"submitTime":1.459392702597E12,"finishTime":1.459392700861E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b0ab3475-1953-4e41-a668-59d4d687988f"},{"version":"CommandV1","origId":89467,"guid":"e1c79f6c-ca33-4914-abec-2534129c01e7","subtype":"command","commandType":"auto","position":25.0,"command":"sc.textFile(tweetsDir+\"part-00000\").count()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res11: Long = 
38\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392716408E12,"submitTime":1.459392718391E12,"finishTime":1.45939271681E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f711ede1-bd7e-4ec3-b794-9d02df2c16ff"},{"version":"CommandV1","origId":89468,"guid":"a26d1673-88a4-4a67-990a-a5cbcfb9ac38","subtype":"command","commandType":"auto","position":26.0,"command":"val outJson = sqlContext.read.json(tweetsDir+\"part-00000\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">outJson: org.apache.spark.sql.DataFrame = [contributorsIDs: array<string>, createdAt: string, currentUserRetweetId: bigint, extendedMediaEntities: array<struct<displayURL:string,end:bigint,expandedURL:string,id:bigint,mediaURL:string,mediaURLHttps:string,sizes:struct<0:struct<height:bigint,resize:bigint,width:bigint>,1:struct<height:bigint,resize:bigint,width:bigint>,2:struct<height:bigint,resize:bigint,width:bigint>,3:struct<height:bigint,resize:bigint,width:bigint>>,start:bigint,type:string,url:string,videoAspectRatioHeight:bigint,videoAspectRatioWidth:bigint,videoDurationMillis:bigint,videoVariants:array<struct<bitrate:bigint,contentType:string,url:string>>>>, favoriteCount: bigint, hashtagEntities: array<struct<end:bigint,start:bigint,text:string>>, id: bigint, inReplyToScreenName: string, inReplyToStatusId: bigint, inReplyToUserId: bigint, isFavorited: boolean, isPossiblySensitive: boolean, isRetweeted: boolean, isTruncated: boolean, lang: string, 
mediaEntities: array<struct<displayURL:string,end:bigint,expandedURL:string,id:bigint,mediaURL:string,mediaURLHttps:string,sizes:struct<0:struct<height:bigint,resize:bigint,width:bigint>,1:struct<height:bigint,resize:bigint,width:bigint>,2:struct<height:bigint,resize:bigint,width:bigint>,3:struct<height:bigint,resize:bigint,width:bigint>>,start:bigint,type:string,url:string>>, quotedStatus: struct<contributorsIDs:array<string>,createdAt:string,currentUserRetweetId:bigint,extendedMediaEntities:array<string>,favoriteCount:bigint,hashtagEntities:array<string>,id:bigint,inReplyToStatusId:bigint,inReplyToUserId:bigint,isFavorited:boolean,isPossiblySensitive:boolean,isRetweeted:boolean,isTruncated:boolean,lang:string,mediaEntities:array<string>,quotedStatusId:bigint,retweetCount:bigint,source:string,symbolEntities:array<string>,text:string,urlEntities:array<struct<displayURL:string,end:bigint,expandedURL:string,start:bigint,url:string>>,user:struct<createdAt:string,description:string,descriptionURLEntities:array<string>,favouritesCount:bigint,followersCount:bigint,friendsCount:bigint,id:bigint,isContributorsEnabled:boolean,isDefaultProfile:boolean,isDefaultProfileImage:boolean,isFollowRequestSent:boolean,isGeoEnabled:boolean,isProtected:boolean,isVerified:boolean,lang:string,listedCount:bigint,location:string,name:string,profileBackgroundColor:string,profileBackgroundImageUrl:string,profileBackgroundImageUrlHttps:string,profileBackgroundTiled:boolean,profileBannerImageUrl:string,profileImageUrl:string,profileImageUrlHttps:string,profileLinkColor:string,profileSidebarBorderColor:string,profileSidebarFillColor:string,profileTextColor:string,profileUseBackgroundImage:boolean,screenName:string,showAllInlineMedia:boolean,statusesCount:bigint,timeZone:string,translator:boolean,url:string,utcOffset:bigint>,userMentionEntities:array<string>>, quotedStatusId: bigint, retweetCount: bigint, retweetedStatus: 
struct<contributorsIDs:array<string>,createdAt:string,currentUserRetweetId:bigint,extendedMediaEntities:array<struct<displayURL:string,end:bigint,expandedURL:string,id:bigint,mediaURL:string,mediaURLHttps:string,sizes:struct<0:struct<height:bigint,resize:bigint,width:bigint>,1:struct<height:bigint,resize:bigint,width:bigint>,2:struct<height:bigint,resize:bigint,width:bigint>,3:struct<height:bigint,resize:bigint,width:bigint>>,start:bigint,type:string,url:string,videoAspectRatioHeight:bigint,videoAspectRatioWidth:bigint,videoDurationMillis:bigint,videoVariants:array<struct<bitrate:bigint,contentType:string,url:string>>>>,favoriteCount:bigint,hashtagEntities:array<struct<end:bigint,start:bigint,text:string>>,id:bigint,inReplyToScreenName:string,inReplyToStatusId:bigint,inReplyToUserId:bigint,isFavorited:boolean,isPossiblySensitive:boolean,isRetweeted:boolean,isTruncated:boolean,lang:string,mediaEntities:array<struct<displayURL:string,end:bigint,expandedURL:string,id:bigint,mediaURL:string,mediaURLHttps:string,sizes:struct<0:struct<height:bigint,resize:bigint,width:bigint>,1:struct<height:bigint,resize:bigint,width:bigint>,2:struct<height:bigint,resize:bigint,width:bigint>,3:struct<height:bigint,resize:bigint,width:bigint>>,start:bigint,type:string,url:string>>,quotedStatus:struct<contributorsIDs:array<string>,createdAt:string,currentUserRetweetId:bigint,extendedMediaEntities:array<string>,favoriteCount:bigint,hashtagEntities:array<string>,id:bigint,inReplyToStatusId:bigint,inReplyToUserId:bigint,isFavorited:boolean,isPossiblySensitive:boolean,isRetweeted:boolean,isTruncated:boolean,lang:string,mediaEntities:array<string>,quotedStatusId:bigint,retweetCount:bigint,source:string,symbolEntities:array<string>,text:string,urlEntities:array<string>,user:struct<createdAt:string,description:string,descriptionURLEntities:array<string>,favouritesCount:bigint,followersCount:bigint,friendsCount:bigint,id:bigint,isContributorsEnabled:boolean,isDefaultProfile:boolean,isDefaultProfil
eImage:boolean,isFollowRequestSent:boolean,isGeoEnabled:boolean,isProtected:boolean,isVerified:boolean,lang:string,listedCount:bigint,location:string,name:string,profileBackgroundColor:string,profileBackgroundImageUrl:string,profileBackgroundImageUrlHttps:string,profileBackgroundTiled:boolean,profileImageUrl:string,profileImageUrlHttps:string,profileLinkColor:string,profileSidebarBorderColor:string,profileSidebarFillColor:string,profileTextColor:string,profileUseBackgroundImage:boolean,screenName:string,showAllInlineMedia:boolean,statusesCount:bigint,translator:boolean,utcOffset:bigint>,userMentionEntities:array<string>>,quotedStatusId:bigint,retweetCount:bigint,source:string,symbolEntities:array<string>,text:string,urlEntities:array<struct<displayURL:string,end:bigint,expandedURL:string,start:bigint,url:string>>,user:struct<createdAt:string,description:string,descriptionURLEntities:array<string>,favouritesCount:bigint,followersCount:bigint,friendsCount:bigint,id:bigint,isContributorsEnabled:boolean,isDefaultProfile:boolean,isDefaultProfileImage:boolean,isFollowRequestSent:boolean,isGeoEnabled:boolean,isProtected:boolean,isVerified:boolean,lang:string,listedCount:bigint,location:string,name:string,profileBackgroundColor:string,profileBackgroundImageUrl:string,profileBackgroundImageUrlHttps:string,profileBackgroundTiled:boolean,profileBannerImageUrl:string,profileImageUrl:string,profileImageUrlHttps:string,profileLinkColor:string,profileSidebarBorderColor:string,profileSidebarFillColor:string,profileTextColor:string,profileUseBackgroundImage:boolean,screenName:string,showAllInlineMedia:boolean,statusesCount:bigint,timeZone:string,translator:boolean,url:string,utcOffset:bigint>,userMentionEntities:array<struct<end:bigint,id:bigint,name:string,screenName:string,start:bigint>>>, source: string, symbolEntities: array<string>, text: string, urlEntities: array<struct<displayURL:string,end:bigint,expandedURL:string,start:bigint,url:string>>, user: 
struct<createdAt:string,description:string,descriptionURLEntities:array<string>,favouritesCount:bigint,followersCount:bigint,friendsCount:bigint,id:bigint,isContributorsEnabled:boolean,isDefaultProfile:boolean,isDefaultProfileImage:boolean,isFollowRequestSent:boolean,isGeoEnabled:boolean,isProtected:boolean,isVerified:boolean,lang:string,listedCount:bigint,location:string,name:string,profileBackgroundColor:string,profileBackgroundImageUrl:string,profileBackgroundImageUrlHttps:string,profileBackgroundTiled:boolean,profileBannerImageUrl:string,profileImageUrl:string,profileImageUrlHttps:string,profileLinkColor:string,profileSidebarBorderColor:string,profileSidebarFillColor:string,profileTextColor:string,profileUseBackgroundImage:boolean,screenName:string,showAllInlineMedia:boolean,statusesCount:bigint,timeZone:string,translator:boolean,url:string,utcOffset:bigint>, userMentionEntities: array<struct<end:bigint,id:bigint,name:string,screenName:string,start:bigint>>]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"java.io.IOException: No input paths specified in job","error":"<div class=\"ansiout\">\tat org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:156)\n\tat org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:208)\n\tat org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)\n\tat scala.Option.getOrElse(Option.scala:120)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:237)\n\tat org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)\n\tat scala.Option.getOrElse(Option.scala:120)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:237)\n\tat 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)\n\tat scala.Option.getOrElse(Option.scala:120)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:237)\n\tat org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1115)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:316)\n\tat org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1113)\n\tat org.apache.spark.sql.execution.datasources.json.InferSchema$.infer(InferSchema.scala:65)\n\tat org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:114)\n\tat org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:109)\n\tat scala.Option.getOrElse(Option.scala:120)\n\tat org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema$lzycompute(JSONRelation.scala:109)\n\tat org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema(JSONRelation.scala:108)\n\tat org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636)\n\tat org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635)\n\tat org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109)\n\tat 
org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:244)</div>","startTime":1.459392732565E12,"submitTime":1.459392734545E12,"finishTime":1.459392733093E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2ccab800-05a0-4dc9-af4c-c129e84cc8e1"},{"version":"CommandV1","origId":89469,"guid":"f9c7c341-b5eb-49ac-9a85-94a4a3c08ee6","subtype":"command","commandType":"auto","position":26.5,"command":"outJson.printSchema()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">root\n |-- contributorsIDs: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- createdAt: string (nullable = true)\n |-- currentUserRetweetId: long (nullable = true)\n |-- extendedMediaEntities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- displayURL: string (nullable = true)\n | | |-- end: long (nullable = true)\n | | |-- expandedURL: string (nullable = true)\n | | |-- id: long (nullable = true)\n | | |-- mediaURL: string (nullable = true)\n | | |-- mediaURLHttps: string (nullable = true)\n | | |-- sizes: struct (nullable = true)\n | | | |-- 0: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | | |-- 1: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | | |-- 2: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | 
|-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | | |-- 3: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | |-- start: long (nullable = true)\n | | |-- type: string (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- videoAspectRatioHeight: long (nullable = true)\n | | |-- videoAspectRatioWidth: long (nullable = true)\n | | |-- videoDurationMillis: long (nullable = true)\n | | |-- videoVariants: array (nullable = true)\n | | | |-- element: struct (containsNull = true)\n | | | | |-- bitrate: long (nullable = true)\n | | | | |-- contentType: string (nullable = true)\n | | | | |-- url: string (nullable = true)\n |-- favoriteCount: long (nullable = true)\n |-- hashtagEntities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- end: long (nullable = true)\n | | |-- start: long (nullable = true)\n | | |-- text: string (nullable = true)\n |-- id: long (nullable = true)\n |-- inReplyToScreenName: string (nullable = true)\n |-- inReplyToStatusId: long (nullable = true)\n |-- inReplyToUserId: long (nullable = true)\n |-- isFavorited: boolean (nullable = true)\n |-- isPossiblySensitive: boolean (nullable = true)\n |-- isRetweeted: boolean (nullable = true)\n |-- isTruncated: boolean (nullable = true)\n |-- lang: string (nullable = true)\n |-- mediaEntities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- displayURL: string (nullable = true)\n | | |-- end: long (nullable = true)\n | | |-- expandedURL: string (nullable = true)\n | | |-- id: long (nullable = true)\n | | |-- mediaURL: string (nullable = true)\n | | |-- mediaURLHttps: string (nullable = true)\n | | |-- sizes: struct (nullable = true)\n | | | |-- 0: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long 
(nullable = true)\n | | | |-- 1: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | | |-- 2: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | | |-- 3: struct (nullable = true)\n | | | | |-- height: long (nullable = true)\n | | | | |-- resize: long (nullable = true)\n | | | | |-- width: long (nullable = true)\n | | |-- start: long (nullable = true)\n | | |-- type: string (nullable = true)\n | | |-- url: string (nullable = true)\n |-- quotedStatus: struct (nullable = true)\n | |-- contributorsIDs: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- createdAt: string (nullable = true)\n | |-- currentUserRetweetId: long (nullable = true)\n | |-- extendedMediaEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- favoriteCount: long (nullable = true)\n | |-- hashtagEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- id: long (nullable = true)\n | |-- inReplyToStatusId: long (nullable = true)\n | |-- inReplyToUserId: long (nullable = true)\n | |-- isFavorited: boolean (nullable = true)\n | |-- isPossiblySensitive: boolean (nullable = true)\n | |-- isRetweeted: boolean (nullable = true)\n | |-- isTruncated: boolean (nullable = true)\n | |-- lang: string (nullable = true)\n | |-- mediaEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- quotedStatusId: long (nullable = true)\n | |-- retweetCount: long (nullable = true)\n | |-- source: string (nullable = true)\n | |-- symbolEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- text: string (nullable = true)\n | |-- urlEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- displayURL: 
string (nullable = true)\n | | | |-- end: long (nullable = true)\n | | | |-- expandedURL: string (nullable = true)\n | | | |-- start: long (nullable = true)\n | | | |-- url: string (nullable = true)\n | |-- user: struct (nullable = true)\n | | |-- createdAt: string (nullable = true)\n | | |-- description: string (nullable = true)\n | | |-- descriptionURLEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- favouritesCount: long (nullable = true)\n | | |-- followersCount: long (nullable = true)\n | | |-- friendsCount: long (nullable = true)\n | | |-- id: long (nullable = true)\n | | |-- isContributorsEnabled: boolean (nullable = true)\n | | |-- isDefaultProfile: boolean (nullable = true)\n | | |-- isDefaultProfileImage: boolean (nullable = true)\n | | |-- isFollowRequestSent: boolean (nullable = true)\n | | |-- isGeoEnabled: boolean (nullable = true)\n | | |-- isProtected: boolean (nullable = true)\n | | |-- isVerified: boolean (nullable = true)\n | | |-- lang: string (nullable = true)\n | | |-- listedCount: long (nullable = true)\n | | |-- location: string (nullable = true)\n | | |-- name: string (nullable = true)\n | | |-- profileBackgroundColor: string (nullable = true)\n | | |-- profileBackgroundImageUrl: string (nullable = true)\n | | |-- profileBackgroundImageUrlHttps: string (nullable = true)\n | | |-- profileBackgroundTiled: boolean (nullable = true)\n | | |-- profileBannerImageUrl: string (nullable = true)\n | | |-- profileImageUrl: string (nullable = true)\n | | |-- profileImageUrlHttps: string (nullable = true)\n | | |-- profileLinkColor: string (nullable = true)\n | | |-- profileSidebarBorderColor: string (nullable = true)\n | | |-- profileSidebarFillColor: string (nullable = true)\n | | |-- profileTextColor: string (nullable = true)\n | | |-- profileUseBackgroundImage: boolean (nullable = true)\n | | |-- screenName: string (nullable = true)\n | | |-- showAllInlineMedia: boolean (nullable = true)\n | | |-- 
statusesCount: long (nullable = true)\n | | |-- timeZone: string (nullable = true)\n | | |-- translator: boolean (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- utcOffset: long (nullable = true)\n | |-- userMentionEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n |-- quotedStatusId: long (nullable = true)\n |-- retweetCount: long (nullable = true)\n |-- retweetedStatus: struct (nullable = true)\n | |-- contributorsIDs: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- createdAt: string (nullable = true)\n | |-- currentUserRetweetId: long (nullable = true)\n | |-- extendedMediaEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- displayURL: string (nullable = true)\n | | | |-- end: long (nullable = true)\n | | | |-- expandedURL: string (nullable = true)\n | | | |-- id: long (nullable = true)\n | | | |-- mediaURL: string (nullable = true)\n | | | |-- mediaURLHttps: string (nullable = true)\n | | | |-- sizes: struct (nullable = true)\n | | | | |-- 0: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | | |-- 1: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | | |-- 2: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | | |-- 3: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | |-- start: long (nullable = true)\n | | | |-- type: string (nullable = true)\n | | | |-- url: string (nullable = true)\n | | | |-- videoAspectRatioHeight: long (nullable = true)\n | | | |-- 
videoAspectRatioWidth: long (nullable = true)\n | | | |-- videoDurationMillis: long (nullable = true)\n | | | |-- videoVariants: array (nullable = true)\n | | | | |-- element: struct (containsNull = true)\n | | | | | |-- bitrate: long (nullable = true)\n | | | | | |-- contentType: string (nullable = true)\n | | | | | |-- url: string (nullable = true)\n | |-- favoriteCount: long (nullable = true)\n | |-- hashtagEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- end: long (nullable = true)\n | | | |-- start: long (nullable = true)\n | | | |-- text: string (nullable = true)\n | |-- id: long (nullable = true)\n | |-- inReplyToScreenName: string (nullable = true)\n | |-- inReplyToStatusId: long (nullable = true)\n | |-- inReplyToUserId: long (nullable = true)\n | |-- isFavorited: boolean (nullable = true)\n | |-- isPossiblySensitive: boolean (nullable = true)\n | |-- isRetweeted: boolean (nullable = true)\n | |-- isTruncated: boolean (nullable = true)\n | |-- lang: string (nullable = true)\n | |-- mediaEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- displayURL: string (nullable = true)\n | | | |-- end: long (nullable = true)\n | | | |-- expandedURL: string (nullable = true)\n | | | |-- id: long (nullable = true)\n | | | |-- mediaURL: string (nullable = true)\n | | | |-- mediaURLHttps: string (nullable = true)\n | | | |-- sizes: struct (nullable = true)\n | | | | |-- 0: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | | |-- 1: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | | |-- 2: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = 
true)\n | | | | |-- 3: struct (nullable = true)\n | | | | | |-- height: long (nullable = true)\n | | | | | |-- resize: long (nullable = true)\n | | | | | |-- width: long (nullable = true)\n | | | |-- start: long (nullable = true)\n | | | |-- type: string (nullable = true)\n | | | |-- url: string (nullable = true)\n | |-- quotedStatus: struct (nullable = true)\n | | |-- contributorsIDs: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- createdAt: string (nullable = true)\n | | |-- currentUserRetweetId: long (nullable = true)\n | | |-- extendedMediaEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- favoriteCount: long (nullable = true)\n | | |-- hashtagEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- id: long (nullable = true)\n | | |-- inReplyToStatusId: long (nullable = true)\n | | |-- inReplyToUserId: long (nullable = true)\n | | |-- isFavorited: boolean (nullable = true)\n | | |-- isPossiblySensitive: boolean (nullable = true)\n | | |-- isRetweeted: boolean (nullable = true)\n | | |-- isTruncated: boolean (nullable = true)\n | | |-- lang: string (nullable = true)\n | | |-- mediaEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- quotedStatusId: long (nullable = true)\n | | |-- retweetCount: long (nullable = true)\n | | |-- source: string (nullable = true)\n | | |-- symbolEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- text: string (nullable = true)\n | | |-- urlEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- user: struct (nullable = true)\n | | | |-- createdAt: string (nullable = true)\n | | | |-- description: string (nullable = true)\n | | | |-- descriptionURLEntities: array (nullable = true)\n | | | | |-- element: string (containsNull = true)\n | | | |-- favouritesCount: long (nullable = true)\n | | | |-- 
followersCount: long (nullable = true)\n | | | |-- friendsCount: long (nullable = true)\n | | | |-- id: long (nullable = true)\n | | | |-- isContributorsEnabled: boolean (nullable = true)\n | | | |-- isDefaultProfile: boolean (nullable = true)\n | | | |-- isDefaultProfileImage: boolean (nullable = true)\n | | | |-- isFollowRequestSent: boolean (nullable = true)\n | | | |-- isGeoEnabled: boolean (nullable = true)\n | | | |-- isProtected: boolean (nullable = true)\n | | | |-- isVerified: boolean (nullable = true)\n | | | |-- lang: string (nullable = true)\n | | | |-- listedCount: long (nullable = true)\n | | | |-- location: string (nullable = true)\n | | | |-- name: string (nullable = true)\n | | | |-- profileBackgroundColor: string (nullable = true)\n | | | |-- profileBackgroundImageUrl: string (nullable = true)\n | | | |-- profileBackgroundImageUrlHttps: string (nullable = true)\n | | | |-- profileBackgroundTiled: boolean (nullable = true)\n | | | |-- profileImageUrl: string (nullable = true)\n | | | |-- profileImageUrlHttps: string (nullable = true)\n | | | |-- profileLinkColor: string (nullable = true)\n | | | |-- profileSidebarBorderColor: string (nullable = true)\n | | | |-- profileSidebarFillColor: string (nullable = true)\n | | | |-- profileTextColor: string (nullable = true)\n | | | |-- profileUseBackgroundImage: boolean (nullable = true)\n | | | |-- screenName: string (nullable = true)\n | | | |-- showAllInlineMedia: boolean (nullable = true)\n | | | |-- statusesCount: long (nullable = true)\n | | | |-- translator: boolean (nullable = true)\n | | | |-- utcOffset: long (nullable = true)\n | | |-- userMentionEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | |-- quotedStatusId: long (nullable = true)\n | |-- retweetCount: long (nullable = true)\n | |-- source: string (nullable = true)\n | |-- symbolEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- text: string (nullable = true)\n | 
|-- urlEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- displayURL: string (nullable = true)\n | | | |-- end: long (nullable = true)\n | | | |-- expandedURL: string (nullable = true)\n | | | |-- start: long (nullable = true)\n | | | |-- url: string (nullable = true)\n | |-- user: struct (nullable = true)\n | | |-- createdAt: string (nullable = true)\n | | |-- description: string (nullable = true)\n | | |-- descriptionURLEntities: array (nullable = true)\n | | | |-- element: string (containsNull = true)\n | | |-- favouritesCount: long (nullable = true)\n | | |-- followersCount: long (nullable = true)\n | | |-- friendsCount: long (nullable = true)\n | | |-- id: long (nullable = true)\n | | |-- isContributorsEnabled: boolean (nullable = true)\n | | |-- isDefaultProfile: boolean (nullable = true)\n | | |-- isDefaultProfileImage: boolean (nullable = true)\n | | |-- isFollowRequestSent: boolean (nullable = true)\n | | |-- isGeoEnabled: boolean (nullable = true)\n | | |-- isProtected: boolean (nullable = true)\n | | |-- isVerified: boolean (nullable = true)\n | | |-- lang: string (nullable = true)\n | | |-- listedCount: long (nullable = true)\n | | |-- location: string (nullable = true)\n | | |-- name: string (nullable = true)\n | | |-- profileBackgroundColor: string (nullable = true)\n | | |-- profileBackgroundImageUrl: string (nullable = true)\n | | |-- profileBackgroundImageUrlHttps: string (nullable = true)\n | | |-- profileBackgroundTiled: boolean (nullable = true)\n | | |-- profileBannerImageUrl: string (nullable = true)\n | | |-- profileImageUrl: string (nullable = true)\n | | |-- profileImageUrlHttps: string (nullable = true)\n | | |-- profileLinkColor: string (nullable = true)\n | | |-- profileSidebarBorderColor: string (nullable = true)\n | | |-- profileSidebarFillColor: string (nullable = true)\n | | |-- profileTextColor: string (nullable = true)\n | | |-- profileUseBackgroundImage: boolean (nullable = true)\n | | 
|-- screenName: string (nullable = true)\n | | |-- showAllInlineMedia: boolean (nullable = true)\n | | |-- statusesCount: long (nullable = true)\n | | |-- timeZone: string (nullable = true)\n | | |-- translator: boolean (nullable = true)\n | | |-- url: string (nullable = true)\n | | |-- utcOffset: long (nullable = true)\n | |-- userMentionEntities: array (nullable = true)\n | | |-- element: struct (containsNull = true)\n | | | |-- end: long (nullable = true)\n | | | |-- id: long (nullable = true)\n | | | |-- name: string (nullable = true)\n | | | |-- screenName: string (nullable = true)\n | | | |-- start: long (nullable = true)\n |-- source: string (nullable = true)\n |-- symbolEntities: array (nullable = true)\n | |-- element: string (containsNull = true)\n |-- text: string (nullable = true)\n |-- urlEntities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- displayURL: string (nullable = true)\n | | |-- end: long (nullable = true)\n | | |-- expandedURL: string (nullable = true)\n | | |-- start: long (nullable = true)\n | | |-- url: string (nullable = true)\n |-- user: struct (nullable = true)\n | |-- createdAt: string (nullable = true)\n | |-- description: string (nullable = true)\n | |-- descriptionURLEntities: array (nullable = true)\n | | |-- element: string (containsNull = true)\n | |-- favouritesCount: long (nullable = true)\n | |-- followersCount: long (nullable = true)\n | |-- friendsCount: long (nullable = true)\n | |-- id: long (nullable = true)\n | |-- isContributorsEnabled: boolean (nullable = true)\n | |-- isDefaultProfile: boolean (nullable = true)\n | |-- isDefaultProfileImage: boolean (nullable = true)\n | |-- isFollowRequestSent: boolean (nullable = true)\n | |-- isGeoEnabled: boolean (nullable = true)\n | |-- isProtected: boolean (nullable = true)\n | |-- isVerified: boolean (nullable = true)\n | |-- lang: string (nullable = true)\n | |-- listedCount: long (nullable = true)\n | |-- location: string (nullable = 
true)\n | |-- name: string (nullable = true)\n | |-- profileBackgroundColor: string (nullable = true)\n | |-- profileBackgroundImageUrl: string (nullable = true)\n | |-- profileBackgroundImageUrlHttps: string (nullable = true)\n | |-- profileBackgroundTiled: boolean (nullable = true)\n | |-- profileBannerImageUrl: string (nullable = true)\n | |-- profileImageUrl: string (nullable = true)\n | |-- profileImageUrlHttps: string (nullable = true)\n | |-- profileLinkColor: string (nullable = true)\n | |-- profileSidebarBorderColor: string (nullable = true)\n | |-- profileSidebarFillColor: string (nullable = true)\n | |-- profileTextColor: string (nullable = true)\n | |-- profileUseBackgroundImage: boolean (nullable = true)\n | |-- screenName: string (nullable = true)\n | |-- showAllInlineMedia: boolean (nullable = true)\n | |-- statusesCount: long (nullable = true)\n | |-- timeZone: string (nullable = true)\n | |-- translator: boolean (nullable = true)\n | |-- url: string (nullable = true)\n | |-- utcOffset: long (nullable = true)\n |-- userMentionEntities: array (nullable = true)\n | |-- element: struct (containsNull = true)\n | | |-- end: long (nullable = true)\n | | |-- id: long (nullable = true)\n | | |-- name: string (nullable = true)\n | | |-- screenName: string (nullable = true)\n | | |-- start: long (nullable = 
true)\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392740341E12,"submitTime":1.459392742317E12,"finishTime":1.459392740428E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"65740adc-110f-4062-b400-dfd835b6645f"},{"version":"CommandV1","origId":89470,"guid":"f3154a02-b795-4b6f-b100-521ba67a0c01","subtype":"command","commandType":"auto","position":27.0,"command":"outJson.select(\"id\",\"text\").show(false)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n|id |text |\n+------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n|715369685726863360|RT @camerondallas: Borderline kinda obsessed |\n|715369685735223296|RT @laurimarrr: los retos me motivan. |\n|715369685726855170|?????????????????????????????????????????????????????\n????????????????????????????????????????????????????????????\n??????????? |\n|715369685714206720|RT @PUPMemes: GROUP STUDY SA HOTEL SOGO STA. MESA! 
Tara na!!!\n\nEvent link: https://t.co/WBj5Wso0xZ https://t.co/zGP7jK68nT |\n|715369685718474752|RT @ksushma140: #ServicesByMSG Inspires youngsters 2 stay away from drug n taste d nostrum f GOD's true name by method f meditation \nhttps:?|\n|715369685718466561|Jesse Pinkman is Noctis confirmed- Drug Dealer Noctis |\n|715369685714214914|@Ed_TechSource 1 |\n|715369685726826496|RT @villalobossebas: Nuevo v?deooo https://t.co/agonOjoq1j ?? |\n|715369685735223298|???????? https://t.co/1SFfoGWYNM |\n|715369685718466560|Hindi mo ako kauri. Nakakainis. |\n|715369685743591424|RT @exnews24: ?GENE?4/5???????\n???????????????????\n?TVCM?GENERATIONS????\n\n????????????????????????????????CM???CM???TRANSFORM?? https://t.c?|\n|715369685722742786|RT @Martinlvrz: Quiero dormirme cada d?a un poco m?s temprano y me sale todo al rev?s en unos d?as t?rmino acostandome a las 10 de la ma?ana|\n|715369685739520000|https://t.co/3MzFhSyrc9 |\n|715369685752090625|RT @umvirge: B E B A\nA G U A |\n|715369685751980033|THIS IS SO CUTE WTH I LOVE YOU https://t.co/f8HjTmbUFe |\n|715369685722595328|????????????????????????????\nhttps://t.co/RJkzZFIV2T |\n|715369685730988035|? |\n|715369685722742785|RT @MicaViciconte: Hermosa la nueva colecci?n de @ingratasanta .Ya subir? fotos para q vean;) |\n|715369685731119104|??????? ????? ??????? ..???? ?? .. ??? ???? ! ??????? ????? ??????? .. ???? ?? ??? ???? .. 
https://t.co/bbf1CveBZ6 |\n|715369685739438081|RT @1dperformances: where do broken hearts go - harry and zayn http://t.co/4UjqkEGMG1 |\n+------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\nonly showing top 20 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392742365E12,"submitTime":1.459392744333E12,"finishTime":1.459392742901E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c43512ac-0ea4-4f8d-9540-1432dd85f396"},{"version":"CommandV1","origId":89471,"guid":"4ea2340e-c15a-42b8-bc76-0241f15945e5","subtype":"command","commandType":"auto","position":28.0,"command":"%md\nClearly there is a lot one can do with 
tweets!","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fc06daf7-a902-4e00-9f83-725b28004a89"},{"version":"CommandV1","origId":89472,"guid":"78e3797c-b2a5-48f4-b571-0d340930813e","subtype":"command","commandType":"auto","position":29.0,"command":"%md\n## Next, let's write the tweets into a scalable commercial cloud storage system\n\nWe will make sure to write the tweets to AWS's simple storage service or S3, a scalable storage system in the cloud. See [https://aws.amazon.com/s3/](https://aws.amazon.com/s3/).","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"69c9b248-2cb6-42f9-90d8-b7fbcf9cbe1a"},{"version":"CommandV1","origId":89473,"guid":"c4e918d8-c167-4978-a20a-b7d9044bf967","subtype":"command","commandType":"auto","position":30.0,"command":"// Replace with your AWS S3 credentials\n//\n// NOTE: Set the access to this notebook appropriately to protect the security of your keys.\n// Or you can delete 
this cell after you run the mount command below once successfully.\n\nval AccessKey = getArgument(\"1. ACCESS_KEY\", \"REPLACE_WITH_YOUR_ACCESS_KEY\")\nval SecretKey = getArgument(\"2. SECRET_KEY\", \"REPLACE_WITH_YOUR_SECRET_KEY\")\nval EncodedSecretKey = SecretKey.replace(\"/\", \"%2F\")\nval AwsBucketName = getArgument(\"3. S3_BUCKET\", \"REPLACE_WITH_YOUR_S3_BUCKET\")\nval MountName = getArgument(\"4. MNT_NAME\", \"REPLACE_WITH_YOUR_MOUNT_NAME\")\nval s3Filename = \"tweetDump\"","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"><console>:49: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n val AccessKey = getArgument("1. ACCESS_KEY", "REPLACE_WITH_YOUR_ACCESS_KEY")\n ^\n<console>:50: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n val SecretKey = getArgument("2. SECRET_KEY", "REPLACE_WITH_YOUR_SECRET_KEY")\n ^\n<console>:52: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n val AwsBucketName = getArgument("3. S3_BUCKET", "REPLACE_WITH_YOUR_S3_BUCKET")\n ^\n<console>:53: warning: method getArgument in trait ArgumentsHandler is deprecated: Use createText() or createSelect() to create a widget and getArgument(widgetName) to get its bound value.\n val MountName = getArgument("4. MNT_NAME", "REPLACE_WITH_YOUR_MOUNT_NAME")\n ^\nAccessKey: String = ""\nSecretKey: String = ""\nEncodedSecretKey: String = ""\nAwsBucketName: String = ""\nMountName: String = s3Data\ns3Filename: String = tweetDump\n</div>","arguments":{"1. ACCESS_KEY":"","2. SECRET_KEY":"","3. S3_BUCKET":"","4. 
MNT_NAME":"s3Data"},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459393540786E12,"submitTime":1.459393468536E12,"finishTime":1.459393540886E12,"collapsed":false,"bindings":{"1. ACCESS_KEY":"","2. SECRET_KEY":"","3. S3_BUCKET":"","4. MNT_NAME":"s3Data"},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":true,"iPythonMetadata":null,"nuid":"ceb79c0b-e718-4562-b30d-771b366a2793"},{"version":"CommandV1","origId":89474,"guid":"b1cb3c14-3bd8-4869-979d-f2f7a7039b39","subtype":"command","commandType":"auto","position":31.0,"command":"%md\nNow just mount s3 as follows:","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d968572e-aa73-47ba-8232-0618370b5435"},{"version":"CommandV1","origId":89475,"guid":"31f89cb4-5314-4394-a34f-a196da914223","subtype":"command","commandType":"auto","position":32.0,"command":"dbutils.fs.mount(s\"s3a://$AccessKey:$EncodedSecretKey@$AwsBucketName\", s\"/mnt/$MountName\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res14: Boolean = 
true\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392771573E12,"submitTime":1.459392773545E12,"finishTime":1.459392772745E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a44c4dd2-e71e-4c44-9b94-c33e27a0bf23"},{"version":"CommandV1","origId":89476,"guid":"de2b95d1-ca9a-4002-a4b5-d2c0f0910c44","subtype":"command","commandType":"auto","position":33.0,"command":"%md\nNow you can use the `dbutils` commands freely to access data in S3.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"144059af-bc2e-4854-a43e-403ddc989eca"},{"version":"CommandV1","origId":89477,"guid":"0e267143-7f7f-4c17-95cf-cede628bd448","subtype":"command","commandType":"auto","position":34.0,"command":"dbutils.fs.help()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div 
class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392790282E12,"submitTime":1.459392792264E12,"finishTime":1.459392790343E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2f09514d-6556-46ca-99fb-e75310af29be"},{"version":"CommandV1","origId":89478,"guid":"1fc50ef8-cf7b-4318-a6d5-898cce40bb91","subtype":"command","commandType":"auto","position":35.0,"command":"display(dbutils.fs.ls(s\"/mnt/\")) // list the files in s3","commandVersion":0,"state":"finished","results":{"type":"table","data":[["dbfs:/mnt/Coopworth_New.txt/","Coopworth_New.txt/",0.0],["dbfs:/mnt/FileStore/","FileStore/",0.0],["dbfs:/mnt/databricks-test-paul/","databricks-test-paul/",0.0],["dbfs:/mnt/s3Data/","s3Data/",0.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"path","type":"\"string\""},{"name":"name","type":"\"string\""},{"name":"size","type":"\"long\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":"java.io.FileNotFoundException: /mnt/s3Data","error":"<div class=\"ansiout\">\tat com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:63)\n\tat com.databricks.backend.daemon.data.client.DbfsClient.sendIdempotent(DbfsClient.scala:40)\n\tat com.databricks.backend.daemon.data.client.DatabricksFileSystem.listStatus(DatabricksFileSystem.scala:174)\n\tat 
com.databricks.backend.daemon.dbutils.FSUtils$.ls(DBUtilsCore.scala:60)\n\tat com.databricks.dbutils_v1.package$fs$.ls(dbutils_v1.scala:44)</div>","startTime":1.459392803661E12,"submitTime":1.459392805643E12,"finishTime":1.459392804073E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"254d47d2-e88e-4e8b-8493-22bf448e8cff"},{"version":"CommandV1","origId":89479,"guid":"ec9a88ee-3d2e-493c-8bfe-780fef328c38","subtype":"command","commandType":"auto","position":36.0,"command":"dbutils.fs.cp(\"dbfs:/rawTweets\",s\"/mnt/$MountName/rawTweetsInS3/\",recurse=true) // copy all the tweets","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res18: Boolean = 
true\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392819554E12,"submitTime":1.459392821527E12,"finishTime":1.459392883023E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6229ebcd-6636-4421-a061-a00ec22293a0"},{"version":"CommandV1","origId":89480,"guid":"6131a903-d3bd-46d3-bc48-799112a8b71f","subtype":"command","commandType":"auto","position":37.0,"command":"display(dbutils.fs.ls(s\"/mnt/$MountName/rawTweetsInS3\")) // list the files copied into s3","commandVersion":0,"state":"finished","results":{"type":"table","data":[["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392400000/","tweets_1459392400000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392401000/","tweets_1459392401000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392402000/","tweets_1459392402000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392403000/","tweets_1459392403000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392404000/","tweets_1459392404000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392405000/","tweets_1459392405000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392406000/","tweets_1459392406000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392407000/","tweets_1459392407000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392408000/","tweets_1459392408000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392409000/","tweets_1459392409000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392410000/","tweets_1459392410000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/
tweets_1459392411000/","tweets_1459392411000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392412000/","tweets_1459392412000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392413000/","tweets_1459392413000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392414000/","tweets_1459392414000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392415000/","tweets_1459392415000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392416000/","tweets_1459392416000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392417000/","tweets_1459392417000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392418000/","tweets_1459392418000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392419000/","tweets_1459392419000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392420000/","tweets_1459392420000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392421000/","tweets_1459392421000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392422000/","tweets_1459392422000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392423000/","tweets_1459392423000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392424000/","tweets_1459392424000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392425000/","tweets_1459392425000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392426000/","tweets_1459392426000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392427000/","tweets_1459392427000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392428000/","tweets_1459392428000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392429000/","tweets_1459392429000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392430000/","tweets_1459392430000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392431000/","tweets_1459392431000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392432000/","tweets_1459392432000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392433000/","tweets_1459392433000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392434000/","tweets_1459392434000/
",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392435000/","tweets_1459392435000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392436000/","tweets_1459392436000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392437000/","tweets_1459392437000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392438000/","tweets_1459392438000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392439000/","tweets_1459392439000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392440000/","tweets_1459392440000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392441000/","tweets_1459392441000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392442000/","tweets_1459392442000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392443000/","tweets_1459392443000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392444000/","tweets_1459392444000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392445000/","tweets_1459392445000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392446000/","tweets_1459392446000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392447000/","tweets_1459392447000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392448000/","tweets_1459392448000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392449000/","tweets_1459392449000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392450000/","tweets_1459392450000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392451000/","tweets_1459392451000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392452000/","tweets_1459392452000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392453000/","tweets_1459392453000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392454000/","tweets_1459392454000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392455000/","tweets_1459392455000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392456000/","tweets_1459392456000/",0.0],["dbfs:/mnt/s3Data/rawTweetsInS3/tweets_1459392457000/","tweets_1459392457000/",0.0]],"arguments":{},"addedWidgets":{},"rem
ovedWidgets":[],"schema":[{"name":"path","type":"\"string\""},{"name":"name","type":"\"string\""},{"name":"size","type":"\"long\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":null,"error":null,"startTime":1.459392891878E12,"submitTime":1.459392893859E12,"finishTime":1.459392892232E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"9906587c-5ef8-4402-8aee-856b9e742ace"},{"version":"CommandV1","origId":89481,"guid":"a2c46a58-dda9-4fca-9242-a442199dbc25","subtype":"command","commandType":"auto","position":38.0,"command":"dbutils.fs.rm(s\"/mnt/$MountName/rawTweetsInS3\",recurse=true) // remove all the files from s3","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res20: Boolean = 
true\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459392911687E12,"submitTime":1.459392913663E12,"finishTime":1.45939291275E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c8d67829-41ee-47da-bbee-91f78be6d220"},{"version":"CommandV1","origId":89482,"guid":"5b08589f-5bc5-439b-ac9f-0957bac95f95","subtype":"command","commandType":"auto","position":39.0,"command":"display(dbutils.fs.ls(\"/mnt/\")) // list the files in s3","commandVersion":0,"state":"finished","results":{"type":"table","data":[["dbfs:/mnt/Coopworth_New.txt/","Coopworth_New.txt/",0.0],["dbfs:/mnt/FileStore/","FileStore/",0.0],["dbfs:/mnt/databricks-test-paul/","databricks-test-paul/",0.0],["dbfs:/mnt/s3Data/","s3Data/",0.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"path","type":"\"string\""},{"name":"name","type":"\"string\""},{"name":"size","type":"\"long\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":null,"error":null,"startTime":1.459393064837E12,"submitTime":1.459393066818E12,"finishTime":1.459393065533E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latest
User":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5eb53996-1ceb-4874-be1a-1b7458bae488"},{"version":"CommandV1","origId":89483,"guid":"f304b050-543d-4df2-8e44-b4921ad80c55","subtype":"command","commandType":"auto","position":39.25,"command":"//display(dbutils.fs.ls(s\"/mnt/$MountName/rawTweetsInS3\")) // it has been removed","commandVersion":0,"state":"error","results":null,"errorSummary":"java.io.FileNotFoundException: /mnt/s3Data/rawTweetsInS3","error":"<div class=\"ansiout\">\tat com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:63)\n\tat com.databricks.backend.daemon.data.client.DbfsClient.sendIdempotent(DbfsClient.scala:40)\n\tat com.databricks.backend.daemon.data.client.DatabricksFileSystem.listStatus(DatabricksFileSystem.scala:184)\n\tat com.databricks.backend.daemon.dbutils.FSUtils$.ls(DBUtilsCore.scala:60)\n\tat com.databricks.dbutils_v1.impl.DbfsUtilsImpl.ls(DbfsUtilsImpl.scala:29)</div>","startTime":1.459393036769E12,"submitTime":1.459393036769E12,"finishTime":1.459393036923E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"34fd88c0-ac67-40da-ab40-c82a14197a8d"},{"version":"CommandV1","origId":89484,"guid":"cd401b6c-fcd4-42e0-8e93-d3947884925b","subtype":"command","commandType":"auto","position":39.5,"command":"dbutils.fs.unmount(s\"/mnt/$MountName\") // finally unmount when done","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">/mnt/s3Data has been 
unmounted.\nres30: Boolean = true\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459393082563E12,"submitTime":1.459393084551E12,"finishTime":1.459393084034E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"96d80377-392f-483f-a728-556ded1bb39e"},{"version":"CommandV1","origId":89485,"guid":"08513741-bb59-4f02-8869-6f51c6df09a4","subtype":"command","commandType":"auto","position":40.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand 
\n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2c283051-c2a4-4379-afeb-f6440702ba51"}],"dashboards":[],"guid":"0286027c-4fb3-4b91-a706-44f2af9b7a96","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script> <!-- Loads the notebook-main.js bundle; if the request fails, the onerror handler sets window.mainJsLoadError so the inline script in the body can report it. --><script src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js" onerror="window.mainJsLoadError = true;"></script> </head> <body> <!-- Fallback for a failed bundle load: when window.mainJsLoadError is set, the script below appends a styled div to the body containing a "Network Error" heading, a connection hint, and the URL (u) of the resource that could not be loaded. u, b and c are top-level vars in a classic script, so they are created as page globals; keep the names unless nothing else reads them. --><script> if (window.mainJsLoadError) { var u = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js'; var b = document.getElementsByTagName('body')[0]; var c = document.createElement('div'); c.innerHTML = ('<h1>Network Error</h1>' + '<p><b>Please check your network connection and try again.</b></p>' + '<p>Could not load a required resource: ' + u + '</p>'); c.style.margin = '30px'; c.style.padding = '20px 50px'; c.style.backgroundColor = '#f5f5f5'; c.style.borderRadius = '5px'; b.appendChild(c); } </script> </body> </html>