<!DOCTYPE html> <html> <head> <meta name="databricks-html-version" content="1"> <title>005_RDDsTransformationsActionsHOMEWORK - Databricks</title> <meta charset="utf-8"> <meta name="google" content="notranslate"> <meta http-equiv="Content-Language" content="en"> <meta http-equiv="Content-Type" content="text/html; charset=UTF8"> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/jquery-ui-bundle/jquery-ui.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/main.css"> <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/print.css" media="print"> <link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/img/favicon.ico"/> <script>window.settings = {"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-sydney.s3.amazonaws.com","enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":false,"nameAndEmail":"Raazesh Sainudiin 
(r.sainudiin@math.canterbury.ac.nz)","enablePresentationTimerConfig":true,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). 
Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. 
Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0","packageLabel":"spark-1.3-jenkins-ip-10-30-9-162-U0c2673ac85-Sa2ee4664b2-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1","packageLabel":"spark-1.4-jenkins-ip-10-30-9-162-U0c2673ac85-S33a1e4b9c6-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.0","packageLabel":"spark-1.6-jenkins-ip-10-30-9-162-U0c2673ac85-Scabba801f3-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"master","displayName":"Spark master (dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"defaultNumWorkers":8,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.12.3","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":false,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},"clusterPublisherRootId":5,"enableLatestJobRunResultPermalink":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"1f2013f6-c2fd-4ab5-b68c-a2ff4e325639","useFixedStaticNotebookVersionForDevelopment":false,"enableBasicReactDialogBoxes":true,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableWorkspaceAclService":true,"someName":"Raazesh Sainudiin","enableWorkspaceAcls":true,"gitHash":"0c2673ac858e227cad536fdb45d140aeded238db","userFullname":"Raazesh Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var 
__DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":8808,"name":"005_RDDsTransformationsActionsHOMEWORK","language":"scala","commands":[{"version":"CommandV1","origId":130051,"guid":"ae7dcf81-ee57-48d4-9b91-f590d8690796","subtype":"command","commandType":"auto","position":0.125,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"df15b71a-563b-45e3-9ea8-aa12a2092512"},{"version":"CommandV1","origId":130050,"guid":"b03b13cc-abf9-48f5-aa5b-6f844fa89475","subtype":"command","commandType":"auto","position":0.25,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/week2/02_SparkEssentials/005_RDDsTransformationsActionsHOMEWORK.html) of this databricks notebook and its recorded Uji 
:\n\n[](https://www.youtube.com/v/zgkvusQdNLY?rel=0&autoplay=1&modestbranding=1&start=4519&end=4612)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d0084d0b-cae5-442b-ab93-1caf162e7367"},{"version":"CommandV1","origId":8819,"guid":"9988fc53-9479-4e7e-894f-db83afbbaa83","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n# HOMEWORK notebook - RDDs Transformations and Actions\nJust go through the notebook and familiarize yourself with these transformations and actions.\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"005fd748-9252-4be3-9cb7-58db9b82fd4b"},{"version":"CommandV1","origId":8810,"guid":"6ee12405-d99e-463a-b7ab-39e3fffd3936","subtype":"command","commandType":"auto","position":1.0,"command":"%md\n##### 1. 
Perform the ``takeOrdered`` action on the RDD\n\nTo illustrate ``take`` and ``takeOrdered`` actions, let's create a bigger RDD named ``rdd0_1000000`` that is made up of a million integers from 0 to 999999. \nWe will ``sc.parallelize`` the ``Seq`` Scala collection by using its ``.range(startInteger,stopInteger)`` method.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"08e11cc4-e4b1-4bb2-80ce-1e67866eb9ab"},{"version":"CommandV1","origId":8811,"guid":"686b82dc-ba0c-445c-8e37-f70d925eab8b","subtype":"command","commandType":"auto","position":2.0,"command":"val rdd0_1000000 = sc.parallelize(Seq.range(0, 1000000)) // <Shift+Enter> to create an RDD of million integers: 0,1,2,...,10^6-1","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd0_1000000: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[666] at parallelize at 
<console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456128961201E12,"submitTime":1.456128953935E12,"finishTime":1.456128962622E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8c300428-d70f-44f2-88dc-4e6f54327023"},{"version":"CommandV1","origId":8812,"guid":"76fce79b-068f-4ad8-9e5d-d027b4c57efa","subtype":"command","commandType":"auto","position":3.0,"command":"rdd0_1000000.take(5) // <Ctrl+Enter> gives the first 5 elements of the RDD, (0, 1, 2, 3, 4)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res0: Array[Int] = Array(0, 1, 2, 3, 4)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456128971745E12,"submitTime":1.456128964494E12,"finishTime":1.45612897187E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fa10d297-7782-4b68-b98b-89ea630e744a"},{"version":"CommandV1","origId":8813,"guid":"c45b29f9-cf4e-4337-805a-bb80c5538f77","subtype":"command","commandType":"auto","position":4.0,"command":"%md\n``takeOrdered(n)`` 
returns ``n`` elements ordered in ascending order (by default) or as specified by the optional implicit ``Ordering``, as shown below.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"d34e98cf-ae90-4839-a50e-b0210233b7ae"},{"version":"CommandV1","origId":8814,"guid":"008e9a44-cf66-43e5-bc2f-8ac97a1b21e6","subtype":"command","commandType":"auto","position":5.0,"command":"rdd0_1000000.takeOrdered(5) // <Shift+Enter> is same as rdd0_1000000.take(5) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res1: Array[Int] = Array(0, 1, 2, 3, 
4)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129003838E12,"submitTime":1.456128996583E12,"finishTime":1.456129003993E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b140fa89-980f-467b-bd3a-acb0b1b8299f"},{"version":"CommandV1","origId":8815,"guid":"2d7257d6-8bb1-49ac-8827-e95dba66abd7","subtype":"command","commandType":"auto","position":6.0,"command":"rdd0_1000000.takeOrdered(5)(Ordering[Int].reverse) // <Ctrl+Enter> to get the last 5 elements of the RDD 999999, 999998, ..., 999995","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res2: Array[Int] = Array(999999, 999998, 999997, 999996, 
999995)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129023427E12,"submitTime":1.456129016175E12,"finishTime":1.45612902363E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"446aa391-3fcf-4ca4-9b67-2a5e891e497a"},{"version":"CommandV1","origId":8816,"guid":"96707105-02e7-41dd-8ed7-3df879ab5f0c","subtype":"command","commandType":"auto","position":7.0,"command":"// HOMEWORK: edit the numbers below to get the last 20 elements of an RDD made of a sequence of integers from 669966 to 969696\nsc.parallelize(Seq.range(0, 10)).takeOrdered(5)(Ordering[Int].reverse) // <Ctrl+Enter> evaluate this cell after editing it for the right answer","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res3: Array[Int] = Array(9, 8, 7, 6, 
5)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129046601E12,"submitTime":1.456129039352E12,"finishTime":1.456129046691E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5b8910bb-b1fb-4a34-9479-19995277e099"},{"version":"CommandV1","origId":8817,"guid":"1f2e4b93-9a42-4f44-8574-652af12aaa4f","subtype":"command","commandType":"auto","position":8.0,"command":"%md\n##### 2. More examples of `map`","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"9099dc68-46c6-41e8-8b50-43fb1b619149"},{"version":"CommandV1","origId":8818,"guid":"125b918e-a565-47ae-81b5-e4f3839a4b9e","subtype":"command","commandType":"auto","position":9.0,"command":"val rdd = sc.parallelize(Seq(1, 2, 3, 4)) // <Shift+Enter> to evaluate this cell (using default number of partitions)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[781] at parallelize at 
<console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129516023E12,"submitTime":1.456129508765E12,"finishTime":1.456129516172E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8b631cb7-3102-43b5-96c9-b1e12e59745f"},{"version":"CommandV1","origId":8820,"guid":"e8f2e7bf-adf2-440c-8745-0b496aeacd8d","subtype":"command","commandType":"auto","position":10.0,"command":"rdd.map( x => x*2) // <Ctrl+Enter> to transform rdd by map that doubles each element","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res4: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[784] at map at 
<console>:38\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129527742E12,"submitTime":1.456129520477E12,"finishTime":1.456129527825E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"bcb126c2-f9d1-49da-a704-832972754c13"},{"version":"CommandV1","origId":8821,"guid":"f09a812f-d9d4-4784-9225-232a2ef45bde","subtype":"command","commandType":"auto","position":11.0,"command":"%md\nTo see what's in the transformed RDD, let's perform the actions of ``count`` and ``collect`` on the ``rdd.map( x => x*2)``, the transformation of ``rdd`` by the ``map`` given by the closure ``x => x*2``.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6026f42b-e2f3-486f-bffe-49ae2f12b46f"},{"version":"CommandV1","origId":8822,"guid":"569dea4f-5297-40ce-8a06-ced465bd0cb8","subtype":"command","commandType":"auto","position":12.0,"command":"rdd.map( x => x*2).count() // <Shift+Enter> to perform count (action) the element of the RDD = 
4","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res5: Long = 4\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129561258E12,"submitTime":1.456129553978E12,"finishTime":1.456129561353E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a2a14d58-b79f-42b0-9047-a56113ad257a"},{"version":"CommandV1","origId":8823,"guid":"fcb16c5d-47fa-4267-a900-ed8732f0c61c","subtype":"command","commandType":"auto","position":13.0,"command":"rdd.map( x => x*2).collect() // <Shift+Enter> to perform collect (action) to show 2, 4, 6, 8","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res6: Array[Int] = Array(2, 4, 6, 
8)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129573281E12,"submitTime":1.456129566019E12,"finishTime":1.456129573375E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c8a8a69a-12c0-4d48-9d97-acc56ae0237b"},{"version":"CommandV1","origId":8824,"guid":"216a7ecf-f344-4793-a326-11d7fe558d12","subtype":"command","commandType":"auto","position":14.0,"command":"// HOMEWORK: modify the '???' in the code below to collect and display the square (x*x) of each element of the RDD\n// the answer should be Array[Int] = Array(1, 4, 9, 16). Press <Ctrl+Enter> to evaluate the cell after modifying '???'\nsc.parallelize(Seq(1, 2, 3, 4)).map( x => ???).collect()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div 
class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129593537E12,"submitTime":1.456129586275E12,"finishTime":1.456129593582E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3fa527a5-ba15-49d0-a393-341f139a55b4"},{"version":"CommandV1","origId":8825,"guid":"dea155ff-d6b3-4933-a5fd-15f09b67b6ca","subtype":"command","commandType":"auto","position":15.0,"command":"%md\n\n##### 3. More examples of `filter`\nLet's declare another ``val`` RDD named ``rddFiltered`` by transforming our first RDD named ``rdd`` via the ``filter`` transformation ``x%2==0`` (of being even). \n\nThis filter transformation based on the closure ``x => x%2==0`` will return ``true`` if the element, modulo two, equals zero. The closure is automatically passed on to the workers for evaluation (when an action is called later). 
\nSo this will take our RDD of (1,2,3,4) and return RDD of (2, 4).","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"56f29b9b-0689-4cd6-9325-3d317b4d9f93"},{"version":"CommandV1","origId":8826,"guid":"e1921037-3967-4b4e-9509-f36d3675aa9d","subtype":"command","commandType":"auto","position":16.0,"command":"val rddFiltered = rdd.filter( x => x%2==0 ) // <Ctrl+Enter> to declare rddFiltered from transforming rdd","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rddFiltered: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[874] at filter at 
<console>:36\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129960638E12,"submitTime":1.456129953368E12,"finishTime":1.456129960696E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"023ef126-7c83-45c6-a5fb-27324287b486"},{"version":"CommandV1","origId":8827,"guid":"aefd0d1b-78f4-4f8b-b158-59eb6311d078","subtype":"command","commandType":"auto","position":17.0,"command":"rddFiltered.collect() // <Ctrl+Enter> to collect (action) elements of rddFiltered; should be (2, 4)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res8: Array[Int] = Array(2, 4)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456129965938E12,"submitTime":1.456129958671E12,"finishTime":1.456129966023E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6eed6cc9-b8ec-45e4-bcd6-afd78f2785ac"},{"version":"CommandV1","origId":8828,"guid":"841cad22-003c-401c-a406-b2d9321eb2d0","subtype":"command","commandType":"auto","position":18.0,"command":"%md\n##### 4. 
More examples of `reduce`","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"ee010e93-fc22-4f36-b0e0-c0c474c6e2ee"},{"version":"CommandV1","origId":8829,"guid":"76d0b9b3-4eaf-4cbe-94c9-d1af1820db3e","subtype":"command","commandType":"auto","position":19.0,"command":"val rdd = sc.parallelize(Array(1,2,3,4,5))","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[896] at parallelize at <console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130069068E12,"submitTime":1.45613006179E12,"finishTime":1.456130069189E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6e4b17e0-c246-4e77-a4aa-a44143ee9ba8"},{"version":"CommandV1","origId":8830,"guid":"6f7eee02-aff0-4f1b-88e0-7c71e2618cc1","subtype":"command","commandType":"auto","position":20.0,"command":"rdd.reduce( (x,y)=>x+y ) // <Shift+Enter> to do reduce (action) to sum and return Int = 
15","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res9: Int = 15\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130083964E12,"submitTime":1.456130076692E12,"finishTime":1.456130084126E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"0306c15e-92d7-4817-a528-4c9b3447a90f"},{"version":"CommandV1","origId":8831,"guid":"f44e74b4-fa22-4be1-862d-4326a6bf812b","subtype":"command","commandType":"auto","position":21.0,"command":"rdd.reduce( _ + _ ) // <Shift+Enter> to do same sum as above and return Int = 15 (underscore syntax)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res10: Int = 
15\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130097446E12,"submitTime":1.456130090171E12,"finishTime":1.45613009753E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"88a96d26-4fe1-4797-a418-566fc175ef0e"},{"version":"CommandV1","origId":8832,"guid":"d2c786cb-e48b-4342-8691-b8b2b54041aa","subtype":"command","commandType":"auto","position":22.0,"command":"rdd.reduce( (x,y)=>x*y ) // <Shift+Enter> to do reduce (action) to multiply and return Int = 120","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res11: Int = 120\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130114865E12,"submitTime":1.45613010759E12,"finishTime":1.45613011495E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2a420e8c-8492-4a9b-a7f0-d8d1cefc935e"},{"version":"CommandV1","origId":8833,"guid":"6c582a8e-34f8-4cd4-81a7-f6a5cf29a397","subtype":"command","commandType":"auto","position":23.0,"command":"val rdd0_1000000 = sc.parallelize(Seq.range(0, 1000000)) 
// <Shift+Enter> to create an RDD of a million integers: 0,1,2,...,10^6-1","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd0_1000000: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[909] at parallelize at <console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130128635E12,"submitTime":1.456130121362E12,"finishTime":1.456130128717E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"659b426d-3264-43ed-af96-2c7a527679a5"},{"version":"CommandV1","origId":8834,"guid":"9db7bcc1-9f33-497f-8811-694c14f60301","subtype":"command","commandType":"auto","position":24.0,"command":"rdd0_1000000.reduce( (x,y)=>x+y ) // <Ctrl+Enter> to do reduce (action) to sum and return Int 1783293664 (overflowed: the true sum is 499999500000)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res12: Int = 
1783293664\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130141761E12,"submitTime":1.456130134456E12,"finishTime":1.456130141951E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"327999be-5d60-4f64-b362-7cd3272070da"},{"version":"CommandV1","origId":8835,"guid":"26b10166-1562-461b-8393-16046ff7fa7f","subtype":"command","commandType":"auto","position":25.0,"command":"// the following correctly returns Int = 0 although for wrong reason \n// we have flowed out of Int's numeric limits!!! (but got lucky with 0*x=0 for any Int x)\n// <Shift+Enter> to do reduce (action) to multiply and return Int = 0\nrdd0_1000000.reduce( (x,y)=>x*y ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res13: Int = 
0\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130156092E12,"submitTime":1.456130148815E12,"finishTime":1.456130156244E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"dce7fcac-fbd5-46ed-8ebb-510b6617d6fc"},{"version":"CommandV1","origId":8836,"guid":"cb914855-d01b-4fb0-a776-6b62ab4303bd","subtype":"command","commandType":"auto","position":26.0,"command":"// <Ctrl+Enter> to do reduce (action) to multiply 1*2*...*9*10 and return correct answer Int = 3628800\nsc.parallelize(Seq.range(1, 11)).reduce( (x,y)=>x*y ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res14: Int = 
3628800\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130176707E12,"submitTime":1.456130169432E12,"finishTime":1.456130176817E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"9ffd7601-1a5e-4350-9ad1-0956bd7cee39"},{"version":"CommandV1","origId":8837,"guid":"aff88769-3591-44c9-9cae-191a1ae7d6ae","subtype":"command","commandType":"auto","position":27.0,"command":"%md\n**CAUTION: Know the limits of your numeric types!**","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c8c1f6dc-bbaf-4733-b98b-b21a6cbb6893"},{"version":"CommandV1","origId":8846,"guid":"83918d40-ccfe-4855-a471-cc55cd2a4732","subtype":"command","commandType":"auto","position":27.5,"command":"%md\nThe minimum and maximum value of `Int` and `Long` types are as 
follows:","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e24d471d-a59f-4285-a280-847c26e13f84"},{"version":"CommandV1","origId":8847,"guid":"03b9fdcd-4c4d-4062-9dff-17ff48114fae","subtype":"command","commandType":"auto","position":27.75,"command":"(Int.MinValue , Int.MaxValue)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res25: (Int, Int) = (-2147483648,2147483647)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130582307E12,"submitTime":1.456130575004E12,"finishTime":1.45613058236E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8b6bf414-d03a-4a43-888a-fcb5338329da"},{"version":"CommandV1","origId":8848,"guid":"1bed73eb-f7b4-4f71-a9d3-f6bfa6333237","subtype":"command","commandType":"auto","position":27.875,"command":"(Long.MinValue, Long.MaxValue)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res26: (Long, Long) = 
(-9223372036854775808,9223372036854775807)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130602513E12,"submitTime":1.456130595216E12,"finishTime":1.456130602557E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"ebdde3a7-053c-4f50-8aae-b52ab40d5f79"},{"version":"CommandV1","origId":8838,"guid":"13ff5e6c-8a85-4553-adcc-802ba59a2b06","subtype":"command","commandType":"auto","position":28.0,"command":"// <Ctrl+Enter> to do reduce (action) to multiply 1*2*...*20 and return wrong answer as Int = -2102132736\n// we have overflowed out of Int's in a circle back to negative Ints!!! 
(rigorous distributed numerics, anyone?)\nsc.parallelize(Seq.range(1, 21)).reduce( (x,y)=>x*y ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res15: Int = -2102132736\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130246177E12,"submitTime":1.456130238901E12,"finishTime":1.45613024628E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e754b4e0-841b-4661-aa06-9e46fde42081"},{"version":"CommandV1","origId":8839,"guid":"0efabc38-8237-4442-b1cf-3b53a31de47e","subtype":"command","commandType":"auto","position":29.0,"command":"//<Ctrl+Enter> we can accomplish the multiplication using Long Integer types \n// by adding 'L' to integer values, Scala infers that it is type Long\nsc.parallelize(Seq.range(1L, 21L)).reduce( (x,y)=>x*y ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res16: Long = 
2432902008176640000\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130263606E12,"submitTime":1.456130256328E12,"finishTime":1.456130263706E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"11c71af8-0004-489e-af18-3bac9d0360f0"},{"version":"CommandV1","origId":8840,"guid":"6abcc727-2f24-4fa1-bb5e-ecdcb9965901","subtype":"command","commandType":"auto","position":30.0,"command":"%md\nAs the following products over Long Integers indicate, they are limited too!","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fa0e8ea8-e36c-49b8-8636-5b5fa5f328f9"},{"version":"CommandV1","origId":8841,"guid":"4186e100-4a23-4a2d-8468-3a1a525143a5","subtype":"command","commandType":"auto","position":31.0,"command":" // <Shift+Enter> for wrong answer Long = -8718968878589280256 (due to Long's numeric limits)\nsc.parallelize(Seq.range(1L, 61L)).reduce( (x,y)=>x*y )","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div 
class=\"ansiout\">res17: Long = -8718968878589280256\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130317773E12,"submitTime":1.456130310499E12,"finishTime":1.456130317871E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e42f6adc-519b-45f6-8471-c64a5d462d0e"},{"version":"CommandV1","origId":8842,"guid":"14b7ca95-9e4f-45d1-9729-f51fd85df914","subtype":"command","commandType":"auto","position":32.0,"command":"// <Ctrl+Enter> for wrong answer Long = 0 (due to Long's numeric limits)\nsc.parallelize(Seq.range(1L, 100L)).reduce( (x,y)=>x*y ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res18: Long = 
0\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.45613033975E12,"submitTime":1.456130332466E12,"finishTime":1.456130339842E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c735427e-456e-4638-8179-3f203e526a68"},{"version":"CommandV1","origId":8845,"guid":"7c146857-b536-42f5-86c4-1184651380be","subtype":"command","commandType":"auto","position":35.0,"command":"%md\n***\n\n##### 5. Let us do a bunch of transformations to our RDD and perform an action\n\n* start from a Scala ``Seq``,\n* ``sc.parallelize`` the list to create an RDD,\n* ``filter`` that RDD, creating a new filtered RDD,\n* do a ``map`` transformation that maps that RDD to a new mapped RDD,\n* and finally, perform a ``reduce`` action to sum the elements in the RDD.\n\nThis last ``reduce`` action causes the ``parallelize``, the ``filter``, and the ``map`` transformations to actually be executed, and return a result back to the driver 
machine.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b11f87c4-ba8b-42dd-a7ff-296f057e026f"},{"version":"CommandV1","origId":8849,"guid":"cb9720ad-34f1-4d2d-8ef2-c164706ddfc3","subtype":"command","commandType":"auto","position":36.0,"command":"sc.parallelize(Seq(1, 2, 3, 4)) // <Ctrl+Enter> will return Int = 12\n .filter(x => x%2==0) // (2, 4) is the filtered RDD\n .map(x => x*2) // (4, 8) is the mapped RDD\n .reduce(_+_) // 4+8=12 is the final result from reduce","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res27: Int = 
12\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130774412E12,"submitTime":1.456130767121E12,"finishTime":1.456130774542E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a8842505-366b-4c7c-ab97-9f639fcfa98d"},{"version":"CommandV1","origId":8850,"guid":"b312503f-97ed-4587-b17f-b5196b0b768e","subtype":"command","commandType":"auto","position":37.0,"command":"%md\n##### 6. Transform the RDD by ``distinct`` to make another RDD\n\nLet's declare another RDD named ``rdd2`` that has some repeated elements to apply the ``distinct`` transformation to it. 
\nThat would give us a new RDD that only contains the distinct elements of the input RDD.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e8bdf850-2b0e-4205-a739-f657dfb0f0bf"},{"version":"CommandV1","origId":8851,"guid":"954cda8f-618b-408b-8abb-285b4c76bec5","subtype":"command","commandType":"auto","position":38.0,"command":"val rdd2 = sc.parallelize(Seq(4, 1, 3, 2, 2, 2, 3, 4)) // <Ctrl+Enter> to declare rdd2","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1061] at parallelize at 
<console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130847209E12,"submitTime":1.456130839916E12,"finishTime":1.456130847276E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"258c7ab1-3961-43ea-a9b4-33ea2c68cb6c"},{"version":"CommandV1","origId":8852,"guid":"0675cfaa-78e3-499a-993e-77d444ade745","subtype":"command","commandType":"auto","position":39.0,"command":"%md\nLet's apply the ``distinct`` transformation to ``rdd2`` and have it return a new RDD named ``rdd2Distinct`` that contains the distinct elements of the source RDD ``rdd2``.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"65c765fb-b2b1-4140-8702-cc72e03aac2c"},{"version":"CommandV1","origId":8853,"guid":"86c101dd-0aee-440d-abd3-8e92de2a5641","subtype":"command","commandType":"auto","position":40.0,"command":"val rdd2Distinct = rdd2.distinct() // <Ctrl+Enter> transformation: distinct gives distinct elements of 
rdd2","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd2Distinct: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1070] at distinct at <console>:36\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130877655E12,"submitTime":1.456130870368E12,"finishTime":1.456130877728E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"58420a4c-2338-4aef-accc-7dd6571a0625"},{"version":"CommandV1","origId":8854,"guid":"f8e79768-06b0-4b7d-912b-05fe618ebd1e","subtype":"command","commandType":"auto","position":41.0,"command":"rdd2Distinct.collect() // <Ctrl+Enter> to collect (action) as Array(4, 2, 1, 3)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res28: Array[Int] = Array(4, 2, 1, 
3)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456130889346E12,"submitTime":1.456130882051E12,"finishTime":1.456130889596E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f0a98f20-6fb4-4e3e-9a8d-67610e619f5b"},{"version":"CommandV1","origId":8855,"guid":"7c173a65-4df8-497b-babc-a067aac11f7d","subtype":"command","commandType":"auto","position":42.0,"command":"%md\n##### 7. more flatMap","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6a6b28bb-0ae9-4622-af0c-f98aa81283b1"},{"version":"CommandV1","origId":8860,"guid":"7c5f7a88-98df-45f1-aae9-a65686295154","subtype":"command","commandType":"auto","position":43.0,"command":"val rdd = sc. 
parallelize(Array(1,2,3)) // <Shift+Enter> to create an RDD of three Int elements 1,2,3","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1364] at parallelize at <console>:34\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456132305817E12,"submitTime":1.456132298489E12,"finishTime":1.456132305946E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b89f0557-8add-4ccd-8d24-1757c1c89868"},{"version":"CommandV1","origId":8861,"guid":"122364cf-570c-41f5-897a-0157c67225ab","subtype":"command","commandType":"auto","position":44.0,"command":"%md\nLet us pass the ``rdd`` above to a map with a closure that will take in each element ``x`` and return ``Array(x, x+5)``.\nSo each element of the mapped RDD named ``rddOfArrays`` is an `Array[Int]`, an array of 
integers.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e7aaebeb-8433-4269-8d45-d93a58133187"},{"version":"CommandV1","origId":8862,"guid":"02e59cbb-3843-4cb3-886c-38eec7ce71cb","subtype":"command","commandType":"auto","position":45.0,"command":"// <Shift+Enter> to make RDD of Arrays, i.e., RDD[Array[Int]]\nval rddOfArrays = rdd.map( x => Array(x, x+5) ) ","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rddOfArrays: org.apache.spark.rdd.RDD[Array[Int]] = MapPartitionsRDD[1371] at map at <console>:37\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456132336667E12,"submitTime":1.456132329341E12,"finishTime":1.456132336741E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8c524f0c-52d0-4600-9007-362ee0109662"},{"version":"CommandV1","origId":8863,"guid":"f74ba48e-f86f-41fd-a2f8-8be07c1fb5ee","subtype":"command","commandType":"auto","position":46.0,"command":"rddOfArrays.collect() // 
<Ctrl+Enter> to see it is RDD[Array[Int]] = (Array(1, 6), Array(2, 7), Array(3, 8))","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res29: Array[Array[Int]] = Array(Array(1, 6), Array(2, 7), Array(3, 8))\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456132351644E12,"submitTime":1.45613234432E12,"finishTime":1.456132351745E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c3999210-06df-4f96-8a43-53c9ee417835"},{"version":"CommandV1","origId":8864,"guid":"076b583c-ba31-4a62-8c12-c9c6553193c4","subtype":"command","commandType":"auto","position":47.0,"command":"%md\nNow let's observe what happens when we use ``flatMap`` to transform the same ``rdd`` and create another RDD called ``rddfM``.\n\nInterestingly, ``flatMap`` *flattens* our ``rdd`` by taking each ``Array`` (or sequence in general) and turning it into individual elements.\n\nThus, we end up with the RDD ``rddfM`` consisting of the elements (1, 6, 2, 7, 3, 8) as shown from the output of ``rddfM.collect`` 
below.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"09e7bcce-9703-411c-9ab0-7acc95474ef6"},{"version":"CommandV1","origId":8866,"guid":"4bdd5081-06f2-4001-96d5-9a7e9a2862cb","subtype":"command","commandType":"auto","position":48.0,"command":"val rddfM = rdd.flatMap(x => Array(x, x+5)) // <Shift+Enter> to flatMap the rdd using closure (x => Array(x, x+5))","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">rddfM: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1390] at flatMap at <console>:36\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456132427993E12,"submitTime":1.456132420662E12,"finishTime":1.45613242812E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b28c7b53-1f1c-4806-b48a-fdee4377e77c"},{"version":"CommandV1","origId":8867,"guid":"18701998-cef7-4350-95b4-1c2e64688008","subtype":"command","commandType":"auto","position":49.0,"command":"rddfM.collect // <Ctrl+Enter> to collect 
rddfM = (1, 6, 2, 7, 3, 8)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res30: Array[Int] = Array(1, 6, 2, 7, 3, 8)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.456132438437E12,"submitTime":1.456132431109E12,"finishTime":1.456132438517E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"r.sainudiin@math.canterbury.ac.nz","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c9a03f2e-d55e-4b78-a4d8-ecd267e5718c"},{"version":"CommandV1","origId":8868,"guid":"ac19c015-ea5b-4161-bacf-f9113b000145","subtype":"command","commandType":"auto","position":50.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand 
\n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a9b100d5-1e0d-452b-bd45-9cf0ec59292d"}],"dashboards":[],"guid":"255e29ac-b7c6-4b9b-9ea5-f4d85ddcce69","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script>
<script src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js" onerror="window.mainJsLoadError = true;"></script>
</head>
<body>
<script>
  // Fallback notice shown when the notebook-main.js bundle could not be loaded.
  // The <script src> tag above sets window.mainJsLoadError from its onerror
  // handler; this block reads that flag and appends an error panel to <body>.
  //
  // Wrapped in an IIFE so the working variables stay local. The previous
  // top-level `var u / b / c` declarations were hoisted onto `window` even
  // when the bundle loaded successfully.
  (function () {
    if (!window.mainJsLoadError) {
      return; // bundle loaded; nothing to report
    }

    // Same URL as the <script src> above, shown so the user knows which resource failed.
    var resourceUrl = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js';

    var notice = document.createElement('div');
    // innerHTML is acceptable here: every fragment, including resourceUrl,
    // is a hard-coded constant, not user-supplied data.
    notice.innerHTML = ('<h1>Network Error</h1>' +
      '<p><b>Please check your network connection and try again.</b></p>' +
      '<p>Could not load a required resource: ' + resourceUrl + '</p>');

    // Inline styling: main.css may be unavailable when the network is down.
    // NOTE(review): that rationale is assumed, not shown in this file.
    notice.style.margin = '30px';
    notice.style.padding = '20px 50px';
    notice.style.backgroundColor = '#f5f5f5';
    notice.style.borderRadius = '5px';

    // This script runs inside <body>, so document.body is already available.
    document.body.appendChild(notice);
  }());
</script>
</body>
</html>