Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":77572,"name":"019_DistLAlgForLinRegIntro","language":"scala","commands":[{"version":"CommandV1","origId":79563,"guid":"885693f6-394c-4c14-8d42-350d472dc6f4","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f4abb528-d607-4b42-b87f-4d2f9e9fc855"},{"version":"CommandV1","origId":129729,"guid":"3de27301-2a49-44f5-8b2e-87a91eca02c5","subtype":"command","commandType":"auto","position":0.625,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/week5/10_LinearRegressionIntro/019_DistLAlgForLinRegIntro.html) of this databricks notebook and its recorded Uji 
[Watch the recorded lecture segment](https://www.youtube.com/v/y6F-e6m1m2s?rel=0&autoplay=1&modestbranding=1&start=3919&end=4399)

**HOMEWORK:**
* read: [http://arxiv.org/pdf/1509.02256.pdf](http://arxiv.org/pdf/1509.02256.pdf) (also see References and Appendix A).
* and go through the notebooks here: [Data Types - MLlib Programming Guide](/#workspace/scalable-data-science/xtraResources/ProgGuides1_6/MLlibProgrammingGuide/dataTypes/000_dataTypesProgGuide)

#### Distributed Machine Learning: Computation and Storage by Ameet Talwalkar in BerkeleyX: CS190.1x Scalable Machine Learning
**(watch now/later? 8:48)**:
[Watch Part 1 of the lecture](https://www.youtube.com/v/r2EWm82rneY?rel=0&autoplay=1&modestbranding=1&start=1)

This is a transcript of Ameet's lecture in the video above:

**HOMEWORK:** Make sure you are understanding everything he is trying to convey!

**Recall** from week 1's lectures that we are weaving over Ameet's course here to get to advanced applications "in a hurry", and it is assumed you have already done the edX Big Data Series from 2015 or will be doing the 2016 version, ideally with certification :)

In this segment, we'll begin our discussion of **distributed machine learning principles related to computation and storage**. We'll use linear regression as a running example to illustrate these ideas. As we previously discussed, the size of datasets is rapidly growing, and this has led to scalability issues for many standard machine learning methods. For instance, consider the least squares regression model. The fact that a closed-form solution exists is an appealing property of this method. But what happens when we try to solve this closed-form solution at scale? In this segment, we'll focus on computational issues associated with linear regression, though I should note that these same ideas apply in the context of ridge regression, as it has a very similar computational profile.

So let's figure out the time and space complexity of solving for this closed-form solution. We'll start with time complexity, considering arithmetic operations as the basic units of computation when discussing big O complexity. Looking at the expression for w, if we perform each of the required operations separately, we see that computing X transpose X takes O of nd squared time, and inverting this resulting matrix takes O of d cubed time. Since matrix multiplication is an associative operation, we can multiply X transpose and y in O of nd time to get an intermediate d-dimensional vector, and then perform the multiplication between the inverse matrix and this intermediate d-dimensional vector in O of d squared time. Hence, in summary, the two computational bottlenecks in this process involve computing X transpose X, and subsequently computing its inverse. There are other methods to solve this equation that can be faster in practice, but they are only faster in terms of constants, and therefore they all have the same overall time complexity. In summary, we say that computing the closed-form solution for linear regression takes O of nd squared plus d cubed time.
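The expression for w referred to in the transcript (it is shown on the lecture slide, reproduced here for reference) is the least squares normal equation:

\\[
w = (X^\top X)^{-1} X^\top y, \qquad X \in \mathbb{R}^{n \times d}, \quad y \in \mathbb{R}^{n},
\\]

where \\(X\\) is the \\(n \times d\\) data matrix and \\(y\\) the \\(n\\)-vector of labels; the \\(O(nd^2)\\) and \\(O(d^3)\\) bottlenecks above come from forming \\(X^\top X\\) and inverting it.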
Now let's consider the space complexity, and recall that our basic unit of storage here is the storage required to store a single float, which is typically 8 bytes. In order to compute w, we must first store the data matrix, which requires O of nd floats. Additionally, we must compute X transpose X and its inverse in order to solve for w; each of these matrices is d by d and thus requires O of d squared floats. These are the two bottlenecks storage-wise, and thus our space complexity is O of nd plus d squared.

So now that we've considered the time and space complexity required to solve for the closed-form solution, let's consider what happens as our data grows large. The first situation we'll consider is one where n, or the number of observations, is large, while d, or the number of features, is relatively small. Specifically, we'll assume that we're in a setting where d is small enough such that O of d cubed computation and O of d squared storage is feasible on a single machine. In this scenario, the terms in our big O complexity involving n are the ones that dominate, and thus storing X and computing X transpose X are the bottlenecks. It turns out that in this scenario, we're well suited for a distributed computation. First, we can store the data points, or rows of X, across several machines, thus reducing the storage burden. Second, we can compute X transpose X in parallel across the machines by treating this multiplication as a sum of outer products.

To understand this alternative interpretation of matrix multiplication in terms of outer products, let's first recall our typical definition of matrix multiplication. We usually think about each entry of the output matrix being computed via an inner product between rows and columns of the input matrices. So, for instance, in the example on the slide, to compute the top left entry, we compute the inner product between the first row of the left input matrix and the first column of the right input matrix. Similarly, to compute the top right entry, we compute the inner product between the first row of the left input matrix and the second column of the right input matrix. We perform additional inner products to compute the two remaining entries of the output matrix.

There is, however, an alternative interpretation of matrix multiplication as the sum of outer products between corresponding columns and rows of the input matrices. Let's look at the same example from the last slide to get a better sense of what this means. First consider the first column of the left input matrix and the first row of the right input matrix. We can compute their outer product, with the result being the 2 by 2 matrix on the bottom of the slide. Next, we can consider the second column of the left input matrix and the second row of the right input matrix, and again compute their outer product, resulting in another 2 by 2 matrix. We can repeat this process a third time to generate a third outer product, or 2 by 2 matrix. The sum of these outer products matches the result we obtained in the previous slide using the traditional definition of matrix multiplication. And more generally, taking a sum of outer products of corresponding columns and rows of the input matrices always returns the desired matrix multiplication result.
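To see this numerically, here is a small check (an illustration added here, not part of the lecture) using Breeze, the linear algebra library that MLlib uses under the hood, with a 2-by-3 and a 3-by-2 matrix as in the slide's example:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// A is 2 x 3 and B is 3 x 2, so A * B is 2 x 2.
val A = DenseMatrix((1.0, 2.0, 3.0),
                    (4.0, 5.0, 6.0))
val B = DenseMatrix(( 7.0,  8.0),
                    ( 9.0, 10.0),
                    (11.0, 12.0))

// Usual definition: each entry is an inner product of a row of A and a column of B.
val usual = A * B

// Alternative definition: sum of outer products of column j of A with row j of B.
val outerSum = (0 until A.cols)
  .map(j => A(::, j) * B(j, ::))  // each term is a 2 x 2 matrix
  .reduce(_ + _)

assert(usual == outerSum)  // the two interpretations agree
```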
Now we can use this new interpretation of matrix multiplication to our benefit when distributing the computation of X transpose X for linear regression. Let's first represent X visually by its rows, or data points. Then we can express this matrix multiplication as a sum of outer products where each outer product involves only a single row of X, or a single data point. Let's see how we can use this insight to effectively distribute our computation.

Consider a toy example where we have a cluster of three workers and a data set with six data points. We can distribute the storage of our six data points across the three workers so that each worker is storing two data points. Now we can express matrix multiplication as a simple MapReduce operation. In the map step, we take each point and compute its outer product with itself. And in the subsequent reduce step, we simply sum over all of these outer products. We can then solve for the final linear regression model locally, which includes computing the inverse of this resulting matrix.

Now let's look at the storage and computation involved at each step. In the first step, we're not doing any computation, but we need to store the input data, which requires O of nd storage. This is a bottleneck in our setting since n is large. However, the storage can be distributed over several machines. Next, during the map step, we perform an outer product for each data point. Each outer product takes O of d squared time, and we have to compute n of these outer products. This is the computational bottleneck in our setting, but again, it is distributed across multiple workers. In terms of storage, we must store the outer products computed on each machine. Note that although we may be computing several outer products per machine, we can keep a running sum of these outer products, so the local storage required for each machine is O of d squared. Finally, in the reduce step, we must take the sum of these outer products, though the computational bottleneck is, in fact, inverting the resulting matrix, which is cubic in d. However, we're assuming that d is small enough for this computation to be feasible on a single machine. Similarly, the O of d squared storage required to store X transpose X and its inverse is also feasible on a single machine by assumption.

This entire process can be concisely summarized via the following Spark code snippet. In this code, train data is an RDD of rows of X. In the map step, we compute an outer product for each row. And in the reduce step, we sum these outer products and invert the resulting matrix. In the final reduce step, we can also perform the remaining steps required to obtain our final regression model.
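The snippet itself appears on the lecture slide rather than in this notebook. A minimal sketch of the same map/reduce pattern in Scala with Breeze (assuming a hypothetical `trainData`, an RDD of (feature vector, label) pairs, and d small enough to invert a d-by-d matrix on the driver) could look like this:

```scala
import breeze.linalg.{DenseMatrix, DenseVector, inv}
import org.apache.spark.rdd.RDD

// Closed-form least squares via a sum of per-row outer products.
// Map: each row x contributes x x^T and x * y.  Reduce: sum the contributions.
def closedFormLinReg(trainData: RDD[(DenseVector[Double], Double)]): DenseVector[Double] = {
  val (xtx, xty) = trainData
    .map { case (x, y) => (x * x.t, x * y) }                     // O(d^2) work per point
    .reduce { case ((m1, v1), (m2, v2)) => (m1 + m2, v1 + v2) }  // running d x d and d-vector sums
  inv(xtx) * xty  // w = (X^T X)^{-1} X^T y, computed locally on the driver: O(d^3)
}
```

Each map task emits only a d-by-d matrix and a d-vector, and the reduce keeps running sums of them, so the data communicated per worker is O(d squared) no matter how large n is.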
#### Distributed Machine Learning: Computation and Storage (Part 2) by Ameet Talwalkar in BerkeleyX: CS190.1x Scalable Machine Learning
**(watch now/later? 4:02)**:

[Watch Part 2 of the lecture](https://www.youtube.com/v/CYMZKbnDKsU?rel=0&autoplay=1&modestbranding=1&start=1)

This is a transcript of Ameet's lecture in the video above:

**Homework:** Make sure you are understanding everything he is trying to convey!

In this segment, we'll continue our discussion of distributed machine learning principles related to computation and storage. We'll focus on the problem when d, the number of features, grows large. In the previous segment, we discussed the big n, small d setting. In this setting, we can naturally use a distributed computing environment to solve for the linear regression closed-form solution. To do this, we store our data across multiple machines and we compute X transpose X as a sum of outer products. This strategy can be written as a simple MapReduce operation, expressed very concisely in Spark.

Now, let's consider what happens when d grows large. As before, storing X and computing X transpose X are bottlenecks. However, storing and operating on X transpose X is now also a bottleneck, and we can no longer use our previous strategy. So let's see what goes wrong. Here's what our strategy looks like in the small d setting, with data stored across workers, outer products computed in the map step, and the sum of these outer products performed in the reduce step. However, we can no longer perform d cubed operations locally or store d squared floats locally in our new setting. This issue leads to a more general rule of thumb, which is that when n and d are large, we need the computation and storage complexity to be at most linear in n and d.

So how do we devise methods that are linear in space and time complexity? One idea is to exploit sparsity. Sparse data is quite prevalent in practice. Some data is inherently sparse, such as rating information in collaborative filtering problems, social networks, or other graph data. Additionally, we often generate sparse features during a process of feature extraction, such as when we represent text documents via bag-of-words features or when we convert categorical features into numerical representations. Accounting for sparsity can lead to orders of magnitude of savings in terms of storage and computation.
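For example (an illustration added here, not from the lecture), MLlib's local vector types can store such sparse features compactly; a bag-of-words count vector over a 100,000-word vocabulary with only three non-zero entries needs to store just those three indices and values:

```scala
import org.apache.spark.mllib.linalg.Vectors

// A dense representation would store 100000 doubles (~800 KB per document);
// the sparse representation stores only the non-zero indices and their values.
val wordCounts = Vectors.sparse(100000, Array(3, 42, 99876), Array(1.0, 2.0, 5.0))

println(wordCounts.size)         // 100000 (logical dimension)
println(wordCounts.numNonzeros)  // 3 (actual stored entries)
```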
A second idea is to make a latent sparsity assumption, whereby we assume that our high-dimensional data can in fact be represented in a more succinct fashion, either exactly or approximately. For example, we can make a low-rank modeling assumption, where we might assume that our data matrix can in fact be represented by the product of two skinny matrices, where the skinny dimension r is much smaller than either n or d. Exploiting this assumption can also yield significant computational and storage gains.

A third option is to use different algorithms. For instance, instead of learning a linear regression model via the closed-form solution, we could alternatively use gradient descent. Gradient descent is an iterative algorithm that requires linear computation and storage at each iteration, thus making it attractive in the big n and big d setting.

So let's see how gradient descent stacks up against the closed-form solution in our toy example on a cluster with three machines. As before, we can store the data across the worker machines. Now in the map step, we require O of nd computation, and this computation is distributed across workers. And we also require O of d storage locally. In the reduce step, we require O of d local computation as well as O of d local storage. Moreover, unlike the closed-form case, we need to repeat this process several times, since gradient descent is an iterative algorithm. At this point, I haven't really told you how these question marks work, and in the next segment we'll talk about what actually is going on with gradient descent.
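To make the iteration pattern concrete, here is a minimal sketch of distributed gradient descent for least squares in Spark with Breeze (an illustration under the same assumed `trainData` RDD as above, not the course's reference implementation):

```scala
import breeze.linalg.DenseVector
import org.apache.spark.rdd.RDD

// Each iteration: the map computes a d-dimensional gradient term per point
// (O(nd) work in total, distributed over the workers, O(d) storage per point),
// the reduce sums d-vectors, and only w and the gradient are ever communicated.
def gradientDescentLinReg(
    trainData: RDD[(DenseVector[Double], Double)],
    d: Int,
    numIters: Int = 50,
    stepSize: Double = 0.1): DenseVector[Double] = {

  val n = trainData.count().toDouble
  var w = DenseVector.zeros[Double](d)

  for (i <- 1 to numIters) {
    val gradient = trainData
      .map { case (x, y) => x * (2.0 * ((w dot x) - y)) }  // per-point gradient of the squared error
      .reduce(_ + _)
    w = w - gradient * (stepSize / n)
  }
  w
}
```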
algorithms.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"68476bb9-fb44-41c3-95ca-99aa0efd8dea"},{"version":"CommandV1","origId":79502,"guid":"3d80b46b-7aa9-4624-a654-ec03d3803fc4","subtype":"command","commandType":"auto","position":7.0,"command":"%md\n#### Distributed Machine Learning: Communication Principles by Ameet Talwalkar in BerkeleyX: CS190.1x Scalable Machine Learning\n**(watch later 11:28)**:\n\n[](https://www.youtube.com/v/VT5F9tsV4hY?rel=0&autoplay=1&modestbranding=1&start=1)\n\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"cab93c52-9eb6-4488-85a0-35f3e3c51d2a"},{"version":"CommandV1","origId":79503,"guid":"cf141328-9a2b-40e5-9344-f41a4792ac4a","subtype":"command","commandType":"auto","position":8.0,"command":"%md\n#### Focusing on strategies to reduce communication costs.\n\n* access rates fall sharply with distance.\n* so this communication hierarchy needs to be accounted for when developing parallel and distributed algorithms.\n\n**Lessons:**\n* parallelism makes our computation faster\n* but network communication slows us down\n\n* BINGO: perform parallel and in-memory computation.\n* Persisting in memory is a particularly attractive option when working with iterative algorithms that\nread the same data multiple times, as is the case in gradient descent.\n\n* Several machine learning algorithms are iterative!\n\n* Limits of multi-core scaling (powerful multicore machine with several CPUs,\nand a huge amount of RAM).\n * advantageous: \n * sidestep any network communication when working with a single multicore machine\n * can indeed handle fairly large data sets, and they're an attractive option in many settings.\n * disadvantages: \n * can be quite expensive (due to specialized hardware),\n * not as widely accessible as commodity computing nodes.\n * this approach does have scalability limitations, as we'll eventually hit a wall when the data grows large enough! This is not the case for a distributed environment (like the AWS EC2 cloud under the hood here).\n\n\n\n#### Simple strategies for algorithms in a distributed setting: to reduce network communication, simply keep large objects local\n* In the big n, small d case for linear regression \n * we can solve the problem via a closed form solution.\n * And this requires us to communicate \\\\(O(d)^2\\\\) intermediate data.\n * the largest object in this example is our initial data, which we store in a distributed fashion and never communicate! 
#### Simple strategies for algorithms in a distributed setting: compute more and communicate less per iteration

**HOMEWORK:** watch the video and understand why it is important, at each iteration of an iterative algorithm, to compute more and communicate less.

**Recall** from week 1's lecture that the ideal mathematical preparation to fully digest this material requires a set of self-tutorials from Reza Zadeh's course in Distributed Algorithms and Optimization from Stanford:

* [http://stanford.edu/~rezab/dao/](http://stanford.edu/~rezab/dao/).

This is a minimal pre-requisite for designing new algorithms or improving
existing ones!!!