<!DOCTYPE html> <html> <head> <meta name="databricks-html-version" content="1"> <title>049_Introduction_HighOrderSpectralClustering - Databricks</title> <meta charset="utf-8"> <meta name="google" content="notranslate"> <meta http-equiv="Content-Language" content="en"> <meta http-equiv="Content-Type" content="text/html; charset=UTF8"> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/jquery-ui-bundle/jquery-ui.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/main.css"> <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/print.css" media="print"> <link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/img/favicon.ico"/> <script>window.settings = {"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-sydney.s3.amazonaws.com","enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":false,"nameAndEmail":"Raazesh Sainudiin 
(r.sainudiin@math.canterbury.ac.nz)","enablePresentationTimerConfig":true,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). 
Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. 
Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0","packageLabel":"spark-1.3-jenkins-ip-10-30-9-162-U0c2673ac85-Sa2ee4664b2-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1","packageLabel":"spark-1.4-jenkins-ip-10-30-9-162-U0c2673ac85-S33a1e4b9c6-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.0","packageLabel":"spark-1.6-jenkins-ip-10-30-9-162-U0c2673ac85-Scabba801f3-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"master","displayName":"Spark master (dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"defaultNumWorkers":8,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.12.3","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":false,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.5.x-ubuntu15.10","displayName":"Spark 
1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},"clusterPublisherRootId":5,"enableLatestJobRunResultPermalink":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"4c37e4ff-a908-4b05-8c61-2b76819fa34c","useFixedStaticNotebookVersionForDevelopment":false,"enableBasicReactDialogBoxes":true,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableWorkspaceAclService":true,"someName":"Raazesh Sainudiin","enableWorkspaceAcls":true,"gitHash":"0c2673ac858e227cad536fdb45d140aeded238db","userFullname":"Raazesh Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var 
__DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":123693,"name":"049_Introduction_HighOrderSpectralClustering","language":"scala","commands":[{"version":"CommandV1","origId":123695,"guid":"d7b9e73c-a403-4258-ae93-6c62216e03dc","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n## Course Project - High Order Spectral Clustering\n### by Xin Zhao\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"4f6c5912-d721-4689-babb-d62ffe532647"},{"version":"CommandV1","origId":140756,"guid":"78cfaca3-63aa-4aba-87b4-1a27d5357bf4","subtype":"command","commandType":"auto","position":1.5,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/studentProjects/04_XinZhao/049_Introduction_HighOrderSpectralClustering.html) of this databricks notebook and its recorded Uji 
:\n\n[](https://www.youtube.com/v/zJirlHAV6YU?rel=0&autoplay=1&modestbranding=1&start=3113&end=4018)\n","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"c11c3be9-b197-4fb0-9f1d-eb88586a1fd1"},{"version":"CommandV1","origId":123696,"guid":"74866b0d-e058-4347-924a-f89754ac9bbd","subtype":"command","commandType":"auto","position":2.5,"command":"%md\n### 1. Spectral clustering\n#### 1.1 What is spectral clustering\nIn multivariate statistics and the clustering of data, spectral clustering techniques make use of the spectrum (eigenvalues) of the **similarity matrix** of the data to perform dimensionality reduction before clustering in fewer dimensions. \n\nThe similarity matrix is provided as an input and consists of a quantitative assessment of the relative similarity of each pair of points in the dataset. Such measure is used to transform data to overcome difficulties related to lack of convexity in the shape of the data distribution. The measure gives rise to an (n, n)-sized similarity matrix for a set of (n,d) points, where the entry (i,j) in the matrix can be simply the Euclidean distance between i and j, or it can be a more complex measure of distance such as the **Gaussian**. 
Further modifying this result with network analysis techniques is also common.\n\nIn mathematics, spectral graph theory is the study of properties of a graph in relationship to the characteristic polynomial, **eigenvalues, and eigenvectors** of matrices associated to the graph, such as its adjacency matrix or **Laplacian** matrix.\n\n\n<img src=\"http://image.slidesharecdn.com/11clusadvanced-140913212136-phpapp02/95/data-mining-concepts-and-techniques-chapter-11review-basic-cluster-analysis-methods-11-clusadvanced-45-638.jpg?cb=1410644502\" width=\"800\">\n#### 1.2 Spectral clustering with different clustering methods\nThe following two examples show two different spectral clustering methods, which differ in the clustering algorithm they use:\n\n \n\n#### 1.3 Compare various clustering methods\n<img src=\"http://opensource.datacratic.com/mtlpy50/clustering.png\" width=\"1200\">\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":1.463717738167E12,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a2127295-aa10-4918-88ff-192e949404ce"},{"version":"CommandV1","origId":123697,"guid":"68b683cc-2005-48ca-8127-79db625173a4","subtype":"command","commandType":"auto","position":2.625,"command":"%md\n### 2. 
High Order Spectral Clustering\n\n#### 2.1 Why high order\n>- kernel principal component analysis (kernel PCA) is not enough for multivariate analysis\n\n>- linkage to the original features\n\n#### 2.2 How\n>- high order spectral cluster diagrams \n\n>- **Data --> Graph/Similarity Tensor --> Compute leading K (vector) svd and core tensor --> Clustering on the new latent tensor --> Project clusters back to original data**\n\n#### 2.3 Tensor\n>- What is a tensor\n\n<img src=\"http://i.stack.imgur.com/K4Cg9.png\" width=\"800\">\n\n>- High order SVD\n\n<img src=\"https://www.microsoft.com/en-us/research/wp-content/uploads/2016/03/trajectorycomputing-tensors.png\" width=\"1200\">\n\n>- Tensor unfolding\n\n<img src=\"https://s3-eu-west-1.amazonaws.com/ppreviews-plos-725668748/1522614/preview.jpg\" width=\"400\">\n<img src=\"http://i.stack.imgur.com/K4Cg9.png\" width=\"600\">\n\nPlease refer to [De Lathauwer, L., De Moor, B. and Vandewalle, J. (2000)](http://www.sandia.gov/~tgkolda/tdw2004/ldl-94-31.pdf) for details on the high-order singular value decomposition.\n\n#### Disadvantages\n>- The data size increases from (n,d) to (n,n,d), which is very expensive in both memory and computation. 
E.g., a (100k,100k) matrix of doubles requires about 80 GB\n>- Not suitable for a large number of clusters\n>- Difficult to do out-of-sample embedding.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2ecdfba3-e543-4e01-983c-9cc83dee0504"},{"version":"CommandV1","origId":123698,"guid":"b3e5a453-36a2-4799-aed9-fd43a4f5d28b","subtype":"command","commandType":"auto","position":2.6875,"command":"%md\n### High Order Spectral Clustering Package\nIn this project, a high order spectral clustering package is developed and attached as spectralclustering_01.jar. 
The package is implemented as:\n>- Implemented in Scala with Apache Spark\n>- Tensor implemented as both distributed and non-distributed frame\n>- The clustering method is K-means\n>- The current implementation can handle continuous and discrete features but not categorical features","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"4e306ab9-4300-4839-a658-8d7d2f07c03c"},{"version":"CommandV1","origId":123699,"guid":"b73ee9e1-cce6-426e-a50a-e04e66a33f94","subtype":"command","commandType":"auto","position":2.75,"command":"%md\n#### Sample code\n>- The following is just some sample source code of the library \"high order spectral clustering\"\n\n```\npackage tensorSVD\n\nimport org.apache.spark.SparkContext\nimport org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix\nimport org.apache.spark.rdd.RDD\n\n/**\n * Define RDD Tensor Class\n * @constructor create rdd tensor class which contains dimension, values in Rdd and others\n * @param dim1 first dimension\n * @param dim2 second dimension\n * @param dim3 third dimension, keep the smallest dimension as the third\n */\nclass RddTensor(dim1: Int, dim2: Int, dim3: Int){\n //tensor vale is defined as a RDD of Tuple3(dim3Index: Int, rowIndex: Int, rowValue: Array[Double])\n //Note: Keep dim3 as the smallest dimension\n // Keep dim1 as the most important dimension such as along sample point\n\n //Second constructor\n def this(dims: Array[Int])=this(dims(0),dims(1),dims(2))\n\n val 
dim=Array(dim1,dim2,dim3)\n //class field\n //val dim=(dim1,dim2,dim3) //dimension of tensor\n var nonemptyDim3 = 0\n var values: RDD[(Int,Int,Array[Double])]=null\n var isSymmetric=true //if every matrix along dim3 A(:,:,i) is symetrix\n\n\n /**\n * The method add value into the rdd Tensor. The input must be dim1*dim2 values (as RddMatrix)\n * @param newRddmat the new value matrix to be added in\n * @param dim3Index the third dimension index\n */\n\n def addDim3(newRddmat: RddMatrix, dim3Index: Int) {\n if(dim3Index>=this.dim(2)){\n throw new IllegalArgumentException(\"arg 2 need be in range of 0 to dim3 of the tensor class\")\n }\n if(newRddmat.cols!=this.dim(1) || newRddmat.rows!=this.dim(0)){\n throw new IllegalArgumentException(\"the adding matrix dimension is not match the tensor\")\n }\n if(this.nonemptyDim3==0){\n this.values=newRddmat.values.map(v=>(dim3Index,v._1,v._2))\n }else{\n this.values=this.values.union(newRddmat.values.map(v=>(dim3Index,v._1,v._2)))\n }\n this.nonemptyDim3 +=1\n }\n\n /**\n * The method unfold the rdd tensor along the required order (0,1 or 2)\n * @param dimi the unfold direction/order (0,1 or 2)\n * @param sc the spark context\n * @return the result matrix (RddMatrix)\n */\n def unfoldTensor(dimi: Int, sc: SparkContext): RddMatrix = {\n val dims=this.dim\n var outmat:RddMatrix = null\n\n if(dimi==0){\n val rmat:RddMatrix = new RddMatrix(this.dim(dimi),0)\n rmat.setValues(this.values.map(v=>(v._2,(v._1,v._3))).groupByKey()\n .map(x=>(x._1,x._2.toArray.sortBy(_._1))).map(b=>(b._1,b._2.flatMap(bi=>bi._2))))\n outmat=rmat\n }\n if(dimi==1){\n val rmat:RddMatrix = new RddMatrix(this.dim(dimi),0)\n if(this.isSymmetric){\n val dd=2\n val temp1=Range(0,2).toArray\n val temp2=Range(0,3).toArray\n val indx: IndexedSeq[Int]=temp1.map(v=>(v,temp2)).flatMap(v=>v._2.map(_*dd+v._1))\n rmat.setValues(this.values.map(v=>(v._2,(v._1,v._3))).groupByKey()\n .map(x=>(x._1,x._2.toArray.sortBy(_._1)))\n 
.map(b=>(b._1,b._2.flatMap(bi=>bi._2.zipWithIndex)))\n .map(v=>(v._1,v._2.sortBy(_._2).map(vi=>vi._1))))\n outmat=rmat\n }else{\n throw new Exception(\"Folding for dim2 not apply to asymmetric tensor in dim1 by dim2\")\n }\n }\n if(dimi==2){\n val rmat:RddMatrix = new RddMatrix(this.dim(0)*this.dim(1),this.dim(dimi))\n //Note: as dim(2) is small, this returns the transpose of the unfold matrix\n val cc=this.dim(1)\n rmat.setValues(this.values.flatMap(v=>v._3.zipWithIndex.map(vi=>(v._1,v._2,vi._2,vi._1)))\n .map(b=>((b._2,b._3),(b._1,b._4))).groupByKey()\n .map(x=>(x._1._1*cc+x._1._2,x._2.toArray.sortBy(_._1).map(b=>b._2))))\n outmat=rmat\n\n }\n outmat\n }\n\n /**\n * The method fold a matrix back to rdd tensor for along the required order/dim\n * @param rddMat the input rdd matrix to be folded\n * @param dimi the folding order\n * @return the folded rdd tensor\n */\n def foldTensorDimOne(rddMat:IndexedRowMatrix, dimi:Int) :RddTensor = {\n if(dimi!=1){\n throw new IllegalArgumentException(\"The fold method of rddTensor only available along the first dimension\")\n }\n val ndim=this.dim.length\n val size=this.dim\n //val tempMat=rddMat.rows.map\n val result: RddTensor = null\n result\n }\n\n}\n\n...\n\n```\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3111199a-f11b-4945-a2f6-43158572387c"},{"version":"CommandV1","origId":123700,"guid":"3a49ab07-ad49-4b0b-a07d-ebbda8fb776a","subtype":"command","commandType":"auto","position":3.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n## Course Project - High Order Spectral Clustering\n### by Xin Zhao\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":1.463804508244E12,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"44bd8dd6-1ca6-4774-bc26-b02d611021d2"}],"dashboards":[],"guid":"d7d4c8ac-61be-49ea-9c15-99556ad2014f","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script> <script src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js" onerror="window.mainJsLoadError = true;"></script> </head> <body> <script> if (window.mainJsLoadError) { var u = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js'; var b = document.getElementsByTagName('body')[0]; var c = document.createElement('div'); c.innerHTML = ('<h1>Network Error</h1>' + '<p><b>Please check your network connection and 
try again.</b></p>' + '<p>Could not load a required resource: ' + u + '</p>'); c.style.margin = '30px'; c.style.padding = '20px 50px'; c.style.backgroundColor = '#f5f5f5'; c.style.borderRadius = '5px'; b.appendChild(c); } </script> </body> </html>