##################################
# Scotch
##################################
# Finds the scotches most similar to one reference scotch, using the Jaccard
# distance between rows of the characteristics table in scotch.csv.

library(proxy)

download.file(
  "https://raw.githubusercontent.com/mlakolar/BUS41000/master/data/scotch.csv",
  destfile = "scotch.csv"
)

# There are two header lines in the file. To deal with this, we follow
# http://stackoverflow.com/questions/17797840/reading-two-line-headers-in-r
# i.e. read each header line separately, then read the body without a header.
header <- scan("scotch.csv", nlines = 1, what = character(), sep = ",")
header2 <- scan("scotch.csv", skip = 1, nlines = 1, what = character(),
                sep = ",")
scotch_df <- read.csv("scotch.csv", skip = 2, header = FALSE, row.names = 1)

# The first field of each header line belongs to the row-name column, so it is
# dropped; the remaining pairs are joined into a single "top/bottom" name.
names(scotch_df) <- paste0(header[-1], "/", header2[-1])

# Drop columns that do not have characteristics.
scotch_df <- scotch_df[, 1:68]
head(scotch_df)

# Find the row corresponding to Ardbeg.
# NOTE(review): the lookup key is spelled "Ardberg" while the comments say
# "Ardbeg" -- presumably this matches the spelling used in the CSV; confirm.
# The guard below fails loudly if the key does not match exactly one row.
indA <- which(rownames(scotch_df) == "Ardberg")
if (length(indA) != 1L) {
  stop(
    "Expected exactly one row named \"Ardberg\" in scotch.csv, found ",
    length(indA), ".",
    call. = FALSE
  )
}

# Compute the distance between "Ardbeg" and all other scotches using the
# Jaccard distance.
# See: https://en.wikipedia.org/wiki/Jaccard_index
distM <- proxy::dist(scotch_df[indA, ], y = scotch_df, method = "jaccard")

# Indices of the 6 smallest distances: the reference scotch itself (its
# distance to itself is 0) plus its 5 most similar scotches. head() avoids the
# NA indices that `[1:6]` would produce if the table had fewer than 6 rows.
nearest_idx <- head(order(distM), 6L)

# Names of the 5 most similar scotches (preceded by the reference itself).
rownames(scotch_df)[nearest_idx]

# Show the corresponding distances.
distM[nearest_idx]