{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Clustering and collaborative filtering (via clustering) algorithms\n", "\n", "- [Importable source code (most up-to-date version)](https://github.com/sylvaticus/BetaML.jl/blob/master/src/clusters.jl) - [Julia Package](https://github.com/sylvaticus/BetaML.jl)\n", "- [Demonstrative static notebook](https://github.com/sylvaticus/BetaML.jl/blob/master/notebooks/Clustering.ipynb)\n", "- [Demonstrative live notebook](https://mybinder.org/v2/gh/sylvaticus/BetaML.jl/master?filepath=notebooks%2FClustering.ipynb) (temporary personal online computational environment on myBinder) - it can take a few minutes to start!\n", "- Theory based on [MITx 6.86x - Machine Learning with Python: from Linear Models to Deep Learning](https://github.com/sylvaticus/MITx_6.86x) ([Unit 4](https://github.com/sylvaticus/MITx_6.86x/blob/master/Unit%2004%20-%20Unsupervised%20Learning/Unit%2004%20-%20Unsupervised%20Learning.md))\n", "- New to Julia? [A concise Julia tutorial](https://github.com/sylvaticus/juliatutorial) - [Julia Quick Syntax Reference book](https://julia-book.com)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "#using Pkg\n", "#if ! 
haskey(Pkg.dependencies(), \"Distributions\")\n", "# Pkg.add(\"Distributions\")\n", "#end\n", "using LinearAlgebra\n", "using Random\n", "using Statistics\n", "using DelimitedFiles\n", "#using Distributions\n", "using BetaML.Clustering" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9×2 Array{Float64,2}:\n", " 1.0 10.5\n", " 1.5 10.8\n", " 1.8 8.0\n", " 1.7 15.0\n", " 3.2 40.0\n", " 3.6 32.0\n", " 3.3 38.0\n", " 5.1 -2.3\n", " 5.2 -2.4" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "K = 3\n", "X = [1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2×2 Array{Float64,2}:\n", " 1.0 8.0\n", " 3.6 40.0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Z₀ = initRepresentatives([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.6 38],2,initStrategy=\"grid\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.15 -2.3499999999999996; 1.5 11.075; 3.366666666666667 36.666666666666664])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(clIdx,Z) = kmeans([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.1 -2.3; 1.5 10.8; 3.3 38.0])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(clIdx,Z) = kmedoids([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,dist = (x,y) -> norm(x-y)^2,initStrategy=\"grid\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ "Iter. 1:\tVar. of the post 2.163665445135426 \t Log-likelihood -57.57647125026064\n", "Iter. 2:\tVar. of the post 0.8859993363738548 \t Log-likelihood -55.411635885609805\n", "Iter. 3:\tVar. of the post 0.4631675430495836 \t Log-likelihood -52.363399702650966\n", "Iter. 4:\tVar. of the post 0.9539686982943021 \t Log-likelihood -48.45986544166918\n", "Iter. 5:\tVar. of the post 0.2801904268692464 \t Log-likelihood -33.125175201316154\n", "Iter. 6:\tVar. of the post 7.287993222852859e-22 \t Log-likelihood -30.93024764051858\n", "Iter. 7:\tVar. of the post 0.0 \t Log-likelihood -30.93024764051858\n" ] }, { "data": { "text/plain": [ "9×3 Array{Float64,2}:\n", " 1.0 7.7918e-78 5.95934e-37\n", " 1.0 3.90859e-62 1.48106e-28\n", " 1.0 3.2796e-51 2.19577e-26\n", " 1.0 6.51017e-58 2.83748e-21\n", " 1.51765e-28 8.63292e-51 1.0\n", " 2.45605e-27 7.29549e-33 1.0\n", " 3.04396e-28 2.61747e-46 1.0\n", " 4.63159e-60 1.0 4.03358e-42\n", " 2.78564e-63 1.0 8.12962e-44" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clusters = em([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,verbosity=FULL)\n", "clusters.pₙₖ" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5-element Array{Int64,1}:\n", " 0\n", " 1\n", " 2\n", " 3\n", " 4" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cd(@__DIR__)\n", "K = [1,12]\n", "seeds = [0,1,2,3,4]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Working with (k,seed) = (1, 0)\n", "[INFO] Working with (k,seed) = (1, 1)\n", "[INFO] Working with (k,seed) = (1, 2)\n", "[INFO] Working with (k,seed) = (1, 3)\n", "[INFO] Working with (k,seed) = (1, 4)\n", "Upper logLikelihood with 1 clusters: -1307.2234317600933 (seed 0)\n", "[INFO] Working with (k,seed) = (12, 0)\n", 
"[INFO] Working with (k,seed) = (12, 1)\n", "[INFO] Working with (k,seed) = (12, 2)\n", "[INFO] Working with (k,seed) = (12, 3)\n", "[INFO] Working with (k,seed) = (12, 4)\n", "Upper logLikelihood with 12 clusters: -1118.6190434326675 (seed 2)\n" ] } ], "source": [ "# Test data\n", "baseDir = \"assets/netflix/toy_data/\"\n", "X = readdlm(joinpath(baseDir,\"toy_data.txt\"))\n", "X = map(x -> x == 0 ? missing : x, X)\n", "(n,d) = size(X)\n", "for k in K\n", " ulL = -Inf\n", " bestSeed = -1\n", " bestOut = nothing\n", " for s in seeds\n", " println(\"[INFO] Working with (k,seed) = ($(k), $(s))\")\n", " μ₀ = readdlm(joinpath(baseDir,\"init_mu_$(k)_$(s).csv\"), ' ')\n", " σ²₀ = dropdims(readdlm(joinpath(baseDir,\"init_var_$(k)_$(s).csv\"), ' '),dims=2)\n", " p₀ = dropdims(readdlm(joinpath(baseDir,\"init_p_$(k)_$(s).csv\"), ' '),dims=2)\n", " emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)\n", " lL = emOut.lL\n", " if lL > ulL\n", " ulL = lL\n", " bestSeed = s\n", " bestOut = emOut\n", " end\n", " end\n", " println(\"Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))\")\n", "end" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1200×1200 Array{Union{Missing, Int64},2}:\n", " 2 4 5 missing … 4 4 4\n", " 3 5 5 3 5 3 4\n", " 2 missing 4 3 4 4 3\n", " 4 3 4 4 4 4 4\n", " 2 2 5 4 4 missing missing\n", " 3 missing missing 4 … 2 5 4\n", " 1 4 5 4 4 5 4\n", " 2 missing 5 4 5 3 3\n", " 3 5 missing 5 2 5 5\n", " missing missing missing missing missing 5 5\n", " 3 missing 5 3 … missing 3 missing\n", " 2 4 5 missing 4 3 missing\n", " missing 4 5 missing 3 4 3\n", " ⋮ ⋱ \n", " missing missing 5 3 missing 5 4\n", " 3 2 5 missing 5 missing missing\n", " missing 4 5 4 … 3 5 4\n", " missing 2 5 3 4 4 missing\n", " 3 4 5 3 3 4 3\n", " missing missing 4 4 missing 4 4\n", " 3 2 5 4 3 3 2\n", " 2 4 5 3 … 5 5 3\n", " missing 4 5 missing 3 3 3\n", " 2 3 5 4 
3 4 2\n", " 3 4 missing 4 4 5 4\n", " 3 missing 5 4 5 4 missing" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Full NetFlix dataset.. may take time !!!!\n", "baseDir = \"assets/netflix/full/\"\n", "X = convert(Array{Int64,2},readdlm(joinpath(baseDir,\"netflix_incomplete.txt\")))\n", "(n,d) = size(X)\n", "X = map(x -> x == 0 ? missing : x, X)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Working with (k,seed) = (1, 0)\n", "[INFO] Working with (k,seed) = (1, 1)\n", "[INFO] Working with (k,seed) = (1, 2)\n", "[INFO] Working with (k,seed) = (1, 3)\n", "[INFO] Working with (k,seed) = (1, 4)\n", "Upper logLikelihood with 1 clusters: -1.5210609539852452e6 (seed 0)\n", "[INFO] Working with (k,seed) = (12, 0)\n", "[INFO] Working with (k,seed) = (12, 1)\n", "[INFO] Working with (k,seed) = (12, 2)\n", "[INFO] Working with (k,seed) = (12, 3)\n", "[INFO] Working with (k,seed) = (12, 4)\n", "Upper logLikelihood with 12 clusters: -1.3902809991574623e6 (seed 1)\n" ] } ], "source": [ "for k in K\n", " ulL = -Inf\n", " bestSeed = -1\n", " bestOut = nothing\n", " for s in seeds\n", " println(\"[INFO] Working with (k,seed) = ($(k), $(s))\")\n", " μ₀ = readdlm(joinpath(baseDir,\"init_mu_$(k)_$(s).csv\"), ' ')\n", " σ²₀ = dropdims(readdlm(joinpath(baseDir,\"init_var_$(k)_$(s).csv\"), ' '),dims=2)\n", " p₀ = dropdims(readdlm(joinpath(baseDir,\"init_p_$(k)_$(s).csv\"), ' '),dims=2)\n", " emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)\n", " lL = emOut.lL\n", " if lL > ulL\n", " ulL = lL\n", " bestSeed = s\n", " bestOut = emOut\n", " end\n", " end\n", " println(\"Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))\")\n", "end" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9×2 Array{Union{Missing, 
Float64},2}:\n", " 1.0 10.5\n", " 1.5 missing\n", " 1.8 8.0\n", " 1.7 15.0\n", " 3.2 40.0\n", " missing missing\n", " 3.3 38.0\n", " missing -2.3\n", " 5.2 -2.4" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = [1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter. 1:\tVar. of the post 2.658995979045184 \t Log-likelihood -46.60839895825246\n", "Iter. 2:\tVar. of the post 0.5972000744973046 \t Log-likelihood -34.3316693714349\n", "Iter. 3:\tVar. of the post 0.3033156311165382 \t Log-likelihood -32.64175983202529\n", "Iter. 4:\tVar. of the post 0.3218644885234808 \t Log-likelihood -29.812356340540394\n", "Iter. 5:\tVar. of the post 0.044179093958920966 \t Log-likelihood -27.683492280198745\n", "Iter. 6:\tVar. of the post 0.008700550767783852 \t Log-likelihood -27.681894241887314\n", "Iter. 7:\tVar. of the post 0.0033482623120286137 \t Log-likelihood -27.681809792362692\n", "Iter. 8:\tVar. 
of the post 0.0015987062820508016 \t Log-likelihood -27.681790513684003\n" ] }, { "data": { "text/plain": [ "9×2 Array{Union{Missing, Float64},2}:\n", " 1.0 10.5\n", " 1.5 14.1872\n", " 1.8 8.0\n", " 1.7 15.0\n", " 3.2 40.0\n", " 2.86281 15.1282\n", " 3.3 38.0\n", " 5.2 -2.3\n", " 5.2 -2.4" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cFOut = predictMissing(X,3,mixtures=[SphericalGaussian() for i in 1:3],verbosity=FULL,minVariance=0.25)\n", "cFOut.X̂" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "Julia 1.4.1", "language": "julia", "name": "julia-1.4" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.4.1" } }, "nbformat": 4, "nbformat_minor": 4 }