{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clustering and topic modelling of highlights\n",
    "\n",
    "Reads the highlight texts from `data/highlights.csv`, builds a TextAnalysis corpus with one document per highlight, clusters the documents with k-means on TF-IDF weights and extracts topics with LDA.\n",
    "\n",
    "The notebook is meant to be executed top to bottom on a fresh kernel (Restart & Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "using TextAnalysis, Languages, Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tunable parameters, collected in one place.\n",
    "SEED = 42             # seeds the global RNG before kmeans and lda so reruns are repeatable\n",
    "TEXT_COLUMN = \"text\"  # CSV column that holds the highlight text\n",
    "N_CLUSTERS = 5        # number of k-means clusters\n",
    "\n",
    "filename = joinpath(dirname(@__FILE__), \"data\", \"highlights.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "    get_headers(source)\n",
    "\n",
    "Split a table into its header row and its data rows.\n",
    "\n",
    "Returns `(headers, rows)`: the first row of `source` converted to a vector of\n",
    "`String`, and every remaining row as a matrix.\n",
    "\"\"\"\n",
    "function get_headers(source)\n",
    "    headers = [String(s) for s in source[1, :]]\n",
    "    rows = source[2:end, :]  # everything below the header row\n",
    "    headers, rows\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the raw table under its own name so this cell can be re-run safely.\n",
    "raw = readcsv(filename)\n",
    "headers, source = get_headers(raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the text column by name instead of relying on a fixed position.\n",
    "text_col = findfirst(headers .== TEXT_COLUMN)\n",
    "@assert(text_col in 1:length(headers), \"column $TEXT_COLUMN not found in $filename\")\n",
    "\n",
    "# One entry per highlight. Blank rows are dropped: a document without terms\n",
    "# adds nothing to the clustering and may lead to 0/0 term frequencies.\n",
    "sample = [string(t) for t in source[:, text_col] if !isempty(strip(string(t)))]\n",
    "length(sample)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One StringDocument per highlight. Joining all texts into a single document\n",
    "# would leave the corpus with one entry, which makes TF-IDF identically zero\n",
    "# and gives kmeans and lda nothing to separate.\n",
    "docs = [StringDocument(s) for s in sample]\n",
    "foreach(remove_punctuation!, docs)\n",
    "crps = Corpus(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "update_lexicon!(crps)\n",
    "lexicon(crps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "update_inverse_index!(crps)\n",
    "hash_function!(crps, TextHashFunction())\n",
    "\n",
    "crps[\"Handelskrieg\"]  # look the term up in the corpus (documents that contain it)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cluster the documents (k-means on TF-IDF)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = DocumentTermMatrix(crps)\n",
    "D = dtm(m, :dense)\n",
    "T = tf_idf(D)  # documents x terms\n",
    "\n",
    "# kmeans treats every column as one observation, so the matrix is flipped to\n",
    "# terms x documents in order to cluster the documents rather than the terms.\n",
    "srand(SEED)\n",
    "cl = kmeans(permutedims(T, (2, 1)), N_CLUSTERS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Topic model (LDA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 2             # number of topics\n",
    "iteration = 1000  # number of Gibbs sampling iterations\n",
    "alpha = 0.1       # hyperparameter\n",
    "beta = 0.1        # hyperparameter\n",
    "\n",
    "srand(SEED)\n",
    "# l is a k x word matrix; each value is the probability of occurrence of a word in a topic.\n",
    "l = lda(m, k, iteration, alpha, beta)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.1",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}