{ "cells": [ { "cell_type": "code", "execution_count": 196, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: using MLDataPattern.obsview in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.BatchView in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.RandomObs in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.batchsize in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.undersample in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.oversample in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.randobs in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.ObsView in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.RandomBatches in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.eachbatch in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.DataSubset in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.BufferGetObs in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.kfolds in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.splitobs in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.getobs! 
in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.batchview in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.shuffleobs in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.leaveout in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.datasubset in module Main conflicts with an existing identifier.\n", "WARNING: using MLDataPattern.eachobs in module Main conflicts with an existing identifier.\n" ] }, { "data": { "text/plain": [ "Plots.GRBackend()" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using SwiftObjectStores\n", "using ColoringNames\n", "using Distributions\n", "using MLDataPattern\n", "using Iterators\n", "using MLLabelUtils\n", "using StaticArrays\n", "using Juno\n", "using StatsBase\n", "using Colors\n", "using DataFrames\n", "using Query\n", "using Plots\n", "gr()" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mINFO: Recompiling stale cache file /home/ubuntu/.julia/lib/v0.5/MLLabelUtils.ji for module MLLabelUtils.\n", "\u001b[0m\u001b[1m\u001b[34mINFO: Recompiling stale cache file /home/ubuntu/.julia/lib/v0.5/MLDataPattern.ji for module MLDataPattern.\n", "\u001b[0mWARNING: Method definition getobs(Any) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:1 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:1.\n", "WARNING: Method definition nobs(Any) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:41 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:30.\n", "WARNING: Method definition #nobs(Array{Any, 1}, StatsBase.#nobs, Any) in module 
MLDataPattern overwritten in module MLDataUtils.\n", "WARNING: Method definition getobs(Any, Any) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:67 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:37.\n", "WARNING: Method definition getobs(Base.SubArray) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:92 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:385.\n", "WARNING: Method definition nobs(Tuple) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:173 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:445.\n", "WARNING: Method definition nobs(Tuple, Tuple) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:183 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:455.\n", "WARNING: Method definition getobs(Tuple, Any) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:198 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:462.\n", "WARNING: Method definition getobs(Tuple, Any, Tuple) in module MLDataPattern at /home/ubuntu/.julia/v0.5/MLDataPattern/src/container.jl:208 overwritten in module MLDataUtils at /home/ubuntu/.julia/v0.5/MLDataUtils/src/accesspattern/datasubset.jl:472.\n", "\u001b[1m\u001b[34mINFO: Recompiling stale cache file /home/ubuntu/.julia/lib/v0.5/SortingAlgorithms.ji for module SortingAlgorithms.\n", "\u001b[0m\u001b[1m\u001b[34mINFO: Recompiling stale cache file /home/ubuntu/.julia/lib/v0.5/DataFrames.ji for module DataFrames.\n", "\u001b[0mWARNING: Method definition describe(AbstractArray) in module StatsBase at /home/ubuntu/.julia/v0.5/StatsBase/src/scalarstats.jl:573 overwritten in module DataFrames at 
/home/ubuntu/.julia/v0.5/DataFrames/src/abstractdataframe/abstractdataframe.jl:407.\n", "WARNING: Method definition describe(AbstractArray) in module StatsBase at /home/ubuntu/.julia/v0.5/StatsBase/src/scalarstats.jl:573 overwritten in module DataFrames at /home/ubuntu/.julia/v0.5/DataFrames/src/abstractdataframe/abstractdataframe.jl:407.\n", "WARNING: Method definition require(Symbol) in module Base at loading.jl:345 overwritten in module Query at /home/ubuntu/.julia/v0.5/Requires/src/require.jl:12.\n" ] }, { "data": { "text/plain": [ "1523108-element Array{Any,1}:\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " \"acid green\"\n", " ⋮ \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" \n", " \"yuck\" " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n",
"# Observation-dimension markers (First and Last) taken from MLDataUtils.\n",
"const od =(MLDataUtils.ObsDim.First(), MLDataUtils.ObsDim.Last())\n",
"# Service handle that is passed to every get_file call below.\n",
"const serv=SwiftService()\n",
"\n",
"# Dev split: the fetched file is parsed as a tab-delimited table with readdlm.\n",
"# NOTE(review): `color` is presumably the container name and the last string the object path - confirm against get_file.\n",
"const valid_raw = get_file(fh->readdlm(fh,'\\t'), serv, \"color\", \"monroe/dev.csv\")\n",
"# prepare_data is destructured into three values; the third one (`encoding`) is fed into the train call below.\n",
"# NOTE(review): the keyword is spelled `do_demacate`; it has to match the definition of prepare_data, which is not visible in this notebook.\n",
"const valid_hsv, valid_terms_padded, encoding = prepare_data(valid_raw; do_demacate=false)\n",
"# First column of the raw dev table.\n",
"const valid_text = valid_raw[:, 1]\n",
"\n",
"# Train split: same loading steps as for the dev split.\n",
"const train_raw = get_file(fh->readdlm(fh,'\\t'), serv, \"color\", \"monroe/train.csv\")\n",
"# `encoding` from the dev call is passed in, and the name is then rebound (as const) to the third value returned here.\n",
"const train_hsv, train_terms_padded, encoding = prepare_data(train_raw, encoding; do_demacate=false)\n",
"# First column of the raw train table; the recorded output of this cell shows colour-name strings.\n",
"const train_text = train_raw[:, 1]\n",
"\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name\n", "Summary Stats:\n", "Length: 829\n", "Type: String\n", "Number Unique: 
829\n", "\n", "n_samples\n", "Summary Stats:\n", "Mean: 1837.283474\n", "Minimum: 70.000000\n", "1st Quartile: 109.000000\n", "Median: 214.000000\n", "3rd Quartile: 627.000000\n", "Maximum: 152953.000000\n", "Length: 829\n", "Type: Int64\n", "\n", "hs\n", "Summary Stats:\n", "Mean: 0.022951\n", "Minimum: -0.429653\n", "1st Quartile: -0.086345\n", "Median: 0.019934\n", "3rd Quartile: 0.121130\n", "Maximum: 0.577041\n", "Length: 829\n", "Type: Float64\n", "\n", "hv\n", "Summary Stats:\n", "Mean: 0.000171\n", "Minimum: -0.577392\n", "1st Quartile: -0.101860\n", "Median: 0.003642\n", "3rd Quartile: 0.103550\n", "Maximum: 0.497044\n", "Length: 829\n", "Type: Float64\n", "\n", "vs\n", "Summary Stats:\n", "Mean: -0.048629\n", "Minimum: -0.497048\n", "1st Quartile: -0.132881\n", "Median: -0.050129\n", "3rd Quartile: 0.031004\n", "Maximum: 0.455614\n", "Length: 829\n", "Type: Float64\n", "\n" ] } ], "source": [ "\n", "\n",
"# Per-label pairwise statistic over the first three columns of `hsvs`.\n",
"#   fun   - two-argument function applied to a pair of columns\n",
"#   names - labels; labelmap(names) yields, per label, the row indices used to slice `hsvs`\n",
"#   hsvs  - matrix indexed as [row, column]\n",
"# Returns a DataFrame with the columns name, n_samples, hs (columns 1,2), hv (columns 1,3), vs (columns 3,2).\n",
"function pairwise_stats(fun, names, hsvs)\n",
"    table = DataFrame(name=String[], n_samples=Int[], hs=Float64[], hv=Float64[], vs=Float64[])\n",
"    @progress for (label, rows) in labelmap(names)\n",
"        block = @view hsvs[rows, :]\n",
"        stat_12 = fun(block[:, 1], block[:, 2])\n",
"        stat_13 = fun(block[:, 1], block[:, 3])\n",
"        stat_32 = fun(block[:, 3], block[:, 2])\n",
"        push!(table, [label, length(rows), stat_12, stat_13, stat_32])\n",
"    end\n",
"    table\n",
"end\n",
"spearman = pairwise_stats(corspearman, train_text, train_hsv)\n",
"\n",
"describe(spearman)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [
"train_cols = vec(hsv2colorant(train_hsv))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pairwise_fields (generic function with 1 method)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# All pairs (a, b) of field names of type T in which a is declared before b, in declaration order.\n",
"function pairwise_fields(T)\n",
"    field_syms = fieldnames(T)\n",
"    n_fields = length(field_syms)\n",
"    combos = Tuple{Symbol,Symbol}[]\n",
"    for lo in 1:n_fields, hi in (lo+1):n_fields\n",
"        push!(combos, (field_syms[lo], field_syms[hi]))\n",
"    end\n",
"    combos\n",
"end" ] }, { "cell_type": "code", "execution_count": 235, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Method definition pairwise_stats(Any, Any, Array{#T<:Any, 1}) in module Main at In[233]:2 overwritten at In[235]:2.\n" ] }, { "data": { "text/plain": [ "pairwise_stats (generic function with 2 methods)" ] }, "execution_count": 235, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Per-label pairwise statistic over every pair of fields of the element type T.\n",
"#   fun   - two-argument function applied to two vectors of field values\n",
"#   names - labels; labelmap(names) yields, per label, the indices used to slice `data`\n",
"#   data  - vector of T\n",
"# Returns a DataFrame with name, n_samples, one column per field pair (Symbol(a,b))\n",
"# and one abs_ column per field pair holding the absolute value of the statistic.\n",
"function pairwise_stats{T}(fun, names, data::Vector{T})\n",
"    field_pairs = pairwise_fields(T)\n",
"    raw_cols = [Symbol(a, b) for (a, b) in field_pairs]\n",
"    abs_cols = [Symbol(:abs_, a, b) for (a, b) in field_pairs]\n",
"    col_types = [String; Int; fill(Float64, 2*length(field_pairs))]\n",
"    col_names = [:name; :n_samples; raw_cols; abs_cols]\n",
"    table = DataFrame(col_types, col_names, 0)\n",
"\n",
"    for (label, idxs) in labelmap(names)\n",
"        members = @view data[idxs]\n",
"        record = Dict{Symbol, Any}(:name=>label, :n_samples=>length(idxs))\n",
"        for k in 1:length(field_pairs)\n",
"            a, b = field_pairs[k]\n",
"            # Scalar(...) keeps the field name from being broadcast over.\n",
"            stat = fun(getfield.(members, Scalar(a)), getfield.(members, Scalar(b)))\n",
"            record[raw_cols[k]] = stat\n",
"            record[abs_cols[k]] = abs(stat)\n",
"        end\n",
"        push!(table, record)\n",
"    end\n",
"    table\n",
"end\n" ] }, { "cell_type": "code", "execution_count": 263, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Method definition Q1(Any) in module Main at In[262]:3 overwritten at In[263]:2.\n" ] }, { "data": { "text/plain": [ "Q3 (generic function with 1 method)" ] }, "execution_count": 263, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# First quartile of v.\n",
"function Q1(v)\n",
"    quantile(v, 0.25)\n",
"end\n",
"# Third quartile of v.\n",
"function Q3(v)\n",
"    quantile(v, 0.75)\n",
"end" ] }, { "cell_type": "code", "execution_count": 285, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr",
"output_type": "stream", "text": [ "WARNING: Method definition summarise(Any, Any, Any) in module Main at In[280]:2 overwritten at In[285]:2.\n" ] }, { "data": { "text/html": [ "
name_statf12f13f23
1RGB_Q30.60304681731617350.44719269806454920.5655902318754856
2HSV_Q30.186133418362447840.186659440736361340.1627735443944819
3HSI_Q30.24460867882115840.239117831051627760.6301694579237653
4HSL_Q30.165517311460995020.21470748631732640.3112894699855042
5xyY_Q30.72302603767980170.50241203396883620.41654070660522274
6XYZ_Q30.97260113347664650.81665027538282050.7843590386564617
7xyY_Q30.72302603767980170.50241203396883620.41654070660522274
8Lab_Q30.57301805309261710.45974988895342880.6390289755602638
9Luv_Q30.55979354387192730.6111774751288590.4379400128184171
10LCHab_Q30.52583446516030790.4110302351623740.3687582570400751
11LCHuv_Q30.61235119571869910.40719403394644070.34158057511350925
12DIN99_Q30.54494743351886210.4930922692126550.5235018403496664
13DIN99d_Q30.54423076923076920.44260730098802840.4802693716414654
14DIN99o_Q30.56081964889545830.408223817910852160.5211370440261909
15LMS_Q30.96803671042317420.74575342465753420.7789878283151825
16YIQ_Q30.40883095937080920.497525057167740.4064337083345767
17YCbCr_Q30.40046051834282650.439294990723562150.3377021554829844
" ], "text/plain": [ "17×4 DataFrames.DataFrame\n", "│ Row │ name_stat │ f12 │ f13 │ f23 │\n", "├─────┼───────────┼──────────┼──────────┼──────────┤\n", "│ 1 │ RGB_Q3 │ 0.603047 │ 0.447193 │ 0.56559 │\n", "│ 2 │ HSV_Q3 │ 0.186133 │ 0.186659 │ 0.162774 │\n", "│ 3 │ HSI_Q3 │ 0.244609 │ 0.239118 │ 0.630169 │\n", "│ 4 │ HSL_Q3 │ 0.165517 │ 0.214707 │ 0.311289 │\n", "│ 5 │ xyY_Q3 │ 0.723026 │ 0.502412 │ 0.416541 │\n", "│ 6 │ XYZ_Q3 │ 0.972601 │ 0.81665 │ 0.784359 │\n", "│ 7 │ xyY_Q3 │ 0.723026 │ 0.502412 │ 0.416541 │\n", "│ 8 │ Lab_Q3 │ 0.573018 │ 0.45975 │ 0.639029 │\n", "│ 9 │ Luv_Q3 │ 0.559794 │ 0.611177 │ 0.43794 │\n", "│ 10 │ LCHab_Q3 │ 0.525834 │ 0.41103 │ 0.368758 │\n", "│ 11 │ LCHuv_Q3 │ 0.612351 │ 0.407194 │ 0.341581 │\n", "│ 12 │ DIN99_Q3 │ 0.544947 │ 0.493092 │ 0.523502 │\n", "│ 13 │ DIN99d_Q3 │ 0.544231 │ 0.442607 │ 0.480269 │\n", "│ 14 │ DIN99o_Q3 │ 0.56082 │ 0.408224 │ 0.521137 │\n", "│ 15 │ LMS_Q3 │ 0.968037 │ 0.745753 │ 0.778988 │\n", "│ 16 │ YIQ_Q3 │ 0.408831 │ 0.497525 │ 0.406434 │\n", "│ 17 │ YCbCr_Q3 │ 0.400461 │ 0.439295 │ 0.337702 │" ] }, "execution_count": 285, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Summarise, per colour space, the per-name Spearman correlations between channel pairs.\n",
"#   funs   - iterable of one-argument reducers (e.g. Q3); each one is applied to a whole column of per-name statistics\n",
"#   names  - labels, forwarded to pairwise_stats for grouping\n",
"#   colors - vector of colours; converted with convert(Vector{space}, colors) for every space\n",
"# Returns a DataFrame with one row per (space, fun): name_stat is Symbol(space name, :_, fun) and\n",
"# f12/f13/f23 hold fun applied to the last three columns of the pairwise_stats table.\n",
"function summarise(funs, names, colors)\n",
"    dt = DataFrame(name_stat=Symbol[], f12=Float64[], f13=Float64[], f23=Float64[])\n",
"    # Every colour space is listed exactly once, so each space is converted and correlated\n",
"    # a single time and contributes exactly one row per reducer.\n",
"    spaces = [RGB, HSV, HSI, HSL, xyY, XYZ, Lab, Luv, LCHab, LCHuv, DIN99, DIN99d, DIN99o, LMS, YIQ, YCbCr]\n",
"    for space in spaces\n",
"        space_name = space.name.name\n",
"\n",
"        colors_space = convert(Vector{space}, colors)\n",
"        stats = pairwise_stats(corspearman, names, colors_space)\n",
"        for fun in funs\n",
"            row = Any[Symbol(space_name, :_, fun)]\n",
"            # Third-last, second-last and last column of `stats`, in that order.\n",
"            # NOTE(review): these are the abs_ columns only when the colour type has three fields - confirm for any space added here.\n",
"            for col_ii in [2,1,0]\n",
"                column = stats[:, end-col_ii]\n",
"                push!(row, fun(column))\n",
"            end\n",
"            push!(dt, row)\n",
"        end\n",
"    end\n",
"    dt\n",
"end\n",
"\n",
"summarise([Q3], train_text, train_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": {
"display_name": "Julia 0.5.1", "language": "julia", "name": "julia-0.5" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "0.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }