{ "metadata": { "name": "", "signature": "sha256:e7c691f4d5704391b97142837168537910b02e871e7635fd5001d746984dbd41" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "iScatter Configurator" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Routines for generating draft *iScatter* charts from a *pandas* dataframe.\n", "\n", "The *[iScatter](http://michel.wermelinger.ws/chezmichel/software/iscatter/)* library is a d3.js-powered Javascript library for generating interactive scatterplots that can be used to support simple visual exploratory statistical analysis of a small, grouped dataset.\n", "\n", "*iScatter* parses datasets described according to Stevens' NOIR model. Scale definitions are then used as a basis for supporting different statistical operations. The statistics supported are as follows (items higher up the stack inherit statistics from lower down the stack).\n", "\n", "````\n", " case \"ratio\":\n", " case \"interval\":\n", " i = i.concat([\"mean\", \"range\", \"midrange\", \"stddev\"]);\n", " case \"ordinal\":\n", " i = i.concat([\"min\", \"lq\", \"median\", \"uq\", \"max\"]);\n", " case \"nominal\":\n", " i = i.concat([\"count\", \"mode\"])\n", "````\n", "\n", "To get a feel for the operations supported by *iScatter* charts, the chart embeds a tour, accessed via the *?* button on the chart as in the example found on the [*iScatter* homepage](http://michel.wermelinger.ws/chezmichel/software/iscatter/).\n", "\n", "This notebook provides a routine for automatically drafting the configuration and data files (the data schema, a chart configuration file, a CSV data file) and publishing a quick draft *iScatter* chart for a dataset provided as a *pandas* dataframe.\n", "\n", "*iScatter is copyright \u00a9 2013\u20132014 by The Open University, UK. 
It was implemented by Michel Wermelinger with contributions from Sam Leicester and Callum Lester, and is written in Javascript, using d3.js, jQuery, Guiders-JS and Glyphicons.*" ] },
{ "cell_type": "code", "collapsed": false, "input": [
"#This is a simple HTML webpage template for publishing the chart\n",
"#There is a single piece of customisation required - specifying the name of the configuration file\n",
"#The configuration file name is currently constrained to be of the form: SLUG_config.js\n",
"#The configuration file name is currently constrained to be located in the data/ folder\n",
"#NOTE(review): the template text below holds only whitespace and has no {} replacement field,\n",
"#so iscatterTemplate.format(STUB) cannot insert the configuration file name - the HTML markup\n",
"#appears to be missing and needs restoring before the generated page can show a chart.\n",
"iscatterTemplate='''\n",
"\n", "\n", "\n", " \n", " \n", "\n", " \n", " \n", " \n",
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n",
" \n", "\n", "\n", "\n", "\n", "\n",
"'''\n",
"#iscatterTemplate.format(SLUG)"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [
"#Minimal function to produce a dummy schema file based on the contents of a pandas dataframe\n",
"\n",
"#Inspired by http://stackoverflow.com/a/25039627/454773\n",
"#If we have a hierarchical column definition, this could be brought in to the description?\n",
"#TO DO: improve the logic\n",
"#TO DO: either here or in main function, support a way of overriding estimated values eg of scale type.\n",
"import pandas as pd\n",
"from pandas.api.types import is_bool_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype\n",
"\n",
"def iscatter_dataProcess(dfi):\n",
"    ''' Draft an iScatter schema from the column dtypes of a pandas dataframe.\n",
"\n",
"    Every column gets one schema row whose id, name and description are the\n",
"    column label and whose unit is empty. Numeric (non-boolean) columns are\n",
"    typed number/ratio. Object or string columns are typed string/nominal and\n",
"    are also collected as automatically detected nominals. Any other dtype\n",
"    keeps the string/nominal default without being collected.\n",
"\n",
"    Parameters:\n",
"        dfi: the pandas dataframe to describe.\n",
"\n",
"    Returns:\n",
"        (schema, auto): schema is a dataframe with the columns id, name,\n",
"        description, unit, type and level, one row per column of dfi; auto is\n",
"        a dict with the keys C_AUTO_NOMINALS (labels of the object/string\n",
"        columns) and C_AUTO_INTS (currently always an empty list).\n",
"    '''\n",
"    schemaCols=['id','name','description','unit','type','level']\n",
"    C_AUTO_NOMINALS=[]\n",
"    C_AUTO_INTS=[] #Placeholder: nothing is detected as an integer variable yet\n",
"\n",
"    rows=[]\n",
"    for col,dtype in dfi.dtypes.to_dict().items():\n",
"        row={'id':col, #col.replace(' ','').lower()\n",
"             'name':col,#.title(),\n",
"             'description':col,#.title(),\n",
"             'unit':'',\n",
"             'type':'string',\n",
"             'level':'nominal'}\n",
"        #Booleans are deliberately left on the string/nominal default rather than treated as numbers\n",
"        if is_numeric_dtype(dtype) and not is_bool_dtype(dtype):\n",
"            row['type']='number'#No unit;\n",
"            row['level']='ratio' #use ratio as the default for a numeric?\n",
"        #Strings are normally held in object (or pandas string) columns\n",
"        elif is_object_dtype(dtype) or is_string_dtype(dtype):\n",
"            C_AUTO_NOMINALS.append(col)\n",
"        rows.append(row)\n",
"\n",
"    #Build the frame in one go rather than concatenating a one-row frame per column\n",
"    schema=pd.DataFrame(rows,columns=schemaCols)\n",
"    return schema,{'C_AUTO_NOMINALS':C_AUTO_NOMINALS,'C_AUTO_INTS':C_AUTO_INTS}"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
"#This function generates a draft iScatter chart from a pandas dataframe\n",
"#The function estimates settings for the schema file from the dataframe\n",
"#TO DO: generate config and schema files from an extended datapackage.json file that includes schema information\n",
"#TO DO: generate a draft extended datapackage.json file from a pandas dataframe\n",
"def iScatterGen(tmp,STUB,name,desc,xinit,yinit,ainit='',colour='',xattr=[],yattr=[],xscale='',yscale='',yrange=''):\n",
"    ''' Generate a draft iScatter chart from a pandas dataframe.\n",
"\n",
"    Writes four files relative to the current working directory:\n",
"    data/STUB.txt (the data as CSV), data/STUB_schema.txt (the schema drafted\n",
"    by iscatter_dataProcess), data/STUB_config.js (the chart settings written\n",
"    as a CONFIG=... JSON assignment) and STUB.html (iscatterTemplate formatted\n",
"    with STUB). The configuration is also printed.\n",
"\n",
"    Parameters:\n",
"        tmp: pandas dataframe holding the data to chart.\n",
"        STUB: base name used for all of the generated files.\n",
"        name, desc: values stored as C_NAME and C_DESC.\n",
"        xinit, yinit, ainit: values stored as C_XINIT, C_YINIT and C_AINIT.\n",
"        colour: value stored as C_COLOURGROUP.\n",
"        xattr, yattr: values stored as C_XATTR and C_YATTR.\n",
"        xscale, yscale: values stored as C_XSCALE and C_YSCALE.\n",
"        yrange: value stored as C_YRANGE.\n",
"\n",
"    Returns:\n",
"        None.\n",
"    '''\n",
"    import json\n",
"    import os\n",
"\n",
"    PATH='./'\n",
"    DATA_DIR=PATH+'data/'\n",
"    C_DATA_URL=DATA_DIR+STUB+'.txt'\n",
"    C_SCHEMA_URL=DATA_DIR+STUB+'_schema.txt'\n",
"\n",
"    #Create the output folder if it is missing; the writes below do not create it\n",
"    os.makedirs(DATA_DIR,exist_ok=True)\n",
"\n",
"    #Save the data file\n",
"    tmp.to_csv(C_DATA_URL, index=False)\n",
"\n",
"    #Read the saved file back and draft the schema from what was read\n",
"    dfp=pd.read_csv(C_DATA_URL)\n",
"    #dfp.rename(columns=lambda x: x.replace(' ','').lower(), inplace=True)\n",
"    sc,ac=iscatter_dataProcess(dfp)\n",
"\n",
"    #Save the schema file\n",
"    sc.to_csv(C_SCHEMA_URL,index=False)\n",
"\n",
"    #Which axes get the standard statistics: only the y axis at present\n",
"    X_STATS=''\n",
"    Y_STATS='C_STANDARD_STATS'\n",
"\n",
"    #-- PLAN IS NOT TO NEED ANYTHING BELOW HERE\n",
"    C_STANDARD_STATS= ['median', 'mean', 'stddev', 'range']\n",
"\n",
"    config={\n",
"        'C_SCHEMA':C_SCHEMA_URL,\n",
"        'C_DATA':C_DATA_URL,\n",
"        'C_NAME':name,\n",
"        'C_DESC':desc,\n",
"\n",
"        #Optional ID for chart block element\n",
"        'C_CHARTID':'', #default is 'iScatterChart'\n",
"\n",
"        #To what extent can we automatically estimate these, or derive sensible defaults from schema?\n",
"        #eg C_XATTR= list(set(dfp.columns.tolist()) - set(C_YATTR))\n",
"        'C_XATTR':xattr, #the caller's xattr is now honoured, in the same way as yattr\n",
"        'C_YATTR':yattr,\n",
"\n",
"        'C_XINIT':xinit,\n",
"        'C_YINIT':yinit,\n",
"        'C_AINIT':ainit,\n",
"\n",
"        'C_XSCALE':xscale,\n",
"        'C_YSCALE':yscale,\n",
"\n",
"        'C_YRANGE':yrange,\n",
"\n",
"        #Can we automagically guess these or derive sensible defaults from schema?\n",
"        #example: C_STANDARD_STATS ['mode', 'mean', 'stddev', 'range']\n",
"        'C_STATS': {\n",
"            xinit: C_STANDARD_STATS if (X_STATS=='C_STANDARD_STATS') else [] ,\n",
"            yinit: C_STANDARD_STATS if (Y_STATS=='C_STANDARD_STATS') else []\n",
"        },\n",
"\n",
"        'C_REFERENCES' : {},\n",
"\n",
"        #Can we automagically guess these or derive sensible defaults from schema? For example, from nominals\n",
"        #C_AUTO_NOMINALS\n",
"        'C_SUBSETS':ac['C_AUTO_NOMINALS'],\n",
"\n",
"        'C_COLOURGROUP':colour,\n",
"\n",
"        #Can we automagically guess these from schema?\n",
"        #C_AUTO_INTS\n",
"        'C_INTVARS':ac['C_AUTO_INTS']\n",
"    }\n",
"    #Serialise once and reuse the text for both the file and the printed copy\n",
"    configJson=json.dumps(config, indent=4,sort_keys=True)\n",
"\n",
"    #Save the config file\n",
"    with open(DATA_DIR+STUB+'_config.js', 'w') as f:\n",
"        f.write('CONFIG='+configJson)\n",
"\n",
"    #Save the html wrapper page\n",
"    with open(PATH+STUB+'.html', 'w') as f:\n",
"        f.write(iscatterTemplate.format(STUB))\n",
"\n",
"    print('CONFIG=',configJson)"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Example" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Simple example..." 
] },
{ "cell_type": "code", "collapsed": false, "input": [
"#Generate some dummy data\n",
"#numpy is imported here because no earlier cell brings the name into scope\n",
"import numpy\n",
"\n",
"incd=pd.DataFrame(\n",
"    numpy.random.randint(low=5000, high=10000, size=5).tolist() +\n",
"    numpy.random.randint(low=10000, high=20000, size=4).tolist() +\n",
"    numpy.random.randint(low=20000, high=40000, size=3).tolist() +\n",
"    numpy.random.randint(low=40000, high=80000, size=2).tolist() +\n",
"    numpy.random.randint(low=80000, high=160000, size=1).tolist(),columns=['income'])\n",
"#Sort by income, then expose the resulting rank as an 'index' column for the x axis\n",
"#(sort_values replaces DataFrame.sort, which is no longer available in pandas)\n",
"incd=incd.sort_values('income').reset_index(drop=True).reset_index()\n",
"\n",
"\n",
"name='Dummy pop'\n",
"desc='Dummy data'\n",
"xinit='index'\n",
"yinit='income'\n",
"ainit=''\n",
"xattr=[]#['income']\n",
"yattr=[]#['index']\n",
"colour=''\n",
"STUB='demo'\n",
"\n",
"#Note: I extended the minified version of the iScatter js to accept axis range setting\n",
"iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yrange=[0,175000])"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [
"CONFIG= {\n",
" \"C_AINIT\": \"\",\n",
" \"C_CHARTID\": \"\",\n",
" \"C_COLOURGROUP\": \"\",\n",
" \"C_DATA\": \"./data/demo.txt\",\n",
" \"C_DESC\": \"Dummy data\",\n",
" \"C_INTVARS\": [],\n",
" \"C_NAME\": \"Dummy pop\",\n",
" \"C_REFERENCES\": {},\n",
" \"C_SCHEMA\": \"./data/demo_schema.txt\",\n",
" \"C_STATS\": {\n",
" \"income\": [\n",
" \"median\",\n",
" \"mean\",\n",
" \"stddev\",\n",
" \"range\"\n",
" ],\n",
" \"index\": []\n",
" },\n",
" \"C_SUBSETS\": [],\n",
" \"C_XATTR\": [],\n",
" \"C_XINIT\": \"index\",\n",
" \"C_XSCALE\": \"\",\n",
" \"C_YATTR\": [],\n",
" \"C_YINIT\": \"income\",\n",
" \"C_YRANGE\": [\n",
" 0,\n",
" 175000\n",
" ],\n",
" \"C_YSCALE\": \"log\"\n",
"}\n"
] } ], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [
"#Second example: incomes spanning several orders of magnitude, charted on a log y axis\n",
"#Reuses name, desc, xinit, yinit, ainit, xattr and yattr from the previous cell\n",
"incd=pd.DataFrame(\n",
"    numpy.random.randint(low=5000, high=10000, size=5).tolist() +\n",
"    numpy.random.randint(low=100000, high=500000, size=5).tolist() +\n",
"    numpy.random.randint(low=1000000, high=5000000, size=5).tolist() +\n",
"    numpy.random.randint(low=10000000, high=50000000, size=5).tolist(),columns=['income'])\n",
"incd=incd.sort_values('income').reset_index(drop=True).reset_index()\n",
"\n",
"yscale='log'\n",
"STUB='demo2'\n",
"iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yscale=yscale)"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [
"CONFIG= {\n",
" \"C_AINIT\": \"\",\n",
" \"C_CHARTID\": \"\",\n",
" \"C_COLOURGROUP\": \"\",\n",
" \"C_DATA\": \"./data/demo2.txt\",\n",
" \"C_DESC\": \"Dummy data\",\n",
" \"C_INTVARS\": [],\n",
" \"C_NAME\": \"Dummy pop\",\n",
" \"C_REFERENCES\": {},\n",
" \"C_SCHEMA\": \"./data/demo2_schema.txt\",\n",
" \"C_STATS\": {\n",
" \"income\": [\n",
" \"median\",\n",
" \"mean\",\n",
" \"stddev\",\n",
" \"range\"\n",
" ],\n",
" \"index\": []\n",
" },\n",
" \"C_SUBSETS\": [],\n",
" \"C_XATTR\": [],\n",
" \"C_XINIT\": \"index\",\n",
" \"C_XSCALE\": \"\",\n",
" \"C_YATTR\": [],\n",
" \"C_YINIT\": \"income\",\n",
" \"C_YRANGE\": \"\",\n",
" \"C_YSCALE\": \"log\"\n",
"}\n"
] } ], "prompt_number": 18 },
{ "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }