{ "metadata": { "name": "", "signature": "sha256:feac633e1109801a7af35fcee14e424dd0ed0b5221807af52f96e0176106e770" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function, division\n", "\n", "import pandas as pd\n", "import thinkstats2\n", "import thinkplot\n", "\n", "import math\n", "import gzip\n", "\n", "%matplotlib inline\n", "formats = ['png', 'pdf']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "def ReadFile(filename='soc-Slashdot0902.txt.gz', n=None):\n", " \"\"\"Reads a compressed data file.\n", "\n", " Args:\n", " filename: string name of the file to read\n", " \"\"\"\n", " if filename.endswith('gz'):\n", " fp = gzip.open(filename)\n", " else:\n", " fp = open(filename)\n", "\n", " srcs = {}\n", " for i, line in enumerate(fp):\n", " if i == n:\n", " break\n", "\n", " if line.startswith('#'):\n", " continue\n", "\n", " src, dest = line.split()\n", " srcs.setdefault(src, []).append(dest)\n", "\n", " fp.close()\n", "\n", " return srcs\n", "\n", "srcs = ReadFile()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "def Summarize(srcs):\n", " \"\"\"Computes the number of edges for each source.\"\"\"\n", " lens = [len(t) for t in srcs.itervalues()]\n", " mu, sigma2 = thinkstats2.MeanVar(lens)\n", " print(mu, math.sqrt(sigma2))\n", " return lens\n", "\n", "lens = Summarize(srcs)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "12.0914317767 37.6969124125\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "pmf = thinkstats2.Pmf(lens, 'actual')\n", "cdf = pmf.MakeCdf()\n", "thinkplot.PrePlot(2)\n", "thinkplot.Plot(cdf.xs, cdf.ps, label=cdf.label)\n", "thinkplot.Config(xlabel='number of friends/foes', ylabel='CDF', xscale='log', \n", " xlim=[1, 3000],\n", " loc='lower right')\n", "thinkplot.Save('social1', formats=formats)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Writing social1.png\n", "Writing" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " social1.pdf\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "def BiasPmf(pmf, label, invert=False):\n", " \"\"\"Returns the Pmf with oversampling proportional to value.\n", "\n", " If pmf is the distribution of true values, the result is the\n", " distribution that would be seen if values are oversampled in\n", " proportion to their values; for example, if you ask students\n", " how big their classes are, large classes are oversampled in\n", " proportion to their size.\n", "\n", " If invert=True, computes in inverse operation; for example,\n", " unbiasing a sample collected from students.\n", "\n", " Args:\n", " pmf: Pmf object.\n", " invert: boolean\n", "\n", " Returns:\n", " Pmf object\n", " \"\"\"\n", " new_pmf = pmf.Copy()\n", " new_pmf.label = label\n", "\n", " for x, p in pmf.Items():\n", " if invert:\n", " new_pmf.Mult(x, 1.0/x)\n", " else:\n", " new_pmf.Mult(x, x)\n", " \n", " new_pmf.Normalize()\n", " return new_pmf\n", "\n", "\n", "biased_pmf = BiasPmf(pmf, 'biased')\n", "cdf2 = biased_pmf.MakeCdf()\n", "\n", "pmf.Mean(), biased_pmf.Mean()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "(12.091431776749411, 129.61739823546282)" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "thinkplot.PrePlot(2)\n", "thinkplot.Plot(cdf.xs, cdf.ps, label=cdf.label)\n", "thinkplot.Plot(cdf2.xs, cdf2.ps, label=cdf2.label)\n", "thinkplot.Config(xlabel='number of friends/foes', ylabel='CDF', xscale='log', \n", " xlim=[1, 3000],\n", " loc='lower right')\n", "thinkplot.Save('social2', formats=formats)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Writing social2.png\n", "Writing" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " social2.pdf\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 } ], "metadata": {} } ] }