{ "metadata": { "name": "Karung" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "##Karung | By Andrew Conti" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Lets take a look at our data:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import csv\n", "with open(\"sample.txt\", 'rb') as f1:\n", " reader = csv.reader(f1)\n", " for line in reader:\n", " print line" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999']\n", "['0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999']\n", "['0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999']\n", "['0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999']\n", "['0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999']\n" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat sample.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999\n", "0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999\n", "0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999\n", "0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999\n", "0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999\n" ] } ], "prompt_number": 35 }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "###Let's define the 'map' part our our mapreduce function:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile max_temperature_map.py\n", "\n", "#!/usr/bin/env python\n", "class max_temp_map(Mapreduce):\n", "\t'''\n", "\tA simple doc string\n", "\t'''\n", "\timport re\n", "\timport sys\n", "\n", "\tfor line in sys.stdin:\n", "\t val = line.strip()\n", "\t (year, temp, q) = (val[15:19], val[87:92], val[92:93])\n", "\t if (temp != \"+9999\" and re.match(\"[01459]\", q)):\n", "\t print \"%s\\t%s\" % (year, temp)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Writing max_temperature_map.py\n" ] } ], "prompt_number": 31 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Let's define the reduce:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile max_temperature_reduce.py\n", "#!/usr/bin/env python\n", "class max_temp_reduce(Mapred):\n", "\t'''\n", "\n", "\t'''\n", "\timport sys\n", "\n", "\t(last_key, max_val) = (None, -sys.maxint)\n", "\tfor line in sys.stdin:\n", "\t (key, val) = line.strip().split(\"\\t\")\n", "\t if last_key and last_key != key:\n", "\t print \"%s\\t%s\" % (last_key, max_val)\n", "\t (last_key, max_val) = (key, int(val))\n", "\t else:\n", "\t (last_key, max_val) = (key, max(max_val, int(val)))\n", "\n", "\tif last_key:\n", "\t print \"%s\\t%s\" % (last_key, max_val)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Writing max_temperature_reduce.py\n" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Let's test out our functions and see the results:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simple test" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!cat sample.txt | max_temperature_map.py | \\sort | max_temperature_reduce.py" ], "language": "python", "metadata": {}, "outputs": [ { 
"output_type": "stream", "stream": "stderr", "text": [ "'\\sort' is not recognized as an internal or external command,\n", "operable program or batch file.\n" ] } ], "prompt_number": 38 }, { "cell_type": "markdown", "metadata": {}, "source": [ "hadoop test with combiner function (for effiency)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%cmd\n", "hadoop jar HADOOP_INSTALL/contrib/streaming/hadoop-*-streaming.jar \\\n", "-input sample.txt \\\n", "-output output \\\n", "-mapper \"max_temperature_map.py | sort | max_temperature_reduce.py\" \\\n", "-reducer max_temperature_reduce.py" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Results:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "Great but that process was a little time consuming\n", "\n", "Lets try this out in Pig" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Below is a pig file to find the max temp, just like we did before." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile max_temp.pig\n", "-- max_temp.pig: Finds the maximum temperature by year\n", "records = LOAD 'input/ncdc/micro-tab/sample.txt'\n", " AS (year:chararray, temperature:int, quality:int);\n", "filtered_records = FILTER records BY temperature != 9999 AND\n", " (quality == 0 OR quality == 1 OR quality == 4 OR quality == 5 OR quality == 9);\n", "grouped_records = GROUP filtered_records BY year;\n", "max_temp = FOREACH grouped_records GENERATE group,\n", " MAX(filtered_records.temperature);\n", "DUMP max_temp;\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Starts pig in local mode:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!pig -x local" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Tests out our script:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!pig max_temp.pig" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Results:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Version Control" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cd C:\\\\Users\\\\Andrew\\\\Documents\\\\Hadoop\\\\Karung/\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "C:\\Users\\Andrew\\Documents\\Hadoop\\Karung\n" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "!git add Karung.ipynb" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "!git commit" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[master a76538e] created basic structure of presenation, added pig file\n", " 1 file changed, 118 insertions(+), 46 
deletions(-)\n" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }