{ "metadata": { "name": "Day_18_Common_Crawl" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [
{ "cell_type": "code", "collapsed": false, "input": [
"# AWS credentials used to read the aws-publicdatasets bucket.\n",
"# They are taken from the environment so that no key or secret is stored in the notebook:\n",
"#   export AWS_ACCESS_KEY_ID=...  AWS_SECRET_ACCESS_KEY=...   (before starting the kernel)\n",
"# If a variable is unset the value is None, and boto then falls back to its own\n",
"# credential lookup (e.g. the boto config file).\n",
"# NOTE(review): the student key pair that used to be embedded in this cell has been\n",
"# published with the notebook and should be revoked/rotated.\n",
"import os\n",
"\n",
"KEY = os.environ.get('AWS_ACCESS_KEY_ID')\n",
"SECRET = os.environ.get('AWS_SECRET_ACCESS_KEY')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [ "!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "2012-11-09 11:28 2478 s3://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt\r\n" ] } ], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
"# http://boto.s3.amazonaws.com/s3_tut.html\n",
"# Connect to S3 and print the keys directly under the Common Crawl parse-output prefix.\n",
"\n",
"import boto\n",
"from boto.s3.connection import S3Connection\n",
"\n",
"from itertools import islice\n",
"\n",
"BUCKET_NAME = 'aws-publicdatasets'\n",
"PARSE_OUTPUT_PREFIX = \"common-crawl/parse-output/\"\n",
"MAX_KEYS = None  # set to an int to cap the listing; None lists every key\n",
"\n",
"conn = S3Connection(KEY,SECRET)\n",
"bucket = conn.get_bucket(BUCKET_NAME)\n",
"# delimiter=\"/\" restricts the listing to one level below the prefix\n",
"for key in islice(bucket.list(prefix=PARSE_OUTPUT_PREFIX, delimiter=\"/\"), MAX_KEYS):\n",
"    print key.name.encode('utf-8')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "common-crawl/parse-output/checkpoint_staging_$folder$\n", "common-crawl/parse-output/checkpoints_$folder$\n", "common-crawl/parse-output/segment_$folder$\n", "common-crawl/parse-output/valid_segments.txt\n", "common-crawl/parse-output/valid_segments2_$folder$\n", "common-crawl/parse-output/valid_segments_$folder$\n", "common-crawl/parse-output/checkpoint_staging/\n", "common-crawl/parse-output/checkpoints/\n", "common-crawl/parse-output/segment/\n", "common-crawl/parse-output/valid_segments2/\n" ] } ], "prompt_number": 5 },
{ "cell_type": "code", "collapsed": false, "input": [
"# get valid_segments\n",
"# Downloads valid_segments.txt into the string `s`, reusing the `bucket` opened in the\n",
"# listing cell instead of importing boto and connecting a second time.\n",
"VALID_SEGMENTS_KEY = \"common-crawl/parse-output/valid_segments.txt\"\n",
"\n",
"k = bucket.get_key(VALID_SEGMENTS_KEY)\n",
"if k is None:\n",
"    # get_key returns None for a missing key; fail with a clear message here\n",
"    raise IOError(\"%s not found in bucket\" % VALID_SEGMENTS_KEY)\n",
"s = k.get_contents_as_string()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [
"# the count includes the trailing '' left by the file's final newline\n",
"len(s.split(\"\\n\"))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 8, "text": [ "178" ] } ], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [ "s.split(\"\\n\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 7, "text": [
"['1346823845675',\n", " '1346823846036',\n", " '1346823846039',\n", " '1346823846110',\n", " '1346823846125',\n",
" '1346823846150',\n", " '1346823846176',\n", " '1346876860445',\n", " '1346876860454',\n", " '1346876860467',\n",
" '1346876860493',\n", " '1346876860565',\n", " '1346876860567',\n", " '1346876860596',\n", " '1346876860609',\n",
" '1346876860611',\n", " '1346876860614',\n", " '1346876860648',\n", " '1346876860765',\n", " '1346876860767',\n",
" '1346876860774',\n", " '1346876860777',\n", " '1346876860779',\n", " '1346876860782',\n", " '1346876860786',\n",
" '1346876860789',\n", " '1346876860791',\n", " '1346876860795',\n", " '1346876860798',\n", " '1346876860804',\n",
" '1346876860807',\n", " '1346876860817',\n", " '1346876860819',\n", " '1346876860828',\n", " '1346876860835',\n",
" '1346876860838',\n", " '1346876860840',\n", " '1346876860843',\n", " '1346876860877',\n", " '1346981172137',\n",
" '1346981172142',\n", " '1346981172155',\n", " '1346981172184',\n", " '1346981172186',\n", " '1346981172229',\n",
" '1346981172231',\n", " '1346981172234',\n", " '1346981172239',\n", " '1346981172250',\n", " '1346981172253',\n",
" '1346981172255',\n", " '1346981172258',\n", " '1346981172261',\n", " '1346981172264',\n", " '1346981172266',\n",
" '1346981172268',\n", " '1350433106986',\n", " '1350433106987',\n", " '1350433106988',\n", " '1350433106989',\n", " '1350433106990',\n", " '1350433106991',\n", " '1350433106992',\n", " '1350433106993',\n", " '1350433106994',\n", " '1350433106995',\n", " '1350433106996',\n", " '1350433106997',\n", " '1350433106998',\n", " '1350433106999',\n", " '1350433107000',\n", " '1350433107001',\n", " '1350433107002',\n", " '1350433107003',\n", " '1350433107004',\n", " '1350433107005',\n", " '1350433107006',\n", " '1350433107007',\n", " '1350433107008',\n", " '1350433107009',\n", " '1350433107010',\n", " '1350433107011',\n", " '1350433107012',\n", " '1350433107013',\n", " '1350433107014',\n", " '1350433107015',\n", " '1350433107016',\n", " '1350433107017',\n", " '1350433107018',\n", " '1350433107019',\n", " '1350433107020',\n", " '1350433107021',\n", " '1350433107022',\n", " '1350433107023',\n", " '1350433107024',\n", " '1350433107025',\n", " '1350433107026',\n", " '1350433107027',\n", " '1350433107028',\n", " '1350433107029',\n", " '1350433107030',\n", " '1350433107031',\n", " '1350433107032',\n", " '1350433107033',\n", " '1350433107034',\n", " '1350433107035',\n", " '1350433107036',\n", " '1350433107037',\n", " '1350433107038',\n", " '1350433107039',\n", " '1350433107040',\n", " '1350433107041',\n", " '1350433107042',\n", " '1350433107043',\n", " '1350433107044',\n", " '1350433107045',\n", " '1350433107046',\n", " '1350433107047',\n", " '1350433107048',\n", " '1350433107049',\n", " '1350433107050',\n", " '1350433107051',\n", " '1350433107052',\n", " '1350433107053',\n", " '1350433107054',\n", " '1350433107055',\n", " '1350433107056',\n", " '1350433107057',\n", " '1350433107058',\n", " '1350433107059',\n", " '1350433107060',\n", " '1350433107061',\n", " '1350433107062',\n", " '1350433107063',\n", " '1350433107064',\n", " '1350433107065',\n", " '1350433107066',\n", " '1350433107067',\n", " '1350433107068',\n", " '1350433107069',\n", " '1350433107070',\n", " 
'1350433107071',\n", " '1350433107072',\n", " '1350433107073',\n", " '1350433107074',\n", " '1350433107075',\n", " '1350433107076',\n", " '1350433107077',\n", " '1350433107078',\n", " '1350433107079',\n", " '1350433107080',\n", " '1350433107081',\n", " '1350433107082',\n", " '1350433107083',\n", " '1350433107084',\n", " '1350433107085',\n", " '1350433107086',\n", " '1350433107087',\n", " '1350433107088',\n", " '1350433107089',\n", " '1350433107090',\n", " '1350433107091',\n", " '1350433107092',\n", " '1350433107093',\n", " '1350433107094',\n", " '1350433107095',\n", " '1350433107096',\n", " '1350433107097',\n", " '1350433107098',\n", " '1350433107099',\n", " '1350433107100',\n", " '1350433107101',\n", " '1350433107102',\n", " '1350433107103',\n", " '1350433107104',\n", " '1350433107105',\n", " '1350433107106',\n", " '']" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# what to do with a valid segment instance?\n", "# https://groups.google.com/forum/#!msg/common-crawl/QYTmnttZZyo/NPiXvK8ZeiMJ" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }