{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Using server-side cursors with PostgreSQL and MySQL " ] }, { "cell_type": "code", "collapsed": false, "input": [ "# see http://pynash.org/2013/03/06/timing-and-profiling.html for setup of profiling magics" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import sys\n", "sys.path.insert(0, '../src')\n", "import petl; print petl.VERSION\n", "from petl.fluent import etl\n", "import psycopg2\n", "import MySQLdb" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.18-SNAPSHOT\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "tbl_dummy_data = etl().dummytable(100000)\n", "tbl_dummy_data.look()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "+-------+-----------+----------------------+\n", "| 'foo' | 'bar' | 'baz' |\n", "+=======+===========+======================+\n", "| 31 | 'pears' | 0.12509339133627373 |\n", "+-------+-----------+----------------------+\n", "| 90 | 'oranges' | 0.05715662664829624 |\n", "+-------+-----------+----------------------+\n", "| 12 | 'oranges' | 0.8525934855236975 |\n", "+-------+-----------+----------------------+\n", "| 68 | 'apples' | 0.911131148945329 |\n", "+-------+-----------+----------------------+\n", "| 77 | 'apples' | 0.8115001426786242 |\n", "+-------+-----------+----------------------+\n", "| 94 | 'apples' | 0.6671472950408706 |\n", "+-------+-----------+----------------------+\n", "| 55 | 'apples' | 0.003432210002982883 |\n", "+-------+-----------+----------------------+\n", "| 50 | 'apples' | 0.7744929413714756 |\n", "+-------+-----------+----------------------+\n", "| 82 | 'oranges' | 0.46001316056152297 |\n", "+-------+-----------+----------------------+\n", "| 13 | 'bananas' | 0.9602502583307483 |\n", "+-------+-----------+----------------------+\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "%memit print tbl_dummy_data.nrows()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "100000\n", "peak memory: 51.38 MiB, increment: 0.20 MiB\n" ] } ], "prompt_number": 4 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "PostgreSQL" ] }, { "cell_type": "code", "collapsed": false, "input": [ "psql_connection = psycopg2.connect(host='localhost', dbname='petl', user='petl', password='petl')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "cursor = psql_connection.cursor()\n", "cursor.execute('DROP TABLE IF EXISTS issue_219;')\n", "cursor.execute('CREATE TABLE issue_219 (foo INTEGER, bar TEXT, baz FLOAT);')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%memit -r1 tbl_dummy_data.progress(10000).todb(psql_connection, 'issue_219')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "10000 rows in 3.61s (2770 row/s); batch in 3.61s (2770 row/s)\n", "20000 rows in 7.21s (2774 row/s); batch in 3.60s (2778 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "30000 rows in 10.96s (2736 row/s); batch in 3.75s (2663 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "40000 rows in 14.69s (2723 row/s); batch in 3.72s (2685 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "50000 rows in 18.32s (2728 row/s); batch in 3.64s (2748 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "60000 rows in 22.04s (2722 row/s); batch in 3.72s (2689 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "70000 rows in 25.76s (2717 row/s); batch in 3.72s (2687 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "80000 rows in 29.58s (2704 row/s); batch in 3.82s (2617 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "90000 rows in 33.41s (2693 row/s); batch in 3.83s (2613 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "100000 rows in 37.14s (2692 row/s); batch in 3.73s (2680 row/s)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "peak memory: 53.32 MiB, increment: 0.01 MiB\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "100000 rows in 37.14s (2692 row/s)\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# memory usage using default cursor\n", "%memit print etl.fromdb(psql_connection, 'select * from issue_219 order by foo').look(2)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "+-------+-----------+-------------------+\n", "| 'foo' | 'bar' | 'baz' |\n", "+=======+===========+===================+\n", "| 0 | 'pears' | 0.625346298507174 |\n", "+-------+-----------+-------------------+\n", "| 0 | 'bananas' | 0.191535466509102 |\n", "+-------+-----------+-------------------+\n", "\n", "peak memory: 60.89 MiB, increment: 7.32 MiB\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# memory usage using server-side cursor\n", "%memit print etl.fromdb(lambda: psql_connection.cursor(name='server-side'), 'select * from issue_219 order by foo').look(2)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "+-------+-----------+-------------------+\n", "| 'foo' | 'bar' | 'baz' |\n", "+=======+===========+===================+\n", "| 0 | 'pears' | 0.625346298507174 |\n", "+-------+-----------+-------------------+\n", "| 0 | 'bananas' | 0.191535466509102 |\n", "+-------+-----------+-------------------+\n", "\n", "peak memory: 54.38 MiB, increment: 0.00 MiB\n" ] } ], "prompt_number": 9 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "MySQL" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mysql_connection = MySQLdb.connect(host='127.0.0.1', db='petl', user='petl', passwd='petl')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "cursor = mysql_connection.cursor()\n", "cursor.execute('SET SQL_MODE=ANSI_QUOTES')\n", "cursor.execute('DROP TABLE IF EXISTS issue_219;')\n", "cursor.execute('CREATE TABLE issue_219 (foo INTEGER, bar TEXT, baz FLOAT);')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "0L" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "%memit -r1 tbl_dummy_data.progress(10000).todb(mysql_connection, 'issue_219')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "10000 rows in 2.96s (3373 row/s); batch in 2.96s (3373 row/s)\n", "20000 rows in 6.22s (3214 row/s); batch in 3.26s (3069 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "30000 rows in 9.45s (3174 row/s); batch in 3.23s (3097 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "40000 rows in 12.50s (3199 row/s); batch in 3.05s (3278 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "50000 rows in 15.60s (3205 row/s); batch in 3.10s (3229 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "60000 rows in 18.65s (3217 row/s); batch in 3.05s (3280 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "70000 rows in 21.90s (3196 row/s); batch in 3.25s (3072 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "80000 rows in 24.88s (3215 row/s); batch in 2.98s (3353 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "90000 rows in 28.08s (3205 row/s); batch in 3.19s (3130 row/s)" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "100000 rows in 31.24s (3200 row/s); batch in 3.17s (3158 row/s)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "peak memory: 54.77 MiB, increment: 0.01 MiB\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "\n", "100000 rows in 31.24s (3200 row/s)\n" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# memory usage with default cursor\n", "%memit print etl.fromdb(mysql_connection, 'select * from issue_219 order by foo').look(2)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "+-------+-----------+-----------+\n", "| 'foo' | 'bar' | 'baz' |\n", "+=======+===========+===========+\n", "| 0L | 'bananas' | 0.191535 |\n", "+-------+-----------+-----------+\n", "| 0L | 'bananas' | 0.0228774 |\n", "+-------+-----------+-----------+\n", "\n", "peak memory: 79.88 MiB, increment: 25.11 MiB\n" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "# memory usage with server-side cursor\n", "%memit print etl.fromdb(lambda: mysql_connection.cursor(MySQLdb.cursors.SSCursor), 'select * from issue_219 order by foo').look(2)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "+-------+-----------+-----------+\n", "| 'foo' | 'bar' | 'baz' |\n", "+=======+===========+===========+\n", "| 0L | 'bananas' | 0.191535 |\n", "+-------+-----------+-----------+\n", "| 0L | 'bananas' | 0.0228774 |\n", "+-------+-----------+-----------+\n", "\n", "peak memory: 80.00 MiB, increment: 0.09 MiB\n" ] } ], "prompt_number": 15 } ], "metadata": {} } ] }