{ "metadata": { "name": "", "signature": "sha256:81aed14b5c7bdf9461390525cbd56f1b2754be3bd424493d7fcd7b81d0350bdf" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "https://groups.google.com/forum/#!topic/python-etl/XRIJovpb6Qc\n", "\n", "The conversion I'm trying to make is:\n", "- If nb_day is not integer:\n", " - if notes contains 'am':\n", " - split line:\n", " 1 with full days, modifying date_end = previous date_end - 1\n", " 1 with 0.5 day, modifying date_begin = date_end\n", " - if notes contacts 'pm':\n", " - split line:\n", " 1 with 0.5 day, modifying date_end = date_begin\n", " 1 with full days, modifying date_begin = previous date_begin + 1" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import datetime\n", "import petl.interactive as etl\n", "print etl.__version__" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.26\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "data = \"\"\"user_id,date_begin,date_end,nb_days,notes,projet_id\n", "user1,2014-07-31,2014-08-07,5.5,5 days + am,cp\n", "user2,2014-07-31,2014-08-07,5.5,5 days + pm,cp\n", "user3,2014-07-31,2014-08-06,5,5 days,cp\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "day = datetime.timedelta(days=1)\n", "\n", "\n", "def split_partial_days(row):\n", " if isinstance(row.nb_days, float):\n", " # split out partial days into separate row\n", " if 'am' in row.notes:\n", " # full days\n", " yield (row.user_id, \n", " row.date_begin, \n", " row.date_end - day,\n", " int(row.nb_days),\n", " row.notes.split('+')[0].strip(), \n", " row.projet_id)\n", " # partial days\n", " yield (row.user_id, \n", " row.date_end, \n", " row.date_end, \n", " row.nb_days - int(row.nb_days),\n", " row.notes.split('+')[1].strip(), \n", " row.projet_id)\n", " if 'pm' in row.notes:\n", " # partial days\n", " yield (row.user_id, \n", " row.date_begin, \n", " row.date_begin, \n", " row.nb_days - int(row.nb_days),\n", " row.notes.split('+')[1].strip(), \n", " row.projet_id)\n", " # full days\n", " yield (row.user_id, \n", " row.date_begin + day, \n", " row.date_end,\n", " int(row.nb_days),\n", " row.notes.split('+')[0].strip(), \n", " row.projet_id)\n", " else:\n", " # do nothing\n", " yield row\n", "\n", " \n", "tbl = (etl\n", " .fromcsv(etl.StringSource(data))\n", " .convert(('date_begin', 'date_end'), etl.dateparser('%Y-%m-%d'))\n", " .convert('nb_days', etl.parsenumber)\n", " .rowmapmany(split_partial_days, fields=['user_id', 'date_begin', 'date_end', 'nb_days', 'notes', 'projet_id'])\n", ")\n", "tbl" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
user_id | \r\n", "date_begin | \r\n", "date_end | \r\n", "nb_days | \r\n", "notes | \r\n", "projet_id | \r\n", "
---|---|---|---|---|---|
user1 | \r\n", "2014-07-31 | \r\n", "2014-08-06 | \r\n", "5 | \r\n", "5 days | \r\n", "cp | \r\n", "
user1 | \r\n", "2014-08-07 | \r\n", "2014-08-07 | \r\n", "0.5 | \r\n", "am | \r\n", "cp | \r\n", "
user2 | \r\n", "2014-07-31 | \r\n", "2014-07-31 | \r\n", "0.5 | \r\n", "pm | \r\n", "cp | \r\n", "
user2 | \r\n", "2014-08-01 | \r\n", "2014-08-07 | \r\n", "5 | \r\n", "5 days | \r\n", "cp | \r\n", "
user3 | \r\n", "2014-07-31 | \r\n", "2014-08-06 | \r\n", "5 | \r\n", "5 days | \r\n", "cp | \r\n", "