{ "metadata": { "name": "", "signature": "sha256:d608ac6a36e4dde40c20744a357d543a092bfdd98c6492e849be5b8246d17e7a" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Wrangling DHS Border Crossing Data\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np\n", "import os" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "cur_dir = os.path.dirname(os.path.realpath('__file__'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# Create a list of elements counting from 1995 to 2013\n", "years = list(range(1995, 2014, 1))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.DataFrame()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "for year in years:\n", " data = pd.read_csv(cur_dir + '/data/cross_raw_data/bc_' + str(year) + '.csv', header=1, skipfooter=8)\n", " df = pd.concat([df, data])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "df = df.ix[:, 0:-1]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# Create two lists for the loop results to be placed\n", "city = []\n", "state = []\n", "\n", "# For each row in a varible,\n", "for row in df['Port Name']:\n", " # Try to,\n", " try:\n", " # Split the row by comma and append\n", " # everything before the comma to lat\n", " city.append(row.split(': ')[1])\n", " # Split the row by comma and append\n", " # everything after the comma to lon\n", " state.append(row.split(': ')[0])\n", " # But if you get an error\n", " except:\n", " # append a missing value to lat\n", " city.append(np.NaN)\n", " # append a missing value to lon\n", " state.append(np.NaN)\n", "\n", "# Create two new columns from lat and lon\n", "df['City'] = city\n", "df['State'] = state" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "df = df.reset_index()\n", "df = df.drop('index', axis=1)\n", "df = df.rename(columns=lambda x: x.strip())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "df.to_csv(cur_dir + '/data/cross_raw_data/bc_' + 'full_crossing_data.csv', index=False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 } ], "metadata": {} } ] }