{ "metadata": { "name": "", "signature": "sha256:8b61ab0ebd65400891631366b0a489bae710b52c49321d9d33e3782b600398a8" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# String Munging In Dataframe\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### import modules" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np\n", "import re as re" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe" ] }, { "cell_type": "code", "collapsed": false, "input": [ "raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", " 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], \n", " 'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.NAN, 'battler@milner.com', 'Ames1234@yahoo.com'], \n", " 'preTestScore': [4, 24, 31, 2, 3],\n", " 'postTestScore': [25, 94, 57, 62, 70]}\n", "df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameemailpreTestScorepostTestScore
0 Jason Miller jas203@gmail.com 4 25
1 Molly Jacobson momomolly@gmail.com 24 94
2 Tina Ali NaN 31 57
3 Jake Milner battler@milner.com 2 62
4 Amy Cooze Ames1234@yahoo.com 3 70
\n", "

5 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 21, "text": [ " first_name last_name email preTestScore postTestScore\n", "0 Jason Miller jas203@gmail.com 4 25\n", "1 Molly Jacobson momomolly@gmail.com 24 94\n", "2 Tina Ali NaN 31 57\n", "3 Jake Milner battler@milner.com 2 62\n", "4 Amy Cooze Ames1234@yahoo.com 3 70\n", "\n", "[5 rows x 5 columns]" ] } ], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Which strings in the email column contains 'gmail'" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['email'].str.contains('gmail')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ "0 True\n", "1 True\n", "2 NaN\n", "3 False\n", "4 False\n", "Name: email, dtype: object" ] } ], "prompt_number": 22 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a regular expression pattern that breaks apart emails" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\\\.([A-Z]{2,4})'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 23 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find everything in df.email that contains that pattern" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['email'].str.findall(pattern, flags=re.IGNORECASE)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "0 [(jas203, gmail, com)]\n", "1 [(momomolly, gmail, com)]\n", "2 NaN\n", "3 [(battler, milner, com)]\n", "4 [(Ames1234, yahoo, com)]\n", "Name: email, dtype: object" ] } ], "prompt_number": 24 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a pandas series containing the email elements" ] }, { "cell_type": "code", "collapsed": false, "input": [ "matches = df['email'].str.match(pattern, flags=re.IGNORECASE)\n", "matches" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "0 (jas203, gmail, com)\n", "1 (momomolly, gmail, com)\n", "2 NaN\n", "3 (battler, milner, com)\n", "4 (Ames1234, yahoo, com)\n", "Name: email, dtype: object" ] } ], "prompt_number": 29 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select the domains of the df.email" ] }, { "cell_type": "code", "collapsed": false, "input": [ "matches.str[1]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 42, "text": [ "0 gmail\n", "1 gmail\n", "2 NaN\n", "3 milner\n", "4 yahoo\n", "Name: email, dtype: object" ] } ], "prompt_number": 42 } ], "metadata": {} } ] }