{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# ref : https://github.com/jadianes/spark-py-notebooks\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'\\n** RUN SPARK NOTEBOOK : \\n\\n$ source activate J_env ; cd NOTEBOOK ; IPYTHON_OPTS=\"notebook\" /Users/GGV/spark/./bin/pyspark\\n\\n\\n** RUN SPARK NOTEBOOK and processing CSV files with Spark SQL \\n\\n$ source activate J_env ; cd NOTEBOOK ; IPYTHON_OPTS=\\'notebook\\' /Users/GGV/spark/./bin/pyspark --packages com.databricks:spark-csv_2.11:1.2.0\\n\\n'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Launch Spark ipython notebook\n", "\n", "\"\"\"\n", "** RUN SPARK NOTEBOOK : \n", "\n", "$ source activate J_env ; cd NOTEBOOK ; IPYTHON_OPTS=\"notebook\" /Users/GGV/spark/./bin/pyspark\n", "\n", "\n", "** RUN SPARK NOTEBOOK and processing CSV files with Spark SQL \n", "\n", "$ source activate J_env ; cd NOTEBOOK ; IPYTHON_OPTS='notebook' /Users/GGV/spark/./bin/pyspark --packages com.databricks:spark-csv_2.11:1.2.0\n", "\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sc" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import datetime as dt \n", "import time\n", "import csv\n", "import requests\n", "import pandas as pd, numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Spark read csv, dataframe, SparkSQL " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas dataframe \n", "\n", "pdf=pd.read_csv('/Users/GGV/Desktop/test.csv')\n", "pdf.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+\n", "| A| B|\n", "+---+---+\n", "| 1| 4|\n", "| 2| 5|\n", "| 3| 6|\n", "+---+---+\n", "\n" ] } ], "source": [ "# Spark dataframe \n", "\n", "df = sqlCtx.createDataFrame([(1, 4), (2, 5), (3, 6)], [\"A\", \"B\"])\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| _1|\n", "+--------------------+\n", "|PassengerId,Pclas...|\n", "|892,3,\"Kelly, Mr....|\n", "|893,3,\"Wilkes, Mr...|\n", "|894,2,\"Myles, Mr....|\n", "|895,3,\"Wirz, Mr. 
...|\n", "|896,3,\"Hirvonen, ...|\n", "|897,3,\"Svensson, ...|\n", "|898,3,\"Connolly, ...|\n", "|899,2,\"Caldwell, ...|\n", "|900,3,\"Abrahim, M...|\n", "|901,3,\"Davies, Mr...|\n", "|902,3,\"Ilieff, Mr...|\n", "|903,1,\"Jones, Mr....|\n", "|904,1,\"Snyder, Mr...|\n", "|905,2,\"Howard, Mr...|\n", "|906,1,\"Chaffee, M...|\n", "|907,2,\"del Carlo,...|\n", "|908,2,\"Keane, Mr....|\n", "|909,3,\"Assaf, Mr....|\n", "|910,3,\"Ilmakangas...|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# Spark read .csv as dataframe \n", "\n", "df_ = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "df_ = df_.map(lambda x: (x, )).toDF()\n", "df_.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# read CSV with PySpark SQL \n", "\n", "from pyspark.sql import SQLContext\n", "sqlContext = SQLContext(sc)\n", "\n", "\n", "df_test = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('/Users/GGV/Desktop/test.csv')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pyspark.sql.dataframe.DataFrame" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(df_test)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- PassengerId: integer (nullable = true)\n", " |-- Pclass: integer (nullable = true)\n", " |-- Name: string (nullable = true)\n", " |-- Sex: string (nullable = true)\n", " |-- Age: double (nullable = true)\n", " |-- SibSp: integer (nullable = true)\n", " |-- Parch: integer (nullable = true)\n", " |-- Ticket: string (nullable = true)\n", " |-- Fare: double (nullable = true)\n", " |-- Cabin: string (nullable = true)\n", " |-- Embarked: string (nullable = true)\n", "\n" ] } ], "source": [ 
"df_test.printSchema()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['PassengerId',\n", " 'Pclass',\n", " 'Name',\n", " 'Sex',\n", " 'Age',\n", " 'SibSp',\n", " 'Parch',\n", " 'Ticket',\n", " 'Fare',\n", " 'Cabin',\n", " 'Embarked']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.columns" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "|PassengerId|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|\n", "+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "| 892| 3| Kelly, Mr. James| male|34.5| 0| 0| 330911| 7.8292| | Q|\n", "| 893| 3|Wilkes, Mrs. Jame...|female|47.0| 1| 0| 363272| 7.0| | S|\n", "| 894| 2|Myles, Mr. Thomas...| male|62.0| 0| 0| 240276| 9.6875| | Q|\n", "| 895| 3| Wirz, Mr. Albert| male|27.0| 0| 0| 315154| 8.6625| | S|\n", "| 896| 3|Hirvonen, Mrs. Al...|female|22.0| 1| 1| 3101298|12.2875| | S|\n", "| 897| 3|Svensson, Mr. Joh...| male|14.0| 0| 0| 7538| 9.225| | S|\n", "| 898| 3|Connolly, Miss. Kate|female|30.0| 0| 0| 330972| 7.6292| | Q|\n", "| 899| 2|Caldwell, Mr. Alb...| male|26.0| 1| 1| 248738| 29.0| | S|\n", "| 900| 3|Abrahim, Mrs. Jos...|female|18.0| 0| 0| 2657| 7.2292| | C|\n", "| 901| 3|Davies, Mr. John ...| male|21.0| 2| 0| A/4 48871| 24.15| | S|\n", "| 902| 3| Ilieff, Mr. Ylio| male|null| 0| 0| 349220| 7.8958| | S|\n", "| 903| 1|Jones, Mr. Charle...| male|46.0| 0| 0| 694| 26.0| | S|\n", "| 904| 1|Snyder, Mrs. John...|female|23.0| 1| 0| 21228|82.2667| B45| S|\n", "| 905| 2|Howard, Mr. Benjamin| male|63.0| 1| 0| 24065| 26.0| | S|\n", "| 906| 1|Chaffee, Mrs. Her...|female|47.0| 1| 0| W.E.P. 
5734| 61.175| E31| S|\n", "| 907| 2|del Carlo, Mrs. S...|female|24.0| 1| 0| SC/PARIS 2167|27.7208| | C|\n", "| 908| 2| Keane, Mr. Daniel| male|35.0| 0| 0| 233734| 12.35| | Q|\n", "| 909| 3| Assaf, Mr. Gerios| male|21.0| 0| 0| 2692| 7.225| | C|\n", "| 910| 3|Ilmakangas, Miss....|female|27.0| 1| 0|STON/O2. 3101270| 7.925| | S|\n", "| 911| 3|Assaf Khalil, Mrs...|female|45.0| 0| 0| 2696| 7.225| | C|\n", "+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df_test.show()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# register the PySpark DataFrame as a temporary table so it can be queried with SQL\n", "\n", "df_test.registerTempTable(\"numeric\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+\n", "|PassengerId|Pclass| Name| Sex| Age|SibSp|Parch|Ticket| Fare|Cabin|Embarked|\n", "+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+\n", "| 892| 3| Kelly, Mr. James| male|34.5| 0| 0|330911|7.8292| | Q|\n", "| 893| 3|Wilkes, Mrs. 
Jame...|female|47.0| 1| 0|363272| 7.0| | S|\n", "+-----------+------+--------------------+------+----+-----+-----+------+------+-----+--------+\n", "\n" ] } ], "source": [ "# query \n", "\n", "sqlContext.sql(\"\"\"SELECT * FROM numeric limit 2 \"\"\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Spark basic functions " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(u'1289,1,\"Frolicher-Stehli,', 1), (u'John\",male,,1,0,367227,7.75,,Q', 1), (u'1103,3,\"Finoli,', 1), (u'Gertrud', 1), (u'1244,2,\"Dibden,', 1), (u'1212,3,\"Andersson,', 1), (u'909,3,\"Assaf,', 1), (u'(Helen', 1), (u'Joseph\",male,,0,0,240261,10.7083,,Q', 1), (u'Johannes\",male,20,0,0,SOTON/O2', 1), (u'Howard)\",female,45,1,0,350026,14.1083,,S', 1), (u'33112,36.75,,S', 1), (u'James\",male,34.5,0,0,330911,7.8292,,Q', 1), (u'Nils\",male,29,0,0,347467,7.8542,,S', 1), (u'1220,2,\"Clarke,', 1), (u'William\",male,36,0,0,A/5', 1), (u'Robert\",male,23,0,0,C.A.', 1), (u'12998,25.7417,,C', 1), (u'(Charles', 1), (u'Mrs.', 1), (u'17757,227.525,C62', 1), (u'1102,3,\"Andersen,', 1), (u'Metcalfe\",male,36,0,0,242963,13,,S', 1), (u'Herbert\",male,18,0,0,347090,7.75,,S', 1), (u'Joshua', 1), (u'Thomas', 1), (u'1058,1,\"Brandeis,', 1), (u'3101263,7.85,,S', 1), (u'Einar', 1), (u'Ferdinand\",male,21,0,0,350410,7.8542,,S', 1), (u'B55,C', 1), (u'Louise\",female,1,1,2,SC/Paris', 1), (u'1290,3,\"Larsson-Rondberg,', 1), (u'908,2,\"Keane,', 1), (u'898,3,\"Connolly,', 1), (u'Marshall', 1), (u'Elizabeth\",female,,0,0,A.', 1), (u'982,3,\"Dyker,', 1), (u'Thorpe)\",female,37,1,0,19928,90,C78,Q', 1), (u'1238,2,\"Botsford,', 1), (u'1283,1,\"Lines,', 1), (u'1215,1,\"Rowe,', 1), (u'Crothers\",male,39,0,0,PC', 1), (u'(Nellie', 1), (u'31416,8.05,,S', 1), (u'Cervin\",male,14,0,0,7538,9.225,,S', 1), (u'1265,2,\"Harbeck,', 1), (u'(Pauline', 1), (u'Folke\",male,6,3,1,349909,21.075,,S', 1), 
(u'1188,2,\"Laroche,', 1), (u'1048,1,\"Bird,', 1), (u'1168,2,\"Parker,', 1), (u'1267,1,\"Bowen,', 1), (u'Salli', 1), (u'Lamson)\",female,59,2,0,11769,51.4792,C101,S', 1), (u'1171,2,\"Oxenham,', 1), (u'3236,8.05,,S', 1), (u'Maria\",female,20,0,0,347471,7.8542,,S', 1), (u'851,14.5,,S', 1), (u'Marta\",female,18,1,1,250650,13,,S', 1), (u'1037,3,\"Vander', 1), (u'Sanni\"\"\"\"\",female,22,0,0,3101295,39.6875,,S', 1), (u'Thomas\",male,60.5,0,0,3701,,,S', 1), (u'Arthur', 1), (u'Julius\",male,31,3,0,345763,18,,S', 1), (u'Frank\",male,0.83,0,1,392091,9.35,,S', 1), (u'369943,8.05,,S', 1), (u'Giuseppe\",male,30,0,0,C.A.', 1), (u'1231,3,\"Betros,', 1), (u'976,2,\"Lamb,', 1), (u'927,3,\"Katavelas,', 1), (u'1169,2,\"Faunthorpe,', 1), (u'Samuel\",male,28,0,0,363611,8.05,,S', 1), (u'Pedro\",male,17,0,0,113059,47.1,,S', 1), (u'A\",male,50,1,0,A/5.', 1), (u'Elisabeth', 1), (u'Percy', 1), (u'(Miriam\"\")\"\"\",female,45,0,0,2696,7.225,,C', 1), (u'William\",male,,1,1,A/5.', 1), (u'Patrick\",male,,0,0,370374,7.75,,Q', 1), (u'Robert', 1), (u'1300,3,\"Riordan,', 1), (u'1003,3,\"Shine,', 1), (u'942,1,\"Smith,', 1), (u'Isaac', 1), (u'1274,3,\"Risien,', 1), (u'894,2,\"Myles,', 1), (u'42795,7.55,,S', 1), (u'McDougald)\",female,60,1,4,19950,263,C23', 1), (u'971,3,\"Doyle,', 1), (u'1243,2,\"Stokes,', 1), (u'1301,3,\"Peacock,', 1), (u'Truelove', 1), (u'(Emma)\",female,,0,0,364498,14.5,,S', 1), (u'Maria\",female,22,2,0,315152,8.6625,,S', 1), (u'33595,15.75,,S', 1), (u'1287,1,\"Smith,', 1), (u'Luigi\",male,,0,0,SOTON/O.Q.', 1), (u'1021,3,\"Petersen,', 1), (u'1052,3,\"Smyth,', 1), (u'1113,3,\"Reynolds,', 1), (u'1186,3,\"Wittevrongel,', 1), (u'Khalil,', 1), (u'Rogers)\",female,22,0,0,W./C.', 1), (u'1222,2,\"Davies,', 1), (u'1164,1,\"Clark,', 1), (u'1141,3,\"Khalil,', 1), (u'Joseph\",male,26,0,0,330910,7.8792,,Q', 1), (u'William\",male,17,0,0,S.O.C.', 1), (u'Margareta\",female,38,4,2,347091,7.775,,S', 1), (u'Frederick\",male,40,1,6,CA', 1), (u'Cresson\",male,46,0,0,694,26,,S', 1), 
(u'1291,3,\"Conlon,', 1), (u'936,1,\"Kimball,', 1), (u'Mr.', 1), (u'(Elizabeth', 1), (u'992,1,\"Stengel,', 1), (u'15185,10.5,,S', 1), (u'960,1,\"Tucker,', 1), (u'Henry\",male,31,0,0,21332,7.7333,,Q', 1), (u'Flora\",female,28,3,2,19950,263,C23', 1), (u'Colvin)\",female,30,0,0,237249,13,,S', 1), (u'Fermina\",female,39,0,0,PC', 1), (u'Isidor', 1), (u'Ralph', 1), (u'Franz\",male,27,0,0,SC/PARIS', 1), (u'Winifred\",female,22,0,1,112378,59.4,,C', 1), (u'Toogood)\",female,47,1,0,W.E.P.', 1), (u'957,2,\"Corey,', 1), (u'1121,2,\"Hocking,', 1), (u'1228,2,\"de', 1), (u'Edwin\",male,32.5,0,0,113503,211.5,C132,C', 1), (u'IV\",male,53,0,0,113780,28.5,C51,C', 1), (u'952,3,\"Dika,', 1), (u'1110,1,\"Widener,', 1), (u'1279,2,\"Ashby,', 1), (u'926,1,\"Mock,', 1), (u'950,3,\"Davison,', 1), (u'1050,1,\"Borebank,', 1), (u'Elias)\"\"\",female,,1,0,2660,14.4542,,C', 1), (u'1226,3,\"Cor,', 1), (u'Hughes)\",female,18,1,0,13695,60,C31,S', 1), (u'923,2,\"Jefferys,', 1), (u'William\",male,47,0,0,C.A.', 1), (u'Reginald', 1), (u'Edgar\",male,5,4,2,347077,31.3875,,S', 1), (u'Gordon\",male,32,0,0,237216,13.5,,S', 1), (u'Gerios\",male,21,0,0,2692,7.225,,C', 1), (u'4001,22.525,,S', 1), (u'1036,1,\"Lindeberg-Lind,', 1), (u'Watson\",male,18,2,2,W./C.', 1), (u'930,3,\"Sap,', 1), (u'1122,2,\"Sweet,', 1), (u'Edward\",male,0.75,1,1,SOTON/O.Q.', 1), (u'1053,3,\"Touma,', 1), (u'Grga\",male,18,0,0,315091,8.6625,,S', 1), (u'Edvard\",male,29,0,0,STON/O', 1), (u'Maxmillian', 1), (u'Magnin)\",female,24,1,1,S.C./PARIS', 1), (u'956,1,\"Ryerson,', 1), (u'Henry\",male,33,0,0,A./5.', 1), (u'Ocana,', 1), (u'1106,3,\"Andersson,', 1), (u'(Mr', 1), (u'Caroline\",female,30,0,0,36928,164.8667,C7,S', 1), (u'Emilie\",female,39,0,0,24160,211.3375,,S', 1), (u'Benjamin', 1), (u'Gerald\",male,43,1,0,17765,27.7208,D40,C', 1), (u'2168,15.0333,,C', 1), (u'Thomson\",male,36,0,0,13050,75.2417,C6,C', 1), (u'Larned', 1), (u'1049,3,\"Lundin,', 1), (u'1045,3,\"Klasen,', 1), (u'Mowad)\"\"\",female,19,1,1,2653,15.7417,,C', 1), (u'2.', 1), 
(u'958,3,\"Burns,', 1), (u'Pillsbury\",male,24,1,0,21228,82.2667,B45,S', 1), (u'1112,2,\"Duran', 1), (u'Alice\",female,31,0,0,16966,134.5,E39', 1), (u'George\",male,18.5,0,0,248734,13,F,S', 1), (u'Samuel\",male,26,0,0,347075,7.775,,S', 1), (u'Planke,', 1), (u'1303,1,\"Minahan,', 1), (u'1190,1,\"Loring,', 1), (u'Siegel)\",female,76,1,0,19877,78.85,C46,S', 1), (u'17758,108.9,C105,C', 1), (u'1094,1,\"Astor,', 1), (u'940,1,\"Bucknell,', 1), (u'Baxter)\",female,27,1,1,PC', 1), (u'1135,3,\"Hyman,', 1), (u'Castello,', 1), (u'J\",female,12,0,0,C.A.', 1), (u'(Baron', 1), (u'1095,2,\"Quick,', 1), (u'Augustus\",male,57,1,0,PC', 1), (u'Eliza', 1), (u'1155,3,\"Klasen,', 1), (u'17608,262.375,,C', 1), (u'1286,3,\"Kink-Heilmann,', 1), (u'Guillaume', 1), (u'14266,10.5,F33,S', 1), (u'1249,3,\"Lockyer,', 1), (u'1181,3,\"Ford,', 1), (u'Francis\",male,26,1,1,248738,29,,S', 1), (u'Joseph\",male,13,0,2,C.A.', 1), (u'(Edith', 1), (u'Manta', 1), (u'C27,S', 1), (u'Manley\",male,64,1,0,110813,75.25,D37,C', 1), (u'William', 1), (u'Gustaf', 1), (u'951,1,\"Chaudanson,', 1), (u'1167,2,\"Bryhl,', 1), (u'Johan\",male,24,0,0,7266,9.325,,S', 1), (u'1256,1,\"Harder,', 1), (u'993,2,\"Weisz,', 1), (u'G', 1), (u'1588,7.575,,S', 1), (u'Harold', 1), (u'1302,3,\"Naughton,', 1), (u'(Eleanor', 1), (u'Patrick\",male,24,0,0,371109,7.25,,Q', 1), (u'Paul', 1), (u'Ilario', 1), (u'Lucien', 1), (u'Helena\",female,2,1,1,370129,20.2125,,S', 1), (u'Samuel\",male,21,2,0,A/4', 1), (u'Dunton\",male,50,1,1,113503,211.5,C80,C', 1), (u'Julia\",female,,0,0,335432,7.7333,,Q', 1), (u'1196,3,\"McCarthy,', 1), (u'Nassr\",male,,0,0,2676,7.225,,C', 1), (u'Long)\",female,31,0,0,CA', 1), (u'2861,15.5792,,C', 1), (u'913,3,\"Olsen,', 1), (u'Harry\",male,19,0,0,28004,10.5,,S', 1), (u'3101270,7.925,,S', 1), (u'Bridget', 1), (u'1051,3,\"Peacock,', 1), (u'Vassilios\"\")\"\"\",male,18.5,0,0,2682,7.2292,,C', 1), (u'Joseph\",male,36.5,1,0,345572,17.4,,S', 1), (u'1269,2,\"Cotterill,', 1), (u'Axel', 1), (u'Gilbert', 1), (u'2079,37.0042,,C', 
1), (u'17580,29.7,A18,C', 1), (u'Olof', 1), (u'1085,2,\"Lingane,', 1), (u'1478,8.05,,S', 1), (u'17368,3.1708,,S', 1), (u'1033,1,\"Daniels,', 1), (u'Frederick\",male,,0,0,359309,8.05,,S', 1), (u'(Jennie', 1), (u'1182,1,\"Rheims,', 1), (u'1067,2,\"Brown,', 1), (u'1208,1,\"Spencer,', 1), (u'1144,1,\"Clark,', 1), (u'Scott\",female,45,0,0,PC', 1), (u'Winifred', 1), (u'Eugen\",male,33,0,0,347465,7.8542,,S', 1), (u'1108,3,\"Mahon,', 1), (u'James\",male,42,0,0,110489,26.55,D22,S', 1), (u'Kalle', 1), (u'1263,1,\"Wilson,', 1), (u'1307,3,\"Saether,', 1), (u'999,3,\"Ryan,', 1), (u'Tyrell', 1), (u'17760,135.6333,C32,C', 1), (u'1248,1,\"Brown,', 1), (u'Joseph\",male,24,0,0,S.O./P.P.', 1), (u'Bridget\",female,,0,0,364856,7.75,,Q', 1), (u'Potter)\",female,23,0,1,11767,83.1583,C54,C', 1), (u'Ingvar\",male,21,0,0,236854,13,,S', 1), (u'Brown\",male,49,0,0,19924,26,,S', 1), (u'John\",male,61,0,0,235509,12.35,,Q', 1), (u'1099,2,\"Collett,', 1), (u'Richard', 1), (u'1056,2,\"Peruschitz,', 1), (u'Raffull\",male,20,0,0,2679,7.225,,C', 1), (u'Hull\",male,26,0,0,237670,13,,S', 1), (u'Marie\",female,16,0,0,348125,7.65,,S', 1), (u'(Shawneene', 1), (u'1096,2,\"Andrew,', 1), (u'Vilhelm', 1), (u'1132,1,\"Lindstrom,', 1), (u'1255,3,\"Strilic,', 1), (u'Marius\",male,24,0,0,342441,8.05,,S', 1), (u'21175,7.25,,S', 1), (u'Nile)\",female,26,0,2,SOTON/O.Q.', 1), (u'1057,3,\"Kink-Heilmann,', 1), (u'Albert', 1), (u'Morris)\",female,43,1,0,11778,55.4417,C116,C', 1), (u'Shedid\",male,22.5,0,0,2698,7.225,,C', 1), (u'1187,3,\"Angheloff,', 1), (u'(Abi', 1), (u'1233,3,\"Lundstrom,', 1), (u'1059,3,\"Ford,', 1), (u'1054,2,\"Wright,', 1), (u'955,3,\"Bradley,', 1), (u'Minnie\"\"', 1), (u'2123,41.5792,,C', 1), (u'1158,1,\"Chisholm,', 1), (u'Holland\",male,30,0,0,113801,45.5,,S', 1), (u'Jennie\",female,23,0,0,SOTON/OQ', 1), (u'Frederick', 1), (u'Trevaskis)\"\"\",female,29,0,2,29103,23,,S', 1), (u'Arman)\",female,60,1,0,24065,26,,S', 1), (u'Mapriededer\",male,26.5,0,0,2656,7.225,,C', 1), (u'(Sara', 1), (u'Billiard,', 
1), (u'1296,1,\"Frauenthal,', 1), (u'Patrick\",male,,0,0,7935,7.75,,Q', 1), (u'1197,1,\"Crosby,', 1), (u'Isidor\",male,67,1,0,PC', 1), (u'Donald', 1), (u'L\",male,,0,0,111163,26,,S', 1), (u'Patrick\",male,,0,0,368573,7.75,,Q', 1), (u'1200,1,\"Hays,', 1), (u'1073,1,\"Compton,', 1), (u'Dutton)\",female,48,1,0,PC', 1), (u'1251,3,\"Lindell,', 1), (u'Noel\",male,,0,0,237735,15.0458,D,C', 1), (u'Vivian\",male,42,1,1,28220,32.5,,S', 1), (u'1042,1,\"Earnshaw,', 1), (u'Farquarson)\",female,18,1,0,113773,53.1,D30,S', 1), (u'Stephen', 1), (u'J\",male,21,0,0,342684,8.05,,S', 1), (u'(Ellen', 1), (u'Katherine\",female,35,0,0,9232,7.75,,Q', 1), (u'Evan\",male,22,0,0,SC/A4', 1), (u'(Mary', 1), (u'A\",male,34,1,0,226875,26,,S', 1), (u'Escott', 1), (u'Jr', 1), (u'White)', 1), (u'Willie\"\"\"\"\",male,,1,2,W./C.', 1), (u'985,3,\"Guest,', 1), (u'Gerda', 1), (u'(Caroline', 1), (u'Dennick\",male,57,1,1,36928,164.8667,,S', 1), (u'(Virginia', 1), (u'1041,2,\"Lahtinen,', 1), (u'Emilia\",female,1,1,1,350405,12.1833,,S', 1), (u'Washington', 1), (u'G63,S', 1), (u'Lovisa\",female,28,0,0,347086,7.775,,S', 1), (u'James\",male,30,1,0,CA', 1), (u'(Charlotte', 1), (u'G\",male,33,0,0,113790,26.55,,S', 1), (u'Thorne\"\")\"\"\",male,46,0,0,PC', 1), (u'1254,2,\"Ware,', 1), (u'1077,2,\"Maybery,', 1), (u'17608,262.375,B61,C', 1), (u'17597,61.3792,,C', 1), (u'1000,3,\"Willer,', 1), (u'C', 1), (u'1205,3,\"Carr,', 1), (u'Gifford', 1), (u'Philip\",male,24,1,0,13695,60,C31,S', 1), (u'Ada\",female,,8,2,CA.', 1), (u'(Hulda', 1), (u'Minko\",male,26,0,0,349202,7.8958,,S', 1), (u'(Eva', 1), (u'Kate\",female,17,0,0,AQ/3.', 1), (u'B66,C', 1), (u'984,1,\"Davidson,', 1), (u'17483,221.7792,C55', 1), (u'More,', 1), (u'1156,2,\"Portaluppi,', 1), (u'Palmquist,', 1), (u'Thomas\",male,25,0,0,C.A.', 1), (u'1242,1,\"Greenfield,', 1), (u'Ralph\",male,24,0,0,248726,13.5,,S', 1), (u'Constance\",female,21,0,0,113795,26.55,,S', 1), (u'Frank\",male,,0,0,359306,8.05,,S', 1), (u'P\",male,,1,0,2621,6.4375,,C', 1), 
(u'Edvin\",male,20,0,0,350416,7.8542,,S', 1), (u'34050,10.5,,S', 1), (u'1199,3,\"Aks,', 1), (u'(Alice', 1), (u'1131,1,\"Douglas,', 1), (u'17756,83.1583,E45,C', 1), (u'Florence', 1), (u'1014,1,\"Schabert,', 1), (u'916,1,\"Ryerson,', 1), (u'Laura', 1), (u'Karen', 1), (u'Olaf\",male,,0,0,345498,7.775,,S', 1), (u'Graham', 1), (u'1176,3,\"Rosblom,', 1), (u'2147,13.8583,,C', 1), (u'Algot\",male,24,0,0,349911,7.775,,S', 1), (u'Ward)\",female,60,0,0,11813,76.2917,D15,C', 1), (u'14879,73.5,,S', 1), (u'978,3,\"Barry,', 1), (u'5734,61.175,E31,S', 1), (u'17608,262.375,B57', 1), (u'Leonard', 1), (u'(Catavelas', 1), (u'990,3,\"Braf,', 1), (u'Mariana', 1), (u'Martin\",male,28,0,0,C', 1), (u'Amalie\",female,35,0,0,113503,211.5,C130,C', 1), (u'986,1,\"Birnbaum,', 1), (u'Col.', 1), (u'Carl', 1), (u'1029,2,\"Schmidt,', 1), (u'Tannous\",male,,0,0,2684,7.225,,C', 1), (u'Demetrios\",male,18,1,0,2680,14.4542,,C', 1), (u'Maggie\"\"\"\"\",female,30,0,0,382650,6.95,,Q', 1), (u'Allis\",female,10,5,2,CA', 1), (u'Abraham\",male,,0,0,3470,7.8875,,S', 1), (u'Bertram', 1), (u'1198,1,\"Allison,', 1), (u'1047,3,\"Duquemin,', 1), (u'(Omine', 1), (u'Daniel\",male,21,0,0,330920,7.8208,,Q', 1), (u'Barbara', 1), (u'1309,3,\"Peter,', 1), (u'B60,C', 1), (u'Dagmar', 1), (u'1294,1,\"Gibson,', 1), (u'Elin', 1), (u'(Luise', 1), (u'1252,3,\"Sage,', 1), (u'Martinez', 1), (u'3101308,7.05,,S', 1), (u'Ylio\",male,,0,0,349220,7.8958,,S', 1), (u'1174,3,\"Fleming,', 1), (u'924,3,\"Dean,', 1), (u'1028,3,\"Zakarian,', 1), (u'2167,27.7208,,C', 1), (u'A\",male,22,0,0,347065,7.775,,S', 1), (u'John\",male,32,0,0,STON/OQ.', 1), (u'Edwin', 1), (u'Martin\",male,55,1,0,PC', 1), (u'1013,3,\"Kiernan,', 1), (u'1189,3,\"Samaan,', 1), (u'Lofqvist)\",female,36,0,2,350405,12.1833,,S', 1), (u'Jr\",male,37,1,1,PC', 1), (u'Susanna', 1), (u'39186,8.05,,S', 1), (u'1015,3,\"Carver,', 1), (u'Nellie\"\"\"\"\",female,20,2,1,29105,23,,S', 1), (u'1225,3,\"Nakid,', 1), (u'Leander\",male,26,0,0,347070,7.775,,S', 1), (u'Frederic', 1), 
(u'Borie\",male,13,2,2,PC', 1), (u'Phyllis', 1), (u'Bertram\",male,41,0,0,113054,30.5,A21,S', 1), (u'Boulton', 1), (u'31029,31.5,,S', 1), (u'1101,3,\"Delalic,', 1), (u'Harry\",male,,0,0,LP', 1), (u'Juhantytar', 1), (u'Michael', 1), (u'899,2,\"Caldwell,', 1), (u'Blun)\",female,63,1,0,PC', 1), (u'Patrick\",male,,0,0,366713,7.75,,Q', 1), (u'1129,3,\"Baccos,', 1), (u'Brito,', 1), (u'921,3,\"Samaan,', 1), (u'Jacob\",male,47,1,0,PC', 1), (u'988,1,\"Cavendish,', 1), (u'Redjo\",male,25,0,0,349250,7.8958,,S', 1), (u'W\",female,19,0,0,28404,13,,S', 1), (u'1133,2,\"Christy,', 1), (u'1161,3,\"Pokrnic,', 1), (u'(Nelle', 1), (u'y', 1), (u'1084,3,\"van', 1), (u'Karl', 1), (u'Sarah', 1), (u'E41,C', 1), (u'Posse)\",female,55,0,0,112377,27.7208,,C', 1), (u'1114,2,\"Cook,', 1), (u'915,1,\"Williams,', 1), (u'1211,2,\"Jefferys,', 1), (u'1005,3,\"Buckley,', 1), (u'Vassilios', 1), (u'Nils', 1), (u'Louise\",female,33,0,0,PC', 1), (u'947,3,\"Rice,', 1), (u'(Antoinette', 1), (u'Judith', 1), (u'1138,2,\"Karnes,', 1), (u'R\",male,41,1,0,17464,51.8625,D21,S', 1), (u'Sidney', 1), (u'1245,2,\"Herman,', 1), (u'II\",male,21,0,1,PC', 1), (u'B54', 1), (u'1043,3,\"Matinoff,', 1), (u'Lucien\",male,,0,0,PC', 1), (u'(Emily', 1), (u'1072,2,\"McCrie,', 1), (u'Philip', 1), (u'991,3,\"Nancarrow,', 1), (u'Thornton', 1), (u'1204,3,\"Sadowitz,', 1), (u'1234,3,\"Sage,', 1), (u'2166,13.8625,D38,C', 1), (u'897,3,\"Svensson,', 1), (u'1001,2,\"Swane,', 1), (u'1142,2,\"West,', 1), (u'John\",male,57,0,0,244346,13,,S', 1), (u'892,3,\"Kelly,', 1), (u'Edvard', 1), (u'PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked', 1), (u'Nikolai', 1), (u'973,1,\"Straus,', 1), (u'Fredrik', 1), (u'1030,3,\"Drapkin,', 1), (u'(Ruth', 1), (u'1140,2,\"Hold,', 1), (u'893,3,\"Wilkes,', 1), (u'Victorine\",female,36,0,0,PC', 1), (u'Light)\",female,33,1,2,C.A.', 1), (u'1120,3,\"Everett,', 1), (u'Fuller', 1), (u'1150,2,\"Bentham,', 1), (u'Weller\"\")\"\"\",male,,0,0,3410,8.7125,,S', 1), 
(u'Honora\",female,,0,0,364859,7.75,,Q', 1), (u'1071,1,\"Compton,', 1), (u'1025,3,\"Thomas,', 1), (u'Alfred', 1), (u'963,3,\"Minkoff,', 1), (u'Lily\"\"', 1), (u'Hannah\",female,,0,0,365237,7.75,,Q', 1), (u'1232,2,\"Fillbrook,', 1), (u'Benjamin\",male,63,1,0,24065,26,,S', 1), (u'Crispin\",male,,0,0,112051,0,,S', 1), (u'Julius', 1), (u'Dunton', 1), (u'1178,3,\"Franklin,', 1), (u'Simon\",male,,0,0,S.O./P.P.', 1), (u'13528,21,,S', 1), (u'John\",male,11.5,1,1,A/5.', 1), (u'Neal', 1), (u'Valentine\",male,29,1,0,2003,26,,S', 1), (u'Bruce\",male,49,0,0,112058,0,B52', 1), (u'17594,29.7,A9,C', 1), (u'1213,3,\"Krekorian,', 1), (u'1304,3,\"Henriksson,', 1), (u'Michael\",male,,0,0,330971,7.8792,,Q', 1), (u'von', 1), (u'Oakley\",male,45,1,1,16966,134.5,E34,C', 1), (u'946,2,\"Mangiavacchi,', 1), (u'Florentina\",female,30,1,0,SC/PARIS', 1), (u'Christopher\",male,42,0,0,113038,42.5,B11,S', 1), (u'1207,3,\"Hagardon,', 1), (u'Catherine', 1), (u'Karl\",male,9,0,1,C', 1), (u'981,2,\"Wells,', 1), (u'17569,146.5208,B78,C', 1), (u'953,2,\"McCrae,', 1), (u'1093,3,\"Danbom,', 1), (u'911,3,\"Assaf', 1), (u'Heilmann)\",female,26,1,1,315153,22.025,,S', 1), (u'Youssef\",male,7,1,1,2650,15.2458,,C', 1), (u'1117,3,\"Moubarek,', 1), (u'30631,7.7333,,Q', 1), (u'17756,83.1583,E52,C', 1), (u'1184,3,\"Nasr,', 1), (u'Aaron', 1), (u'1009,3,\"Sandstrom,', 1), (u'Gladys', 1), (u'Jakob\",male,25,0,0,13905,26,,C', 1), (u'Messemaeker,', 1), (u'Ivan\",male,27,0,0,315083,8.6625,,S', 1), (u'1280,3,\"Canavan,', 1), (u'1086,2,\"Drew,', 1), (u'(Malvina', 1), (u'Albert\",male,21,0,0,345501,7.775,,S', 1), (u'Hays)\",female,27,1,2,F.C.', 1), (u'1191,3,\"Johansson,', 1), (u'17562,27.7208,D43,C', 1), (u'Nathan\",male,41,0,0,SOTON/O.Q.', 1), (u'Maria\",female,22,0,0,7548,8.9625,,S', 1), (u'1154,2,\"Wells,', 1), (u'Bengtsson', 1), (u'1002,2,\"Stanton,', 1), (u'1149,3,\"Niklasson,', 1), (u'1148,3,\"Mahon,', 1), (u'William\",male,30,1,1,250651,26,,S', 1), (u'1268,3,\"Kink,', 1), (u'Fredrik\",male,23,1,0,347072,13.9,,S', 
1), (u'925,3,\"Johnston,', 1), (u'Jean', 1), (u'Johanna', 1), (u'B63', 1), (u'Abi-Saab)\",female,38,0,0,2688,7.2292,,C', 1), (u'1083,1,\"Salomon,', 1), (u'Marinko\",male,,0,0,349238,7.8958,,S', 1), (u'17585,79.2,,C', 1), (u'Murray', 1), (u'Hannah\"\"\"\"\",female,,0,0,334915,7.7208,,Q', 1), (u'Tome\",male,24,0,0,315092,8.6625,,S', 1), (u'Bartol\",male,35,0,0,349230,7.8958,,S', 1), (u'1298,2,\"Ware,', 1), (u'1305,3,\"Spector,', 1), (u'938,1,\"Chevre,', 1), (u'1276,2,\"Wheeler,', 1), (u'Lane', 1), (u'1032,3,\"Goodwin,', 1), (u'Ester', 1), (u'(Thamine', 1), (u'1017,3,\"Cribb,', 1), (u'1180,3,\"Mardirosian,', 1), (u'Ortin\",male,27,0,0,2670,7.225,,C', 1), (u'Andersson)\",female,22,1,0,347072,13.9,,S', 1), (u'May', 1), (u'Henry\",male,54,1,0,11778,55.4417,C116,C', 1), (u'Easu)\",female,18,0,0,2657,7.2292,,C', 1), (u'1235,1,\"Cardeza,', 1), (u'Johannes\",male,25,0,0,STON/O', 1), (u'Marion\",female,26,0,0,220844,13.5,,S', 1), (u'Ling\",male,,0,0,1601,56.4958,,S', 1), (u'E', 1), (u'Irene\",female,1,1,1,PP', 1), (u'1076,1,\"Douglas,', 1), (u'H\",male,44,0,0,248746,13,,S', 1), (u'1136,3,\"Johnston,', 1), (u'1275,3,\"McNamee,', 1), (u'1137,1,\"Kenyon,', 1), (u'1082,2,\"Angle,', 1), (u'Morgan', 1), (u'1078,2,\"Phillips,', 1), (u'Needs)\",female,47,1,0,363272,7,,S', 1), (u'Frances)\",female,45,0,2,237789,30,,S', 1), (u'Vera\",female,8,1,1,26360,26,,S', 1), (u'\",female,48,0,2,C.A.', 1), (u'Valtcho\",male,43,0,0,349226,7.8958,,S', 1), (u'1090,2,\"Baimbrigge,', 1), (u'1299,1,\"Widener,', 1), (u'Bertha', 1), (u'Dorothy', 1), (u'998,3,\"Buckley,', 1), (u'1247,1,\"Julian,', 1), (u'Stuart', 1), (u'Hanna\",male,,2,0,2662,21.6792,,C', 1), (u'Larned\",male,61,1,3,PC', 1), (u'1130,2,\"Hiltunen,', 1), (u'Sarkis\",male,,0,0,2655,7.2292,F', 1), (u'1270,1,\"Hipkins,', 1), (u'1201,3,\"Hansen,', 1), (u'30769,10.5,,S', 1), (u'Emilio\",male,29,0,0,SC/PARIS', 1), (u'C64,C', 1), (u'Augustus\",male,30,0,0,248744,13,,S', 1), (u'1012,2,\"Watt,', 1), (u'John\",male,,0,0,AQ/4', 1), (u'Norris', 1), 
(u'Betros', 1), (u'2343,69.55,,S', 1), (u'1259,3,\"Riihivouri,', 1), (u'(Dorothy', 1), (u'Neshan\",male,25,0,0,2654,7.2292,F', 1), (u'Emanuel\",male,0.33,0,2,347080,14.4,,S', 1), (u'1221,2,\"Enander,', 1), (u'J', 1), (u'Henry', 1), (u'Georgetta', 1), (u'3101291,7.925,,S', 1), (u'Warburton', 1), (u'Rodriguez,', 1), (u'929,3,\"Cacic,', 1), (u'(Rosalie', 1), (u'Joaquim\",male,32,0,0,244360,13,,S', 1), (u'1068,2,\"Sincock,', 1), (u'(Addie\"\"', 1), (u'Lamson)\",female,55,2,0,11770,25.7,C101,S', 1), (u'Fardon)\",male,,0,0,SOTON/O.Q.', 1), (u'1044,3,\"Storey,', 1), (u'E57,C', 1), (u'Elizabeth\",female,24,0,0,368702,7.75,,Q', 1), (u'Elias\",male,,2,0,2662,21.6792,,C', 1), (u'Francis\",male,46,0,0,13050,75.2417,C6,C', 1), (u'Jelka\",female,23,0,0,315085,8.6625,,S', 1), (u'George', 1), (u'1055,3,\"Pearce,', 1), (u'(Florence', 1), (u'Howard', 1), (u'3337,14.5,,S', 1), (u'Kate\",female,30,0,0,330972,7.6292,,Q', 1), (u'1145,3,\"Salander,', 1), (u'3085,26,,S', 1), (u'2315,20.575,,S', 1), (u'Persson)\",female,30,1,0,349910,15.55,,S', 1), (u'Camille\",male,36,0,0,345771,9.5,,S', 1), (u'Edward\",male,55,0,0,680,50,C39,S', 1), (u'Miller', 1), (u'Ponsonby\",male,23,0,0,12749,93.5,B24,S', 1), (u'902,3,\"Ilieff,', 1), (u'Patrick\",male,21,0,0,364858,7.75,,Q', 1), (u'(Waika', 1), (u'1060,1,\"Cassebeer,', 1), (u'1166,3,\"Saade,', 1), (u'987,3,\"Tenglin,', 1), (u'1064,3,\"Dyker,', 1), (u'August\",male,26,0,0,248659,13,,S', 1), (u'1098,3,\"McGowan,', 1), (u'David', 1), (u'Alexander\",male,50,1,0,SC/AH', 1), (u'Warner', 1), (u'Joseph\",male,25,0,0,F.C.C.', 1), (u'1105,2,\"Howard,', 1), (u'1206,1,\"White,', 1), (u'Mirko\",male,17,0,0,349232,7.8958,,S', 1), (u'49867,7.55,,S', 1), (u'Beatrice', 1), (u'Lazar\",male,21,0,0,349211,7.8958,,S', 1), (u'1162,1,\"McCaffry,', 1), (u'1010,1,\"Beattie,', 1), (u'Claus', 1), (u'954,3,\"Bjorklund,', 1), (u'1257,3,\"Sage,', 1), (u'John\",male,28,0,0,392095,7.25,,S', 1), (u'1218,2,\"Becker,', 1), (u'Filip', 1), (u'Sebastiano', 1), (u'1179,1,\"Snyder,', 1), 
(u'Mate\",male,17,0,0,315095,8.6625,,S', 1), (u'Jenny', 1), (u'903,1,\"Jones,', 1), (u'Ellen\",female,29,0,0,PC', 1), (u'August', 1), (u'Washington\",male,53,1,1,33638,81.8583,A34,S', 1), (u'Elida\",female,23,0,0,347469,7.8542,,S', 1), (u'1262,2,\"Giles,', 1), (u'Parham\",male,,0,0,113778,26.55,D34,S', 1), (u'974,1,\"Case,', 1), (u'1229,3,\"Elias,', 1), (u'Agnes', 1), (u'1224,3,\"Thomas,', 1), (u'Harry\",male,38,1,0,28664,21,,S', 1), (u'1241,2,\"Walcroft,', 1), (u'Olga', 1), (u'Olaus', 1), (u'Edward\",male,,0,0,383162,7.75,,Q', 1), (u'(Sophie', 1), (u'17599,71.2833,C85,C', 1), (u'Ali\",male,23,0,0,SOTON/O.Q.', 1), (u'1153,3,\"Nilsson,', 1), (u'1253,2,\"Mallet,', 1), (u'1306,1,\"Oliva', 1), (u'(Carrie', 1), (u'Joseph\",male,,1,0,2689,14.4583,,C', 1), (u'Carlo,', 1), (u'Miller\",male,27,1,0,13508,136.7792,C89,C', 1), (u'Thure', 1), (u'Isidor\",male,25,0,0,350033,7.7958,,S', 1), (u'1074,1,\"Marvin,', 1), (u'1170,2,\"Ware,', 1), (u'Milligan', 1), (u'Jacobsen', 1), (u'Thelma\"\")\"\"\",female,16,1,1,2625,8.5167,,C', 1), (u'1070,2,\"Becker,', 1), (u'Kate\",female,24,1,2,220845,65,,S', 1), (u'Said', 1), (u'941,3,\"Coutts,', 1), (u'Treasteall\",female,3,1,1,SOTON/O.Q.', 1), (u'Frances', 1), (u'Seman\",male,,0,0,2622,7.2292,,C', 1), (u'Walter', 1), (u'Carmichael', 1), (u'1123,1,\"Willard,', 1), (u'Philipp', 1), (u'Emil', 1), (u'Julius\",male,25,0,0,345768,9.5,,S', 1), (u'Wilhelm\",male,32,0,0,347079,7.775,,S', 1), (u'Ragnhild\",female,22,0,1,113509,61.9792,B36,C', 1), (u'1271,3,\"Asplund,', 1), (u'(Gertrude', 1), (u'Johan', 1), (u'J\",female,0.92,1,2,C.A.', 1), (u'1119,3,\"McNeill,', 1), (u'Eloise', 1), (u'1165,3,\"Lennon,', 1), (u'Churchill', 1), (u'1284,3,\"Abbott,', 1), (u'Pillsbury', 1), (u'Harry\",male,40,1,0,2926,26,,S', 1), (u'Hungerford)\",female,53,0,0,PC', 1), (u'Treanor)\"\"\",female,36,0,2,C.A.', 1), (u'935,2,\"Corbett,', 1), (u'Oscar\",male,13,4,2,347077,31.3875,,S', 1), (u'1104,2,\"Deacon,', 1), (u'Dart', 1), (u'961,1,\"Fortune,', 1), (u'Edith', 1), 
(u'1089,3,\"Nilsson,', 1), (u'Delia\",female,22,0,0,334914,7.725,,Q', 1), (u'1297,2,\"Nourney,', 1), (u'1152,3,\"de', 1), (u'Daniel\",male,35,0,0,233734,12.35,,Q', 1), (u'995,3,\"Johansson', 1), (u'Sivertsen\",male,38.5,0,0,SOTON/O.Q.', 1), (u'Creighton\",male,30,1,2,113781,151.55,C22', 1), (u'1203,3,\"Vartanian,', 1), (u'Robert\",male,,0,0,376563,8.05,,S', 1), (u'(Frances)\",female,,0,4,4133,25.4667,,S', 1), (u'1023,1,\"Gracie,', 1), (u'Helen', 1), (u'C57,S', 1), (u'Grace', 1), (u'1125,3,\"Linehan,', 1), (u'(Olive', 1), (u'Albert\",male,27,0,0,315154,8.6625,,S', 1), (u'Louisa\",female,21,0,1,S.O./P.P.', 1), (u'L', 1), (u'Herbert', 1), (u'Strouse)\",female,45,0,1,PC', 1), (u'Melville\",male,55,1,1,12749,93.5,B69,S', 1), (u'Corse\",female,36,0,0,PC', 1), (u'Akar\",male,6,1,1,2678,15.2458,,C', 1), (u'2144,46.9,,S', 1), (u'1172,3,\"Oreskovic,', 1), (u'1116,1,\"Candee,', 1), (u'980,3,\"O\\'Donoghue,', 1), (u'Amenia\"\"', 1), (u'392083,8.05,,S', 1), (u'17531,31.6792,A29,C', 1), (u'Jeannie\",female,37,0,0,368364,7.75,,Q', 1), (u'1022,3,\"Spinner,', 1), (u'37671,15.9,,S', 1), (u'Nora\",female,,0,0,36568,15.5,,Q', 1), (u'3101314,7.25,,S', 1), (u'(Margaretha', 1), (u'1115,3,\"Karlsson,', 1), (u'912,1,\"Rothschild,', 1), (u'Annan)\",female,25,1,0,11765,55.4417,E50,C', 1), (u'920,1,\"Brady,', 1), (u'1209,2,\"Rogers,', 1), (u'2159,12.875,,S', 1), (u'George\",male,,1,9,CA.', 1), (u'1128,1,\"Warren,', 1), (u'Joseph', 1), (u'34644,12.7375,,C', 1), (u'Eugenia', 1), (u'Marcella', 1), (u'Leopold\",male,27,1,0,228414,26,,S', 1), (u'Oskar', 1), (u'Charles\",male,23,0,0,350054,7.7958,,S', 1), (u'Elizabeth', 1), (u'Solvang)\",female,,0,0,65305,8.1125,,S', 1), (u'William\",male,18,0,0,S.O.C.', 1), (u'Samuel', 1), (u'989,3,\"Makinen,', 1), (u'Bullen)\",female,,1,9,CA.', 1), (u'Serafino', 1), (u'(Eileen', 1), (u'Fosdick)\",female,,0,0,17770,27.7208,,C', 1), (u'Hilding\",male,27,0,0,350408,7.8542,,S', 1), (u'Parsons)\",female,45,1,0,11753,52.5542,D19,S', 1), (u'1227,1,\"Maguire,', 1), 
(u'Joseph\",male,17,2,0,A/4', 1), (u'Matthew\",male,30,0,0,233478,13,,S', 1), (u'E\",female,24,0,0,382653,7.75,,Q', 1), (u'1127,3,\"Vendel,', 1), (u'Vidaver)\",female,54,1,1,33638,81.8583,A34,S', 1), (u'Ward\",male,41,0,0,237734,15.0458,,C', 1), (u'1046,3,\"Asplund,', 1), (u'959,1,\"Moore,', 1), (u'1214,2,\"Nesson,', 1), (u'Jorgensen\",male,25,0,0,348122,7.65,F', 1), (u'Erik', 1), (u'1087,3,\"Karlsson,', 1), (u'Drake)\",female,58,0,1,PC', 1), (u'Edward\",male,,0,0,1222,7.8792,,S', 1), (u'Adolf', 1), (u'977,3,\"Khalil,', 1), (u'Emil\",male,48,0,0,PC', 1), (u'Stehli)\",female,48,1,1,13567,79.2,B41,C', 1), (u'14260,10.5,,S', 1), (u'(Julia', 1), (u'Natalia\",female,,0,0,330968,7.7792,,Q', 1), (u'Alice\",female,17,0,1,371362,16.1,,S', 1), (u'1018,3,\"Brobeck,', 1), (u'1139,2,\"Drew,', 1), (u'Linhart\",male,32.5,0,0,345775,9.5,,S', 1), (u'Hubert\",male,40,0,0,239059,16,,S', 1), (u'Richard\",male,28,0,0,SC', 1), (u'Sarah\",female,33,0,0,113781,151.55,,S', 1), (u'Lester\",male,2,1,1,29103,23,,S', 1), (u'(Zahie', 1), (u'Mustafa\",male,,0,0,2652,7.2292,,C', 1), (u'1177,3,\"Dennis,', 1), (u'Alexander', 1), (u'Henry\",male,,1,0,386525,16.1,,S', 1), (u'1147,3,\"MacKay,', 1), (u'Rev.', 1), (u'917,3,\"Robins,', 1), (u'J\",male,,1,1,2668,22.3583,,C', 1), (u'James)\",female,51,0,1,PC', 1), (u'1031,3,\"Goodwin,', 1), (u'9549,16.7,G6,S', 1), (u'1258,3,\"Caram,', 1), (u'48871,24.15,,S', 1), (u'17591,50.4958,B10,C', 1), (u'Boeson)\",female,45,0,1,112378,59.4,,C', 1), (u'Forbes\",male,50,0,0,113044,26,E60,S', 1), (u'968,3,\"Miles,', 1), (u'1272,3,\"O\\'Connor,', 1), (u'Peter', 1), (u'Francis\",male,62,0,0,240276,9.6875,,Q', 1), (u'1026,3,\"Dintcheff,', 1), (u'Oliver', 1), (u'3130,7.75,,Q', 1), (u'Julia\",female,27,0,0,330844,7.8792,,Q', 1), (u'937,3,\"Peltomaki,', 1), (u'1260,1,\"Gibson,', 1), (u'945,1,\"Fortune,', 1), (u'1081,2,\"Veal,', 1), (u'D12,C', 1), (u'Emerentia', 1), (u'1160,3,\"Howard,', 1), (u'Israel\",male,26,0,0,244368,13,F2,S', 1), (u'933,1,\"Franklin,', 1), 
(u'1065,3,\"Torfa,', 1), (u'Mary\"\"', 1), (u'Harry\"\"\"\"\",male,21,0,0,29107,11.5,,S', 1), (u'Katie\"\"\"\"\",female,,0,0,383123,7.75,,Q', 1), (u'969,1,\"Cornell,', 1), (u'6607,23.45,,S', 1), (u'922,2,\"Louch,', 1), (u'1088,1,\"Spedden,', 1), (u'17603,59.4,,C', 1), (u'1210,3,\"Jonsson,', 1), (u'1066,3,\"Asplund,', 1), (u'994,3,\"Foley,', 1), (u'Abraham', 1), (u'Maude\",female,20,0,0,C.A.', 1), (u'1079,3,\"Davies,', 1), (u'896,3,\"Hirvonen,', 1), (u'1246,3,\"Dean,', 1), (u'Robert\",male,24,0,0,350409,7.8542,,S', 1), (u'3101315,13.775,,S', 1), (u'962,3,\"Mulvihill,', 1), (u'Samuel\",male,49,1,2,220845,65,,S', 1), (u'1175,3,\"Touma,', 1), (u'752,7.55,,S', 1), (u'Herbert\",male,25,0,0,C.A.', 1), (u'James\",male,40.5,0,0,C.A.', 1), (u'Ivan\",male,27,0,0,349229,7.8958,,S', 1), (u'(Claire', 1), (u'Bloomfield\",male,47,0,0,113796,42.4,,S', 1), (u'Genevieve', 1), (u'17607,39.6,,S', 1), (u'Mock)\",female,35,1,0,13236,57.75,C28,C', 1), (u'Maria\",male,41,0,0,237393,13,,S', 1), (u'1277,2,\"Herman,', 1), (u'Master.', 1), (u'Romaine\",male,45,0,0,PC', 1), (u'1039,3,\"Davies,', 1), (u'(Selena', 1), (u'(Catherine', 1), (u'Elkins)\",female,50,1,1,113503,211.5,C80,C', 1), (u'17755,512.3292,B51', 1), (u'Edward\",male,30,0,0,110469,26,C106,S', 1), (u'1292,1,\"Bonnell,', 1), (u'1173,3,\"Peacock,', 1), (u'Frederick\",male,14,0,0,220845,65,,S', 1), (u'Holmes)\",female,55,0,0,PC', 1), (u'900,3,\"Abrahim,', 1), (u'Bridget\",female,,0,0,370368,7.75,,Q', 1), (u'1151,3,\"Midtsjo,', 1), (u'C25', 1), (u'H', 1), (u'1143,3,\"Abrahamsson,', 1), (u'1040,1,\"Crafton,', 1), (u'1107,1,\"Head,', 1), (u'E46,C', 1), (u'(Lena', 1), (u'Fernand\",male,,0,0,F.C.', 1), (u'3101262,7.25,,S', 1), (u'1293,2,\"Gale,', 1), (u'895,3,\"Wirz,', 1), (u'Allen', 1), (u'2,21,,S', 1), (u'Henry\",male,,0,0,17463,51.8625,E46,S', 1), (u'Clifford', 1), (u'(Argenia', 1), (u'Solomon\",male,42,0,0,211535,13,,S', 1), (u'34651,27.75,,S', 1), (u'1285,2,\"Gilbert,', 1), (u'Lilian', 1), (u'John', 1), (u'(Helga', 1), 
(u'Edmund\",male,30,1,0,13236,57.75,C78,C', 1), (u'David\",male,22,0,0,2658,7.225,,C', 1), (u'Alicia\",female,,2,0,367226,23.25,,Q', 1), (u'1185,1,\"Dodge,', 1), (u'Jego', 1), (u'979,3,\"Badman,', 1), (u'Gustafsson\",male,40,1,5,347077,31.3875,,S', 1), (u'Helene', 1), (u'12750,52,B71,S', 1), (u'B53', 1), (u'Roderick', 1), (u'996,3,\"Thomas,', 1), (u'Archibald', 1), (u'Ingeborg', 1), (u'1250,3,\"O\\'Keefe,', 1), (u'Constance', 1), (u'Ethel', 1), (u'Lindqvist)\",female,22,1,1,3101298,12.2875,,S', 1), (u'Lindsey', 1), (u'Miller)\",female,,0,0,F.C.C.', 1), (u'John\",male,,0,0,368783,7.75,,Q', 1), (u'983,3,\"Pedersen,', 1), (u'Hilda', 1), (u'Hill)\",female,29,1,0,26707,26,,S', 1), (u'918,1,\"Ostby,', 1), (u'1124,3,\"Wiklund,', 1), (u'Betros\",male,,1,0,2660,14.4542,,C', 1), (u'Millvina\"\"\"\"\",female,0.17,1,2,C.A.', 1), (u'1004,1,\"Evans,', 1), (u'James\",male,28,0,0,244358,26,,S', 1), (u'Eileen\",female,15,0,2,29750,39,,S', 1), (u'Jose', 1), (u'1038,1,\"Hilliard,', 1), (u'2148,13.8583,,C', 1), (u'13534,21,,S', 1), (u'997,3,\"Holthen,', 1), (u'928,3,\"Roth,', 1), (u'Ernest', 1), (u'931,3,\"Hee,', 1), (u'905,2,\"Howard,', 1), (u'17761,106.425,C86,C', 1), (u'939,3,\"Shaughnessy,', 1), (u'Clarence', 1), (u'906,1,\"Chaffee,', 1), (u'914,1,\"Flegenheim,', 1), (u'Anton', 1), (u'949,3,\"Abelseth,', 1), (u'(Annie', 1), (u'1194,2,\"Phillips,', 1), (u'Hudson', 1), (u'Berta', 1), (u'17606,27.4458,,C', 1), (u'910,3,\"Ilmakangas,', 1), (u'Achilles', 1), (u'934,3,\"Goldsmith,', 1), (u'Gervasius\",male,21,0,0,350053,7.7958,,S', 1), (u'Stevenson)\",female,23,1,0,21228,82.2667,B45,S', 1), (u'Wardle', 1), (u'1264,1,\"Ismay,', 1), (u'3101284,7.925,,S', 1), (u'Bennett)\",female,22,0,0,F.C.C.', 1), (u'1266,1,\"Dodge,', 1), (u'1092,3,\"Murphy,', 1), (u'Dona.', 1), (u'1100,1,\"Rosenbaum,', 1), (u'Ingersoll)\",female,64,0,2,PC', 1), (u'Johan\",male,21,1,0,3101266,6.4958,,S', 1), (u'(Elin', 1), (u'Ernest\",male,,0,0,343271,7,,S', 1), (u'17613,27.7208,A11,C', 1), (u'1109,1,\"Wick,', 1), 
(u'William\",male,,0,0,C.A.', 1), (u'Jeffery\",male,23,1,0,28666,10.5,,S', 1), (u'1126,1,\"Cumings,', 1), (u'John\",male,,0,0,2681,6.4375,,C', 1), (u'Bradley\",male,39,1,0,PC', 1), (u'23568,8.05,,S', 1), (u'6608,34.375,,S', 1), (u'17759,63.3583,D10', 1), (u'(Mahala', 1), (u'Katherine\",female,18.5,0,0,329944,7.2833,,Q', 1), (u'(Anna', 1), (u'Morrison\",male,,0,0,32302,8.05,,S', 1), (u'(Lillian', 1), (u'Charles\",male,18,0,0,C.A.', 1), (u'Douglas\",male,6,0,2,16966,134.5,E34,C', 1), (u'Gunnar', 1), (u'Delia\",female,,0,0,330924,7.8792,,Q', 1), (u'Drachstedt\"\")\"\"\",male,20,0,0,SC/PARIS', 1), (u'1159,3,\"Warren,', 1), (u'(Orian', 1), (u'14888,10.5,,S', 1), (u'(Emma', 1), (u'1034,1,\"Ryerson,', 1), (u'A\",female,,0,0,342712,8.05,,S', 1), (u'Wilfred\",male,22,2,0,C.A.', 1), (u'(Irene', 1), (u'Augusta', 1), (u'251,7.55,,S', 1), (u'2673,20.25,,S', 1), (u'932,3,\"Karun,', 1), (u'Baumgardner)\",female,36,0,3,230136,39,F4,S', 1), (u'Borie)\",female,48,1,3,PC', 1), (u'3338,8.05,,S', 1), (u'Servando\",male,28.5,0,0,PC', 1), (u'964,3,\"Nieminen,', 1), (u'Frederick\"\"\"\"\",male,,0,0,SC/PARIS', 1), (u'901,3,\"Davies,', 1), (u'1223,1,\"Dulles,', 1), (u'Karvin\",male,32,0,0,C', 1), (u'Olivia\",female,18,0,0,347066,7.775,,S', 1), (u'1024,3,\"Lefebre,', 1), (u'Nelson', 1), (u'1282,1,\"Payne,', 1), (u'1016,3,\"Kennedy,', 1), (u'Daniel', 1), (u'1308,3,\"Ware,', 1), (u'Thomas\",male,24,2,0,C.A.', 1), (u'948,3,\"Cor,', 1), (u'1237,3,\"Abelseth,', 1), (u'Maria\"\"', 1), (u'B59', 1), (u'Vivian', 1), (u'Oscar', 1), (u'Arthur\",male,,0,0,A/5', 1), (u'Louisa\",female,18,0,0,A/4', 1), (u'Margaret', 1), (u'1157,3,\"Lyntakoff,', 1), (u'Halaut', 1), (u'Anton\",male,29,3,1,315153,22.025,,S', 1), (u'C26,S', 1), (u'966,1,\"Geiger,', 1), (u'1183,3,\"Daly,', 1), (u'Miss.', 1), (u'Georges', 1), (u'James', 1), (u'1230,2,\"Denbury,', 1), (u'Alice', 1), (u'1069,1,\"Stengel,', 1), (u'970,2,\"Aldworth,', 1), (u'Emilio', 1), (u'1075,3,\"Lane,', 1), (u'Jr\",male,31,0,0,2543,28.5375,C53,C', 1), 
(u'Albert\",male,10,4,1,382652,29.125,,Q', 1), (u'\",female,20,1,0,236853,26,,S', 1), (u'1020,2,\"Bowenur,', 1), (u'(Antoinette)\",female,,0,0,PC', 1), (u'29037,26,,S', 1), (u'1288,3,\"Colbert,', 1), (u'13540,10.5,,S', 1), (u'1193,2,\"Malachard,', 1), (u'1219,1,\"Rosenshine,', 1), (u'(Blanche', 1), (u'Mark', 1), (u'Stanko\",male,,0,0,349235,7.8958,,S', 1), (u'Ellen', 1), (u'1281,3,\"Palsson,', 1), (u'Patrick\",male,,0,0,368402,7.75,,Q', 1), (u'943,2,\"Pulbaum,', 1), (u'Assad\",male,,0,0,2673,7.2292,,C', 1), (u'1195,3,\"Pokrnic,', 1), (u'1202,3,\"Cacic,', 1), (u'Watson)\"\"\",female,,1,2,W./C.', 1), (u'Frank', 1), (u'904,1,\"Snyder,', 1), (u'1111,3,\"Thomson,', 1), (u'Leo', 1), (u'Elizabeth\",female,12,2,1,230136,39,F4,S', 1), (u'Woolf\",male,,0,0,A.5.', 1), (u'907,2,\"del', 1), (u'31352,21,,S', 1), (u'1261,2,\"Pallas', 1), (u'Robert\",male,43,0,1,S.O./P.P.', 1), (u'1278,3,\"Aronsson,', 1), (u'1163,3,\"Fox,', 1), (u'17592,39.4,D28,S', 1), (u'1007,3,\"Chronopoulos,', 1), (u'Manda\",female,21,0,0,315087,8.6625,,S', 1), (u'1063,3,\"Zakarian,', 1), (u'Taylor', 1), (u'Stuart\",male,24,0,0,28034,10.5,,S', 1), (u'Ruth', 1), (u'972,3,\"Boulos,', 1), (u'Konrad', 1), (u'1192,3,\"Olsson,', 1), (u'1008,3,\"Thomas,', 1), (u'Mary\",female,,1,0,370371,15.5,,Q', 1), (u'6212,15.1,,S', 1), (u'Bertram\",male,,0,0,113791,26.55,,S', 1), (u'Simon', 1), (u'1240,2,\"Giles,', 1), (u'1006,1,\"Straus,', 1), (u'975,3,\"Demetri,', 1), (u'Lawry)\",female,29,1,0,SC/AH', 1), (u'Ernst', 1), (u'James\",male,40,0,0,28221,13,,S', 1), (u'Josefina\",female,29,0,0,3101297,7.925,,S', 1), (u'O\\'Leary)\",female,19,1,0,376566,16.1,,S', 1), (u'3101268,7.925,,S', 1), (u'1146,3,\"Wenzel,', 1), (u'967,1,\"Keeping,', 1), (u'1062,3,\"Lithman,', 1), (u'Sigvard', 1), (u'1217,3,\"Assam,', 1), (u'Emilio\",male,,0,0,SC/A.3', 1), (u'B56,S', 1), (u'1273,3,\"Foley,', 1), (u'1035,2,\"Beauchamp,', 1), (u'Alexander)\"\"\",female,,0,2,2661,15.2458,,C', 1), (u'1295,1,\"Carrau,', 1), (u'Edgar\",male,21,1,0,28133,11.5,,S', 1), 
(u'17483,221.7792,C97,S', 1), (u'48873,8.05,,S', 1), (u'919,3,\"Daher,', 1), (u'1091,3,\"Rasmussen,', 1), (u'Youssef\",female,9,1,1,2650,15.2458,,C', 1), (u'Ms.', 1), (u'1239,3,\"Whabee,', 1), (u'965,1,\"Ovies', 1), (u'(Sigrid', 1), (u'Brines\",male,8,0,2,28220,32.5,,S', 1), (u'Nicola\",male,,0,0,349255,7.8958,,C', 1), (u'Rudolf\",male,22,0,0,350045,7.7958,,S', 1), (u'1011,2,\"Chapman,', 1), (u'Edvin\",male,32,0,0,350403,7.5792,,S', 1), (u'Lingrey\"\")\"\"\",male,42,0,0,17475,26.55,,S', 1), (u'Delia\",female,18,0,0,330963,7.8792,,Q', 1), (u'William\",male,,0,0,365235,7.75,,Q', 1), (u'Halstead)\",female,64,1,1,112901,26.55,B26,S', 1), (u'Kristina', 1), (u'Franz\",male,39,0,1,349256,13.4167,,C', 1), (u'Nellie\",female,31,0,0,F.C.C.', 1), (u'1236,3,\"van', 1), (u'1027,3,\"Carlsson,', 1), (u'Jessie', 1), (u'Thomas\",male,22,0,0,W./C.', 1), (u'Louise', 1), (u'Dr.', 1), (u'1134,1,\"Spedden,', 1), (u'1080,3,\"Sage,', 1), (u'1061,3,\"Hellstrom,', 1), (u'Edward', 1), (u'31030,10.5,,S', 1), (u'17558,247.5208,B58', 1), (u'17598,31.6833,,S', 1), (u'1216,1,\"Kreuchen,', 1), (u'Andrew', 1), (u'1019,3,\"McCoy,', 1), (u'3101309,7.05,,S', 1), (u'Livija\",female,27,1,0,STON/O2.', 1), (u'1118,3,\"Asplund,', 1), (u'Eugene', 1), (u'Joseph\",male,39,0,2,2675,7.2292,,C', 1), (u'Maria', 1), (u'944,2,\"Hocking,', 1), (u'1097,1,\"Omont,', 1), (u'(Winnie', 1), (u'McDowell)\",female,26,1,0,13508,136.7792,C89,C', 1), (u'Ida', 1), (u'Mary', 1), (u'Charles', 1), (u'Genovesi)\",female,24,1,0,SC/PARIS', 1), (u'(Ella', 1), (u'Henry\",male,14.5,8,2,CA.', 1), (u'Artur', 1), (u'Emily', 1)]\n" ] } ], "source": [ "# count words \n", "\n", "import os\n", "\n", "\n", "def counts(text_file):\n", "\t\"\"\"Print a list of (token, occurrences) pairs for an RDD of text lines.\n", "\n", "\ttext_file -- RDD of strings; each line is split on whitespace into tokens.\n", "\tThe collected pairs are printed; nothing is returned.\n", "\t\"\"\"\n", "\t# Add the per-token 1s together; a reducer that returns only `a`\n", "\t# would leave every count at 1.\n", "\tword_counts = text_file \\\n", "\t .flatMap(lambda line: line.split()) \\\n", "\t .map(lambda word: (word, 1)) \\\n", "\t .reduceByKey(lambda a, b: a + b)\n", "\t \n", "\tprint word_counts.collect() \n", " \n", " \n", "text_file2 = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "counts(text_file2)" ] }, { 
"cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[u'957,2,\"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)\",female,,0,0,F.C.C. 13534,21,,S',\n", " u'1144,1,\"Clark, Mr. Walter Miller\",male,27,1,0,13508,136.7792,C89,C',\n", " u'1164,1,\"Clark, Mrs. Walter Miller (Virginia McDowell)\",female,26,1,0,13508,136.7792,C89,C']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter \n", "\n", "df__ = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "df__.filter(lambda x: 'Miller' in x).collect()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[u'PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked'],\n", " [u'892,3,\"Kelly, Mr. James\",male,34.5,0,0,330911,7.8292,,Q'],\n", " [u'893,3,\"Wilkes, Mrs. James (Ellen Needs)\",female,47,1,0,363272,7,,S'],\n", " [u'894,2,\"Myles, Mr. Thomas Francis\",male,62,0,0,240276,9.6875,,Q'],\n", " [u'895,3,\"Wirz, Mr. Albert\",male,27,0,0,315154,8.6625,,S']]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# map \n", "# only take 5 of the elements in list \n", "\n", "df__ = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "df__.map(lambda x: x.split(\"u\")).take(5)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[u'1087,3,\"Karlsson, Mr. Julius Konrad Eugen\",male,33,0,0,347465,7.8542,,S', u'1056,2,\"Peruschitz, Rev. Joseph Maria\",male,41,0,0,237393,13,,S', u'1134,1,\"Spedden, Mr. Frederic Oakley\",male,45,1,1,16966,134.5,E34,C', u'1187,3,\"Angheloff, Mr. Minko\",male,26,0,0,349202,7.8958,,S', u'976,2,\"Lamb, Mr. 
John Joseph\",male,,0,0,240261,10.7083,,Q']\n", "====\n", "[]\n" ] } ], "source": [ "# subtract \n", "# remove every line that matches the filter from the full RDD,\n", "# then check that no 'Miller' line is left in the remainder\n", "\n", "df__ = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "normal_raw_data = df__.filter(lambda x: 'Miller' in x) \n", "attack_raw_data= df__.subtract(normal_raw_data)\n", "\n", "print (attack_raw_data.take(5))\n", "print (\"====\")\n", "print (attack_raw_data.filter(lambda x: 'Miller' in x).collect())" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[u'a', u'1', u'3', u'5', u'7', u'9', u'0', u'8', u'2', u'4', u'6']" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cartesian product between two RDDs by using cartesian transformation\n", "# (first step only: collect the distinct values of the second CSV column)\n", "\n", "\n", "\n", "df__ = sc.textFile(\"/Users/GGV/Desktop/test.csv\")\n", "# map() returns a new RDD, so the split result has to be bound to a name;\n", "# indexing the raw line with x[1] would return its second character instead of the second field\n", "csv_data = df__.map(lambda x : x.split(\",\"))\n", "protocols = csv_data.map(lambda x: x[1]).distinct()\n", "protocols.collect()\n", "\n", "\n", "#csv_data = raw_data.map(lambda x: x.split(\",\"))\n", "#protocols = csv_data.map(lambda x: x[1]).distinct()\n", "#protocols.collect()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# https://github.com/jadianes/spark-py-notebooks/blob/master/nb5-rdd-aggregations/nb5-rdd-aggregations.ipynb" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }