{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-07-27T09:35:24.592320Z", "start_time": "2019-07-27T09:35:24.572890Z" } }, "outputs": [], "source": [
 "# Raw export to read, and the UTF-8 copy that the next cell parses.\n",
 "in_filename = '../data/twittersampletest.csv'\n",
 "out_filename = '../data/twittersampleclean.csv'\n",
 "\n",
 "# Re-encode the raw export as UTF-8.\n",
 "# NOTE: assumes the export is UTF-16 with a byte-order mark. Decoding it as\n",
 "# UTF-8 garbles the first header name, pads the text with NUL characters and\n",
 "# breaks curly quotes, which is why it is decoded as UTF-16 here.\n",
 "# errors='replace' keeps the conversion best-effort for malformed input, and\n",
 "# newline='' on both files copies the line endings through unchanged.\n",
 "with open(in_filename, encoding='utf-16', errors='replace', newline='') as in_file:\n",
 "    with open(out_filename, 'w', encoding='utf-8', newline='') as out_file:\n",
 "        for line in in_file:\n",
 "            out_file.write(line)"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-07-27T09:35:37.609490Z", "start_time": "2019-07-27T09:35:37.515998Z" } }, "outputs": [], "source": [
 "import csv\n",
 "import sys\n",
 "\n",
 "# Lift the csv module's field size limit so very long fields do not raise.\n",
 "# sys.maxsize overflows a C long on some platforms (e.g. Windows), so fall\n",
 "# back to the largest 32-bit value there.\n",
 "try:\n",
 "    csv.field_size_limit(sys.maxsize)\n",
 "except OverflowError:\n",
 "    csv.field_size_limit(2**31 - 1)\n",
 "\n",
 "# Parse the cleaned file as tab-separated values into a list of rows,\n",
 "# skipping rows that come back empty.\n",
 "# newline='' lets the csv module handle line breaks inside quoted fields.\n",
 "with open(out_filename, encoding='utf8', newline='') as f:\n",
 "    # Strip any stray NUL characters before the csv module sees the line.\n",
 "    nul_free_lines = (line.replace('\\0', '') for line in f)\n",
 "    reader = csv.reader(nul_free_lines, delimiter='\\t', quotechar='\"')\n",
 "    dat = [row for row in reader if row]"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-07-27T09:35:57.043282Z", "start_time": "2019-07-27T09:35:56.521789Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
��DateHeadlineURLOpening TextHit SentenceSourceInfluencerCountrySubregionLanguageReachNational ViewershipAVESentimentKey PhrasesInput NameKeywordsDocument Tags
006-Nov-2018 11:59PMhttp://twitter.com/meesposito/statuses/1059837...RT @OFA: If you\u0019 re concerned about the escala...Twitter@meespositoUnited StatesEnglish4198038.83Neutralescalating effects,politicians,front,polls,cli...Adhoc Search Exportclimate change,believe
106-Nov-2018 11:59PMhttp://twitter.com/jjbigbend/statuses/10598377...@MichaelEMann Of the 100 (family, coffee shops...Twitter@jjbigbendUnited StatesEnglish800.07Positivecoffee shops,denial responses,Globalist Agenda...Adhoc Search Exportclimate change,Believe
206-Nov-2018 11:59PMhttp://twitter.com/Sherri60827582/statuses/105...RT @RachelRGonzalez: \u001d", " I just don't like (Fill...Twitter@Sherri60827582United StatesEnglish19701.82NegativeFill,criminal justice reform,kids,school,clima...Adhoc Search Exportbelieve,climate change,real
306-Nov-2018 11:59PMhttp://twitter.com/Texan4Truth/statuses/105983...@realDonaldTrump If you need motivation to VOT...Twitter@Texan4TruthUnited StatesEnglish1987018.38PositiveTrump Sided,Tomorrow \" Climate Change,Existing...Adhoc Search ExportClimate Change,Real
406-Nov-2018 11:59PMhttp://twitter.com/esapesaqe/statuses/10598376...you live on this planet outside of the US and ...Twitter@esapesaqeSwedenEnglish1800.17Negativepanic,planet,effort,rest,ours,climate changeAdhoc Search Exportclimate change,real
\n", "
" ], "text/plain": [ " ��Date Headline \\\n", "0 06-Nov-2018 11:59PM \n", "1 06-Nov-2018 11:59PM \n", "2 06-Nov-2018 11:59PM \n", "3 06-Nov-2018 11:59PM \n", "4 06-Nov-2018 11:59PM \n", "\n", " URL Opening Text \\\n", "0 http://twitter.com/meesposito/statuses/1059837... \n", "1 http://twitter.com/jjbigbend/statuses/10598377... \n", "2 http://twitter.com/Sherri60827582/statuses/105... \n", "3 http://twitter.com/Texan4Truth/statuses/105983... \n", "4 http://twitter.com/esapesaqe/statuses/10598376... \n", "\n", " Hit Sentence Source \\\n", "0 RT @OFA: If you\u0019 re concerned about the escala... Twitter \n", "1 @MichaelEMann Of the 100 (family, coffee shops... Twitter \n", "2 RT @RachelRGonzalez: \n", " I just don't like (Fill... Twitter \n", "3 @realDonaldTrump If you need motivation to VOT... Twitter \n", "4 you live on this planet outside of the US and ... Twitter \n", "\n", " Influencer Country Subregion Language Reach \\\n", "0 @meesposito United States English 4198 \n", "1 @jjbigbend United States English 8 \n", "2 @Sherri60827582 United States English 197 \n", "3 @Texan4Truth United States English 1987 \n", "4 @esapesaqe Sweden English 18 \n", "\n", " National Viewership AVE Sentiment \\\n", "0 0 38.83 Neutral \n", "1 0 0.07 Positive \n", "2 0 1.82 Negative \n", "3 0 18.38 Positive \n", "4 0 0.17 Negative \n", "\n", " Key Phrases Input Name \\\n", "0 escalating effects,politicians,front,polls,cli... Adhoc Search Export \n", "1 coffee shops,denial responses,Globalist Agenda... Adhoc Search Export \n", "2 Fill,criminal justice reform,kids,school,clima... Adhoc Search Export \n", "3 Trump Sided,Tomorrow \" Climate Change,Existing... 
Adhoc Search Export \n", "4 panic,planet,effort,rest,ours,climate change Adhoc Search Export \n", "\n", " Keywords Document Tags \n", "0 climate change,believe \n", "1 climate change,Believe \n", "2 believe,climate change,real \n", "3 Climate Change,Real \n", "4 climate change,real " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Build a DataFrame from the parsed rows in `dat` (first row = header).\n",
 "if not dat:\n",
 "    raise ValueError('dat is empty: no rows were parsed')\n",
 "\n",
 "# Keep only records with exactly as many fields as the header; a row of any\n",
 "# other width cannot be aligned with the columns. Using the header's own\n",
 "# width (instead of a hard-coded count) guarantees the header itself is\n",
 "# never filtered out and a data row mistaken for column names.\n",
 "header, *records = dat\n",
 "records = [rec for rec in records if len(rec) == len(header)]\n",
 "\n",
 "df = pd.DataFrame(records, columns=header)\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }