{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-27T09:35:24.592320Z",
     "start_time": "2019-07-27T09:35:24.572890Z"
    }
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "import csv\n",
    "import sys\n",
    "\n",
    "in_filename = '../data/twittersampletest.csv'\n",
    "out_filename = '../data/twittersampleclean.csv'\n",
    "\n",
    "# Sniff the byte-order mark. An export that starts with a UTF-16 BOM must be\n",
    "# decoded as UTF-16: decoding it as UTF-8 turns the BOM into two U+FFFD,\n",
    "# leaves a NUL after every ASCII character and corrupts all non-ASCII text.\n",
    "# Files without a UTF-16 BOM are still treated as UTF-8, exactly as before.\n",
    "with open(in_filename, 'rb') as probe:\n",
    "    bom = probe.read(2)\n",
    "in_encoding = 'utf-16' if bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE) else 'utf-8'\n",
    "\n",
    "# Re-encode the export as UTF-8 for the reader cell below.\n",
    "#  - errors='replace': undecodable bytes become U+FFFD instead of aborting.\n",
    "#  - newline='' on both handles: line endings are copied through untouched.\n",
    "#  - explicit output encoding: do not depend on the platform default, since\n",
    "#    the file is read back as UTF-8.\n",
    "with open(in_filename, encoding=in_encoding, errors='replace', newline='') as in_file:\n",
    "    with open(out_filename, 'w', encoding='utf-8', newline='') as out_file:\n",
    "        for line in in_file:\n",
    "            out_file.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-27T09:35:37.609490Z",
     "start_time": "2019-07-27T09:35:37.515998Z"
    }
   },
   "outputs": [],
   "source": [
    "# Raise the csv field-size cap as far as the platform allows. The limit is\n",
    "# stored as a C long, so sys.maxsize can raise OverflowError where long is\n",
    "# 32-bit (e.g. Windows); halve the value until it is accepted.\n",
    "field_limit = sys.maxsize\n",
    "while True:\n",
    "    try:\n",
    "        csv.field_size_limit(field_limit)\n",
    "        break\n",
    "    except OverflowError:\n",
    "        field_limit //= 2\n",
    "\n",
    "# Parse the cleaned tab-separated export into `dat`, a list of rows (each a\n",
    "# list of strings), skipping blank lines. NULs are still stripped defensively:\n",
    "# csv.reader can reject them, and a UTF-16 input without a BOM would still\n",
    "# produce them.\n",
    "with open(out_filename, encoding='utf8', newline='') as f:\n",
    "    lines = csv.reader((line.replace('\\0', '') for line in f), delimiter='\\t', quotechar='\"')\n",
    "    dat = [row for row in lines if row]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-27T09:35:57.043282Z",
     "start_time": "2019-07-27T09:35:56.521789Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | ��Date | \n", "Headline | \n", "URL | \n", "Opening Text | \n", "Hit Sentence | \n", "Source | \n", "Influencer | \n", "Country | \n", "Subregion | \n", "Language | \n", "Reach | \n", "National Viewership | \n", "AVE | \n", "Sentiment | \n", "Key Phrases | \n", "Input Name | \n", "Keywords | \n", "Document Tags | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "06-Nov-2018 11:59PM | \n", "\n", " | http://twitter.com/meesposito/statuses/1059837... | \n", "\n", " | RT @OFA: If you\u0019 re concerned about the escala... | \n", "@meesposito | \n", "United States | \n", "\n", " | English | \n", "4198 | \n", "0 | \n", "38.83 | \n", "Neutral | \n", "escalating effects,politicians,front,polls,cli... | \n", "Adhoc Search Export | \n", "climate change,believe | \n", "\n", " | |
1 | \n", "06-Nov-2018 11:59PM | \n", "\n", " | http://twitter.com/jjbigbend/statuses/10598377... | \n", "\n", " | @MichaelEMann Of the 100 (family, coffee shops... | \n", "@jjbigbend | \n", "United States | \n", "\n", " | English | \n", "8 | \n", "0 | \n", "0.07 | \n", "Positive | \n", "coffee shops,denial responses,Globalist Agenda... | \n", "Adhoc Search Export | \n", "climate change,Believe | \n", "\n", " | |
2 | \n", "06-Nov-2018 11:59PM | \n", "\n", " | http://twitter.com/Sherri60827582/statuses/105... | \n", "\n", " | RT @RachelRGonzalez: \u001d", " I just don't like (Fill... | \n", "@Sherri60827582 | \n", "United States | \n", "\n", " | English | \n", "197 | \n", "0 | \n", "1.82 | \n", "Negative | \n", "Fill,criminal justice reform,kids,school,clima... | \n", "Adhoc Search Export | \n", "believe,climate change,real | \n", "\n", " | |
3 | \n", "06-Nov-2018 11:59PM | \n", "\n", " | http://twitter.com/Texan4Truth/statuses/105983... | \n", "\n", " | @realDonaldTrump If you need motivation to VOT... | \n", "@Texan4Truth | \n", "United States | \n", "\n", " | English | \n", "1987 | \n", "0 | \n", "18.38 | \n", "Positive | \n", "Trump Sided,Tomorrow \" Climate Change,Existing... | \n", "Adhoc Search Export | \n", "Climate Change,Real | \n", "\n", " | |
4 | \n", "06-Nov-2018 11:59PM | \n", "\n", " | http://twitter.com/esapesaqe/statuses/10598376... | \n", "\n", " | you live on this planet outside of the US and ... | \n", "@esapesaqe | \n", "Sweden | \n", "\n", " | English | \n", "18 | \n", "0 | \n", "0.17 | \n", "Negative | \n", "panic,planet,effort,rest,ours,climate change | \n", "Adhoc Search Export | \n", "climate change,real | \n", "\n", " |