{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# infgen dumps for the presidential speeches corpus\n",
    "\n",
    "Walks `./Corpus_of_Presential_Speeches/` and runs the local `./infgen` binary on every file found, saving the program's standard output next to the input as `<file>.infgen`.\n",
    "\n",
    "**Assumption (TODO confirm):** the corpus files are already gzip-compressed; the in-notebook `gzip` step is commented out."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime\n",
    "from os import walk\n",
    "from subprocess import check_call\n",
    "\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root of the corpus. The directory name is kept exactly as it exists on disk.\n",
    "text_dir = './Corpus_of_Presential_Speeches/'\n",
    "# Suffix appended to each input path to name its infgen dump.\n",
    "infgen_suffix = '.infgen'\n",
    "\n",
    "# Use the directory reported by walk() instead of rebuilding it from the\n",
    "# filename prefix, so every file is opened where it was actually found.\n",
    "for dirpath, _, filenames in walk(text_dir):\n",
    "    for filename in sorted(filenames):\n",
    "        # Dumps left by an earlier run must not be fed back into infgen,\n",
    "        # otherwise every re-run would add '.infgen.infgen' files.\n",
    "        if filename.endswith(infgen_suffix):\n",
    "            continue\n",
    "        entire_filename = os.path.join(dirpath, filename)\n",
    "        # Disabled in the original notebook:\n",
    "        #check_call(['gzip', entire_filename])\n",
    "        # Send infgen's stdout to '<input>.infgen'; check_call raises\n",
    "        # CalledProcessError if infgen exits with a non-zero status.\n",
    "        with open(entire_filename + infgen_suffix, 'w') as outfile:\n",
    "            check_call(['./infgen', entire_filename], stdout=outfile)\n",
    "\n",
    "# --- Inactive draft, kept for reference ---------------------------------\n",
    "# Earlier idea: record the size of each speech before and after compression.\n",
    "# It calls getsizeof() and compress(), which this notebook never imports,\n",
    "# and it used president = file.split('_')[0] (the filename prefix).\n",
    "#\n",
    "# output_dict = {'speaker': [],\n",
    "#                'date': [],\n",
    "#                'start_size': [],\n",
    "#                'end_size': [],\n",
    "#                'change_size': [],\n",
    "#                'change_size_perc': []}\n",
    "#\n",
    "#         with open(entire_filename) as f:\n",
    "#             file_contents = [line.split() for line in f]\n",
    "#\n",
    "#         text = [item for sublist in file_contents[2:] for item in sublist]\n",
    "#         text = ' '.join(text)\n",
    "#\n",
    "#         output_dict['speaker'].append(president)\n",
    "#         output_dict['date'].append(pd.to_datetime(' '.join(file_contents[1]).split('\"')[1]).strftime('%m-%d-%Y'))\n",
    "#         output_dict['start_size'].append(getsizeof(text))\n",
    "#\n",
    "#         compressed_text = compress(text)\n",
    "#         output_dict['end_size'].append(getsizeof(compressed_text))\n",
    "#         output_dict['change_size'].append(output_dict['start_size'][-1] - output_dict['end_size'][-1])\n",
    "#         output_dict['change_size_perc'].append(output_dict['end_size'][-1]*1.0 / output_dict['start_size'][-1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}