{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## TweetIDを紐づけてテキストを取得する" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import json\n", "from requests_oauthlib import OAuth1Session\n", "import time\n", "import datetime\n", "import configparser\n", "from tqdm import tqdm_notebook" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenre_idstatus_idis_bothis_positiveis_negativeis_neutralis_irrelevant
0100251000052240771809136640000110.0
1100261000052240776800359219200100.0
2100271000052240801864262860900110.0
3100281000052240839487167283200010.0
4100291000052240845477892915300010.0
\n", "
" ], "text/plain": [ " id genre_id status_id is_both is_positive is_negative \\\n", "0 10025 10000 522407718091366400 0 0 1 \n", "1 10026 10000 522407768003592192 0 0 1 \n", "2 10027 10000 522408018642628609 0 0 1 \n", "3 10028 10000 522408394871672832 0 0 0 \n", "4 10029 10000 522408454778929153 0 0 0 \n", "\n", " is_neutral is_irrelevant \n", "0 1 0.0 \n", "1 0 0.0 \n", "2 1 0.0 \n", "3 1 0.0 \n", "4 1 0.0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# http://bigdata.naist.jp/~ysuzuki/data/twitter/\n", "raw_df = pd.read_csv('../data/tweets_open.csv.bz2', header=None,\n", " names=['id', 'genre_id', 'status_id', 'is_both', 'is_positive', 'is_negative', 'is_neutral', 'is_irrelevant'])\n", "raw_df.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idstatus_idis_bothis_positiveis_negativeis_neutralis_irrelevant
genre_id
100002.533529e+104.993758e+22603.05650.09526.051404.018818.0
100018.759670e+091.036641e+22479.01996.03060.03557.010027.0
100024.096559e+104.227049e+22115.0909.02128.012434.057611.0
100207.331238e+097.468428e+2141.0741.0311.06894.04371.0
100211.559539e+115.147131e+22343.03443.06074.044822.028622.0
100228.453269e+104.705838e+2279.01499.0933.013516.054459.0
100241.226563e+114.242347e+22240.03881.03482.029866.035235.0
100251.278649e+114.487937e+2249.0949.01084.020417.047780.0
100261.080555e+114.715787e+2275.0744.04420.040787.025988.0
\n", "
" ], "text/plain": [ " id status_id is_both is_positive is_negative \\\n", "genre_id \n", "10000 2.533529e+10 4.993758e+22 603.0 5650.0 9526.0 \n", "10001 8.759670e+09 1.036641e+22 479.0 1996.0 3060.0 \n", "10002 4.096559e+10 4.227049e+22 115.0 909.0 2128.0 \n", "10020 7.331238e+09 7.468428e+21 41.0 741.0 311.0 \n", "10021 1.559539e+11 5.147131e+22 343.0 3443.0 6074.0 \n", "10022 8.453269e+10 4.705838e+22 79.0 1499.0 933.0 \n", "10024 1.226563e+11 4.242347e+22 240.0 3881.0 3482.0 \n", "10025 1.278649e+11 4.487937e+22 49.0 949.0 1084.0 \n", "10026 1.080555e+11 4.715787e+22 75.0 744.0 4420.0 \n", "\n", " is_neutral is_irrelevant \n", "genre_id \n", "10000 51404.0 18818.0 \n", "10001 3557.0 10027.0 \n", "10002 12434.0 57611.0 \n", "10020 6894.0 4371.0 \n", "10021 44822.0 28622.0 \n", "10022 13516.0 54459.0 \n", "10024 29866.0 35235.0 \n", "10025 20417.0 47780.0 \n", "10026 40787.0 25988.0 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_df.groupby('genre_id').sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ジャンル10025(ルンバ)のネガティブorポジティブなツイートのみ集める" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenre_idstatus_idis_bothis_positiveis_negativeis_neutralis_irrelevant
011411591002555194912596125286401010.0
111353371002555085254232258150701000.0
211382421002555141374156093030401000.0
311395291002555169249726969036801000.0
411336341002555031875989891891301000.0
\n", "
" ], "text/plain": [ " id genre_id status_id is_both is_positive is_negative \\\n", "0 1141159 10025 551949125961252864 0 1 0 \n", "1 1135337 10025 550852542322581507 0 1 0 \n", "2 1138242 10025 551413741560930304 0 1 0 \n", "3 1139529 10025 551692497269690368 0 1 0 \n", "4 1133634 10025 550318759898918913 0 1 0 \n", "\n", " is_neutral is_irrelevant \n", "0 1 0.0 \n", "1 0 0.0 \n", "2 0 0.0 \n", "3 0 0.0 \n", "4 0 0.0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roomba_negative_df = raw_df.query('genre_id==10025 and is_positive==0 and is_negative==1') #1079件\n", "roomba_positive_df = raw_df.query('genre_id==10025 and is_positive==1 and is_negative==0') #944件\n", "roomba_df = pd.concat([roomba_positive_df, roomba_negative_df], axis=0, ignore_index=True)\n", "roomba_df.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "tweet_ids = roomba_df.status_id.values" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['../config/setting.ini']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conf = configparser.ConfigParser()\n", "conf.read('../config/setting.ini')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def get_tweet_text(tweet_id):\n", " twitter = OAuth1Session(\n", " conf['twitterapi']['CONSUMER_KEY'],\n", " conf['twitterapi']['CONSUMER_SECRET'],\n", " conf['twitterapi']['TOKEN'],\n", " conf['twitterapi']['TOKEN_SECRET']\n", " )\n", "\n", " url = 'https://api.twitter.com/1.1/statuses/show.json' \n", "\n", " params ={'id' : tweet_id}\n", " res = twitter.get(url, params = params)\n", " \n", " if res.status_code == 200:\n", " return json.loads(res.text)['text'], res\n", " else:\n", " return np.nan, res" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8139148c4c144d43baa04401f5009472", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=2023), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2019-05-05 18:26:33.633980: 15分後に起きます。\n", "2019-05-05 18:41:35.680097: 起きました。\n", "2019-05-05 18:44:33.241502: 15分後に起きます。\n", "2019-05-05 18:59:35.290670: 起きました。\n", "\n" ] } ], "source": [ "tweet_texts = []\n", "for tweet_id in tqdm_notebook(tweet_ids):\n", " text, res = get_tweet_text(tweet_id)\n", " tweet_texts.append(text)\n", " #API制限までの残り回数が来たら寝る(15分900回)\n", " try: \n", " if int(res.headers['x-rate-limit-remaining']) <= 0:\n", " print(f'{datetime.datetime.now()}: 15分後に起きます。')\n", " time.sleep(15 * 60 + 2)\n", " print(f'{datetime.datetime.now()}: 起きました。')\n", " except KeyError:\n", " print('なんやろね')\n", " pass" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2023" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tweet_texts)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "roomba_df['tweet_text'] = tweet_texts" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "roomba_df.to_csv('../data/roomba.csv.gz', compression='gzip', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }