## TweetIDを紐づけてテキストを取得する

In [1]:
import numpy as np
import pandas as pd
import json
from requests_oauthlib import OAuth1Session
import time
import datetime
import configparser
from tqdm import tqdm_notebook

In [5]:
# Dataset source: http://bigdata.naist.jp/~ysuzuki/data/twitter/
# header=None: the file is read without a header row, so the column names
# are supplied explicitly here.
raw_df = pd.read_csv(
    '../data/tweets_open.csv.bz2',
    names=[
        'id',
        'genre_id',
        'status_id',
        'is_both',
        'is_positive',
        'is_negative',
        'is_neutral',
        'is_irrelevant',
    ],
    header=None,
)
raw_df.head()

Unnamed: 0,id,genre_id,status_id,is_both,is_positive,is_negative,is_neutral,is_irrelevant
0,10025,10000,522407718091366400,0,0,1,1,0.0
1,10026,10000,522407768003592192,0,0,1,0,0.0
2,10027,10000,522408018642628609,0,0,1,1,0.0
3,10028,10000,522408394871672832,0,0,0,1,0.0
4,10029,10000,522408454778929153,0,0,0,1,0.0


In [10]:
# Per-genre column totals. For the is_* flag columns this is the number of
# tweets carrying each label (assuming the flags are 0/1 — TODO confirm).
raw_df.groupby('genre_id').sum()

Unnamed: 0_level_0,id,status_id,is_both,is_positive,is_negative,is_neutral,is_irrelevant
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000,25335290000.0,4.993758e+22,603.0,5650.0,9526.0,51404.0,18818.0
10001,8759670000.0,1.036641e+22,479.0,1996.0,3060.0,3557.0,10027.0
10002,40965590000.0,4.227049e+22,115.0,909.0,2128.0,12434.0,57611.0
10020,7331238000.0,7.468428e+21,41.0,741.0,311.0,6894.0,4371.0
10021,155953900000.0,5.147131e+22,343.0,3443.0,6074.0,44822.0,28622.0
10022,84532690000.0,4.705838e+22,79.0,1499.0,933.0,13516.0,54459.0
10024,122656300000.0,4.242347e+22,240.0,3881.0,3482.0,29866.0,35235.0
10025,127864900000.0,4.487937e+22,49.0,949.0,1084.0,20417.0,47780.0
10026,108055500000.0,4.715787e+22,75.0,744.0,4420.0,40787.0,25988.0


## ジャンル10025（ルンバ）のネガティブorポジティブなツイートのみ集める

In [14]:
# Restrict to genre 10025 and keep only tweets flagged exclusively negative
# or exclusively positive; rows with both flags or neither are left out.
roomba_negative_df = raw_df[
    (raw_df['genre_id'] == 10025)
    & (raw_df['is_positive'] == 0)
    & (raw_df['is_negative'] == 1)
]  # 1079 rows
roomba_positive_df = raw_df[
    (raw_df['genre_id'] == 10025)
    & (raw_df['is_positive'] == 1)
    & (raw_df['is_negative'] == 0)
]  # 944 rows

# Positive rows are stacked first, then negative; the index is renumbered.
roomba_df = pd.concat(
    [roomba_positive_df, roomba_negative_df],
    axis=0,
    ignore_index=True,
)
roomba_df.head()

Unnamed: 0,id,genre_id,status_id,is_both,is_positive,is_negative,is_neutral,is_irrelevant
0,1141159,10025,551949125961252864,0,1,0,1,0.0
1,1135337,10025,550852542322581507,0,1,0,0,0.0
2,1138242,10025,551413741560930304,0,1,0,0,0.0
3,1139529,10025,551692497269690368,0,1,0,0,0.0
4,1133634,10025,550318759898918913,0,1,0,0,0.0


In [15]:
# Status IDs of the selected tweets, as a NumPy array, in roomba_df row order.
tweet_ids = roomba_df['status_id'].values

In [16]:
# Load settings (the 'twitterapi' credentials are read from this parser later).
# NOTE: ConfigParser.read() returns the list of files it actually parsed and
# silently skips files it cannot open, so an empty list here means the
# settings file was not found.
conf = configparser.ConfigParser()
conf.read('../config/setting.ini')

['../config/setting.ini']

In [17]:
def get_tweet_text(tweet_id):
    """Look up one tweet through the Twitter REST API v1.1 and return its text.

    Credentials are taken from the ``twitterapi`` section of the module-level
    ``conf`` parser.

    Args:
        tweet_id: Status ID of the tweet to fetch.

    Returns:
        A ``(text, response)`` tuple. ``text`` is the tweet's ``text`` field
        when the API answers with HTTP 200, and ``np.nan`` for any other
        status. ``response`` is the raw HTTP response, returned so the caller
        can inspect headers such as ``x-rate-limit-remaining``.
    """
    url = 'https://api.twitter.com/1.1/statuses/show.json'
    params = {'id': tweet_id}

    # A session is opened per call. Using it as a context manager closes its
    # connection pool on exit, so fetching thousands of IDs in a loop does
    # not accumulate open sessions. The response body has already been read
    # by the time the block exits, so `res` stays usable afterwards.
    with OAuth1Session(
        conf['twitterapi']['CONSUMER_KEY'],
        conf['twitterapi']['CONSUMER_SECRET'],
        conf['twitterapi']['TOKEN'],
        conf['twitterapi']['TOKEN_SECRET'],
    ) as twitter:
        res = twitter.get(url, params=params)

    if res.status_code == 200:
        return json.loads(res.text)['text'], res
    # Any non-200 answer is reported as a missing value.
    return np.nan, res

In [18]:
# Fetch the text for every selected tweet, in order. Failed lookups are
# stored as NaN so the list stays aligned with tweet_ids.
tweet_texts = []
for tweet_id in tqdm_notebook(tweet_ids):
    text, res = get_tweet_text(tweet_id)
    tweet_texts.append(text)

    # Sleep once the rate-limit budget is exhausted (900 calls per 15 minutes).
    remaining = res.headers.get('x-rate-limit-remaining')
    if remaining is None:
        # The response carried no rate-limit header; note it and move on.
        print('なんやろね')
        continue
    if int(remaining) <= 0:
        print(f'{datetime.datetime.now()}: 15分後に起きます。')
        # Wait out the full window plus a small safety margin.
        time.sleep(15 * 60 + 2)
        print(f'{datetime.datetime.now()}: 起きました。')

HBox(children=(IntProgress(value=0, max=2023), HTML(value='')))

2019-05-05 18:26:33.633980: 15分後に起きます。
2019-05-05 18:41:35.680097: 起きました。
2019-05-05 18:44:33.241502: 15分後に起きます。
2019-05-05 18:59:35.290670: 起きました。



In [19]:
# Sanity check: number of collected entries (one is appended per tweet ID).
len(tweet_texts)

2023

In [20]:
# Attach the fetched texts as a new column; relies on tweet_texts being in
# the same order as the rows of roomba_df.
roomba_df['tweet_text'] = tweet_texts

In [22]:
# Persist the labelled tweets with their text as a gzip-compressed CSV,
# without writing the DataFrame index.
roomba_df.to_csv(
    '../data/roomba.csv.gz',
    index=False,
    compression='gzip',
)