{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exports nodes and edges from tweets (Retweets, Mentions, or Replies) [JSON]\n",
    "Reads tweets stored as one JSON object per line (the format exported from SFM), extracts nodes and edges from retweets, mentions, or replies, and saves them as CSV files compatible with social network graph tools such as Gephi, Cytoscape, Kumu, etc. The edges describe a directed graph (Source -> Target)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import json\n",
    "import re\n",
    "from datetime import datetime\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Input file: one JSON-encoded tweet per line.\n",
    "tweetfile = 'elites.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Helper functions\n",
    "Run this cell once before section 1. The three extraction cells below all call `extract_edges`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column layouts shared by all three extraction modes.\n",
    "USER_COLUMNS = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']\n",
    "EDGE_COLUMNS = ['Source', 'Target', 'Strength']\n",
    "EDGE_TYPES = ('retweets', 'mentions', 'replies')\n",
    "# Layout of the tweet-level created_at field; the offset is matched literally as +0000.\n",
    "TWEET_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'\n",
    "\n",
    "\n",
    "def user_row(user):\n",
    "    \"\"\"Return the node attributes of a full user object, in USER_COLUMNS order.\"\"\"\n",
    "    return [user['id_str'],\n",
    "            user['screen_name'],\n",
    "            user['created_at'],\n",
    "            user['profile_image_url_https'],\n",
    "            user['followers_count'],\n",
    "            user['friends_count']]\n",
    "\n",
    "\n",
    "def find_targets(tweet, edge_type):\n",
    "    \"\"\"List the users a tweet points to as (id_str, screen_name, user) tuples.\n",
    "\n",
    "    The third item is the full user object when the tweet embeds one\n",
    "    (retweets) and None when only the id and screen name are available\n",
    "    (mentions and replies). An empty list means the tweet yields no edge.\n",
    "    \"\"\"\n",
    "    if edge_type == 'retweets':\n",
    "        retweeted = tweet.get('retweeted_status')\n",
    "        if not retweeted:\n",
    "            return []\n",
    "        author = retweeted['user']\n",
    "        return [(author['id_str'], author['screen_name'], author)]\n",
    "    if edge_type == 'mentions':\n",
    "        mentions = (tweet.get('entities') or {}).get('user_mentions') or []\n",
    "        return [(m['id_str'], m['screen_name'], None) for m in mentions]\n",
    "    # edge_type == 'replies'\n",
    "    reply_to = tweet.get('in_reply_to_user_id_str')\n",
    "    if reply_to is None:\n",
    "        return []\n",
    "    return [(reply_to, tweet.get('in_reply_to_screen_name'), None)]\n",
    "\n",
    "\n",
    "def extract_edges(path, edge_type):\n",
    "    \"\"\"Build the node-attribute table and the edge table for one interaction type.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        File holding one JSON-encoded tweet per line.\n",
    "    edge_type : str\n",
    "        One of 'retweets', 'mentions' or 'replies'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    userdata : pandas.DataFrame\n",
    "        USER_COLUMNS; may hold several rows per user (de-duplicated later).\n",
    "        Users known only by id and screen name get NaN for the other columns.\n",
    "    edges : pandas.DataFrame\n",
    "        EDGE_COLUMNS; one row per interaction. Strength holds the tweet\n",
    "        timestamp here and is turned into a count in section 2.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If edge_type is not one of EDGE_TYPES.\n",
    "    \"\"\"\n",
    "    if edge_type not in EDGE_TYPES:\n",
    "        raise ValueError('edge_type must be one of {}, got {!r}'.format(EDGE_TYPES, edge_type))\n",
    "\n",
    "    # Rows are collected in plain lists and converted once at the end;\n",
    "    # growing a DataFrame row by row is quadratic and DataFrame.append\n",
    "    # no longer exists in pandas 2.x.\n",
    "    user_rows = []\n",
    "    edge_rows = []\n",
    "    known_ids = set()  # ids that already have a row in user_rows (exact match)\n",
    "    skipped = 0\n",
    "\n",
    "    # The with-block closes the file even if a malformed tweet raises.\n",
    "    with open(path, 'r', encoding='utf-8') as fh:\n",
    "        for line in fh:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            try:\n",
    "                tweet = json.loads(line)\n",
    "            except ValueError:  # json.JSONDecodeError is a ValueError subclass\n",
    "                skipped += 1\n",
    "                continue\n",
    "            if not isinstance(tweet, dict) or 'user' not in tweet:\n",
    "                # Valid JSON but not a tweet object: nothing to extract.\n",
    "                skipped += 1\n",
    "                continue\n",
    "\n",
    "            targets = find_targets(tweet, edge_type)\n",
    "            if not targets:\n",
    "                continue\n",
    "\n",
    "            source = tweet['user']\n",
    "            tweeted_at = str(datetime.strptime(tweet['created_at'], TWEET_TIME_FORMAT))\n",
    "            user_rows.append(user_row(source))\n",
    "            known_ids.add(source['id_str'])\n",
    "\n",
    "            for target_id, target_name, target_user in targets:\n",
    "                if target_user is not None:\n",
    "                    user_rows.append(user_row(target_user))\n",
    "                    known_ids.add(target_id)\n",
    "                elif target_id not in known_ids:\n",
    "                    # Placeholder for the *target* user: only id and name are known.\n",
    "                    user_rows.append([target_id, target_name, np.nan, np.nan, np.nan, np.nan])\n",
    "                    known_ids.add(target_id)\n",
    "                edge_rows.append([source['id_str'], target_id, tweeted_at])\n",
    "\n",
    "    if skipped:\n",
    "        print('Skipped {} line(s) that were not tweets.'.format(skipped))\n",
    "\n",
    "    # dtype=object keeps the counts as Python ints next to NaN placeholders,\n",
    "    # so the CSV shows 123 rather than 123.0.\n",
    "    userdata = pd.DataFrame(user_rows, columns=USER_COLUMNS, dtype=object)\n",
    "    edges = pd.DataFrame(edge_rows, columns=EDGE_COLUMNS)\n",
    "    return userdata, edges"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Export edges from Retweets, Mentions, or Replies\n",
    "* Run **one** of the three code cells below, depending on your purpose. If you run several, the last one you ran determines `userdata` and `edges`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Export edges from Retweets\n",
    "userdata, edges = extract_edges(tweetfile, 'retweets')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Export edges from Mentions\n",
    "userdata, edges = extract_edges(tweetfile, 'mentions')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Export edges from Replies\n",
    "userdata, edges = extract_edges(tweetfile, 'replies')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Keep only the edges whose strength >= the user-specified level (directed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Network connection strength level: the total number of times a tweeter\n",
    "# retweeted, mentioned, or replied to another (depending on the cell run above).\n",
    "# With level 1, every pair that interacted at least once is kept. With level 5,\n",
    "# only pairs with at least 5 interactions are kept, so only the strongest bonds are shown.\n",
    "strengthLevel = 3\n",
    "\n",
    "# Count the interactions per directed (Source, Target) pair, then filter.\n",
    "edges2 = edges.groupby(['Source', 'Target'])['Strength'].count().reset_index()\n",
    "edges2 = edges2[edges2['Strength'] >= strengthLevel]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Export nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export nodes from the edges and add node attributes for both Sources and Targets.\n",
    "# Keep one attribute row per user: sort_values places NaN last, so a fully\n",
    "# populated row precedes a placeholder and keep='first' retains it.\n",
    "userdata_unique = (userdata\n",
    "                   .sort_values(['Id', 'followers_count'], ascending=[True, False])\n",
    "                   .drop_duplicates(['Id'], keep='first'))\n",
    "\n",
    "# Every user id that appears on either end of a retained edge.\n",
    "ids = (pd.concat([edges2['Source'], edges2['Target']], ignore_index=True)\n",
    "       .drop_duplicates()\n",
    "       .to_frame(name='Id'))\n",
    "\n",
    "nodes = pd.merge(ids, userdata_unique, on='Id', how='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Export nodes and edges to csv files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change column names for Kumu import (run this cell only when using Kumu).\n",
    "# Renaming by name makes the cell safe to re-run.\n",
    "nodes = nodes.rename(columns={'user_created_at': 'Date', 'profile_image': 'Image'})\n",
    "edges2 = edges2.rename(columns={'Source': 'From', 'Target': 'To'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print nodes to check\n",
    "nodes.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print edges to check\n",
    "edges2.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export nodes and edges to csv files\n",
    "nodes.to_csv('nodes.csv', encoding='utf-8', index=False)\n",
    "edges2.to_csv('edges.csv', encoding='utf-8', index=False)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}