{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import twitter\n",
    "consumer_key = \"52Nu7ubm2szT1JyJEOB7V2lGM\"\n",
    "consumer_secret = \"mqA94defqjioyWeMxdJsSduthxdMMGd2vfOUKvOFpm0n7JTqfY\"\n",
    "access_token = \"16065520-USf3DBbQAh6ZA8CnSAi6NAUlkorXdppRXpC4cQCKk\"\n",
    "access_token_secret = \"DowMQeXqh5ZsGvZGrmUmkI0iCmI34ShFzKF3iOdiilpX5\"\n",
    "authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)\n",
    "t = twitter.Twitter(auth=authorization, retry=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
    "output_filename = os.path.join(data_folder, \"python_tweets.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "original_users = []\n",
    "tweets = []\n",
    "user_ids = {}\n",
    "\n",
    "search_results = t.search.tweets(q=\"python\", count=100)['statuses']\n",
    "for tweet in search_results:\n",
    "    if 'text' in tweet:\n",
    "        original_users.append(tweet['user']['screen_name'])\n",
    "        user_ids[tweet['user']['screen_name']] = tweet['user']['id']\n",
    "        tweets.append(tweet['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "len(tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model_filename = os.path.join(os.path.expanduser(\"~\"), \"Models\", \"twitter\", \"python_context.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import TransformerMixin\n",
    "from nltk import word_tokenize\n",
    "\n",
    "class NLTKBOW(TransformerMixin):\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        return [{word: True for word in word_tokenize(document)}\n",
    "                 for document in X]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from sklearn.externals import joblib\n",
    "context_classifier = joblib.load(model_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "y_pred = context_classifier.predict(tweets)\n",
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "relevant_tweets = [tweets[i] for i in range(len(tweets)) if y_pred[i] == 1]\n",
    "relevant_users = [original_users[i] for i in range(len(tweets)) if y_pred[i] == 1]\n",
    "print(len(relevant_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "import sys\n",
    "\n",
    "def get_friends(t, user_id):\n",
    "    friends = []\n",
    "    cursor = -1  # Start with the first page\n",
    "    while cursor != 0:  # If zero, that is the end:\n",
    "        try:\n",
    "            results = t.friends.ids(user_id=user_id, cursor=cursor, count=5000)\n",
    "            friends.extend([friends for friends in results['ids']])\n",
    "            cursor = results['next_cursor']\n",
    "            if len(friends) >= 10000:\n",
    "                break\n",
    "            if cursor != 0:\n",
    "                print(\"Collected {} friends so far, but there are more\".format(len(friends)))\n",
    "                sys.stdout.flush\n",
    "        except TypeError as e:\n",
    "            if results is None:\n",
    "                print(\"You probably reached your API limit, waiting for 5 minutes\")\n",
    "                sys.stdout.flush()\n",
    "                time.sleep(5*60) # 5 minute wait\n",
    "            else:\n",
    "                raise e\n",
    "        except twitter.TwitterHTTPError as e:\n",
    "            break\n",
    "        finally:\n",
    "            time.sleep(60)  # Wait 1 minute before continuing\n",
    "    return friends\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "\n",
    "test_friends = get_friends(t, user_ids[relevant_users[0]])\n",
    "print(test_friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "friends = {}\n",
    "for screen_name in relevant_users:\n",
    "    print(\"Obtaining friends for user {}\".format(user))\n",
    "    sys.stdout.flush()\n",
    "    user_id = user_id[user]\n",
    "    friends[user_id] = get_friends(t, user_id)\n",
    "friends = {user_id:friends[user_id] for user_id in friends\n",
    "             if len(friends[user_id]) > 0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def count_friends(friends):\n",
    "    friend_count = defaultdict(int)\n",
    "    for friend_list in friends.values():\n",
    "        for friend in friend_list:\n",
    "            friend_count[friend] += 1\n",
    "    return friend_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "#TODO: Remove before production\n",
    "import os\n",
    "import json\n",
    "\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
    "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
    "with open(friends_filename) as inf:\n",
    "    friends = json.load(inf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "friend_count = count_friends(friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from operator import itemgetter\n",
    "best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "best_friends[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "while len(friends) < 150:\n",
    "    # Get the best friend that isn't already in our list\n",
    "    for user_id, count in best_friends:\n",
    "        if user_id not in friends and str(user_id) != '467407284':\n",
    "            break\n",
    "    print(\"Getting friends of user {}\".format(user_id))\n",
    "    sys.stdout.flush()\n",
    "    friends[user_id] = get_friends(t, user_id)\n",
    "    print(\"Received {} friends\".format(len(friends[user_id])))\n",
    "    print(\"We now have the friends of {} users\".format(len(friends)))\n",
    "    sys.stdout.flush()\n",
    "    # Update friend_count\n",
    "    for friend in friends[user_id]:\n",
    "        friend_count[friend] += 1\n",
    "    # Update the best friends list\n",
    "    best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
    "with open(friends_filename, 'w') as outf:\n",
    "    json.dump(friends, outf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "G = nx.DiGraph()\n",
    "\n",
    "main_users = friends.keys()\n",
    "G.add_nodes_from(main_users)\n",
    "\n",
    "for user_id in friends:\n",
    "    for friend in friends[user_id]:\n",
    "        if friend in main_users:\n",
    "           G.add_edge(user_id, friend) \n",
    "G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "nx.draw(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(3,figsize=(10, 10))\n",
    "nx.draw(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# 寻找子图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
    "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
    "with open(friends_filename) as inf:\n",
    "    friends = json.load(inf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "len(friends.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "G = nx.DiGraph()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "main_users = [int(f) for f in friends.keys()]\n",
    "G.add_nodes_from(main_users)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "for user_id in friends:\n",
    "    for friend in friends[user_id]:\n",
    "        if friend in main_users:\n",
    "            G.add_edge(user_id, friend)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "mu = set(main_users)\n",
    "au = set([f for ff in friends.values() for f in ff])\n",
    "len(mu), len(au), len(mu & au)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "nx.draw(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(3,figsize=(40,40))\n",
    "nx.draw(G, alpha=0.1, edge_color='b', node_color='g', node_size=2000)\n",
    "plt.axis('on')\n",
    "plt.xlim(0.45, 0.55)\n",
    "plt.ylim(0.45, 0.55)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(3,figsize=(10, 10))\n",
    "connection_number = sorted(nx.degree(G).values(), reverse=True)\n",
    "plt.plot(connection_number,'r-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Power law example\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "data = 1 - np.random.power(5, (20000,))\n",
    "\n",
    "plt.hist(data, bins=10,facecolor=\"r\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "friends = {user: set(friends[user]) for user in friends}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def compute_similarity(friends1, friends2):\n",
    "    return len(friends1 & friends2) / len(friends1 | friends2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def create_graph(followers, threshold=0):\n",
    "    G = nx.Graph()\n",
    "    for user1 in friends.keys():\n",
    "        for user2 in friends.keys():\n",
    "            if user1 == user2:\n",
    "                continue\n",
    "            weight = compute_similarity(friends[user1], friends[user2])\n",
    "            if weight >= threshold:\n",
    "                G.add_node(user1)\n",
    "                G.add_node(user2)\n",
    "                G.add_edge(user1, user2, weight=weight)\n",
    "    return G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "pos = nx.spring_layout(G)\n",
    "nx.draw_networkx_nodes(G, pos)\n",
    "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
    "nx.draw_networkx_edges(G, pos, width=edgewidth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.1)\n",
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes())\n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.25)\n",
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes())\n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "nx.draw(list(sub_graphs)[6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "fig = plt.figure()\n",
    "fig.add_subplot?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "n_subgraphs = nx.number_connected_components(G)\n",
    "fig = plt.figure(figsize=(20, (n_subgraphs * 2)))\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    ax = fig.add_subplot(int(n_subgraphs / 2), 2, i)\n",
    "    ax.get_xaxis().set_visible(False)\n",
    "    ax.get_yaxis().set_visible(False)\n",
    "    pos = nx.spring_layout(G)\n",
    "    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)\n",
    "    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "def compute_silhouette(threshold, friends):\n",
    "    G = create_graph(friends, threshold=threshold)\n",
    "    if len(G.nodes()) < 2:\n",
    "        return -99\n",
    "    sub_graphs = nx.connected_components(G)\n",
    "    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):\n",
    "        return -99\n",
    "    label_dict = {}\n",
    "    for i, sub_graph in enumerate(sub_graphs):\n",
    "        print(type(sub_graph))\n",
    "        print(sub_graph)\n",
    "        for node in sub_graph:  #.nodes():\n",
    "            label_dict[node] = i\n",
    "    labels = np.array([label_dict[node] for node in G.nodes()])\n",
    "    X = nx.to_scipy_sparse_matrix(G)#.todense()\n",
    "    #X = 1 - X\n",
    "    return silhouette_score(X, labels, metric='precomputed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "compute_silhouette(0.25, friends)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第二部分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
    "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
    "with open(friends_filename) as inf:\n",
    "    friends = json.load(inf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "friends = {user: set(friends[user]) for user in friends}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_similarity(friends1, friends2):\n",
    "    set_friends1 = set(friends1)\n",
    "    set_friends2 = set(friends2)\n",
    "    return len(set_friends1 & set_friends2) / len(set_friends1 | set_friends2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "def create_graph(friends, threshold=0):\n",
    "    G = nx.Graph()\n",
    "    weights = []\n",
    "    for user1 in friends.keys():\n",
    "        for user2 in friends.keys():\n",
    "            if user1 == user2:\n",
    "                continue\n",
    "            weight = compute_similarity(friends[user1], friends[user2])\n",
    "            weights.append(weight)\n",
    "            if weight >= threshold:\n",
    "                G.add_node(user1)\n",
    "                G.add_node(user2)\n",
    "                G.add_edge(user1, user2, weight=weight)\n",
    "    return G\n",
    "\n",
    "G = create_graph(friends, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "plt.figure(figsize=(10,10))\n",
    "pos = nx.spring_layout(G)\n",
    "nx.draw_networkx_nodes(G, pos, node_size=500)\n",
    "\n",
    "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
    "nx.draw_networkx_edges(G, pos, width=edgewidth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.1)\n",
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes())\n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.15)\n",
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes())\n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "label_dict = {}\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    for node in sub_graph.nodes():\n",
    "        label_dict[node] = i\n",
    "labels = [label_dict[node] for node in G.nodes()]\n",
    "\n",
    "plt.figure(figsize=(10,10))\n",
    "nx.draw(G,node_color=labels,cmap=plt.cm.Paired, node_size=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "plt.figure(figsize=(10,10))\n",
    "pos = nx.spring_layout(G)\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    nodes = sub_graph.nodes()\n",
    "    edges = sub_graph.edges()\n",
    "    nx.draw_networkx_nodes(G, pos, nodes,node_size=500)\n",
    "    nx.draw_networkx_edges(G, pos, edges)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "n_subgraphs = nx.number_connected_components(G)\n",
    "\n",
    "fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    ax = fig.add_subplot(int(n_subgraphs / 2), 2, i)\n",
    "    ax.get_xaxis().set_visible(False)\n",
    "    ax.get_yaxis().set_visible(False)\n",
    "    pos = nx.spring_layout(G)\n",
    "    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)\n",
    "    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from sklearn.metrics import silhouette_score\n",
    "import numpy as np\n",
    "\n",
    "def compute_silhouette(threshold, friends):\n",
    "    G = create_graph(friends, threshold=threshold)\n",
    "    if len(G.nodes()) == 0:\n",
    "        return -99  # Invalid graph\n",
    "    sub_graphs = nx.connected_component_subgraphs(G)\n",
    "    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):\n",
    "        return -99  # Invalid number of components, Silhouette not defined\n",
    "    label_dict = {}\n",
    "    for i, sub_graph in enumerate(sub_graphs):\n",
    "        for node in sub_graph.nodes():\n",
    "            label_dict[node] = i\n",
    "    labels = np.array([label_dict[node] for node in G.nodes()])\n",
    "    X = nx.to_scipy_sparse_matrix(G).todense()\n",
    "    X = 1 - X\n",
    "    return silhouette_score(X, labels, metric='precomputed')\n",
    "\n",
    "\n",
    "print(compute_silhouette(0.1, friends))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.optimize import minimize #(fun, x0, args=(),\n",
    "\n",
    "def invert(func):\n",
    "    def inverted_function(*args, **kwds):\n",
    "        return -func(*args, **kwds)\n",
    "    return inverted_function\n",
    "\n",
    "result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = create_graph(friends, threshold=0.135)\n",
    "sub_graphs = nx.connected_component_subgraphs(G)\n",
    "\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes())\n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = 1-nx.to_scipy_sparse_matrix(G).todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def silhouette_score(X, labels, metric='precomputed'):\n",
    "    labels = np.array(labels)\n",
    "    print(labels.shape)\n",
    "    return np.mean(silhouette_samples(X, labels, metric=metric))\n",
    "\n",
    "def silhouette_samples(X, labels, metric='precomputed'):\n",
    "    print(X.shape)\n",
    "    distances = X  #pairwise_distances(X, metric=metric, **kwds)\n",
    "    n = labels.shape[0]\n",
    "    A = np.array([_intra_cluster_distance(distances[i], labels, i)\n",
    "                  for i in range(n)])\n",
    "    B = np.array([_nearest_cluster_distance(distances[i], labels, i)\n",
    "                  for i in range(n)])\n",
    "    sil_samples = (B - A) / np.maximum(A, B)\n",
    "    # nan values are for clusters of size 1, and should be 0\n",
    "    return np.nan_to_num(sil_samples)\n",
    "\n",
    "def _intra_cluster_distance(distances_row, labels, i):\n",
    "    \"\"\"Calculate the mean intra-cluster distance for sample i.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    distances_row : array, shape = [n_samples]\n",
    "        Pairwise distance matrix between sample i and each sample.\n",
    "\n",
    "    labels : array, shape = [n_samples]\n",
    "        label values for each sample\n",
    "\n",
    "    i : int\n",
    "        Sample index being calculated. It is excluded from calculation and\n",
    "        used to determine the current label\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    a : float\n",
    "        Mean intra-cluster distance for sample i\n",
    "    \"\"\"\n",
    "    mask = (labels == labels[i])\n",
    "    mask[i] = False\n",
    "    mask = mask.reshape(distances_row.shape)\n",
    "    #print(\"Cluster {}\".format(i))\n",
    "    #print(mask)\n",
    "    #print(distances_row.flatten())\n",
    "    #print(distances_row.flatten()[mask])\n",
    "    a = np.mean(distances_row[mask])\n",
    "    return a\n",
    "\n",
    "\n",
    "def _nearest_cluster_distance(distances_row, labels, i):\n",
    "    \"\"\"Calculate the mean nearest-cluster distance for sample i.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    distances_row : array, shape = [n_samples]\n",
    "        Pairwise distance matrix between sample i and each sample.\n",
    "\n",
    "    labels : array, shape = [n_samples]\n",
    "        label values for each sample\n",
    "\n",
    "    i : int\n",
    "        Sample index being calculated. It is used to determine the current\n",
    "        label.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    b : float\n",
    "        Mean nearest-cluster distance for sample i\n",
    "    \"\"\"\n",
    "    label = labels[i]\n",
    "    b = np.min([np.mean(distances_row[(labels == cur_label).reshape(distances_row.shape)])\n",
    "               for cur_label in set(labels) if not cur_label == label])\n",
    "    return b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "silhouette_score(X, labels, metric='precomputed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "13px",
    "width": "253px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}