{
 "cells": [
  { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [
   "# 加载数据集"
  ] },
  { "cell_type": "code", "execution_count": 1, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import os\n",
   "\n",
   "import twitter\n",
   "\n",
   "# Read the Twitter API credentials from the environment so that no secret is\n",
   "# stored in the notebook. Export these four variables before starting the\n",
   "# kernel; a missing one raises KeyError here instead of failing later with an\n",
   "# opaque authentication error.\n",
   "# NOTE(review): any credential that was ever committed with this notebook must\n",
   "# be treated as leaked - revoke and regenerate it.\n",
   "consumer_key = os.environ[\"TWITTER_CONSUMER_KEY\"]\n",
   "consumer_secret = os.environ[\"TWITTER_CONSUMER_SECRET\"]\n",
   "access_token = os.environ[\"TWITTER_ACCESS_TOKEN\"]\n",
   "access_token_secret = os.environ[\"TWITTER_ACCESS_TOKEN_SECRET\"]\n",
   "authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)\n",
   "t = twitter.Twitter(auth=authorization, retry=True)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import os\n",
   "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
   "output_filename = os.path.join(data_folder, \"python_tweets.json\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import json"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "original_users = []  # screen name of each tweet's author, aligned with `tweets`\n",
   "tweets = []          # tweet texts\n",
   "user_ids = {}        # screen name -> numeric user id\n",
   "\n",
   "search_results = t.search.tweets(q=\"python\", count=100)['statuses']\n",
   "for tweet in search_results:\n",
   "    # Only statuses that carry a text body are kept.\n",
   "    if 'text' in tweet:\n",
   "        original_users.append(tweet['user']['screen_name'])\n",
   "        user_ids[tweet['user']['screen_name']] = tweet['user']['id']\n",
   "        tweets.append(tweet['text'])"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "len(tweets)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "model_filename = os.path.join(os.path.expanduser(\"~\"), \"Models\", \"twitter\", \"python_context.pkl\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], 
"source": [
   "from sklearn.base import TransformerMixin\n",
   "from nltk import word_tokenize\n",
   "\n",
   "class NLTKBOW(TransformerMixin):\n",
   "    \"\"\"Stateless transformer that turns each document into a bag-of-words dict.\"\"\"\n",
   "\n",
   "    def fit(self, X, y=None):\n",
   "        # Nothing is learned from the data.\n",
   "        return self\n",
   "\n",
   "    def transform(self, X):\n",
   "        # One {token: True} dict per document, tokenised with NLTK.\n",
   "        return [{word: True for word in word_tokenize(document)}\n",
   "                for document in X]"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# sklearn.externals.joblib was removed from scikit-learn; prefer the standalone\n",
   "# joblib package and fall back to the bundled copy on old installations.\n",
   "try:\n",
   "    import joblib\n",
   "except ImportError:\n",
   "    from sklearn.externals import joblib\n",
   "\n",
   "# NOTE: joblib.load unpickles arbitrary objects - only load model files you trust.\n",
   "context_classifier = joblib.load(model_filename)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "y_pred = context_classifier.predict(tweets)\n",
   "y_pred"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Keep only the tweets (and their authors) that the classifier labelled 1.\n",
   "relevant_tweets = [tweet for tweet, label in zip(tweets, y_pred) if label == 1]\n",
   "relevant_users = [user for user, label in zip(original_users, y_pred) if label == 1]\n",
   "print(len(relevant_tweets))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import time\n",
   "import sys\n",
   "\n",
   "def get_friends(t, user_id):\n",
   "    \"\"\"Collect the ids of the accounts that ``user_id`` follows.\n",
   "\n",
   "    Pages through ``t.friends.ids`` 5000 ids at a time and stops when the last\n",
   "    page is reached (cursor 0), when at least 10000 ids have been collected, or\n",
   "    when the API answers with ``twitter.TwitterHTTPError``.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    t : twitter.Twitter\n",
   "        Authenticated API client.\n",
   "    user_id : int\n",
   "        Id of the user whose friends are requested.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    list\n",
   "        The friend ids collected so far (possibly empty).\n",
   "    \"\"\"\n",
   "    friends = []\n",
   "    cursor = -1  # Start with the first page\n",
   "    while cursor != 0:  # If zero, that is the end\n",
   "        try:\n",
   "            results = t.friends.ids(user_id=user_id, cursor=cursor, count=5000)\n",
   "            if results is None:\n",
   "                # A missing response is treated as a rate limit: wait, then\n",
   "                # retry the same page (the finally clause below still runs).\n",
   "                print(\"You probably reached your API limit, waiting for 5 minutes\")\n",
   "                sys.stdout.flush()\n",
   "                time.sleep(5*60)  # 5 minute wait\n",
   "                continue\n",
   "            friends.extend(results['ids'])\n",
   "            cursor = results['next_cursor']\n",
   "            if len(friends) >= 10000:\n",
   "                break\n",
   "            if cursor != 0:\n",
   "                print(\"Collected {} friends so far, but there are more\".format(len(friends)))\n",
   "                sys.stdout.flush()\n",
   "        except twitter.TwitterHTTPError:\n",
   "            # Give up on this user and return whatever was collected.\n",
   "            break\n",
   "        finally:\n",
   "            time.sleep(60)  # Wait 1 minute before continuing\n",
   "    return friends\n"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "\n",
   "test_friends = get_friends(t, user_ids[relevant_users[0]])\n",
   "print(test_friends)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "friends = {}\n",
   "for screen_name in relevant_users:\n",
   "    print(\"Obtaining friends for user {}\".format(screen_name))\n",
   "    sys.stdout.flush()\n",
   "    user_id = user_ids[screen_name]\n",
   "    friends[user_id] = get_friends(t, user_id)\n",
   "# Drop users for whom no friends could be collected.\n",
   "friends = {user_id: friend_list for user_id, friend_list in friends.items()\n",
   "           if len(friend_list) > 0}"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from collections import defaultdict"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "def count_friends(friends):\n",
   "    \"\"\"Count in how many friend lists each id appears.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    friends : dict\n",
   "        Maps a user id to an iterable of that user's friend ids.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    collections.defaultdict\n",
   "        Maps a friend id to its number of occurrences over all lists.\n",
   "    \"\"\"\n",
   "    friend_count = defaultdict(int)\n",
   "    for friend_list in friends.values():\n",
   "        for friend in friend_list:\n",
   "            friend_count[friend] += 1\n",
   "    return friend_count"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "#TODO: Remove before production\n",
   "# Shortcut that replaces the crawled `friends` with a previously saved copy.\n",
   "import os\n",
   "import json\n",
   "\n",
   "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
   "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
   "with open(friends_filename) as inf:\n",
   "    friends = json.load(inf)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "friend_count = count_friends(friends)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from operator import itemgetter\n",
   "# (friend id, count) pairs, most frequently followed first.\n",
   "best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { 
"hidden": true }, "outputs": [], "source": [
   "best_friends[:10]"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Ids that were already crawled, normalised to str: the keys are int when they\n",
   "# come straight from the API but str once the dict was reloaded from JSON,\n",
   "# while the candidate ids in best_friends are int.\n",
   "known_ids = {str(user_id) for user_id in friends}\n",
   "while len(friends) < 150:\n",
   "    # Get the best friend that isn't already in our list\n",
   "    # NOTE(review): id 467407284 is skipped explicitly; the reason is not recorded here.\n",
   "    for user_id, _count in best_friends:\n",
   "        if str(user_id) not in known_ids and str(user_id) != '467407284':\n",
   "            break\n",
   "    else:\n",
   "        # Every candidate was crawled already; stop instead of refetching the\n",
   "        # last one forever.\n",
   "        print(\"No uncrawled candidates left\")\n",
   "        break\n",
   "    print(\"Getting friends of user {}\".format(user_id))\n",
   "    sys.stdout.flush()\n",
   "    friends[user_id] = get_friends(t, user_id)\n",
   "    known_ids.add(str(user_id))\n",
   "    print(\"Received {} friends\".format(len(friends[user_id])))\n",
   "    print(\"We now have the friends of {} users\".format(len(friends)))\n",
   "    sys.stdout.flush()\n",
   "    # Update friend_count\n",
   "    for friend in friends[user_id]:\n",
   "        friend_count[friend] += 1\n",
   "    # Update the best friends list\n",
   "    best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import json\n",
   "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
   "with open(friends_filename, 'w') as outf:\n",
   "    json.dump(friends, outf)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import networkx as nx\n",
   "G = nx.DiGraph()\n",
   "\n",
   "# Normalise the keys to int so they compare equal to the int friend ids even\n",
   "# when `friends` was reloaded from JSON (which turns dict keys into str).\n",
   "main_users = {int(user_id) for user_id in friends}\n",
   "G.add_nodes_from(main_users)\n",
   "\n",
   "for user_id, friend_list in friends.items():\n",
   "    for friend in friend_list:\n",
   "        if friend in main_users:\n",
   "            G.add_edge(int(user_id), friend)\n",
   "G"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "%matplotlib inline\n",
   "nx.draw(G)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from matplotlib import pyplot as plt\n",
   "plt.figure(3,figsize=(10, 10))\n",
   "nx.draw(G)"
  ] },
  { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "# 
寻找子图" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import os\n",
   "import json\n",
   "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
   "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
   "with open(friends_filename) as inf:\n",
   "    friends = json.load(inf)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "len(friends.keys())"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import networkx as nx\n",
   "G = nx.DiGraph()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# JSON stores the dict keys as str; the graph uses int ids throughout.\n",
   "main_users = [int(f) for f in friends.keys()]\n",
   "G.add_nodes_from(main_users)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# The keys of `friends` are str while the nodes and the friend ids are int:\n",
   "# convert the key so that each edge attaches to the existing int node instead\n",
   "# of silently creating a second, str-labelled node for the same user.\n",
   "main_user_set = set(main_users)  # constant-time membership tests\n",
   "for user_id, friend_list in friends.items():\n",
   "    for friend in friend_list:\n",
   "        if friend in main_user_set:\n",
   "            G.add_edge(int(user_id), friend)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Sizes of: crawled users, everyone they follow, and the overlap of both.\n",
   "mu = set(main_users)\n",
   "au = set([f for ff in friends.values() for f in ff])\n",
   "len(mu), len(au), len(mu & au)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "%matplotlib inline\n",
   "nx.draw(G)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from matplotlib import pyplot as plt\n",
   "plt.figure(3,figsize=(40,40))\n",
   "nx.draw(G, alpha=0.1, edge_color='b', node_color='g', node_size=2000)\n",
   "plt.axis('on')\n",
   "plt.xlim(0.45, 0.55)\n",
   "plt.ylim(0.45, 0.55)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { 
"hidden": true }, "outputs": [], "source": [
   "from matplotlib import pyplot as plt\n",
   "plt.figure(3,figsize=(10, 10))\n",
   "# dict(...) accepts both the plain dict returned by NetworkX 1.x and the\n",
   "# DegreeView returned by NetworkX 2+ (which has no .values()).\n",
   "connection_number = sorted(dict(nx.degree(G)).values(), reverse=True)\n",
   "plt.plot(connection_number,'r-')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Power law example\n",
   "import numpy as np\n",
   "import matplotlib.pyplot as plt\n",
   "data = 1 - np.random.power(5, (20000,))\n",
   "\n",
   "plt.hist(data, bins=10,facecolor=\"r\")"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Sets make the intersection/union in compute_similarity cheap.\n",
   "friends = {user: set(friends[user]) for user in friends}"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "def compute_similarity(friends1, friends2):\n",
   "    \"\"\"Return the Jaccard similarity of two sets of friend ids.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    friends1, friends2 : set\n",
   "        Friend ids of the two users being compared.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    float\n",
   "        ``|intersection| / |union|``; 0.0 when both sets are empty, so that\n",
   "        users without any friends never raise ZeroDivisionError.\n",
   "    \"\"\"\n",
   "    union = friends1 | friends2\n",
   "    if not union:\n",
   "        return 0.0\n",
   "    return len(friends1 & friends2) / len(union)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from itertools import combinations\n",
   "\n",
   "def create_graph(followers, threshold=0):\n",
   "    \"\"\"Build an undirected similarity graph between users.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    followers : dict\n",
   "        Maps a user to the set of ids passed to ``compute_similarity``.\n",
   "    threshold : float, optional\n",
   "        A pair of users is connected only if its similarity is at least this\n",
   "        value.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    networkx.Graph\n",
   "        One edge per connected pair, with the similarity stored in the\n",
   "        ``weight`` attribute. Users without any edge are not in the graph.\n",
   "    \"\"\"\n",
   "    G = nx.Graph()\n",
   "    # Use the argument rather than a notebook-level global. The graph is\n",
   "    # undirected and the similarity symmetric, so each unordered pair is\n",
   "    # evaluated once.\n",
   "    for user1, user2 in combinations(followers.keys(), 2):\n",
   "        weight = compute_similarity(followers[user1], followers[user2])\n",
   "        if weight >= threshold:\n",
   "            G.add_node(user1)\n",
   "            G.add_node(user2)\n",
   "            G.add_edge(user1, user2, weight=weight)\n",
   "    return G"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "G = create_graph(friends)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "plt.figure(figsize=(10,10))\n",
   "pos = nx.spring_layout(G)\n",
   "nx.draw_networkx_nodes(G, pos)\n",
   "# Line width of each edge is its similarity weight.\n",
   "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
   "nx.draw_networkx_edges(G, pos, width=edgewidth)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [ "G = 
create_graph(friends, 0.1)\n",
   "# nx.connected_component_subgraphs was removed from NetworkX; iterating over\n",
   "# nx.connected_components works on old and new releases alike.\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    print(\"Subgraph {0} has {1} nodes\".format(i, len(component)))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "G = create_graph(friends, 0.25)\n",
   "\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    print(\"Subgraph {0} has {1} nodes\".format(i, len(component)))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "sub_graphs = [G.subgraph(component) for component in nx.connected_components(G)]\n",
   "# NOTE(review): index 6 is hard-coded and assumes at least seven components.\n",
   "nx.draw(sub_graphs[6])"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "sub_graphs = [G.subgraph(component) for component in nx.connected_components(G)]\n",
   "n_subgraphs = len(sub_graphs)\n",
   "n_rows = (n_subgraphs + 1) // 2  # two panels per row, rounded up for odd counts\n",
   "fig = plt.figure(figsize=(20, (n_subgraphs * 2)))\n",
   "pos = nx.spring_layout(G)  # one layout shared by every panel\n",
   "for i, sub_graph in enumerate(sub_graphs):\n",
   "    ax = fig.add_subplot(n_rows, 2, i + 1)  # subplot indices are 1-based\n",
   "    ax.get_xaxis().set_visible(False)\n",
   "    ax.get_yaxis().set_visible(False)\n",
   "    nx.draw_networkx_nodes(G, pos, list(sub_graph.nodes()), ax=ax, node_size=500)\n",
   "    nx.draw_networkx_edges(G, pos, list(sub_graph.edges()), ax=ax)\n"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "import numpy as np\n",
   "from sklearn.metrics import silhouette_score\n",
   "\n",
   "def compute_silhouette(threshold, friends):\n",
   "    \"\"\"Silhouette coefficient of the connected components at ``threshold``.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    threshold : float\n",
   "        Minimum similarity for two users to be connected (see create_graph).\n",
   "    friends : dict\n",
   "        Maps a user to the set of that user's friend ids.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    float\n",
   "        The silhouette score, or -99 when the graph has fewer than two nodes\n",
   "        or a number of components for which the score is not computed.\n",
   "    \"\"\"\n",
   "    G = create_graph(friends, threshold=threshold)\n",
   "    if len(G.nodes()) < 2:\n",
   "        return -99\n",
   "    components = list(nx.connected_components(G))\n",
   "    if not (2 <= len(components) < len(G.nodes()) - 1):\n",
   "        return -99\n",
   "    # Label every node with the index of the component it belongs to.\n",
   "    label_dict = {}\n",
   "    for i, component in enumerate(components):\n",
   "        for node in component:\n",
   "            label_dict[node] = i\n",
   "    labels = np.array([label_dict[node] for node in G.nodes()])\n",
   "    # metric='precomputed' expects distances, whereas the edge weights are\n",
   "    # similarities: convert them and give every node distance 0 to itself.\n",
   "    X = 1 - np.asarray(nx.adjacency_matrix(G).todense())\n",
   "    np.fill_diagonal(X, 0)\n",
   "    return silhouette_score(X, labels, metric='precomputed')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [
   "compute_silhouette(0.25, friends)"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
   "# 第二部分"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "import os\n",
   "import json\n",
   "\n",
   "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\")\n",
   "friends_filename = os.path.join(data_folder, \"python_friends.json\")\n",
   "with open(friends_filename) as inf:\n",
   "    friends = json.load(inf)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "friends = {user: set(friends[user]) for user in friends}"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "def compute_similarity(friends1, friends2):\n",
   "    \"\"\"Return the Jaccard similarity of two collections of friend ids.\n",
   "\n",
   "    Both arguments are converted to sets first. Returns 0.0 when both are\n",
   "    empty instead of raising ZeroDivisionError.\n",
   "    \"\"\"\n",
   "    set_friends1 = set(friends1)\n",
   "    set_friends2 = set(friends2)\n",
   "    union = set_friends1 | set_friends2\n",
   "    if not union:\n",
   "        return 0.0\n",
   "    return len(set_friends1 & set_friends2) / len(union)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "from itertools import combinations\n",
   "\n",
   "import networkx as nx\n",
   "\n",
   "def create_graph(friends, threshold=0):\n",
   "    \"\"\"Build an undirected similarity graph between users.\n",
   "\n",
   "    Two users are connected when ``compute_similarity`` of their friend sets\n",
   "    is at least ``threshold``; the similarity is stored as the edge ``weight``.\n",
   "    Users without any edge are not added to the graph.\n",
   "    \"\"\"\n",
   "    G = nx.Graph()\n",
   "    # The similarity is symmetric, so each unordered pair is evaluated once.\n",
   "    for user1, user2 in combinations(friends.keys(), 2):\n",
   "        weight = compute_similarity(friends[user1], friends[user2])\n",
   "        if weight >= threshold:\n",
   "            G.add_node(user1)\n",
   "            G.add_node(user2)\n",
   "            G.add_edge(user1, user2, weight=weight)\n",
   "    return G\n",
   "\n",
   "G = 
create_graph(friends, 0)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "%matplotlib inline\n",
   "from matplotlib import pyplot as plt\n",
   "plt.figure(figsize=(10,10))\n",
   "pos = nx.spring_layout(G)\n",
   "nx.draw_networkx_nodes(G, pos, node_size=500)\n",
   "\n",
   "# Line width of each edge is its similarity weight.\n",
   "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
   "nx.draw_networkx_edges(G, pos, width=edgewidth)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "G = create_graph(friends, 0.1)\n",
   "\n",
   "# nx.connected_component_subgraphs was removed from NetworkX; iterating over\n",
   "# nx.connected_components works on old and new releases alike.\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    print(\"Subgraph {0} has {1} nodes\".format(i, len(component)))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "G = create_graph(friends, 0.15)\n",
   "\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    print(\"Subgraph {0} has {1} nodes\".format(i, len(component)))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Colour every node by the index of the connected component it belongs to.\n",
   "label_dict = {}\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    for node in component:\n",
   "        label_dict[node] = i\n",
   "labels = [label_dict[node] for node in G.nodes()]\n",
   "\n",
   "plt.figure(figsize=(10,10))\n",
   "nx.draw(G,node_color=labels,cmap=plt.cm.Paired, node_size=500)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "sub_graphs = [G.subgraph(component) for component in nx.connected_components(G)]\n",
   "plt.figure(figsize=(10,10))\n",
   "pos = nx.spring_layout(G)\n",
   "# Draw the components one after another on the same axes, using one layout.\n",
   "for i, sub_graph in enumerate(sub_graphs):\n",
   "    nodes = list(sub_graph.nodes())\n",
   "    edges = list(sub_graph.edges())\n",
   "    nx.draw_networkx_nodes(G, pos, nodes,node_size=500)\n",
   "    nx.draw_networkx_edges(G, pos, edges)\n",
   "    "
  ] },
  { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# nx.connected_component_subgraphs was removed from NetworkX; build the\n",
   "# component subgraphs explicitly so the cell runs on old and new releases.\n",
   "sub_graphs = [G.subgraph(component) for component in nx.connected_components(G)]\n",
   "n_subgraphs = len(sub_graphs)\n",
   "n_rows = (n_subgraphs + 1) // 2  # two panels per row, rounded up for odd counts\n",
   "\n",
   "fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
   "pos = nx.spring_layout(G)  # one layout shared by every panel\n",
   "for i, sub_graph in enumerate(sub_graphs):\n",
   "    ax = fig.add_subplot(n_rows, 2, i + 1)  # subplot indices are 1-based\n",
   "    ax.get_xaxis().set_visible(False)\n",
   "    ax.get_yaxis().set_visible(False)\n",
   "    nx.draw_networkx_nodes(G, pos, list(sub_graph.nodes()), ax=ax, node_size=500)\n",
   "    nx.draw_networkx_edges(G, pos, list(sub_graph.edges()), ax=ax)\n"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Imported here so that this cell runs on a fresh kernel (Restart & Run All).\n",
   "from sklearn.metrics import silhouette_score\n",
   "import numpy as np\n",
   "\n",
   "def compute_silhouette(threshold, friends):\n",
   "    \"\"\"Silhouette coefficient of the connected components at ``threshold``.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    threshold : float or 1-element array\n",
   "        Minimum similarity for two users to be connected (see create_graph).\n",
   "        scipy.optimize.minimize passes its parameter vector as an array, so\n",
   "        the value is reduced to a plain float first.\n",
   "    friends : dict\n",
   "        Maps a user to the set of that user's friend ids.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    float\n",
   "        The silhouette score, or -99 when the graph is empty or has a number\n",
   "        of components for which the silhouette is not defined.\n",
   "    \"\"\"\n",
   "    threshold = float(np.ravel(threshold)[0])\n",
   "    G = create_graph(friends, threshold=threshold)\n",
   "    if len(G.nodes()) == 0:\n",
   "        return -99  # Invalid graph\n",
   "    components = list(nx.connected_components(G))\n",
   "    if not (2 <= len(components) < len(G.nodes()) - 1):\n",
   "        return -99  # Invalid number of components, Silhouette not defined\n",
   "    # Label every node with the index of the component it belongs to.\n",
   "    label_dict = {}\n",
   "    for i, component in enumerate(components):\n",
   "        for node in component:\n",
   "            label_dict[node] = i\n",
   "    labels = np.array([label_dict[node] for node in G.nodes()])\n",
   "    # Edge weights are similarities; turn them into distances and give every\n",
   "    # node distance 0 to itself, as a precomputed distance matrix requires.\n",
   "    X = 1 - np.asarray(nx.adjacency_matrix(G).todense())\n",
   "    np.fill_diagonal(X, 0)\n",
   "    return silhouette_score(X, labels, metric='precomputed')\n",
   "\n",
   "\n",
   "print(compute_silhouette(0.1, friends))\n"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "from scipy.optimize import minimize #(fun, x0, args=(),\n",
   "\n",
   "def invert(func):\n",
   "    \"\"\"Return a function computing ``-func(...)``, so a minimiser maximises ``func``.\"\"\"\n",
   "    def inverted_function(*args, **kwds):\n",
   "        return -func(*args, **kwds)\n",
   "    return inverted_function\n",
   "\n",
   "result = minimize(invert(compute_silhouette), 0.1, method='nelder-mead', args=(friends,), options={'maxiter':10, })\n",
   "print(result)"
  ] },
  { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [
   "G = create_graph(friends, threshold=0.135)\n",
   "\n",
   "# nx.connected_component_subgraphs was removed from NetworkX; iterating over\n",
   "# nx.connected_components works on old and new releases alike.\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    print(\"Subgraph {0} has {1} nodes\".format(i, len(component)))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Recompute the labels for the graph built in the previous cell: labels left\n",
   "# over from a graph built with another threshold would not line up with the\n",
   "# rows of the distance matrix X computed below.\n",
   "label_dict = {}\n",
   "for i, component in enumerate(nx.connected_components(G)):\n",
   "    for node in component:\n",
   "        label_dict[node] = i\n",
   "labels = np.array([label_dict[node] for node in G.nodes()])\n",
   "labels"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Distance = 1 - similarity. nx.adjacency_matrix is used because\n",
   "# nx.to_scipy_sparse_matrix no longer exists in recent NetworkX releases.\n",
   "X = 1 - np.asarray(nx.adjacency_matrix(G).todense())"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "def silhouette_score(X, labels, metric='precomputed'):\n",
   "    \"\"\"Return the mean silhouette coefficient over all samples.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    X : array-like, shape = [n_samples, n_samples]\n",
   "        Pairwise distance matrix.\n",
   "\n",
   "    labels : array-like, shape = [n_samples]\n",
   "        Cluster label of each sample.\n",
   "\n",
   "    metric : str\n",
   "        Forwarded to silhouette_samples, which only handles precomputed\n",
   "        distances.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    score : float\n",
   "        Mean of the per-sample silhouette coefficients.\n",
   "    \"\"\"\n",
   "    labels = np.asarray(labels)\n",
   "    return np.mean(silhouette_samples(X, labels, metric=metric))\n",
   "\n",
   "\n",
   "def silhouette_samples(X, labels, metric='precomputed'):\n",
   "    \"\"\"Return the silhouette coefficient of every sample.\n",
   "\n",
   "    ``X`` is used directly as the distance matrix; ``metric`` is accepted for\n",
   "    signature compatibility only and is otherwise ignored.\n",
   "\n",
   "    Raises\n",
   "    ------\n",
   "    ValueError\n",
   "        If fewer than two distinct labels are present.\n",
   "    \"\"\"\n",
   "    # np.asarray turns an np.matrix into a plain 2-D array so that each row\n",
   "    # is 1-D and can be indexed with a 1-D boolean mask.\n",
   "    distances = np.asarray(X)\n",
   "    labels = np.asarray(labels)\n",
   "    if np.unique(labels).size < 2:\n",
   "        raise ValueError(\"silhouette requires at least 2 distinct labels\")\n",
   "    n = labels.shape[0]\n",
   "    A = np.array([_intra_cluster_distance(distances[i], labels, i)\n",
   "                  for i in range(n)])\n",
   "    B = np.array([_nearest_cluster_distance(distances[i], labels, i)\n",
   "                  for i in range(n)])\n",
   "    with np.errstate(divide='ignore', invalid='ignore'):\n",
   "        sil_samples = (B - A) / np.maximum(A, B)\n",
   "    # nan values are for clusters of size 1, and should be 0\n",
   "    return np.nan_to_num(sil_samples)\n",
   "\n",
   "\n",
   "def _intra_cluster_distance(distances_row, labels, i):\n",
   "    \"\"\"Calculate the mean intra-cluster distance for sample i.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    distances_row : array, shape = [n_samples]\n",
   "        Pairwise distance matrix between sample i and each sample.\n",
   "\n",
   "    labels : array, shape = [n_samples]\n",
   "        label values for each sample\n",
   "\n",
   "    i : int\n",
   "        Sample index being calculated. It is excluded from calculation and\n",
   "        used to determine the current label\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    a : float\n",
   "        Mean intra-cluster distance for sample i (nan when sample i is the\n",
   "        only member of its cluster)\n",
   "    \"\"\"\n",
   "    distances_row = np.asarray(distances_row).ravel()\n",
   "    mask = (labels == labels[i])\n",
   "    mask[i] = False\n",
   "    if not mask.any():\n",
   "        # Singleton cluster: no other member to average over.\n",
   "        return np.nan\n",
   "    return np.mean(distances_row[mask])\n",
   "\n",
   "\n",
   "def _nearest_cluster_distance(distances_row, labels, i):\n",
   "    \"\"\"Calculate the mean nearest-cluster distance for sample i.\n",
   "\n",
   "    Parameters\n",
   "    ----------\n",
   "    distances_row : array, shape = [n_samples]\n",
   "        Pairwise distance matrix between sample i and each sample.\n",
   "\n",
   "    labels : array, shape = [n_samples]\n",
   "        label values for each sample\n",
   "\n",
   "    i : int\n",
   "        Sample index being calculated. It is used to determine the current\n",
   "        label.\n",
   "\n",
   "    Returns\n",
   "    -------\n",
   "    b : float\n",
   "        Mean nearest-cluster distance for sample i\n",
   "    \"\"\"\n",
   "    distances_row = np.asarray(distances_row).ravel()\n",
   "    label = labels[i]\n",
   "    # Mean distance to every other cluster; the smallest one is the answer.\n",
   "    b = np.min([np.mean(distances_row[labels == cur_label])\n",
   "                for cur_label in set(labels) if not cur_label == label])\n",
   "    return b"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "silhouette_score(X, labels, metric='precomputed')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" },
  "toc": { "colors": { "hover_highlight": "#DAA520", "navigate_num": "#000000", "navigate_text": "#333333", "running_highlight": 
"#FF0000", "selected_highlight": "#FFD700", "sidebar_border": "#EEEEEE", "wrapper_background": "#FFFFFF" }, "moveMenuLeft": true, "nav_menu": { "height": "13px", "width": "253px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 2 }