{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b6535345",
"metadata": {},
"outputs": [],
"source": [
"# netowrks\n",
"import networkx as nx\n",
"import igraph as ig\n",
"\n",
"# data processing\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"#some functions to make our lifes easier\n",
"import sys\n",
"sys.path.append(\"./\")\n",
"from common_functions import *\n",
"\n",
"# viz\n",
"import pylab as plt\n",
"import matplotlib as mpl\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"\n",
"#Change the default options of visualization (improving the defaults)\n",
"custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False, \"axes.spines.left\": False, \"axes.spines.bottom\":\n",
"False,\"lines.linewidth\": 2, \"grid.color\": \"lightgray\", \"legend.frameon\": False, \"xtick.labelcolor\": \"#484848\", \"ytick.labelcolor\":\n",
"\"#484848\", \"xtick.color\": \"#484848\", \"ytick.color\": \"#484848\",\"text.color\": \"#484848\", \"axes.labelcolor\": \"#484848\",\n",
"\"axes.titlecolor\":\"#484848\",\"figure.figsize\": [5,3],\n",
"\"axes.titlelocation\":\"left\",\"xaxis.labellocation\":\"left\",\"yaxis.labellocation\":\"bottom\"}\n",
"palette = [\"#3d348b\",\"#e6af2e\",\"#191716\",\"#e0e2db\"] #use your favourite colours\n",
"sns.set_theme(context='paper', style='white', palette=palette, font='Verdana', font_scale=1.3, color_codes=True,\n",
"rc=custom_params)\n",
"\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb6649c1",
"metadata": {},
"outputs": [],
"source": [
"path_data = \"../Data/\""
]
},
{
"cell_type": "markdown",
"id": "4767bb68",
"metadata": {},
"source": [
"# Exercise 1: Reading and visualizing graphs\n",
"\n",
"Goal: Get used to the `networkx` library"
]
},
{
"cell_type": "markdown",
"id": "9a2566d2",
"metadata": {},
"source": [
"## 1.1. Basic plot\n",
"- Read and understand the following code"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17549962",
"metadata": {},
"outputs": [],
"source": [
"# Read an example data on florentine marriage families in the XV century\n",
"G = nx.florentine_families_graph()\n",
"\n",
"# Show the nodes and edges\n",
"print(\"Nodes: \", G.nodes())\n",
"print(\"Edges: \", G.edges())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b408279",
"metadata": {},
"outputs": [],
"source": [
"# Visualize it\n",
"plt.figure(figsize=(5,4)) #set up dimensions\n",
"\n",
"# create layout (once so we can reuse it if needed)\n",
"pos = nx.spring_layout(G, seed = 1)\n",
"nx.draw(G, pos = pos, with_labels = True, \n",
" edge_color = \"gray\", node_color = \"lightgray\")"
]
},
{
"cell_type": "markdown",
"id": "a8178dbf",
"metadata": {},
"source": [
"## 1.2. Read and visualize your network\n",
"Network here: https://tinyurl.com/network-game\n",
"(I added the direct CSV link. But you could also download it as a CSV file and move it to the folder of this notebook) \n",
"\n",
"Use the `nx.from_pandas_edgelist` function (you need to set up the `source` and `target` parameters)\n",
"Print number of nodes and edges"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4aad5d3d",
"metadata": {},
"outputs": [],
"source": [
"# Read edgelist (link)\n",
"df = pd.read_csv(\"https://docs.google.com/spreadsheets/d/e/2PACX-1vSuZC86KjXYKSPr0Nw7mfRha4zwea3aMw-gTKliVcbRt_m3NRiCEyxbH_d5M8MBL0LWayg1WDmnqBET/pub?gid=0&single=true&output=csv\")\n",
"display(df.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09de5f10",
"metadata": {},
"outputs": [],
"source": [
"#get help\n",
"nx.from_pandas_edgelist?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c667a9a2",
"metadata": {},
"outputs": [],
"source": [
"# Convert to networkx\n",
"G = nx.from_pandas_edgelist(...)\n",
"print(len(G.nodes()))\n",
"print(len(G.edges()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea2cc766",
"metadata": {},
"outputs": [],
"source": [
"# create figure and plot\n",
"plt.figure(figsize=(10,8))\n",
"\n",
"# create layout (once so we can reuse it if needed)\n",
"pos = nx.spring_layout(G, seed = 1)\n",
"nx.draw(G, pos = pos, with_labels = True, node_size=10,\n",
" edge_color = \"lightgray\", node_color = \"gray\")"
]
},
{
"cell_type": "markdown",
"id": "1da80e98",
"metadata": {},
"source": [
"## 1.3. Read and visualize the Twitter network\n",
"\n",
"We will be using (parts of) this network in the following days.\n",
"\n",
"It contains mentions, replies and quotes of user A to user B (but let's treat is as undirected)\n",
"```\n",
"replied_to 40255\n",
"mentioned 10407\n",
"quoted 3448\n",
"```\n",
"\n",
"Use the nx.from_pandas_edgelist function\n",
"Print number of nodes and edges"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0170e2d",
"metadata": {},
"outputs": [],
"source": [
"# Read edgelist (the separator is a TAB)\n",
"path = f\"{path_data}/ic2s2_netsci_3.tsv\"\n",
"\n",
"df = pd.read_csv(...)\n",
"display(df.head())\n",
"\n",
"# Convert to networkx\n",
"G = nx.from_pandas_edgelist(df)\n",
"G.remove_edges_from(nx.selfloop_edges(G)) #remove self-edges\n",
"print(len(G.nodes()))\n",
"print(len(G.edges()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5251186d",
"metadata": {},
"outputs": [],
"source": [
"# create figure and plot\n",
"plt.figure(figsize=(20,16))\n",
"\n",
"# create layout (once so we can reuse it if needed)\n",
"pos = nx.spring_layout(G, seed = 1)\n",
"nx.draw(G, pos = pos, with_labels = False, node_size=10,\n",
" edge_color = \"lightgray\", node_color = \"gray\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edf6cafd",
"metadata": {},
"outputs": [],
"source": [
"# igraph is faster. Here is how to convert to igraph\n",
"h = ig.Graph().from_networkx(G)\n",
"# create layout\n",
"layout = h.layout_fruchterman_reingold()\n",
"# create positions in the networx format (as a dictionary)\n",
"pos = dict(zip(G.nodes(), layout.coords))\n",
"\n",
"# Plot as before\n",
"plt.figure(figsize=(20, 16))\n",
"nx.draw(G, pos = pos, with_labels = False, node_size=10,\n",
" edge_color = \"lightgray\", node_color = \"gray\")"
]
},
{
"cell_type": "markdown",
"id": "e6b1b5a9",
"metadata": {},
"source": [
"## 1.4. Read and visualize a large(r) network \n",
"- Read network in f\"{path_data}/wiki-Vote.txt\". Careful, it is a directed network, you need to use the create_using parameter.\n",
"\n",
"(use iterations = 30 in the spring_layout)\n",
"\n",
"\n",
"Format of the file:\n",
"```\n",
"# Directed graph (each unordered pair of nodes is saved once): Wiki-Vote.txt \n",
"# Wikipedia voting on promotion to administratorship (till January 2008). Directed edge A->B means user A voted on B becoming Wikipedia administrator.\n",
"# Nodes: 7115 Edges: 103689\n",
"# FromNodeId\tToNodeId\n",
"30\t1412\n",
"30\t3352\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54c36775",
"metadata": {},
"outputs": [],
"source": [
"# Read directed graph\n",
"path = f\"{path_data}/wiki-Vote.txt\"\n",
"G_wiki = nx.read_edgelist(path, create_using=...)\n",
"print(len(G_wiki.nodes()))\n",
"print(len(G_wiki.edges()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfcc7e9c",
"metadata": {},
"outputs": [],
"source": [
"# Create layout (this will take a couple minutes). Networkx is a particularly slow library\n",
"pos = nx.spring_layout(G_wiki, seed = 1, iterations=30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e9a6c51",
"metadata": {},
"outputs": [],
"source": [
"# Nobody wants to see your hairball, but let's plot it anyway\n",
"plt.figure(figsize=(20, 20))\n",
"\n",
"# Plot only nodes (too many lines)\n",
"nx.draw_networkx_nodes(G_wiki, pos = pos, node_size = 1, node_color = \"k\")"
]
},
{
"cell_type": "markdown",
"id": "f2ff6d51",
"metadata": {},
"source": [
"# Exercise 2: Network characteristics\n",
"\n",
"Network example: Protein interaction (PPI) in yeast. Nodes = Proteins, Edges = Recorded interactions in experiments\n",
"\n",
"__Exercise__: Compare the PPI network with the Twitter network (ic2s2_netsci_3). What characteristics apply to both?\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42e9de76",
"metadata": {},
"outputs": [],
"source": [
"# Read PPI network\n",
"path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n",
"G = nx.read_graphml(path_network, node_type=int)\n",
"len(G.nodes()), len(G.edges())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69caf98f",
"metadata": {},
"outputs": [],
"source": [
"# igraph is faster. Here is how to convert to igraph\n",
"h = ig.Graph().from_networkx(G)\n",
"# create layout\n",
"layout = h.layout_fruchterman_reingold()\n",
"# create positions in the networx format (as a dictionary)\n",
"pos = dict(zip(G.nodes(), layout.coords))\n",
"\n",
"# To use networkx (slow), uncomment the line below\n",
"#pos = nx.spring_layout(G, seed = 1)\n",
"\n",
"# Plot as before\n",
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"#This function is defined in common_functions.py file. It adds automatic coloring and nicer defaults\n",
"plot_network(G, a0 = ax, pos = pos, with_labels=False) "
]
},
{
"cell_type": "markdown",
"id": "b7204d45",
"metadata": {},
"source": [
"## 2.1 Connectedness"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e57cfb8",
"metadata": {},
"outputs": [],
"source": [
"# Number of connected components\n",
"nx.components.number_connected_components(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c088402",
"metadata": {},
"outputs": [],
"source": [
"# Size of the connected components (first 20 sorted)\n",
"sorted([len(_) for _ in nx.connected_components(G)])[:20]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac07c20c",
"metadata": {},
"outputs": [],
"source": [
"# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n",
"largest_cc = max(nx.connected_components(G), key=len)\n",
"\n",
"# Create a subgraph of G consisting only of nodes in largest_cc\n",
"G = G.subgraph(largest_cc).copy()\n",
"len(G)"
]
},
{
"cell_type": "markdown",
"id": "de92ae88",
"metadata": {},
"source": [
"## 2.2 Small diameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1392dae1",
"metadata": {},
"outputs": [],
"source": [
"# Diameter\n",
"nx.diameter(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "827778dd",
"metadata": {},
"outputs": [],
"source": [
"# Average shortest path\n",
"nx.average_shortest_path_length(G)"
]
},
{
"cell_type": "markdown",
"id": "f28e551b",
"metadata": {},
"source": [
"## 2.3 Density"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eba3dd49",
"metadata": {},
"outputs": [],
"source": [
"nx.density(G)"
]
},
{
"cell_type": "markdown",
"id": "c7cbd38f",
"metadata": {},
"source": [
"## 2.4 Transitivity and average clustering "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98a6dc93",
"metadata": {},
"outputs": [],
"source": [
"# Average at the network level (# triangles / possible number of triangles)\n",
"print(nx.transitivity(G))\n",
"\n",
"# Average at the node level = Average(# triangles_i / possible triangles_i )\n",
"print(nx.average_clustering(G))"
]
},
{
"cell_type": "markdown",
"id": "0e9ba007",
"metadata": {},
"source": [
"## 2.5 Assortativity and homophily\n",
"\n",
"What does it mean to have a negative/positive degree assortativity?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fa3b399",
"metadata": {},
"outputs": [],
"source": [
"nx.assortativity.degree_assortativity_coefficient(G)"
]
},
{
"cell_type": "markdown",
"id": "b5b01ea4",
"metadata": {},
"source": [
"## 2.6 Heavy tails"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d396865",
"metadata": {},
"outputs": [],
"source": [
"# Find the degree of each node\n",
"degree_list = dict(G.degree()).values()\n",
"\n",
"# count the number of nodes with a specific degree, sort it\n",
"from collections import Counter\n",
"C = Counter(degree_list)\n",
"deg, cnt = zip(*sorted(C.items()))\n",
"\n",
"# Plot\n",
"plt.plot(deg, cnt, \"o-\")\n",
"plt.xlabel(\"Degree\")\n",
"plt.ylabel(\"Count (degree)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a5b0c1f",
"metadata": {},
"outputs": [],
"source": [
"# Defined in common_functions.py\n",
"plot_cdf(degree_list, compl = True, xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\")\n"
]
},
{
"cell_type": "markdown",
"id": "ad5df77d",
"metadata": {},
"source": [
"## 2.7 Robustness to failures / Fragility to targeted attacks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8684247e",
"metadata": {},
"outputs": [],
"source": [
"def attack_network(G, how=\"targeted\", n_remove=200):\n",
" \"\"\"\n",
" Removes nodes iteratively and keeps the (harmonic average of distance between all pairs of nodes)\n",
" \n",
" how = \"targeted\" targets the node with the highest degree\n",
" \"\"\"\n",
" n_components = []\n",
"\n",
" G2 = G.copy()\n",
" for i in range(n_remove):\n",
" if how == \"targeted\":\n",
" # Find the node with the highest degree\n",
" node_to_remove = max(G2.degree(), key=lambda x: x[1])[0]\n",
"\n",
" else:\n",
" # Random node\n",
" node_to_remove = np.random.choice(G2.nodes(), 1)[0]\n",
"\n",
" # Remove node\n",
" G2.remove_node(node_to_remove)\n",
" # Number of nodes in giant component\n",
" n_comp = max([len(_) for _ in nx.connected_components(G2)])\n",
" n_components.append(100*n_comp/len(G2))\n",
" \n",
" \n",
" return n_components"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00e0fd95",
"metadata": {},
"outputs": [],
"source": [
"# Create an ER graph (random graph)\n",
"G_r = nx.random_graphs.gnm_random_graph(1000,len(G.edges()))\n",
"# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n",
"largest_cc = max(nx.connected_components(G_r), key=len)\n",
"\n",
"# Create a subgraph of G consisting only of nodes in largest_cc\n",
"G_r = G_r.subgraph(largest_cc).copy()\n",
"\n",
"# Plot network random\n",
"plot_network(G_r, with_labels=False)\n",
"plt.show()\n",
"\n",
"# Plot network PPI\n",
"plot_network(G, with_labels=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fe19ba3",
"metadata": {},
"outputs": [],
"source": [
"# Remove 200 nodes\n",
"n_remove = 200\n",
"n_comp_attack_t = attack_network(G, how=\"targeted\", n_remove = n_remove)\n",
"n_comp_attack_r = attack_network(G, how=\"random\", n_remove = n_remove)\n",
"\n",
"ref_n_comp_attack_t = attack_network(G_r, how=\"targeted\", n_remove = n_remove)\n",
"ref_n_comp_attack_r = attack_network(G_r, how=\"random\", n_remove = n_remove)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d51b399",
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10, 5))\n",
"plt.subplot(121)\n",
"plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_t, label=\"Protein Interaction\")\n",
"plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_t, label=\"Random network\")\n",
"plt.xlabel(\"Share of nodes removed\")\n",
"plt.ylabel(\"Share of nodes in giant component\")\n",
"plt.legend()\n",
"plt.title(\"Targeted attack\")\n",
"plt.ylim(0,100)\n",
"\n",
"plt.subplot(122)\n",
"plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_r, label=\"Protein Interaction\")\n",
"plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_r, label=\"Random network\")\n",
"plt.xlabel(\"Share of nodes removed\")\n",
"plt.ylabel(\"Share of nodes in giant component\")\n",
"plt.legend()\n",
"plt.title(\"Random attack\")\n",
"plt.ylim(0,100)"
]
},
{
"cell_type": "markdown",
"id": "e9985571",
"metadata": {},
"source": [
"# Exercise 3: Distributions (i.e. looking at characteristics of the nodes)\n",
"- Degree\n",
"- Number of triangles\n",
"- Clustering (transitivity)\n",
"- Local assortativity (homophily)\n",
"- Path length\n",
"\n",
"__Goal__: Understand how to calculate distributions in `networkx`, interpret differences in the degree distribution \n",
"\n",
"__Exercise__: In 3.7 compare the Wikipedia network (default example), the PPI network and the Twitter network (IC2S2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16f880a5",
"metadata": {},
"outputs": [],
"source": [
"# Read data on florentine marriage families in the XV century\n",
"G = nx.florentine_families_graph()\n",
"len(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d4837fe",
"metadata": {},
"outputs": [],
"source": [
"# Use the following function to plot the CDF of the degree distributions\n",
"def plot_cdf(values, scale = \"log\", ax = None, cum = True, compl = False, marker = 'o-', xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\"):\n",
" \"\"\"\n",
" Calculates and plot CDF\n",
" \"\"\"\n",
" from collections import Counter\n",
"\n",
" # count the number of instance per each degree, sort it\n",
" C = Counter(values)\n",
" deg, cnt = zip(*sorted(C.items()))\n",
" \n",
" # calcualte the cumulative distribution, normalize to be a probability instead of a count\n",
" if cum:\n",
" cs = np.cumsum(cnt)/np.sum(cnt)\n",
" else:\n",
" cs = cnt/np.sum(cnt)\n",
" \n",
" if compl:\n",
" cs = 1 - cs\n",
" \n",
" if ax is None:\n",
" ax = plt.subplot()\n",
" # plot\n",
" ax.plot(deg, cs, marker)\n",
" ax.set_ylabel(ylabel)\n",
" ax.set_xlabel(xlabel)\n",
" plt.tight_layout()\n",
" sns.despine(left=True, bottom=True)\n",
" plt.xscale(scale)\n",
" plt.yscale(scale)\n",
" "
]
},
{
"cell_type": "markdown",
"id": "998712a3",
"metadata": {},
"source": [
"## 3.1 Degree distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "913ddeb2",
"metadata": {},
"outputs": [],
"source": [
"def plot_network_distribution(G, values, mult = 1000, with_labels = True):\n",
" \"\"\"\n",
" Plots network (color and node size depends on values) and distributions\n",
" \"\"\"\n",
" import matplotlib as mpl\n",
" norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values), clip=True)\n",
" mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.coolwarm)\n",
"\n",
" f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [2, 1, 1]}, figsize=(12,4))\n",
" \n",
" node_size = mult*np.array(list(values))\n",
" if min(node_size) < 0:\n",
" node_size -= min(node_size)\n",
" node_size += 1\n",
" \n",
" nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = with_labels, node_size = node_size, edge_color = \"gray\", \n",
" node_color = [mapper.to_rgba(i) for i in values], ax = a0,)\n",
"\n",
"\n",
" sns.histplot(values, ax = a1)\n",
" \n",
" \n",
" plot_cdf(values, ax = a2, compl = True, xlabel = \"Cent c\", ylabel = \"p(Cent > c)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3033f75e",
"metadata": {},
"outputs": [],
"source": [
"# Degree distribution\n",
"degree = G.degree() #also nx.degree(G)\n",
"degree_values = dict(degree).values()\n",
"\n",
"# Plot using sns.histplot\n",
"sns.histplot(degree_values)\n",
"print(sorted(degree_values))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d1faa5f",
"metadata": {},
"outputs": [],
"source": [
"# Plot CDF in log-log scale\n",
"plot_cdf(degree_values, scale = \"log\", ax = None, cum = True, compl = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a9dcf85",
"metadata": {},
"outputs": [],
"source": [
"# change \"mult\" to change the size of the nodes \n",
"plot_network_distribution(G, degree_values, mult = 100)"
]
},
{
"cell_type": "markdown",
"id": "3f57f36b",
"metadata": {},
"source": [
"## 3.2 Distribution of number of triangles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbe7b2f0",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Distribution of the number of triangles per node\n",
"n_triangs = nx.triangles(G).values() \n",
"display(list(zip(G.nodes(), n_triangs))[:20])\n",
"\n",
"plot_network_distribution(G, n_triangs, mult = 100)"
]
},
{
"cell_type": "markdown",
"id": "4ec072d6",
"metadata": {},
"source": [
"## 3.3 Distribution of clustering"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f713c1f6",
"metadata": {},
"outputs": [],
"source": [
"n_clus = ...\n",
"display(list(zip(G.nodes(), n_clus))[:20])\n",
"\n",
"plot_network_distribution(G, n_clus, mult = 1000)"
]
},
{
"cell_type": "markdown",
"id": "a3a41d71",
"metadata": {},
"source": [
"### 3.4 Distribution of shortest paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c47c87af",
"metadata": {},
"outputs": [],
"source": [
"# Calculate all shortest paths (careful, this quickly becomes unfeasible)\n",
"path_lenghts = ...\n",
"\n",
"# Get results from a nested dictionary\n",
"path_lenghts = [list(_[1].values()) for _ in path_lenghts]\n",
"path_lenghts = [subitem for item in path_lenghts for subitem in item ]\n",
"\n",
"# Plot using sns.histplot\n",
"sns.histplot(path_lenghts)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "5d69dcf0",
"metadata": {},
"source": [
"## 3.5 Distribution of local degree assortativity"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c1f70d5",
"metadata": {},
"outputs": [],
"source": [
"attribute = [k for v,k in G.degree()]\n",
"print(attribute)\n",
"\n",
"# Defined in common_functions (based on Peel et al 2018)\n",
"local_assort = calculate_local_assort(G, attribute)\n",
"display(list(zip(G.nodes(), local_assort))[:20])\n",
"\n",
"plot_network_distribution(G, local_assort, mult = 1000)"
]
},
{
"cell_type": "markdown",
"id": "8e863fc5",
"metadata": {},
"source": [
"## 3.6 Degree distribution if ~ normal distribution (used in the lectures)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70f6f7ef",
"metadata": {},
"outputs": [],
"source": [
"# Don't change this\n",
"def plot_distributions(d, scale=\"log\"):\n",
" \"\"\"\n",
" Plot the PDF, CDF and CCDF.\n",
" \"\"\"\n",
" plt.figure(figsize=(12,4))\n",
" ax = plt.subplot(131)\n",
" plot_cdf(d, cum = False, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree)\", marker = \".\", scale=scale)\n",
" if scale == \"log\":\n",
" plt.plot([7E1, 5E2],[2.5E-3,1E-4],\"--\")\n",
" plt.title(\"PDF\")\n",
" ax = plt.subplot(132)\n",
" plot_cdf(d, cum = True, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\", marker=\"-\", scale=scale)\n",
" plt.title(\"CDF\")\n",
" ax = plt.subplot(133)\n",
" plot_cdf(d, compl = True, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\", marker=\".-\", scale=scale)\n",
" plt.title(\"CCDF\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d68e564",
"metadata": {},
"outputs": [],
"source": [
"## Degree distribution (random normally distributed data)\n",
"## Try changing the scale to \"log\"\n",
"d = np.random.binomial(500, p = 30/500, size = 10000)\n",
"plot_distributions(d, scale=\"linear\")\n"
]
},
{
"cell_type": "markdown",
"id": "782ac2e0",
"metadata": {},
"source": [
"## 3.7 Degree distribution in the wiki network"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02989329",
"metadata": {},
"outputs": [],
"source": [
"## Degree distribution (wiki network)\n",
"# Try the code using scale = \"linear\"\n",
"G_wiki = nx.read_edgelist(f\"{path_data}/wiki-Vote.txt\", create_using=nx.DiGraph())\n",
"d = [v for k,v in G_wiki.degree()]\n",
"plot_distributions(d, \"log\")\n",
"plt.xlim([100,1000])\n",
"plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29321b85",
"metadata": {},
"outputs": [],
"source": [
"# Twitter data\n",
"df = pd.read_csv(f\"{path_data}/ic2s2_netsci_3.tsv\", sep=\"\\t\")\n",
"G_twitter = nx.from_pandas_edgelist(df)\n",
"G_twitter.remove_edges_from(nx.selfloop_edges(G_twitter)) #remove self-edges\n",
"\n",
"d = [v for k,v in G_twitter.degree()]\n",
"plot_distributions(d, \"log\")\n",
"plt.xlim([100,1000])\n",
"plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43b2e02e",
"metadata": {},
"outputs": [],
"source": [
"# PPI data\n",
"path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n",
"G_ppi = nx.read_graphml(path_network, node_type=int)\n",
"\n",
"d = [v for k,v in G_ppi.degree()]\n",
"plot_distributions(d, \"log\")\n",
"\n",
"plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")"
]
},
{
"cell_type": "markdown",
"id": "429bea65",
"metadata": {},
"source": [
"# Exercise 4A \n",
"Is the network homophilic? (are people linked to people like them?). You can use the function `nx.assortativity.attribute_assortativity_coefficient`\n",
"\n",
"Solution: Shuffle the node labels\n",
"\n",
"__Goal__: Understand how a permutation test works (4A) and how reference models work (4B)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24e6abfa",
"metadata": {},
"outputs": [],
"source": [
"# Create labels\n",
"ns = ['Acciaiuoli', 'Medici', 'Castellani', 'Peruzzi', 'Strozzi', 'Barbadori', 'Ridolfi', 'Tornabuoni', 'Albizzi', 'Salviati', 'Pazzi', 'Bischeri', 'Guadagni', 'Ginori', 'Lamberteschi']\n",
"classes = [\"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\", \"m\", \"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\"]\n",
"nx.set_node_attributes(G, dict(zip(ns,classes)), \"classes\")\n",
"\n",
"# Plot\n",
"nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = True, edge_color = \"gray\", \n",
" node_color = [palette[0] if c == \"m\" else palette[1] for c in classes])\n",
" \n",
"\n",
"# calculate assortativity coefficient\n",
"assort = ...\n",
"print(assort)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "daf87a30",
"metadata": {},
"outputs": [],
"source": [
"# Permutenode labels\n",
"G2 = G.copy()\n",
"\n",
"# Randomize classes and calculate assortativity\n",
"iters = 10000\n",
"values = []\n",
"for i in range(iters):\n",
" # shuffle the classes\n",
" nx.set_node_attributes(G2, dict(zip(ns,np.random.permutation(classes))), \"classes\")\n",
" # calculate assortativity and keep in a list\n",
" values.append(nx.assortativity.attribute_assortativity_coefficient(G2, \"classes\"))\n",
"values = np.array(values)\n",
"\n",
"# Plot results\n",
"sns.histplot(values, binwidth=0.05)\n",
"plt.plot([assort,assort],[0,iters/10], \"--\", color=\"gray\")\n",
"\n",
"# p-value (probability that we would observe a value equal or more extreme to the one observed given \n",
"# that the null hyphotesis is true---i.e. the graph is the real graph and the links \n",
"# are not correlated with the classes\n",
"print(\"p-value\", 1-len(values[values