In [1]:
import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning

# Put the parent directory on the import path before the project imports below
# (presumably so a local checkout of nodevectors is picked up — TODO confirm).
sys.path.append(os.path.realpath('..'))

import nodevectors
import csrgraph as cg
from csrgraph import methods
from nodevectors.evaluation import link_pred
from nodevectors.evaluation import graph_eval

# UMAP to test (on pip)
import umap

# Blanket filter: with no category given this hides every warning for the rest
# of the kernel session, not only the performance warning mentioned above.
warnings.simplefilter("ignore")

def nx_node_weights(G, method, **kwargs):
    """
    Evaluate a networkX-style node scoring function and return an array.

    ``method(G, **kwargs)`` must return a mapping keyed by node. Each node's
    value is written at the array position given by the node itself, so the
    nodes are expected to be valid integer indices into an array of length
    ``len(G)``. Positions that no node refers to stay at zero.
    """
    weights = np.zeros(len(G))
    scores = method(G, **kwargs)
    node_ids = list(G.nodes)
    # One vectorised assignment: position `node` receives `scores[node]`.
    weights[node_ids] = [scores[node] for node in node_ids]
    return weights

# Data Availability

Data for these notebooks is available at https://github.com/VHRanger/Graph-Data.
Download it and point the graph-loading functions below at your local copy.

The data is in a different repo to avoid polluting the pip package.

In [2]:
#### CONFIG
# Fraction held out: used both as the edge testing ratio for link prediction
# and as the test split for label prediction.
TEST_SIZE = 0.2
# Label-prediction results go to this CSV; link-prediction results go to the
# same name prefixed with "linkpred_".
OUT_FILE = "email.csv"
# Seed handed to the label-prediction evaluation.
SEED = 42
# Embedding sizes to benchmark: the powers of two from 1 to 256.
ALL_COMPONENTS = [2 ** power for power in range(9)]

In [3]:
#### GRAPHS
#### Uncomment exactly one loader below to choose which graph to run evaluation on

# The loaders below assign `labels`, `mlabels`, or neither. Reset both first so
# that label data from a previously selected graph cannot survive in the kernel
# and be evaluated against the newly selected graph.
labels = None
mlabels = None

#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
G, labels = graph_eval.make_cluster_graph(
    n_nodes=820,
    n_clusters=18,
    connections=1000,
    drop_pct=0.5,
)
# G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")


#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")

In [4]:
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
G_train, testing_pos_edges = link_pred.split_train_test_graph(
    G, testing_ratio=TEST_SIZE
)

#### Work out which kind of labels (if any) came with the selected graph.
# A label kind counts as absent only when its variable was never assigned
# (NameError) or does not carry the expected attribute (AttributeError, which
# also covers None). Any other failure is a real error and is left to surface
# instead of being silently reported as "No Labels".
_LABELS_ABSENT = (NameError, AttributeError)

# Start from a clean slate so a target left over from an earlier run is never
# reused for the current graph.
y = None
HAS_LABELS = False
try:
    # Single-label case: one class per node.
    y = labels.label
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
except _LABELS_ABSENT:
    try:  # Multilabels
        # Binarize the per-node label collections into an indicator matrix.
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
    except _LABELS_ABSENT:  # No Labels
        y = None
        print("No Labels")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")

clusters: 18
Nodes: 820
Edges: 9658
connected: True


In [5]:
### GGVEC ####
# For every embedding size: fit GGVec on the training graph and score link
# prediction, then (when labels exist) fit on the full graph and score label
# prediction. Each result is appended as one row to its CSV file.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        tol=0.05,
        tol_samples=75,
        max_epoch=6_000,
        learning_rate=0.05,
        negative_ratio=0.33,
        exponent=0.33,
        verbose=True,
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    time.sleep(0.3)  # pause kept from the original cell, now outside the timed region
    fit_start = time.perf_counter()
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    # Time only the embedding fit, so evaluation cost and sleeps do not leak
    # into the recorded benchmark figure.
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'ggvec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'ggvec'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )



-------N: 1--------


Loss: 0.1498	: 1%|▏ | 88/6000 [00:02<03:04, 32.13it/s] 


Converged! Loss: 0.1484
Time: 3.0665
Link Prediction:
	(logit) AUC-ROC: 0.517, AUC-PR: 0.463, Acc: 0.509, F1: 0.488
	(lgbm) AUC-ROC: 0.772, AUC-PR: 0.700, Acc: 0.735, F1: 0.784


Loss: 0.1475	: 1%|▏ | 86/6000 [00:00<00:10, 562.90it/s]


Converged! Loss: 0.1490
Label Prediction:
	(logit) Acc: 0.091, F1 micro: 0.091, F1 macro: 0.091
	(lgbm) Acc: 0.165, F1 micro: 0.165, F1 macro: 0.165
MI: 0.37, RAND 0.32, FM: 0.32


In [6]:
### GGVEC - 2 ####
# Same benchmark as the first GGVec cell but with order=2 and a shorter,
# looser training schedule. One CSV row is appended per embedding size and task.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=2,
        tol=0.1,
        tol_samples=10,
        max_epoch=500,
        learning_rate=0.1,
        negative_ratio=0.1,
        exponent=0.33,
        verbose=True,
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    time.sleep(0.3)  # pause kept from the original cell, now outside the timed region
    fit_start = time.perf_counter()
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    # Time only the embedding fit, so evaluation cost and sleeps do not leak
    # into the recorded benchmark figure.
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'ggvec2'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'ggvec2'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )



-------N: 1--------


Loss: 0.0341	: 3%|▎ | 15/500 [00:00<00:01, 265.23it/s]


Converged! Loss: 0.0339
Time: 1.2681
Link Prediction:
	(logit) AUC-ROC: 0.466, AUC-PR: 0.436, Acc: 0.486, F1: 0.504
	(lgbm) AUC-ROC: 0.819, AUC-PR: 0.776, Acc: 0.760, F1: 0.783


Loss: 0.0335	: 3%|▎ | 16/500 [00:00<00:01, 257.96it/s]


Converged! Loss: 0.0336
Label Prediction:
	(logit) Acc: 0.177, F1 micro: 0.177, F1 macro: 0.177
	(lgbm) Acc: 0.293, F1 micro: 0.293, F1 macro: 0.293
MI: 0.31, RAND 0.30, FM: 0.30


In [7]:
### N2V ####
# For every embedding size: fit Node2Vec on the training graph and score link
# prediction, then (when labels exist) fit on the full graph and score label
# prediction. Each result is appended as one row to its CSV file.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    n2v_params = dict(
        n_components=N_COMPONENTS,
        epochs=20,
        walklen=60,
        return_weight=1.,
        neighbor_weight=1.,
        # NOTE(review): these keys are presumably forwarded to gensim's
        # Word2Vec; "iter" is the pre-4.0 name of its epoch count — confirm
        # against the installed gensim version.
        w2vparams={
            "window": 3,
            "negative": 5,
            "iter": 2,
            "batch_words": 128,
        },
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    fit_start = time.perf_counter()
    w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
    # Time only the embedding fit, so evaluation cost and sleeps do not leak
    # into the recorded benchmark figure.
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'node2vec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'node2vec'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )



-------N: 1--------
Making walks... Done, T=1.21
Mapping Walk Names... Done, T=1.01
Training W2V... Done, T=3.06
Time: 5.3080
Link Prediction:
	(logit) AUC-ROC: 0.520, AUC-PR: 0.471, Acc: 0.510, F1: 0.505
	(lgbm) AUC-ROC: 0.781, AUC-PR: 0.729, Acc: 0.718, F1: 0.759
Making walks... Done, T=0.13
Mapping Walk Names... Done, T=1.16
Training W2V... Done, T=2.81
Label Prediction:
	(logit) Acc: 0.110, F1 micro: 0.110, F1 macro: 0.110
	(lgbm) Acc: 0.183, F1 micro: 0.183, F1 macro: 0.183
MI: -0.00, RAND 0.23, FM: 0.23


In [8]:
### ProNE ####
# For every embedding size: fit ProNE on the training graph and score link
# prediction, then (when labels exist) fit on the full graph and score label
# prediction. Each result is appended as one row to its CSV file.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    pne_params = dict(
        n_components=N_COMPONENTS,
        step=5,
        mu=0.2,
        theta=0.5,
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    fit_start = time.perf_counter()
    w_train = nodevectors.ProNE(**pne_params).fit_transform(G_train)
    # Time only the embedding fit, so evaluation cost and sleeps do not leak
    # into the recorded benchmark figure.
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'prone'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    w = nodevectors.ProNE(**pne_params).fit_transform(G)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'prone'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )



-------N: 1--------
Time: 0.0400
Link Prediction:
	(logit) AUC-ROC: 0.555, AUC-PR: 0.552, Acc: 0.592, F1: 0.656
	(lgbm) AUC-ROC: 0.720, AUC-PR: 0.653, Acc: 0.675, F1: 0.752
Label Prediction:
	(logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024
	(lgbm) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024
MI: -0.00, RAND 0.23, FM: 0.23


In [9]:
### GRaRep ####
# For every embedding size: fit GraRep on the training graph and score link
# prediction, then (when labels exist) fit on the full graph and score label
# prediction. Each result is appended as one row to its CSV file.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    grarep_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        # Seeded from the notebook-wide SEED instead of a separate literal so
        # there is a single place to change it.
        embedder=TruncatedSVD(
            n_iter=10,
            random_state=SEED),
        # Collapse the stacked inputs by summing over the first axis.
        merger=(lambda x: np.sum(x, axis=0)),
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    fit_start = time.perf_counter()
    w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
    # Time only the embedding fit, so evaluation cost and sleeps do not leak
    # into the recorded benchmark figure.
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'grarep'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'grarep'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )


100%|██████████| 1/1 [00:00<00:00, 37.95it/s]



-------N: 1--------
Time: 0.0507
Link Prediction:





	(logit) AUC-ROC: 0.519, AUC-PR: 0.572, Acc: 0.517, F1: 0.540
	(lgbm) AUC-ROC: 0.895, AUC-PR: 0.865, Acc: 0.821, F1: 0.829


100%|██████████| 1/1 [00:00<00:00, 54.70it/s]


Label Prediction:
	(logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024
	(lgbm) Acc: 0.354, F1 micro: 0.354, F1 macro: 0.354
MI: -0.00, RAND 0.23, FM: 0.23


In [10]:
### GLoVe with random walks ###
# For every embedding size: resample the training graph with random walks, fit
# GloVe on the result and score link prediction, then (when labels exist) do
# the same on the full graph and score label prediction. Each result is
# appended as one row to its CSV file.
LPRED_FILE = "linkpred_" + OUT_FILE
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    glove_params = dict(
        n_components=N_COMPONENTS,
        tol=0.001,
        max_epoch=6_000,
        learning_rate=0.01,
        max_loss=10.,
        max_count=50,
        exponent=0.5,
    )

    # ---- Link prediction (embedding fitted on the training graph) ----
    # The random-walk resampling is part of this method's pipeline, so it is
    # timed together with the fit; evaluation cost and sleeps are excluded.
    fit_start = time.perf_counter()
    wg = cg.csrgraph(G_train).random_walk_resample(walklen=7, epochs=30)
    w_train = nodevectors.Glove(**glove_params).fit_transform(wg)
    fit_seconds = time.perf_counter() - fit_start
    print(f"Time: {fit_seconds:.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)

    lpred['algorithm'] = 'glove'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{fit_seconds:.1f}"
    lpred_row = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    # Append; the header is written only when the file is being created.
    lpred_row.to_csv(
        LPRED_FILE,
        mode='a',
        header=not os.path.isfile(LPRED_FILE),
        float_format='%.3f',
    )

    # ---- Label prediction (embedding fitted on the full graph) ----
    if not HAS_LABELS:
        # The selected graph has no labels, so there is no target to score.
        continue

    fit_start = time.perf_counter()
    wg = cg.csrgraph(G).random_walk_resample(walklen=7, epochs=30)
    w = nodevectors.Glove(**glove_params).fit_transform(wg)
    fit_seconds = time.perf_counter() - fit_start
    labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

    labelpred['algorithm'] = 'glove'
    labelpred['dim'] = N_COMPONENTS
    labelpred['time'] = f"{fit_seconds:.1f}"
    labelpred_row = pd.DataFrame([pd.Series(labelpred)])
    time.sleep(0.3)
    labelpred_row.to_csv(
        OUT_FILE,
        mode='a',
        header=not os.path.isfile(OUT_FILE),
        float_format='%.3f',
    )




-------N: 1--------


 1%|▏ | 83/6000 [00:02<03:00, 32.78it/s] 


Time: 4.1126
Link Prediction:
	(logit) AUC-ROC: 0.502, AUC-PR: 0.449, Acc: 0.504, F1: 0.502
	(lgbm) AUC-ROC: 0.797, AUC-PR: 0.730, Acc: 0.756, F1: 0.797


 2%|▏ | 116/6000 [00:00<00:40, 144.66it/s]


Label Prediction:
	(logit) Acc: 0.189, F1 micro: 0.189, F1 macro: 0.189
	(lgbm) Acc: 0.244, F1 micro: 0.244, F1 macro: 0.244
MI: 0.34, RAND 0.31, FM: 0.31
