{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import polars as pl\n",
"import multiprocess as mp\n",
"\n",
"from sklearn.neighbors import BallTree\n",
"\n",
"\n",
"# Create synthetic data of the location of people (latitude/longitude of India, approx)\n",
"# For 40 M people, they will have on average ~15 connections, for 10 M people, 4 connections\n",
"np.random.seed(0)\n",
"N = 40_000_000\n",
"\n",
"lat = np.random.uniform(low=10, high=35, size=N)\n",
"lon = np.random.uniform(low=70, high=95, size=N)\n",
"\n",
"# Make it a pandas dataframe\n",
"df = pd.DataFrame({'id': range(N), 'lat': lat, 'lon': lon})"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Fit BallTree for fast queries\n",
"bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"euclidean\")\n",
"\n",
"# Approximate distance to match euclidean to geographical distance (1 decimal degree ~ 111 km), \n",
"# should create only a small error over small distances (but double check)\n",
"df[\"neighbor_id\"] = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=1/111) \n",
"\n",
"## This would be more precise but much much slower\n",
"# bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"haversine\")\n",
"# radius = 1 / 6371.0\n",
"# neighboors = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=radius) \n",
"\n",
"print(df.shape)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10520530"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum([len(x) for x in df[\"neighbor_id\"]])/1E6"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert to polars for efficiency\n",
"df = pl.LazyFrame(df[[\"id\", \"neighbor_id\"]])\n",
"\n",
"# Remove from neighor_id the id of column id\n",
"# Save to disk to allow for larger than memory operations\n",
"(df\n",
" .explode(\"neighbor_id\")\n",
" .filter(pl.col(\"id\") != pl.col(\"neighbor_id\"))\n",
" .sink_parquet(\"~/Downloads/neighbors.parquet\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set sorted makes the join faster, but ensure df is sorted by ID!!!!! (otherwise bad things happen)\n",
"df = pl.scan_parquet(\"~/Downloads/neighbors.parquet\").set_sorted(\"id\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
shape: (5, 2)id | neighbor_id |
---|
i64 | i64 |
0 | 1861004 |
0 | 1413696 |
1 | 678927 |
2 | 1056336 |
3 | 3011696 |
"
],
"text/plain": [
"shape: (5, 2)\n",
"┌─────┬─────────────┐\n",
"│ id ┆ neighbor_id │\n",
"│ --- ┆ --- │\n",
"│ i64 ┆ i64 │\n",
"╞═════╪═════════════╡\n",
"│ 0 ┆ 1861004 │\n",
"│ 0 ┆ 1413696 │\n",
"│ 1 ┆ 678927 │\n",
"│ 2 ┆ 1056336 │\n",
"│ 3 ┆ 3011696 │\n",
"└─────┴─────────────┘"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head().collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Join with itself to find peers of peers and save to parquet (lazy operations, does not need to fit in memory)\n",
"(df.join(df,\n",
" left_on=\"neighbor_id\", \n",
" right_on=\"id\", \n",
" how=\"inner\")\n",
" .sink_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pl.scan_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
shape: (5, 3)id | neighbor_id | neighbor_id_right |
---|
i64 | i64 | i64 |
34810 | 3 | 34810 |
16481 | 37 | 16481 |
20681 | 46 | 20681 |
16268 | 73 | 16268 |
32768 | 5040 | 32768 |
"
],
"text/plain": [
"shape: (5, 3)\n",
"┌───────┬─────────────┬───────────────────┐\n",
"│ id ┆ neighbor_id ┆ neighbor_id_right │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ i64 ┆ i64 │\n",
"╞═══════╪═════════════╪═══════════════════╡\n",
"│ 34810 ┆ 3 ┆ 34810 │\n",
"│ 16481 ┆ 37 ┆ 16481 │\n",
"│ 20681 ┆ 46 ┆ 20681 │\n",
"│ 16268 ┆ 73 ┆ 16268 │\n",
"│ 32768 ┆ 5040 ┆ 32768 │\n",
"└───────┴─────────────┴───────────────────┘"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Head of file\n",
"data.head().collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
"shape: (1, 1)\n",
"┌─────┐\n",
"│ len │\n",
"│ --- │\n",
"│ u32 │\n",
"╞═════╡\n",
"│ 630 │\n",
"└─────┘"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows\n",
"data.select(pl.len()).collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "st",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}