{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import polars as pl\n", "import multiprocess as mp\n", "\n", "from sklearn.neighbors import BallTree\n", "\n", "\n", "# Create synthetic data of the location of people (latitude/longitude of India, approx)\n", "# For 40 M people, they will have on average ~15 connections, for 10 M people, 4 connections\n", "np.random.seed(0)\n", "N = 40_000_000\n", "\n", "lat = np.random.uniform(low=10, high=35, size=N)\n", "lon = np.random.uniform(low=70, high=95, size=N)\n", "\n", "# Make it a pandas dataframe\n", "df = pd.DataFrame({'id': range(N), 'lat': lat, 'lon': lon})" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Fit BallTree for fast queries\n", "bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"euclidean\")\n", "\n", "# Approximate distance to match euclidean to geographical distance (1 decimal degree ~ 111 km), \n", "# should create only a small error over small distances (but double check)\n", "df[\"neighbor_id\"] = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=1/111) \n", "\n", "## This would be more precise but much much slower\n", "# bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"haversine\")\n", "# radius = 1 / 6371.0\n", "# neighboors = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=radius) \n", "\n", "print(df.shape)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10520530" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum([len(x) for x in df[\"neighbor_id\"]])/1E6" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert to polars for efficiency\n", "df = pl.LazyFrame(df[[\"id\", \"neighbor_id\"]])\n", "\n", "# Remove from neighor_id the id of column id\n", "# Save to disk to allow for larger than memory operations\n", "(df\n", " .explode(\"neighbor_id\")\n", " .filter(pl.col(\"id\") != pl.col(\"neighbor_id\"))\n", " .sink_parquet(\"~/Downloads/neighbors.parquet\")\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set sorted makes the join faster, but ensure df is sorted by ID!!!!! (otherwise bad things happen)\n", "df = pl.scan_parquet(\"~/Downloads/neighbors.parquet\").set_sorted(\"id\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 2)
idneighbor_id
i64i64
01861004
01413696
1678927
21056336
33011696
" ], "text/plain": [ "shape: (5, 2)\n", "┌─────┬─────────────┐\n", "│ id ┆ neighbor_id │\n", "│ --- ┆ --- │\n", "│ i64 ┆ i64 │\n", "╞═════╪═════════════╡\n", "│ 0 ┆ 1861004 │\n", "│ 0 ┆ 1413696 │\n", "│ 1 ┆ 678927 │\n", "│ 2 ┆ 1056336 │\n", "│ 3 ┆ 3011696 │\n", "└─────┴─────────────┘" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head().collect()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Join with itself to find peers of peers and save to parquet (lazy operations, does not need to fit in memory)\n", "(df.join(df,\n", " left_on=\"neighbor_id\", \n", " right_on=\"id\", \n", " how=\"inner\")\n", " .sink_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = pl.scan_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 3)
idneighbor_idneighbor_id_right
i64i64i64
34810334810
164813716481
206814620681
162687316268
32768504032768
" ], "text/plain": [ "shape: (5, 3)\n", "┌───────┬─────────────┬───────────────────┐\n", "│ id ┆ neighbor_id ┆ neighbor_id_right │\n", "│ --- ┆ --- ┆ --- │\n", "│ i64 ┆ i64 ┆ i64 │\n", "╞═══════╪═════════════╪═══════════════════╡\n", "│ 34810 ┆ 3 ┆ 34810 │\n", "│ 16481 ┆ 37 ┆ 16481 │\n", "│ 20681 ┆ 46 ┆ 20681 │\n", "│ 16268 ┆ 73 ┆ 16268 │\n", "│ 32768 ┆ 5040 ┆ 32768 │\n", "└───────┴─────────────┴───────────────────┘" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Head of file\n", "data.head().collect()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (1, 1)
len
u32
630
" ], "text/plain": [ "shape: (1, 1)\n", "┌─────┐\n", "│ len │\n", "│ --- │\n", "│ u32 │\n", "╞═════╡\n", "│ 630 │\n", "└─────┘" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows\n", "data.select(pl.len()).collect()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "st", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }