{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import polars as pl\n",
    "import multiprocess as mp\n",
    "\n",
    "from sklearn.neighbors import BallTree\n",
    "\n",
    "\n",
    "# Create synthetic data of the location of people (latitude/longitude of India, approx)\n",
    "# For 40 M people, they will have on average ~15 connections, for 10 M people, 4 connections\n",
    "np.random.seed(0)\n",
    "N = 40_000_000\n",
    "\n",
    "lat = np.random.uniform(low=10, high=35, size=N)\n",
    "lon = np.random.uniform(low=70, high=95, size=N)\n",
    "\n",
    "# Make it a pandas dataframe\n",
    "df = pd.DataFrame({'id': range(N), 'lat': lat, 'lon': lon})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit BallTree for fast queries\n",
    "bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"euclidean\")\n",
    "\n",
    "# Approximate distance to match euclidean to geographical distance (1 decimal degree ~ 111 km), \n",
    "# should create only a small error over small distances (but double check)\n",
    "df[\"neighbor_id\"] = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=1/111) \n",
    "\n",
    "## This would be more precise but much much slower\n",
    "# bt = BallTree(df[[\"lat\", \"lon\"]].values, metric=\"haversine\")\n",
    "# radius = 1 / 6371.0\n",
    "# neighboors = bt.query_radius(df[[\"lat\", \"lon\"]].values, r=radius) \n",
    "\n",
    "print(df.shape)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10520530"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum([len(x) for x in df[\"neighbor_id\"]])/1E6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to polars for efficiency\n",
    "df = pl.LazyFrame(df[[\"id\", \"neighbor_id\"]])\n",
    "\n",
    "# Remove from neighor_id the id of column id\n",
    "# Save to disk to allow for larger than memory operations\n",
    "(df\n",
    " .explode(\"neighbor_id\")\n",
    " .filter(pl.col(\"id\") != pl.col(\"neighbor_id\"))\n",
    " .sink_parquet(\"~/Downloads/neighbors.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set sorted makes the join faster, but ensure df is sorted by ID!!!!! (otherwise bad things happen)\n",
    "df = pl.scan_parquet(\"~/Downloads/neighbors.parquet\").set_sorted(\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>id</th><th>neighbor_id</th></tr><tr><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>0</td><td>1861004</td></tr><tr><td>0</td><td>1413696</td></tr><tr><td>1</td><td>678927</td></tr><tr><td>2</td><td>1056336</td></tr><tr><td>3</td><td>3011696</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 2)\n",
       "┌─────┬─────────────┐\n",
       "│ id  ┆ neighbor_id │\n",
       "│ --- ┆ ---         │\n",
       "│ i64 ┆ i64         │\n",
       "╞═════╪═════════════╡\n",
       "│ 0   ┆ 1861004     │\n",
       "│ 0   ┆ 1413696     │\n",
       "│ 1   ┆ 678927      │\n",
       "│ 2   ┆ 1056336     │\n",
       "│ 3   ┆ 3011696     │\n",
       "└─────┴─────────────┘"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join with itself to find peers of peers and save to parquet (lazy operations, does not need to fit in memory)\n",
    "(df.join(df,\n",
    "        left_on=\"neighbor_id\", \n",
    "        right_on=\"id\", \n",
    "        how=\"inner\")\n",
    "    .sink_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pl.scan_parquet(\"~/Downloads/neighbors_of_neighbors.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>id</th><th>neighbor_id</th><th>neighbor_id_right</th></tr><tr><td>i64</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>34810</td><td>3</td><td>34810</td></tr><tr><td>16481</td><td>37</td><td>16481</td></tr><tr><td>20681</td><td>46</td><td>20681</td></tr><tr><td>16268</td><td>73</td><td>16268</td></tr><tr><td>32768</td><td>5040</td><td>32768</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 3)\n",
       "┌───────┬─────────────┬───────────────────┐\n",
       "│ id    ┆ neighbor_id ┆ neighbor_id_right │\n",
       "│ ---   ┆ ---         ┆ ---               │\n",
       "│ i64   ┆ i64         ┆ i64               │\n",
       "╞═══════╪═════════════╪═══════════════════╡\n",
       "│ 34810 ┆ 3           ┆ 34810             │\n",
       "│ 16481 ┆ 37          ┆ 16481             │\n",
       "│ 20681 ┆ 46          ┆ 20681             │\n",
       "│ 16268 ┆ 73          ┆ 16268             │\n",
       "│ 32768 ┆ 5040        ┆ 32768             │\n",
       "└───────┴─────────────┴───────────────────┘"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Head of file\n",
    "data.head().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (1, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>len</th></tr><tr><td>u32</td></tr></thead><tbody><tr><td>630</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (1, 1)\n",
       "┌─────┐\n",
       "│ len │\n",
       "│ --- │\n",
       "│ u32 │\n",
       "╞═════╡\n",
       "│ 630 │\n",
       "└─────┘"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows\n",
    "data.select(pl.len()).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "st",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}