{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data\\hookup_0.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e526fbdfd2d94cf5a7d4bdb5f64c9bd9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_1.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "df60df57087b45e7bedeb64e2116006c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_2.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5fb4e2d63ca9450cbd3597c07a8422ed", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_3.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "13669471840940bdb65ac87e60928a4f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_4.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3aea7d7ed1e149d0a799fbb653007c36", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { 
"name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_5.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5b644aa6fd8447a7bc721061bed7e4b3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_6.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9513417533404f36abfa8c8ca80de00f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_7.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ca99ae1d68e6424caaed906d43053659", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_8.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5f24edb65c2646eba3620f45fb417d76", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_9.pa.pq\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5f7229b919794aa0a034f037c9b733e7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "data\\hookup_10.pa.pq\n" ] }, 
import concurrent.futures
import hashlib
from itertools import count
from pathlib import Path
from random import choices, randint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Endpoint that receives the questionnaire form and answers with an HTML page.
base = 'https://hookup-qubsu.org/home/GetResults'

# Category names a questionnaire can tick; gen_q picks a random subset.
categories = [
    "Activism",
    "Community",
    "Competing",
    "Culture",
    "Democracy",
    "Gaming",
    "Learning",
    "MakeFriends",
    "Network",
    "Outdoors",
    "Perform",
    "Stayactive"
]

# Seconds to wait on the server before giving up on one request. Without a
# timeout a stalled connection would block its worker thread indefinitely.
REQUEST_TIMEOUT = 30


def gen_q():
    """Generate one random questionnaire payload.

    Returns:
        tuple: ``(h, q, _c)`` where ``q`` is the form dict (a random subset of
        ``categories`` plus four random 0-10 answers encoded as strings),
        ``h`` is the MD5 digest of ``str(q)`` and ``_c`` holds the positions
        of the chosen categories within ``categories``.
    """
    # The subset size is drawn from a normal distribution centred on
    # (len(categories) - 1) // 2. Clamp at zero: a negative size would be
    # read as a "drop the last k" slice and select almost every category.
    n_pick = max(0, int(np.random.normal((len(categories) - 1) // 2)))
    # Convert numpy string scalars to plain str so str(q) (and therefore the
    # hash) does not depend on how numpy chooses to repr its scalars.
    c = [str(name) for name in np.random.permutation(categories)[:n_pick]]
    _c = [categories.index(k) for k in c]
    q = {
        "Categories": c,
        "Budget": str(randint(0, 10)),
        "Time": str(randint(0, 10)),
        "Travel": str(randint(0, 10)),
        "Joined": str(randint(0, 10))
    }
    h = hashlib.md5(str(q).encode('utf-8')).digest()
    return h, q, _c


def get_clubs(q, timeout=REQUEST_TIMEOUT):
    """POST a questionnaire to ``base`` and extract the clubs from the reply.

    Args:
        q: Form fields to send as the request body.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        tuple: ``(clubs, duration)`` where ``clubs`` is the text of every
        ``h2`` that is a direct child of ``div.answers`` in the returned HTML
        and ``duration`` is ``response.elapsed`` expressed in seconds.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    response = requests.post(base, data=q, timeout=timeout)
    # An error page contains no matching headings; fail loudly instead of
    # recording it as a genuine "no clubs" answer.
    response.raise_for_status()
    duration = response.elapsed.total_seconds()
    soup = BeautifulSoup(response.content, 'html.parser')
    clubs = [heading.get_text() for heading in soup.select('div.answers > h2')]
    return clubs, duration


def get_random_result():
    """Submit one random questionnaire and return it with the response attached.

    Returns:
        dict: The payload from ``gen_q`` extended with ``'Recommended'`` (list
        of club names) and ``'Duration'`` (seconds) from ``get_clubs``.
    """
    # The hash and the category indices are not needed for the stored record.
    _, q, _ = gen_q()
    q['Recommended'], q['Duration'] = get_clubs(q)
    return q


batch_size = int(1e4)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    for i in count():  # run forever
        # Skip past files written by this or earlier runs.
        while (dest := Path(f'data/hookup_{i}.pa.pq')).exists():
            i += 1
        # Create the output directory up front so the write cannot fail on a
        # missing folder after the whole batch has already been fetched.
        dest.parent.mkdir(parents=True, exist_ok=True)
        print(dest)
        results = []
        failed = 0
        futures = {executor.submit(get_random_result) for _ in range(batch_size)}
        for future in tqdm(concurrent.futures.as_completed(futures), total=batch_size):
            try:
                results.append(future.result())
            except requests.RequestException:
                # One bad request must not throw away the rest of the batch.
                failed += 1
        if failed:
            print(f'{failed} of {batch_size} requests failed and were skipped')
        if not results:
            # Nothing succeeded (e.g. the service is down): stop instead of
            # looping endlessly against a dead endpoint.
            raise RuntimeError(f'all {batch_size} requests failed; aborting')
        pd.DataFrame(results).to_parquet(dest, engine='pyarrow')