{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Which files are the most popular by downloads * size?\n", "====\n", "\n", "Query most popular files.\n", "\n", "```\n", "SELECT * FROM (\n", " SELECT url, count(*) as downloads FROM `the-psf.pypi.file_downloads` \n", " WHERE DATE(timestamp) = \"2020-04-16\" GROUP by url\n", ") ORDER BY downloads DESC\n", "LIMIT 1000\n", "```\n", "\n", "Download query as json. Attach Content-Length to each file.\n", "\n", "```\n", "prefix = 'https://files.pythonhosted.org'\n", "for item in data: \n", " url = item['url'] \n", " content = session.head(prefix + url) \n", " length = content.headers['Content-Length'] \n", " item['length'] = int(length)\n", " item['downloads'] = int(item['downloads'])\n", "```" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import json\n", "data = json.load(open('pypipopular.json'))\n", "for item in data:\n", " item['aggregate'] = item['length'] * item['downloads']\n", "data.sort(key=lambda x: -x['downloads'])\n", "for i, item in enumerate(data): item['rank'] = i+1 # ranked by number of downloads\n", "data.sort(key=lambda x: -x['aggregate']) # sorted by downloads * file size" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "rows = []\n", "for item in data[:128]:\n", " rows.append((item['url'].split('/')[-1], \"{:.0f}\".format(item['aggregate']/2**30), item['rank']))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
fileGiB/daypopularity
pyspark-2.4.5.tar.gz15345369
tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl15083643
tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl9552905
tensorflow-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl8004298
botocore-1.15.39-py2.py3-none-any.whl776912
numpy-1.18.2-cp36-cp36m-manylinux1_x86_64.whl700160
scipy-1.4.1-cp36-cp36m-manylinux1_x86_64.whl4243177
xgboost-1.0.2-py3-none-manylinux1_x86_64.whl3488705
scipy-1.1.0-cp27-cp27mu-manylinux1_x86_64.whl3200269
numpy-1.18.2-cp37-cp37m-manylinux1_x86_64.whl3096185
xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl3054939
scipy-1.4.1-cp37-cp37m-manylinux1_x86_64.whl2426297
numpy-1.16.6-cp27-cp27mu-manylinux1_x86_64.whl2376202
pyarrow-0.16.0-cp36-cp36m-manylinux1_x86_64.whl2185659
gensim-3.6.0-cp27-cp27mu-manylinux1_x86_64.whl2137310
botocore-1.15.40-py2.py3-none-any.whl208063
scipy-1.2.3-cp27-cp27mu-manylinux1_x86_64.whl2064329
awscli-1.18.39-py2.py3-none-any.whl197131
pip-20.0.2-py2.py3-none-any.whl182213
ansible-2.9.6.tar.gz1551254
numpy-1.18.2-cp35-cp35m-manylinux1_x86_64.whl1432364
Babel-2.8.0-py2.py3-none-any.whl1417171
pyarrow-0.16.0-cp36-cp36m-manylinux2014_x86_64.whl1393921
pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl1383206
botocore-1.13.50-py2.py3-none-any.whl1300118
matplotlib-3.2.1-cp36-cp36m-manylinux1_x86_64.whl1246278
botocore-1.12.253-py2.py3-none-any.whl1235124
notebook-5.7.8-py2.py3-none-any.whl1121232
virtualenv-20.0.17-py2.py3-none-any.whl1059113
scipy-1.4.1-cp35-cp35m-manylinux1_x86_64.whl1012603
pandas-1.0.3-cp37-cp37m-manylinux1_x86_64.whl975291
notebook-6.0.3-py3-none-any.whl903296
pycryptodomex-3.9.7-cp37-cp37m-manylinux1_x86_64.whl867413
matplotlib-3.2.1-cp37-cp37m-manylinux1_x86_64.whl864371
pandas-0.24.2-cp27-cp27mu-manylinux1_x86_64.whl801340
mypy-0.770-cp37-cp37m-manylinux1_x86_64.whl772644
grpcio-1.28.1.tar.gz767597
numpy-1.16.4-cp35-cp35m-manylinux1_x86_64.whl752548
cryptography-2.9-cp35-abi3-manylinux2010_x86_64.whl71788
pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl680380
mlflow-1.7.2-py3-none-any.whl636590
virtualenv-20.0.18-py2.py3-none-any.whl635207
awscli-1.18.40-py2.py3-none-any.whl558143
pytz-2019.3-py2.py3-none-any.whl54615
setuptools-46.1.3-py3-none-any.whl54418
scikit_learn-0.22.2.post1-cp36-cp36m-manylinux1_x86_64.whl542347
matplotlib-2.2.5-cp27-cp27mu-manylinux1_x86_64.whl540562
numpy-1.18.2-cp38-cp38-manylinux1_x86_64.whl535819
pycryptodome-3.9.7-cp37-cp37m-manylinux1_x86_64.whl492642
snowflake_connector_python-2.2.4-cp37-cp37m-manylinux2010_x86_64.whl491588
docutils-0.15.2-py2-none-any.whl47422
plotly-4.6.0-py2.py3-none-any.whl472391
docutils-0.15.2-py3-none-any.whl46823
python_dateutil-2.8.1-py2.py3-none-any.whl4474
widgetsnbextension-3.5.1-py2.py3-none-any.whl445130
scikit_learn-0.20.3-cp36-cp36m-manylinux1_x86_64.whl445331
future-0.18.2.tar.gz42839
numpy-1.17.4-cp35-cp35m-manylinux1_x86_64.whl424938
numpy-1.17.4-cp37-cp37m-manylinux1_x86_64.whl412959
numpy-1.18.1-cp37-cp37m-manylinux1_x86_64.whl405984
SQLAlchemy-1.3.16.tar.gz391404
SQLAlchemy-1.3.13.tar.gz389403
awscli-1.16.314-py2.py3-none-any.whl384204
boto-2.49.0-py2.py3-none-any.whl37785
PyYAML-5.3.1.tar.gz36311
tensorboard-2.0.2-py2-none-any.whl349304
cryptography-2.9-cp35-abi3-manylinux1_x86_64.whl345228
pipenv-2018.11.26-py3-none-any.whl344389
lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl337435
pycryptodomex-3.9.7-cp36-cp36m-manylinux1_x86_64.whl335857
Django-3.0.5-py3-none-any.whl334538
statsmodels-0.11.1-cp36-cp36m-manylinux1_x86_64.whl331611
lxml-4.5.0-cp37-cp37m-manylinux1_x86_64.whl323445
certifi-2020.4.5.1-py2.py3-none-any.whl3123
pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl299760
grpcio-1.28.1-cp36-cp36m-manylinux2010_x86_64.whl294272
scikit_learn-0.22.2.post1-cp37-cp37m-manylinux1_x86_64.whl290572
Pygments-2.6.1-py3-none-any.whl28971
pandas-0.24.2-cp36-cp36m-manylinux1_x86_64.whl273793
chardet-3.0.4-py2.py3-none-any.whl2722
pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl271815
pandas-0.23.4-cp37-cp37m-manylinux1_x86_64.whl271722
matplotlib-3.0.3-cp35-cp35m-manylinux1_x86_64.whl262980
sphinx_rtd_theme-0.4.3-py2.py3-none-any.whl252596
Pillow-6.2.2-cp27-cp27mu-manylinux1_x86_64.whl248241
tensorboard-2.1.1-py3-none-any.whl244412
botocore-1.14.17-py2.py3-none-any.whl244569
setuptools-44.1.0-py2.py3-none-any.whl24354
h5py-2.10.0-cp36-cp36m-manylinux1_x86_64.whl237333
mysql-connector-2.2.9.tar.gz2361000
protobuf-3.11.3-cp36-cp36m-manylinux1_x86_64.whl226151
cryptography-2.9-cp27-cp27mu-manylinux2010_x86_64.whl225328
pycountry-19.8.18.tar.gz221920
protobuf-3.11.3-cp37-cp37m-manylinux1_x86_64.whl212164
urllib3-1.25.8-py2.py3-none-any.whl2107
scikit_learn-0.20.4-cp27-cp27mu-manylinux1_x86_64.whl209620
networkx-2.4-py3-none-any.whl208222
tensorflow_data_validation-0.15.0-cp27-cp27mu-manylinux2010_x86_64.whl208313
lxml-4.5.0-cp27-cp27mu-manylinux1_x86_64.whl204641
distlib-0.3.0.zip19564
cryptography-2.9-cp27-cp27mu-manylinux1_x86_64.whl182387
cryptography-2.8-cp34-abi3-manylinux2010_x86_64.whl177344
Django-2.2.12-py3-none-any.whl176883
scikit_learn-0.22.2.post1-cp35-cp35m-manylinux1_x86_64.whl175838
numpy-1.18.2.zip174699
botocore-1.13.20-py2.py3-none-any.whl171714
tfx_bsl-0.15.3-cp27-cp27mu-manylinux2010_x86_64.whl169314
tensorboard-1.15.0-py3-none-any.whl169533
Pygments-2.5.2-py2.py3-none-any.whl168141
grpcio-1.28.1-cp37-cp37m-manylinux2010_x86_64.whl168432
jedi-0.17.0-py2.py3-none-any.whl164194
protobuf-3.11.3-cp35-cp35m-manylinux1_x86_64.whl163223
protobuf-3.11.3-cp27-cp27mu-manylinux1_x86_64.whl162225
scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl161864
protobuf-3.6.0-cp36-cp36m-manylinux1_x86_64.whl149950
docutils-0.16-py2.py3-none-any.whl14689
netaddr-0.7.19-py2.py3-none-any.whl144316
lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl142248
Werkzeug-1.0.1-py2.py3-none-any.whl14247
tensorboard-1.14.0-py3-none-any.whl140541
pyzmq-19.0.0-cp27-cp27mu-manylinux1_x86_64.whl136221
SQLAlchemy-1.3.15.tar.gz135915
Sphinx-3.0.1-py3-none-any.whl133518
joblib-0.14.1-py2.py3-none-any.whl13351
h5py-2.10.0-cp27-cp27mu-manylinux1_x86_64.whl132512
imageio-2.8.0-py3-none-any.whl129604
Pillow-7.1.1-cp37-cp37m-manylinux1_x86_64.whl127429
tensorflow_model_analysis-0.15.4-py2-none-any.whl126312
" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table = (\"<table>\" + \"<tr><th>file</th><th>GiB/day</th><th>popularity</th></tr>\" + \"\\n\".join(\"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(*row) for row in rows) + \"</table>\")\n", "import IPython.display\n", "IPython.display.HTML(table)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }