{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Which files are the most popular by downloads * size?\n",
"====\n",
"\n",
"Query the 1000 most-downloaded files for a single day from the PyPI download statistics table (BigQuery):\n",
"\n",
"```\n",
"SELECT * FROM (\n",
" SELECT url, count(*) as downloads FROM `the-psf.pypi.file_downloads` \n",
" WHERE DATE(timestamp) = \"2020-04-16\" GROUP by url\n",
") ORDER BY downloads DESC\n",
"LIMIT 1000\n",
"```\n",
"\n",
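"Download the query result as JSON, then attach each file's Content-Length (its size in bytes) using an HTTP HEAD request. Here `data` is the parsed JSON result and `session` is assumed to be a `requests.Session()`:\n",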
"\n",
"```\n",
"prefix = 'https://files.pythonhosted.org'\n",
"for item in data: \n",
" url = item['url'] \n",
" content = session.head(prefix + url) \n",
" length = content.headers['Content-Length'] \n",
" item['length'] = int(length)\n",
" item['downloads'] = int(item['downloads'])\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Each record is expected to carry 'url', 'downloads' and 'length'\n",
"# (the Content-Length attached in the preparation step described above).\n",
"with open('pypipopular.json') as f:  # context manager closes the file handle\n",
"    data = json.load(f)\n",
"\n",
"for item in data:\n",
"    # total bytes served for this file = file size * number of downloads\n",
"    item['aggregate'] = item['length'] * item['downloads']\n",
"\n",
"# rank 1 = most downloaded file; reverse=True keeps the sort stable for ties\n",
"data.sort(key=lambda x: x['downloads'], reverse=True)\n",
"for i, item in enumerate(data, start=1):\n",
"    item['rank'] = i  # ranked by number of downloads\n",
"\n",
"# final order: sorted by downloads * file size, largest first\n",
"data.sort(key=lambda x: x['aggregate'], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Build (filename, GiB transferred, download rank) tuples for the 128 entries\n",
"# at the front of `data`.\n",
"rows = []\n",
"for item in data[:128]:\n",
"    filename = item['url'].rsplit('/', 1)[-1]  # last path component of the URL\n",
"    gib = '{:.0f}'.format(item['aggregate'] / 2**30)  # 2**30 bytes per GiB, no decimals\n",
"    rows.append((filename, gib, item['rank']))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
file | GiB/day | popularity |
---|
pyspark-2.4.5.tar.gz | 15345 | 369 |
\n",
"tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl | 15083 | 643 |
\n",
"tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl | 9552 | 905 |
\n",
"tensorflow-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl | 8004 | 298 |
\n",
"botocore-1.15.39-py2.py3-none-any.whl | 7769 | 12 |
\n",
"numpy-1.18.2-cp36-cp36m-manylinux1_x86_64.whl | 7001 | 60 |
\n",
"scipy-1.4.1-cp36-cp36m-manylinux1_x86_64.whl | 4243 | 177 |
\n",
"xgboost-1.0.2-py3-none-manylinux1_x86_64.whl | 3488 | 705 |
\n",
"scipy-1.1.0-cp27-cp27mu-manylinux1_x86_64.whl | 3200 | 269 |
\n",
"numpy-1.18.2-cp37-cp37m-manylinux1_x86_64.whl | 3096 | 185 |
\n",
"xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl | 3054 | 939 |
\n",
"scipy-1.4.1-cp37-cp37m-manylinux1_x86_64.whl | 2426 | 297 |
\n",
"numpy-1.16.6-cp27-cp27mu-manylinux1_x86_64.whl | 2376 | 202 |
\n",
"pyarrow-0.16.0-cp36-cp36m-manylinux1_x86_64.whl | 2185 | 659 |
\n",
"gensim-3.6.0-cp27-cp27mu-manylinux1_x86_64.whl | 2137 | 310 |
\n",
"botocore-1.15.40-py2.py3-none-any.whl | 2080 | 63 |
\n",
"scipy-1.2.3-cp27-cp27mu-manylinux1_x86_64.whl | 2064 | 329 |
\n",
"awscli-1.18.39-py2.py3-none-any.whl | 1971 | 31 |
\n",
"pip-20.0.2-py2.py3-none-any.whl | 1822 | 13 |
\n",
"ansible-2.9.6.tar.gz | 1551 | 254 |
\n",
"numpy-1.18.2-cp35-cp35m-manylinux1_x86_64.whl | 1432 | 364 |
\n",
"Babel-2.8.0-py2.py3-none-any.whl | 1417 | 171 |
\n",
"pyarrow-0.16.0-cp36-cp36m-manylinux2014_x86_64.whl | 1393 | 921 |
\n",
"pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl | 1383 | 206 |
\n",
"botocore-1.13.50-py2.py3-none-any.whl | 1300 | 118 |
\n",
"matplotlib-3.2.1-cp36-cp36m-manylinux1_x86_64.whl | 1246 | 278 |
\n",
"botocore-1.12.253-py2.py3-none-any.whl | 1235 | 124 |
\n",
"notebook-5.7.8-py2.py3-none-any.whl | 1121 | 232 |
\n",
"virtualenv-20.0.17-py2.py3-none-any.whl | 1059 | 113 |
\n",
"scipy-1.4.1-cp35-cp35m-manylinux1_x86_64.whl | 1012 | 603 |
\n",
"pandas-1.0.3-cp37-cp37m-manylinux1_x86_64.whl | 975 | 291 |
\n",
"notebook-6.0.3-py3-none-any.whl | 903 | 296 |
\n",
"pycryptodomex-3.9.7-cp37-cp37m-manylinux1_x86_64.whl | 867 | 413 |
\n",
"matplotlib-3.2.1-cp37-cp37m-manylinux1_x86_64.whl | 864 | 371 |
\n",
"pandas-0.24.2-cp27-cp27mu-manylinux1_x86_64.whl | 801 | 340 |
\n",
"mypy-0.770-cp37-cp37m-manylinux1_x86_64.whl | 772 | 644 |
\n",
"grpcio-1.28.1.tar.gz | 767 | 597 |
\n",
"numpy-1.16.4-cp35-cp35m-manylinux1_x86_64.whl | 752 | 548 |
\n",
"cryptography-2.9-cp35-abi3-manylinux2010_x86_64.whl | 717 | 88 |
\n",
"pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl | 680 | 380 |
\n",
"mlflow-1.7.2-py3-none-any.whl | 636 | 590 |
\n",
"virtualenv-20.0.18-py2.py3-none-any.whl | 635 | 207 |
\n",
"awscli-1.18.40-py2.py3-none-any.whl | 558 | 143 |
\n",
"pytz-2019.3-py2.py3-none-any.whl | 546 | 15 |
\n",
"setuptools-46.1.3-py3-none-any.whl | 544 | 18 |
\n",
"scikit_learn-0.22.2.post1-cp36-cp36m-manylinux1_x86_64.whl | 542 | 347 |
\n",
"matplotlib-2.2.5-cp27-cp27mu-manylinux1_x86_64.whl | 540 | 562 |
\n",
"numpy-1.18.2-cp38-cp38-manylinux1_x86_64.whl | 535 | 819 |
\n",
"pycryptodome-3.9.7-cp37-cp37m-manylinux1_x86_64.whl | 492 | 642 |
\n",
"snowflake_connector_python-2.2.4-cp37-cp37m-manylinux2010_x86_64.whl | 491 | 588 |
\n",
"docutils-0.15.2-py2-none-any.whl | 474 | 22 |
\n",
"plotly-4.6.0-py2.py3-none-any.whl | 472 | 391 |
\n",
"docutils-0.15.2-py3-none-any.whl | 468 | 23 |
\n",
"python_dateutil-2.8.1-py2.py3-none-any.whl | 447 | 4 |
\n",
"widgetsnbextension-3.5.1-py2.py3-none-any.whl | 445 | 130 |
\n",
"scikit_learn-0.20.3-cp36-cp36m-manylinux1_x86_64.whl | 445 | 331 |
\n",
"future-0.18.2.tar.gz | 428 | 39 |
\n",
"numpy-1.17.4-cp35-cp35m-manylinux1_x86_64.whl | 424 | 938 |
\n",
"numpy-1.17.4-cp37-cp37m-manylinux1_x86_64.whl | 412 | 959 |
\n",
"numpy-1.18.1-cp37-cp37m-manylinux1_x86_64.whl | 405 | 984 |
\n",
"SQLAlchemy-1.3.16.tar.gz | 391 | 404 |
\n",
"SQLAlchemy-1.3.13.tar.gz | 389 | 403 |
\n",
"awscli-1.16.314-py2.py3-none-any.whl | 384 | 204 |
\n",
"boto-2.49.0-py2.py3-none-any.whl | 377 | 85 |
\n",
"PyYAML-5.3.1.tar.gz | 363 | 11 |
\n",
"tensorboard-2.0.2-py2-none-any.whl | 349 | 304 |
\n",
"cryptography-2.9-cp35-abi3-manylinux1_x86_64.whl | 345 | 228 |
\n",
"pipenv-2018.11.26-py3-none-any.whl | 344 | 389 |
\n",
"lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl | 337 | 435 |
\n",
"pycryptodomex-3.9.7-cp36-cp36m-manylinux1_x86_64.whl | 335 | 857 |
\n",
"Django-3.0.5-py3-none-any.whl | 334 | 538 |
\n",
"statsmodels-0.11.1-cp36-cp36m-manylinux1_x86_64.whl | 331 | 611 |
\n",
"lxml-4.5.0-cp37-cp37m-manylinux1_x86_64.whl | 323 | 445 |
\n",
"certifi-2020.4.5.1-py2.py3-none-any.whl | 312 | 3 |
\n",
"pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl | 299 | 760 |
\n",
"grpcio-1.28.1-cp36-cp36m-manylinux2010_x86_64.whl | 294 | 272 |
\n",
"scikit_learn-0.22.2.post1-cp37-cp37m-manylinux1_x86_64.whl | 290 | 572 |
\n",
"Pygments-2.6.1-py3-none-any.whl | 289 | 71 |
\n",
"pandas-0.24.2-cp36-cp36m-manylinux1_x86_64.whl | 273 | 793 |
\n",
"chardet-3.0.4-py2.py3-none-any.whl | 272 | 2 |
\n",
"pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl | 271 | 815 |
\n",
"pandas-0.23.4-cp37-cp37m-manylinux1_x86_64.whl | 271 | 722 |
\n",
"matplotlib-3.0.3-cp35-cp35m-manylinux1_x86_64.whl | 262 | 980 |
\n",
"sphinx_rtd_theme-0.4.3-py2.py3-none-any.whl | 252 | 596 |
\n",
"Pillow-6.2.2-cp27-cp27mu-manylinux1_x86_64.whl | 248 | 241 |
\n",
"tensorboard-2.1.1-py3-none-any.whl | 244 | 412 |
\n",
"botocore-1.14.17-py2.py3-none-any.whl | 244 | 569 |
\n",
"setuptools-44.1.0-py2.py3-none-any.whl | 243 | 54 |
\n",
"h5py-2.10.0-cp36-cp36m-manylinux1_x86_64.whl | 237 | 333 |
\n",
"mysql-connector-2.2.9.tar.gz | 236 | 1000 |
\n",
"protobuf-3.11.3-cp36-cp36m-manylinux1_x86_64.whl | 226 | 151 |
\n",
"cryptography-2.9-cp27-cp27mu-manylinux2010_x86_64.whl | 225 | 328 |
\n",
"pycountry-19.8.18.tar.gz | 221 | 920 |
\n",
"protobuf-3.11.3-cp37-cp37m-manylinux1_x86_64.whl | 212 | 164 |
\n",
"urllib3-1.25.8-py2.py3-none-any.whl | 210 | 7 |
\n",
"scikit_learn-0.20.4-cp27-cp27mu-manylinux1_x86_64.whl | 209 | 620 |
\n",
"networkx-2.4-py3-none-any.whl | 208 | 222 |
\n",
"tensorflow_data_validation-0.15.0-cp27-cp27mu-manylinux2010_x86_64.whl | 208 | 313 |
\n",
"lxml-4.5.0-cp27-cp27mu-manylinux1_x86_64.whl | 204 | 641 |
\n",
"distlib-0.3.0.zip | 195 | 64 |
\n",
"cryptography-2.9-cp27-cp27mu-manylinux1_x86_64.whl | 182 | 387 |
\n",
"cryptography-2.8-cp34-abi3-manylinux2010_x86_64.whl | 177 | 344 |
\n",
"Django-2.2.12-py3-none-any.whl | 176 | 883 |
\n",
"scikit_learn-0.22.2.post1-cp35-cp35m-manylinux1_x86_64.whl | 175 | 838 |
\n",
"numpy-1.18.2.zip | 174 | 699 |
\n",
"botocore-1.13.20-py2.py3-none-any.whl | 171 | 714 |
\n",
"tfx_bsl-0.15.3-cp27-cp27mu-manylinux2010_x86_64.whl | 169 | 314 |
\n",
"tensorboard-1.15.0-py3-none-any.whl | 169 | 533 |
\n",
"Pygments-2.5.2-py2.py3-none-any.whl | 168 | 141 |
\n",
"grpcio-1.28.1-cp37-cp37m-manylinux2010_x86_64.whl | 168 | 432 |
\n",
"jedi-0.17.0-py2.py3-none-any.whl | 164 | 194 |
\n",
"protobuf-3.11.3-cp35-cp35m-manylinux1_x86_64.whl | 163 | 223 |
\n",
"protobuf-3.11.3-cp27-cp27mu-manylinux1_x86_64.whl | 162 | 225 |
\n",
"scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl | 161 | 864 |
\n",
"protobuf-3.6.0-cp36-cp36m-manylinux1_x86_64.whl | 149 | 950 |
\n",
"docutils-0.16-py2.py3-none-any.whl | 146 | 89 |
\n",
"netaddr-0.7.19-py2.py3-none-any.whl | 144 | 316 |
\n",
"lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl | 142 | 248 |
\n",
"Werkzeug-1.0.1-py2.py3-none-any.whl | 142 | 47 |
\n",
"tensorboard-1.14.0-py3-none-any.whl | 140 | 541 |
\n",
"pyzmq-19.0.0-cp27-cp27mu-manylinux1_x86_64.whl | 136 | 221 |
\n",
"SQLAlchemy-1.3.15.tar.gz | 135 | 915 |
\n",
"Sphinx-3.0.1-py3-none-any.whl | 133 | 518 |
\n",
"joblib-0.14.1-py2.py3-none-any.whl | 133 | 51 |
\n",
"h5py-2.10.0-cp27-cp27mu-manylinux1_x86_64.whl | 132 | 512 |
\n",
"imageio-2.8.0-py3-none-any.whl | 129 | 604 |
\n",
"Pillow-7.1.1-cp37-cp37m-manylinux1_x86_64.whl | 127 | 429 |
\n",
"tensorflow_model_analysis-0.15.4-py2-none-any.whl | 126 | 312 |
"
],
"text/plain": [
"