{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[Python/pdfquery: Scraping the FIFA World Player of the Year votes](http://www.markhneedham.com/blog/2015/01/22/pythonpdfquery-scraping-the-fifa-world-player-of-the-year-votes-pdf-into-shape/) (here: compare with `pdftotext` and text processing)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections.abc import Iterable, Iterator\n", "import csv\n", "import pathlib\n", "import re\n", "import subprocess\n", "import urllib.request\n", "\n", "URL = 'https://raw.githubusercontent.com/mneedham/fifa/master/fboaward_menplayer2014_neutral.pdf'\n", "\n", "PDF = pathlib.Path(URL.rpartition('/')[2])\n", "TXT = PDF.with_suffix('.txt')\n", "CSV = PDF.with_suffix('.csv')\n", "\n", "def load_pages(filepath: pathlib.Path = TXT, /, *,\n", " encoding: str = 'utf-8',\n", " delimiter: str = '\\f') -> list[str]:\n", " text = filepath.read_text(encoding=encoding)\n", " return text.split(delimiter)\n", "\n", "if not PDF.exists():\n", " urllib.request.urlretrieve(URL, PDF)\n", "\n", "if not TXT.exists(): # requires one of popper-utils, miktex-poppler-bin, xpdf\n", " subprocess.run(['pdftotext', '-layout', PDF, TXT], check=True)\n", "\n", "pages = load_pages()\n", "\n", "len(pages)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['Vote',\n", " 'Country',\n", " 'Name',\n", " 'First (5 points)',\n", " 'Second (3 points)',\n", " 'Third (1 point)'],\n", " ['Captain',\n", " 'Afghanistan',\n", " 'Amiri Islam',\n", " 'Messi Lionel',\n", " 'Cristiano Ronaldo',\n", " 'Ibrahimovic Zlatan']]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def iterrows(pages: Iterable[str]) -> Iterator[list[str]]:\n", " first = None\n", " for p in pages:\n", " if not p:\n", " continue\n", " (title, _, rest) = p.partition('\\n\\n\\n')\n", " (header, _, rest) = rest.partition('\\n')\n", " header = re.split(r' {2,}', header)\n", " if first is None:\n", " yield header\n", " first = (title, header)\n", " else:\n", " assert (title, header) == first\n", " (table, _, footer) = rest.partition('\\n ')\n", " (table, footer) = (s.strip() for s in (table, footer))\n", " (page, total) = map(int, re.match(r'(\\d+) / (\\d+)$', footer).groups())\n", " for line in table.splitlines():\n", " if not line:\n", " continue\n", " if line.startswith(' '):\n", " assert line.endswith(' Grenadines')\n", " continue\n", " row = re.split(r' {2,}', line)\n", " assert len(row) == len(header)\n", " if row[1] == 'St. Vincent and the':\n", " row[1] += ' Grenadines'\n", " yield row\n", "\n", "rows = list(iterrows(pages))\n", "\n", "rows[:2]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Captain',\n", " 'Sweden',\n", " 'Ibrahimovic Zlatan',\n", " 'Messi Lionel',\n", " 'Neuer Manuel',\n", " 'Cristiano Ronaldo']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows[163]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "544" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def write_csv(rows, /, filepath=CSV, *,\n", " encoding: str = 'utf-8',\n", " dialect=csv.excel,\n", " has_header: bool = True) -> int:\n", " with filepath.open('w', encoding=encoding, newline='') as f:\n", " writer = csv.writer(f, dialect=dialect)\n", " ndata = 0\n", " for ndata, r in enumerate(rows, start=0 if has_header else 1):\n", " writer.writerow(r)\n", " return ndata\n", "\n", "write_csv(rows)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 544 entries, 0 to 543\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Vote 544 non-null object\n", " 1 Country 544 non-null object\n", " 2 Name 544 non-null object\n", " 3 First (5 points) 544 non-null object\n", " 4 Second (3 points) 544 non-null object\n", " 5 Third (1 point) 544 non-null object\n", "dtypes: object(6)\n", "memory usage: 25.6+ KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VoteCountryNameFirst (5 points)Second (3 points)Third (1 point)
0CaptainAfghanistanAmiri IslamMessi LionelCristiano RonaldoIbrahimovic Zlatan
1CaptainAlbaniaCana LorikCristiano RonaldoRobben ArjenMueller Thomas
2CaptainAlgeriaBougherra MadjidCristiano RonaldoRobben ArjenBenzema Karim
3CaptainAmerican SamoaLuvu Rafe TalaleleiNeymarRobben ArjenCristiano Ronaldo
4CaptainAndorraSonejee Masand OscarCristiano RonaldoMueller ThomasKroos Toni
\n", "
" ], "text/plain": [ " Vote Country Name First (5 points) \\\n", "0 Captain Afghanistan Amiri Islam Messi Lionel \n", "1 Captain Albania Cana Lorik Cristiano Ronaldo \n", "2 Captain Algeria Bougherra Madjid Cristiano Ronaldo \n", "3 Captain American Samoa Luvu Rafe Talalelei Neymar \n", "4 Captain Andorra Sonejee Masand Oscar Cristiano Ronaldo \n", "\n", " Second (3 points) Third (1 point) \n", "0 Cristiano Ronaldo Ibrahimovic Zlatan \n", "1 Robben Arjen Mueller Thomas \n", "2 Robben Arjen Benzema Karim \n", "3 Robben Arjen Cristiano Ronaldo \n", "4 Mueller Thomas Kroos Toni " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(CSV, encoding='utf-8', dialect=csv.excel)\n", "\n", "df.info()\n", "assert df.set_index(['Vote', 'Country']).index.is_unique\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Votes
Vote
Captain182
Coach181
Media181
\n", "
" ], "text/plain": [ " Votes\n", "Vote \n", "Captain 182\n", "Coach 181\n", "Media 181" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Vote'].value_counts().to_frame('Votes').sort_index()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VoteCaptainCoachMedia
Country
Afghanistan110
Albania111
Algeria111
American Samoa110
Andorra111
Angola111
Anguilla110
Antigua and Barbuda111
Argentina111
Armenia111
Aruba111
Australia111
Austria111
Azerbaijan111
Bahamas110
Bahrain111
Bangladesh111
Barbados110
Belarus111
Belgium111
Belize111
Benin001
Bermuda111
Bhutan111
Bolivia111
Bosnia and Herzegovina110
Bosnia-Herzegovina001
Botswana001
Brazil111
British Virgin Islands110
Brunei Darussalam111
Bulgaria111
Burkina Faso111
Burma001
Burundi110
Cambodia111
Cameroon111
Canada111
Cape Verde Islands111
Cayman Islands101
Chad111
Chile111
China PR111
Chinese Taipei110
Colombia111
Comoros111
Congo111
Congo DR111
Cook Islands110
Costa Rica111
Croatia111
Cuba111
Curaçao111
Cyprus111
Czech Republic111
Côte d'Ivoire011
Denmark111
Djibouti111
Dominica110
Dominican Republic111
Ecuador111
Egypt111
El Salvador111
England111
Eritrea001
Estonia111
Ethiopia111
FYR Macedonia111
Faroe Islands111
Fiji111
Finland111
France111
Gabon111
Gambia001
Georgia111
Germany111
Ghana111
Greece111
Grenada111
Guam110
Guatemala111
Guinea001
Guinea-Bissau001
Guyana111
Haiti001
Honduras111
Hong Kong111
Hungary111
Iceland111
India111
Indonesia111
Iran111
Iraq001
Ireland001
Israel111
Italy111
Jamaica110
Japan111
Jordan110
Kazakhstan111
Kenya111
Korea Republic110
Kuwait111
Kyrgyzstan111
Laos110
Latvia111
Lebanon111
Lesotho111
Liberia111
Libya001
Liechtenstein111
Lithuania111
Luxembourg111
Macau111
Madagascar111
Malawi111
Malaysia001
Maldives111
Mali111
Malta111
Mauritania111
Mauritius111
Mexico111
Moldova111
Mongolia111
Montenegro111
Montserrat110
Morocco111
Mozambique111
Myanmar110
Namibia111
Nepal111
Netherlands111
New Caledonia111
New Zealand111
Nicaragua111
Niger111
Nigeria111
North Korea001
Northern Ireland001
Norway111
Oman111
Pakistan101
Palestine111
Panama111
Paraguay111
Peru111
Philippines111
Poland111
Portugal111
Puerto Rico111
Qatar111
Republic of Ireland110
Romania111
Russia111
Rwanda111
Samoa110
San Marino111
Saudi Arabia111
Scotland111
Senegal111
Serbia111
Seychelles111
Sierra Leone111
Singapore111
Slovakia111
Slovenia111
Solomon Islands110
South Africa111
South Korea001
South Sudan110
Spain111
Sri Lanka110
St Kitts & Nevis001
St. Kitts and Nevis110
St. Lucia111
St. Vincent and the Grenadines110
Sudan111
Suriname111
Swaziland111
Sweden111
Switzerland111
Syria111
Tahiti111
Tajikistan111
Tanzania111
Thailand111
Togo001
Trinidad and Tobago001
Tunisia001
Turkey111
Turkmenistan001
Turks and Caicos Islands111
US Virgin Islands110
USA111
Uganda001
Ukraine111
United Arab Emirates111
Uruguay111
Uzbekistan111
Vanuatu001
Venezuela111
Vietnam111
Wales111
Yemen001
Zambia111
Zimbabwe111
\n", "
" ], "text/plain": [ "Vote Captain Coach Media\n", "Country \n", "Afghanistan 1 1 0\n", "Albania 1 1 1\n", "Algeria 1 1 1\n", "American Samoa 1 1 0\n", "Andorra 1 1 1\n", "Angola 1 1 1\n", "Anguilla 1 1 0\n", "Antigua and Barbuda 1 1 1\n", "Argentina 1 1 1\n", "Armenia 1 1 1\n", "Aruba 1 1 1\n", "Australia 1 1 1\n", "Austria 1 1 1\n", "Azerbaijan 1 1 1\n", "Bahamas 1 1 0\n", "Bahrain 1 1 1\n", "Bangladesh 1 1 1\n", "Barbados 1 1 0\n", "Belarus 1 1 1\n", "Belgium 1 1 1\n", "Belize 1 1 1\n", "Benin 0 0 1\n", "Bermuda 1 1 1\n", "Bhutan 1 1 1\n", "Bolivia 1 1 1\n", "Bosnia and Herzegovina 1 1 0\n", "Bosnia-Herzegovina 0 0 1\n", "Botswana 0 0 1\n", "Brazil 1 1 1\n", "British Virgin Islands 1 1 0\n", "Brunei Darussalam 1 1 1\n", "Bulgaria 1 1 1\n", "Burkina Faso 1 1 1\n", "Burma 0 0 1\n", "Burundi 1 1 0\n", "Cambodia 1 1 1\n", "Cameroon 1 1 1\n", "Canada 1 1 1\n", "Cape Verde Islands 1 1 1\n", "Cayman Islands 1 0 1\n", "Chad 1 1 1\n", "Chile 1 1 1\n", "China PR 1 1 1\n", "Chinese Taipei 1 1 0\n", "Colombia 1 1 1\n", "Comoros 1 1 1\n", "Congo 1 1 1\n", "Congo DR 1 1 1\n", "Cook Islands 1 1 0\n", "Costa Rica 1 1 1\n", "Croatia 1 1 1\n", "Cuba 1 1 1\n", "Curaçao 1 1 1\n", "Cyprus 1 1 1\n", "Czech Republic 1 1 1\n", "Côte d'Ivoire 0 1 1\n", "Denmark 1 1 1\n", "Djibouti 1 1 1\n", "Dominica 1 1 0\n", "Dominican Republic 1 1 1\n", "Ecuador 1 1 1\n", "Egypt 1 1 1\n", "El Salvador 1 1 1\n", "England 1 1 1\n", "Eritrea 0 0 1\n", "Estonia 1 1 1\n", "Ethiopia 1 1 1\n", "FYR Macedonia 1 1 1\n", "Faroe Islands 1 1 1\n", "Fiji 1 1 1\n", "Finland 1 1 1\n", "France 1 1 1\n", "Gabon 1 1 1\n", "Gambia 0 0 1\n", "Georgia 1 1 1\n", "Germany 1 1 1\n", "Ghana 1 1 1\n", "Greece 1 1 1\n", "Grenada 1 1 1\n", "Guam 1 1 0\n", "Guatemala 1 1 1\n", "Guinea 0 0 1\n", "Guinea-Bissau 0 0 1\n", "Guyana 1 1 1\n", "Haiti 0 0 1\n", "Honduras 1 1 1\n", "Hong Kong 1 1 1\n", "Hungary 1 1 1\n", "Iceland 1 1 1\n", "India 1 1 1\n", "Indonesia 1 1 1\n", "Iran 1 1 1\n", "Iraq 0 0 1\n", "Ireland 0 0 1\n", "Israel 1 1 1\n", "Italy 1 1 1\n", "Jamaica 1 1 0\n", "Japan 1 1 1\n", "Jordan 1 1 0\n", "Kazakhstan 1 1 1\n", "Kenya 1 1 1\n", "Korea Republic 1 1 0\n", "Kuwait 1 1 1\n", "Kyrgyzstan 1 1 1\n", "Laos 1 1 0\n", "Latvia 1 1 1\n", "Lebanon 1 1 1\n", "Lesotho 1 1 1\n", "Liberia 1 1 1\n", "Libya 0 0 1\n", "Liechtenstein 1 1 1\n", "Lithuania 1 1 1\n", "Luxembourg 1 1 1\n", "Macau 1 1 1\n", "Madagascar 1 1 1\n", "Malawi 1 1 1\n", "Malaysia 0 0 1\n", "Maldives 1 1 1\n", "Mali 1 1 1\n", "Malta 1 1 1\n", "Mauritania 1 1 1\n", "Mauritius 1 1 1\n", "Mexico 1 1 1\n", "Moldova 1 1 1\n", "Mongolia 1 1 1\n", "Montenegro 1 1 1\n", "Montserrat 1 1 0\n", "Morocco 1 1 1\n", "Mozambique 1 1 1\n", "Myanmar 1 1 0\n", "Namibia 1 1 1\n", "Nepal 1 1 1\n", "Netherlands 1 1 1\n", "New Caledonia 1 1 1\n", "New Zealand 1 1 1\n", "Nicaragua 1 1 1\n", "Niger 1 1 1\n", "Nigeria 1 1 1\n", "North Korea 0 0 1\n", "Northern Ireland 0 0 1\n", "Norway 1 1 1\n", "Oman 1 1 1\n", "Pakistan 1 0 1\n", "Palestine 1 1 1\n", "Panama 1 1 1\n", "Paraguay 1 1 1\n", "Peru 1 1 1\n", "Philippines 1 1 1\n", "Poland 1 1 1\n", "Portugal 1 1 1\n", "Puerto Rico 1 1 1\n", "Qatar 1 1 1\n", "Republic of Ireland 1 1 0\n", "Romania 1 1 1\n", "Russia 1 1 1\n", "Rwanda 1 1 1\n", "Samoa 1 1 0\n", "San Marino 1 1 1\n", "Saudi Arabia 1 1 1\n", "Scotland 1 1 1\n", "Senegal 1 1 1\n", "Serbia 1 1 1\n", "Seychelles 1 1 1\n", "Sierra Leone 1 1 1\n", "Singapore 1 1 1\n", "Slovakia 1 1 1\n", "Slovenia 1 1 1\n", "Solomon Islands 1 1 0\n", "South Africa 1 1 1\n", "South Korea 0 0 1\n", "South Sudan 1 1 0\n", "Spain 1 1 1\n", "Sri Lanka 1 1 0\n", "St Kitts & Nevis 0 0 1\n", "St. Kitts and Nevis 1 1 0\n", "St. Lucia 1 1 1\n", "St. Vincent and the Grenadines 1 1 0\n", "Sudan 1 1 1\n", "Suriname 1 1 1\n", "Swaziland 1 1 1\n", "Sweden 1 1 1\n", "Switzerland 1 1 1\n", "Syria 1 1 1\n", "Tahiti 1 1 1\n", "Tajikistan 1 1 1\n", "Tanzania 1 1 1\n", "Thailand 1 1 1\n", "Togo 0 0 1\n", "Trinidad and Tobago 0 0 1\n", "Tunisia 0 0 1\n", "Turkey 1 1 1\n", "Turkmenistan 0 0 1\n", "Turks and Caicos Islands 1 1 1\n", "US Virgin Islands 1 1 0\n", "USA 1 1 1\n", "Uganda 0 0 1\n", "Ukraine 1 1 1\n", "United Arab Emirates 1 1 1\n", "Uruguay 1 1 1\n", "Uzbekistan 1 1 1\n", "Vanuatu 0 0 1\n", "Venezuela 1 1 1\n", "Vietnam 1 1 1\n", "Wales 1 1 1\n", "Yemen 0 0 1\n", "Zambia 1 1 1\n", "Zimbabwe 1 1 1" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.max_rows', 250)\n", "\n", "df.groupby('Country')['Vote'].value_counts().unstack(fill_value=0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VoteCountryNameFirst (5 points)Second (3 points)Third (1 point)
0CaptainAfghanistanAmiri IslamMessi LionelCristiano RonaldoIbrahimovic Zlatan
1CaptainAlbaniaCana LorikCristiano RonaldoRobben ArjenMueller Thomas
2CaptainAlgeriaBougherra MadjidCristiano RonaldoRobben ArjenBenzema Karim
3CaptainAmerican SamoaLuvu Rafe TalaleleiNeymarRobben ArjenCristiano Ronaldo
4CaptainAndorraSonejee Masand OscarCristiano RonaldoMueller ThomasKroos Toni
\n", "
" ], "text/plain": [ " Vote Country Name First (5 points) \\\n", "0 Captain Afghanistan Amiri Islam Messi Lionel \n", "1 Captain Albania Cana Lorik Cristiano Ronaldo \n", "2 Captain Algeria Bougherra Madjid Cristiano Ronaldo \n", "3 Captain American Samoa Luvu Rafe Talalelei Neymar \n", "4 Captain Andorra Sonejee Masand Oscar Cristiano Ronaldo \n", "\n", " Second (3 points) Third (1 point) \n", "0 Cristiano Ronaldo Ibrahimovic Zlatan \n", "1 Robben Arjen Mueller Thomas \n", "2 Robben Arjen Benzema Karim \n", "3 Robben Arjen Cristiano Ronaldo \n", "4 Mueller Thomas Kroos Toni " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lensumTotal
RankFirst (5 points)Second (3 points)Third (1 point)First (5 points)Second (3 points)Third (1 point)
Name
Cristiano Ronaldo30396411515288411844
Messi Lionel55132101275396101772
Neuer Manuel85897742526777769
Robben Arjen1769598520759351
Mueller Thomas22355011010550265
Lahm Philipp121731605131142
Neymar61339303939108
Rodriguez James681830241872
Kroos Toni5111225331270
Di Maria Angel481920241963
Costa Diego291310271350
Ibrahimovic Zlatan381015241049
Bale Gareth5662518649
Toure Yaya09150271542
Goetze Mario5512515141
Benzema Karim17115211137
Iniesta Andres3461512633
Schweinsteiger Bastian239109928
Mascherano Javier2451012527
Courtois Thibaut412203225
Hazard Eden04110121123
Pogba Paul143512320
Ramos Sergio12556516
\n", "
" ], "text/plain": [ " len \\\n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 303 96 41 \n", "Messi Lionel 55 132 101 \n", "Neuer Manuel 85 89 77 \n", "Robben Arjen 17 69 59 \n", "Mueller Thomas 22 35 50 \n", "Lahm Philipp 12 17 31 \n", "Neymar 6 13 39 \n", "Rodriguez James 6 8 18 \n", "Kroos Toni 5 11 12 \n", "Di Maria Angel 4 8 19 \n", "Costa Diego 2 9 13 \n", "Ibrahimovic Zlatan 3 8 10 \n", "Bale Gareth 5 6 6 \n", "Toure Yaya 0 9 15 \n", "Goetze Mario 5 5 1 \n", "Benzema Karim 1 7 11 \n", "Iniesta Andres 3 4 6 \n", "Schweinsteiger Bastian 2 3 9 \n", "Mascherano Javier 2 4 5 \n", "Courtois Thibaut 4 1 2 \n", "Hazard Eden 0 4 11 \n", "Pogba Paul 1 4 3 \n", "Ramos Sergio 1 2 5 \n", "\n", " sum \\\n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 1515 288 41 \n", "Messi Lionel 275 396 101 \n", "Neuer Manuel 425 267 77 \n", "Robben Arjen 85 207 59 \n", "Mueller Thomas 110 105 50 \n", "Lahm Philipp 60 51 31 \n", "Neymar 30 39 39 \n", "Rodriguez James 30 24 18 \n", "Kroos Toni 25 33 12 \n", "Di Maria Angel 20 24 19 \n", "Costa Diego 10 27 13 \n", "Ibrahimovic Zlatan 15 24 10 \n", "Bale Gareth 25 18 6 \n", "Toure Yaya 0 27 15 \n", "Goetze Mario 25 15 1 \n", "Benzema Karim 5 21 11 \n", "Iniesta Andres 15 12 6 \n", "Schweinsteiger Bastian 10 9 9 \n", "Mascherano Javier 10 12 5 \n", "Courtois Thibaut 20 3 2 \n", "Hazard Eden 0 12 11 \n", "Pogba Paul 5 12 3 \n", "Ramos Sergio 5 6 5 \n", "\n", " Total \n", "Rank \n", "Name \n", "Cristiano Ronaldo 1844 \n", "Messi Lionel 772 \n", "Neuer Manuel 769 \n", "Robben Arjen 351 \n", "Mueller Thomas 265 \n", "Lahm Philipp 142 \n", "Neymar 108 \n", "Rodriguez James 72 \n", "Kroos Toni 70 \n", "Di Maria Angel 63 \n", "Costa Diego 50 \n", "Ibrahimovic Zlatan 49 \n", "Bale Gareth 49 \n", "Toure Yaya 42 \n", "Goetze Mario 41 \n", "Benzema Karim 37 \n", "Iniesta Andres 33 \n", "Schweinsteiger Bastian 28 \n", "Mascherano Javier 27 \n", "Courtois Thibaut 25 \n", "Hazard Eden 23 \n", "Pogba Paul 20 \n", "Ramos Sergio 16 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pf = (df.melt('Vote', df.columns[-3:].tolist(), var_name='Rank', value_name='_Name')\n", " .rename(columns={'_Name': 'Name'})\n", " .assign(Points=lambda x: x['Rank'].str.extract(r'(\\d+)', expand=False).astype(int))\n", " .pivot_table('Points', ['Vote', 'Name'], 'Rank', aggfunc=[len, 'sum'], fill_value=0)\n", " .assign(Total=lambda x: x['sum'].sum(axis='columns')))\n", "\n", "pf.groupby(level='Name').sum().sort_values(by='Total', ascending=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lensumTotal
RankFirst (5 points)Second (3 points)Third (1 point)First (5 points)Second (3 points)Third (1 point)
Name
Cristiano Ronaldo99371349511113619
Messi Lionel31442515513225312
Neuer Manuel2213271103927176
Robben Arjen32522157522112
Mueller Thomas3111615331664
\n", "
" ], "text/plain": [ " len \\\n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 99 37 13 \n", "Messi Lionel 31 44 25 \n", "Neuer Manuel 22 13 27 \n", "Robben Arjen 3 25 22 \n", "Mueller Thomas 3 11 16 \n", "\n", " sum Total \n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 495 111 13 619 \n", "Messi Lionel 155 132 25 312 \n", "Neuer Manuel 110 39 27 176 \n", "Robben Arjen 15 75 22 112 \n", "Mueller Thomas 15 33 16 64 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pf.loc['Captain'].sort_values(by='Total', ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lensumTotal
RankFirst (5 points)Second (3 points)Third (1 point)First (5 points)Second (3 points)Third (1 point)
Name
Cristiano Ronaldo8931144459314552
Messi Lionel20493510014735282
Neuer Manuel3231251609325278
Robben Arjen102620507820148
Mueller Thomas881740241781
\n", "
" ], "text/plain": [ " len \\\n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 89 31 14 \n", "Messi Lionel 20 49 35 \n", "Neuer Manuel 32 31 25 \n", "Robben Arjen 10 26 20 \n", "Mueller Thomas 8 8 17 \n", "\n", " sum Total \n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 445 93 14 552 \n", "Messi Lionel 100 147 35 282 \n", "Neuer Manuel 160 93 25 278 \n", "Robben Arjen 50 78 20 148 \n", "Mueller Thomas 40 24 17 81 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pf.loc['Coach'].sort_values(by='Total', ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lensumTotal
RankFirst (5 points)Second (3 points)Third (1 point)First (5 points)Second (3 points)Third (1 point)
Name
Cristiano Ronaldo11528145758414673
Neuer Manuel31452515513525315
Messi Lionel439412011741178
Mueller Thomas111617554817120
Robben Arjen4181720541791
\n", "
" ], "text/plain": [ " len \\\n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 115 28 14 \n", "Neuer Manuel 31 45 25 \n", "Messi Lionel 4 39 41 \n", "Mueller Thomas 11 16 17 \n", "Robben Arjen 4 18 17 \n", "\n", " sum Total \n", "Rank First (5 points) Second (3 points) Third (1 point) \n", "Name \n", "Cristiano Ronaldo 575 84 14 673 \n", "Neuer Manuel 155 135 25 315 \n", "Messi Lionel 20 117 41 178 \n", "Mueller Thomas 55 48 17 120 \n", "Robben Arjen 20 54 17 91 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pf.loc['Media'].sort_values(by='Total', ascending=False).head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 4 }