Olympics

In [1]:

import pandas as pd

In [2]:

import numpy as np

In [3]:

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:

from scipy.interpolate import make_interp_spline

In [5]:

filepath = "C:\\Users\\abdelilah\\Downloads\\Olympic Games\\"

In [6]:

imagepath = "C:\\Users\\abdelilah\\Downloads\\Olympic Games\\Viz\\"

In [7]:

# this file contains the data of athletes and the events they competed in the olympic games from 1896 to 2016 in both the 
# summer and winter games, but since i'll focus on the summer games i'll get rid of the winter seasons

athlete_events_t = pd.read_csv(filepath + "athlete_events.csv", delimiter = ",")
athlete_events_t.head(3)

Out[7]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Games	Year	Season	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992 Summer	1992	Summer	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012 Summer	2012	Summer	London	Judo	Judo Men's Extra-Lightweight	NaN
2	3	Gunnar Nielsen Aaby	M	24.0	NaN	NaN	Denmark	DEN	1920 Summer	1920	Summer	Antwerpen	Football	Football Men's Football	NaN

In [8]:

athlete_events_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 31.0+ MB

In [9]:

print(*sorted(athlete_events_t.Year.value_counts().index))

1896 1900 1904 1906 1908 1912 1920 1924 1928 1932 1936 1948 1952 1956 1960 1964 1968 1972 1976 1980 1984 1988 1992 1994 1996 1998 2000 2002 2004 2006 2008 2010 2012 2014 2016

In [10]:

athlete_events_s = athlete_events_t[athlete_events_t["Season"] == "Summer"]

In [11]:

sns.heatmap(athlete_events_s.isnull())

Out[11]:

<AxesSubplot:>

In [12]:

# it looks like all the columns that have missing values are the "Height", "Weight", "Age" and specially the "Medal" column
# but in this case i don't think it's a bad thing since not all the athletes will win something, meaning that a missing value 
# means the athlete won nothing 

In [13]:

((athlete_events_s["Games"].str.strip(" Summer")).astype(int)).equals(athlete_events_s["Year"].astype(int))

Out[13]:

True

In [14]:

# now that i'm sure the two columns are the same i'll drop the "Games" one
athlete_events_s = athlete_events_s.drop("Games", axis = 1)

In [15]:

athlete_events_s.Season.unique()

Out[15]:

array(['Summer'], dtype=object)

In [16]:

#seems like the "Season" column has just one value which is Summer so i'll drop it

In [17]:

athlete_events_s = athlete_events_s.drop("Season", axis = 1)

In [18]:

print(sorted(list(athlete_events_s["Year"].value_counts().index)))

[1896, 1900, 1904, 1906, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016]

In [19]:

athlete_events_s.head()

Out[19]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012	London	Judo	Judo Men's Extra-Lightweight	NaN
2	3	Gunnar Nielsen Aaby	M	24.0	NaN	NaN	Denmark	DEN	1920	Antwerpen	Football	Football Men's Football	NaN
3	4	Edgar Lindenau Aabye	M	34.0	NaN	NaN	Denmark/Sweden	DEN	1900	Paris	Tug-Of-War	Tug-Of-War Men's Tug-Of-War	Gold
26	8	Cornelia "Cor" Aalten (-Strannood)	F	18.0	168.0	NaN	Netherlands	NED	1932	Los Angeles	Athletics	Athletics Women's 100 metres	NaN

In [20]:

# this DF contains the data about each country and the medals they won since 1896 to 2018
olympics_t = pd.read_csv(filepath + "Olympics.csv", delimiter = ",")
olympics_t.head()

Out[20]:

	Nation	Population	Exists	Code	First_App	Medal	S_Medal	Apps	Medal.1	...	L20_Apps	L20_Medal	L20_Gold	L20_Silver	L20_Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	0.14	0.14	14	2	...	4	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	0.00	0.00	12	0	...	9	0	0	0	0	-	0	0	0	0
2	Algeria	41318142	YES	ALG	1964	1.06	1.31	16	17	...	7	10	2	4	4	Athletics	9	4	3	2
3	American Samoa	55641	YES	ASA	1988	0.00	0.00	9	0	...	5	0	0	0	0	-	0	0	0	0
4	Andorra	76965	YES	AND	1976	0.00	0.00	23	0	...	11	0	0	0	0	-	0	0	0	0

5 rows × 33 columns

In [21]:

olympics_t.columns

Out[21]:

Index(['Nation', 'Population', 'Exists', 'Code', 'First_App', 'Medal',
       'S_Medal', 'W_Medal', 'Apps', 'Medal.1', 'Gold', 'Silver', 'Bronze',
       'SO_Apps', 'SO_Medal', 'SO_Gold', 'SO_Silver', 'SO_Bronze', 'WO_Apps',
       'WO_Medal', 'WO_Gold', 'WO_Silver', 'WO_Bronze', 'L20_Apps',
       'L20_Medal', 'L20_Gold', 'L20_Silver', 'L20_Bronze',
       'MostSuccessfulSport', 'Medals', 'Golds', 'Silvers', 'Bronzes'],
      dtype='object')

In [22]:

olympics_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Nation               226 non-null    object 
 1   Population           226 non-null    int64  
 2   Exists               226 non-null    object 
 3   Code                 226 non-null    object 
 4   First_App            226 non-null    int64  
 5   Medal                226 non-null    float64
 6   S_Medal              226 non-null    float64
 7   W_Medal              226 non-null    float64
 8   Apps                 226 non-null    int64  
 9   Medal.1              226 non-null    int64  
 10  Gold                 226 non-null    int64  
 11  Silver               226 non-null    int64  
 12  Bronze               226 non-null    int64  
 13  SO_Apps              226 non-null    int64  
 14  SO_Medal             226 non-null    int64  
 15  SO_Gold              226 non-null    int64  
 16  SO_Silver            226 non-null    int64  
 17  SO_Bronze            226 non-null    int64  
 18  WO_Apps              226 non-null    int64  
 19  WO_Medal             226 non-null    int64  
 20  WO_Gold              226 non-null    int64  
 21  WO_Silver            226 non-null    int64  
 22  WO_Bronze            226 non-null    int64  
 23  L20_Apps             226 non-null    int64  
 24  L20_Medal            226 non-null    int64  
 25  L20_Gold             226 non-null    int64  
 26  L20_Silver           226 non-null    int64  
 27  L20_Bronze           226 non-null    int64  
 28  MostSuccessfulSport  226 non-null    object 
 29  Medals               226 non-null    int64  
 30  Golds                226 non-null    int64  
 31  Silvers              226 non-null    int64  
 32  Bronzes              226 non-null    int64  
dtypes: float64(3), int64(26), object(4)
memory usage: 58.4+ KB

In [23]:

olympics_s = olympics_t.drop(["Medal", "W_Medal", "Apps", "Medal.1", "Gold", "Silver", "Bronze", 
                              "WO_Apps", "WO_Medal", "WO_Gold", "WO_Silver", "WO_Bronze"], axis = 1)

In [24]:

olympics_s.head(3)

Out[24]:

	Nation	Population	Exists	Code	First_App	S_Medal	SO_Apps	SO_Medal	SO_Gold	SO_Silver	...	L20_Apps	L20_Medal	L20_Gold	L20_Silver	L20_Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	...	4	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	0.00	8	0	0	0	...	9	0	0	0	0	-	0	0	0	0
2	Algeria	41318142	YES	ALG	1964	1.31	13	17	5	4	...	7	10	2	4	4	Athletics	9	4	3	2

3 rows × 21 columns

In [25]:

olympics_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Nation               226 non-null    object 
 1   Population           226 non-null    int64  
 2   Exists               226 non-null    object 
 3   Code                 226 non-null    object 
 4   First_App            226 non-null    int64  
 5   S_Medal              226 non-null    float64
 6   SO_Apps              226 non-null    int64  
 7   SO_Medal             226 non-null    int64  
 8   SO_Gold              226 non-null    int64  
 9   SO_Silver            226 non-null    int64  
 10  SO_Bronze            226 non-null    int64  
 11  L20_Apps             226 non-null    int64  
 12  L20_Medal            226 non-null    int64  
 13  L20_Gold             226 non-null    int64  
 14  L20_Silver           226 non-null    int64  
 15  L20_Bronze           226 non-null    int64  
 16  MostSuccessfulSport  226 non-null    object 
 17  Medals               226 non-null    int64  
 18  Golds                226 non-null    int64  
 19  Silvers              226 non-null    int64  
 20  Bronzes              226 non-null    int64  
dtypes: float64(1), int64(16), object(4)
memory usage: 37.2+ KB

In [26]:

olympics_s = olympics_s.rename(columns = {"S_Medal" : "Avg_Medal", "SO_Apps" : "Apps", "SO_Medal" : "Total", 
                             "SO_Gold" : "Gold", "SO_Silver" : "Silver", "SO_Bronze" : "Bronze"})

In [27]:

# i dropped a few columns that had info either about the total apps in the olympics summer and winter, or the total winter apps
# "Medal", "W_Medal", "Apps", "Medal.1", "Gold", "Silver", "Bronze", "WO_Apps", "WO_Medal", "WO_Gold", "WO_Silver", "WO_Bronze"
# and i will drop more columns that seem to have info that have to do with total games
# "L20_Apps", "L20_Medal", "L20_Gold", "L20_Silver", "L20_Bronze"
# "Medals", "Golds", "Silvers", "Bronzes" this might have data about Winter games but most are more likely to do with the Summer
# Games

In [28]:

olympics_s = olympics_s.drop(["L20_Apps", "L20_Medal", "L20_Gold", "L20_Silver", "L20_Bronze"], axis = 1)

In [29]:

sns.heatmap(olympics_s.isnull())

Out[29]:

<AxesSubplot:>

In [30]:

print(sorted(list(olympics_s.First_App.value_counts().index)))

[1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1966, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1994, 1996, 2000, 2004, 2008, 2016, 2018]

In [31]:

olympics_s.head()

Out[31]:

	Nation	Population	Exists	Code	First_App	Avg_Medal	Apps	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	0.00	8	0	0	0	0	-	0	0	0	0
2	Algeria	41318142	YES	ALG	1964	1.31	13	17	5	4	8	Athletics	9	4	3	2
3	American Samoa	55641	YES	ASA	1988	0.00	8	0	0	0	0	-	0	0	0	0
4	Andorra	76965	YES	AND	1976	0.00	11	0	0	0	0	-	0	0	0	0

In [32]:

# this DF has data about the medals for each event in the summer olympics 1976-2008
summer_olympics = pd.read_csv(filepath + "Summer-Olympic-medals-1976-to-2008.csv", delimiter = ",")
summer_olympics

Out[32]:

	City	Year	Sport	Discipline	Event	Athlete	Gender	Country_Code	Country	Event_gender	Medal
0	Montreal	1976.0	Aquatics	Diving	3m springboard	K�HLER, Christa	Women	GDR	East Germany	W	Silver
1	Montreal	1976.0	Aquatics	Diving	3m springboard	KOSENKOV, Aleksandr	Men	URS	Soviet Union	M	Bronze
2	Montreal	1976.0	Aquatics	Diving	3m springboard	BOGGS, Philip George	Men	USA	United States	M	Gold
3	Montreal	1976.0	Aquatics	Diving	3m springboard	CAGNOTTO, Giorgio Franco	Men	ITA	Italy	M	Silver
4	Montreal	1976.0	Aquatics	Diving	10m platform	WILSON, Deborah Keplar	Women	USA	United States	W	Bronze
...	...	...	...	...	...	...	...	...	...	...	...
15428	Beijing	2008.0	Wrestling	Wrestling Gre-R	66 - 74kg	GUENOT, Christophe	Men	FRA	France	M	Bronze
15429	Beijing	2008.0	Wrestling	Wrestling Gre-R	66 - 74kg	KVIRKELIA, Manuchar	Men	GEO	Georgia	M	Gold
15430	Beijing	2008.0	Wrestling	Wrestling Gre-R	55 - 60kg	RAHIMOV, Vitaliy	Men	AZE	Azerbaijan	M	Silver
15431	Beijing	2008.0	Wrestling	Wrestling Gre-R	60 - 66kg	GUENOT, Steeve	Men	FRA	France	M	Gold
15432	Beijing	2008.0	Wrestling	Wrestling Gre-R	96 - 120kg	LOPEZ, Mijain	Men	CUB	Cuba	M	Gold

15433 rows × 11 columns

In [33]:

summer_olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15433 entries, 0 to 15432
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City          15316 non-null  object 
 1   Year          15316 non-null  float64
 2   Sport         15316 non-null  object 
 3   Discipline    15316 non-null  object 
 4   Event         15316 non-null  object 
 5   Athlete       15316 non-null  object 
 6   Gender        15316 non-null  object 
 7   Country_Code  15316 non-null  object 
 8   Country       15316 non-null  object 
 9   Event_gender  15316 non-null  object 
 10  Medal         15316 non-null  object 
dtypes: float64(1), object(10)
memory usage: 1.3+ MB

In [34]:

sns.heatmap(summer_olympics[summer_olympics["Year"].isnull() == True].isnull())

Out[34]:

<AxesSubplot:>

In [35]:

sns.heatmap(summer_olympics.isnull())

Out[35]:

<AxesSubplot:>

In [36]:

# it looks like there's a big portion of the dataset that's missing, luckily it's connected so i can drop it at once

In [37]:

idx_missing = summer_olympics[summer_olympics["Year"].isnull() == True].index

In [38]:

idx_missing

Out[38]:

Int64Index([770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
            ...
            877, 878, 879, 880, 881, 882, 883, 884, 885, 886],
           dtype='int64', length=117)

In [39]:

summer_olympics = summer_olympics.drop(idx_missing)

In [40]:

summer_olympics["Year"] = pd.to_numeric(summer_olympics.Year, downcast = 'integer')

In [41]:

print(sorted(list(summer_olympics.Year.value_counts().index)))

[1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008]

In [42]:

summer_olympics

Out[42]:

	City	Year	Sport	Discipline	Event	Athlete	Gender	Country_Code	Country	Event_gender	Medal
0	Montreal	1976	Aquatics	Diving	3m springboard	K�HLER, Christa	Women	GDR	East Germany	W	Silver
1	Montreal	1976	Aquatics	Diving	3m springboard	KOSENKOV, Aleksandr	Men	URS	Soviet Union	M	Bronze
2	Montreal	1976	Aquatics	Diving	3m springboard	BOGGS, Philip George	Men	USA	United States	M	Gold
3	Montreal	1976	Aquatics	Diving	3m springboard	CAGNOTTO, Giorgio Franco	Men	ITA	Italy	M	Silver
4	Montreal	1976	Aquatics	Diving	10m platform	WILSON, Deborah Keplar	Women	USA	United States	W	Bronze
...	...	...	...	...	...	...	...	...	...	...	...
15428	Beijing	2008	Wrestling	Wrestling Gre-R	66 - 74kg	GUENOT, Christophe	Men	FRA	France	M	Bronze
15429	Beijing	2008	Wrestling	Wrestling Gre-R	66 - 74kg	KVIRKELIA, Manuchar	Men	GEO	Georgia	M	Gold
15430	Beijing	2008	Wrestling	Wrestling Gre-R	55 - 60kg	RAHIMOV, Vitaliy	Men	AZE	Azerbaijan	M	Silver
15431	Beijing	2008	Wrestling	Wrestling Gre-R	60 - 66kg	GUENOT, Steeve	Men	FRA	France	M	Gold
15432	Beijing	2008	Wrestling	Wrestling Gre-R	96 - 120kg	LOPEZ, Mijain	Men	CUB	Cuba	M	Gold

15316 rows × 11 columns

In [43]:

olympics_s.head()

Out[43]:

	Nation	Population	Exists	Code	First_App	Avg_Medal	Apps	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	0.00	8	0	0	0	0	-	0	0	0	0
2	Algeria	41318142	YES	ALG	1964	1.31	13	17	5	4	8	Athletics	9	4	3	2
3	American Samoa	55641	YES	ASA	1988	0.00	8	0	0	0	0	-	0	0	0	0
4	Andorra	76965	YES	AND	1976	0.00	11	0	0	0	0	-	0	0	0	0

In [44]:

athlete_events_s.head()

Out[44]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012	London	Judo	Judo Men's Extra-Lightweight	NaN
2	3	Gunnar Nielsen Aaby	M	24.0	NaN	NaN	Denmark	DEN	1920	Antwerpen	Football	Football Men's Football	NaN
3	4	Edgar Lindenau Aabye	M	34.0	NaN	NaN	Denmark/Sweden	DEN	1900	Paris	Tug-Of-War	Tug-Of-War Men's Tug-Of-War	Gold
26	8	Cornelia "Cor" Aalten (-Strannood)	F	18.0	168.0	NaN	Netherlands	NED	1932	Los Angeles	Athletics	Athletics Women's 100 metres	NaN

In [45]:

tokyo_athletes = pd.read_excel(r'C:\Users\abdelilah\Downloads\Olympic Games\2021 Tokyo\Athletes.xlsx')
tokyo_genders = pd.read_excel(r'C:\Users\abdelilah\Downloads\Olympic Games\2021 Tokyo\EntriesGender.xlsx')
tokyo_medals = pd.read_excel(r'C:\Users\abdelilah\Downloads\Olympic Games\2021 Tokyo\Medals.xlsx')

C:\Users\abdelilah\anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:221: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [46]:

tokyo_athletes.head()

Out[46]:

	Name	NOC	Discipline
0	AALERUD Katrine	Norway	Cycling Road
1	ABAD Nestor	Spain	Artistic Gymnastics
2	ABAGNALE Giovanni	Italy	Rowing
3	ABALDE Alberto	Spain	Basketball
4	ABALDE Tamara	Spain	Basketball

In [47]:

tokyo_genders.head()

Out[47]:

	Discipline	Female	Male	Total
0	3x3 Basketball	32	32	64
1	Archery	64	64	128
2	Artistic Gymnastics	98	98	196
3	Artistic Swimming	105	0	105
4	Athletics	969	1072	2041

In [48]:

tokyo_medals.head()

Out[48]:

	Rank	Team/NOC	Gold	Silver	Bronze	Total	Rank by Total
0	1	United States of America	39	41	33	113	1
1	2	People's Republic of China	38	32	18	88	2
2	3	Japan	27	14	17	58	5
3	4	Great Britain	22	21	22	65	4
4	5	ROC	20	28	23	71	3

In [49]:

noc = pd.read_csv("https://gist.githubusercontent.com/radcliff/f09c0f88344a7fcef373/raw/2753c482ad091c54b1822288ad2e4811c021d8ec/wikipedia-iso-country-codes.csv")
noc.head(3)

Out[49]:

	English short name lower case	Alpha-2 code	Alpha-3 code	Numeric code	ISO 3166-2
0	Afghanistan	AF	AFG	4	ISO 3166-2:AF
1	Åland Islands	AX	ALA	248	ISO 3166-2:AX
2	Albania	AL	ALB	8	ISO 3166-2:AL

In [50]:

noc = noc.drop(["Alpha-2 code", "Numeric code", "ISO 3166-2"], axis = 1)

In [51]:

noc = noc.rename(columns = {"English short name lower case" : "Team/NOC", "Alpha-3 code" : "Code"})

In [52]:

noc

Out[52]:

	Team/NOC	Code
0	Afghanistan	AFG
1	Åland Islands	ALA
2	Albania	ALB
3	Algeria	DZA
4	American Samoa	ASM
...	...	...
241	Wallis and Futuna	WLF
242	Western Sahara	ESH
243	Yemen	YEM
244	Zambia	ZMB
245	Zimbabwe	ZWE

246 rows × 2 columns

In [53]:

tokyo_medals = tokyo_medals.merge(noc, on = "Team/NOC", how = 'left')

In [54]:

mssg_nats = list(tokyo_medals[tokyo_medals["Code"].isnull() == True]["Team/NOC"])

In [55]:

mssg_codes = ["USA", "CHN", "ROC", "GBR", "KOR", "TWN", "IRN", "HKG", "VEN", "XKX", "MDA", "MKD"]

In [56]:

mssg = pd.DataFrame(columns = ["Nation", "Code"])

In [57]:

mssg["Nation"] = mssg_nats
mssg["Code"] = mssg_codes

In [58]:

mssg

Out[58]:

	Nation	Code
0	United States of America	USA
1	People's Republic of China	CHN
2	Great Britain	ROC
3	ROC	GBR
4	Republic of Korea	KOR
5	Islamic Republic of Iran	TWN
6	Chinese Taipei	IRN
7	Kosovo	HKG
8	Venezuela	VEN
9	Hong Kong, China	XKX
10	North Macedonia	MDA
11	Republic of Moldova	MKD

In [59]:

for nat in mssg_nats:
    tokyo_medals.loc[tokyo_medals["Team/NOC"] == nat,"Code"] = mssg.loc[mssg["Nation"] == nat, "Code"]

In [60]:

tokyo_medals = tokyo_medals[["Rank", "Team/NOC", "Code", "Gold", "Silver", "Bronze", "Total", "Rank by Total"]]

In [61]:

tokyo_medals.head(2)

Out[61]:

	Rank	Team/NOC	Code	Gold	Silver	Bronze	Total	Rank by Total
0	1	United States of America	USA	39	41	33	113	1
1	2	People's Republic of China	CHN	38	32	18	88	2

In [62]:

olympics_s.head(2)

Out[62]:

	Nation	Population	Exists	Code	First_App	Avg_Medal	Apps	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	0.00	8	0	0	0	0	-	0	0	0	0

In [63]:

updated = olympics_s.merge(tokyo_medals, how='left', on=['Code'], suffixes=('', '_new'))

In [64]:

updated = updated.drop(["Rank by Total", "Rank", "Team/NOC"], axis = 1)

In [65]:

updated.head(2)

Out[65]:

	Nation	Population	Exists	Code	First_App	Avg_Medal	Apps	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes	Gold_new	Silver_new	Bronze_new	Total_new
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	2	Taekwondo	2	0	0	2	NaN	NaN	NaN	NaN
1	Albania	2930187	YES	ALB	1972	0.00	8	0	0	0	0	-	0	0	0	0	NaN	NaN	NaN	NaN

In [66]:

tokyo_cntrs = list(tokyo_athletes["NOC"].value_counts().index)
tokyo_cntrs

Out[66]:

['United States of America',
 'Japan',
 'Australia',
 "People's Republic of China",
 'Germany',
 'France',
 'Canada',
 'Great Britain',
 'Italy',
 'Spain',
 'ROC',
 'Brazil',
 'Netherlands',
 'Republic of Korea',
 'New Zealand',
 'Poland',
 'Argentina',
 'South Africa',
 'Hungary',
 'Mexico',
 'Ukraine',
 'Egypt',
 'Sweden',
 'Belgium',
 'India',
 'Czech Republic',
 'Ireland',
 'Switzerland',
 'Belarus',
 'Denmark',
 'Turkey',
 'Romania',
 'Kazakhstan',
 'Norway',
 'Israel',
 'Portugal',
 'Serbia',
 'Kenya',
 'Greece',
 'Austria',
 'Cuba',
 'Chinese Taipei',
 'Islamic Republic of Iran',
 'Colombia',
 'Uzbekistan',
 'Dominican Republic',
 'Jamaica',
 'Nigeria',
 'Tunisia',
 'Croatia',
 'Chile',
 'Slovenia',
 'Morocco',
 'Ecuador',
 'Finland',
 'Mongolia',
 'Venezuela',
 'Ethiopia',
 'Azerbaijan',
 'Bulgaria',
 'Algeria',
 'Hong Kong, China',
 'Thailand',
 'Slovakia',
 'Lithuania',
 'Puerto Rico',
 'Georgia',
 'Montenegro',
 'Peru',
 'Estonia',
 'Saudi Arabia',
 'Bahrain',
 'Trinidad and Tobago',
 "Côte d'Ivoire",
 'Zambia',
 'Latvia',
 'Malaysia',
 'Fiji',
 'Refugee Olympic Team',
 'Indonesia',
 'Honduras',
 'Uganda',
 'Singapore',
 'Guatemala',
 'Angola',
 'Republic of Moldova',
 'Philippines',
 'Vietnam',
 'Armenia',
 'Bahamas',
 'Kyrgyzstan',
 'Cyprus',
 'Qatar',
 'Ghana',
 'Botswana',
 'Eritrea',
 'Costa Rica',
 'Uruguay',
 'Jordan',
 'Namibia',
 'Luxembourg',
 'Cameroon',
 'Pakistan',
 'Kuwait',
 'Kosovo',
 'Panama',
 'Senegal',
 'Sri Lanka',
 'Mozambique',
 'Albania',
 'Tajikistan',
 'Nicaragua',
 'Samoa',
 'Paraguay',
 'Turkmenistan',
 'North Macedonia',
 'Papua New Guinea',
 'Burkina Faso',
 'Niger',
 'Guyana',
 'Democratic Republic of the Congo',
 'Benin',
 'Bosnia and Herzegovina',
 'Mauritius',
 'Barbados',
 'Grenada',
 'Lebanon',
 'Monaco',
 'Cook Islands',
 'Malta',
 'Madagascar',
 'Syrian Arab Republic',
 'Burundi',
 'Cape Verde',
 'Antigua and Barbuda',
 'Haiti',
 'Bangladesh',
 'Nepal',
 'Oman',
 'Gabon',
 'Guinea',
 'Rwanda',
 'Seychelles',
 'El Salvador',
 'Bolivia',
 'Tonga',
 'Saint Lucia',
 'Sudan',
 'Malawi',
 'Guam',
 'Cayman Islands',
 'Afghanistan',
 'Liechtenstein',
 'American Samoa',
 'Zimbabwe',
 'Eswatini',
 'Libya',
 'Virgin Islands, US',
 "Lao People's Democratic Republic",
 'United Arab Emirates',
 'Djibouti',
 'Maldives',
 'Mali',
 'Sierra Leone',
 'Togo',
 'San Marino',
 'Palestine',
 'Iceland',
 'Iraq',
 'Guinea-Bissau',
 'Belize',
 'Aruba',
 'Chad',
 'Sao Tome and Principe',
 'Congo',
 'Liberia',
 'Palau',
 'Yemen',
 'Suriname',
 'Equatorial Guinea',
 'Comoros',
 'Solomon Islands',
 'Kiribati',
 'Cambodia',
 'Bhutan',
 'Virgin Islands, British',
 'Democratic Republic of Timor-Leste',
 'Federated States of Micronesia',
 'Gambia',
 'Brunei Darussalam',
 'South Sudan',
 'Nauru',
 'United Republic of Tanzania',
 'Mauritania',
 'Bermuda',
 'Central African Republic',
 'Dominica',
 'St Vincent and the Grenadines',
 'Myanmar',
 'Andorra',
 'Somalia',
 'Marshall Islands',
 'Tuvalu',
 'Saint Kitts and Nevis',
 'Vanuatu',
 'Lesotho']

In [67]:

# again i have to replace the nations' in this list with its code as the names in the list could be different form the df, such
# as "United States"
tokyo_nocs = []
not_fnd = []
for nat in tokyo_cntrs:
    if nat in tokyo_medals["Team/NOC"].values:
        tokyo_nocs.append(tokyo_medals.loc[tokyo_medals["Team/NOC"] == nat,"Code"].values[0])
        if str(tokyo_medals.loc[tokyo_medals["Team/NOC"] == nat,"Code"].values[0]) == 'nan':
            not_fnd.append(nat)
    else: 
        not_fnd.append(nat)

In [68]:

# i did this as i got 'nan' in my list so i had to clean it
tokyo_nocs = [x for x in tokyo_nocs if str(x) != 'nan']

In [69]:

# i chose to use the tokyo_medals for this as the names are the same and i could avoid not including any names that are
# different, plus this list gives us a clear idea of who participated but won nothing
not_fnd

Out[69]:

['Great Britain',
 'ROC',
 'Republic of Korea',
 'Chinese Taipei',
 'Islamic Republic of Iran',
 'Chile',
 'Venezuela',
 'Algeria',
 'Hong Kong, China',
 'Montenegro',
 'Peru',
 'Trinidad and Tobago',
 'Zambia',
 'Refugee Olympic Team',
 'Honduras',
 'Singapore',
 'Guatemala',
 'Angola',
 'Republic of Moldova',
 'Vietnam',
 'Cyprus',
 'Eritrea',
 'Costa Rica',
 'Uruguay',
 'Luxembourg',
 'Cameroon',
 'Pakistan',
 'Kosovo',
 'Panama',
 'Senegal',
 'Sri Lanka',
 'Mozambique',
 'Albania',
 'Tajikistan',
 'Nicaragua',
 'Samoa',
 'Paraguay',
 'North Macedonia',
 'Papua New Guinea',
 'Niger',
 'Guyana',
 'Democratic Republic of the Congo',
 'Benin',
 'Bosnia and Herzegovina',
 'Mauritius',
 'Barbados',
 'Lebanon',
 'Monaco',
 'Cook Islands',
 'Malta',
 'Madagascar',
 'Burundi',
 'Cape Verde',
 'Antigua and Barbuda',
 'Haiti',
 'Bangladesh',
 'Nepal',
 'Oman',
 'Gabon',
 'Guinea',
 'Rwanda',
 'Seychelles',
 'El Salvador',
 'Bolivia',
 'Tonga',
 'Saint Lucia',
 'Sudan',
 'Malawi',
 'Guam',
 'Cayman Islands',
 'Afghanistan',
 'Liechtenstein',
 'American Samoa',
 'Zimbabwe',
 'Eswatini',
 'Libya',
 'Virgin Islands, US',
 "Lao People's Democratic Republic",
 'United Arab Emirates',
 'Djibouti',
 'Maldives',
 'Mali',
 'Sierra Leone',
 'Togo',
 'Palestine',
 'Iceland',
 'Iraq',
 'Guinea-Bissau',
 'Belize',
 'Aruba',
 'Chad',
 'Sao Tome and Principe',
 'Congo',
 'Liberia',
 'Palau',
 'Yemen',
 'Suriname',
 'Equatorial Guinea',
 'Comoros',
 'Solomon Islands',
 'Kiribati',
 'Cambodia',
 'Bhutan',
 'Virgin Islands, British',
 'Democratic Republic of Timor-Leste',
 'Federated States of Micronesia',
 'Gambia',
 'Brunei Darussalam',
 'South Sudan',
 'Nauru',
 'United Republic of Tanzania',
 'Mauritania',
 'Central African Republic',
 'Dominica',
 'St Vincent and the Grenadines',
 'Myanmar',
 'Andorra',
 'Somalia',
 'Marshall Islands',
 'Tuvalu',
 'Saint Kitts and Nevis',
 'Vanuatu',
 'Lesotho']

In [70]:

not_fnd_2 = []
for nat in not_fnd:
    if nat in noc["Team/NOC"].values:
        tokyo_nocs.append(noc.loc[noc["Team/NOC"] == nat,"Code"].values[0])
        if str(noc.loc[noc["Team/NOC"] == nat,"Code"].values[0]) == 'nan':
            not_fnd_2.append(nat)
    else :
        not_fnd_2.append(nat)

In [71]:

not_fnd_2

Out[71]:

['Great Britain',
 'ROC',
 'Republic of Korea',
 'Chinese Taipei',
 'Islamic Republic of Iran',
 'Venezuela',
 'Hong Kong, China',
 'Refugee Olympic Team',
 'Republic of Moldova',
 'Vietnam',
 'Kosovo',
 'North Macedonia',
 'Democratic Republic of the Congo',
 'Bolivia',
 'Eswatini',
 'Libya',
 'Virgin Islands, US',
 'Palestine',
 'Democratic Republic of Timor-Leste',
 'Federated States of Micronesia',
 'South Sudan',
 'United Republic of Tanzania',
 'St Vincent and the Grenadines']

In [72]:

# i had  to add these last ones by typimg them one by one as they couldn't be found in the "Country/Code" DF i have, maybe 
# i chose an old one or the names are mixed and don't have the same notation or both
missing_nocs = ["GBR", "ROC", "KOR", "TWN", "IRN", "MDA", "VNM", "XKX", "MLD", "COD", "BOL", "SWZ", "PSE", 
                "VIR", "LBY", "FSM", "TLS", "TZA", "SSD", "VCT"]

In [73]:

for i in missing_nocs :
    tokyo_nocs.append(i)

In [74]:

updated.head(1)

Out[74]:

	Nation	Population	Exists	Code	First_App	Avg_Medal	Apps	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes	Gold_new	Silver_new	Bronze_new	Total_new
0	Afghanistan	35530081	YES	AFG	1936	0.14	14	2	0	0	2	Taekwondo	2	0	0	2	NaN	NaN	NaN	NaN

In [75]:

for cod in updated["Code"].values:
    if cod in tokyo_nocs:
        updated.loc[updated["Code"] == cod, "Apps"] += 1

In [76]:

updated["Total_n"] = ""
updated["Gold_n"] = ""
updated["Silver_n"] = ""
updated["Bronze_n"] = ""

In [77]:

updated["Total_n"] = updated[["Total", "Total_new"]].sum(axis = 1)
updated["Gold_n"] = updated[["Gold", "Gold_new"]].sum(axis = 1)
updated["Silver_n"] = updated[["Silver", "Silver_new"]].sum(axis = 1)
updated["Bronze_n"] = updated[["Bronze", "Bronze_new"]].sum(axis = 1)

In [78]:

updated = updated.drop(["Total", "Total_new", "Gold", "Gold_new", "Silver", "Silver_new", "Bronze", "Bronze_new"], axis = 1)

In [79]:

updated = updated.rename(columns = {"Total_n" : "Total", "Gold_n" : "Gold", "Silver_n" : "Silver", "Bronze_n" : "Bronze"})

In [80]:

updated = updated[["Nation", "Population", "Exists", "Code", "First_App", "Apps", 
                   "Avg_Medal", "Total", "Gold", "Silver", "Bronze", "MostSuccessfulSport", 
                   "Medals", "Golds", "Silvers", "Bronzes"]]

In [81]:

updated["Total"] = updated["Total"].astype(int)
updated["Gold"] = updated["Gold"].astype(int)
updated["Silver"] = updated["Silver"].astype(int)
updated["Bronze"] = updated["Bronze"].astype(int)

In [82]:

updated["Avg_Medal"] = updated["Total"].div(updated["Apps"]).round(2)

In [83]:

updated.head(10)

Out[83]:

	Nation	Population	Exists	Code	First_App	Apps	Avg_Medal	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
0	Afghanistan	35530081	YES	AFG	1936	15	0.13	2	0	0	2	Taekwondo	2	0	0	2
1	Albania	2930187	YES	ALB	1972	9	0.00	0	0	0	0	-	0	0	0	0
2	Algeria	41318142	YES	ALG	1964	13	1.31	17	5	4	8	Athletics	9	4	3	2
3	American Samoa	55641	YES	ASA	1988	8	0.00	0	0	0	0	-	0	0	0	0
4	Andorra	76965	YES	AND	1976	12	0.00	0	0	0	0	-	0	0	0	0
5	Angola	29784193	YES	ANG	1980	9	0.00	0	0	0	0	-	0	0	0	0
6	Antigua and Barbuda	102012	YES	ANT	1976	10	0.00	0	0	0	0	-	0	0	0	0
7	Argentina	44271041	YES	ARG	1900	25	3.08	77	21	26	30	Footbal	4	2	2	0
8	Armenia	2930450	YES	ARM	1994	7	2.57	18	2	8	8	Wrestling	8	2	3	3
9	Aruba	105264	YES	ARU	1988	8	0.00	0	0	0	0	-	0	0	0	0

In [84]:

updated.columns

Out[84]:

Index(['Nation', 'Population', 'Exists', 'Code', 'First_App', 'Apps',
       'Avg_Medal', 'Total', 'Gold', 'Silver', 'Bronze', 'MostSuccessfulSport',
       'Medals', 'Golds', 'Silvers', 'Bronzes'],
      dtype='object')

In [85]:

updated = updated.sort_values(by = "Total", ascending = False)

In [86]:

updated = updated.reset_index(drop = True)

In [87]:

updated.to_csv(filepath + 'Updated Athlete Events.csv')

In [88]:

#1.1.1 and best one for this is "olympics_s"
sns.set_theme(palette = "Set2")
updated.head(30).plot(x = "Nation", 
                         y = ["Gold", "Silver", "Bronze"], 
                         kind = "bar", 
                         figsize = (15, 8))
plt.xlabel("Nations", 
           fontdict = {"fontsize" : 8, 
                       "fontweight" : "bold"})
plt.xticks(rotation = 60)
plt.ylabel("Medals Won", 
           fontdict = {"fontsize" : 8, 
                       "fontweight" : "bold"})
plt.title("Most Medals Won By The 30 Top Countries", 
          fontdict = {"fontsize" : 12,
                      "fontweight" : "bold"})

plt.savefig(imagepath + 'Most Medals Won By The 30 Top Countries.png')

In [89]:

arab_land = pd.read_csv(filepath + "arab_countries.csv", 
                        delimiter = ",", 
                        usecols = ["Country"], 
                        skipfooter = 5, 
                        engine = 'python')
arab_land.head()

Out[89]:

	Country
0	NaN
1	NaN
2	NaN
3	NaN
4	Egypt

In [90]:

arab_land = arab_land[4:]

In [91]:

arab_land = arab_land.reset_index(drop = True)

In [92]:

arab_land["Code"] = ""

In [93]:

for nat in arab_land["Country"].values:
    if nat in noc["Team/NOC"].values:
        arab_land.loc[arab_land["Country"] == nat, "Code"] = noc.loc[noc["Team/NOC"] == nat]["Code"].values

In [94]:

arab_land.loc[arab_land["Country"] == "Syria", "Code"] = "SYR"
arab_land.loc[arab_land["Country"] == "Libya", "Code"] = "LBY"
arab_land.loc[arab_land["Country"] == "Palestine", "Code"] = "PSE"

In [95]:

arab_land

Out[95]:

	Country	Code
0	Egypt	EGY
1	Algeria	DZA
2	Iraq	IRQ
3	Sudan	SDN
4	Morocco	MAR
5	Saudi Arabia	SAU
6	Yemen	YEM
7	Syria	SYR
8	Tunisia	TUN
9	Somalia	SOM
10	United Arab Emirates	ARE
11	Jordan	JOR
12	Libya	LBY
13	Palestine	PSE
14	Lebanon	LBN
15	Oman	OMN
16	Kuwait	KWT
17	Mauritania	MRT
18	Qatar	QAT
19	Bahrain	BHR
20	Djibouti	DJI
21	Comoros	COM

In [96]:

arab_nocs = list(arab_land["Code"].values)

In [97]:

# at first i thought of using the country codes but it seems like the codes on the 'noc' and the ones on the 'updated' datasets
# are different for some coutries like "United Arab Emirates"
arab_olympics = updated.loc[updated["Nation"].isin(arab_land["Country"].values)]

In [98]:

arab_olympics.head()

Out[98]:

	Nation	Population	Exists	Code	First_App	Apps	Avg_Medal	Total	Gold	Silver	Bronze	MostSuccessfulSport	Medals	Golds	Silvers	Bronzes
50	Egypt	97553151	YES	EGY	1912	23	1.65	38	8	11	19	Weightlifting	14	5	3	6
63	Morocco	35739580	YES	MAR	1960	15	1.60	24	7	5	12	Athletics	19	6	5	8
71	Algeria	41318142	YES	ALG	1964	13	1.31	17	5	4	8	Athletics	9	4	3	2
72	Tunisia	11532127	YES	TUN	1960	15	1.00	15	5	3	7	Athletics	5	2	2	1
85	Qatar	2639211	YES	QAT	1984	10	0.80	8	2	1	5	Athletics	3	0	1	2

In [99]:

arab_olympics.to_csv(filepath + 'Arab Olympics.csv')

In [100]:

sns.set_theme(palette = "Set2")
arab_olympics.head(30).plot(x = "Nation", 
                            y = ["Gold", "Silver", "Bronze"], 
                            kind = "bar", 
                            figsize = (15, 8))
plt.xlabel("Nations", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.xticks(rotation = 60)
plt.ylabel("Medals Won", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Most Medals Won By Arab Countries", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})

plt.savefig(imagepath + "Most Medals Won By Arab Countries.png")

In [101]:

succ_sports = pd.DataFrame(columns = ["Sport", "Count"])
succ_sports.Sport = updated["MostSuccessfulSport"].value_counts().index
succ_sports.Count = updated["MostSuccessfulSport"].value_counts().values

In [102]:

succ_sports.head()

Out[102]:

	Sport	Count
0	-	79
1	Athletics	41
2	Wrestling	12
3	Boxing	11
4	Weightlifting	7

In [103]:

succ_sports[1:].plot(x = "Sport", 
                 y = "Count", 
                 kind = "bar", 
                 figsize = (15, 6))
plt.xlabel("Sport", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.xticks(rotation = 60)
plt.ylabel("Count", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Most Popular Sports", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})

plt.savefig(imagepath + "Most popular Sports.png")

In [104]:

plt.stem(updated[:20]["Nation"], updated[:20]["Avg_Medal"])

Out[104]:

<StemContainer object of 3 artists>

In [105]:

athlete_events_s.head(3)

Out[105]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012	London	Judo	Judo Men's Extra-Lightweight	NaN
2	3	Gunnar Nielsen Aaby	M	24.0	NaN	NaN	Denmark	DEN	1920	Antwerpen	Football	Football Men's Football	NaN

In [106]:

plt.figure(figsize = (15, 8))
sns.countplot(x = "Year", 
              hue = "Sex", 
              data = athlete_events_s[athlete_events_s["Medal"].notna()])
plt.xlabel("Year", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.xticks(rotation = 60)
plt.ylabel("Medals Won", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Gender Distribution Over The Years", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})
plt.legend(loc = 'upper left')

plt.savefig(imagepath + 'Gender Distribution Over The Years.png')

In [107]:

athlete_events_s["Medal"].dropna()

Out[107]:

3           Gold
37        Bronze
38        Bronze
41        Bronze
42          Gold
           ...  
271078    Silver
271080    Bronze
271082    Bronze
271102    Bronze
271103    Silver
Name: Medal, Length: 34088, dtype: object

In [108]:

# what i wanna do here is see what's the distribution like for each country's athletes and their gender, in conclusion figure
# out if there's a country where the count of female participants is more than their male counterpart
# it looks like a scatter plot is the most convenient kind of graph i can use as it will plot every country

In [109]:

athlete_events_s.sort_values(by = "NOC")

Out[109]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
67193	34349	Hussain Fazal	M	29.0	NaN	NaN	Afghanistan	AFG	1936	Berlin	Hockey	Hockey Men's Hockey	NaN
217225	109113	Abouwi Ahmad Shah	M	NaN	NaN	NaN	Afghanistan	AFG	1936	Berlin	Hockey	Hockey Men's Hockey	NaN
217241	109123	Mian Faruq Shah	M	29.0	NaN	NaN	Afghanistan	AFG	1936	Berlin	Hockey	Hockey Men's Hockey	NaN
217252	109130	Zahir Shah Al-Zadah	M	25.0	NaN	NaN	Afghanistan	AFG	1936	Berlin	Hockey	Hockey Men's Hockey	NaN
217294	109153	Shakar Khan Shakar	M	24.0	NaN	74.0	Afghanistan	AFG	1964	Tokyo	Wrestling	Wrestling Men's Welterweight, Freestyle	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
45643	23549	Kirsty Leigh Coventry (-Seward)	F	20.0	176.0	64.0	Zimbabwe	ZIM	2004	Athina	Swimming	Swimming Women's 200 metres Individual Medley	Bronze
22428	11813	Clyde Cecil Bissett	M	NaN	NaN	NaN	Zimbabwe	ZIM	1928	Amsterdam	Boxing	Boxing Men's Lightweight	NaN
154398	77513	Michael McFadden	M	28.0	183.0	94.0	Zimbabwe	ZIM	1964	Tokyo	Sailing	Sailing Mixed One Person Dinghy	NaN
164733	82724	Samukeliso Moyo	F	26.0	160.0	48.0	Zimbabwe	ZIM	2000	Sydney	Athletics	Athletics Women's 5,000 metres	NaN
261480	130832	Hillary Wilson	F	15.0	157.0	65.0	Zimbabwe	ZIM	1960	Roma	Swimming	Swimming Women's 100 metres Butterfly	NaN

222552 rows × 13 columns

In [110]:

print(sorted(athlete_events_s.NOC.unique())) # length is 230

['AFG', 'AHO', 'ALB', 'ALG', 'AND', 'ANG', 'ANT', 'ANZ', 'ARG', 'ARM', 'ARU', 'ASA', 'AUS', 'AUT', 'AZE', 'BAH', 'BAN', 'BAR', 'BDI', 'BEL', 'BEN', 'BER', 'BHU', 'BIH', 'BIZ', 'BLR', 'BOH', 'BOL', 'BOT', 'BRA', 'BRN', 'BRU', 'BUL', 'BUR', 'CAF', 'CAM', 'CAN', 'CAY', 'CGO', 'CHA', 'CHI', 'CHN', 'CIV', 'CMR', 'COD', 'COK', 'COL', 'COM', 'CPV', 'CRC', 'CRO', 'CRT', 'CUB', 'CYP', 'CZE', 'DEN', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'ERI', 'ESA', 'ESP', 'EST', 'ETH', 'EUN', 'FIJ', 'FIN', 'FRA', 'FRG', 'FSM', 'GAB', 'GAM', 'GBR', 'GBS', 'GDR', 'GEO', 'GEQ', 'GER', 'GHA', 'GRE', 'GRN', 'GUA', 'GUI', 'GUM', 'GUY', 'HAI', 'HKG', 'HON', 'HUN', 'INA', 'IND', 'IOA', 'IRI', 'IRL', 'IRQ', 'ISL', 'ISR', 'ISV', 'ITA', 'IVB', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KIR', 'KOR', 'KOS', 'KSA', 'KUW', 'LAO', 'LAT', 'LBA', 'LBR', 'LCA', 'LES', 'LIB', 'LIE', 'LTU', 'LUX', 'MAD', 'MAL', 'MAR', 'MAS', 'MAW', 'MDA', 'MDV', 'MEX', 'MGL', 'MHL', 'MKD', 'MLI', 'MLT', 'MNE', 'MON', 'MOZ', 'MRI', 'MTN', 'MYA', 'NAM', 'NBO', 'NCA', 'NED', 'NEP', 'NFL', 'NGR', 'NIG', 'NOR', 'NRU', 'NZL', 'OMA', 'PAK', 'PAN', 'PAR', 'PER', 'PHI', 'PLE', 'PLW', 'PNG', 'POL', 'POR', 'PRK', 'PUR', 'QAT', 'RHO', 'ROT', 'ROU', 'RSA', 'RUS', 'RWA', 'SAA', 'SAM', 'SCG', 'SEN', 'SEY', 'SGP', 'SKN', 'SLE', 'SLO', 'SMR', 'SOL', 'SOM', 'SRB', 'SRI', 'SSD', 'STP', 'SUD', 'SUI', 'SUR', 'SVK', 'SWE', 'SWZ', 'SYR', 'TAN', 'TCH', 'TGA', 'THA', 'TJK', 'TKM', 'TLS', 'TOG', 'TPE', 'TTO', 'TUN', 'TUR', 'TUV', 'UAE', 'UAR', 'UGA', 'UKR', 'UNK', 'URS', 'URU', 'USA', 'UZB', 'VAN', 'VEN', 'VIE', 'VIN', 'VNM', 'WIF', 'YAR', 'YEM', 'YMD', 'YUG', 'ZAM', 'ZIM']

In [111]:

noc_genders = pd.DataFrame(columns = ["Country", "Code", "Male", "Female", "Total"])

In [112]:

noc_genders["Code"] = athlete_events_s.NOC.unique()

In [113]:

not_fnd_3 = []
for cod in noc_genders.Code.values:
    if cod in noc["Code"].values:
        noc_genders.loc[noc_genders["Code"] == cod, "Country"] = noc.loc[noc["Code"] == cod, "Team/NOC"].values
    else : 
        not_fnd_3.append(cod)

In [114]:

print(not_fnd_3)

['DEN', 'NED', 'IRI', 'BUL', 'CHA', 'SUD', 'GRE', 'CHI', 'URS', 'NCA', 'NGR', 'ALG', 'KUW', 'UAR', 'LIB', 'MAS', 'GER', 'RSA', 'TAN', 'LBA', 'PLE', 'BRU', 'KSA', 'UAE', 'YAR', 'INA', 'PHI', 'EUN', 'CGO', 'SUI', 'GDR', 'MON', 'URU', 'SRI', 'NIG', 'CRC', 'SLO', 'POR', 'PAR', 'ANG', 'FRG', 'BAN', 'ESA', 'PUR', 'HON', 'MRI', 'SEY', 'TCH', 'MTN', 'SKN', 'VIN', 'NEP', 'MGL', 'TOG', 'AHO', 'ASA', 'SAM', 'CRO', 'HAI', 'GUI', 'BIZ', 'YMD', 'BER', 'ANZ', 'SCG', 'IOA', 'OMA', 'FIJ', 'VAN', 'BAH', 'GUA', 'YUG', 'LAT', 'IVB', 'ISV', 'MAD', 'MAL', 'CAY', 'BAR', 'GBS', 'BOT', 'ROT', 'CAM', 'SOL', 'CRT', 'GEQ', 'SAA', 'ZIM', 'GRN', 'MYA', 'MAW', 'ZAM', 'RHO', 'TPE', 'BOH', 'GAM', 'WIF', 'BUR', 'NBO', 'ARU', 'VIE', 'BHU', 'UNK', 'TGA', 'NFL', 'KOS', 'SSD', 'LES']

In [115]:

not_fnd_4 = []
for cod in not_fnd_3:
    if cod in athlete_events_s["NOC"].values:
        noc_genders.loc[noc_genders["Code"] == cod, "Country"] = athlete_events_s.loc[athlete_events_s["NOC"] == cod, "Team"].values[0]
    else : 
        not_fnd_4.append(cod)

In [116]:

noc_genders

Out[116]:

	Country	Code	Male	Female	Total
0	China	CHN	NaN	NaN	NaN
1	Denmark	DEN	NaN	NaN	NaN
2	Netherlands	NED	NaN	NaN	NaN
3	Finland	FIN	NaN	NaN	NaN
4	Norway	NOR	NaN	NaN	NaN
...	...	...	...	...	...
225	Tonga	TGA	NaN	NaN	NaN
226	Newfoundland	NFL	NaN	NaN	NaN
227	Kosovo	KOS	NaN	NaN	NaN
228	South Sudan	SSD	NaN	NaN	NaN
229	Lesotho	LES	NaN	NaN	NaN

230 rows × 5 columns

In [117]:

ath_events_np = athlete_events_s.to_numpy()

In [118]:

noc_genders_np = noc_genders.to_numpy()

In [119]:

unique_keys, indices = np.unique(ath_events_np[:,1], return_index=True)

In [120]:

# this array included one record of each athlete
ath_events = ath_events_np[indices]

In [121]:

np.set_printoptions(threshold=np.inf)

In [122]:

ath_events[1]

Out[122]:

array([107953, ' Eleonora Margarida Josephina Scmitt', 'F', 16.0, nan,
       nan, 'Brazil', 'BRA', 1948, 'London', 'Swimming',
       "Swimming Women's 100 metres Freestyle", nan], dtype=object)

In [123]:

noc_genders_np[1]

Out[123]:

array(['Denmark', 'DEN', nan, nan, nan], dtype=object)

In [124]:

ath_events[(ath_events[:,7] == "BRA") & (ath_events[:,2] == "F")].shape[0]

Out[124]:

In [125]:

ath_events[ath_events[:,1] == "Michael Fred Phelps, II"]

Out[125]:

array([[94406, 'Michael Fred Phelps, II', 'M', 15.0, 193.0, 91.0,
        'United States', 'USA', 2000, 'Sydney', 'Swimming',
        "Swimming Men's 200 metres Butterfly", nan]], dtype=object)

In [126]:

noc_genders_np[noc_genders_np[:,1] == "BRA", 3]

Out[126]:

array([nan], dtype=object)

In [127]:

male = 0
female = 0
for cod in noc_genders_np[:,[1]]:
    male = ath_events[(ath_events[:,7] == cod) & (ath_events[:,2] == "M")].shape[0]
    female = ath_events[(ath_events[:,7] == cod) & (ath_events[:,2] == "F")].shape[0]
    noc_genders_np[noc_genders_np[:,1] == cod, 3] = male
    noc_genders_np[noc_genders_np[:,1] == cod, 2] = female
    noc_genders_np[noc_genders_np[:,1] == cod, 4] = male + female

In [128]:

noc_genders.loc[[1,2,3]]

Out[128]:

	Country	Code	Male	Female	Total
1	Denmark	DEN	329	1532	1861
2	Netherlands	NED	775	1943	2718
3	Finland	FIN	219	1410	1629

In [129]:

noc_genders_np[[1,2,3]]

Out[129]:

array([['Denmark', 'DEN', 329, 1532, 1861],
       ['Netherlands', 'NED', 775, 1943, 2718],
       ['Finland', 'FIN', 219, 1410, 1629]], dtype=object)

In [130]:

noc_genders = noc_genders.sort_values(by = "Total", ascending = False)
noc_genders = noc_genders.reset_index(drop = True)

In [131]:

noc_genders.to_csv(filepath + 'Gender Distribution per Country.csv')

In [132]:

plt.figure(figsize = (16, 8))
sns.scatterplot(data = noc_genders, 
                x = "Male", 
                y = "Female", 
                hue = "Total", 
                size = "Total", 
                sizes = (20, 200))
plt.xlabel("Male Participants", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.ylabel("Female Participants", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Gender Distribution by Country", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})
for i in range(0,20):
    ax = plt.text(x = noc_genders.Male[i], 
             y = noc_genders.Female[i], 
             s = noc_genders.Code[i], 
             fontdict = dict(color='red',size=10)
             #bbox = dict(facecolor='yellow',alpha=0.5)
                 )
    ax.set_alpha(.8)

plt.savefig(imagepath + 'Gender Distribution by Country.png')

In [133]:

tokyo_genders.Discipline.unique() #46

Out[133]:

array(['3x3 Basketball', 'Archery', 'Artistic Gymnastics',
       'Artistic Swimming', 'Athletics', 'Badminton', 'Baseball/Softball',
       'Basketball', 'Beach Volleyball', 'Boxing', 'Canoe Slalom',
       'Canoe Sprint', 'Cycling BMX Freestyle', 'Cycling BMX Racing',
       'Cycling Mountain Bike', 'Cycling Road', 'Cycling Track', 'Diving',
       'Equestrian', 'Fencing', 'Football', 'Golf', 'Handball', 'Hockey',
       'Judo', 'Karate', 'Marathon Swimming', 'Modern Pentathlon',
       'Rhythmic Gymnastics', 'Rowing', 'Rugby Sevens', 'Sailing',
       'Shooting', 'Skateboarding', 'Sport Climbing', 'Surfing',
       'Swimming', 'Table Tennis', 'Taekwondo', 'Tennis',
       'Trampoline Gymnastics', 'Triathlon', 'Volleyball', 'Water Polo',
       'Weightlifting', 'Wrestling'], dtype=object)

In [134]:

sports = (athlete_events_s.Sport.unique()) #52, and 28 in common

In [135]:

yyyy = []

In [136]:

yyyy = [tokyo_genders.Discipline.unique()[i] for i in range(46) if tokyo_genders.Discipline.unique()[i] in athlete_events_s.Sport.unique()]

In [137]:

print(yyyy)

['Archery', 'Athletics', 'Badminton', 'Basketball', 'Beach Volleyball', 'Boxing', 'Diving', 'Fencing', 'Football', 'Golf', 'Handball', 'Hockey', 'Judo', 'Modern Pentathlon', 'Rhythmic Gymnastics', 'Rowing', 'Rugby Sevens', 'Sailing', 'Shooting', 'Swimming', 'Table Tennis', 'Taekwondo', 'Tennis', 'Triathlon', 'Volleyball', 'Water Polo', 'Weightlifting', 'Wrestling']

In [138]:

tokyo_genders.head(2)

Out[138]:

	Discipline	Female	Male	Total
0	3x3 Basketball	32	32	64
1	Archery	64	64	128

In [139]:

athlete_events_s.head(2)

Out[139]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012	London	Judo	Judo Men's Extra-Lightweight	NaN

In [140]:

evnt_genders = pd.DataFrame(columns = ["Discipline", "Female", "Male", "Total"])
evnt_genders["Discipline"] = athlete_events_s.Sport.unique()

In [141]:

evnt_genders["Female"] = 0
evnt_genders["Male"] = 0
evnt_genders["Total"] = 0

In [142]:

athlete_events_s.loc[athlete_events_s["Sport"] == "Basketball", "Sex"].values.size

Out[142]:

In [143]:

evnt_genders.loc[evnt_genders["Discipline"] == "Basketball", "Male"]

Out[143]:

0    0
Name: Male, dtype: int64

In [144]:

evnt_genders_np = evnt_genders.to_numpy()

In [145]:

evnt_genders_np[1] #disc , f, m, t

Out[145]:

array(['Judo', 0, 0, 0], dtype=object)

In [146]:

ath_events[1]

Out[146]:

array([107953, ' Eleonora Margarida Josephina Scmitt', 'F', 16.0, nan,
       nan, 'Brazil', 'BRA', 1948, 'London', 'Swimming',
       "Swimming Women's 100 metres Freestyle", nan], dtype=object)

In [147]:

(ath_events[ath_events[:,10] == "Judo"][:,2]).size

Out[147]:

In [148]:

evnt_genders_np[evnt_genders_np[:,0] == "Judo", 2]

Out[148]:

array([0], dtype=object)

In [149]:

for evnt in sports:
    for s in ath_events[ath_events[:,10] == evnt][:,2]:
        if s == "M":
            evnt_genders_np[evnt_genders_np[:,0] == evnt, 2] += 1
            evnt_genders_np[evnt_genders_np[:,0] == evnt, 3] += 1
        elif s == "F" :
            evnt_genders_np[evnt_genders_np[:,0] == evnt, 1] += 1
            evnt_genders_np[evnt_genders_np[:,0] == evnt, 3] += 1

In [150]:

evnt_genders_np

Out[150]:

array([['Basketball', 926, 2460, 3386],
       ['Judo', 752, 1950, 2702],
       ['Football', 728, 5401, 6129],
       ['Tug-Of-War', 0, 155, 155],
       ['Athletics', 6501, 15406, 21907],
       ['Swimming', 3603, 5000, 8603],
       ['Badminton', 409, 396, 805],
       ['Sailing', 628, 3826, 4454],
       ['Gymnastics', 1491, 2592, 4083],
       ['Art Competitions', 203, 1587, 1790],
       ['Handball', 1017, 1658, 2675],
       ['Weightlifting', 354, 2487, 2841],
       ['Wrestling', 219, 4721, 4940],
       ['Water Polo', 335, 2194, 2529],
       ['Hockey', 990, 2798, 3788],
       ['Rowing', 1476, 6151, 7627],
       ['Fencing', 873, 3156, 4029],
       ['Equestrianism', 459, 1862, 2321],
       ['Shooting', 728, 4091, 4819],
       ['Boxing', 63, 5158, 5221],
       ['Taekwondo', 229, 241, 470],
       ['Cycling', 708, 5077, 5785],
       ['Diving', 622, 824, 1446],
       ['Canoeing', 697, 2485, 3182],
       ['Tennis', 485, 752, 1237],
       ['Modern Pentathlon', 113, 729, 842],
       ['Golf', 70, 148, 218],
       ['Softball', 363, 0, 363],
       ['Archery', 496, 609, 1105],
       ['Volleyball', 1118, 1371, 2489],
       ['Synchronized Swimming', 543, 0, 543],
       ['Table Tennis', 374, 369, 743],
       ['Baseball', 0, 752, 752],
       ['Rhythmic Gymnastics', 564, 0, 564],
       ['Rugby Sevens', 148, 151, 299],
       ['Trampolining', 44, 48, 92],
       ['Beach Volleyball', 175, 188, 363],
       ['Triathlon', 171, 180, 351],
       ['Rugby', 0, 151, 151],
       ['Lacrosse', 0, 60, 60],
       ['Polo', 0, 86, 86],
       ['Cricket', 0, 24, 24],
       ['Ice Hockey', 0, 60, 60],
       ['Racquets', 0, 5, 5],
       ['Motorboating', 1, 12, 13],
       ['Croquet', 3, 7, 10],
       ['Figure Skating', 18, 26, 44],
       ['Jeu De Paume', 0, 10, 10],
       ['Roque', 0, 4, 4],
       ['Basque Pelota', 0, 2, 2],
       ['Alpinism', 1, 3, 4],
       ['Aeronautics', 0, 1, 1]], dtype=object)

In [151]:

evnt_genders = pd.DataFrame(evnt_genders_np, columns = ["Discipline", "Female", "Male", "Total"])

In [152]:

evnt_genders

Out[152]:

	Discipline	Female	Male	Total
0	Basketball	926	2460	3386
1	Judo	752	1950	2702
2	Football	728	5401	6129
3	Tug-Of-War	0	155	155
4	Athletics	6501	15406	21907
5	Swimming	3603	5000	8603
6	Badminton	409	396	805
7	Sailing	628	3826	4454
8	Gymnastics	1491	2592	4083
9	Art Competitions	203	1587	1790
10	Handball	1017	1658	2675
11	Weightlifting	354	2487	2841
12	Wrestling	219	4721	4940
13	Water Polo	335	2194	2529
14	Hockey	990	2798	3788
15	Rowing	1476	6151	7627
16	Fencing	873	3156	4029
17	Equestrianism	459	1862	2321
18	Shooting	728	4091	4819
19	Boxing	63	5158	5221
20	Taekwondo	229	241	470
21	Cycling	708	5077	5785
22	Diving	622	824	1446
23	Canoeing	697	2485	3182
24	Tennis	485	752	1237
25	Modern Pentathlon	113	729	842
26	Golf	70	148	218
27	Softball	363	0	363
28	Archery	496	609	1105
29	Volleyball	1118	1371	2489
30	Synchronized Swimming	543	0	543
31	Table Tennis	374	369	743
32	Baseball	0	752	752
33	Rhythmic Gymnastics	564	0	564
34	Rugby Sevens	148	151	299
35	Trampolining	44	48	92
36	Beach Volleyball	175	188	363
37	Triathlon	171	180	351
38	Rugby	0	151	151
39	Lacrosse	0	60	60
40	Polo	0	86	86
41	Cricket	0	24	24
42	Ice Hockey	0	60	60
43	Racquets	0	5	5
44	Motorboating	1	12	13
45	Croquet	3	7	10
46	Figure Skating	18	26	44
47	Jeu De Paume	0	10	10
48	Roque	0	4	4
49	Basque Pelota	0	2	2
50	Alpinism	1	3	4
51	Aeronautics	0	1	1

In [153]:

all_genders = evnt_genders.merge(tokyo_genders, how = 'left', on = ["Discipline"])

In [154]:

tokyo_genders[tokyo_genders["Discipline"] == "Judo"]

Out[154]:

	Discipline	Female	Male	Total
24	Judo	192	201	393

In [155]:

evnt_genders[evnt_genders["Discipline"] == "Judo"]

Out[155]:

	Discipline	Female	Male	Total
1	Judo	752	1950	2702

In [156]:

all_genders[["Female", "Male", "Total"]] = 0

In [157]:

all_genders["Female"] = all_genders[["Female_y", "Female_x"]].sum(axis = 1).astype(int)
all_genders["Male"] = all_genders[["Male_y", "Male_x"]].sum(axis = 1).astype(int)
all_genders["Total"] = all_genders[["Total_y", "Total_x"]].sum(axis = 1).astype(int)

In [158]:

all_genders = all_genders.drop(["Female_x", "Male_x", "Total_x", "Female_y", "Male_y", "Total_y"], axis = 1)

In [159]:

all_genders = all_genders.sort_values(by = "Total", ascending = False)
all_genders = all_genders.reset_index(drop = True)

In [160]:

all_genders.head()

Out[160]:

	Discipline	Female	Male	Total
0	Athletics	7470	16478	23948
1	Swimming	3964	5418	9382
2	Rowing	1733	6416	8149
3	Football	992	5745	6737
4	Cycling	708	5077	5785

In [161]:

all_genders.to_csv(filepath + 'Gender Distribution per Discipline.csv')

In [162]:

plt.figure(figsize = (16, 8))
sns.scatterplot(data = all_genders, 
                x = "Male", 
                y = "Female", 
                hue = "Total", 
                size = "Total", 
                sizes = (20, 200))
plt.xlabel("Male Participants", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.ylabel("Female Participants", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Gender Distribution by Discipline", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})
for i in range(0,15):
    ax = plt.text(x = all_genders.Male[i], 
             y = all_genders.Female[i], 
             s = all_genders.Discipline[i], 
             fontdict = dict(color='green',size=10)
             #bbox = dict(facecolor='yellow',alpha=0.5)
                 )
    ax.set_alpha(.8)
    
plt.savefig(imagepath + 'Gender Distribution by Discipline.png')

In [163]:

athlete_events_s.head(4)

Out[163]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Event	Medal
0	1	A Dijiang	M	24.0	180.0	80.0	China	CHN	1992	Barcelona	Basketball	Basketball Men's Basketball	NaN
1	2	A Lamusi	M	23.0	170.0	60.0	China	CHN	2012	London	Judo	Judo Men's Extra-Lightweight	NaN
2	3	Gunnar Nielsen Aaby	M	24.0	NaN	NaN	Denmark	DEN	1920	Antwerpen	Football	Football Men's Football	NaN
3	4	Edgar Lindenau Aabye	M	34.0	NaN	NaN	Denmark/Sweden	DEN	1900	Paris	Tug-Of-War	Tug-Of-War Men's Tug-Of-War	Gold

In [164]:

succ_athletes = athlete_events_s[~athlete_events_s["Medal"].isna()].drop("Event", axis = 1)
succ_athletes.head(3) #dataframe

Out[164]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Medal
3	4	Edgar Lindenau Aabye	M	34.0	NaN	NaN	Denmark/Sweden	DEN	1900	Paris	Tug-Of-War	Gold
37	15	Arvo Ossian Aaltonen	M	30.0	NaN	NaN	Finland	FIN	1920	Antwerpen	Swimming	Bronze
38	15	Arvo Ossian Aaltonen	M	30.0	NaN	NaN	Finland	FIN	1920	Antwerpen	Swimming	Bronze

In [165]:

succ_athletes.Name.unique().size #34088 and unique ones 24545

Out[165]:

In [166]:

succ_athletes_np = succ_athletes.drop(["City", "Team"], axis = 1).to_numpy()

In [167]:

succ_athletes_np[[0,1,2,3]] #array

Out[167]:

array([[4, 'Edgar Lindenau Aabye', 'M', 34.0, nan, nan, 'DEN', 1900,
        'Tug-Of-War', 'Gold'],
       [15, 'Arvo Ossian Aaltonen', 'M', 30.0, nan, nan, 'FIN', 1920,
        'Swimming', 'Bronze'],
       [15, 'Arvo Ossian Aaltonen', 'M', 30.0, nan, nan, 'FIN', 1920,
        'Swimming', 'Bronze'],
       [17, 'Paavo Johannes Aaltonen', 'M', 28.0, 175.0, 64.0, 'FIN',
        1948, 'Gymnastics', 'Bronze']], dtype=object)

In [168]:

succ_ath = pd.DataFrame(columns = ["Name", "NOC", "Apps", "Gold", "Silver", "Bronze", "Total", "Sport"])
succ_ath["Name"] = succ_athletes.Name
succ_ath["NOC"] = succ_athletes.NOC
succ_ath["Sport"] = succ_athletes.Sport

In [169]:

succ_ath["Apps"] = 0
succ_ath["Gold"] = 0
succ_ath["Silver"] = 0
succ_ath["Bronze"] = 0
succ_ath["Total"] = 0

In [170]:

succ_ath = succ_ath.drop_duplicates("Name")

In [171]:

succ_ath.head(3)

Out[171]:

	Name	NOC	Sport
3	Edgar Lindenau Aabye	DEN	Tug-Of-War
37	Arvo Ossian Aaltonen	FIN	Swimming
41	Paavo Johannes Aaltonen	FIN	Gymnastics

In [172]:

succ_ath_np = succ_ath.to_numpy()

In [173]:

succ_athletes_np[succ_athletes_np[:,1] == "Michael Fred Phelps, II"].shape[0]

Out[173]:

In [174]:

for item in succ_ath_np[:,0]:
    if item.find("Phelps") != -1:
        print(item)

Brian Eric Phelps
Jaycie Lynn Phelps (-McClure, -Marus)
Mason Elliott Phelps
Michael Fred Phelps, II
Richard Lawson Phelps, Jr.
Harlow Phelps Rothert

In [175]:

athlete_events_np = athlete_events_s.to_numpy()

In [176]:

athlete_events_np[athlete_events_np[:,1] == "Michael Fred Phelps, II"].shape[0]

Out[176]:

In [177]:

succ_ath_np[0]

Out[177]:

array(['Edgar Lindenau Aabye', 'DEN', 0, 0, 0, 0, 0, 'Tug-Of-War'],
      dtype=object)

In [178]:

for i in range(0, succ_athletes_np.shape[0]):
    name = succ_athletes_np[i,1]
    succ_ath_np[succ_ath_np[:,0] == name, 5] = succ_athletes_np[succ_athletes_np[:,1] == name].shape[0] 
    if succ_athletes_np[i, 9] == "Gold":
        succ_ath_np[succ_ath_np[:,0] == name, 2] += 1
    elif succ_athletes_np[i, 9] == "Silver":
        succ_ath_np[succ_ath_np[:,0] == name, 3] += 1
    else : #succ_athletes_np[i, 9] == "Bronze":
        succ_ath_np[succ_ath_np[:,0] == name, 4] += 1

In [179]:

succ_ath_np[succ_ath_np[:,0] == "Michael Fred Phelps, II"]
#succ_ath_np[succ_ath_np[:,0] == name, 6] = athlete_events_np[athlete_events_np[:,1] == name].shape[0]

Out[179]:

array([['Michael Fred Phelps, II', 'USA', 23, 3, 2, 28, 0, 'Swimming']],
      dtype=object)

In [180]:

ath_apps = pd.DataFrame(columns = ["Name", "Count"])
ath_apps["Name"] = athlete_events_s["Name"].value_counts().index
ath_apps["Count"] = athlete_events_s["Name"].value_counts().values

In [181]:

ath_apps[ath_apps["Name"] == "Michael Fred Phelps, II"]

Out[181]:

	Name	Count
14	Michael Fred Phelps, II	30

In [182]:

ath_apps_np = ath_apps.to_numpy()

In [183]:

succ_ath_np[succ_ath_np[:,0] == "Michael Fred Phelps, II"]

Out[183]:

array([['Michael Fred Phelps, II', 'USA', 23, 3, 2, 28, 0, 'Swimming']],
      dtype=object)

In [184]:

for name in succ_ath["Name"].values:
    succ_ath_np[succ_ath_np[:,0] == name, 6] = ath_apps_np[ath_apps_np[:,0] == name, 1]

In [185]:

succ_ath = pd.DataFrame(succ_ath_np, columns = ["Name", "NOC", "Gold", "Silver", "Bronze", "Total", "Apps", "Sport"])

In [186]:

succ_ath = succ_ath.sort_values(by = "Total", ascending = False)

In [187]:

succ_ath = succ_ath.reset_index(drop = True)

In [188]:

succ_ath["Ratio(T/A)"] = ""

In [189]:

succ_ath["Ratio(T/A)"] = succ_ath["Total"]/succ_ath["Apps"]

In [190]:

succ_ath["Ratio(T/A)"] = succ_ath["Ratio(T/A)"].apply(lambda x: round(x, 2))

In [191]:

succ_ath = succ_ath[["Name", "NOC", "Gold", "Silver", "Bronze", "Total", "Apps", "Ratio(T/A)", "Sport"]]

In [192]:

succ_ath.head(20)

Out[192]:

	Name	NOC	Gold	Silver	Bronze	Total	Apps	Ratio(T/A)	Sport
0	Michael Fred Phelps, II	USA	23	3	2	28	30	0.93	Swimming
1	Larysa Semenivna Latynina (Diriy-)	URS	9	5	4	18	19	0.95	Gymnastics
2	Nikolay Yefimovich Andrianov	URS	7	5	3	15	24	0.62	Gymnastics
3	Borys Anfiyanovych Shakhlin	URS	7	4	2	13	24	0.54	Gymnastics
4	Takashi Ono	JPN	5	4	4	13	33	0.39	Gymnastics
5	Edoardo Mangiarotti	ITA	6	5	2	13	14	0.93	Fencing
6	Paavo Johannes Nurmi	FIN	9	3	0	12	12	1.00	Athletics
7	Birgit Fischer-Schmidt	GDR	8	4	0	12	13	0.92	Canoeing
8	Jennifer Elisabeth "Jenny" Thompson (-Cumpelik)	USA	8	3	1	12	17	0.71	Swimming
9	Sawao Kato	JPN	8	3	1	12	24	0.50	Gymnastics
10	Natalie Anne Coughlin (-Hall)	USA	3	4	5	12	12	1.00	Swimming
11	Dara Grace Torres (-Hoffman, -Minas)	USA	4	4	4	12	13	0.92	Swimming
12	Ryan Steven Lochte	USA	6	3	3	12	14	0.86	Swimming
13	Aleksey Yuryevich Nemov	RUS	4	2	6	12	21	0.57	Gymnastics
14	Mark Andrew Spitz	USA	9	1	1	11	12	0.92	Swimming
15	Viktor Ivanovych Chukarin	URS	7	3	1	11	16	0.69	Gymnastics
16	Matthew Nicholas "Matt" Biondi	USA	8	2	1	11	12	0.92	Swimming
17	Vra slavsk (-Odloilov)	TCH	7	4	0	11	18	0.61	Gymnastics
18	Carl Townsend Osburn	USA	5	4	2	11	16	0.69	Shooting
19	Franziska van Almsick	GER	0	4	6	10	23	0.43	Swimming

In [193]:

succ_ath.to_csv(filepath + 'Count of Medals per Athlete.csv')

In [194]:

color = ['#FFD700', '#C0C0C0', '#B08D57']

succ_ath.drop(["Total", "Apps", "Sport"], axis = 1).head(20).plot(x = "Name",
                                               kind = 'barh', 
                                               stacked = True, 
                                               figsize = (15, 12), 
                                               color = color)

plt.ylabel("Athletes", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.xlabel("Medals Won", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Most Medals Won by Top 20 Athletes", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})
plt.gca().invert_yaxis()

plt.savefig(imagepath + 'Most Medals Won by Top 20 Athletes.png')

In [195]:

top_30 = succ_ath.sort_values(by = "Total", ascending = False).head(30)

In [226]:

plt.figure(figsize = (10,12))
sns.set(style="whitegrid", color_codes=True)

data = top_30.groupby("Ratio(T/A)").size()   
pal = sns.color_palette("PuBu", len(data))

rank = data.argsort().argsort()   

sns.barplot(x = top_30.sort_values(by = "Ratio(T/A)", ascending = False).head(30)["Ratio(T/A)"], 
            y = top_30.sort_values(by = "Ratio(T/A)", ascending = False).head(30)["Name"], 
            palette = np.array(pal[::-1])[rank])

plt.ylabel("Athletes", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.xlabel("Ratio of Medals Won", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Distibution of Medals and Count of Participations Among the Top 30 Athletes", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})

plt.savefig(imagepath + 'Ratio of Medals won Among the Top 30 Athletes.png', bbox_inches='tight')

In [198]:

#so my goal here is to see the distribution of age among gold winnig athletes and the sports played since 2000
#something like the plot in this website https://venngage.com/blog/olympics/
succ_athletes.head(2)

Out[198]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Medal
3	4	Edgar Lindenau Aabye	M	34.0	NaN	NaN	Denmark/Sweden	DEN	1900	Paris	Tug-Of-War	Gold
37	15	Arvo Ossian Aaltonen	M	30.0	NaN	NaN	Finland	FIN	1920	Antwerpen	Swimming	Bronze

In [199]:

succ_athletes["Sport"].unique()

Out[199]:

array(['Tug-Of-War', 'Swimming', 'Gymnastics', 'Handball', 'Hockey',
       'Rowing', 'Football', 'Sailing', 'Cycling', 'Fencing', 'Taekwondo',
       'Athletics', 'Canoeing', 'Water Polo', 'Wrestling',
       'Modern Pentathlon', 'Golf', 'Softball', 'Boxing', 'Basketball',
       'Diving', 'Baseball', 'Volleyball', 'Shooting', 'Judo',
       'Equestrianism', 'Tennis', 'Rugby Sevens', 'Rhythmic Gymnastics',
       'Weightlifting', 'Badminton', 'Beach Volleyball', 'Rugby',
       'Lacrosse', 'Synchronized Swimming', 'Archery', 'Triathlon',
       'Polo', 'Table Tennis', 'Cricket', 'Racquets', 'Croquet',
       'Art Competitions', 'Ice Hockey', 'Roque', 'Figure Skating',
       'Trampolining', 'Basque Pelota', 'Alpinism', 'Motorboating',
       'Jeu De Paume', 'Aeronautics'], dtype=object)

In [200]:

pop_sports = ['Gymnastics', 'Swimming', 'Athletics', 'Football', 'Weightlifting', 'Basketball', 'Volleyball', 
              'Water Sport', 'Tennis', 'Rugby']

# 'Water Sports' = ['Diving', 'Rowing', 'Sailing', 'Rhythmic Gymnastics', 'Synchronized Swimming']

In [201]:

succ_athletes.loc[succ_athletes["Name"] == 'Thomas Robert "Tom" Daley', "Sport"]

Out[201]:

48440    Diving
48443    Diving
Name: Sport, dtype: object

In [202]:

succ_athletes['Sport'].replace({'Diving' : 'Water Sport', 
                                'Rowing' : 'Water Sport',
                                'Sailing' : 'Water Sport', 
                                'Rhythmic Gymnastics' : 'Water Sport', 
                                'Synchronized Swimming' : 'Water Sport'}, 
                               inplace = True)

In [203]:

age_distr = succ_athletes[succ_athletes["Sport"].isin(pop_sports) & 
                          (succ_athletes["Medal"] == "Gold")]

In [204]:

age_distr[(age_distr["Year"] == 2012) & (age_distr["Sport"] == "Water Sport")]

Out[204]:

	ID	Name	Sex	Age	Height	Weight	Team	NOC	Year	City	Sport	Medal
1533	849	Filip Kamil Adamski	M	29.0	189.0	87.0	Germany	GER	2012	London	Water Sport	Gold
2692	1498	Charles Benedict "Ben" Ainslie	M	35.0	183.0	90.0	Great Britain	GBR	2012	London	Water Sport	Gold
3833	2210	Marina Alabau Neira	F	26.0	164.0	55.0	Spain	ESP	2012	London	Water Sport	Gold
4317	2464	Jo Qesem Ayela Aleh	F	26.0	171.0	58.0	New Zealand	NZL	2012	London	Water Sport	Gold
17010	9111	Anna Rose Bebington-Watkins	F	29.0	183.0	78.0	Great Britain	GBR	2012	London	Water Sport	Gold
...	...	...	...	...	...	...	...	...	...	...	...	...
263399	131805	Wu Minxia	F	26.0	167.0	52.0	China	CHN	2012	London	Water Sport	Gold
263400	131805	Wu Minxia	F	26.0	167.0	52.0	China	CHN	2012	London	Water Sport	Gold
263936	132072	Xu Lijia	F	24.0	176.0	60.0	China	CHN	2012	London	Water Sport	Gold
267384	133777	Ilya Leonidovich Zakharov	M	21.0	177.0	70.0	Russia	RUS	2012	London	Water Sport	Gold
269010	134595	Zhang Yanquan	M	18.0	158.0	52.0	China	CHN	2012	London	Water Sport	Gold

94 rows × 12 columns

In [205]:

age_event = pd.DataFrame(columns = ["Year"] + pop_sports)

In [206]:

age_event["Year"] = sorted(age_distr["Year"].unique())

In [207]:

age_event.head()

Out[207]:

	Year	Gymnastics	Swimming	Athletics	Football	Weightlifting	Basketball	Volleyball	Water Sport	Tennis	Rugby
0	1896	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1900	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1904	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1906	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1908	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [208]:

age_distr.loc[(age_distr["Sport"] == "Swimming") & (age_distr["Year"] == 2016), "Age"].mean()

Out[208]:

23.267605633802816

In [209]:

for yr in age_event["Year"].values:
    for evnt in pop_sports:
        age_event.loc[age_event["Year"] == yr, evnt] = age_distr.loc[(age_distr["Sport"] == evnt) & (age_distr["Year"] == yr), "Age"].mean()

In [210]:

# some sports weren't introduced to the olympics until later and some existed but kept recurring over and over so i'll just 
# replace the missing values with the mean
age_event = age_event.fillna(age_event.mean())

In [211]:

age_event

Out[211]:

	Year	Gymnastics	Swimming	Athletics	Football	Weightlifting	Basketball	Volleyball	Water Sport	Tennis	Rugby
0	1896	25.045455	18.666667	21.916667	24.684524	21.000000	25.263921	25.711675	26.376283	23.333333	24.197658
1	1900	28.000000	21.750000	24.148148	23.857143	25.156349	25.263921	25.711675	29.380952	26.666667	23.928571
2	1904	26.736842	22.461538	24.571429	25.083333	21.000000	25.263921	25.711675	25.294118	23.666667	24.197658
3	1906	24.772727	21.000000	26.428571	23.000000	24.500000	25.263921	25.711675	21.384615	22.600000	24.197658
4	1908	22.487179	22.625000	25.705882	26.000000	25.156349	25.263921	25.711675	32.289474	35.375000	24.266667
5	1912	24.119403	21.533333	23.566038	25.642857	25.156349	25.263921	25.711675	28.080000	27.916667	24.197658
6	1920	25.722222	21.437500	24.954545	25.928571	27.400000	25.263921	25.711675	30.206897	27.750000	23.437500
7	1924	28.000000	18.944444	24.697674	24.714286	26.200000	25.263921	25.711675	25.378378	26.750000	25.157895
8	1928	24.666667	19.809524	22.777778	26.117647	25.166667	25.263921	25.711675	25.750000	26.115625	24.197658
9	1932	24.466667	19.000000	22.868421	24.684524	26.000000	25.263921	25.711675	26.306122	26.115625	24.197658
10	1936	25.478261	18.647059	23.657895	22.214286	24.333333	24.857143	25.711675	26.750000	26.115625	24.197658
11	1948	29.400000	20.809524	26.452381	28.583333	27.166667	24.428571	25.711675	27.023256	26.115625	24.197658
12	1952	26.222222	20.363636	24.619048	24.800000	26.000000	23.214286	25.711675	26.065217	26.115625	24.197658
13	1956	26.906250	18.772727	23.761905	26.222222	24.714286	23.833333	25.711675	26.390244	26.115625	24.197658
14	1960	26.153846	18.350000	24.348837	23.062500	27.714286	22.833333	25.711675	24.195122	26.115625	24.197658
15	1964	25.625000	18.380000	24.577778	23.714286	27.428571	23.500000	24.608696	27.951220	26.115625	24.197658
16	1968	22.576923	19.550000	24.133333	23.823529	28.285714	23.000000	25.086957	24.595238	26.115625	24.197658
17	1972	22.000000	18.816667	24.800000	24.764706	26.000000	25.500000	25.958333	27.139535	26.115625	24.197658
18	1976	21.875000	18.375000	23.897959	25.764706	27.555556	22.958333	24.375000	24.126761	26.115625	24.197658
19	1980	20.760000	18.613636	24.754717	26.562500	24.300000	26.083333	24.652174	24.873239	26.115625	24.197658
20	1984	20.142857	20.203390	24.666667	24.625000	25.000000	21.833333	24.954545	25.146667	26.115625	24.197658
21	1988	19.250000	20.596491	25.431034	24.833333	24.800000	24.500000	24.583333	25.644737	23.833333	24.197658
22	1992	18.461538	21.133333	25.967213	21.294118	24.100000	26.916667	23.727273	25.897436	22.833333	24.197658
23	1996	20.115385	22.287879	26.883333	22.903226	24.800000	28.166667	25.863636	26.082353	26.166667	24.197658
24	2000	20.208333	22.454545	27.047619	23.028571	25.533333	28.347826	26.736842	26.389474	23.166667	24.197658
25	2004	20.541667	21.936508	25.564516	25.666667	24.400000	27.750000	26.500000	27.218750	24.000000	24.197658
26	2008	21.166667	22.873239	24.825397	23.771429	23.066667	27.083333	28.625000	26.187500	25.166667	24.197658
27	2012	21.000000	22.814286	25.953125	25.818182	23.333333	27.000000	27.291667	25.723404	30.250000	24.197658
28	2016	22.636364	23.267606	26.575758	24.685714	24.266667	28.208333	27.000000	27.065217	28.375000	24.197658

In [212]:

age_event.to_csv(filepath + 'Average Age of Popular Sports.csv')

In [213]:

fig,ax = plt.subplots(figsize = (16, 8))

for evnt in pop_sports[:5]:
    
    model = make_interp_spline(age_event["Year"], age_event[evnt])
    xs = np.linspace(1896, 2016 ,500)
    ys = model(xs)
    
    ax.scatter(age_event["Year"], age_event[evnt], marker = 'x')
    data = ax.plot(xs, ys, label = evnt)

plt.xlabel("Years", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.ylabel("Age", 
           fontdict = {"fontsize" : 12, 
                       "fontweight" : "bold"})
plt.title("Mean Age Distribution of Olympic Sports 1896-2016", 
          fontdict = {"fontsize" : 16, 
                      "fontweight" : "bold"})

ax.set_xticks(age_event["Year"])
plt.xticks(rotation = 60)
plt.legend(loc = 'best', ncol=len(pop_sports), framealpha = 0.4)

plt.savefig(imagepath + 'Mean Age Distribution of Popular Sports 1896-2016.png')

In [214]:

# how the height/weight of athletes changed over time for these main sports
# 'Gymnastics', 'Swimming', 'Athletics', 'Weightlifting', 'Basketball', 'Water Sport'
# i'll stick with the successful athletes

In [215]:

physique_h = succ_athletes[succ_athletes["Height"].notnull()].sort_values(by = "Year").drop(["Name", "Age", "Team", "City"], axis = 1)

In [216]:

physique_h.head(2)

Out[216]:

	ID	Sex	Height	Weight	NOC	Year	Sport	Medal
44063	22700	M	175.0	72.0	USA	1896	Athletics	Bronze
105834	53542	M	173.0	69.0	USA	1896	Athletics	Silver

In [217]:

#For MEN in BASKETBALL:
plt.figure(figsize = (16, 7))
sns.boxplot(x = "Year", 
            y = "Height", 
            data = physique_h[(physique_h["Sex"] == "M") & (physique_h["Sport"] == 'Basketball')])

plt.title("Evolution of Height among Basketball Male Players", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Height among Basketball Male Players.png')

In [218]:

#For MEN in SWIMMING:
plt.figure(figsize = (16, 7))
sns.boxplot(x = "Year", 
            y = "Height",
            data = physique_h[(physique_h["Sex"] == "F") & (physique_h["Sport"] == 'Swimming')])

plt.title("Evolution of Height among Male Swimmers", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Height among Male Swimmers.png')

In [219]:

#For WOMEN in GYMNASTICS:
plt.figure(figsize = (16, 7))
sns.boxplot(x = "Year", 
            y = "Height", 
            data = physique_h[(physique_h["Sex"] == "F") & (physique_h["Sport"] == 'Gymnastics')])

plt.title("Evolution of Height among Female Gymnasts", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Height among Female Gymnasts.png')

In [220]:

#For WOMEN in ATHLETICS:
plt.figure(figsize = (16, 7))
sns.boxplot(x = "Year", 
            y = "Height", 
            hue = "Sex",
            data = physique_h[(physique_h["Sport"] == 'Athletics')])
plt.legend(loc = 'best')
plt.title("Evolution of Height among Athletes in Athletics", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Height among Athletes in Athletics.png')

In [221]:

physique_w = succ_athletes[succ_athletes["Weight"].notnull()].sort_values(by = "Year").drop(["Name", "Age", "Team", "City"], axis = 1)

In [222]:

plt.figure(figsize = (16, 8))
sns.pointplot(x = "Year", 
              y = "Weight", 
              hue = "Sex",
              data = physique_w[(physique_w["Sport"] == 'Weightlifting')])

plt.title("Evolution of Weight among Weightlifting Athletes", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Weight among Weightlifting Athletes.png')

In [223]:

plt.figure(figsize = (16, 8))
sns.pointplot(x = "Year", 
              y = "Weight", 
              hue = "Sex",
              data = physique_w[(physique_w["Sport"] == 'Gymnastics')])

plt.title("Evolution of Weight among Gymnasts", fontweight = 'bold')

plt.savefig(imagepath + 'Evolution of Weight among Gymnasts.png')