Dataset can be found at European Soccer Kaggle Dataset
by Hugo Mathien
You can download the sqlite db for this exercise from here
# import libraries
import pandas as pd
import sqlite3
# connect to database
# database.sqlite is the name of the database
db = sqlite3.connect("database.sqlite")
# prepare the sql statement
sql = "SELECT * FROM Player limit 5000"
# execute it on the database
player_df = pd.read_sql(sql, db)
# let's view the data
player_df.head()
id | player_api_id | player_name | player_fifa_api_id | birthday | height | weight | |
---|---|---|---|---|---|---|---|
0 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 |
1 | 2 | 155782 | Aaron Cresswell | 189615 | 1989-12-15 00:00:00 | 170.18 | 146 |
2 | 3 | 162549 | Aaron Doran | 186170 | 1991-05-13 00:00:00 | 170.18 | 163 |
3 | 4 | 30572 | Aaron Galindo | 140161 | 1982-05-08 00:00:00 | 182.88 | 198 |
4 | 5 | 23780 | Aaron Hughes | 17725 | 1979-11-08 00:00:00 | 182.88 | 154 |
# Now it is your turn to fetch 500 entries from Player_Attributes
sql = "SELECT * FROM Player_Attributes limit 5000"
atts_df = pd.read_sql(sql, db)
atts_df.head()
id | player_fifa_api_id | player_api_id | date | overall_rating | potential | preferred_foot | attacking_work_rate | defensive_work_rate | crossing | ... | vision | penalties | marking | standing_tackle | sliding_tackle | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 218353 | 505942 | 2016-02-18 00:00:00 | 67 | 71 | right | medium | medium | 49 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
1 | 2 | 218353 | 505942 | 2015-11-19 00:00:00 | 67 | 71 | right | medium | medium | 49 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
2 | 3 | 218353 | 505942 | 2015-09-21 00:00:00 | 62 | 66 | right | medium | medium | 49 | ... | 54 | 48 | 65 | 66 | 69 | 6 | 11 | 10 | 8 | 8 |
3 | 4 | 218353 | 505942 | 2015-03-20 00:00:00 | 61 | 65 | right | medium | medium | 48 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
4 | 5 | 218353 | 505942 | 2007-02-22 00:00:00 | 61 | 65 | right | medium | medium | 48 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
5 rows × 42 columns
Examine the two data frames and suggest a key to use to combine
# Let's try inner join
player_df.merge(atts_df,how="inner",on="player_api_id").head()
id_x | player_api_id | player_name | player_fifa_api_id_x | birthday | height | weight | id_y | player_fifa_api_id_y | date | ... | vision | penalties | marking | standing_tackle | sliding_tackle | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 1 | 218353 | 2016-02-18 00:00:00 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
1 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 2 | 218353 | 2015-11-19 00:00:00 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
2 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 3 | 218353 | 2015-09-21 00:00:00 | ... | 54 | 48 | 65 | 66 | 69 | 6 | 11 | 10 | 8 | 8 |
3 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 4 | 218353 | 2015-03-20 00:00:00 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
4 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 5 | 218353 | 2007-02-22 00:00:00 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
5 rows × 48 columns
# Outer join will give us an idea of where the data went
player_df.merge(atts_df,how="outer",on="player_api_id").head()
id_x | player_api_id | player_name | player_fifa_api_id_x | birthday | height | weight | id_y | player_fifa_api_id_y | date | ... | vision | penalties | marking | standing_tackle | sliding_tackle | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 1 | 218353 | 2016-02-18 00:00:00 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
1 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 2 | 218353 | 2015-11-19 00:00:00 | ... | 54 | 48 | 65 | 69 | 69 | 6 | 11 | 10 | 8 | 8 |
2 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 3 | 218353 | 2015-09-21 00:00:00 | ... | 54 | 48 | 65 | 66 | 69 | 6 | 11 | 10 | 8 | 8 |
3 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 4 | 218353 | 2015-03-20 00:00:00 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
4 | 1 | 505942 | Aaron Appindangoye | 218353 | 1992-02-29 00:00:00 | 182.88 | 187 | 5 | 218353 | 2007-02-22 00:00:00 | ... | 53 | 47 | 62 | 63 | 66 | 5 | 10 | 9 | 7 | 7 |
5 rows × 48 columns
# Try to compare both operations by check counts, and null values
# can you find differences?
# Can you explain why these differences exist?
Load players.csv and teams.csv into player_df and team_df respectively
# Load players and teams data here
player_df =
team_df =
player_df.head()
id_x | player_api_id | player_name | player_fifa_api_id_x | birthday | height | weight | id_y | player_fifa_api_id_y | date | ... | penalties | marking | standing_tackle | sliding_tackle | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | team_api_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
765 | 48 | 439366 | Abdoulaye Toure | 210450 | 1994-03-03 00:00:00 | 187.96 | 170 | 766 | 210450 | 2016-05-05 00:00:00 | ... | 39 | 57 | 62 | 58 | 8 | 15 | 9 | 9 | 7 | 8674 |
1050 | 68 | 37422 | Abella Perez Damia | 159580 | 1982-04-15 00:00:00 | 187.96 | 174 | 1051 | 159580 | 2016-01-28 00:00:00 | ... | 46 | 65 | 70 | 69 | 13 | 9 | 12 | 19 | 12 | 8674 |
2118 | 129 | 160447 | Adam Smith | 190885 | 1991-04-29 00:00:00 | 180.34 | 179 | 2119 | 190885 | 2015-11-12 00:00:00 | ... | 48 | 71 | 70 | 68 | 8 | 9 | 15 | 9 | 6 | 8674 |
4180 | 253 | 32547 | Alan Gow | 140307 | 1982-10-09 00:00:00 | 182.88 | 154 | 4181 | 140307 | 2015-02-27 00:00:00 | ... | 62 | 25 | 23 | 25 | 6 | 9 | 10 | 12 | 9 | 8674 |
3081 | 182 | 168047 | Adrian Stoian | 192072 | 1991-02-11 00:00:00 | 177.80 | 146 | 3082 | 192072 | 2016-02-04 00:00:00 | ... | 65 | 28 | 30 | 32 | 10 | 11 | 7 | 11 | 6 | 8674 |
5 rows × 49 columns
team_df.head()
id_x | team_api_id | team_fifa_api_id_x | team_long_name | team_short_name | id_y | team_fifa_api_id_y | date | buildUpPlaySpeed | buildUpPlaySpeedClass | ... | chanceCreationShooting | chanceCreationShootingClass | chanceCreationPositioningClass | defencePressure | defencePressureClass | defenceAggression | defenceAggressionClass | defenceTeamWidth | defenceTeamWidthClass | defenceDefenderLineClass | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
317 | 26548 | 8674 | 1915 | FC Groningen | GRO | 427 | 1915 | 2010-02-22 00:00:00 | 41 | Balanced | ... | 69 | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover |
150 | 11822 | 4087 | 111271 | Évian Thonon Gaillard FC | ETG | 411 | 111271 | 2011-02-22 00:00:00 | 35 | Balanced | ... | 65 | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover |
456 | 43053 | 9906 | 240 | Atlético Madrid | AMA | 95 | 240 | 2010-02-22 00:00:00 | 64 | Balanced | ... | 70 | Lots | Free Form | 70 | High | 34 | Press | 55 | Normal | Offside Trap |
374 | 35284 | 9807 | 1889 | CF Os Belenenses | BEL | 156 | 1889 | 2010-02-22 00:00:00 | 30 | Slow | ... | 60 | Normal | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Offside Trap |
355 | 27780 | 10218 | 1971 | Excelsior | EXC | 416 | 1971 | 2011-02-22 00:00:00 | 73 | Fast | ... | 52 | Normal | Organised | 25 | Deep | 47 | Press | 33 | Narrow | Cover |
5 rows × 29 columns
team_df[["team_api_id","buildUpPlaySpeed"]].head()
team_api_id | buildUpPlaySpeed | |
---|---|---|
317 | 8674 | 41 |
150 | 4087 | 35 |
456 | 9906 | 64 |
374 | 9807 | 30 |
355 | 10218 | 73 |
player_df[["player_api_id","team_api_id","overall_rating"]].head()
player_api_id | team_api_id | overall_rating | |
---|---|---|---|
765 | 439366 | 8674 | 64 |
1050 | 37422 | 8674 | 71 |
2118 | 160447 | 8674 | 70 |
4180 | 32547 | 8674 | 63 |
3081 | 168047 | 8674 | 70 |
# First we have to merge based on team_api_id
merged_df = player_df.merge(team_df, how="inner", on="team_api_id")
merged_df.columns
Index(['id_x_x', 'player_api_id', 'player_name', 'player_fifa_api_id_x', 'birthday', 'height', 'weight', 'id_y_x', 'player_fifa_api_id_y', 'date_x', 'overall_rating', 'potential', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes', 'team_api_id', 'id_x_y', 'team_fifa_api_id_x', 'team_long_name', 'team_short_name', 'id_y_y', 'team_fifa_api_id_y', 'date_y', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribbling', 'buildUpPlayDribblingClass', 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassing', 'chanceCreationPassingClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass', 'chanceCreationShooting', 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass', 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth', 'defenceTeamWidthClass', 'defenceDefenderLineClass'], dtype='object')
# notice that team entries are duplicated
# and that we have an entry for every player
merged_df[["player_name","team_long_name","overall_rating","buildUpPlaySpeed"]].head()
player_name | team_long_name | overall_rating | buildUpPlaySpeed | |
---|---|---|---|---|
0 | Abdoulaye Toure | FC Groningen | 64 | 41 |
1 | Abella Perez Damia | FC Groningen | 71 | 41 |
2 | Adam Smith | FC Groningen | 70 | 41 |
3 | Alan Gow | FC Groningen | 63 | 41 |
4 | Adrian Stoian | FC Groningen | 70 | 41 |
# To analyze teams, you must aggregate player observations if you want to include them
# otherwise, you have to filter on team attributes and remove duplicates
# let's aggregate overall_rating by calculating the average for the players
merged_df[
["player_name","team_api_id","team_long_name","overall_rating","buildUpPlaySpeed"]
].groupby("team_api_id").agg({"overall_rating":"mean"})
overall_rating | |
---|---|
team_api_id | |
4087 | 62.090909 |
7788 | 67.909091 |
7819 | 68.000000 |
8262 | 67.818182 |
8322 | 66.000000 |
8342 | 67.090909 |
8526 | 69.636364 |
8535 | 69.000000 |
8559 | 67.545455 |
8674 | 68.000000 |
9789 | 66.727273 |
9807 | 69.363636 |
9810 | 67.363636 |
9825 | 65.454545 |
9826 | 70.636364 |
9880 | 72.363636 |
9906 | 66.272727 |
9987 | 69.000000 |
10218 | 64.909091 |
208931 | 67.454545 |
# simply merge it to team_df to start analyzing teams
# but dont forget to reset_index to convert the index into a regular column
ratings_df = merged_df[
["player_name","team_api_id","team_long_name","overall_rating","buildUpPlaySpeed"]
].groupby("team_api_id").agg({"overall_rating":"mean"}).reset_index()
team_df.merge(ratings_df, how="inner", on="team_api_id").head()
id_x | team_api_id | team_fifa_api_id_x | team_long_name | team_short_name | id_y | team_fifa_api_id_y | date | buildUpPlaySpeed | buildUpPlaySpeedClass | ... | chanceCreationShootingClass | chanceCreationPositioningClass | defencePressure | defencePressureClass | defenceAggression | defenceAggressionClass | defenceTeamWidth | defenceTeamWidthClass | defenceDefenderLineClass | overall_rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 26548 | 8674 | 1915 | FC Groningen | GRO | 427 | 1915 | 2010-02-22 00:00:00 | 41 | Balanced | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
1 | 11822 | 4087 | 111271 | Évian Thonon Gaillard FC | ETG | 411 | 111271 | 2011-02-22 00:00:00 | 35 | Balanced | ... | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover | 62.090909 |
2 | 43053 | 9906 | 240 | Atlético Madrid | AMA | 95 | 240 | 2010-02-22 00:00:00 | 64 | Balanced | ... | Lots | Free Form | 70 | High | 34 | Press | 55 | Normal | Offside Trap | 66.272727 |
3 | 35284 | 9807 | 1889 | CF Os Belenenses | BEL | 156 | 1889 | 2010-02-22 00:00:00 | 30 | Slow | ... | Normal | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Offside Trap | 69.363636 |
4 | 27780 | 10218 | 1971 | Excelsior | EXC | 416 | 1971 | 2011-02-22 00:00:00 | 73 | Fast | ... | Normal | Organised | 25 | Deep | 47 | Press | 33 | Narrow | Cover | 64.909091 |
5 rows × 30 columns
# you can perform your analysis directly on marged_df
# because the level of analysis is the player there
merged_df[["player_name","team_api_id","team_long_name","overall_rating","buildUpPlaySpeed"]].head()
player_name | team_api_id | team_long_name | overall_rating | buildUpPlaySpeed | |
---|---|---|---|---|---|
0 | Abdoulaye Toure | 8674 | FC Groningen | 64 | 41 |
1 | Abella Perez Damia | 8674 | FC Groningen | 71 | 41 |
2 | Adam Smith | 8674 | FC Groningen | 70 | 41 |
3 | Alan Gow | 8674 | FC Groningen | 63 | 41 |
4 | Adrian Stoian | 8674 | FC Groningen | 70 | 41 |
If you want to create a column in merged_df that contains the average overall_rating then you use transform
import numpy as np
merged_df.groupby("team_api_id").transform(np.mean).overall_rating.head(15)
0 68.000000 1 68.000000 2 68.000000 3 68.000000 4 68.000000 5 68.000000 6 68.000000 7 68.000000 8 68.000000 9 68.000000 10 68.000000 11 62.090909 12 62.090909 13 62.090909 14 62.090909 Name: overall_rating, dtype: float64
# simply assign this column to merged_df and give it an appropriate name
merged_df["mean_overall_rating"] = merged_df.groupby("team_api_id").transform(np.mean).overall_rating
merged_df.head(15)
id_x_x | player_api_id | player_name | player_fifa_api_id_x | birthday | height | weight | id_y_x | player_fifa_api_id_y | date_x | ... | chanceCreationShootingClass | chanceCreationPositioningClass | defencePressure | defencePressureClass | defenceAggression | defenceAggressionClass | defenceTeamWidth | defenceTeamWidthClass | defenceDefenderLineClass | mean_overall_rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 48 | 439366 | Abdoulaye Toure | 210450 | 1994-03-03 00:00:00 | 187.96 | 170 | 766 | 210450 | 2016-05-05 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
1 | 68 | 37422 | Abella Perez Damia | 159580 | 1982-04-15 00:00:00 | 187.96 | 174 | 1051 | 159580 | 2016-01-28 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
2 | 129 | 160447 | Adam Smith | 190885 | 1991-04-29 00:00:00 | 180.34 | 179 | 2119 | 190885 | 2015-11-12 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
3 | 253 | 32547 | Alan Gow | 140307 | 1982-10-09 00:00:00 | 182.88 | 154 | 4181 | 140307 | 2015-02-27 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
4 | 182 | 168047 | Adrian Stoian | 192072 | 1991-02-11 00:00:00 | 177.80 | 146 | 3082 | 192072 | 2016-02-04 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
5 | 246 | 34268 | Alain Nef | 49939 | 1982-02-06 00:00:00 | 190.50 | 194 | 4057 | 49939 | 2016-03-10 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
6 | 65 | 302985 | Abel Khaled | 207541 | 1992-11-09 00:00:00 | 180.34 | 148 | 1023 | 207541 | 2015-03-13 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
7 | 206 | 213366 | Afriyie Acquah | 201223 | 1992-01-05 00:00:00 | 177.80 | 154 | 3487 | 201223 | 2016-05-12 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
8 | 73 | 80592 | Aboubakar Oumarou | 218548 | 1987-04-01 00:00:00 | 182.88 | 168 | 1129 | 218548 | 2015-04-01 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
9 | 275 | 37503 | Albano Benjamin Bizzarri | 14907 | 1977-11-09 00:00:00 | 193.04 | 196 | 4532 | 14907 | 2015-11-26 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
10 | 37 | 173955 | Abdoul Razzagui Camara | 193953 | 1990-02-20 00:00:00 | 177.80 | 157 | 555 | 193953 | 2016-04-21 00:00:00 | ... | Lots | Organised | 30 | Deep | 30 | Contain | 30 | Narrow | Cover | 68.000000 |
11 | 247 | 182847 | Alain Pierre Mendy | 209352 | 1989-11-17 00:00:00 | 182.88 | 159 | 4088 | 209352 | 2013-03-15 00:00:00 | ... | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover | 62.090909 |
12 | 162 | 121643 | Adrian Chomiuk | 186629 | 1988-06-23 00:00:00 | 182.88 | 179 | 2749 | 186629 | 2010-08-30 00:00:00 | ... | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover | 62.090909 |
13 | 243 | 127255 | Akwetey Mensah | 198781 | 1983-04-15 00:00:00 | 172.72 | 163 | 4025 | 198781 | 2010-08-30 00:00:00 | ... | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover | 62.090909 |
14 | 51 | 419681 | Abdul Aziz Tetteh | 190193 | 1989-02-10 00:00:00 | 182.88 | 190 | 803 | 190193 | 2016-05-05 00:00:00 | ... | Normal | Organised | 45 | Medium | 55 | Press | 65 | Normal | Cover | 62.090909 |
15 rows × 78 columns
# At the team level of analysis
# create 4 new columns from player sprint_speed data:
# mean_sprint_speed, max_sprint_speed, min_sprint_speed, and std_sprint_speed
# at the player unit of analysis
# create 4 new columns from player sprint_speed data:
# mean_sprint_speed, max_sprint_speed, min_sprint_speed, and std_sprint_speed