-
Notifications
You must be signed in to change notification settings - Fork 12
/
notebook_functions.py
153 lines (118 loc) · 6.8 KB
/
notebook_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import pandas as pd
import numpy as np
from openml.datasets import get_dataset
from utils import read_pickle
import soccer_basics
from gtda.pipeline import Pipeline
from sub_space_extraction import SubSpaceExtraction
from gtda.homology import VietorisRipsPersistence
from cross_validation import extract_features_for_prediction
# Column names selected (in this order) by get_useful_cols.
# Keep the order unchanged: a list-based column selection returns the columns
# in exactly the order given here.
COLUMNS_TO_KEEP = [
    # Home side
    "home_best_attack", "home_best_defense",
    "home_avg_attack", "home_avg_defense",
    "home_std_attack", "home_std_defense",
    "gk_home_player_1",
    # Away side (avg/std come before best here, unlike the home side)
    "away_avg_attack", "away_avg_defense",
    "away_std_attack", "away_std_defense",
    "away_best_attack", "away_best_defense",
    "gk_away_player_1",
]
# Premier League team names handed to soccer_basics.printer_ranks by
# compute_final_standings, alongside the ascending-sorted team api ids.
# NOTE(review): presumably the order here matches ascending team api id --
# verify before reordering.
# NOTE(review): 'Queens Park Ranger' is probably meant to be
# 'Queens Park Rangers'; kept verbatim because it is a displayed string.
pl_team_names = [
    'Burnley',
    'Leicester City',
    'Chelsea',
    'Manchester City',
    'Southampton',
    'Sunderland',
    'Tottenham Hotspur',
    'Liverpool',
    'West Ham United',
    'West Bromwich Albion',
    'Hull City',
    'Everton',
    'Arsenal',
    'Crystal Palace',
    'Swansea City',
    'Queens Park Ranger',
    'Stoke City',
    'Aston Villa',
    'Manchester United',
    'Newcastle United',
]
# Serie A team names handed to soccer_basics.printer_ranks by
# compute_final_standings whenever the championship is not 'premier league'.
# NOTE(review): presumably the order here matches ascending team api id --
# verify before reordering.
serie_a_team_names = [
    'Sassuolo',
    'Atalanta',
    'Chievo Verona',
    'Empoli',
    'Fiorentina',
    'Palermo',
    'Lazio',
    'Milan',
    'Udinese',
    'Inter',
    'Roma',
    'Torino',
    'Bologna',
    'Napoli',
    'Hellas Verona',
    'Sampdoria',
    'Juventus',
    'Frosinone',
    'Genoa',
    'Carpi',
]
# Hard-coded summary table, one row per team. Every cell is stored as a string;
# the position delta is kept as an int below and rendered with an explicit sign
# ('+0', '+11', ...) while the frame is built.
# NOTE(review): a few probability cells carry a leading space (e.g. ' 0.25');
# they are reproduced verbatim.
teams_with_messi = pd.DataFrame(
    [
        (team, format(delta_pos, '+d'), *probabilities)
        for team, delta_pos, *probabilities in (
            ('Chelsea', 0, ' 0.25', '0.84', '0.00', ' 0.41', '0.94', '0.00'),
            ('Manchester City', 1, '0.59', '0.97', '0.00', '0.58', '0.97', '0.00'),
            ('Arsenal', 0, '0.05', '0.56', '0.00', '0.17', '0.82', '0.00'),
            ('Manchester United', 1, ' 0.10', '0.69', '0.00', '0.17', '0.81', '0.00'),
            ('Tottenham', 1, '0.03', '0.44', '0.00', '0.06', '0.56', '0.00'),
            ('Liverpool', 3, '0.01', ' 0.17', '0.01', '0.10', '0.66', '0.00'),
            ('Southampton', 0, '0.00', '0.01', '0.14', '0.01', '0.19', '0.01'),
            ('Swansea City', 0, '0.00', '0.01', '0.16', '0.01', '0.04', '0.08'),
            ('Stoke City', 2, '0.00', '0.01', '0.15', '0.01', '0.16', '0.01'),
            ('Crystal Palace', 2, '0.00', '0.00', '0.29', '0.00', '0.05', '0.07'),
            ('Everton', 4, '0.01', '0.22', '0.01', '0.09', '0.20', '0.01'),
            ('West Ham United', 4, '0.00', '0.01', '0.18', '0.01', '0.11', '0.03'),
            ('West Bromwich', 5, '0.00', '0.01', '0.25', '0.00', '0.06', '0.07'),
            ('Leicester City', 5, '0.00', '0.00', '0.42', '0.19', '0.72', '0.00'),
            ('Newcastle United', 9, '0.00', '0.02', '0.11', '0.01', '0.24', '0.01'),
            ('Aston Villa', 8, '0.00', '0.01', '0.15', '0.01', '0.11', '0.02'),
            ('Sunderland', 9, '0.00', '0.01', '0.31', '0.00', '0.04', '0.08'),
            ('Hull City', 9, '0.00', '0.01', '0.30', '0.00', '0.03', '0.09'),
            ('Burnley', 10, '0.00', '0.00', '0.31', '0.00', '0.04', '0.11'),
            ('QPR', 11, '0.00', '0.00', '0.22', '0.00', '0.13', '0.02'),
        )
    ],
    columns=[
        'Team',
        'Delta Pos.',
        'Pr. Win',
        'Pr. TOP 4',
        'Pr. Rel.',
        'Pr. Win. with Messi',
        'Pr. TOP 4. with Messi',
        'Pr. Rel. with Messi',
    ],
)
def compute_final_standings(prob_match_df, championship='premier league'):
    """Run the championship simulation on the match probabilities and report the ranks.

    Parameters
    ----------
    prob_match_df: pd.DataFrame
        One row per match. Must expose the ``home_team_prob``, ``draw_prob``,
        ``away_team_prob`` and ``home_team_api_id`` columns.
    championship: str
        ``'premier league'`` selects ``pl_team_names``; any other value falls
        back to ``serie_a_team_names``.

    Returns
    -------
    None
        The simulation output is handed to ``soccer_basics.printer_ranks``;
        nothing is returned.
    """
    # The three outcome-probability columns, each re-indexed from 0.
    home_win, draw, away_win = (
        getattr(prob_match_df, column).reset_index(drop=True)
        for column in ('home_team_prob', 'draw_prob', 'away_team_prob')
    )

    # NOTE(review): 1000 is presumably the number of simulation runs -- confirm
    # against soccer_basics.simulation_champion.
    x, y = soccer_basics.simulation_champion(prob_match_df, home_win, draw, away_win, 1000)

    names = pl_team_names if championship == 'premier league' else serie_a_team_names

    # Distinct home-team ids in ascending order.
    sorted_team_ids = np.sort(prob_match_df.home_team_api_id.unique())
    soccer_basics.printer_ranks(x, y, sorted_team_ids, names)
def get_pipeline(top_feat_params):
    """Assemble the two-step pipeline used to compute the persistence diagrams.

    Parameters
    ----------
    top_feat_params: dict
        Keyword arguments forwarded unchanged to ``SubSpaceExtraction``.

    Returns
    -------
    pipeline: Pipeline
        A pipeline with the steps ``'extract_point_clouds'``
        (``SubSpaceExtraction``) followed by ``'create_diagrams'``
        (``VietorisRipsPersistence`` with ``n_jobs=-1``).
    """
    steps = [
        ('extract_point_clouds', SubSpaceExtraction(**top_feat_params)),
        ('create_diagrams', VietorisRipsPersistence(n_jobs=-1)),
    ]
    return Pipeline(steps)
def get_best_params():
    """Load the stored cross-validation output and return two of its entries.

    The object read from ``'cv_output.pickle'`` must be an iterable with at
    least three entries; the first one and anything after the third are
    discarded.

    NOTE(review): ``read_pickle`` presumably unpickles the file -- only load
    files from a trusted source.

    Returns
    -------
    top_feat_params, top_model_feat_params:
        The second and the third entry of the loaded object, in this order.

    Raises
    ------
    ValueError
        If the loaded object has fewer than three entries.
    """
    _, feature_params, model_feature_params, *_ = read_pickle('cv_output.pickle')
    return feature_params, model_feature_params
def get_useful_cols(players_df):
    """Restrict ``players_df`` to the columns listed in ``COLUMNS_TO_KEEP``.

    Parameters
    ----------
    players_df: pd.DataFrame
        The DataFrame to select from; it must contain every column named in
        ``COLUMNS_TO_KEEP``.

    Returns
    -------
    The selection ``players_df[COLUMNS_TO_KEEP]``.
    """
    selected = players_df[COLUMNS_TO_KEEP]
    return selected
def load_dataset():
    """Fetch OpenML dataset 42188 and split it into features and target.

    NOTE(review): newer openml-python releases appear to deprecate
    ``dataset_format='array'`` -- confirm against the pinned version.

    Returns
    -------
    x_train_with_topo, y_train:
        All columns but the last one, and the last column, of the fetched array.
    """
    # get_data returns a tuple; only its first element (the data) is used.
    data = get_dataset(42188).get_data(dataset_format='array')[0]
    features, target = data[:, :-1], data[:, -1]
    return features, target
def extract_x_test_features(x_train, y_train, players_df, pipeline):
    """Compute the topological features for the test matches.

    The training data is required as well: it is passed, together with the
    test matches, to ``extract_features_for_prediction``.

    Parameters
    ----------
    x_train:
        The x used in the training phase (2-D, indexable as ``x_train[:, :14]``)
    y_train:
        The y used in the training phase
    players_df: pd.DataFrame
        The DataFrame containing the matches with all the players, from which
        to extract the test set
    pipeline: Pipeline
        The Giotto pipeline

    Returns
    -------
    x_test:
        Whatever ``extract_features_for_prediction`` returns, i.e. the x_test
        with the topological features
    """
    # Only the first 14 training columns are forwarded.
    # NOTE(review): 14 equals len(COLUMNS_TO_KEEP); presumably these are the
    # non-topological columns -- confirm.
    train_base_features = x_train[:, :14]

    # The helper needs a y for the test rows too; zeros act as a placeholder.
    placeholder_targets = np.zeros(len(players_df))

    return extract_features_for_prediction(
        train_base_features,
        y_train,
        players_df.values,
        placeholder_targets,
        pipeline,
    )
def get_team_ids(players_df):
    """Select the home and away team id columns of ``players_df``.

    Parameters
    ----------
    players_df: pd.DataFrame
        The DataFrame containing all the matches

    Returns
    -------
    team_ids: pd.DataFrame
        The ``home_team_api_id`` and ``away_team_api_id`` columns, in this
        order, with the original index
    """
    id_columns = ['home_team_api_id', 'away_team_api_id']
    return players_df[id_columns]
def get_probabilities(model, x_test, team_ids):
    """Predict the outcome probabilities for every match in the test set.

    Parameters
    ----------
    model:
        The model (must have the 'predict_proba' function)
    x_test:
        The test set, passed as-is to ``model.predict_proba``
    team_ids: pd.DataFrame
        The DataFrame containing, for each match in the test set, the ids of
        the two teams

    Returns
    -------
    probabilities: pd.DataFrame
        The columns of ``team_ids`` (index reset to 0..n-1) followed by
        ``away_team_prob``, ``draw_prob`` and ``home_team_prob``
    """
    # NOTE(review): assumes predict_proba orders its three columns as
    # away / draw / home -- confirm against the model's class order.
    probability_columns = ['away_team_prob', 'draw_prob', 'home_team_prob']
    outcome_probs = pd.DataFrame(data=model.predict_proba(x_test), columns=probability_columns)

    # Drop the original index so both frames line up row by row.
    realigned_ids = team_ids.reset_index(drop=True)
    return pd.concat([realigned_ids, outcome_probs], axis=1)