Skip to content

Commit 1ec50f7

Browse files
authored
fixes to glicko time handling and benchmarks scoring (#39)
* fixes to glicko time handling and benchmarks scoring
* test fixes
* Update base.py
1 parent 7d95ac7 commit 1ec50f7

24 files changed

+750
-525
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
v1.0.0 (Unreleased)
1+
v1.1.0
2+
======
3+
4+
* Glicko and Glicko-2 now properly handle time since last match
5+
* Bugfix in evaluation of draws in benchmarking
6+
7+
v1.0.0
28
======
39

410
* Added end-to-end examples using the chess and cfb datasets

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
4646
# ones.
4747
extensions = [
48-
"sphinx.ext.autodoc",
48+
"sphinx.ext.autodoc",
4949
"sphinx.ext.viewcode",
5050
"sphinx_rtd_dark_mode",
5151
"sphinxcontrib.googleanalytics",

elote/arenas/base.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,13 @@ def confusion_matrix(self, lower_threshold: float = 0.45, upper_threshold: float
145145
elif predicted_prob is None:
146146
continue
147147

148-
# Determine the predicted winner based on thresholds
148+
# Determine the predicted outcome
149149
if predicted_prob > upper_threshold:
150150
predicted_winner = "a"
151151
elif predicted_prob < lower_threshold:
152152
predicted_winner = "b"
153153
else:
154-
# This is an uncertain prediction - skip it for confusion matrix calculation
155-
continue
154+
predicted_winner = "draw"
156155

157156
# Normalize actual winner to 'a', 'b', or 'draw'
158157
if isinstance(actual_winner, str):
@@ -162,7 +161,6 @@ def confusion_matrix(self, lower_threshold: float = 0.45, upper_threshold: float
162161
elif actual_winner in ["b", "loss", "false", "0"]:
163162
actual_winner = "b"
164163
else:
165-
# Treat other values as draw
166164
actual_winner = "draw"
167165
elif isinstance(actual_winner, (int, float)):
168166
if actual_winner == 1:
@@ -179,24 +177,31 @@ def confusion_matrix(self, lower_threshold: float = 0.45, upper_threshold: float
179177
continue
180178

181179
# Update confusion matrix
182-
if actual_winner == "a":
183-
if predicted_winner == "a":
180+
if predicted_winner == "draw":
181+
if actual_winner == "draw":
182+
true_positives += 1 # Correctly predicted draw
183+
else:
184+
false_positives += 1 # Incorrectly predicted draw
185+
elif actual_winner == "draw":
186+
false_negatives += 1 # Failed to predict draw
187+
elif predicted_winner == "a":
188+
if actual_winner == "a":
184189
true_positives += 1
185-
elif predicted_winner == "b":
186-
false_negatives += 1
187-
elif actual_winner == "b":
188-
if predicted_winner == "b":
189-
true_negatives += 1
190-
elif predicted_winner == "a":
190+
else:
191191
false_positives += 1
192-
else: # actual_winner == "draw"
193-
if predicted_winner == "a":
194-
false_positives += 1 # Predicted a win but was a draw
195-
else: # predicted_winner == "b"
196-
false_negatives += 1 # Predicted b win but was a draw
192+
elif predicted_winner == "b":
193+
if actual_winner == "b":
194+
true_negatives += 1
195+
else:
196+
false_negatives += 1
197197

198198
# Return results as a dictionary
199-
return {"tp": true_positives, "fp": false_positives, "tn": true_negatives, "fn": false_negatives}
199+
return {
200+
"tp": true_positives,
201+
"fp": false_positives,
202+
"tn": true_negatives,
203+
"fn": false_negatives
204+
}
200205

201206
def random_search(self, trials=1000):
202207
"""Search for optimal prediction thresholds using random sampling.

elote/arenas/lambda_arena.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def tournament(self, matchups):
7373
for data in tqdm(matchups):
7474
self.matchup(*data)
7575

76-
def matchup(self, a, b, attributes=None):
76+
def matchup(self, a, b, attributes=None, match_time=None):
7777
"""Process a single matchup between two competitors.
7878
7979
This method handles a matchup between two competitors, creating them
@@ -84,6 +84,7 @@ def matchup(self, a, b, attributes=None):
8484
a: The first competitor or competitor identifier.
8585
b: The second competitor or competitor identifier.
8686
attributes (dict, optional): Additional attributes to record with this bout.
87+
match_time (datetime, optional): The time when the match occurred.
8788
8889
Returns:
8990
The result of the matchup.
@@ -100,14 +101,26 @@ def matchup(self, a, b, attributes=None):
100101
else:
101102
res = self.func(a, b)
102103

104+
# Check if the competitor supports time-based ratings
105+
supports_time = hasattr(self.competitors[a], "_last_activity")
106+
103107
if res is None:
104-
self.competitors[a].tied(self.competitors[b])
108+
if supports_time:
109+
self.competitors[a].tied(self.competitors[b], match_time=match_time)
110+
else:
111+
self.competitors[a].tied(self.competitors[b])
105112
self.history.add_bout(Bout(a, b, predicted_outcome, outcome="tie", attributes=attributes))
106113
elif res is True:
107-
self.competitors[a].beat(self.competitors[b])
114+
if supports_time:
115+
self.competitors[a].beat(self.competitors[b], match_time=match_time)
116+
else:
117+
self.competitors[a].beat(self.competitors[b])
108118
self.history.add_bout(Bout(a, b, predicted_outcome, outcome="win", attributes=attributes))
109119
else:
110-
self.competitors[b].beat(self.competitors[a])
120+
if supports_time:
121+
self.competitors[b].beat(self.competitors[a], match_time=match_time)
122+
else:
123+
self.competitors[b].beat(self.competitors[a])
111124
self.history.add_bout(Bout(a, b, predicted_outcome, outcome="loss", attributes=attributes))
112125

113126
def expected_score(self, a, b):

elote/competitors/glicko.py

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import math
2-
from typing import Dict, Any, ClassVar, Tuple, Type, TypeVar
2+
from typing import Dict, Any, ClassVar, Tuple, Type, TypeVar, Optional
3+
from datetime import datetime
34

45
from elote.competitors.base import BaseCompetitor, InvalidRatingValueException, InvalidParameterException
56

@@ -17,19 +18,25 @@ class GlickoCompetitor(BaseCompetitor):
1718
the reliability of the rating. A higher RD indicates a less reliable rating.
1819
1920
Class Attributes:
20-
_c (float): Constant that determines how quickly the RD increases over time. Default: 1.
21+
_c (float): Constant that determines how quickly the RD increases over time while a competitor is inactive.
22+
Default: 34.6, which is calibrated so that it takes about 100 rating periods
23+
for a player's RD to grow from 50 to 350 (maximum uncertainty).
2124
_q (float): Scaling factor used in the rating calculation. Default: 0.0057565.
25+
_rating_period_days (float): Number of days that constitute one rating period.
26+
Default: 1.0 (one day per rating period).
2227
"""
2328

24-
_c: ClassVar[float] = 1
29+
_c: ClassVar[float] = 34.6 # sqrt((350^2 - 50^2)/100) as per Glickman's paper
2530
_q: ClassVar[float] = 0.0057565
31+
_rating_period_days: ClassVar[float] = 1.0
2632

27-
def __init__(self, initial_rating: float = 1500, initial_rd: float = 350):
33+
def __init__(self, initial_rating: float = 1500, initial_rd: float = 350, initial_time: Optional[datetime] = None):
2834
"""Initialize a Glicko competitor.
2935
3036
Args:
3137
initial_rating (float, optional): The initial rating of this competitor. Default: 1500.
3238
initial_rd (float, optional): The initial rating deviation of this competitor. Default: 350.
39+
initial_time (datetime, optional): The initial timestamp for this competitor. Default: current time.
3340
3441
Raises:
3542
InvalidRatingValueException: If the initial rating is below the minimum rating.
@@ -47,6 +54,7 @@ def __init__(self, initial_rating: float = 1500, initial_rd: float = 350):
4754
self._initial_rd = initial_rd
4855
self._rating = initial_rating
4956
self.rd = initial_rd
57+
self._last_activity = initial_time if initial_time is not None else datetime.now()
5058

5159
def __repr__(self) -> str:
5260
"""Return a string representation of this competitor.
@@ -84,6 +92,7 @@ def _export_current_state(self) -> Dict[str, Any]:
8492
return {
8593
"rating": self._rating,
8694
"rd": self.rd,
95+
"last_activity": self._last_activity.isoformat(),
8796
}
8897

8998
def _import_parameters(self, parameters: Dict[str, Any]) -> None:
@@ -130,6 +139,12 @@ def _import_current_state(self, state: Dict[str, Any]) -> None:
130139
raise InvalidParameterException("RD must be positive")
131140
self.rd = rd
132141

142+
# Set last activity time
143+
if "last_activity" in state:
144+
self._last_activity = datetime.fromisoformat(state["last_activity"])
145+
else:
146+
self._last_activity = datetime.now()
147+
133148
@classmethod
134149
def _create_from_parameters(cls: Type[T], parameters: Dict[str, Any]) -> T:
135150
"""Create a new competitor instance from parameters.
@@ -268,46 +283,65 @@ def expected_score(self, competitor: BaseCompetitor) -> float:
268283
E = 1 / (1 + 10 ** ((-1 * g_term * (self._rating - competitor.rating)) / 400))
269284
return E
270285

271-
def beat(self, competitor: "GlickoCompetitor") -> None:
286+
def beat(self, competitor: "GlickoCompetitor", match_time: Optional[datetime] = None) -> None:
272287
"""Update ratings after this competitor has won against the given competitor.
273288
274289
This method updates the ratings of both this competitor and the opponent
275290
based on the match outcome where this competitor won.
276291
277292
Args:
278293
competitor (GlickoCompetitor): The opponent competitor that lost.
294+
match_time (datetime, optional): The time when the match occurred. Default: current time.
279295
280296
Raises:
281297
MissMatchedCompetitorTypesException: If the competitor types don't match.
282298
"""
283299
self.verify_competitor_types(competitor)
284-
self._compute_match_result(competitor, s=1)
300+
self._compute_match_result(competitor, s=1, match_time=match_time)
285301

286-
def tied(self, competitor: "GlickoCompetitor") -> None:
302+
def tied(self, competitor: "GlickoCompetitor", match_time: Optional[datetime] = None) -> None:
287303
"""Update ratings after this competitor has tied with the given competitor.
288304
289305
This method updates the ratings of both this competitor and the opponent
290306
based on a drawn match outcome.
291307
292308
Args:
293309
competitor (GlickoCompetitor): The opponent competitor that tied.
310+
match_time (datetime, optional): The time when the match occurred. Default: current time.
294311
295312
Raises:
296313
MissMatchedCompetitorTypesException: If the competitor types don't match.
297314
"""
298315
self.verify_competitor_types(competitor)
299-
self._compute_match_result(competitor, s=0.5)
316+
self._compute_match_result(competitor, s=0.5, match_time=match_time)
300317

301-
def _compute_match_result(self, competitor: "GlickoCompetitor", s: float) -> None:
318+
def _compute_match_result(
319+
self, competitor: "GlickoCompetitor", s: float, match_time: Optional[datetime] = None
320+
) -> None:
302321
"""Compute the result of a match and update ratings.
303322
304323
Args:
305324
competitor (GlickoCompetitor): The opponent competitor.
306325
s (float): The score of this competitor (1 for win, 0.5 for draw, 0 for loss).
326+
match_time (datetime, optional): The time when the match occurred. Default: current time.
307327
308328
Raises:
309329
MissMatchedCompetitorTypesException: If the competitor types don't match.
330+
InvalidParameterException: If the match time is before either competitor's last activity.
310331
"""
332+
# Get the match time
333+
current_time = match_time if match_time is not None else datetime.now()
334+
335+
# Validate match time is not before last activity
336+
if current_time < self._last_activity:
337+
raise InvalidParameterException("Match time cannot be before competitor's last activity time")
338+
if current_time < competitor._last_activity:
339+
raise InvalidParameterException("Match time cannot be before opponent's last activity time")
340+
341+
# Update RDs for both competitors based on inactivity
342+
self.update_rd_for_inactivity(current_time)
343+
competitor.update_rd_for_inactivity(current_time)
344+
311345
self.verify_competitor_types(competitor)
312346
# first we update ourselves
313347
s_new_r, s_new_rd = self.update_competitor_rating(competitor, s)
@@ -322,6 +356,10 @@ def _compute_match_result(self, competitor: "GlickoCompetitor", s: float) -> Non
322356
competitor.rating = c_new_r
323357
competitor.rd = c_new_rd
324358

359+
# Update last activity time for both competitors
360+
self._last_activity = current_time
361+
competitor._last_activity = current_time
362+
325363
def update_competitor_rating(self, competitor: "GlickoCompetitor", s: float) -> Tuple[float, float]:
326364
"""Update the rating and RD of this competitor based on a match result.
327365
@@ -333,11 +371,39 @@ def update_competitor_rating(self, competitor: "GlickoCompetitor", s: float) ->
333371
tuple: A tuple containing the new rating and RD.
334372
"""
335373
E_term = self.expected_score(competitor)
336-
d_squared = (self._q**2 * (self._g(competitor.rd) ** 2 * E_term * (1 - E_term))) ** -1
337-
s_new_r = self._rating + (self._q / (1 / self.rd**2 + 1 / d_squared)) * self._g(competitor.rd) * (s - E_term)
374+
g = self._g(competitor.rd**2)
375+
d_squared = (self._q**2 * (g**2 * E_term * (1 - E_term))) ** -1
376+
377+
# The step size q / (1/RD^2 + 1/d^2) grows with RD, so a higher RD means a larger change
378+
rating_change = (self._q / (1 / self.rd**2 + 1 / d_squared)) * g * (s - E_term)
379+
s_new_r = self._rating + rating_change
338380

339381
# Ensure the new rating doesn't go below the minimum rating
340382
s_new_r = max(self._minimum_rating, s_new_r)
341383

384+
# The new RD is smaller (more certain) after a match
342385
s_new_rd = math.sqrt((1 / self.rd**2 + 1 / d_squared) ** -1)
343386
return s_new_r, s_new_rd
387+
388+
def update_rd_for_inactivity(self, current_time: datetime = None) -> None:
389+
"""Update the rating deviation based on time elapsed since last activity.
390+
391+
This implements Glickman's formula for increasing uncertainty in ratings
392+
over time when a player is inactive. The RD increase is controlled by the _c parameter
393+
and the number of rating periods that have passed.
394+
395+
Args:
396+
current_time (datetime, optional): The current time to calculate inactivity against.
397+
If None, uses the current system time.
398+
"""
399+
if current_time is None:
400+
current_time = datetime.now()
401+
402+
# Calculate number of rating periods (can be fractional)
403+
days_inactive = (current_time - self._last_activity).total_seconds() / (24 * 3600)
404+
rating_periods = days_inactive / self._rating_period_days
405+
406+
if rating_periods > 0:
407+
# Use Glickman's formula for RD increase over time
408+
new_rd = min([350, math.sqrt(self.rd**2 + (self._c**2 * rating_periods))])
409+
self.rd = new_rd

0 commit comments

Comments
 (0)