Merge pull request #599 from mdekstrand/feature/training-rework
Move Trainable interface and add TrainingOptions
mdekstrand authored Jan 11, 2025
2 parents b287d65 + 60e0972 commit 1954efb
Showing 23 changed files with 302 additions and 224 deletions.
2 changes: 2 additions & 0 deletions docs/api/index.rst
@@ -12,6 +12,7 @@ Core Abstractions
lenskit.pipeline
lenskit.diagnostics
lenskit.operations
lenskit.training

.. toctree::
:caption: Core
@@ -21,6 +22,7 @@ Core Abstractions
pipeline
operations
diagnostics
training

Components and Models
~~~~~~~~~~~~~~~~~~~~~
4 changes: 4 additions & 0 deletions docs/api/training.rst
@@ -0,0 +1,4 @@
Model Training
==============

.. automodule:: lenskit.training
50 changes: 27 additions & 23 deletions docs/guide/conventions.rst
@@ -24,19 +24,32 @@ Random Seeds

.. _SPEC 7: https://scientific-python.org/specs/spec-0007/

LensKit components follow `SPEC 7`_ for specifying random number seeds.
Components that use randomization (either at runtime, or to set initial
conditions for training) have a constructor parameter `rng` that takes either a
:class:`~numpy.random.Generator` or seed material. If you want reproducible
stochastic pipelines, configure the random seeds for your components.

This convention is also followed for other LensKit code, such as the `data
splitting support <./splitting>`_.
LensKit components follow `SPEC 7`_ for specifying random number seeds. If you
want reproducible stochastic pipelines, configure the random seeds for your
components and/or training process.

Components that use randomization at **inference time** take either seed
material or a :class:`~numpy.random.Generator` as an ``rng`` constructor
parameter; if seed material is supplied, that seed should be considered part of
the configuration (see the source code in :mod:`lenskit.basic.random` for
examples).
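
For instance, a minimal sketch of this convention (the component and its
fields here are hypothetical, not part of LensKit):

.. code-block:: python

    import numpy as np

    class JitterScorer:
        """Toy component that perturbs scores at inference time."""

        def __init__(self, rng=None):
            # Store the seed material itself: it is part of the component's
            # configuration and serializes cleanly; a live Generator does not.
            self.rng = rng

        def score(self, scores: np.ndarray) -> np.ndarray:
            # default_rng accepts seed material or an existing Generator.
            gen = np.random.default_rng(self.rng)
            return scores + gen.normal(scale=0.01, size=len(scores))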

Components that use randomization at **training time** (e.g. to shuffle data or
to initialize parameter values) should obtain their generator or seed from the
:class:`~lenskit.training.TrainingOptions`. This makes it easy to configure a
seed for the training process without needing to configure each component. For
consistent configurability, it's best for components using other frameworks such
as PyTorch to use NumPy to initialize the parameter values and then convert the
initial values to the appropriate compute backend.
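
As a sketch of this pattern (the component is hypothetical;
``options.random_generator()`` is used exactly as in the components changed
below, while ``Dataset.item_count`` is an assumption about the data API):

.. code-block:: python

    import numpy as np
    import torch

    from lenskit.data import Dataset
    from lenskit.training import TrainingOptions

    class EmbeddingScorer:
        def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
            if hasattr(self, "item_embed_") and not options.retrain:
                return  # trained model present and retraining not requested

            rng = options.random_generator()
            # Draw initial values in NumPy for backend-independent
            # reproducibility, then convert to the compute backend.
            init = rng.standard_normal((data.item_count, 64)) * 0.01
            self.item_embed_ = torch.from_numpy(init)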

Other LensKit code, such as the `data splitting support <./splitting>`_, follows
SPEC 7 directly by accepting an ``rng`` keyword parameter.

.. important::

If you specify random seeds, we strongly recommend specifying seeds instead of
generators, so that the seed can be included in serialized configurations.
When configuring randomness for components, we strongly recommend
specifying seeds instead of generators, so that the seed can be included
in serialized configurations.

.. versionchanged:: 2025.1

@@ -46,19 +59,10 @@ splitting support <./splitting>`_.

LensKit extends SPEC 7 with a global RNG that components can use as a fallback,
to make it easier to configure system-wide generation for things like tests.
This is configured with :func:`~lenskit.random.set_global_rng`.

When implementing a component that uses randomness in its training, we recommend
deferring conversion of the provided RNG into an actual generator until
model-training time, so that serializing an untrained pipeline or its
configuration includes the original seed instead of the resulting generator.
When using the RNG to create initial state for e.g. training a model with
PyTorch, it can be useful to create that state in NumPy and then convert to a
tensor, so that components are consistent in their random number generation
behavior instead of having variation between NumPy and other backends.
Components can use the :func:`~lenskit.random_generator` function to
convert seed material or a generator into a NumPy generator, falling back to the
global RNG if one is specified.
This is configured with :func:`~lenskit.random.set_global_rng`. Components can
use the :func:`~lenskit.random_generator` function to convert seed material or a
generator into a NumPy generator, falling back to the global RNG if one is
specified.
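
For example (a sketch; passing plain seed material to ``set_global_rng`` is
assumed to work the same way as other SPEC 7 seed parameters):

.. code-block:: python

    from lenskit.random import random_generator, set_global_rng

    set_global_rng(42)  # e.g. in a test fixture

    gen = random_generator(None)   # no seed: falls back to the global RNG
    gen2 = random_generator(2025)  # explicit seed material takes precedence
    sample = gen.standard_normal(10)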

Derived Seeds
-------------
5 changes: 3 additions & 2 deletions docs/guide/pipeline.rst
@@ -342,8 +342,9 @@ a component that requires no training or configuration can simply be a Python
function.

Most components will extend the :class:`Component` base class to expose
configuration capabilities, and implement the :class:`Trainable` protocol if
they contain a model that needs to be trained.
configuration capabilities, and implement the
:class:`lenskit.training.Trainable` protocol if they contain a model that needs
to be trained.
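
A minimal sketch of such a component (a hypothetical scorer; the ``Dataset``
and ``ItemList`` calls follow their use elsewhere in this change set):

.. code-block:: python

    import numpy as np
    from pydantic import BaseModel

    from lenskit.data import Dataset, ItemList, Vocabulary
    from lenskit.pipeline import Component
    from lenskit.training import Trainable, TrainingOptions

    class CountConfig(BaseModel):
        damping: float = 0.0  # configuration lives in a Pydantic model

    class CountScorer(Component[ItemList], Trainable):
        config: CountConfig
        items_: Vocabulary

        def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
            # Skip the work if a model is present and retraining is off.
            if hasattr(self, "counts_") and not options.retrain:
                return
            matrix = data.interaction_matrix("scipy", layout="csr")
            self.items_ = data.items.copy()
            self.counts_ = np.asarray(matrix.sum(axis=0)).ravel()

        def __call__(self, items: ItemList) -> ItemList:
            nums = items.numbers(vocabulary=self.items_, missing="negative")
            scores = np.where(nums >= 0, self.counts_[np.maximum(nums, 0)], np.nan)
            return ItemList(items, scores=scores)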

Components must also be pickleable, as LensKit uses pickling for shared-memory
parallelism in its batch-inference code.
19 changes: 7 additions & 12 deletions lenskit-funksvd/lenskit/funksvd.py
@@ -20,8 +20,8 @@
from lenskit import util
from lenskit.basic import BiasModel, Damping
from lenskit.data import Dataset, ItemList, QueryInput, RecQuery, Vocabulary
from lenskit.pipeline import Component, Trainable
from lenskit.random import ConfiguredSeed, random_generator
from lenskit.pipeline import Component
from lenskit.training import Trainable, TrainingOptions

_logger = logging.getLogger(__name__)

@@ -53,10 +53,6 @@ class FunkSVDConfig(BaseModel):
"""
Min/max range of ratings to clamp output.
"""
rng: ConfiguredSeed = None
"""
RNG seed.
"""


@jitclass(
@@ -257,18 +253,17 @@ class FunkSVDScorer(Trainable, Component[ItemList]):
items_: Vocabulary
item_features_: np.ndarray[tuple[int, int], np.dtype[np.float64]]

@property
def is_trained(self) -> bool:
return hasattr(self, "item_features_")

@override
def train(self, data: Dataset):
def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
"""
Train a FunkSVD model.
Args:
data: the training data.
"""
if hasattr(self, "item_features_") and not options.retrain:
return

timer = util.Stopwatch()
rate_df = data.interaction_matrix(format="pandas", layout="coo", field="rating")

@@ -278,7 +273,7 @@ def train(self, data: Dataset):
_logger.info("[%s] preparing rating data for %d samples", timer, len(rate_df))
_logger.debug("shuffling rating data")
shuf = np.arange(len(rate_df), dtype=np.int_)
rng = random_generator(self.config.rng)
rng = options.random_generator()
rng.shuffle(shuf)
rate_df = rate_df.iloc[shuf, :]

12 changes: 6 additions & 6 deletions lenskit-hpf/lenskit/hpf.py
@@ -12,7 +12,8 @@
from typing_extensions import override

from lenskit.data import Dataset, ItemList, QueryInput, RecQuery, Vocabulary
from lenskit.pipeline import Component, Trainable
from lenskit.pipeline import Component
from lenskit.training import Trainable, TrainingOptions

_logger = logging.getLogger(__name__)

@@ -47,12 +48,11 @@ class HPFScorer(Component[ItemList], Trainable):
items_: Vocabulary
item_features_: np.ndarray[tuple[int, int], np.dtype[np.float64]]

@property
def is_trained(self) -> bool:
return hasattr(self, "item_features_")

@override
def train(self, data: Dataset):
def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
if hasattr(self, "item_features_") and not options.retrain:
return

log = data.interaction_matrix("pandas", field="rating")
log = log.rename(
columns={
5 changes: 3 additions & 2 deletions lenskit-hpf/tests/test_hpf.py
@@ -16,6 +16,7 @@
from lenskit.metrics import quick_measure_model
from lenskit.pipeline import topn_pipeline
from lenskit.testing import BasicComponentTests, ScorerTests
from lenskit.training import TrainingOptions

hpf = importorskip("lenskit.hpf")

@@ -47,7 +48,7 @@ def test_hpf_train_large(tmp_path, ml_ratings):
assert np.all(a2.item_features_ == algo.item_features_)

pipe = topn_pipeline(algo)
pipe.train(ds, retrain=False)
pipe.train(ds, TrainingOptions(retrain=False))

for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
recs = pipe.run("recommender", query=u, n=50)
@@ -76,7 +77,7 @@ def test_hpf_train_binary(tmp_path, ml_ratings):
assert np.all(a2.item_features_ == algo.item_features_)

pipe = topn_pipeline(algo)
pipe.train(ds, retrain=False)
pipe.train(ds, TrainingOptions(retrain=False))

for u in np.random.choice(ratings.user_id.unique(), size=50, replace=False):
recs = pipe.run("recommender", query=u, n=50)
16 changes: 8 additions & 8 deletions lenskit-implicit/lenskit/implicit.py
@@ -15,7 +15,8 @@
from typing_extensions import override

from lenskit.data import Dataset, ItemList, QueryInput, RecQuery, Vocabulary
from lenskit.pipeline import Component, Trainable
from lenskit.pipeline import Component
from lenskit.training import Trainable, TrainingOptions

_logger = logging.getLogger(__name__)

@@ -62,12 +63,11 @@ class BaseRec(Component[ItemList], Trainable):
The item ID mapping from training.
"""

@property
def is_trained(self):
return hasattr(self, "matrix_")

@override
def train(self, data: Dataset):
def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
if hasattr(self, "delegate") and not options.retrain:
return

matrix = data.interaction_matrix("scipy", layout="csr", legacy=True)
uir = matrix * self.weight

@@ -100,8 +100,8 @@ def __call__(self, query: QueryInput, items: ItemList) -> ItemList:
mask = inos >= 0
good_inos = inos[mask]

ifs = self.delegate.item_factors[good_inos]
uf = self.delegate.user_factors[user_num]
ifs = self.delegate.item_factors[good_inos] # type: ignore
uf = self.delegate.user_factors[user_num] # type: ignore

# convert back if these are on CUDA
if hasattr(ifs, "to_numpy"):
14 changes: 7 additions & 7 deletions lenskit-sklearn/lenskit/sklearn/svd.py
@@ -16,7 +16,8 @@
from lenskit.basic import BiasModel, Damping
from lenskit.data import Dataset, ItemList, QueryInput, RecQuery
from lenskit.data.vocab import Vocabulary
from lenskit.pipeline import Component, Trainable
from lenskit.pipeline import Component
from lenskit.training import Trainable, TrainingOptions
from lenskit.util import Stopwatch

try:
@@ -61,12 +62,11 @@ class BiasedSVDScorer(Component[ItemList], Trainable):
items_: Vocabulary
user_components_: NDArray[np.float64]

@property
def is_trained(self):
return hasattr(self, "factorization_")

@override
def train(self, data: Dataset):
def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()):
if hasattr(self, "factorization_") and not options.retrain:
return

timer = Stopwatch()
_log.info("[%s] computing bias", timer)
self.bias_ = BiasModel.learn(data, self.config.damping)
@@ -90,7 +90,7 @@ def train(self, data: Dataset):
self.config.features, algorithm=self.config.algorithm, n_iter=self.config.n_iter
)
_log.info("[%s] training SVD (k=%d)", timer, self.factorization_.n_components) # type: ignore
Xt = self.factorization_.fit_transform(r_mat)
Xt = self.factorization_.fit_transform(r_mat) # type: ignore
self.user_components_ = Xt
self.users_ = data.users.copy()
self.items_ = data.items.copy()
43 changes: 17 additions & 26 deletions lenskit/lenskit/als/_common.py
@@ -7,7 +7,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Literal, TypeAlias
from typing import Literal, TypeAlias

import numpy as np
import structlog
@@ -20,8 +20,8 @@
from lenskit.data.types import UIPair
from lenskit.logging import item_progress
from lenskit.parallel.config import ensure_parallel_init
from lenskit.pipeline import Component, Trainable
from lenskit.random import ConfiguredSeed, RNGInput, RNGLike, random_generator
from lenskit.pipeline import Component
from lenskit.training import Trainable, TrainingOptions

EntityClass: TypeAlias = Literal["user", "item"]

@@ -43,10 +43,6 @@ class ALSConfig(BaseModel):
"""
L2 regularization strength.
"""
rng: ConfiguredSeed = None
"""
Random number seed.
"""
save_user_features: bool = True
"""
Whether to retain user feature values after training.
@@ -139,7 +135,6 @@ class ALSBase(ABC, Component[ItemList], Trainable):
"""

config: ALSConfig
rng: RNGLike | None = None

users_: Vocabulary | None
items_: Vocabulary
Expand All @@ -148,30 +143,24 @@ class ALSBase(ABC, Component[ItemList], Trainable):

logger: structlog.stdlib.BoundLogger

def __init__(self, config: ALSConfig | None = None, *, rng: RNGInput = None, **kwargs: Any):
# handle non-configurable RNG
if isinstance(rng, (np.random.Generator, np.random.BitGenerator)):
self.rng = rng
elif rng is not None:
kwargs = kwargs | {"rng": rng}
super().__init__(config, **kwargs)

@property
def is_trained(self) -> bool:
return hasattr(self, "item_features_")

@override
def train(self, data: Dataset):
def train(self, data: Dataset, options: TrainingOptions = TrainingOptions()) -> bool:
"""
Run ALS to train a model.
Args:
data: the training data.
Returns:
``True`` if the model was trained.
"""
if hasattr(self, "item_features_") and not options.retrain:
return False

ensure_parallel_init()
timer = util.Stopwatch()

for algo in self.fit_iters(data):
for algo in self.fit_iters(data, options):
pass # we just need to do the iterations

if self.user_features_ is not None:
Expand All @@ -190,7 +179,9 @@ def train(self, data: Dataset):
features=self.config.features,
)

def fit_iters(self, data: Dataset) -> Iterator[Self]:
return True

def fit_iters(self, data: Dataset, options: TrainingOptions) -> Iterator[Self]:
"""
Run ALS to train a model, yielding after each iteration.
@@ -199,12 +190,13 @@ def fit_iters(self, data: Dataset) -> Iterator[Self]:
"""

log = self.logger = self.logger.bind(features=self.config.features)
rng = options.random_generator()

train = self.prepare_data(data)
self.users_ = train.users
self.items_ = train.items

self.initialize_params(train)
self.initialize_params(train, rng)

assert self.user_features_ is not None
assert self.item_features_ is not None
@@ -250,11 +242,10 @@ def prepare_data(self, data: Dataset) -> TrainingData: # pragma: no cover
"""
...

def initialize_params(self, data: TrainingData):
def initialize_params(self, data: TrainingData, rng: np.random.Generator):
"""
Initialize the model parameters at the beginning of training.
"""
rng = random_generator(self.rng or self.config.rng)
self.logger.debug("initializing item matrix")
self.item_features_ = self.initial_params(data.n_items, self.config.features, rng)
self.logger.debug("|Q|: %f", torch.norm(self.item_features_, "fro"))
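
A hedged sketch of driving ``fit_iters`` directly for logging or early
stopping (``ImplicitMFScorer`` is assumed as a concrete ``ALSBase`` subclass,
and ``TrainingOptions(rng=42)`` assumes the options carry seed material for
``random_generator()``):

# `data` is a lenskit.data.Dataset prepared elsewhere
from lenskit.als import ImplicitMFScorer
from lenskit.training import TrainingOptions

model = ImplicitMFScorer(features=64)
opts = TrainingOptions(rng=42)
for epoch, _state in enumerate(model.fit_iters(data, opts), start=1):
    # each yield is the partially trained model after one ALS pass
    print(f"epoch {epoch} complete")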