Merge pull request #245 from bashtage/qa-5

Qa 5
bashtage · Jun 30, 2020 · ca3100f · ca3100f
2 parents 0f005ad + 6225711
commit ca3100f
Show file tree

Hide file tree

Showing 17 changed files with 1,131 additions and 233 deletions.
diff --git a/doc/source/change-log.rst b/doc/source/change-log.rst
@@ -15,11 +15,12 @@ Change Log
   maintained until after NumPy 1.21 (or 2 releases after NumPy 1.19) for users who
   cannot update NumPy.
 
-Since v1.19.0
-=============
+v1.19.1
+=======
 - Added :class:`randomgen.romu.Romu` which is among the fastest available bit generators.
 - Added :func:`~randomgen.sfc.SFC64.weyl_increments` to simplify generating increments for
   use in parallel applications of :class:`~randomgen.sfc.SFC64`.
+- Completed :ref:`quality-assurance` of all bit generators to at least 4TB.
 
 v1.19.0
 =======

diff --git a/doc/source/testing.rst b/doc/source/testing.rst
@@ -1,10 +1,15 @@
+.. _quality-assurance:
+
 =================
 Quality Assurance
 =================
 
-A values below are the maximum output size where a bit generator or sequence of bit generators
-has passed PractRand_. A -- indicates that configuration is not relevant. Failures are marked
-with FAIL. Most bit generators were only tested in their default configuration.
+Core Testing
+------------
+
+The values below are the maximum output sizes at which a bit generator or sequence of
+bit generators has passed PractRand_. A -- indicates that configuration is not relevant.
+Failures are marked with FAIL. Most bit generators were only tested in their default configuration.
 Non-default configurations are indicated by listing the keyword arguments to the bit generator.
 Two sets of tests were performed. The first tested all configurations using 128GB of data using
 PractRand's extended set of tests and additional bit folding. The second set of tests used
@@ -16,11 +21,11 @@ initialized with the same 256-bits of entropy taken from random.org.
 .. include:: test-results.txt
 
 Notes
------
+~~~~~
 ¹ Failures at or before 128GB were generated by tests that used the expanded
 set of tests and extra bit folds (``-te 1`` and ``-tf 2``). Failures at sample
 sizes above 128GB were produced using the default configuration
-(``-te 0`` and ``-tf 0``).
+(``-te 0`` and ``-tf 1``).
 
 ² PCG64DXSM and PCG64(variant=dxsm) are identical and so the latter is not separately reported.
 
@@ -32,10 +37,8 @@ is required.
 
 ⁵ Identical output to the version included in NumPy 1.19.
 
-.. _PractRand: http://pracrand.sourceforge.net/
-
 Example Configuration
----------------------
+~~~~~~~~~~~~~~~~~~~~~
 All configurations are constructed using the same template. The code below tests a
 configuration using 8,196 streams of :class:`~randomgen.aes.AESCounter`. The other
 configurations simply make changes to either ``JUMPED`` or ``STREAMS``.
@@ -66,3 +69,83 @@ configurations simply make changes to either ``JUMPED`` or ``STREAMS``.
       for child in SEED_SEQ.spawn(STREAMS):
          bit_gens.append(rg.AESCounter(child, **BIT_GENERATOR_KWARGS))
    output = 64
+
+Additional Experiments
+----------------------
+The best practice for using any of the bit generators is to initialize
+a single :class:`~numpy.random.SeedSequence` with a reasonably random seed,
+and then to use this seed sequence to initialize all bit generators.
+Some additional experiments were used to check that the quality of output
+streams is not excessively sensitive to use that deviates from this best practice.
+
+Correlated Seeds
+~~~~~~~~~~~~~~~~
+While the recommended practice is to use a :class:`~numpy.random.SeedSequence`,
+it is natural to worry about bad seeds.  A common sequence of bad seeds is
+the one in which each seed has only a single non-zero bit: 1, 2, 4, 8, 16, and so on.
+By default, bit generators use a :class:`~numpy.random.SeedSequence` to transform
+seed values into an initial state for the bit generator.
+:class:`~numpy.random.SeedSequence` is itself a random number generator that always
+escapes low-entropy states -- that is, those with many 0s or 1s -- immediately.
+All bit generators were tested with 8 streams using seeds of the form :math:`2^i` for
+i in 0, 1, ..., 7. Only three bit generators failed this experiment: :class:`~randomgen.dsfmt.DSFMT`,
+:class:`~randomgen.mt19937.MT19937`, and :class:`~randomgen.sfmt.SFMT`. These are all
+members of the Mersenne Twister family which commonly fail ``BRank`` tests.
+
+Sequential Seeds
+~~~~~~~~~~~~~~~~
+The recommended practice for constructing multiple :class:`~numpy.random.Generator`
+instances is to use :class:`~numpy.random.SeedSequence`'s
+:func:`~numpy.random.SeedSequence.spawn` method.
+
+::
+
+    from numpy.random import default_rng, Generator, SeedSequence
+    from randomgen import Romu
+
+    NUM_STREAMS = 2**15
+    seed_seq = SeedSequence(5897100938578919857511)
+    # To use the default bit generator, which is not guaranteed to be stable
+    generators = [default_rng(child) for child in seed_seq.spawn(NUM_STREAMS)]
+
+    # To use a specific bit generator
+    generators = [Generator(Romu(child)) for child in seed_seq.spawn(NUM_STREAMS)]
+
+It is common to see examples that use sequential seeds, such as:
+
+::
+
+    generators = [default_rng(i) for i in range(NUM_STREAMS)]
+
+This practice was examined with all bit generators using 8,196 streams
+seeded using 0, 1, 2, ..., 8,195 by intertwining the output of the
+generators. **None** of the generators failed these tests.
+
+Zero (0) Seeding
+~~~~~~~~~~~~~~~~
+Bit generators use a :class:`~numpy.random.SeedSequence` to transform
+seed values into an initial state for the bit generator, and
+:class:`~numpy.random.SeedSequence` always escapes low-entropy states immediately.
+To ensure that seeding with 0 is not an issue, all bit generators were tested using 4, 32 or 8196
+streams using 128GB in PractRand_ with expanded tests and extra folding. The table
+below reports **only** the configurations that failed. These were all Mersenne Twister-class
+generators and so failure is attributable to the bit generator and not the seeding.
+All other generators passed these tests.
+
+
++--------------+---------------+----------------+------+
+| Streams      | 4             | 32             | 8196 |
++==============+===============+================+======+
+| DSFMT        | FAIL at 64 GB | FAIL at 64 GB  | --   |
++--------------+---------------+----------------+------+
+| MT19937      | FAIL at 64 GB | FAIL at 64 GB  | --   |
++--------------+---------------+----------------+------+
+| SFMT         | FAIL at 64 GB | FAIL at 64 GB  | --   |
++--------------+---------------+----------------+------+
+
+The non-failures at 8196 streams are due to the relatively short length of each sequence tested:
+each value is 8 bytes, so 128GB shared across 8196 streams samples only about
+:math:`2^{37}/(2^{13}\times2^{3})=2^{21}` values from each stream.
+
+
+.. _PractRand: http://pracrand.sourceforge.net/
diff --git a/randomgen/_seed_sequence.pyx b/randomgen/_seed_sequence.pyx
@@ -384,13 +384,22 @@ cdef class SeedSequence(object):
         -------
         entropy_array : 1D uint32 array
         """
-        # Convert run-entropy, program-entropy, and the spawn key into uint32
+        # Convert run-entropy and the spawn key into uint32
         # arrays and concatenate them.
 
         # We MUST have at least some run-entropy. The others are optional.
         assert self.entropy is not None
         run_entropy = _coerce_to_uint32_array(self.entropy)
         spawn_entropy = _coerce_to_uint32_array(self.spawn_key)
+        if len(spawn_entropy) > 0 and len(run_entropy) < self.pool_size:
+            # Explicitly fill out the entropy with 0s to the pool size to avoid
+            # conflict with spawn keys. We changed this in 1.19.0 to fix
+            # gh-16539. In order to preserve stream-compatibility with
+            # unspawned SeedSequences with small entropy inputs, we only do
+            # this when a spawn_key is specified.
+            diff = self.pool_size - len(run_entropy)
+            run_entropy = np.concatenate(
+                [run_entropy, np.zeros(diff, dtype=np.uint32)])
         entropy_array = np.concatenate([run_entropy, spawn_entropy])
         return entropy_array
 

diff --git a/randomgen/mtrand.pyx b/randomgen/mtrand.pyx
@@ -813,7 +813,7 @@ warnings.filterwarnings("ignore", "RandomState", FutureWarning)
                 raise ValueError("Cannot take a larger sample than "
                                  "population when replace=False")
             elif size < 0:
-                raise ValueError("negative dimensions are not allowed")
+                raise ValueError("Negative dimensions are not allowed")
 
             if p is not None:
                 if np.count_nonzero(p > 0) < size:
@@ -3703,7 +3703,7 @@ warnings.filterwarnings("ignore", "RandomState", FutureWarning)
         [True, True] # random
 
         """
-        from numpy.dual import svd
+        from numpy.linalg import svd
 
         # Check preconditions on arguments
         mean = np.array(mean)

diff --git a/randomgen/tests/test_seed_sequence.py b/randomgen/tests/test_seed_sequence.py
@@ -1,5 +1,5 @@
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_array_compare
 import pytest
 
 from randomgen._seed_sequence import SeedlessSeedSequence, SeedSequence
@@ -11,7 +11,7 @@
     HAS_NP_SEED_SEQUENCE = True
 except (ImportError, AttributeError):
     try:
-        from numpy.random.bit_generator import SeedSequence as NPSeedSequence
+        from numpy.random import SeedSequence as NPSeedSequence
 
         HAS_NP_SEED_SEQUENCE = True
     except (ImportError, AttributeError):
@@ -205,3 +205,26 @@ def test_against_numpy_spawn():
     assert ss.n_children_spawned == np_ss.n_children_spawned
     for child, np_child in zip(ss_children, np_ss_children):
         assert_array_equal(child.generate_state(10), np_child.generate_state(10))
+
+
+def test_zero_padding():
+    """ Ensure that the implicit zero-padding does not cause problems.
+    """
+    # Ensure that large integers are inserted in little-endian fashion to avoid
+    # trailing 0s.
+    ss0 = SeedSequence(42)
+    ss1 = SeedSequence(42 << 32)
+    assert_array_compare(np.not_equal, ss0.generate_state(4), ss1.generate_state(4))
+
+    # Ensure backwards compatibility with the original 0.17 release for small
+    # integers and no spawn key.
+    expected42 = np.array(
+        [3444837047, 2669555309, 2046530742, 3581440988], dtype=np.uint32
+    )
+    assert_array_equal(SeedSequence(42).generate_state(4), expected42)
+
+    # Regression test for gh-16539 to ensure that the implicit 0s don't
+    # conflict with spawn keys.
+    assert_array_compare(
+        np.not_equal, SeedSequence(42, spawn_key=(0,)).generate_state(4), expected42
+    )
diff --git a/tools/configuration.py b/tools/configuration.py
@@ -0,0 +1,82 @@
+from collections import defaultdict
+
+import jinja2
+
+from randomgen import (
+    DSFMT,
+    EFIIX64,
+    HC128,
+    JSF,
+    LXM,
+    MT19937,
+    PCG64,
+    SFC64,
+    SFMT,
+    SPECK128,
+    AESCounter,
+    ChaCha,
+    LCG128Mix,
+    Philox,
+    Romu,
+    ThreeFry,
+    Xoshiro256,
+    Xoshiro512,
+)
+
+ALL_BIT_GENS = [
+    AESCounter,
+    ChaCha,
+    DSFMT,
+    EFIIX64,
+    HC128,
+    JSF,
+    LXM,
+    PCG64,
+    LCG128Mix,
+    MT19937,
+    Philox,
+    SFC64,
+    SFMT,
+    SPECK128,
+    ThreeFry,
+    Xoshiro256,
+    Xoshiro512,
+    Romu,
+]
+JUMPABLE = [bg for bg in ALL_BIT_GENS if hasattr(bg, "jumped")]
+
+SPECIALS = {
+    ChaCha: {"rounds": [8, 20]},
+    JSF: {"seed_size": [1, 3]},
+    SFC64: {"k": [1, 3394385948627484371, "weyl"]},
+    LCG128Mix: {"output": ["upper"]},
+    PCG64: {"variant": ["dxsm", "dxsm-128", "xsl-rr"]},
+    Romu: {"variant": ["quad", "trio"]},
+}
+OUTPUT = defaultdict(lambda: 64)
+OUTPUT.update({MT19937: 32, DSFMT: 32})
+with open("templates/configuration.jinja") as tmpl:
+    TEMPLATE = jinja2.Template(tmpl.read())
+
+DSFMT_WRAPPER = """\
+
+class Wrapper32:
+    def __init__(self, seed, **kwargs):
+        if isinstance(seed, rg.DSFMT):
+            self._bit_gen = seed
+        else:
+            self._bit_gen = rg.DSFMT(seed)
+
+    def random_raw(self, n=None):
+        return self._bit_gen.random_raw(n).astype("u4")
+
+    def jumped(self):
+        return Wrapper32(self._bit_gen.jumped())
+
+rg.Wrapper32 = Wrapper32
+"""
+# Specials
+# SFC64
+DEFAULT_ENTOPY = (
+    86316980830225721106033794313786972513572058861498566720023788662568817403978
+)