
Commit

Merge pull request #192 from bashtage/sync-choice
ENH: Synchronize upstream improvements in choice
bashtage authored Dec 20, 2019
2 parents 8a07c1d + 69f1c82 commit 809ae48
Showing 8 changed files with 125 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -87,4 +87,4 @@ after_success:
cd ${BUILD_DIR}
python benchmark.py;
fi
- if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc; fi
- if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc || true; fi
2 changes: 2 additions & 0 deletions doc/source/change-log.rst
@@ -6,6 +6,8 @@ Change Log

v1.18.0
=======
- :meth:`~randomgen.generator.Generator.choice` pulled in an upstream performance improvement that
uses a hash set when choosing without replacement and without user-provided probabilities (a short usage sketch follows this file's diff).
- Added support for :class:`~randomgen.seed_sequence.SeedSequence` (and NumPy's ``SeedSequence``).
- Fixed a bug that affected both :class:`~randomgen.generator.Generator.randint`
in :class:`~randomgen.generator.Generator` and :meth:`~randomgen.mtrand.RandomState.randint`
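As a quick illustration of the fast path this entry describes, a minimal usage sketch (seed and sizes are arbitrary; ``Generator`` and ``PCG64`` are the package's public API used elsewhere in this commit):

import numpy as np
from randomgen import Generator, PCG64

gen = Generator(PCG64(12345))
# Without replacement and without `p`, choice now builds the sample via a
# hash set instead of permuting the whole population.
sample = gen.choice(np.arange(1_000_000), size=100, replace=False)
assert len(np.unique(sample)) == 100   # all draws are distinct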
2 changes: 1 addition & 1 deletion doc/source/index.rst
@@ -9,7 +9,7 @@ generator to be changed.
There are many changes between v1.16.x and v1.18.x. These reflect API
decisions taken in conjunction with NumPy in preparation for the core
of ``randomgen`` being used as the preferred random number generator in
NumPy. These all issue ``DeprecationWarning`` except for ``BasicRNG.generator``
NumPy. These all issue ``DeprecationWarning`` except for ``BitGenerator.generator``
which raises ``NotImplementedError``. The C-API has also changed to reflect
the preferred naming of the underlying Pseudo-RNGs, which are now known as
bit generators (or ``BitGenerator``).
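A minimal sketch of the renamed API described above (illustrative only; it assumes the 1.18 behaviour where the old ``generator`` accessor raises rather than deprecates):

from randomgen import Generator, PCG64

bit_gen = PCG64(2019)       # pseudo-RNGs are now "bit generators"
gen = Generator(bit_gen)    # variates are drawn through a Generator wrapper
try:
    bit_gen.generator       # removed convenience accessor
except NotImplementedError:
    pass                    # raises instead of warning, per the note above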
129 changes: 82 additions & 47 deletions randomgen/generator.pyx
@@ -570,11 +570,11 @@ cdef class Generator:
Parameters
----------
low : int or array-like of ints
low : {int, array_like[int]}
Lowest (signed) integers to be drawn from the distribution (unless
``high=None``, in which case this parameter is one above the
*highest* such integer).
high : int or array-like of ints, optional
high : {int, array_like[int]}, optional
If provided, one above the largest (signed) integer to be drawn
from the distribution (see above for behavior if ``high=None``).
If array-like, must contain integer values
@@ -648,10 +648,10 @@
CoRR, Aug. 13, 2018, http://arxiv.org/abs/1805.10941.
"""
if use_masked is not None:
if use_masked is not None and use_masked:
import warnings
warnings.warn("use_masked will be removed in the final release and"
"only the Lemire method will be available.",
" only the Lemire method will be available.",
DeprecationWarning)
if closed is not None:
import warnings
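
To illustrate the tightened condition above (a ``DeprecationWarning`` only when ``use_masked`` is actually truthy), a small sketch under those assumptions:

import warnings

from randomgen import Generator, PCG64

gen = Generator(PCG64())
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    gen.integers(0, 10, use_masked=True)    # deprecated path still warns
    gen.integers(0, 10, use_masked=False)   # no longer warns after this change
assert any(issubclass(w.category, DeprecationWarning) for w in caught)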
@@ -725,32 +725,34 @@
return self.integers(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]

@cython.wraparound(True)
def choice(self, a, size=None, replace=True, p=None, axis=0):
def choice(self, a, size=None, replace=True, p=None, axis=0, bint shuffle=True):
"""
choice(a, size=None, replace=True, p=None, axis=0):
Generates a random sample from a given 1-D array
.. versionadded:: 1.7.0
choice(a, size=None, replace=True, p=None, axis=0, shuffle=True):
Parameters
----------
a : 1-D array-like or int
a : {array_like, int}
If an ndarray, a random sample is generated from its elements.
If an int, the random sample is generated as if a were np.arange(a)
size : int or tuple of ints, optional
Output shape. If the given shape is, e.g., ``(m, n, k)``, then
``m * n * k`` samples are drawn. Default is None, in which case a
single value is returned.
Output shape. If the given shape is, e.g., ``(m, n, k)``, then
``m * n * k`` samples are drawn from the 1-d `a`. If `a` has more
than one dimension, the `size` shape will be inserted into the
`axis` dimension, so the output ``ndim`` will be ``a.ndim - 1 +
len(size)``. Default is None, in which case a single value is
returned.
replace : boolean, optional
Whether the sample is with or without replacement
p : 1-D array-like, optional
p : 1-D array_like, optional
The probabilities associated with each entry in a.
If not given the sample assumes a uniform distribution over all
entries in a.
axis : int, optional
The axis along which the selection is performed. The default, 0,
selects by row.
shuffle : boolean, optional
Whether the sample is shuffled when sampling without replacement.
Default is True; setting it to False provides a speedup.
Returns
-------
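
A short usage sketch of the new ``shuffle`` keyword documented above (sizes arbitrary): skipping the final shuffle trades output order for speed when order does not matter.

from randomgen import Generator, PCG64

gen = Generator(PCG64(0))
# The caller does not care about ordering, so skip the final shuffle.
winners = gen.choice(10_000_000, size=1_000, replace=False, shuffle=False)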
@@ -805,22 +807,20 @@
dtype='<U11')
"""
cdef char* idx_ptr
cdef int64_t buf
cdef char* buf_ptr

cdef set idx_set
cdef int64_t val, t, loc, size_i, pop_size_i
cdef int64_t *idx_data
cdef np.npy_intp j
cdef uint64_t set_size, mask
cdef uint64_t[::1] hash_set
# Format and Verify input
a = np.array(a, copy=False)
if a.ndim == 0:
try:
# __index__ must return an integer by python rules.
pop_size = operator.index(a.item())
except TypeError:
raise ValueError("`a` must be 1-dimensional or an integer")
raise ValueError("`a` must an array or an integer")
if pop_size <= 0 and np.prod(size) != 0:
raise ValueError("`a` must be greater than 0 unless no samples are taken")
else:
@@ -837,14 +837,17 @@
atol = max(atol, np.sqrt(np.finfo(p.dtype).eps))

p = <np.ndarray>np.PyArray_FROM_OTF(p, np.NPY_DOUBLE, api.NPY_ARRAY_ALIGNED | api.NPY_ARRAY_C_CONTIGUOUS)
check_array_constraint(p, "p", CONS_BOUNDED_0_1)
pix = <double*>np.PyArray_DATA(p)

if p.ndim != 1:
raise ValueError("`p` must be 1-dimensional")
if p.size != pop_size:
raise ValueError("`a` and `p` must have same size")
p_sum = kahan_sum(pix, d)
if np.isnan(p_sum):
raise ValueError("probabilities contain NaN")
if np.logical_or.reduce(p < 0):
raise ValueError("probabilities are not non-negative")
if abs(p_sum - 1.) > atol:
raise ValueError("probabilities do not sum to 1")

@@ -863,7 +866,7 @@
idx = cdf.searchsorted(uniform_samples, side="right")
idx = np.array(idx, copy=False, dtype=np.int64) # searchsorted returns a scalar
else:
idx = self.integers(0, pop_size, size=shape, dtype=np.int64)
idx = self.integers(0, pop_size, size=shape, dtype=np.int64, use_masked=False)
else:
if size > pop_size:
raise ValueError("Cannot take a larger sample than "
@@ -879,7 +882,7 @@
found = np.zeros(shape, dtype=np.int64)
flat_found = found.ravel()
while n_uniq < size:
x = self.random(size - n_uniq)
x = self.random((size - n_uniq,))
if n_uniq > 0:
p[flat_found[0:n_uniq]] = 0
cdf = np.cumsum(p)
@@ -895,36 +898,46 @@
size_i = size
pop_size_i = pop_size
# This is a heuristic tuning; it should be improvable
if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
if shuffle:
cutoff = 50
else:
cutoff = 20

if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff)):
# Tail shuffle size elements
idx = np.arange(pop_size, dtype=np.int64)
idx_ptr = np.PyArray_BYTES(<np.ndarray>idx)
buf_ptr = <char*>&buf
self._shuffle_raw(pop_size_i, max(pop_size_i - size_i, 1),
8, 8, idx_ptr, buf_ptr)
idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
with self.lock, nogil:
self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
idx_data)
# Copy to allow potentially large array backing idx to be gc
idx = idx[(pop_size - size):].copy()
else:
# Floyds's algorithm with precomputed indices
# Worst case, O(n**2) when size is close to pop_size
# Floyd's algorithm
idx = np.empty(size, dtype=np.int64)
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
idx_set = set()
loc = 0
# Sample indices with one pass to avoid reacquiring the lock
with self.lock:
for j in range(pop_size_i - size_i, pop_size_i):
idx_data[loc] = random_interval(&self._bitgen, j)
loc += 1
loc = 0
while len(idx_set) < size_i:
# smallest power of 2 larger than 1.2 * size
set_size = <uint64_t>(1.2 * size_i)
mask = _gen_mask(set_size)
set_size = 1 + mask
hash_set = np.full(set_size, <uint64_t>-1, np.uint64)
with self.lock, cython.wraparound(False), nogil:
for j in range(pop_size_i - size_i, pop_size_i):
if idx_data[loc] not in idx_set:
val = idx_data[loc]
else:
idx_data[loc] = val = j
idx_set.add(val)
loc += 1
val = random_bounded_uint64(&self._bitgen, 0, j, 0, 0)
loc = val & mask
while hash_set[loc] != <uint64_t>-1 and hash_set[loc] != <uint64_t>val:
loc = (loc + 1) & mask
if hash_set[loc] == <uint64_t>-1: # then val not in hash_set
hash_set[loc] = val
idx_data[j - pop_size_i + size_i] = val
else: # we need to insert j instead
loc = j & mask
while hash_set[loc] != <uint64_t>-1:
loc = (loc + 1) & mask
hash_set[loc] = j
idx_data[j - pop_size_i + size_i] = j
if shuffle:
self._shuffle_int(size_i, 1, idx_data)
if shape is not None:
idx.shape = shape
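
For readers following the hunk above: when sampling without replacement and without ``p``, the code now either tail-shuffles a full index array (when ``size`` is a large fraction of the population) or runs Floyd's algorithm over an open-addressing hash table. A rough pure-Python sketch of the Floyd branch follows; it is illustrative only, and names such as ``floyd_sample`` are not part of the library.

import numpy as np

def floyd_sample(rng, pop_size, size, shuffle=True):
    # Open-addressing hash table sized to the smallest power of two that can
    # hold about 1.2 * size entries; the all-ones value marks an empty slot.
    mask = 1
    while mask < 1.2 * size:
        mask <<= 1
    mask -= 1
    empty = np.iinfo(np.uint64).max
    table = np.full(mask + 1, empty, dtype=np.uint64)
    out = np.empty(size, dtype=np.int64)
    for k, j in enumerate(range(pop_size - size, pop_size)):
        val = int(rng.integers(0, j + 1))      # uniform draw on [0, j]
        loc = val & mask
        while table[loc] != empty and table[loc] != val:
            loc = (loc + 1) & mask             # linear probing
        if table[loc] == empty:                # val not seen before: keep it
            table[loc] = val
            out[k] = val
        else:                                  # val already drawn: keep j instead
            loc = j & mask
            while table[loc] != empty:
                loc = (loc + 1) & mask
            table[loc] = j
            out[k] = j
    if shuffle:                                # optional final shuffle of the sample
        rng.shuffle(out)
    return out

sample = floyd_sample(np.random.default_rng(0), pop_size=10**6, size=500)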

@@ -3908,7 +3921,7 @@
Parameters
----------
n : int or array-like of ints
n : {int, array_like[int]}
Number of experiments.
pvals : sequence of floats, length p
Probabilities of each of the ``p`` different outcomes. These
@@ -4286,6 +4299,28 @@
string.memcpy(data + j * stride, data + i * stride, itemsize)
string.memcpy(data + i * stride, buf, itemsize)

cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
int64_t* data) nogil:
"""
Parameters
----------
n
Number of elements in data
first
First observation to shuffle. Shuffles n-1,
n-2, ..., first, so that when first=1 the entire
array is shuffled
data
Location of data
"""
cdef np.npy_intp i, j
cdef int64_t temp
for i in reversed(range(first, n)):
j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
temp = data[j]
data[j] = data[i]
data[i] = temp
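
The new ``_shuffle_int`` helper above is a partial Fisher-Yates pass over an int64 buffer, used by ``choice`` for the tail-shuffle branch and the optional final shuffle. A pure-Python equivalent (illustrative only) is roughly:

import numpy as np

def partial_shuffle(rng, data, first):
    # Mirror _shuffle_int: walk i = n-1, n-2, ..., first and swap position i
    # with a uniformly chosen position j in [0, i]; first=1 shuffles everything.
    n = len(data)
    for i in reversed(range(first, n)):
        j = int(rng.integers(0, i + 1))
        data[i], data[j] = data[j], data[i]

idx = np.arange(10, dtype=np.int64)
partial_shuffle(np.random.default_rng(0), idx, first=7)   # randomizes the last 3 slots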

def permutation(self, object x):
"""
permutation(x)
8 changes: 4 additions & 4 deletions randomgen/tests/test_against_numpy.py
@@ -189,7 +189,7 @@ def test_standard_exponential(self):
self.rs.standard_exponential)
self._is_state_common_legacy()

@pytest.mark.xfail(reason="Stream broken for simplicity")
@pytest.mark.xfail(reason="Stream broken for simplicity", strict=True)
def test_tomaxint(self):
self._set_common_state()
self._is_state_common()
@@ -298,7 +298,7 @@ def test_rand(self):
with pytest.deprecated_call():
assert_allclose(f(3, 4, 5), g(3, 4, 5))

@pytest.mark.xfail(reason="Definition of poisson_lam_max changed")
@pytest.mark.xfail(reason="poisson_lam_max changed", strict=True)
def test_poisson_lam_max(self):
assert_allclose(self.rg.poisson_lam_max, self.nprs.poisson_lam_max)

@@ -309,7 +309,7 @@ def test_triangular(self):
self.rg.triangular)
self._is_state_common()

@pytest.mark.xfail(reason="Changes to hypergeometic break stream")
@pytest.mark.xfail(reason="Changes to hypergeometic", strict=True)
def test_hypergeometric(self):
self._set_common_state()
self._is_state_common()
@@ -339,7 +339,7 @@ def test_multinomial(self):
g(100, np.array(p), size=(7, 23)))
self._is_state_common()

@pytest.mark.xfail(reason="Stream broken for performance")
@pytest.mark.xfail(reason="Stream broken for performance", strict=True)
def test_choice(self):
self._set_common_state()
self._is_state_common()
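The ``strict=True`` additions above make an unexpected pass fail the suite instead of being reported as XPASS; a minimal, hypothetical illustration of the marker's behaviour:

import pytest

@pytest.mark.xfail(reason="Stream intentionally differs from NumPy", strict=True)
def test_known_divergence():
    # If this assertion ever starts passing, strict=True turns the unexpected
    # pass into a failure rather than letting it slip by silently.
    assert [1, 2, 3] == [3, 2, 1]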
4 changes: 2 additions & 2 deletions randomgen/tests/test_final_release_changes.py
@@ -58,9 +58,9 @@ def test_integers_closed():
random_gen.integers(0, 10, closed=False)


def test_integers_use_masked(endpoint):
def test_integers_use_masked():
with pytest.deprecated_call():
random_gen.integers(0, 10, use_masked=endpoint)
random_gen.integers(0, 10, use_masked=True)


def test_integers_large_negative_value():
34 changes: 23 additions & 11 deletions randomgen/tests/test_generator_117.py
@@ -1,3 +1,4 @@
from distutils.version import LooseVersion
from itertools import product

import numpy as np
@@ -19,6 +20,15 @@
from randomgen import PCG64


v1174 = LooseVersion("1.17.4")
v118 = LooseVersion("1.18")
NP_LT_1174 = LooseVersion(np.__version__) < v1174
NP_LT_1174_OR_GT_118 = NP_LT_1174 or LooseVersion(np.__version__) > v118

pytestmark = pytest.mark.skipif(NP_LT_1174_OR_GT_118,
reason="Only test 1.17.4 to 1.18.x")


def positive_param():
base = Generator(PCG64())
return [base.chisquare(10),
@@ -221,35 +231,37 @@ def test_permutation():
assert_array_equal(result, expected)


@pytest.mark.xfail(reason="Choice has not been updated")
@pytest.mark.parametrize("replace", [True, False])
def test_choice(replace):
np_gen.bit_generator.state = initial_state
def test_choice_with_p(replace):
x = np.arange(100)
expected = np_gen.choice(x, size=10, replace=replace)
np_gen.bit_generator.state = initial_state
p = (x + 1) / (x + 1).sum()
expected = np_gen.choice(x, size=10, replace=replace, p=p)

gen.bit_generator.state = initial_state
result = gen.choice(x, size=10, replace=replace)
result = gen.choice(x, size=10, replace=replace, p=p)
assert_array_equal(result, expected)


@pytest.mark.parametrize("replace", [True, False])
def test_choice(replace):
np_gen.bit_generator.state = initial_state
p = (x + 1) / (x + 1).sum()
expected = np_gen.choice(x, size=10, replace=replace, p=p)
x = np.arange(100)
expected = np_gen.choice(x, size=10, replace=replace)

gen.bit_generator.state = initial_state
result = gen.choice(x, size=10, replace=replace, p=p)
result = gen.choice(x, size=10, replace=replace)
assert_array_equal(result, expected)


@pytest.mark.xfail(reason="Changes to lemire generators")
@pytest.mark.skipif(NP_LT_1174, reason="Changes to lemire generators")
@pytest.mark.parametrize("args", integers())
def test_integers(args):
np_gen.bit_generator.state = initial_state
expected = np_gen.integers(*args)

gen.bit_generator.state = initial_state
with pytest.deprecated_call():
result = gen.integers(*args, use_masked=False)
result = gen.integers(*args, use_masked=False)
assert_array_equal(result, expected)

