Skip to content

Commit

Permalink
math_brute_force: treat reciprocal as unary function
Browse files Browse the repository at this point in the history
Treat reciprocal as a unary function, instead of handling it through
the binary function testing mechanism and special-casing it there.

This addresses two shortcomings of the previous implementation:

 - Testing took significantly longer than necessary, because the
   entire input domain was tested many times over (e.g. fp16
   reciprocal has only 2^16 possible input values, but binary function
   testing iterates over 2^16 * 2^16 input pairs).

 - The reciprocal test kernel was identical to the divide kernel.
   Thus the device compiler would see a regular divide operation
   instead of a reciprocal operation and would be unlikely to emit a
   specialized reciprocal sequence.

This reverts all of the changes in binary_operator*.cpp made by
bcfa1f7 ("Added corrections to re-enable reciprocal test in
math_brute_force suite for relaxed math mode (#2221)", 2025-02-04).

Signed-off-by: Sven van Haastregt <[email protected]>
  • Loading branch information
svenvh committed Feb 19, 2025
1 parent 32361e1 commit c33e9b0
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 120 deletions.
38 changes: 10 additions & 28 deletions test_conformance/math_brute_force/binary_operator_double.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,12 +214,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_double *s;
cl_double *s2;

bool reciprocal = strcmp(name, "reciprocal") == 0;
const double reciprocalArrayX[] = { 1.0 };
const double *specialValuesX =
reciprocal ? reciprocalArrayX : specialValues;
size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;

Force64BitFPUPrecision();

cl_event e[VECTOR_SIZE_COUNT];
Expand Down Expand Up @@ -248,7 +242,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
cl_uint idx = 0;
int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
int totalSpecialValueCount = specialValuesCount * specialValuesCount;
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

// Test edge cases
Expand All @@ -258,15 +252,14 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_double *fp2 = (cl_double *)p2;
uint32_t x, y;

x = (job_id * buffer_elements) % specialValuesCountX;
x = (job_id * buffer_elements) % specialValuesCount;
y = (job_id * buffer_elements) / specialValuesCount;

for (; idx < buffer_elements; idx++)
{
fp[idx] = specialValuesX[x];
fp[idx] = specialValues[x];
fp2[idx] = specialValues[y];
++x;
if (x >= specialValuesCountX)
if (++x >= specialValuesCount)
{
x = 0;
y++;
Expand All @@ -278,8 +271,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
// Init any remaining values
for (; idx < buffer_elements; idx++)
{
p[idx] =
reciprocal ? ((cl_ulong *)specialValuesX)[0] : genrand_int64(d);
p[idx] = genrand_int64(d);
p2[idx] = genrand_int64(d);
}

Expand Down Expand Up @@ -372,13 +364,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
s = (cl_double *)gIn + thread_id * buffer_elements;
s2 = (cl_double *)gIn2 + thread_id * buffer_elements;

if (reciprocal)
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (float)func.f_f(s2[j]);
else
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (cl_double)func.f_ff(s[j], s2[j]);
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (cl_double)func.f_ff(s[j], s2[j]);

// Read the data back -- no need to wait for the first N-1 buffers but wait
// for the last buffer. This is an in order queue.
Expand Down Expand Up @@ -408,9 +395,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
if (t[j] != q[j])
{
cl_double test = ((cl_double *)q)[j];
long double correct =
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);

long double correct = func.f_ff(s[j], s2[j]);
float err = Bruteforce_Ulp_Error_Double(test, correct);
int fail = !(fabsf(err) <= ulps);

Expand Down Expand Up @@ -483,11 +468,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
}
else if (IsDoubleSubnormal(s2[j]))
{
long double correct2 =
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
long double correct3 =
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);

long double correct2 = func.f_ff(s[j], 0.0);
long double correct3 = func.f_ff(s[j], -0.0);
float err2 =
Bruteforce_Ulp_Error_Double(test, correct2);
float err3 =
Expand Down
70 changes: 19 additions & 51 deletions test_conformance/math_brute_force/binary_operator_float.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_float *s2 = 0;
RoundingMode oldRoundMode;

bool reciprocal = strcmp(name, "reciprocal") == 0;
const float reciprocalArrayX[] = { 1.f };
const float *specialValuesX = reciprocal ? reciprocalArrayX : specialValues;
size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;

if (relaxedMode)
{
func = job->f->rfunc;
Expand Down Expand Up @@ -244,23 +239,23 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
cl_uint idx = 0;
int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
int totalSpecialValueCount = specialValuesCount * specialValuesCount;
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

if (job_id <= (cl_uint)lastSpecialJobIndex)
{
// Insert special values
uint32_t x, y;

x = (job_id * buffer_elements) % specialValuesCountX;
x = (job_id * buffer_elements) % specialValuesCount;
y = (job_id * buffer_elements) / specialValuesCount;

for (; idx < buffer_elements; idx++)
{
p[idx] = ((cl_uint *)specialValuesX)[x];
p[idx] = ((cl_uint *)specialValues)[x];
p2[idx] = ((cl_uint *)specialValues)[y];
++x;
if (x >= specialValuesCountX)
if (x >= specialValuesCount)
{
x = 0;
y++;
Expand All @@ -274,19 +269,13 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
}
else if (relaxedMode && reciprocal)
{
cl_uint p2j = p2[idx] & 0x7fffffff;
// Replace values outside [2^-126, 2^126] with QNaN
if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
}
}
}

// Init any remaining values
for (; idx < buffer_elements; idx++)
{
p[idx] = reciprocal ? ((cl_uint *)specialValuesX)[0] : genrand_int32(d);
p[idx] = genrand_int32(d);
p2[idx] = genrand_int32(d);

if (relaxedMode && strcmp(name, "divide") == 0)
Expand All @@ -297,12 +286,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
}
else if (relaxedMode && reciprocal)
{
cl_uint p2j = p2[idx] & 0x7fffffff;
// Replace values outside [2^-126, 2^126] with QNaN
if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
}
}

if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
Expand Down Expand Up @@ -408,31 +391,18 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
s2 = (float *)gIn2 + thread_id * buffer_elements;
if (gInfNanSupport)
{
if (reciprocal)
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (float)func.f_f(s2[j]);
else
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (float)func.f_ff(s[j], s2[j]);
for (size_t j = 0; j < buffer_elements; j++)
r[j] = (float)func.f_ff(s[j], s2[j]);
}
else
{
if (reciprocal)
for (size_t j = 0; j < buffer_elements; j++)
{
feclearexcept(FE_OVERFLOW);
r[j] = (float)func.f_f(s2[j]);
overflow[j] =
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
}
else
for (size_t j = 0; j < buffer_elements; j++)
{
feclearexcept(FE_OVERFLOW);
r[j] = (float)func.f_ff(s[j], s2[j]);
overflow[j] =
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
}
for (size_t j = 0; j < buffer_elements; j++)
{
feclearexcept(FE_OVERFLOW);
r[j] = (float)func.f_ff(s[j], s2[j]);
overflow[j] =
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
}
}

if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
Expand Down Expand Up @@ -467,8 +437,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
if (t[j] != q[j])
{
float test = ((float *)q)[j];
double correct =
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
double correct = func.f_ff(s[j], s2[j]);

// Per section 10 paragraph 6, accept any result if an input or
// output is a infinity or NaN or overflow
Expand Down Expand Up @@ -505,7 +474,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
}

// retry per section 6.5.3.3
if (!reciprocal && IsFloatSubnormal(s[j]))
if (IsFloatSubnormal(s[j]))
{
double correct2, correct3;
float err2, err3;
Expand Down Expand Up @@ -611,10 +580,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)

if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);

correct2 =
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
correct3 =
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
correct2 = func.f_ff(s[j], 0.0);
correct3 = func.f_ff(s[j], -0.0);

// Per section 10 paragraph 6, accept any result if an
// input or output is a infinity or NaN or overflow
Expand Down Expand Up @@ -647,6 +614,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
}
}


if (fabsf(err) > tinfo->maxError)
{
tinfo->maxError = fabsf(err);
Expand Down
48 changes: 13 additions & 35 deletions test_conformance/math_brute_force/binary_operator_half.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
std::vector<float> s(0), s2(0);
RoundingMode oldRoundMode;

bool reciprocal = strcmp(name, "reciprocal") == 0;
const cl_half reciprocalArrayHalfX[] = { 0x3c00 };
const cl_half *specialValuesHalfX =
reciprocal ? reciprocalArrayHalfX : specialValuesHalf;
size_t specialValuesHalfCountX = reciprocal ? 1 : specialValuesHalfCount;

cl_event e[VECTOR_SIZE_COUNT];
cl_half *out[VECTOR_SIZE_COUNT];

Expand Down Expand Up @@ -154,23 +148,22 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
cl_uint idx = 0;
int totalSpecialValueCount =
specialValuesHalfCountX * specialValuesHalfCount;
specialValuesHalfCount * specialValuesHalfCount;
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

if (job_id <= (cl_uint)lastSpecialJobIndex)
{
// Insert special values
uint32_t x, y;

x = (job_id * buffer_elements) % specialValuesHalfCountX;
x = (job_id * buffer_elements) % specialValuesHalfCount;
y = (job_id * buffer_elements) / specialValuesHalfCount;

for (; idx < buffer_elements; idx++)
{
p[idx] = specialValuesHalfX[x];
p[idx] = specialValuesHalf[x];
p2[idx] = specialValuesHalf[y];
++x;
if (x >= specialValuesHalfCountX)
if (++x >= specialValuesHalfCount)
{
x = 0;
y++;
Expand All @@ -182,8 +175,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
// Init any remaining values
for (; idx < buffer_elements; idx++)
{
p[idx] = reciprocal ? ((cl_half *)specialValuesHalfX)[0]
: (cl_half)genrand_int32(d);
p[idx] = (cl_half)genrand_int32(d);
p2[idx] = (cl_half)genrand_int32(d);
}
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
Expand Down Expand Up @@ -280,23 +272,11 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
s.resize(buffer_elements);
s2.resize(buffer_elements);

if (reciprocal)
{
for (size_t j = 0; j < buffer_elements; j++)
{
s[j] = HTF(p[j]);
s2[j] = HTF(p2[j]);
r[j] = HFF(func.f_f(s2[j]));
}
}
else
for (size_t j = 0; j < buffer_elements; j++)
{
for (size_t j = 0; j < buffer_elements; j++)
{
s[j] = HTF(p[j]);
s2[j] = HTF(p2[j]);
r[j] = HFF(func.f_ff(s[j], s2[j]));
}
s[j] = HTF(p[j]);
s2[j] = HTF(p2[j]);
r[j] = HFF(func.f_ff(s[j], s2[j]));
}

if (ftz) RestoreFPState(&oldMode);
Expand Down Expand Up @@ -329,8 +309,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
if (r[j] != q[j])
{
float test = HTF(q[j]);
float correct =
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
float correct = func.f_ff(s[j], s2[j]);

// Per section 10 paragraph 6, accept any result if an input or
// output is a infinity or NaN or overflow
Expand Down Expand Up @@ -456,10 +435,9 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
double correct2, correct3;
float err2, err3;

correct2 =
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
correct3 =
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
correct2 = func.f_ff(s[j], 0.0);
correct3 = func.f_ff(s[j], -0.0);


// Per section 10 paragraph 6, accept any result if an
// input or output is a infinity or NaN or overflow
Expand Down
5 changes: 2 additions & 3 deletions test_conformance/math_brute_force/function_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,9 +427,8 @@ const Func functionList[] = {
// basic operations
OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
//ENTRY(reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
{ "reciprocal",
"/",
"reciprocal",
{ (void*)reference_reciprocal },
{ (void*)reference_reciprocall },
{ (void*)reference_relaxed_reciprocal },
Expand All @@ -442,7 +441,7 @@ const Func functionList[] = {
INFINITY,
FTZ_OFF,
RELAXED_ON,
binaryOperatorF },
unaryF},
{ "divide",
"/",
{ (void*)reference_divide },
Expand Down
7 changes: 6 additions & 1 deletion test_conformance/math_brute_force/unary_double.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
BuildKernelInfo &info = *(BuildKernelInfo *)p;
auto generator = [](const std::string &kernel_name, const char *builtin,
cl_uint vector_size_index) {
return GetUnaryKernel(kernel_name, builtin, ParameterType::Double,
const char *builtinCall = builtin;
if (strcmp(builtin, "reciprocal") == 0)
{
builtinCall = "((RETTYPE)(1.0))/";
}
return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Double,
ParameterType::Double, vector_size_index);
};
return BuildKernels(info, job_id, generator);
Expand Down
7 changes: 6 additions & 1 deletion test_conformance/math_brute_force/unary_float.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
BuildKernelInfo &info = *(BuildKernelInfo *)p;
auto generator = [](const std::string &kernel_name, const char *builtin,
cl_uint vector_size_index) {
return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
const char *builtinCall = builtin;
if (strcmp(builtin, "reciprocal") == 0)
{
builtinCall = "((RETTYPE)(1.0f))/";
}
return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Float,
ParameterType::Float, vector_size_index);
};
return BuildKernels(info, job_id, generator);
Expand Down
Loading

0 comments on commit c33e9b0

Please sign in to comment.