math_brute_force: treat reciprocal as unary function

Treat reciprocal as a unary function, instead of handling it through the binary function testing mechanism and special-casing it there.

This addresses two shortcomings of the previous implementation:

- Testing took significantly longer as the entire input domain was tested many times (e.g. fp16 reciprocal has only 2^16 possible input values, but binary function testing iterates over 2^16 * 2^16 input values).
- The reciprocal test kernel was identical to the divide kernel. Thus the device compiler would see a regular divide operation instead of a reciprocal operation and would be unlikely to emit a specialized reciprocal sequence.

This reverts all of the changes in binary_operator*.cpp made by bcfa1f7 ("Added corrections to re-enable reciprocal test in math_brute_force suite for relaxed math mode (#2221)", 2025-02-04).

Signed-off-by: Sven van Haastregt <[email protected]>
KhronosGroup · Feb 19, 2025 · c33e9b0 · c33e9b0
1 parent 32361e1
commit c33e9b0
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 120 deletions.
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -214,12 +214,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_double *s;
     cl_double *s2;
 
-    bool reciprocal = strcmp(name, "reciprocal") == 0;
-    const double reciprocalArrayX[] = { 1.0 };
-    const double *specialValuesX =
-        reciprocal ? reciprocalArrayX : specialValues;
-    size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
-
     Force64BitFPUPrecision();
 
     cl_event e[VECTOR_SIZE_COUNT];
@@ -248,7 +242,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     cl_uint idx = 0;
-    int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
     // Test edge cases
@@ -258,15 +252,14 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesCountX;
+        x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; idx < buffer_elements; idx++)
         {
-            fp[idx] = specialValuesX[x];
+            fp[idx] = specialValues[x];
             fp2[idx] = specialValues[y];
-            ++x;
-            if (x >= specialValuesCountX)
+            if (++x >= specialValuesCount)
             {
                 x = 0;
                 y++;
@@ -278,8 +271,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     // Init any remaining values
     for (; idx < buffer_elements; idx++)
     {
-        p[idx] =
-            reciprocal ? ((cl_ulong *)specialValuesX)[0] : genrand_int64(d);
+        p[idx] = genrand_int64(d);
         p2[idx] = genrand_int64(d);
     }
 
@@ -372,13 +364,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     s = (cl_double *)gIn + thread_id * buffer_elements;
     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-
-    if (reciprocal)
-        for (size_t j = 0; j < buffer_elements; j++)
-            r[j] = (float)func.f_f(s2[j]);
-    else
-        for (size_t j = 0; j < buffer_elements; j++)
-            r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+    for (size_t j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
     // Read the data back -- no need to wait for the first N-1 buffers but wait
     // for the last buffer. This is an in order queue.
@@ -408,9 +395,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             if (t[j] != q[j])
             {
                 cl_double test = ((cl_double *)q)[j];
-                long double correct =
-                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
-
+                long double correct = func.f_ff(s[j], s2[j]);
                 float err = Bruteforce_Ulp_Error_Double(test, correct);
                 int fail = !(fabsf(err) <= ulps);
 
@@ -483,11 +468,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                     }
                     else if (IsDoubleSubnormal(s2[j]))
                     {
-                        long double correct2 =
-                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
-                        long double correct3 =
-                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
-
+                        long double correct2 = func.f_ff(s[j], 0.0);
+                        long double correct3 = func.f_ff(s[j], -0.0);
                         float err2 =
                             Bruteforce_Ulp_Error_Double(test, correct2);
                         float err3 =

diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -208,11 +208,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_float *s2 = 0;
     RoundingMode oldRoundMode;
 
-    bool reciprocal = strcmp(name, "reciprocal") == 0;
-    const float reciprocalArrayX[] = { 1.f };
-    const float *specialValuesX = reciprocal ? reciprocalArrayX : specialValues;
-    size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
-
     if (relaxedMode)
     {
         func = job->f->rfunc;
@@ -244,23 +239,23 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     cl_uint idx = 0;
-    int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
+    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)lastSpecialJobIndex)
     {
         // Insert special values
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesCountX;
+        x = (job_id * buffer_elements) % specialValuesCount;
         y = (job_id * buffer_elements) / specialValuesCount;
 
         for (; idx < buffer_elements; idx++)
         {
-            p[idx] = ((cl_uint *)specialValuesX)[x];
+            p[idx] = ((cl_uint *)specialValues)[x];
             p2[idx] = ((cl_uint *)specialValues)[y];
             ++x;
-            if (x >= specialValuesCountX)
+            if (x >= specialValuesCount)
             {
                 x = 0;
                 y++;
@@ -274,19 +269,13 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
                 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
             }
-            else if (relaxedMode && reciprocal)
-            {
-                cl_uint p2j = p2[idx] & 0x7fffffff;
-                // Replace values outside [2^-126, 2^126] with QNaN
-                if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
-            }
         }
     }
 
     // Init any remaining values
     for (; idx < buffer_elements; idx++)
     {
-        p[idx] = reciprocal ? ((cl_uint *)specialValuesX)[0] : genrand_int32(d);
+        p[idx] = genrand_int32(d);
         p2[idx] = genrand_int32(d);
 
         if (relaxedMode && strcmp(name, "divide") == 0)
@@ -297,12 +286,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
             if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
         }
-        else if (relaxedMode && reciprocal)
-        {
-            cl_uint p2j = p2[idx] & 0x7fffffff;
-            // Replace values outside [2^-126, 2^126] with QNaN
-            if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
-        }
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -408,31 +391,18 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
     s2 = (float *)gIn2 + thread_id * buffer_elements;
     if (gInfNanSupport)
     {
-        if (reciprocal)
-            for (size_t j = 0; j < buffer_elements; j++)
-                r[j] = (float)func.f_f(s2[j]);
-        else
-            for (size_t j = 0; j < buffer_elements; j++)
-                r[j] = (float)func.f_ff(s[j], s2[j]);
+        for (size_t j = 0; j < buffer_elements; j++)
+            r[j] = (float)func.f_ff(s[j], s2[j]);
     }
     else
     {
-        if (reciprocal)
-            for (size_t j = 0; j < buffer_elements; j++)
-            {
-                feclearexcept(FE_OVERFLOW);
-                r[j] = (float)func.f_f(s2[j]);
-                overflow[j] =
-                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
-            }
-        else
-            for (size_t j = 0; j < buffer_elements; j++)
-            {
-                feclearexcept(FE_OVERFLOW);
-                r[j] = (float)func.f_ff(s[j], s2[j]);
-                overflow[j] =
-                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
-            }
+        for (size_t j = 0; j < buffer_elements; j++)
+        {
+            feclearexcept(FE_OVERFLOW);
+            r[j] = (float)func.f_ff(s[j], s2[j]);
+            overflow[j] =
+                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+        }
     }
 
     if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
@@ -467,8 +437,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
             if (t[j] != q[j])
             {
                 float test = ((float *)q)[j];
-                double correct =
-                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
+                double correct = func.f_ff(s[j], s2[j]);
 
                 // Per section 10 paragraph 6, accept any result if an input or
                 // output is a infinity or NaN or overflow
@@ -505,7 +474,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                     }
 
                     // retry per section 6.5.3.3
-                    if (!reciprocal && IsFloatSubnormal(s[j]))
+                    if (IsFloatSubnormal(s[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
@@ -611,10 +580,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                        correct2 =
-                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
-                        correct3 =
-                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
+                        correct2 = func.f_ff(s[j], 0.0);
+                        correct3 = func.f_ff(s[j], -0.0);
 
                         // Per section 10 paragraph 6, accept any result if an
                         // input or output is a infinity or NaN or overflow
@@ -647,6 +614,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                     }
                 }
 
+
                 if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);

diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -120,12 +120,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     std::vector<float> s(0), s2(0);
     RoundingMode oldRoundMode;
 
-    bool reciprocal = strcmp(name, "reciprocal") == 0;
-    const cl_half reciprocalArrayHalfX[] = { 0x3c00 };
-    const cl_half *specialValuesHalfX =
-        reciprocal ? reciprocalArrayHalfX : specialValuesHalf;
-    size_t specialValuesHalfCountX = reciprocal ? 1 : specialValuesHalfCount;
-
     cl_event e[VECTOR_SIZE_COUNT];
     cl_half *out[VECTOR_SIZE_COUNT];
 
@@ -154,23 +148,22 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
     cl_uint idx = 0;
     int totalSpecialValueCount =
-        specialValuesHalfCountX * specialValuesHalfCount;
+        specialValuesHalfCount * specialValuesHalfCount;
     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)lastSpecialJobIndex)
     {
         // Insert special values
         uint32_t x, y;
 
-        x = (job_id * buffer_elements) % specialValuesHalfCountX;
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
         y = (job_id * buffer_elements) / specialValuesHalfCount;
 
         for (; idx < buffer_elements; idx++)
         {
-            p[idx] = specialValuesHalfX[x];
+            p[idx] = specialValuesHalf[x];
             p2[idx] = specialValuesHalf[y];
-            ++x;
-            if (x >= specialValuesHalfCountX)
+            if (++x >= specialValuesHalfCount)
             {
                 x = 0;
                 y++;
@@ -182,8 +175,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Init any remaining values
     for (; idx < buffer_elements; idx++)
     {
-        p[idx] = reciprocal ? ((cl_half *)specialValuesHalfX)[0]
-                            : (cl_half)genrand_int32(d);
+        p[idx] = (cl_half)genrand_int32(d);
         p2[idx] = (cl_half)genrand_int32(d);
     }
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -280,23 +272,11 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     s.resize(buffer_elements);
     s2.resize(buffer_elements);
 
-    if (reciprocal)
-    {
-        for (size_t j = 0; j < buffer_elements; j++)
-        {
-            s[j] = HTF(p[j]);
-            s2[j] = HTF(p2[j]);
-            r[j] = HFF(func.f_f(s2[j]));
-        }
-    }
-    else
+    for (size_t j = 0; j < buffer_elements; j++)
     {
-        for (size_t j = 0; j < buffer_elements; j++)
-        {
-            s[j] = HTF(p[j]);
-            s2[j] = HTF(p2[j]);
-            r[j] = HFF(func.f_ff(s[j], s2[j]));
-        }
+        s[j] = HTF(p[j]);
+        s2[j] = HTF(p2[j]);
+        r[j] = HFF(func.f_ff(s[j], s2[j]));
     }
 
     if (ftz) RestoreFPState(&oldMode);
@@ -329,8 +309,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             if (r[j] != q[j])
             {
                 float test = HTF(q[j]);
-                float correct =
-                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
+                float correct = func.f_ff(s[j], s2[j]);
 
                 // Per section 10 paragraph 6, accept any result if an input or
                 // output is a infinity or NaN or overflow
@@ -456,10 +435,9 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         double correct2, correct3;
                         float err2, err3;
 
-                        correct2 =
-                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
-                        correct3 =
-                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
+                        correct2 = func.f_ff(s[j], 0.0);
+                        correct3 = func.f_ff(s[j], -0.0);
+
 
                         // Per section 10 paragraph 6, accept any result if an
                         // input or output is a infinity or NaN or overflow

diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
@@ -427,9 +427,8 @@ const Func functionList[] = {
     // basic operations
     OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
     OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    //ENTRY(reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
     { "reciprocal",
-      "/",
+      "reciprocal",
       { (void*)reference_reciprocal },
       { (void*)reference_reciprocall },
       { (void*)reference_relaxed_reciprocal },
@@ -442,7 +441,7 @@ const Func functionList[] = {
       INFINITY,
       FTZ_OFF,
       RELAXED_ON,
-      binaryOperatorF },
+      unaryF},
     { "divide",
       "/",
       { (void*)reference_divide },

diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
@@ -29,7 +29,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
                         cl_uint vector_size_index) {
-        return GetUnaryKernel(kernel_name, builtin, ParameterType::Double,
+        const char *builtinCall = builtin;
+        if (strcmp(builtin, "reciprocal") == 0)
+        {
+            builtinCall = "((RETTYPE)(1.0))/";
+        }
+        return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Double,
                               ParameterType::Double, vector_size_index);
     };
     return BuildKernels(info, job_id, generator);

diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
@@ -28,7 +28,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
                         cl_uint vector_size_index) {
-        return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
+        const char *builtinCall = builtin;
+        if (strcmp(builtin, "reciprocal") == 0)
+        {
+            builtinCall = "((RETTYPE)(1.0f))/";
+        }
+        return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Float,
                               ParameterType::Float, vector_size_index);
     };
     return BuildKernels(info, job_id, generator);