update erthink

miloyip · Apr 3, 2020 · 658b50f · 658b50f
1 parent 604f0b5
commit 658b50f
Show file tree

Hide file tree

Showing 10 changed files with 209 additions and 141 deletions.
diff --git a/README.md b/README.md
@@ -13,17 +13,20 @@ This fork of the benchmark was created to demonstrate the performance superiorit
 
 Briefly, about this double-to-string implementation:
 
-1. it is fastest Grisu-based, but not exactly the Grisu3 nor Grisu2;
+1. It is fastest Grisu-based, but not exactly the Grisu3 nor Grisu2;
 
-2. compared to Ryū this implementation significantly less in code size and spends less clock cycles per digit, but may slightly inferior in a whole on a 16-17 digit values.
+2. Compared to Ryū, this implementation is significantly smaller in code size and spends fewer clock cycles per digit, but may be slightly inferior overall on 16-17 digit values.
 
-3. output string representation _always_ roundtrip convertible to the original value, i.e. `strtod()` for character string result will return the exactly original value.
+3. The output string representation is _always_ roundtrip convertible to the original value, i.e. `strtod()` applied to the resulting character string will return exactly the original value.
 
-4. generated string representation is shortest for more than 99.95% of IEEE-754 double values, i.e. one extra digit for less than 0.05% values.
+4. Generated string representation is shortest for more than `99.963%` of
+IEEE-754 double values, i.e. one extra digit for less than `0.037%` values.
+Moreover, for less than `0.06%` of double values, the last digit differs
+from an ideal nearest by `±1`.
 
-5. for now produces only a raw ASCII representation, e.g. `-22250738585072014e-324` without dot and `'\0'` at the end;
+5. For now produces only a raw ASCII representation, e.g. `-22250738585072014e-324` without dot and `'\0'` at the end;
 
-Now I would like to get feedback, assess how much this is in demand and collect suggestions for further improvements. For instance, I think that it is reasonable to implement conversion with a specified precision (i.e., with a specified number of digits), but not provide a printf-like interface.
+Now I would like to get feedback, assess how much this is in demand and collect suggestions for further improvements. For instance, I think that it is reasonable to implement conversion with a specified precision (i.e., with a specified number of digits), but not provide a printf-like interface. For more info see [issue#1](https://github.com/erthink/erthink/issues/1).
 
 Any suggestions are welcome!
 

diff --git a/src/erthink/.circleci/config.yml b/src/erthink/.circleci/config.yml
@@ -3,6 +3,9 @@ jobs:
   build:
     docker:
       - image: circleci/buildpack-deps:bionic-browsers
+    environment:
+      GTEST_SHUFFLE: 1
+      GTEST_RUNTIME_LIMIT: 199
     steps:
       - run: sudo apt update -q && sudo apt install cmake libgtest-dev clang-format-6.0
       - checkout
@@ -11,7 +14,9 @@ jobs:
       - run: cmake --version
       - run: cmake .
       - run: make --keep-going all
-      - run: ulimit -c unlimited && make --keep-going test
+      - run: |
+          ulimit -c unlimited
+          make --keep-going test
       - run:
           command: |
             mkdir -p /tmp/artifacts

diff --git a/src/erthink/.travis.yml b/src/erthink/.travis.yml
@@ -2,6 +2,11 @@ language: cpp
 sudo: false
 dist: xenial
 
+env:
+  global:
+    - GTEST_SHUFFLE=1
+    - GTEST_RUNTIME_LIMIT=199
+
 addons:
   apt:
 #    sources:

diff --git a/src/erthink/appveyor.yml b/src/erthink/appveyor.yml
@@ -2,7 +2,7 @@ version: 0.0.0.{build}
 
 environment:
   GTEST_SHUFFLE: 1
-  GTEST_RUNTIME_LIMIT: 99
+  GTEST_RUNTIME_LIMIT: 199
   matrix:
     - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
       CMAKE_GENERATOR: Visual Studio 16 2019

diff --git a/src/erthink/erthink_d2a.h b/src/erthink/erthink_d2a.h
@@ -17,14 +17,20 @@
 
 #pragma once
 
-/* Double-to-string conversion based on Grisu algorithm by Florian Loitsch,
+/* Double-to-string conversion based on Grisu algorithm by Florian Loitsch
  * https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf
  *
- * 1. Generated string representation always roundtrip convertible to
- *    the original value, i.e. by strtod() function.
+ * Seems this is the fastest Grisu-based implementation,
+ * but it is not exactly Grisu3 nor Grisu2:
  *
- * 2. Generated string representation is shortest for more than 99.95% of
- *    IEEE-754 double values, i.e. one extra digit for less than 0.05% values.
+ * 1. Generated string representation ALWAYS roundtrip convertible to
+ *    the original value, i.e. any correct implementation of strtod()
+ *    will always return EXACTLY the original double value.
+ *
+ * 2. Generated string representation is shortest for more than 99.963% of
+ *    IEEE-754 double values, i.e. one extra digit for less than 0.037% values.
+ *    Moreover, for less than 0.06% of double values, the last digit differs
+ *    from an ideal nearest by ±1.
  *
  * 3. Compared to Ryū algorithm (by Ulf Adams), this implementation
  *    significantly less in code size and spends less clock cycles per digit,
@@ -167,12 +173,11 @@ struct diy_fp {
     return diy_fp(upper.f - uint64_t(diff >> 1), upper.e);
   }
 
-  void scale(const diy_fp &factor, bool roundup) {
+  uint_fast32_t scale(const diy_fp &factor) {
     const uint64_t l = mul_64x64_128(f, factor.f, &f);
     assert(f < UINT64_MAX - INT32_MAX);
-    if (roundup)
-      f += l >> 63;
     e += factor.e + 64;
+    return static_cast<uint_fast32_t>(l >> 63);
   }
 
   diy_fp operator-(const diy_fp &rhs) const {
@@ -274,12 +279,12 @@ static __always_inline void round(char *&end, uint64_t delta, uint64_t rest,
          (rest + ten_kappa < upper ||
           (rest < upper &&
            /* closer */ upper - rest >= rest + ten_kappa - upper))) {
-    if (unlikely(end[-1] < '2')) {
+    end[-1] -= 1;
+    if (unlikely(end[-1] < '1')) {
       inout_exp10 += 1;
       end -= 1;
       return;
     }
-    end[-1] -= 1;
     rest += ten_kappa;
   }
 }
@@ -464,17 +469,17 @@ static inline char *convert(const bool accurate, diy_fp v, char *const buffer,
   }
 
   const int lead_zeros = clz64(v.f);
-#if 0 /* Given the remaining optimizations, on average it does not have a      \
-         positive effect, although a little faster in a simplest cases. */
-  // LY: check to output as ordinal
-  if (unlikely(v.e >= -52 && v.e <= lead_zeros) &&
+  /* Check to output as ordinal.
+   * Given the remaining optimizations, on average it does not have a positive
+   * effect (although a little faster in a simplest cases).
+   * However, it reduces the number of inaccuracies and non-shortest strings. */
+  if (!accurate && unlikely(v.e >= -52 && v.e <= lead_zeros) &&
       (v.e >= 0 || (v.f << (64 + v.e)) == 0)) {
     uint64_t ordinal = (v.e < 0) ? v.f >> -v.e : v.f << v.e;
     assert(v.f == ((v.e < 0) ? ordinal << -v.e : ordinal >> v.e));
     out_exp10 = 0;
     return u2a(ordinal, buffer);
   }
-#endif
 
   // LY: normalize
   assert(v.f <= UINT64_MAX / 2 && lead_zeros > 1);
@@ -483,12 +488,20 @@ static inline char *convert(const bool accurate, diy_fp v, char *const buffer,
   const diy_fp dec_factor = cached_power(v.e, out_exp10);
 
   // LY: get boundaries
-  const int mojo =
-      v.f >= UINT64_C(0x8000000000001000) ? lead_zeros : lead_zeros - 1;
-  const uint64_t delta = (dec_factor.f >> (64 - mojo)) - 3;
-  v.scale(dec_factor, true);
-  return make_digits(accurate, v.f + delta / 2, delta, buffer, out_exp10, v.f,
-                     -v.e);
+  const int mojo = v.f > UINT64_C(0x80000000000007ff) ? 64 : 65;
+  const uint64_t delta = dec_factor.f >> (mojo - lead_zeros);
+  assert(delta >= 2);
+  const uint_fast32_t lsb = v.scale(dec_factor);
+  if (accurate)
+    // -1 -2 1 0 1: non-shortest 9522 for 25M probes, ratio 0.038088%
+    //              shortest errors: +5727 -9156
+    //              non-shortest errors: +3 -5
+    return make_digits(accurate, v.f + ((delta + lsb - 1) >> 1), delta - 2,
+                       buffer, out_exp10, v.f + lsb, -v.e);
+  else
+    // -1 -2 1 0 0: non-shortest 9522 for 25M probes, ratio 0.038088%
+    return make_digits(accurate, v.f + ((delta + lsb - 1) >> 1), delta - 2,
+                       buffer, out_exp10, v.f, -v.e);
 }
 
 double inline cast(int64_t i64) {
@@ -548,13 +561,13 @@ d2a(const double &value,
   return ptr;
 }
 
-static __maybe_unused char *d2a_accurate(
+static inline __maybe_unused char *d2a_accurate(
     const double &value,
     char *const buffer /* upto d2a_max_chars for -22250738585072014e-324 */) {
   return d2a<true>(value, buffer);
 }
 
-static __maybe_unused char *d2a_fast(
+static inline __maybe_unused char *d2a_fast(
     const double &value,
     char *const buffer /* upto d2a_max_chars for -22250738585072014e-324 */) {
   return d2a<false>(value, buffer);