diff --git a/README.md b/README.md
index 51bca3e..d8b98ec 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ The core algorithms are described in the papers:
 
 ### Speedup
 
-Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays in a aligned memory location and compute the same statistics many times. This reflect the fastest possible throughput if you never have to leave the destination cache-level.
+Sample performance metrics (practical upper limit) on an AVX512BW machine. We simulate a single data array or pairs of data arrays in an aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (requires the Linux `perf` subsystem). This reflects the fastest possible throughput if you never have to leave the destination cache-level.
 The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3).
 
 ### POSPOPCNT
@@ -51,27 +51,27 @@ CPUs compared to a naive unvectorized solution
 Fold speedup compared to a naive unvectorized algorithm
 (`popcount_scalar_naive_nosimd`) for different array sizes as (CPU cycles/64-bit word, Instructions/64-bit word):
 
-| Words   | libalgebra.h  | Scalar        | Speedup |
-|---------|---------------|---------------|---------|
-| 4       | 28.5 (36.75)  | 27.25 (34)    | 1       |
-| 8       | 18.38 (25.5)  | 15.63 (30.5)  | 0.9     |
-| 16      | 10.81 (19.94) | 13.75 (28.75) | 1.3     |
-| 32      | 7.91 (17.16)  | 11.25 (27.88) | 1.4     |
-| 64      | 3.47 (3.92)   | 9.7 (27.44)   | 2.8     |
-| 128     | 2.23 (1.93)   | 8.97 (27.22)  | 4       |
-| 256     | 1.09 (1.29)   | 8.6 (27.11)   | 7.9     |
-| 512     | 0.7 (0.98)    | 8.35 (27.06)  | 11.9    |
-| 1024    | 0.48 (0.82)   | 8.22 (27.03)  | 17.2    |
-| 2048    | 0.35 (0.74)   | 8.12 (27.01)  | 23.5    |
-| 4096    | 0.29 (0.7)    | 8.09 (27.01)  | 28.1    |
-| 8192    | 0.24 (0.68)   | 8.04 (27)     | 33.2    |
-| 16384   | 0.22 (0.67)   | 8.02 (27)     | 36.6    |
-| 32768   | **0.21 (0.66)**   | 8.02 (27)     | **38.7**    |
-| 65536   | 0.83 (0.66)   | 8.01 (27)     | 9.7     |
-| 131072  | 0.47 (0.66)   | 8.02 (27)     | 17      |
-| 262144  | 0.49 (0.66)   | 8.05 (27)     | 16.5    |
-| 524288  | 0.48 (0.66)   | 8.07 (27)     | 16.9    |
-| 1048576 | 0.44 (0.66)   | 8.07 (27)     | 18.5    |
+| Words   | libalgebra.h | Scalar        | Speedup |
+|---------|--------------|---------------|---------|
+| 4       | 27.75 (37)   | 26.75 (33.5)  | 1       |
+| 8       | 16.38 (25.5) | 17.38 (30.25) | 1.1     |
+| 16      | 10.5 (19.94) | 12.75 (28.63) | 1.2     |
+| 32      | 7.72 (17.16) | 10.69 (27.81) | 1.4     |
+| 64      | 3.09 (4.36)  | 9.61 (27.41)  | 3.1     |
+| 128     | 2.53 (2.73)  | 8.84 (27.2)   | 3.5     |
+| 256     | 1.35 (1.7)   | 8.5 (27.1)    | 6.3     |
+| 512     | 0.67 (1.18)  | 8.33 (27.05)  | 12.4    |
+| 1024    | 0.5 (0.92)   | 8.25 (27.03)  | 16.4    |
+| 2048    | 0.41 (0.79)  | 8.15 (27.01)  | 20.1    |
+| 4096    | 0.46 (0.72)  | 8.12 (27.01)  | 17.8    |
+| 8192    | 0.39 (0.69)  | 8.11 (27)     | 21      |
+| 16384   | 0.39 (0.67)  | 8.1 (27)      | 20.6    |
+| 32768   | 0.89 (0.66)  | 8.1 (27)      | 9.1     |
+| 65536   | 0.84 (0.66)  | 8.1 (27)      | 9.6     |
+| 131072  | 0.68 (0.66)  | 8.09 (27)     | 11.9    |
+| 262144  | 1.11 (0.66)  | 8.09 (27)     | 7.3     |
+| 524288  | 1.84 (0.66)  | 8.12 (27)     | 4.4     |
+| 1048576 | 1.95 (0.66)  | 8.15 (27)     | 4.2     |
 
 ### Set algebra
 
@@ -82,25 +82,25 @@ OR, or XOR) which all have identical latency and throughput (CPI).
 
 | Words   | libalgebra.h | Scalar        | Speedup |
 |---------|--------------|---------------|---------|
-| 4       | 18.25 (8.63) | 14.63 (22.75) | 0.8     |
-| 8       | 9.5 (5.44)   | 10.25 (20.88) | 1.1     |
-| 16      | 5.28 (3.84)  | 7.91 (19.94)  | 1.5     |
-| 32      | 2.88 (2.56)  | 6.88 (19.47)  | 2.4     |
-| 64      | 1.98 (2.06)  | 5.94 (19.23)  | 3       |
-| 128     | 1.05 (0.89)  | 5.49 (19.12)  | 5.2     |
-| 256     | 0.61 (0.64)  | 5.24 (19.06)  | 8.6     |
-| 512     | 0.42 (0.51)  | 5.13 (19.03)  | 12.3    |
-| 1024    | 0.29 (0.45)  | 5.06 (19.02)  | 17.4    |
-| 2048    | 0.21 (0.41)  | 5.04 (19.01)  | 24.2    |
-| 4096    | 0.17 (0.4)   | 5.02 (19)     | 29      |
-| 8192    | 0.15 (0.39)  | 5.01 (19)     | 34.3    |
-| 16384   | **0.13 (0.39)**  | 5.01 (19)     | **37.6**    |
-| 32768   | 0.54 (0.39)  | 5 (19)        | 9.3     |
-| 65536   | 0.43 (0.38)  | 5 (19)        | 11.7    |
-| 131072  | 0.28 (0.38)  | 5 (19)        | 17.8    |
-| 262144  | 0.25 (0.38)  | 5 (19)        | 20.4    |
-| 524288  | 0.24 (0.38)  | 5 (19)        | 20.7    |
-| 1048576 | 0.22 (0.38)  | 5 (19)        | 22.9    |
+| 4       | 17.63 (8.63) | 14.63 (22.75) | 0.8     |
+| 8       | 8.13 (5.44)  | 10 (20.88)    | 1.2     |
+| 16      | 4.69 (3.84)  | 7.91 (19.94)  | 1.7     |
+| 32      | 2.38 (2.56)  | 6.59 (19.47)  | 2.8     |
+| 64      | 1.82 (2.06)  | 5.87 (19.23)  | 3.2     |
+| 128     | 0.88 (0.89)  | 5.43 (19.12)  | 6.2     |
+| 256     | 0.57 (0.64)  | 5.18 (19.06)  | 9.2     |
+| 512     | 0.41 (0.51)  | 5.11 (19.03)  | 12.4    |
+| 1024    | 0.33 (0.45)  | 5.06 (19.02)  | 15.3    |
+| 2048    | 0.39 (0.41)  | 5.03 (19.01)  | 13.1    |
+| 4096    | 0.36 (0.4)   | 5.02 (19)     | 13.9    |
+| 8192    | 0.37 (0.39)  | 5.01 (19)     | 13.7    |
+| 16384   | 0.55 (0.39)  | 5.01 (19)     | 9.1     |
+| 32768   | 0.55 (0.39)  | 5 (19)        | 9.2     |
+| 65536   | 0.52 (0.38)  | 5 (19)        | 9.7     |
+| 131072  | 0.56 (0.38)  | 5.01 (19)     | 9       |
+| 262144  | 1.25 (0.38)  | 5.02 (19)     | 4       |
+| 524288  | 1.76 (0.38)  | 5.03 (19)     | 2.9     |
+| 1048576 | 1.81 (0.38)  | 5.07 (19)     | 2.8     |
 
 ## C/C++ API
 
@@ -135,7 +135,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags);
  * Compute the intersection, union, or diff cardinality between pairs of bitmaps
  * @data1: A 64-bit array
  * @data2: A 64-bit array
- * @size: Size of data in bytes
+ * @size: Size of data in 64-bit words
  */
 // Intersect cardinality
 uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size);