Skip to content

Commit 0a7726c

Browse files
authored
Improving first iteration by avoiding string creation as much as possible (gunnarmorling#516)
- Avoids creating unnecessary String objects by handling station names through their djb2 hashes instead - Initializes the hash maps with an explicit capacity and load factor - Adds -XX:+AlwaysPreTouch
1 parent 36ffed1 commit 0a7726c

File tree

3 files changed

+55
-33
lines changed

3 files changed

+55
-33
lines changed

calculate_average_adriacabeza.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,6 @@
1616
#
1717

1818

19-
JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC"
19+
JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
2020
java --enable-preview -classpath target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_adriacabeza
2121

github_users.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,4 @@ gnmathur;Gaurav Mathur
5252
vemana;Subrahmanyam
5353
jincongho;Jin Cong Ho
5454
yonatang;Yonatan Graber
55+
adriacabeza;Adrià Cabeza

src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@
2323
import java.nio.file.Paths;
2424
import java.nio.file.StandardOpenOption;
2525
import java.util.ArrayList;
26+
import java.util.Comparator;
2627
import java.util.HashMap;
28+
import java.util.HashSet;
2729
import java.util.List;
2830
import java.util.Map;
31+
import java.util.Set;
32+
import java.util.concurrent.ConcurrentHashMap;
2933
import java.util.stream.Collectors;
3034

3135
/**
@@ -35,11 +39,22 @@ public class CalculateAverage_adriacabeza {
3539

3640
private static final Path FILE_PATH = Paths.get("./measurements.txt");
3741
public static final int CITY_NAME_MAX_CHARACTERS = 128;
42+
private static final int N_PROCESSORS = Runtime.getRuntime().availableProcessors();
43+
private static final int DJB2_INIT = 5381;
44+
private static final Map<Integer, String> cityMap = new ConcurrentHashMap<>(10_000, 1, N_PROCESSORS);
3845

3946
/**
4047
* Represents a result containing a HashMap with the djb2 hash of the city name as key and StationData as value.
4148
*/
4249
private static class Result {
50+
public void addStation(int hash, int value) {
51+
resultMap.put(hash, new StationData(value));
52+
}
53+
54+
public StationData getData(int hash) {
55+
return resultMap.get(hash);
56+
}
57+
4358
private static class StationData {
4459
private int min, sum, count, max;
4560

@@ -63,28 +78,16 @@ public String toString() {
6378

6479
}
6580

66-
private final Map<String, StationData> resultMap;
81+
private final Map<Integer, StationData> resultMap;
6782

6883
public Result() {
69-
this.resultMap = new HashMap<>();
84+
this.resultMap = new HashMap<>(10_000, 1);
7085
}
7186

72-
public Map<String, StationData> getResultMap() {
87+
public Map<Integer, StationData> getResultMap() {
7388
return resultMap;
7489
}
7590

76-
public void addMeasurement(String city, int value) {
77-
resultMap.compute(city, (_, resultRow) -> {
78-
if (resultRow == null) {
79-
return new StationData(value);
80-
}
81-
else {
82-
resultRow.update(value);
83-
return resultRow;
84-
}
85-
});
86-
}
87-
8891
public void merge(Result other) {
8992
other.getResultMap().forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> {
9093
existing.min = Math.min(existing.min, incoming.min);
@@ -96,9 +99,9 @@ public void merge(Result other) {
9699
}
97100

98101
public String toString() {
99-
return this.resultMap.entrySet().stream()
100-
.sorted(Map.Entry.comparingByKey())
101-
.map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
102+
return this.resultMap.entrySet().parallelStream()
103+
.map(entry -> "%s=%s".formatted(cityMap.get(entry.getKey()), entry.getValue()))
104+
.sorted(Comparator.comparing(s -> s.split("=")[0]))
102105
.collect(Collectors.joining(", ", "{", "}"));
103106
}
104107
}
@@ -155,6 +158,21 @@ private static List<MappedByteBuffer> getMappedByteBuffers(int nProcessors) thro
155158
}
156159
}
157160

161+
private static int readNumberFromBuffer(ByteBuffer buffer, int limit) {
162+
var number = 0;
163+
var sign = 1;
164+
while (buffer.position() < limit) {
165+
var numberByte = buffer.get();
166+
if (numberByte == '-')
167+
sign = -1;
168+
else if (numberByte == '\n')
169+
break;
170+
else if (numberByte != '.')
171+
number = number * 10 + (numberByte - '0');
172+
}
173+
return sign * number;
174+
}
175+
158176
/**
159177
* Calculates average measurements from the file.
160178
*
@@ -167,28 +185,31 @@ private static Result calculateAverageMeasurements(List<MappedByteBuffer> chunks
167185
Result partialResult = new Result();
168186
var limit = buffer.limit();
169187
var field = new byte[CITY_NAME_MAX_CHARACTERS];
188+
Set<Integer> seenHashes = new HashSet<>(10_000, 1);
170189
while (buffer.position() < limit) {
171190
var fieldCurrentIndex = 0;
172-
field[fieldCurrentIndex++] = buffer.get();
191+
var fieldByte = buffer.get();
192+
field[fieldCurrentIndex++] = fieldByte;
193+
// implement djb2 hash: https://theartincode.stanis.me/008-djb2/
194+
int hash = DJB2_INIT;
173195
while (buffer.position() < limit) {
174-
var fieldByte = buffer.get();
196+
// hash = hash * 33 + fieldByte
197+
hash = (((hash << 5) + hash) + fieldByte);
198+
fieldByte = buffer.get();
175199
if (fieldByte == ';')
176200
break;
177201
field[fieldCurrentIndex++] = fieldByte;
178202
}
179-
var fieldStr = new String(field, 0, fieldCurrentIndex);
180-
var number = 0;
181-
var sign = 1;
182-
while (buffer.position() < limit) {
183-
var numberByte = buffer.get();
184-
if (numberByte == '-')
185-
sign = -1;
186-
else if (numberByte == '\n')
187-
break;
188-
else if (numberByte != '.')
189-
number = number * 10 + (numberByte - '0');
203+
204+
var number = readNumberFromBuffer(buffer, limit);
205+
if (!seenHashes.contains(hash)) {
206+
seenHashes.add(hash);
207+
cityMap.put(hash, new String(field, 0, fieldCurrentIndex));
208+
partialResult.addStation(hash, number);
209+
}
210+
else {
211+
partialResult.getData(hash).update(number);
190212
}
191-
partialResult.addMeasurement(fieldStr, sign * number);
192213
}
193214
return partialResult;
194215
}).reduce(new Result(), (partialResult1, partialResult2) -> {

0 commit comments

Comments
 (0)