Skip to content

Commit 936fc1d

Browse files
albertoventurini and Alberto Venturini authored
Second version by albertoventurini (gunnarmorling#609)
* Contribution by albertoventurini
* Use byte arrays of size 2^20

---------

Co-authored-by: Alberto Venturini <[email protected]>
1 parent 3e208be commit 936fc1d

File tree

2 files changed

+60
-31
lines changed

2 files changed

+60
-31
lines changed

calculate_average_albertoventurini.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,5 +15,5 @@
1515
# limitations under the License.
1616
#
1717

18-
JAVA_OPTS="-server -Xnoclassgc"
18+
JAVA_OPTS="-Xnoclassgc"
1919
java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini

src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java

Lines changed: 59 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -58,31 +58,31 @@ private static final class TrieNode {
5858

5959
// Process a chunk and write results in a Trie rooted at 'root'.
6060
private static void processChunk(final TrieNode root, final ChunkReader cr) {
61-
while (cr.hasNext()) {
61+
while (cr.ensureHasMoreRows()) {
6262
TrieNode node = root;
6363

6464
// Process the location name navigating through the trie
65-
int b = cr.getNext() & 0xFF;
66-
while (b != ';') {
65+
int b = cr.getNext();
66+
do {
67+
b &= 0xFF;
6768
if (node.children[b] == null) {
6869
node.children[b] = new TrieNode();
6970
}
7071
node = node.children[b];
71-
b = cr.getNext() & 0xFF;
72-
}
72+
b = cr.getNext();
73+
} while (b != ';');
7374

7475
// Process the reading value (temperature)
75-
int reading;
76+
final int reading;
7677

77-
byte b1 = cr.getNext();
78-
byte b2 = cr.getNext();
79-
byte b3 = cr.getNext();
80-
byte b4 = cr.getNext();
78+
final byte b1 = cr.getNext();
79+
final byte b2 = cr.getNext();
8180
if (b2 == '.') { // value is n.n
82-
reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT);
83-
// b4 == \n
81+
reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT);
8482
}
8583
else {
84+
final byte b3 = cr.getNext();
85+
final byte b4 = cr.getNext();
8686
if (b4 == '.') { // value is -nn.n
8787
reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
8888
}
@@ -92,11 +92,15 @@ else if (b1 == '-') { // value is -n.n
9292
else { // value is nn.n
9393
reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
9494
}
95-
cr.getNext(); // new line
9695
}
96+
cr.cursor++; // new line
9797

98-
node.min = Math.min(node.min, reading);
99-
node.max = Math.max(node.max, reading);
98+
if (reading < node.min) {
99+
node.min = reading;
100+
}
101+
if (reading > node.max) {
102+
node.max = reading;
103+
}
100104
node.sum += reading;
101105
node.count++;
102106
}
@@ -165,26 +169,40 @@ private void printResultsRec(final TrieNode[] nodes, final byte[] bytes, final i
165169
bytes[index] = (byte) i;
166170
printResultsRec(childNodes, bytes, index + 1);
167171
}
168-
169172
}
170173
}
171174
}
172175

173176
private static final String FILE = "./measurements.txt";
174177

178+
/**
179+
* Read a chunk of a {@link RandomAccessFile} file.
180+
* Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
181+
*/
175182
private static final class ChunkReader {
176-
// Byte arrays of size 2^22 seem to have the best performance on my machine.
177-
private static final int BYTE_ARRAY_SIZE = 1 << 22;
183+
// Byte arrays of size 2^20 seem to have the best performance on my machine.
184+
private static final int BYTE_ARRAY_SIZE = 1 << 20;
178185
private final byte[] bytes;
179186

180187
private final RandomAccessFile file;
188+
189+
// The initial position of this chunk.
181190
private final long chunkBegin;
191+
192+
// The length of this chunk.
182193
private final long chunkLength;
183194

184-
private int readBytes = 0;
195+
// The beginning of the current "sub-chunk", relative to the initial position of the chunk.
196+
private long offset = 0;
197+
198+
// The size of the current "sub-chunk".
199+
private int subChunkSize = 0;
185200

201+
// The current position within the current "sub-chunk".
186202
private int cursor = 0;
187-
private long offset = 0;
203+
204+
// The maximum size of a row
205+
private static final int MAX_ROW_SIZE_BYTES = 107;
188206

189207
ChunkReader(
190208
final RandomAccessFile file,
@@ -197,32 +215,43 @@ private static final class ChunkReader {
197215
int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
198216
this.bytes = new byte[byteArraySize];
199217

200-
readNextBytes();
218+
readSubChunk();
201219
}
202220

203-
boolean hasNext() {
204-
return (offset + cursor) < chunkLength;
221+
// Return true if this ChunkReader has more bytes available, false otherwise.
222+
// If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
223+
boolean ensureHasMoreRows() {
224+
if (cursor >= subChunkSize) {
225+
offset += cursor;
226+
if (offset >= chunkLength) {
227+
return false;
228+
}
229+
readSubChunk();
230+
}
231+
232+
return true;
205233
}
206234

207235
byte getNext() {
208-
if (cursor >= readBytes) {
209-
readNextBytes();
210-
}
211236
return bytes[cursor++];
212237
}
213238

214-
private void readNextBytes() {
239+
private void readSubChunk() {
215240
try {
216-
offset += readBytes;
217241
synchronized (file) {
218242
file.seek(chunkBegin + offset);
219-
readBytes = file.read(bytes);
243+
subChunkSize = file.read(bytes);
220244
}
221-
cursor = 0;
222245
}
223246
catch (IOException e) {
224247
throw new RuntimeException(e);
225248
}
249+
250+
// Always "pretend" that we've read a few bytes less,
251+
// so that we don't stop in the middle of reading a row
252+
subChunkSize -= MAX_ROW_SIZE_BYTES;
253+
254+
cursor = 0;
226255
}
227256
}
228257

0 commit comments

Comments
 (0)