@@ -58,31 +58,31 @@ private static final class TrieNode {
58
58
59
59
// Process a chunk and write results in a Trie rooted at 'root'.
60
60
private static void processChunk (final TrieNode root , final ChunkReader cr ) {
61
- while (cr .hasNext ()) {
61
+ while (cr .ensureHasMoreRows ()) {
62
62
TrieNode node = root ;
63
63
64
64
// Process the location name navigating through the trie
65
- int b = cr .getNext () & 0xFF ;
66
- while (b != ';' ) {
65
+ int b = cr .getNext ();
66
+ do {
67
+ b &= 0xFF ;
67
68
if (node .children [b ] == null ) {
68
69
node .children [b ] = new TrieNode ();
69
70
}
70
71
node = node .children [b ];
71
- b = cr .getNext () & 0xFF ;
72
- }
72
+ b = cr .getNext ();
73
+ } while ( b != ';' );
73
74
74
75
// Process the reading value (temperature)
75
- int reading ;
76
+ final int reading ;
76
77
77
- byte b1 = cr .getNext ();
78
- byte b2 = cr .getNext ();
79
- byte b3 = cr .getNext ();
80
- byte b4 = cr .getNext ();
78
+ final byte b1 = cr .getNext ();
79
+ final byte b2 = cr .getNext ();
81
80
if (b2 == '.' ) { // value is n.n
82
- reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT );
83
- // b4 == \n
81
+ reading = (b1 * 10 + cr .getNext () - TWO_BYTE_TO_INT );
84
82
}
85
83
else {
84
+ final byte b3 = cr .getNext ();
85
+ final byte b4 = cr .getNext ();
86
86
if (b4 == '.' ) { // value is -nn.n
87
87
reading = -(b2 * 100 + b3 * 10 + cr .getNext () - THREE_BYTE_TO_INT );
88
88
}
@@ -92,11 +92,15 @@ else if (b1 == '-') { // value is -n.n
92
92
else { // value is nn.n
93
93
reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT );
94
94
}
95
- cr .getNext (); // new line
96
95
}
96
+ cr .cursor ++; // new line
97
97
98
- node .min = Math .min (node .min , reading );
99
- node .max = Math .max (node .max , reading );
98
+ if (reading < node .min ) {
99
+ node .min = reading ;
100
+ }
101
+ if (reading > node .max ) {
102
+ node .max = reading ;
103
+ }
100
104
node .sum += reading ;
101
105
node .count ++;
102
106
}
@@ -165,26 +169,40 @@ private void printResultsRec(final TrieNode[] nodes, final byte[] bytes, final i
165
169
bytes [index ] = (byte ) i ;
166
170
printResultsRec (childNodes , bytes , index + 1 );
167
171
}
168
-
169
172
}
170
173
}
171
174
}
172
175
173
176
// Relative path of the measurements file; presumably the input opened elsewhere in this class (usage not visible here -- confirm).
private static final String FILE = "./measurements.txt" ;
174
177
178
+ /**
179
+ * Read a chunk of a {@link RandomAccessFile} file.
180
+ * Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
181
+ */
175
182
private static final class ChunkReader {
176
- // Byte arrays of size 2^22 seem to have the best performance on my machine.
177
- private static final int BYTE_ARRAY_SIZE = 1 << 22 ;
183
+ // Byte arrays of size 2^20 seem to have the best performance on my machine.
184
+ private static final int BYTE_ARRAY_SIZE = 1 << 20 ;
178
185
private final byte [] bytes ;
179
186
180
187
private final RandomAccessFile file ;
188
+
189
+ // The initial position of this chunk.
181
190
private final long chunkBegin ;
191
+
192
+ // The length of this chunk.
182
193
private final long chunkLength ;
183
194
184
- private int readBytes = 0 ;
195
+ // The beginning of the current "sub-chunk", relative to the initial position of the chunk.
196
+ private long offset = 0 ;
197
+
198
+ // The size of the current "sub-chunk".
199
+ private int subChunkSize = 0 ;
185
200
201
+ // The current position within the current "sub-chunk".
186
202
private int cursor = 0 ;
187
- private long offset = 0 ;
203
+
204
+ // The maximum size of a row
205
+ private static final int MAX_ROW_SIZE_BYTES = 107 ;
188
206
189
207
ChunkReader (
190
208
final RandomAccessFile file ,
@@ -197,32 +215,43 @@ private static final class ChunkReader {
197
215
int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int ) chunkLength : BYTE_ARRAY_SIZE ;
198
216
this .bytes = new byte [byteArraySize ];
199
217
200
- readNextBytes ();
218
+ readSubChunk ();
201
219
}
202
220
203
- boolean hasNext () {
204
- return (offset + cursor ) < chunkLength ;
221
+ // Return true if this ChunkReader has more bytes available, false otherwise.
222
+ // If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
223
+ boolean ensureHasMoreRows () {
224
+ if (cursor >= subChunkSize ) {
225
+ offset += cursor ;
226
+ if (offset >= chunkLength ) {
227
+ return false ;
228
+ }
229
+ readSubChunk ();
230
+ }
231
+
232
+ return true ;
205
233
}
206
234
207
235
byte getNext () {
208
- if (cursor >= readBytes ) {
209
- readNextBytes ();
210
- }
211
236
return bytes [cursor ++];
212
237
}
213
238
214
- private void readNextBytes () {
239
+ private void readSubChunk () {
215
240
try {
216
- offset += readBytes ;
217
241
synchronized (file ) {
218
242
file .seek (chunkBegin + offset );
219
- readBytes = file .read (bytes );
243
+ subChunkSize = file .read (bytes );
220
244
}
221
- cursor = 0 ;
222
245
}
223
246
catch (IOException e ) {
224
247
throw new RuntimeException (e );
225
248
}
249
+
250
+ // Always "pretend" that we've read a few bytes less,
251
+ // so that we don't stop in the middle of reading a row
252
+ subChunkSize -= MAX_ROW_SIZE_BYTES ;
253
+
254
+ cursor = 0 ;
226
255
}
227
256
}
228
257
0 commit comments