forked from google/centipede
-
Notifications
You must be signed in to change notification settings - Fork 5
/
feature.h
348 lines (306 loc) · 13.8 KB
/
feature.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
// Copyright 2022 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This library defines the concepts "fuzzing feature" and "feature domain".
// It is used by Centipede, and it can be used by fuzz runners to
// define their features in a way most friendly to Centipede.
// Fuzz runners do not have to use this file nor to obey the rules defined here.
// But using this file and following its rules is the simplest way if you want
// Centipede to understand the details about the features generated by the
// runner.
//
// This library must not depend on anything other than libc so that fuzz targets
// using it doesn't gain redundant coverage. For the same reason this library
// uses raw __builtin_trap instead of CHECKs.
// We make an exception for <algorithm> for std::sort/std::unique,
// since <algorithm> is very lightweight.
// This library is also header-only, with all functions defined as inline.
#ifndef THIRD_PARTY_CENTIPEDE_FEATURE_H_
#define THIRD_PARTY_CENTIPEDE_FEATURE_H_
#include <stddef.h>
#include <string.h>
// WARNING!!!: Be very careful with what STL headers or other dependencies you
// add here. This header needs to remain mostly bare-bones so that we can
// include it into runner.
// <vector> is an exception, because it's too clumsy w/o it, and it introduces
// minimal code footprint.
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include "./concurrent_bitset.h"
#include "./foreach_nonzero.h"
namespace centipede {
// Feature is an integer that identifies some unique behaviour
// of the fuzz target exercised by a given input.
// We say, this input has this feature with regard to this fuzz target.
// One example of a feature: a certain control flow edge being executed.
using feature_t = uint64_t;
// A vector of features. It is not expected to be ordered.
// It typically does not contain repetitions, but it's ok to have them.
using FeatureVec = std::vector<feature_t>;
// Computes a hash of `bits`. The purpose is to use the result for XOR-ing with
// some other values, such that all resulting bits look random.
inline uint64_t Hash64Bits(uint64_t bits) {
// This particular prime number seems to mix bits well.
// TODO(kcc): find a more scientific way to mix bits, e.g. switch to Murmur.
constexpr uint64_t kPrime = 13441014529ULL;
return bits * kPrime;
}
// Returns `bits` rotated left by `n`.
inline uint64_t RotateLeft(uint64_t bits, uint64_t n) {
return (bits << n) | (bits >> (64 - n));
}
namespace feature_domains {
// Feature domain is a subset of 64-bit integers dedicated to a certain
// kind of fuzzing features.
// All domains are of the same size (kDomainSize), This way, we can compute
// a domain for a given feature by dividing by kDomainSize.
class Domain {
public:
// kDomainSize is the largest power of two such that all domains fit.
static constexpr size_t kMaxNumDomainsLog = 6;
static constexpr size_t kMaxNumDomains = 1 << kMaxNumDomainsLog;
static constexpr size_t kDomainSize = 1ULL << (64 - kMaxNumDomainsLog);
static constexpr size_t kLastDomainId = kMaxNumDomains - 1;
constexpr Domain(size_t domain_id) : domain_id_(domain_id) {}
constexpr feature_t begin() const { return kDomainSize * domain_id_; }
constexpr feature_t end() const { return begin() + kDomainSize; }
bool Contains(feature_t feature) const {
return feature >= begin() && feature < end();
}
constexpr size_t domain_id() const { return domain_id_; }
// Converts any `number` into a feature in this domain.
feature_t ConvertToMe(size_t number) const {
return begin() + number % kDomainSize;
}
// Returns the DomainId of the domain that the feature belongs to.
static size_t FeatureToDomainId(feature_t feature) {
return feature / kDomainSize;
}
private:
const size_t domain_id_;
};
// NEXT_DOMAIN_ID() returns a constexpr value for the unique domain ID.
// TODO(kcc): Ideally, it would return consecutive values, but it's unclear how
// to do this with the constexpr magic. Suggestions are welcome, but not
// critical. For now we just use __LINE__ - kFirstDomainLine as the unique ID.
constexpr size_t kFirstDomainLine = __LINE__;
#define NEXT_DOMAIN_ID() (__LINE__ - kFirstDomainLine)
// Catch-all domain for unknown features.
constexpr Domain kUnknown = {NEXT_DOMAIN_ID()};
// Represents PCs, i.e. control flow edges.
// Use ConvertPCFeatureToPcIndex() to convert back to a PC index.
constexpr Domain kPCs = {NEXT_DOMAIN_ID()};
// Features derived from edge counters. See Convert8bitCounterToNumber().
constexpr Domain k8bitCounters = {NEXT_DOMAIN_ID()};
// Features derived from data flow edges.
// A typical data flow edge is a pair of PCs: {store-PC, load-PC}.
// Another variant of a data flow edge is a pair of {global-address, load-PC}.
constexpr Domain kDataFlow = {NEXT_DOMAIN_ID()};
// Features derived from instrumenting CMP instructions.
constexpr Domain kCMP = {NEXT_DOMAIN_ID()};
// Features derived from computing (bounded) control flow paths.
constexpr Domain kBoundedPath = {NEXT_DOMAIN_ID()};
// Features derived from (unordered) pairs of PCs.
constexpr Domain kPCPair = {NEXT_DOMAIN_ID()};
static_assert(NEXT_DOMAIN_ID() < Domain::kLastDomainId);
constexpr Domain kLastDomainId = {Domain::kLastDomainId}; // must be last.
// Special feature used to indicate an absence of features. Typically used where
// a feature array must not be empty, but doesn't have any other features.
constexpr feature_t kNoFeature = kUnknown.begin();
} // namespace feature_domains
// Converts an 8-bit coverage counter, i.e. a pair of {`pc_index`,
// `counter_value` must not be zero.
//
// We convert the 8-bit counter value to a number from 0 to 7
// by computing its binary log, i.e. 1=>0, 2=>1, 4=>2, 8=>3, ..., 128=>7.
// This is a heuristic, similar to that of AFL or libFuzzer
// that tries to encourage inputs with different number of repetitions
// of the same PC.
inline size_t Convert8bitCounterToNumber(size_t pc_index,
uint8_t counter_value) {
if (counter_value == 0) __builtin_trap(); // Wrong input.
// Compute a log2 of counter_value, i.e. a value between 0 and 7.
// __builtin_clz consumes a 32-bit integer.
uint32_t counter_log2 =
sizeof(uint32_t) * 8 - 1 - __builtin_clz(counter_value);
return pc_index * 8 + counter_log2;
}
// Given the `feature` from the PC domain, returns the feature's
// pc_index. I.e. reverse of kPC.ConvertToMe(), assuming all PCs originally
// converted to features were less than Domain::kDomainSize.
inline size_t ConvertPCFeatureToPcIndex(feature_t feature) {
auto domain = feature_domains::kPCs;
if (!domain.Contains(feature)) __builtin_trap();
return feature - domain.begin();
}
// Encodes {`pc1`, `pc2`} into a number.
// `pc1` and `pc2` are in range [0, `max_pc`)
inline size_t ConvertPcPairToNumber(uintptr_t pc1, uintptr_t pc2,
uintptr_t max_pc) {
return pc1 * max_pc + pc2;
}
// Encodes {context, a, b} into a number.
// `a` and `b` are arguments of an instruction "a CMP b".
// `context` identifies the CMP call site.
// Usually, `context` is a random-looking number derived from a hash function.
//
// This function has several mutually conflicting requirements:
// * it must be very fast, as it is executed on every CMP instruction.
// * it must allow to distinguish {a,b} pairs in some non-trivial way.
// * it must not produce too many different values
// (where "too many" is hard to define)
inline size_t ConvertContextAndArgPairToNumber(uintptr_t a, uintptr_t b,
uintptr_t context) {
// Below is a giant unscientific heuristic.
// Expect quite a bit of tuning effort here.
//
// The idea is to treat different {context,a,b} tuples as different features,
// so that a sufficiently new argument pair for a given context is recognized
// as interesting.
// Obviously, we can't generate 2^128 different features per context
// and so we need to bucketize them.
//
// The following relationships between a and b seem worthy of
// differentiation via different feature values:
// * a==b
// * [diff] a==b+K or a==b-K (for some small K, e.g. 1 to 31).
// * [hamming] Different values of hamming distance.
// * [msb_eq] The number of most significant bits being equal.
// * [diff_log2] Different values of Log2(a-b)
// We compute a superposition of these properties.
//
// Similar ideas:
// * https://lafintel.wordpress.com/
// * https://llvm.org/docs/LibFuzzer.html#value-profile
// ab is the component of the feature based on {a,b}.
uintptr_t ab = 0; // Value for the case of a == b;
if (a != b) {
uintptr_t diff = a - b;
// diff_component is in [0,64)
uintptr_t diff_component = diff < 32 ? diff : -diff < 32 ? 32 + -diff : 0;
// hamming_component is in [0, 64)
uintptr_t hamming_component = __builtin_popcountll(a ^ b) - 1;
// diff_log2_component is in [0, 64)
uintptr_t diff_log2_component = __builtin_clzll(diff);
// msb_eq_component is in [0, 64)
uintptr_t msb_eq_component = __builtin_clzll(a ^ b);
ab = (diff_component << 0) | (hamming_component << 6) |
(diff_log2_component << 12) | (msb_eq_component << 18);
// This gives us whooping 2^24 different features for just one context.
// In theory this is pretty bad: it will bloat the corpus beyond reasonable.
// Whether it's bad in practice remains to be seen.
// We may want to reduce the number of different features with e.g. this:
// ab %= 7919; // mod large prime
}
// Combine {a,b} and context.
return ab ^ context;
}
// Fixed-size ring buffer that maintains a hash of its elements.
// Create objects of this type as zero-initialized globals or thread-locals.
// In a zero-initialized object all values and the hash are zero.
// `kSize` indicates the maximum possible size for the ring-buffer.
// The actual size is controlled by the `ring_buffer_size` argument of push().
template <size_t kSize>
class HashedRingBuffer {
public:
// Adds `new_item` and returns the new hash of the entire collection.
// Evicts an old item.
// `ring_buffer_size` must be <= kSize and must be the same for all push()
// calls for a given object.
// We don't enforce these constraints here to avoid overhead.
// The hash function used:
// https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial
size_t push(size_t new_item, size_t ring_buffer_size) {
size_t new_pos = last_added_pos_ + 1;
if (new_pos >= ring_buffer_size) new_pos = 0;
size_t evicted_item = buffer_[new_pos];
new_item = Hash64Bits(new_item);
buffer_[new_pos] = new_item;
hash_ = RotateLeft(hash_, 1) ^ RotateLeft(evicted_item, ring_buffer_size) ^
new_item;
last_added_pos_ = new_pos;
return hash_;
}
// returns the current hash.
size_t hash() const { return hash_; }
// Zero-initialize the object.
void clear() { memset(this, 0, sizeof(*this)); }
private:
size_t buffer_[kSize]; // All elements.
size_t last_added_pos_; // Position of the last added element.
size_t hash_; // XOR of all elements in buffer_.
};
// A simple fixed-size byte array.
// Each element is an 8-bit counter that can be incremented concurrently.
// The counters are allowed to overflow (i.e. are not saturating).
// Thread-compatible.
// The main use case is to increment an execution counter of a given PC.
template <size_t kSize>
class CounterArray {
public:
// Constructs an empty counter array.
CounterArray() = default;
// Clears all counters.
void Clear() { memset(data_, 0, sizeof(data_)); }
// Increments the counter that corresponds to idx.
// Idx is taken modulo kSize.
void Increment(size_t idx) {
// An atomic increment is quite expensive, even if relaxed.
// We do a non-atomic increment instead, with the full understanding
// that if the increment of the same `idx` happens concurrently the result
// will be unreliable. This is a trade-off between correctness and speed.
// This behavior is similar to that of Clang Coverage and inline counters:
// https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
// https://clang.llvm.org/docs/SanitizerCoverage.html#inline-8bit-counters
// There is also a risk of contention if the same hot code is executed
// concurrently. Both atomic and non-atomic increment will be bad.
uint8_t *ptr = &data_[idx % kSize];
uint8_t value{};
__atomic_load(ptr, &value, __ATOMIC_RELAXED);
++value;
__atomic_store(ptr, &value, __ATOMIC_RELAXED);
}
// Accessors.
const uint8_t *data() const { return &data_[0]; }
size_t size() const { return kSize; }
private:
uint8_t data_[kSize] = {};
};
// A simple fixed-capacity array with push_back.
// Thread-compatible.
template <size_t kSize>
class FeatureArray {
public:
// Constructs an empty feature array.
FeatureArray() = default;
// pushes `feature` back if there is enough space.
void push_back(feature_t feature) {
if (num_features_ < kSize) {
features_[num_features_++] = feature;
}
}
// Makes the array empty.
void clear() { num_features_ = 0; }
// Returns the array's raw data.
feature_t *data() { return &features_[0]; }
// Returns the number of elements in the array.
size_t size() const { return num_features_; }
private:
// NOTE: No initializer needed: object state is captured by `num_features_`.
feature_t features_[kSize];
size_t num_features_ = 0;
};
} // namespace centipede
#endif // THIRD_PARTY_CENTIPEDE_FEATURE_H_