diff --git a/centipede.cc b/centipede.cc index 02dce9a..de4ac62 100644 --- a/centipede.cc +++ b/centipede.cc @@ -206,6 +206,7 @@ void Centipede::Log(std::string_view log_type, size_t min_log_level) { << " df: " << fs_.CountFeatures(FeatureDomains::kDataFlow) << " cmp: " << fs_.CountFeatures(FeatureDomains::kCMP) << " path: " << fs_.CountFeatures(FeatureDomains::kBoundedPath) + << " pair: " << fs_.CountFeatures(FeatureDomains::kPCPair) << " corp: " << corpus_.NumActive() << "/" << corpus_.NumTotal() << " max/avg " << max << " " << avg << " exec/s: " << exec_speed << " mb: " << (MemoryUsage() >> 20); @@ -238,6 +239,47 @@ bool Centipede::ExecuteAndReportCrash(std::string_view binary, return success; } +// *** Highly experimental and risky. May not scale well for large targets. *** +// +// The idea: an unordered pair of two features {a, b} is by itself a feature. +// In the worst case, the number of such synthetic features is a square of +// the number of regular features, which may not scale. +// For now, we only treat pairs of PCs as features, which is still quadratic +// by the number of PCs. But in moderate-sized programs this may be tolerable. +// +// Rationale: if two different parts of the target are exercised simultaneously, +// this may create interesting behaviour that is hard to capture with regular +// control flow (or other) features. +size_t Centipede::AddPcPairFeatures(FeatureVec &fv) { + // Using a scratch vector to avoid allocations. + auto &pcs = add_pc_pair_scratch_; + pcs.clear(); + + size_t num_pcs = pc_table_.size(); + size_t num_added_pairs = 0; + + // Collect PCs from fv. + for (auto feature : fv) { + if (FeatureDomains::k8bitCounters.Contains(feature)) + pcs.push_back(Convert8bitCounterFeatureToPcIndex(feature)); + } + + // The quadratic loop: iterate all PC pairs (!!). + for (size_t i = 0, n = pcs.size(); i < n; ++i) { + size_t pc1 = pcs[i]; + for (size_t j = i + 1; j < n; ++j) { + size_t pc2 = pcs[j]; + feature_t f = FeatureDomains::kPCPair.ConvertToMe( + ConvertPcPairToNumber(pc1, pc2, num_pcs)); + // If we have seen this pair at least once, ignore it. + if (fs_.Frequency(f)) continue; + fv.push_back(f); + ++num_added_pairs; + } + } + return num_added_pairs; +} + bool Centipede::RunBatch(const std::vector &input_vec, BatchResult &batch_result, BlobFileAppender *corpus_file, @@ -265,6 +307,8 @@ bool Centipede::RunBatch(const std::vector &input_vec, bool function_filter_passed = function_filter_.filter(fv); bool input_gained_new_coverage = fs_.CountUnseenAndPruneFrequentFeatures(fv); + if (env_.use_pcpair_features && AddPcPairFeatures(fv)) + input_gained_new_coverage = true; if (unconditional_features_file) { CHECK_OK(unconditional_features_file->Append( PackFeaturesAndHash(input_vec[i], fv))); diff --git a/centipede.h b/centipede.h index f3d64be..7a7f9aa 100644 --- a/centipede.h +++ b/centipede.h @@ -121,6 +121,11 @@ class Centipede { void MergeFromOtherCorpus(std::string_view merge_from_dir, size_t shard_index_to_merge); + // Collects all PCs from `fv`, then adds PC-pair features to `fv`. + // Returns the number of added features. + // See more comments in centipede.cc. + size_t AddPcPairFeatures(FeatureVec &fv); + FeatureSet fs_; Timer timer_; // counts time for coverage collection rate computation Corpus corpus_; @@ -145,6 +150,9 @@ class Centipede { // Counts the number of crashes reported so far. int num_crash_reports_ = 0; + // Scratch object for AddPcPairFeatures. + std::vector add_pc_pair_scratch_; + // Path and command for the input_filter. std::string input_filter_path_; Command input_filter_cmd_; diff --git a/environment.cc b/environment.cc index eb3948f..f869c26 100644 --- a/environment.cc +++ b/environment.cc @@ -136,6 +136,9 @@ ABSL_FLAG(bool, use_counter_features, false, "When available from instrumentation, use features derived from " "counting the number of occurrences of a given PC. " "When enabled, supersedes --use_pc_features."); +ABSL_FLAG(bool, use_pcpair_features, false, + "If true, PC pairs are used as additional synthetic features. " + "Experimental, use with care - it may explode the corpus."); ABSL_FLAG(bool, generate_corpus_stats, false, "If true, a file workdir/corpus-stats-BINARY.json containing" "corpus stats will be generated periodically"); @@ -226,6 +229,7 @@ Environment::Environment(int argc, char** argv) use_cmp_features(absl::GetFlag(FLAGS_use_cmp_features)), use_dataflow_features(absl::GetFlag(FLAGS_use_dataflow_features)), use_counter_features(absl::GetFlag(FLAGS_use_counter_features)), + use_pcpair_features(absl::GetFlag(FLAGS_use_pcpair_features)), generate_corpus_stats(absl::GetFlag(FLAGS_generate_corpus_stats)), distill_shards(absl::GetFlag(FLAGS_distill_shards)), fork_server_helper_path(absl::GetFlag(FLAGS_fork_server_helper_path)), diff --git a/environment.h b/environment.h index 3f6ec66..0bf84f8 100644 --- a/environment.h +++ b/environment.h @@ -55,6 +55,7 @@ struct Environment { bool use_cmp_features; bool use_dataflow_features; bool use_counter_features; + size_t use_pcpair_features; bool generate_corpus_stats; size_t distill_shards; std::string fork_server_helper_path; diff --git a/feature.h b/feature.h index 8fec731..88fd239 100644 --- a/feature.h +++ b/feature.h @@ -96,6 +96,9 @@ constexpr Domain kCMP = {kDataFlow.end(), 1UL << 40, "cmp"}; // their number to 2^32. constexpr Domain kBoundedPath = {kCMP.end(), 1UL << 32, "path"}; +// Features derived from (unordered) pairs of PCs. +constexpr Domain kPCPair = {kBoundedPath.end(), 1UL << 40, "pc-pair"}; + } // namespace FeatureDomains // Converts a 8-bit coverage counter, i.e. a pair of