-
Notifications
You must be signed in to change notification settings - Fork 322
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Benchmark for C++ PDQ index #1699
base: main
Are you sure you want to change the base?
Changes from all commits
df83aae
40795aa
fef0970
4dc7daf
33683c4
3799683
495cc8d
7a8c9e1
9ee1693
8684023
48f6c66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,323 @@ | ||
#ifndef _GNU_SOURCE | ||
#define _GNU_SOURCE | ||
#endif | ||
|
||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <pdq/cpp/index/mih.h> | ||
#include <pdq/cpp/io/hashio.h> | ||
|
||
#include <algorithm> | ||
#include <chrono> | ||
#include <random> | ||
#include <set> | ||
|
||
// ================================================================ | ||
// Static function declarations | ||
static void usage(char* argv0, int rc); | ||
static void query(char* argv0, int argc, char** argv); | ||
|
||
// Function declarations for each query method | ||
static void queryLinear( | ||
const int maxDistance, | ||
const bool verbose, | ||
const unsigned int seed, | ||
const size_t indexSize, | ||
const size_t querySize, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
queries, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
index); | ||
static void queryMIH( | ||
const int maxDistance, | ||
const bool verbose, | ||
const unsigned int seed, | ||
const size_t indexSize, | ||
const size_t querySize, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
queries, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
index); | ||
|
||
// Helper declarations | ||
static facebook::pdq::hashing::Hash256 generateRandomHash(std::mt19937& gen); | ||
static facebook::pdq::hashing::Hash256 addNoise( | ||
const facebook::pdq::hashing::Hash256& original, | ||
int numBitsToFlip, | ||
std::mt19937& gen); | ||
Comment on lines
+43
to
+48
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These seem like generally useful function, and might be worth including in some kind of pdq/utils method. Some of your peers have ended up building the same functions in python. |
||
|
||
// ---------------------------------------------------------------- | ||
int main(int argc, char** argv) { | ||
if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { | ||
usage(argv[0], 0); | ||
} else { | ||
query(argv[0], argc - 1, argv + 1); | ||
} | ||
return 0; | ||
} | ||
|
||
// ---------------------------------------------------------------- | ||
static void usage(char* argv0, int rc) { | ||
FILE* fp = (rc == 0) ? stdout : stderr; | ||
fprintf(fp, "Usage: %s [options]\n", argv0); | ||
fprintf(fp, "Options:\n"); | ||
fprintf(fp, " -v Verbose output\n"); | ||
fprintf(fp, " --seed N Random seed (default: 41)\n"); | ||
16BitNarwhal marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fprintf(fp, " -q N Number of queries to run (default: 1000)\n"); | ||
fprintf( | ||
fp, | ||
" -b N Number of PDQ hashes to query against (default: 10000)\n"); | ||
fprintf( | ||
fp, | ||
" -d N Maximum Hamming distance for matches (default: 31)\n"); | ||
fprintf( | ||
fp, | ||
" -m Method for querying (default: linear), Available: linear, mih\n"); | ||
exit(rc); | ||
} | ||
|
||
static void query(char* argv0, int argc, char** argv) { | ||
int maxDistance = 31; | ||
bool verbose = false; | ||
unsigned int seed = 41; | ||
size_t indexSize = 10000; | ||
size_t querySize = 1000; | ||
std::string method = "linear"; | ||
|
||
// Parse command line arguments | ||
16BitNarwhal marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int i = 0; i < argc; i++) { | ||
std::string arg = argv[i]; | ||
if (arg == "-q") { | ||
if (i + 1 < argc) { | ||
querySize = std::stoi(argv[++i]); | ||
} else { | ||
fprintf(stderr, "Error: Missing argument for -q\n"); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} else if (arg == "-b") { | ||
if (i + 1 < argc) { | ||
indexSize = std::stoi(argv[++i]); | ||
} else { | ||
fprintf(stderr, "Error: Missing argument for -b\n"); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} else if (arg == "-d") { | ||
if (i + 1 < argc) { | ||
maxDistance = std::stoi(argv[++i]); | ||
} else { | ||
fprintf(stderr, "Error: Missing argument for -d\n"); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} else if (arg == "--seed") { | ||
if (i + 1 < argc) { | ||
seed = std::stoi(argv[++i]); | ||
} else { | ||
fprintf(stderr, "Error: Missing argument for --seed\n"); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} else if (arg == "-m") { | ||
if (i + 1 < argc) { | ||
std::string methodName = argv[++i]; | ||
if (methodName == "linear" || methodName == "mih") { | ||
method = methodName; | ||
} else { | ||
fprintf(stderr, "Invalid method: %s\n", methodName.c_str()); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} | ||
} else if (arg == "-v") { | ||
verbose = true; | ||
} else if (arg == "-h" || arg == "--help") { | ||
usage(argv0, 0); | ||
return; | ||
} else if (arg.length() > 0) { | ||
fprintf(stderr, "Unknown argument: %s\n", arg.c_str()); | ||
usage(argv0, 1); | ||
return; | ||
} | ||
} | ||
|
||
// Initialize random number generator | ||
std::mt19937 gen(seed); | ||
|
||
// Generate random hashes for queries | ||
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> queries; | ||
for (size_t i = 0; i < querySize; i++) { | ||
auto hash = generateRandomHash(gen); | ||
queries.push_back({hash, "query_" + std::to_string(i)}); | ||
} | ||
|
||
// Generate random hashes for index | ||
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> index; | ||
for (size_t i = 0; i < indexSize - querySize; i++) { | ||
auto hash = generateRandomHash(gen); | ||
index.push_back({hash, "index_" + std::to_string(i)}); | ||
} | ||
|
||
// Add noise to queries then insert into index | ||
std::uniform_int_distribution<int> noiseDist(1, maxDistance); | ||
for (const auto& query : queries) { | ||
int bitsToFlip = noiseDist(gen); | ||
auto noisyHash = addNoise(query.first, bitsToFlip, gen); | ||
index.push_back({noisyHash, "index_noisy_" + query.second}); | ||
} | ||
std::shuffle(index.begin(), index.end(), gen); | ||
|
||
if (verbose) { | ||
printf("GENERATED QUERIES:\n"); | ||
for (const auto& it : queries) { | ||
printf("%s,%s\n", it.first.format().c_str(), it.second.c_str()); | ||
} | ||
printf("\n"); | ||
|
||
printf("GENERATED INDEX:\n"); | ||
for (const auto& it : index) { | ||
printf("%s,%s\n", it.first.format().c_str(), it.second.c_str()); | ||
} | ||
printf("\n"); | ||
} | ||
|
||
if (method == "linear") { | ||
queryLinear( | ||
maxDistance, verbose, seed, indexSize, querySize, queries, index); | ||
} else if (method == "mih") { | ||
queryMIH(maxDistance, verbose, seed, indexSize, querySize, queries, index); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unsure: Is there an else fail missing here? |
||
} | ||
|
||
/////////////////////// | ||
//// Query methods //// | ||
/////////////////////// | ||
|
||
static void queryLinear( | ||
const int maxDistance, | ||
const bool verbose, | ||
const unsigned int seed, | ||
const size_t indexSize, | ||
const size_t querySize, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
queries, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
index) { | ||
// Do linear searches | ||
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> matches; | ||
|
||
std::chrono::time_point<std::chrono::steady_clock> t1, t2; | ||
std::chrono::duration<double> elapsedSeconds; | ||
|
||
t1 = std::chrono::steady_clock::now(); | ||
for (const auto& it : queries) { | ||
for (const auto& it2 : index) { | ||
if (it.first.hammingDistance(it2.first) <= maxDistance) { | ||
matches.push_back(it2); | ||
} | ||
} | ||
} | ||
t2 = std::chrono::steady_clock::now(); | ||
elapsedSeconds = t2 - t1; | ||
double seconds = elapsedSeconds.count(); | ||
Comment on lines
+209
to
+224
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it better to pull the timing functionality outside of the query method itself, so you have something like:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the mih has an indexing step which takes some time. i'm not sure how to refactor it cleanly so that the index doesn't get included in the query timing. how would you suggest i do this? // Build the MIH
std::chrono::time_point<std::chrono::steady_clock> t1, t2;
std::chrono::duration<double> elapsedSeconds;
double indexTimeSeconds;
facebook::pdq::index::MIH256<std::string> mih;
t1 = std::chrono::steady_clock::now();
for (const auto& it : index) {
mih.insert(it.first, it.second);
}
t2 = std::chrono::steady_clock::now();
elapsedSeconds = t2 - t1;
indexTimeSeconds = elapsedSeconds.count();
printf("\n");
if (verbose) {
printf("\n");
mih.dump();
printf("\n");
}
// Do indexed searches
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> matches;
matches.clear();
t1 = std::chrono::steady_clock::now();
for (const auto& it : queries) {
mih.queryAll(it.first, maxDistance, matches);
}
t2 = std::chrono::steady_clock::now();
elapsedSeconds = t2 - t1;
double seconds = elapsedSeconds.count(); |
||
|
||
printf("METHOD: Linear query\n"); | ||
printf("QUERY COUNT: %d\n", (int)queries.size()); | ||
printf("INDEX COUNT: %d\n", (int)index.size()); | ||
printf("TOTAL MATCH COUNT: %d\n", (int)matches.size()); | ||
printf("TOTAL QUERY SECONDS: %.6lf\n", seconds); | ||
printf( | ||
"SECONDS PER QUERY: %.6lf\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we want to do queries per second rather than seconds per query, given I think we'll be able to get at least 1 for most inputs, and our fastest approaches will have big numbers. |
||
querySize > 0 ? seconds / querySize : 0); | ||
printf("\n"); | ||
Comment on lines
+226
to
+234
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blocking: Yeah, let's not force everyone to perfectly implement this print block, having this in shared code. |
||
} | ||
|
||
static void queryMIH( | ||
const int maxDistance, | ||
const bool verbose, | ||
const unsigned int seed, | ||
const size_t indexSize, | ||
const size_t querySize, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
queries, | ||
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>& | ||
index) { | ||
// Build the MIH | ||
std::chrono::time_point<std::chrono::steady_clock> t1, t2; | ||
std::chrono::duration<double> elapsedSeconds; | ||
double indexTimeSeconds; | ||
|
||
facebook::pdq::index::MIH256<std::string> mih; | ||
|
||
t1 = std::chrono::steady_clock::now(); | ||
for (const auto& it : index) { | ||
mih.insert(it.first, it.second); | ||
} | ||
t2 = std::chrono::steady_clock::now(); | ||
elapsedSeconds = t2 - t1; | ||
indexTimeSeconds = elapsedSeconds.count(); | ||
printf("\n"); | ||
if (verbose) { | ||
printf("\n"); | ||
mih.dump(); | ||
printf("\n"); | ||
} | ||
|
||
// Do indexed searches | ||
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> matches; | ||
matches.clear(); | ||
|
||
t1 = std::chrono::steady_clock::now(); | ||
for (const auto& it : queries) { | ||
mih.queryAll(it.first, maxDistance, matches); | ||
} | ||
t2 = std::chrono::steady_clock::now(); | ||
elapsedSeconds = t2 - t1; | ||
double seconds = elapsedSeconds.count(); | ||
|
||
printf("METHOD: Mutually-indexed hashing query\n"); | ||
printf("INDEX BUILD SECONDS: %.6lf\n", indexTimeSeconds); | ||
printf("QUERY COUNT: %d\n", (int)queries.size()); | ||
printf("INDEX COUNT: %d\n", (int)mih.size()); | ||
printf("TOTAL MATCH COUNT: %d\n", (int)matches.size()); | ||
printf("TOTAL QUERY SECONDS: %.6lf\n", seconds); | ||
printf( | ||
"SECONDS PER QUERY: %.6lf\n", | ||
querySize > 0 ? seconds / querySize : 0); | ||
printf("\n"); | ||
} | ||
|
||
////////////////////////// | ||
//// Helper Functions //// | ||
////////////////////////// | ||
|
||
// Generate random hash | ||
static facebook::pdq::hashing::Hash256 generateRandomHash(std::mt19937& gen) { | ||
facebook::pdq::hashing::Hash256 hash; | ||
std::uniform_int_distribution<uint16_t> dist(0, UINT16_MAX); | ||
|
||
for (int i = 0; i < facebook::pdq::hashing::HASH256_NUM_WORDS; i++) { | ||
hash.w[i] = dist(gen); | ||
} | ||
return hash; | ||
} | ||
|
||
// Add noise to hash by flipping random bits | ||
static facebook::pdq::hashing::Hash256 addNoise( | ||
const facebook::pdq::hashing::Hash256& original, | ||
int numBitsToFlip, | ||
std::mt19937& gen) { | ||
Comment on lines
+292
to
+311
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dumb c++ question: Why declare them on top and then implement down here. Why not just implement on top? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did this so the top gives an overview of all the methods, and I feel that methods like |
||
facebook::pdq::hashing::Hash256 noisy = original; | ||
std::uniform_int_distribution<int> wordDist( | ||
0, facebook::pdq::hashing::HASH256_NUM_WORDS - 1); | ||
std::uniform_int_distribution<int> bitDist(0, 15); // Each word is 16 bits | ||
for (int i = 0; i < numBitsToFlip; i++) { | ||
int wordIndex = wordDist(gen); | ||
int bitIndex = bitDist(gen); | ||
// Flip bit with xor | ||
noisy.w[wordIndex] ^= (1 << bitIndex); | ||
} | ||
Comment on lines
+316
to
+321
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blocking: Can you end up flipping the same bit twice and thus ending with less distance than you expect? Why not do uniform random 256 and modulus instead of doing this in two passes? A possible solution to blocker:
Alternatively, create a sequence from 0-256 and then use https://en.cppreference.com/w/cpp/algorithm/ranges/sample, then flip those bits |
||
return noisy; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I originally thought you were including this for
getopt
. I'm a c++ dummy, what functions/functionalities are you including from GNU?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whoops! i'm actually not using it, will remove it