bench/wdtStats.cpp (56 lines of code) (raw):
/**
* Copyright (c) 2014-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* Extracts bigrams from input. Example use:
* cat * | wdt_gen_stats -wrap -binary > bigrams
*
*/
#include <algorithm>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <wdt/WdtConfig.h>
#include "wdt/bench/Bigram.h"
// Command-line flags (gflags).
// When set, the stream output operator below writes each entry as the
// serialized bigram followed by the raw bytes of its count, instead of a
// human readable text line.
DEFINE_bool(binary, false, "Binary format output");
// When set, main() additionally counts the bigram formed by the last input
// byte followed by the first input byte.
DEFINE_bool(wrap, false, "Consider last byte to wrap back to first");
/**
 * Writes all the bigram/count entries of the map to the stream, ordered by
 * descending count; entries with equal counts are ordered by ascending
 * bigram.
 *
 * With -binary each entry is the bigram's binarySerialize() output followed
 * by the raw bytes of the 32 bit count (host byte order). Otherwise each
 * entry is a text line of the form "\t{<bigram>, <count>},".
 *
 * @param os  stream to write to
 * @param t   map of bigram to number of occurrences
 * @return    os, to allow chaining
 */
std::ostream &operator<<(std::ostream &os, const MapOfBigramToCount &t) {
  // Transfer into vector to sort
  std::vector<PairBigramCount> dv(std::begin(t), std::end(t));
  std::sort(dv.begin(), dv.end(),
            [](const PairBigramCount &x, const PairBigramCount &y) {
              if (x.second != y.second) {
                // primary sort: descending by count
                return x.second > y.second;
              }
              // secondary criteria: ascending
              return x.first < y.first;
            });
  for (const auto &entry : dv) {
    Bigram key = entry.first;
    uint32_t count = entry.second;
    if (FLAGS_binary) {
      key.binarySerialize(os);
      // Note that this generates a format not portable across endianness
      os.write(reinterpret_cast<const char *>(&count), sizeof(count));
    } else {
      // '\n' rather than std::endl: same bytes, but no flush of the stream
      // after every single entry.
      os << "\t{" << key << ", " << count << "},\n";
    }
  }
  return os;
}
using std::string;
/**
 * Reads standard input byte by byte, counts every pair of consecutive bytes
 * (bigram) and writes the resulting table to standard output using the
 * operator<< defined above.
 *
 * With -wrap, the pair (last byte, first byte) is counted too. Empty input
 * produces an empty table, with or without -wrap.
 *
 * @return 0
 */
int main(int argc, char **argv) {
  FLAGS_logtostderr = true;
  // gflags api is nicely inconsistent here (const char ** for SetArgv,
  // char *** for ParseCommandLineFlags)
  GFLAGS_NAMESPACE::SetArgv(argc, const_cast<const char **>(argv));
  GFLAGS_NAMESPACE::SetVersionString(WDT_VERSION_STR);
  string usage("Extract statistical model from input. v");
  usage.append(GFLAGS_NAMESPACE::VersionString());
  usage.append(". Sample usage:\n\t");
  usage.append(GFLAGS_NAMESPACE::ProgramInvocationShortName());
  usage.append(" [flags] < input > bigrams");
  GFLAGS_NAMESPACE::SetUsageMessage(usage);
  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);

  MapOfBigramToCount map;
  // The first byte is kept aside so it can close the loop when wrapping.
  // It is negative (EOF) when the input is empty.
  const int first = getchar_unlocked();
  int prev = first;
  int c;
  while ((c = getchar_unlocked()) >= 0) {
    Bigram b(prev, c);
    ++map[b];
    prev = c;
  }
  // Only wrap when at least one byte was read: on empty input both `prev`
  // and `first` are EOF and would otherwise be counted as a bogus bigram.
  if (FLAGS_wrap && first >= 0) {
    Bigram wrap(prev, first);
    ++map[wrap];
  }
  std::cout << map;
  return 0;
}