in src/main/java/org/apache/accumulo/testing/randomwalk/shard/Grep.java [44:101]
public void visit(State state, RandWalkEnv env, Properties props) throws Exception {
  // Consistency check between the index table and the document table: draw a few random
  // words, resolve them through the index, grep the documents for the same words, and
  // require that both approaches report exactly the same set of documents.
  final String indexTableName = state.getString("indexTableName");
  final String dataTableName = state.getString("docTableName");

  // Between 2 and 5 search terms, all drawn from the Random held by the walk state.
  final Random rand = state.getRandom();
  final int termCount = rand.nextInt(4) + 2;
  final Text[] words = new Text[termCount];
  for (int w = 0; w < termCount; w++) {
    words[w] = new Text(Insert.generateRandomWord(rand));
  }

  final HashSet<Text> fromIndex = findDocumentsViaIndex(env, indexTableName, words);
  final HashSet<Text> fromGrep = findDocumentsViaGrep(env, dataTableName, words);

  if (fromIndex.equals(fromGrep)) {
    log.debug("Grep and index agree " + Arrays.toString(words) + " " + fromIndex.size());
    return;
  }

  // Any difference between the two result sets fails this step; both sets go in the message.
  throw new Exception("Set of documents found not equal for words " + Arrays.toString(words)
      + " " + fromIndex + " " + fromGrep);
}

/**
 * Scans the whole index table with an IntersectingIterator configured with the given words as
 * column families, and gathers the column qualifier of every entry that comes back.
 */
private static HashSet<Text> findDocumentsViaIndex(RandWalkEnv env, String indexTableName,
    Text[] words) throws Exception {
  final HashSet<Text> documents = new HashSet<>();
  try (BatchScanner scanner = env.getAccumuloClient().createBatchScanner(indexTableName,
      Authorizations.EMPTY, 16)) {
    final IteratorSetting intersect =
        new IteratorSetting(20, "ii", IntersectingIterator.class.getName());
    IntersectingIterator.setColumnFamilies(intersect, words);
    scanner.addScanIterator(intersect);

    // A default-constructed Range is used so the scan is not restricted to particular rows.
    final Range everything = new Range();
    scanner.setRanges(Collections.singleton(everything));

    for (Entry<Key,Value> hit : scanner) {
      documents.add(hit.getKey().getColumnQualifier());
    }
  }
  return documents;
}

/**
 * Scans the whole document table with one RegExFilter per word stacked on the scanner, and
 * gathers the row of every entry that passes all of the filters.
 */
private static HashSet<Text> findDocumentsViaGrep(RandWalkEnv env, String dataTableName,
    Text[] words) throws Exception {
  final HashSet<Text> documents = new HashSet<>();
  try (BatchScanner scanner = env.getAccumuloClient().createBatchScanner(dataTableName,
      Authorizations.EMPTY, 16)) {
    int position = 0;
    for (Text word : words) {
      // The word must stand alone: at the start or after whitespace, and at the end or
      // followed by whitespace.
      final String wholeWord = "(^|(.*\\s))" + word + "($|(\\s.*))";

      // Every filter gets its own priority (20, 21, ...) and name ("ii0", "ii1", ...).
      final IteratorSetting filter =
          new IteratorSetting(20 + position, "ii" + position, RegExFilter.class);
      RegExFilter.setRegexs(filter, null, null, null, wholeWord, false);
      scanner.addScanIterator(filter);
      position++;
    }

    final Range everything = new Range();
    scanner.setRanges(Collections.singleton(everything));

    for (Entry<Key,Value> hit : scanner) {
      documents.add(hit.getKey().getRow());
    }
  }
  return documents;
}