in solr/core/src/java/org/apache/solr/request/SimpleFacets.java [1055:1242]
public NamedList<Integer> getFacetTermEnumCounts(
SolrIndexSearcher searcher,
DocSet docs,
String field,
int offset,
int limit,
int mincount,
boolean missing,
String sort,
String prefix,
Predicate<BytesRef> termFilter,
boolean intersectsCheck)
throws IOException {
/* :TODO: potential optimization...
* cache the Terms with the highest docFreq and try them first
* don't enum if we get our max from them
*/
final NamedList<Integer> res = new NamedList<>();
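// limit == 0 requests no term buckets; finalize() still adds the 'missing' count when asked for.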
if (limit == 0) {
return finalize(res, searcher, docs, field, missing);
}
// Minimum term docFreq in order to use the filterCache for that term.
int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
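// Below the threshold, a term's postings are iterated directly against the base doc set;
// at or above it, the count goes through searcher.numDocs()/intersects() instead.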
// make sure we have a set that is fast for random access, if we will use it for that
Bits fastForRandomSet;
if (minDfFilterCache <= 0) {
fastForRandomSet = null;
} else {
fastForRandomSet = docs.getBits();
}
IndexSchema schema = searcher.getSchema();
FieldType ft = schema.getFieldType(field);
assert !ft.isPointField() : "Point Fields don't support enum method";
boolean sortByCount = sort.equals("count") || sort.equals("true");
final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
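// For count sort, only the best offset+limit entries are kept in the bounded set;
// for index sort, qualifying terms are streamed straight into 'res' inside the loop below.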
int min = mincount - 1; // the smallest value in the top 'N' values
int off = offset;
int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
BytesRef prefixTermBytes = null;
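// Convert the user-supplied prefix to its indexed (internal) form so it compares correctly
// against the indexed term bytes.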
if (prefix != null) {
String indexedPrefix = ft.toInternal(prefix);
prefixTermBytes = new BytesRef(indexedPrefix);
}
Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), field);
TermsEnum termsEnum = null;
SolrIndexSearcher.DocsEnumState deState = null;
BytesRef term = null;
if (terms != null) {
termsEnum = terms.iterator();
// TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
// facet.offset when sorting by index order.
if (prefixTermBytes != null) {
if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
termsEnum = null;
} else {
term = termsEnum.term();
}
} else {
// position termsEnum on first term
term = termsEnum.next();
}
}
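// At this point 'term' is the first candidate term (at or after the prefix), or null if the
// field has no terms or nothing matches the prefix.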
PostingsEnum postingsEnum = null;
CharsRefBuilder charsRef = new CharsRefBuilder();
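// If the base doc set itself is smaller than mincount, no term can reach mincount,
// so skip the enumeration entirely.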
if (docs.size() >= mincount) {
while (term != null) {
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;
if (termFilter == null || termFilter.test(term)) {
int df = termsEnum.docFreq();
// Since 'min' starts at mincount-1, df > min is equivalent to df >= mincount; when sorting
// by count, 'min' also rises as the top-N queue fills, so terms whose docFreq cannot beat
// the current threshold are skipped without counting them. For certain term distributions
// this can make a large difference (for example, many terms with df=1).
if (df > 0 && df > min) {
int c;
if (df >= minDfFilterCache) {
// use the filter cache
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
deState.fieldName = field;
deState.liveDocs = searcher.getLiveDocsBits();
deState.termsEnum = termsEnum;
deState.postingsEnum = postingsEnum;
}
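// numDocs() computes the size of the intersection between the base doc set and the term's
// document set (the filterCache path); intersects() only reports whether it is non-empty.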
if (intersectsCheck) {
c = searcher.intersects(docs, deState) ? 1 : 0;
} else {
c = searcher.numDocs(docs, deState);
}
postingsEnum = deState.postingsEnum;
} else {
// iterate over the term's postings (PostingsEnum) to calculate the intersection
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it
// matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
// impl)
// TODO: would passing deleted docs lead to better efficiency over checking the
// fastForRandomSet?
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
c = 0;
if (postingsEnum instanceof MultiPostingsEnum) {
MultiPostingsEnum.EnumWithSlice[] subs =
((MultiPostingsEnum) postingsEnum).getSubs();
int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
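// Walk each segment's sub-enum; segment-local docids are rebased by the slice start
// before the membership test against fastForRandomSet.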
SEGMENTS_LOOP:
for (int subindex = 0; subindex < numSubs; subindex++) {
MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
if (sub.postingsEnum == null) continue;
int base = sub.slice.start;
int docid;
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.get(docid + base)) {
c++;
if (intersectsCheck) {
assert c == 1;
break SEGMENTS_LOOP;
}
}
}
}
} else {
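// Not a MultiPostingsEnum (e.g. a single-segment reader): docids are already
// top-level, so no rebasing is needed.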
int docid;
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.get(docid)) {
c++;
if (intersectsCheck) {
assert c == 1;
break;
}
}
}
}
}
if (sortByCount) {
if (c > min) {
BytesRef termCopy = BytesRef.deepCopyOf(term);
queue.add(new CountPair<>(termCopy, c));
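// Once the queue holds offset+limit entries, raise 'min' to the smallest queued count
// so weaker terms are pruned earlier by the df > min and c > min checks above.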
if (queue.size() >= maxsize) min = queue.last().val;
}
} else {
if (c >= mincount && --off < 0) {
if (--lim < 0) break;
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
}
term = termsEnum.next();
}
}
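// Count sort: the loop above only collected candidates; emit them now (highest counts
// first), applying offset and limit at this stage.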
if (sortByCount) {
for (CountPair<BytesRef, Integer> p : queue) {
if (--off >= 0) continue;
if (--lim < 0) break;
ft.indexedToReadable(p.key, charsRef);
res.add(charsRef.toString(), p.val);
}
}
return finalize(res, searcher, docs, field, missing);
}
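For orientation, a hedged SolrJ sketch of a request that exercises this enum path. The URL, collection name, and field name are assumptions; facet.method=enum and facet.enumCacheMinDf (read into minDfFilterCache above) are the actual request parameters.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class EnumFacetExample {
  public static void main(String[] args) throws Exception {
    // Assumed base URL and collection; adjust to your deployment.
    try (SolrClient client =
        new Http2SolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
      SolrQuery q = new SolrQuery("*:*");
      q.setFacet(true);
      q.addFacetField("cat"); // assumed indexed string field
      q.set("facet.method", "enum"); // selects the term-enum path shown above
      q.set("facet.enumCacheMinDf", "100"); // becomes minDfFilterCache
      q.set("facet.limit", "10");
      q.set("facet.mincount", "1");
      QueryResponse rsp = client.query(q);
      FacetField cat = rsp.getFacetField("cat");
      for (FacetField.Count count : cat.getValues()) {
        System.out.println(count.getName() + " -> " + count.getCount());
      }
    }
  }
}

Note that Solr may use a different method for field types that cannot use enum (the assert above rejects point fields), so treat this as illustrative rather than a guarantee of which code path runs.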