in lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java [453:676]
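/**
 * Selects zero or more merges from the eligible segments, which are expected to be sorted
 * by size in bytes, descending. Loops until the index is back within budget (for NATURAL
 * merges) or no scoring candidate remains; returns null when no merge was selected.
 */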
private MergeSpecification doFindMerges(
List<SegmentSizeAndDocs> sortedEligibleInfos,
final long maxMergedSegmentBytes,
final int mergeFactor,
final int allowedSegCount,
final int allowedDelCount,
final int allowedDocCount,
final MERGE_TYPE mergeType,
MergeContext mergeContext,
boolean maxMergeIsRunning)
throws IOException {
List<SegmentSizeAndDocs> sortedEligible = new ArrayList<>(sortedEligibleInfos);
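// Index sizes by SegmentCommitInfo so candidate scoring can look them up in O(1):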
Map<SegmentCommitInfo, SegmentSizeAndDocs> segInfosSizes = new HashMap<>();
for (SegmentSizeAndDocs segSizeDocs : sortedEligible) {
segInfosSizes.put(segSizeDocs.segInfo, segSizeDocs);
}
int originalSortedSize = sortedEligible.size();
if (verbose(mergeContext)) {
message("findMerges: " + originalSortedSize + " segments", mergeContext);
}
if (originalSortedSize == 0) {
return null;
}
final Set<SegmentCommitInfo> toBeMerged = new HashSet<>();
MergeSpecification spec = null;
// Cycle to possibly select more than one merge:
// The trigger point for total deleted documents in the index leads to a bunch of large
// segment merges at the same time. So only put one large merge in the list of merges per
// cycle; we'll pick up another merge next time around.
boolean haveOneLargeMerge = false;
while (true) {
// Remove ineligible segments: these are either already being merged or were already
// picked (by a prior iteration of this loop) for merging.
sortedEligible.removeIf(segSizeDocs -> toBeMerged.contains(segSizeDocs.segInfo));
if (verbose(mergeContext)) {
message(
" allowedSegmentCount="
+ allowedSegCount
+ " vs count="
+ originalSortedSize
+ " (eligible count="
+ sortedEligible.size()
+ ")",
mergeContext);
}
if (sortedEligible.isEmpty()) {
return spec;
}
final int remainingDelCount = sortedEligible.stream().mapToInt(c -> c.delCount).sum();
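// Natural merging stops once both the segment count and the remaining deleted docs are
// within budget; forced merge types keep cycling until no candidate is selected below.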
if (mergeType == MERGE_TYPE.NATURAL
&& sortedEligible.size() <= allowedSegCount
&& remainingDelCount <= allowedDelCount) {
return spec;
}
// OK we are over budget -- find best merge!
MergeScore bestScore = null;
List<SegmentCommitInfo> best = null;
boolean bestTooLarge = false;
long bestMergeBytes = 0;
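// Try each eligible segment as the head of a candidate merge. Because the list is sorted
// by size (descending), each start index packs progressively smaller segments behind it.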
for (int startIdx = 0; startIdx < sortedEligible.size(); startIdx++) {
final List<SegmentCommitInfo> candidate = new ArrayList<>();
boolean hitTooLarge = false;
long bytesThisMerge = 0;
long docCountThisMerge = 0;
for (int idx = startIdx;
idx < sortedEligible.size()
// We allow merging more than mergeFactor segments together if the merged segment
// would be less than the floor segment size. This is important because segments
// below the floor segment size are more aggressively merged by this policy, so we
// need to grow them as quickly as possible.
&& (candidate.size() < mergeFactor || bytesThisMerge < floorSegmentBytes)
&& bytesThisMerge < maxMergedSegmentBytes
&& (bytesThisMerge < floorSegmentBytes || docCountThisMerge <= allowedDocCount);
idx++) {
final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx);
final long segBytes = segSizeDocs.sizeInBytes;
int segDocCount = segSizeDocs.maxDoc - segSizeDocs.delCount;
if (bytesThisMerge + segBytes > maxMergedSegmentBytes
|| (bytesThisMerge > floorSegmentBytes
&& docCountThisMerge + segDocCount > allowedDocCount)) {
// Only set hitTooLarge when reaching the maximum byte size, as this will create
// segments of the maximum size which will no longer be eligible for merging for a long
// time (until they accumulate enough deletes).
hitTooLarge |= bytesThisMerge + segBytes > maxMergedSegmentBytes;
// We should never have something coming in that _cannot_ be merged, so fall through and
// allow a singleton candidate: only skip this segment when we already have candidates.
if (candidate.size() > 0) {
// NOTE: we continue, so that we can try
// "packing" smaller segments into this merge
// to see if we can get closer to the max
// size; this in general is not perfect since
// this is really "bin packing" and we'd have
// to try different permutations.
continue;
}
}
candidate.add(segSizeDocs.segInfo);
bytesThisMerge += segBytes;
docCountThisMerge += segDocCount;
}
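// Illustrative walk-through (assumed numbers, not from any real index): with
// mergeFactor=10 and floorSegmentBytes=16MB, ten 1MB segments sum to 10MB, still below
// the floor, so the candidate keeps growing past mergeFactor segments. With
// maxMergedSegmentBytes=5GB and ~1GB segments, a segment that would push the total past
// 5GB is skipped and smaller segments are tried, to pack closer to the cap.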
// We should never see an empty candidate: we iterated over up to mergeFactor
// segments, and already pre-excluded the too-large segments:
assert candidate.size() > 0;
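// candidate.get(0) is the biggest segment of this candidate, since the eligible list is
// sorted by size, descending.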
SegmentSizeAndDocs maxCandidateSegmentSize = segInfosSizes.get(candidate.get(0));
if (hitTooLarge == false
&& mergeType == MERGE_TYPE.NATURAL
&& bytesThisMerge < maxCandidateSegmentSize.sizeInBytes * 1.5
&& maxCandidateSegmentSize.delCount
< maxCandidateSegmentSize.maxDoc * deletesPctAllowed / 100) {
// Ignore any merge where the resulting segment is not at least 50% larger than the
// biggest input segment. Otherwise we could run into pathological O(N^2) merging where
// merges keep rewriting the biggest input segment again and again into a segment that
// is barely bigger.
// The only exception we make is when the merge would reclaim lots of deletes in the
// biggest segment. This matters, for instance, when lots of documents get deleted at
// once without new segments of a similar size being introduced.
continue;
}
// A singleton merge with no deletes makes no sense. We can get here when forceMerge is
// looping around...
if (candidate.size() == 1 && maxCandidateSegmentSize.delCount == 0) {
continue;
}
// If we didn't find a too-large merge and have a list of candidates
// whose length is less than the merge factor, it means we are reaching
// the tail of the list of segments and will only find smaller merges.
// Stop here.
if (bestScore != null && hitTooLarge == false && candidate.size() < mergeFactor) {
break;
}
final MergeScore score = score(candidate, hitTooLarge, segInfosSizes);
if (verbose(mergeContext)) {
message(
" maybe="
+ segString(mergeContext, candidate)
+ " score="
+ score.getScore()
+ " "
+ score.getExplanation()
+ " tooLarge="
+ hitTooLarge
+ " size="
+ String.format(Locale.ROOT, "%.3f MB", bytesThisMerge / 1024. / 1024.),
mergeContext);
}
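// Lower scores are better. Also skip candidates that hit the max merged size while a
// max-sized merge is already running, to avoid stacking up huge concurrent merges.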
if ((bestScore == null || score.getScore() < bestScore.getScore())
&& (!hitTooLarge || !maxMergeIsRunning)) {
best = candidate;
bestScore = score;
bestTooLarge = hitTooLarge;
bestMergeBytes = bytesThisMerge;
}
}
if (best == null) {
return spec;
}
// The mergeType == FORCE_MERGE_DELETES case behaves as the code does currently and can
// create a large number of concurrent big merges. If we make findForcedDeletesMerges
// behave like findForcedMerges and cycle through, we should remove this.
if (haveOneLargeMerge == false
|| bestTooLarge == false
|| mergeType == MERGE_TYPE.FORCE_MERGE_DELETES) {
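// Remember that we already picked one max-sized merge in this call, so further
// too-large candidates are deferred to the next round (except for FORCE_MERGE_DELETES).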
haveOneLargeMerge |= bestTooLarge;
if (spec == null) {
spec = new MergeSpecification();
}
final OneMerge merge = new OneMerge(best);
spec.add(merge);
if (verbose(mergeContext)) {
message(
" add merge="
+ segString(mergeContext, merge.segments)
+ " size="
+ String.format(Locale.ROOT, "%.3f MB", bestMergeBytes / 1024. / 1024.)
+ " score="
+ String.format(Locale.ROOT, "%.3f", bestScore.getScore())
+ " "
+ bestScore.getExplanation()
+ (bestTooLarge ? " [max merge]" : ""),
mergeContext);
}
}
// Whether or not we're going to return this candidate in the spec, we need to remove it
// from consideration on the next loop iteration.
toBeMerged.addAll(best);
}
}
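// A minimal sketch of how this method is typically driven; the wrapper and helper names
// below are illustrative assumptions, not the actual callers in this file:
//
//   MergeSpecification spec =
//       doFindMerges(
//           sortSegmentsBySizeDescending(infos), // largest first, as this method expects
//           maxMergedSegmentBytes,
//           mergeFactor,
//           allowedSegCount,
//           allowedDelCount,
//           allowedDocCount,
//           MERGE_TYPE.NATURAL,
//           mergeContext,
//           maxMergeIsRunning);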