public void build()

in lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java [389:597]


  /**
   * Builds this suggester's FST from the entries supplied by {@code iterator}.
   *
   * <p>The build runs in two phases:
   *
   * <ol>
   *   <li>Every surface form is turned into an automaton and expanded into its analyzed forms.
   *       One record per analyzed form is appended to a temporary file with the layout {@code
   *       analyzedLength (short) | analyzed bytes | weight (int) | [surfaceLength (short)] |
   *       surface bytes | [payload bytes]}; the bracketed parts are only written when the
   *       iterator has payloads.
   *   <li>The temporary file is sorted offline, read back, de-duplicated per analyzed form and fed
   *       into the FST compiler.
   * </ol>
   *
   * <p>Temporary files are removed on a best-effort basis when the method exits, whether it
   * succeeds or fails.
   *
   * @param iterator source of surface forms, weights and (optionally) payloads
   * @throws IllegalArgumentException if the iterator has contexts, if an analyzed form is longer
   *     than {@code Short.MAX_VALUE - 2} bytes, or, when payloads are enabled, if a surface form
   *     is longer than {@code Short.MAX_VALUE - 2} bytes or contains the reserved {@code
   *     PAYLOAD_SEP} byte
   * @throws IOException propagated from the iterator, the temporary files or the offline sorter
   */
  public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    hasPayloads = iterator.hasPayloads();

    OfflineSorter sorter =
        new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));

    IndexOutput tempInput =
        tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    String tempSortedFileName = null;

    long newCount = 0;
    byte[] buffer = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

      for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
        LimitedFiniteStringsIterator finiteStrings =
            new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);

        // One record is written (and counted) per analyzed form of this surface form.
        for (IntsRef string; (string = finiteStrings.next()) != null; newCount++) {
          Util.toBytesRef(string, scratch);

          // length of the analyzed text (FST input)
          if (scratch.length() > Short.MAX_VALUE - 2) {
            throw new IllegalArgumentException(
                "cannot handle analyzed forms > "
                    + (Short.MAX_VALUE - 2)
                    + " in length (got "
                    + scratch.length()
                    + ")");
          }
          short analyzedLength = (short) scratch.length();

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            // The surface length is stored as a short, so it has to fit.
            if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
              throw new IllegalArgumentException(
                  "cannot handle surface form > "
                      + (Short.MAX_VALUE - 2)
                      + " in length (got "
                      + surfaceForm.length
                      + ")");
            }
            payload = iterator.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }

          buffer = ArrayUtil.growNoCopy(buffer, requiredLength);

          output.reset(buffer);

          output.writeShort(analyzedLength);

          output.writeBytes(scratch.bytes(), 0, scratch.length());

          output.writeInt(encodeWeight(iterator.weight()));

          if (hasPayloads) {
            // PAYLOAD_SEP later separates surface form and payload in the FST output, so it must
            // not occur inside the surface form. Scan the valid slice [offset, offset + length)
            // of the backing array, not [0, length).
            final int surfaceEnd = surfaceForm.offset + surfaceForm.length;
            for (int i = surfaceForm.offset; i < surfaceEnd; i++) {
              if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                throw new IllegalArgumentException(
                    "surface form cannot contain unit separator character U+001F; this character is reserved");
              }
            }
            output.writeShort((short) surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
          } else {
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          }

          assert output.getPosition() == requiredLength
              : output.getPosition() + " vs " + requiredLength;
          writer.write(buffer, 0, output.getPosition());
        }

        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
      }
      CodecUtil.writeFooter(tempInput);
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
      tempSortedFileName = sorter.sort(tempInput.getName());

      // Free disk space:
      tempDir.deleteFile(tempInput.getName());

      reader =
          new OfflineSorter.ByteSequencesReader(
              tempDir.openChecksumInput(tempSortedFileName), tempSortedFileName);

      PairOutputs<Long, BytesRef> outputs =
          new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      FSTCompiler<Pair<Long, BytesRef>> fstCompiler =
          new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();

      // Build FST:
      BytesRefBuilder previousAnalyzed = null;
      BytesRefBuilder analyzed = new BytesRefBuilder();
      BytesRef surface = new BytesRef();
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      ByteArrayDataInput input = new ByteArrayDataInput();

      // Used to remove duplicate surface forms (but we
      // still index the highest-weight one).  We clear
      // this when we see a new analyzed form, so it cannot
      // grow unbounded (at most 256 entries):
      Set<BytesRef> seenSurfaceForms = new HashSet<>();

      int dedup = 0;
      while (true) {
        BytesRef bytes = reader.next();
        if (bytes == null) {
          break;
        }
        input.reset(bytes.bytes, bytes.offset, bytes.length);
        // input.getPosition() is an absolute index into bytes.bytes, so lengths derived from it
        // must be measured against the absolute end of the record, not against bytes.length.
        final int recordEnd = bytes.offset + bytes.length;

        short analyzedLength = input.readShort();
        // + 2 leaves room for the two trailing bytes appended below.
        analyzed.growNoCopy(analyzedLength + 2);
        input.readBytes(analyzed.bytes(), 0, analyzedLength);
        analyzed.setLength(analyzedLength);

        long cost = input.readInt();

        // surface is a view into the record; it is deep-copied wherever it has to outlive it.
        surface.bytes = bytes.bytes;
        if (hasPayloads) {
          surface.length = input.readShort();
          surface.offset = input.getPosition();
        } else {
          // Without payloads the surface form runs to the end of the record.
          surface.offset = input.getPosition();
          surface.length = recordEnd - surface.offset;
        }

        if (previousAnalyzed == null) {
          // Very first record.
          previousAnalyzed = new BytesRefBuilder();
          previousAnalyzed.copyBytes(analyzed.get());
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else if (analyzed.get().equals(previousAnalyzed.get())) {
          // Same analyzed form as the previous record.
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
            // More than maxSurfaceFormsPerAnalyzedForm
            // dups: skip the rest:
            continue;
          }
          if (seenSurfaceForms.contains(surface)) {
            continue;
          }
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else {
          // New analyzed form: restart duplicate tracking.
          dedup = 0;
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.clear();
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        }

        // TODO: I think we can avoid the extra 2 bytes when
        // there is no dup (dedup==0), but we'd have to fix
        // the exactFirst logic ... which would be sort of
        // hairy because we'd need to special case the two
        // (dup/not dup)...

        // NOTE: must be byte 0 so we sort before whatever
        // is next
        analyzed.append((byte) 0);
        analyzed.append((byte) dedup);

        Util.toIntsRef(analyzed.get(), scratchInts);
        // System.out.println("ADD: " + scratchInts + " -> " + cost + ": " +
        // surface.utf8ToString());
        if (!hasPayloads) {
          fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
        } else {
          // FST output is: surface bytes, PAYLOAD_SEP, payload bytes.
          int payloadOffset = input.getPosition() + surface.length;
          int payloadLength = recordEnd - payloadOffset;
          BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
          System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
          br.bytes[surface.length] = PAYLOAD_SEP;
          System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
          br.length = br.bytes.length;
          fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
        }
      }
      fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
      count = newCount;

      // Util.dotToFile(fst, "/tmp/suggest.dot");
    } finally {
      // Best-effort cleanup on both the success and the failure path.
      IOUtils.closeWhileHandlingException(reader, writer);
      IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
  }