int64_t ExtractText::ReadCallback::operator()

in extensions/standard-processors/processors/ExtractText.cpp [56:158]


int64_t ExtractText::ReadCallback::operator()(const std::shared_ptr<io::InputStream>& stream) const {
  size_t read_size = 0;
  bool regex_mode;
  size_t size_limit = flowFile_->getSize();
  std::vector<std::byte> buffer;
  buffer.resize(std::min(gsl::narrow<size_t>(flowFile_->getSize()), MAX_BUFFER_SIZE));

  std::string attrKey;
  std::string sizeLimitStr;
  ctx_->getProperty(Attribute, attrKey);
  ctx_->getProperty(SizeLimit, sizeLimitStr);
  ctx_->getProperty(RegexMode, regex_mode);

  if (sizeLimitStr.empty())
    sizeLimitStr = DEFAULT_SIZE_LIMIT_STR;

  if (sizeLimitStr != "0")
    size_limit = static_cast<size_t>(std::stoi(sizeLimitStr));

  std::ostringstream contentStream;

  while (read_size < size_limit) {
    // Don't read more than config limit or the size of the buffer
    const auto length = std::min(size_limit - read_size, buffer.size());
    const auto ret = stream->read(std::span(buffer).subspan(0, length));

    if (io::isError(ret)) {
      return -1;  // Stream error
    } else if (ret == 0) {
      break;  // End of stream, no more data
    }

    contentStream.write(reinterpret_cast<const char*>(buffer.data()), gsl::narrow<std::streamsize>(ret));
    read_size += ret;
    if (contentStream.fail()) {
      return -1;
    }
  }

  if (regex_mode) {
    bool insensitive;
    std::vector<utils::Regex::Mode> regex_flags;
    if (ctx_->getProperty(InsensitiveMatch, insensitive) && insensitive) {
      regex_flags.push_back(utils::Regex::Mode::ICASE);
    }

    const bool include_capture_group_zero = ctx_->getProperty<bool>(IncludeCaptureGroupZero).value_or(true);

    bool repeatingcapture;
    ctx_->getProperty(EnableRepeatingCaptureGroup, repeatingcapture);

    const size_t maxCaptureSize = [this] {
      uint64_t val = 0;
      ctx_->getProperty(MaxCaptureGroupLen, val);
      return gsl::narrow<size_t>(val);
    }();

    std::string contentStr = contentStream.str();

    std::map<std::string, std::string> regexAttributes;

    for (const auto& k : ctx_->getDynamicPropertyKeys()) {
      std::string value;
      ctx_->getDynamicProperty(k, value);

      std::string workStr = contentStr;

      int matchcount = 0;

      try {
        utils::Regex rgx(value, regex_flags);
        utils::SMatch matches;
        while (utils::regexSearch(workStr, matches, rgx)) {
          for (std::size_t i = (include_capture_group_zero ? 0 : 1); i < matches.size(); ++i, ++matchcount) {
            std::string attributeValue = matches[i];
            if (attributeValue.length() > maxCaptureSize) {
              attributeValue = attributeValue.substr(0, maxCaptureSize);
            }
            if (matchcount == 0) {
              regexAttributes[k] = attributeValue;
            }
            regexAttributes[k + '.' + std::to_string(matchcount)] = attributeValue;
          }
          if (!repeatingcapture) {
            break;
          }
          workStr = matches.suffix();
        }
      } catch (const Exception &e) {
        logger_->log_error("%s error encountered when trying to construct regular expression from property (key: %s) value: %s",
                           e.what(), k, value);
        continue;
      }
    }

    for (const auto& kv : regexAttributes) {
      flowFile_->setAttribute(kv.first, kv.second);
    }
  } else {
    flowFile_->setAttribute(attrKey, contentStream.str());
  }
  return gsl::narrow<int64_t>(read_size);
}