fetch_purge_docs

in lib/crawler/output_sink/elasticsearch.rb [103:124]


      def fetch_purge_docs(crawl_start_time)
        query = {
          _source: ['url'],
          query: {
            range: {
              last_crawled_at: {
                lt: crawl_start_time.rfc3339
              }
            }
          },
          size: SEARCH_PAGINATION_SIZE,
          sort: [{ last_crawled_at: 'asc' }]
        }.deep_stringify_keys
        system_logger.debug(
          "Fetching docs for pages that were not encountered during the sync. Full query: #{query.inspect}"
        )

        client.indices.refresh(index: [index_name])
        hits = client.paginated_search(index_name, query)
        hits.map { |h| h['_source']['url'] }
      end