in lib/crawler/output_sink/elasticsearch.rb [103:124]
# Looks up the URLs of indexed documents whose +last_crawled_at+ is strictly
# earlier than the given crawl start time. Per the debug message emitted below,
# these are treated as pages that were not encountered during the sync.
#
# @param crawl_start_time [#rfc3339] start of the crawl; its +rfc3339+ value is
#   used as the exclusive upper bound (+lt+) of the range filter
# @return [Array] the +url+ value taken from the +_source+ of every hit
def fetch_purge_docs(crawl_start_time)
  stale_cutoff = crawl_start_time.rfc3339
  range_clause = { last_crawled_at: { lt: stale_cutoff } }

  # Only the +url+ field is requested; results are ordered oldest-first.
  search_body = {
    _source: ['url'],
    query: { range: range_clause },
    size: SEARCH_PAGINATION_SIZE,
    sort: [{ last_crawled_at: 'asc' }]
  }
  query = search_body.deep_stringify_keys

  system_logger.debug(
    "Fetching docs for pages that were not encountered during the sync. Full query: #{query.inspect}"
  )

  # The index is refreshed before searching — presumably so that recent writes
  # are visible to the query (NOTE(review): confirm intent).
  client.indices.refresh(index: [index_name])

  client.paginated_search(index_name, query).map do |hit|
    hit['_source']['url']
  end
end