purge

in lib/crawler/output_sink/elasticsearch.rb [125:146]


      # Deletes every document in the index whose `last_crawled_at` value is
      # earlier than the given crawl start time, and records how many documents
      # were deleted.
      #
      # @param crawl_start_time [#rfc3339] cut-off timestamp; documents last
      #   crawled strictly before it match the delete query
      # @return the `'deleted'` entry of the delete-by-query response, which is
      #   also stored in `@deleted`
      def purge(crawl_start_time)
        cutoff = crawl_start_time.rfc3339
        stale_filter = { range: { last_crawled_at: { lt: cutoff } } }
        delete_request = { _source: ['url'], query: stale_filter }.deep_stringify_keys

        system_logger.info('Deleting docs for pages that were not accessible during the purge crawl.')
        system_logger.debug("Full delete query: #{delete_request}")

        # Refresh first — presumably so documents indexed moments ago are visible
        # to the delete query (NOTE(review): confirm intent).
        client.indices.refresh(index: [index_name])
        outcome = client.delete_by_query(index: [index_name], body: delete_request)
        system_logger.debug("Delete by query response: #{outcome}")

        @deleted = outcome['deleted']
      end