http_logs/_tools/unparse.rb (34 lines of code) (raw):
require "json"
require "time"
################
#
# Reconstructs (un-parses) the existing http_logs corpus (data set). The introduction of ingest node pipelines
# requires the data to be JSON-wrapped, but otherwise un-parsed, log lines. This script was used to create
# `http_logs_unparsed`, which is a mirror copy of `http_logs`, except it is un-parsed AND the timestamp is
# ISO8601 (not epoch_seconds)
#
# The output of this script is a file with lines of JSON that appear as follows:
#
# {"message" : "30.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /images/info.gif HTTP/1.0\" 200 1251"}
# {"message" : "28.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/images/hm_official.gif HTTP/1.1\" 200 972"}
# {"message" : "17.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/hosts/cfo/images/cfo/cfophot3.jpg HTTP/1.0\" 200 6695"}
#
# Usage:
#
# rm *.unparsed.json
# rm *.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-181998.json.bz2
# bunzip2 documents-181998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-191998.json.bz2
# bunzip2 documents-191998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-201998.json.bz2
# bunzip2 documents-201998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-211998.json.bz2
# bunzip2 documents-211998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-221998.json.bz2
# bunzip2 documents-221998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-231998.json.bz2
# bunzip2 documents-231998.json.bz2
#
# wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-241998.json.bz2
# bunzip2 documents-241998.json.bz2
#
# ruby unparse.rb .
#
# #############
# Looks up +key+ in +data+ and renders the result as a string.
#
# data - object indexed with [] (the script passes the Hash produced by JSON.parse)
# key  - the field name to look up
#
# Returns "-" when the lookup yields nil; otherwise the value converted with #to_s.
def self.getValue(data, key)
  value = data[key]
  return "-" if value.nil?

  value.to_s
end
# Maximum number of input files converted at the same time.
threads = 4

# Converts one line of the parsed corpus into one line of the un-parsed corpus.
#
# line - a String holding one JSON document with the fields '@timestamp' (epoch seconds),
#        'clientip', 'request', 'status' and 'size'; absent fields are rendered by getValue
#
# Returns a String of the form {"message" : "<log line>"} terminated by a newline.
# Raises JSON::ParserError when +line+ is not valid JSON.
def self.unparse_line(line)
  data = JSON.parse(line)
  # Time.at yields local time, so the UTC offset in the output follows the timezone of the
  # machine running the script. A missing '@timestamp' becomes 0 (nil.to_i), i.e. the epoch.
  timestamp = Time.at(data['@timestamp'].to_i).iso8601
  message = "#{getValue(data, 'clientip')} - - [#{timestamp}] " \
            "\"#{getValue(data, 'request')}\" #{getValue(data, 'status')} #{getValue(data, 'size')}"
  # Let the JSON library escape the message: quotes, backslashes and control characters inside
  # the request would otherwise produce invalid JSON. The surrounding layout is unchanged.
  "{\"message\" : #{message.to_json}}\n"
end

# Writes the un-parsed counterpart of +file+ next to it, as <name>.unparsed.json.
#
# file - path of a *.json corpus file holding one JSON document per line
#
# Lines that cannot be converted are reported on stderr and skipped.
def self.unparse_file(file)
  # Only the trailing extension is rewritten, so a "json" elsewhere in the path is left alone.
  target = file.sub(/\.json\z/, '.unparsed.json')
  File.open(target, 'w') do |json_file|
    # File.foreach closes the input once the iteration is finished.
    File.foreach(file).with_index(1) do |line, line_no|
      print "." if (line_no % 10000).zero?
      begin
        json_file.write(unparse_line(line))
      rescue => e
        warn "#{file}:#{line_no}: #{e}"
      end
    end
  end
end

source_dir = ARGV[0]
if source_dir.nil?
  warn "Usage: ruby unparse.rb <directory containing *.json files>"
else
  # Feed the input files through a queue so that at most `threads` files are converted at once.
  pending = Queue.new
  Dir.glob(File.join(source_dir, "*.json"))
     .reject { |path| path.end_with?('.unparsed.json') } # output of an earlier run
     .each { |path| pending << path }
  pending.close # pop returns nil once the closed queue is drained, which ends each worker

  workers = Array.new(threads) do
    Thread.new do
      while (file = pending.pop)
        begin
          unparse_file(file)
        rescue => e
          # Report the failing file and keep going with the remaining ones.
          warn "#{file}: #{e}"
        end
      end
    end
  end
  # Joining (instead of polling a shared counter) cannot hang when a worker dies.
  workers.each(&:join)
end