experiments/distinctcount_experiment.py (38 lines of code) (raw):
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import sys
import os
# Resolve the current working directory and its parent (both are relative to
# where the script is launched from, not to this file's location).
p = os.path.abspath(os.curdir)
pp = os.path.abspath(os.pardir)
# Put both at the front of the module search path, parent directory first,
# so the imports below are looked up there before anywhere else.
sys.path[:0] = [pp, p]
import DataGenerator
import SketchExperiment
import logging
import Oracle
import random
import numpy as np
import scipy as sp
import copy
from experiment_utils import makeDict
import sketches.Sketches as Sketches
# Raise the root logger threshold so only warnings and above are emitted.
logging.getLogger().setLevel(logging.WARNING)

if __name__ == '__main__':
    # `import scipy` alone is not guaranteed to load the `stats` subpackage
    # (older SciPy releases raise AttributeError on `sp.stats`), so import it
    # explicitly rather than relying on another module having done so.
    from scipy import stats

    # A Beta(1,1) is a uniform distribution
    d = stats.beta(1, 1)

    # Workload = a data generator paired with a query generator.
    dg = DataGenerator.DistributionDataGenerator(
        length=int(1e6), distribution=d, name='Uniform')
    qg = DataGenerator.RepeatQueryGenerator(
        query_type="distinct", repeated_val=0, num_queries=5)
    w = DataGenerator.Workload(data_generator=dg, query_generator=qg)

    # note this oracle assumes the stream consists of all unique items
    oracle = Oracle.DistinctStreamOracle(workload=w)

    # NOTE(review): option meanings are defined by SketchExperiment, not here;
    # presumably 8 parallel workers, 10 stream reorderings and 10 repetitions
    # -- confirm against ExperimentOptions.
    opts = SketchExperiment.ExperimentOptions(
        nparallel=8,
        nreorderings=10,
        nrepetitions=10,
        save_answers=True,
    )
    e = SketchExperiment.SketchExperiment(
        workload=w,
        oracle=oracle,
        options=opts,
    )

    # Display name -> (sketch class, arguments bundled by makeDict).
    sketches = {
        'Theta': (Sketches.ThetaSketch, makeDict(lg_k=10, p=1.0, seed=0)),
        'HLL': (Sketches.HLLSketch, makeDict(lg_k=10, seed=0)),
    }
    e.addSketches(sketches)
    e.prepare()
    e.execute(save_sketch=False)