tuple/tuple_sketch_int64.cpp (183 lines of code) (raw):
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <algorithm>
#include <cstdint>
#include <exception>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include <emscripten/bind.h>

#include <tuple_sketch.hpp>
#include <tuple_union.hpp>
#include <tuple_intersection.hpp>
#include <tuple_a_not_b.hpp>
#include <tuple_jaccard_similarity.hpp>
#include <theta_sketch.hpp>
// Type of the value stored next to each retained key, and type of the values
// passed in when a sketch is updated.
using Summary = uint64_t;
using Update = uint64_t;

// Selects how a summary is initialised and how new values are folded into it:
//   SUM - add the incoming value
//   MIN - keep the smaller of the two
//   MAX - keep the larger of the two
//   ONE - the summary is always 1
//   NOP - the summary starts at 0 and is never modified
enum tuple_mode {SUM, MIN, MAX, ONE, NOP};

// Update policy for a tuple sketch whose behaviour is chosen at run time by a
// tuple_mode.
//
// S is the summary type and U the update value type. Both are expected to be
// arithmetic types; the file instantiates the policy with S = U = uint64_t.
template<typename S, typename U>
class tuple_update_policy {
public:
  tuple_update_policy(tuple_mode mode): mode_(mode) {}

  // Returns the initial summary for a newly inserted key: the identity element
  // of the selected mode (largest S for MIN, smallest S for MAX, 1 for ONE,
  // 0 otherwise), so that the first update() yields the expected value.
  S create() const {
    switch (mode_) {
      case ONE: return 1;
      case MIN: return std::numeric_limits<S>::max();
      case MAX: return std::numeric_limits<S>::min();
      default: return 0;
    }
  }

  // Folds one incoming value into an existing summary according to the mode.
  // The value is converted to S before it is compared, exactly as it already
  // is by the += used for SUM. NOP leaves the summary untouched.
  void update(S& summary, const U& update) const {
    switch (mode_) {
      case SUM: summary += update; break;
      case MIN: summary = std::min<S>(summary, update); break;
      case MAX: summary = std::max<S>(summary, update); break;
      case ONE: summary = 1; break;
      default: break;
    }
  }

private:
  tuple_mode mode_;
};
// Update sketch instantiated with uint64_t summaries and update values, using the
// mode-driven tuple_update_policy defined above.
using update_tuple_sketch_int64 = datasketches::update_tuple_sketch<Summary, Update, tuple_update_policy<Summary, Update>>;
// Compact tuple sketch holding Summary values; this is the type the bindings
// deserialize from bytes and serialize back to bytes.
using compact_tuple_sketch_int64 = datasketches::compact_tuple_sketch<Summary>;
// Policy that merges the summaries of two entries sharing the same key.
// The merge rule is chosen at run time by a tuple_mode:
//   SUM - add, MIN - keep the smaller, MAX - keep the larger,
//   ONE - force the summary to 1, NOP - leave the summary unchanged.
//
// S is the summary type; the file instantiates the policy with uint64_t.
template<typename S>
class tuple_union_policy {
public:
  tuple_union_policy(tuple_mode mode): mode_(mode) {}

  // Folds `other` into `summary` in place according to the configured mode.
  void operator()(S& summary, const S& other) const {
    switch (mode_) {
      case SUM: summary += other; break;
      case MIN: summary = std::min(summary, other); break;
      case MAX: summary = std::max(summary, other); break;
      case ONE: summary = 1; break;
      default: break;
    }
  }

private:
  tuple_mode mode_;
};
// Union of Summary sketches whose matching entries are merged by tuple_union_policy.
using tuple_union_int64 = datasketches::tuple_union<Summary, tuple_union_policy<Summary>>;
// The intersection uses exactly the same mode-driven merge rule as the union.
template<typename S> using tuple_intersection_policy = tuple_union_policy<S>;
using tuple_intersection_int64 = datasketches::tuple_intersection<Summary, tuple_intersection_policy<Summary>>;
// A-not-B operator over Summary sketches; no policy type is supplied for it here.
using tuple_a_not_b_int64 = datasketches::tuple_a_not_b<Summary>;
// Binary summary policy that deliberately does nothing: neither argument is
// read or written. Used where a policy type is required but the summaries
// should be left as they are.
template<typename T>
struct no_op_policy {
  void operator()(T& /*summary*/, const T& /*other*/) const {
    // intentionally empty
  }
};
// Jaccard similarity over Summary sketches. Both policy slots are filled with
// no_op_policy (which does nothing) - presumably because only the retained keys
// matter for the similarity; confirm against tuple_jaccard_similarity.
using tuple_jaccard_similarity_int64 = datasketches::tuple_jaccard_similarity<Summary, no_op_policy<Summary>, no_op_policy<Summary>>;
// Maps the textual mode name received from JavaScript onto a tuple_mode.
//
// An empty string selects SUM. Otherwise the name must match one of
// "SUM", "MIN", "MAX", "ONE" or "NOP" exactly (case-sensitive).
// Throws std::invalid_argument for any other string.
tuple_mode convert_mode(const std::string& mode_str) {
  if (mode_str.empty()) return SUM;

  static const struct {
    const char* name;
    tuple_mode mode;
  } known_modes[] = {
    {"SUM", SUM},
    {"MIN", MIN},
    {"MAX", MAX},
    {"ONE", ONE},
    {"NOP", NOP},
  };

  for (const auto& candidate: known_modes) {
    if (mode_str == candidate.name) return candidate.mode;
  }
  throw std::invalid_argument("unrecognized mode " + mode_str);
}
// Handle to the JavaScript global `Uint8Array`; the bindings below call new_() on
// it to build the byte arrays they return to JavaScript.
const emscripten::val Uint8Array = emscripten::val::global("Uint8Array");
// Embind registrations that expose the uint64 tuple sketch API to JavaScript.
//
// Sketches cross the boundary as serialized bytes: functions that accept a sketch
// take the bytes as std::string and deserialize them with the supplied seed;
// functions that produce a sketch return a new Uint8Array built from the
// serialized bytes. Mode arguments are the strings understood by convert_mode().
EMSCRIPTEN_BINDINGS(tuple_sketch_int64) {
  // Return type of tupleInt64JaccardSimilarity.
  emscripten::register_vector<double>("VectorDouble");

  // Interprets the integer as a pointer to a std::exception and returns its what() text.
  emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) {
    return std::string(reinterpret_cast<std::exception*>(ptr)->what());
  }));

  emscripten::constant("DEFAULT_LG_K", datasketches::theta_constants::DEFAULT_LG_K);
  emscripten::constant("DEFAULT_SEED", datasketches::DEFAULT_SEED);

  // ---- updatable sketch -------------------------------------------------------
  emscripten::class_<update_tuple_sketch_int64>("update_tuple_sketch_int64")
    // The raw pointer returned here becomes the JS-visible instance.
    .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed, float p, const std::string& mode_str) {
      const auto policy = tuple_update_policy<Summary, Update>(convert_mode(mode_str));
      return new update_tuple_sketch_int64(update_tuple_sketch_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).set_p(p).build());
    }))
    .function("updateString", emscripten::optional_override([](update_tuple_sketch_int64& self, const std::string& key, Update value) {
      self.update(key, value);
    }))
    .function("updateInt64", emscripten::optional_override([](update_tuple_sketch_int64& self, uint64_t key, Update value) {
      self.update(key, value);
    }))
    // Serializes the compact form of the sketch.
    .function("serializeAsUint8Array", emscripten::optional_override([](const update_tuple_sketch_int64& self) {
      auto bytes = self.compact().serialize();
      // The memory view refers to the local buffer, so it is consumed before `bytes` goes out of scope.
      return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
    }))
    ;

  // ---- stateless helpers operating on serialized compact sketches --------------
  emscripten::class_<compact_tuple_sketch_int64>("compact_tuple_sketch_int64")
    // Builds a tuple sketch from a serialized theta sketch, giving every entry the summary `value`.
    .class_function("convertTheta", emscripten::optional_override([](const std::string& theta_sketch_bytes, uint64_t value, uint64_t seed) {
      // converting constructor does not currently take wrapped compact theta sketch
      const auto sketch = datasketches::compact_theta_sketch::deserialize(theta_sketch_bytes.data(), theta_sketch_bytes.size(), seed);
      auto bytes = compact_tuple_sketch_int64(sketch, value).serialize();
      return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
    }))
    .class_function("getEstimate", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_estimate();
    }))
    // Returns {estimate, lower_bound, upper_bound} for the given number of standard deviations.
    .class_function("getEstimateAndBounds", emscripten::optional_override([](const std::string& sketch_bytes, uint8_t num_std_devs, uint64_t seed) {
      const auto sketch = compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed);
      auto result = emscripten::val::object();
      result.set("estimate", sketch.get_estimate());
      result.set("lower_bound", sketch.get_lower_bound(num_std_devs));
      result.set("upper_bound", sketch.get_upper_bound(num_std_devs));
      return result;
    }))
    // Returns {sum_estimate, sum_lower_bound, sum_upper_bound} for the sum of all summaries.
    .class_function("getSumEstimateAndBounds", emscripten::optional_override([](const std::string& sketch_bytes, uint8_t num_std_devs, uint64_t seed) {
      const auto sketch = compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed);
      uint64_t sum = 0;
      for (const auto& entry: sketch) sum += entry.second;
      // Scale the sum over the retained entries by 1/theta.
      const double sum_estimate = sum / sketch.get_theta();
      // The bounds scale the sum estimate by (bound / estimate); a zero estimate yields zero bounds
      // instead of dividing by zero.
      const double estimate = sketch.get_estimate();
      auto result = emscripten::val::object();
      result.set("sum_estimate", sum_estimate);
      result.set("sum_lower_bound", estimate > 0 ? (sum_estimate * sketch.get_lower_bound(num_std_devs) / estimate) : 0);
      result.set("sum_upper_bound", estimate > 0 ? (sum_estimate * sketch.get_upper_bound(num_std_devs) / estimate) : 0);
      return result;
    }))
    .class_function("getTheta", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_theta();
    }))
    .class_function("getNumRetained", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_num_retained();
    }))
    .class_function("toString", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).to_string();
    }))
    // Keeps only the entries whose summary lies in the inclusive range [low, high].
    .class_function("filterLowHigh", emscripten::optional_override([](const std::string& sketch_bytes, int low, int high, uint64_t seed) {
      // Compare in the summary's own unsigned 64-bit domain. Receiving the summary as
      // `int` would truncate values above INT_MAX and let them wrap into the range.
      // Summaries are never negative, so a negative `high` can match nothing and a
      // negative `low` is equivalent to 0.
      const bool satisfiable = high >= 0;
      const Summary lower = low > 0 ? static_cast<Summary>(low) : 0;
      const Summary upper = satisfiable ? static_cast<Summary>(high) : 0;
      auto bytes = compact_tuple_sketch_int64::deserialize(
        sketch_bytes.data(), sketch_bytes.size(), seed
      ).filter([lower, upper, satisfiable](const Summary& v) {
        return satisfiable && v >= lower && v <= upper;
      }).serialize();
      return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
    }))
    ;

  // ---- stateful union -----------------------------------------------------------
  emscripten::class_<tuple_union_int64>("tuple_union_int64")
    // The raw pointer returned here becomes the JS-visible instance.
    .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed, const std::string& mode_str) {
      const auto policy = tuple_union_policy<Summary>(convert_mode(mode_str));
      return new tuple_union_int64(tuple_union_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).build());
    }))
    .function("updateWithUpdateSketch", emscripten::optional_override([](tuple_union_int64& self, const update_tuple_sketch_int64& sketch) {
      self.update(sketch);
    }))
    .function("updateWithCompactSketch", emscripten::optional_override([](tuple_union_int64& self, const compact_tuple_sketch_int64& sketch) {
      self.update(sketch);
    }))
    .function("updateWithBytes", emscripten::optional_override([](tuple_union_int64& self, const std::string& bytes, uint64_t seed) {
      self.update(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed));
    }))
    .function("getResultAsUint8Array", emscripten::optional_override([](tuple_union_int64& self) {
      auto bytes = self.get_result().serialize();
      return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
    }))
    ;

  // ---- one-shot set operations on two serialized sketches -------------------------
  emscripten::function("tupleUnionInt64", emscripten::optional_override([](
    const std::string& bytes1, const std::string& bytes2, uint8_t lg_k, uint64_t seed, const std::string& mode_str
  ) {
    const auto policy = tuple_union_policy<Summary>(convert_mode(mode_str));
    auto u = tuple_union_int64(tuple_union_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).build());
    u.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed));
    u.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed));
    const auto bytes = u.get_result().serialize();
    return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
  }));

  emscripten::function("tupleIntersectionInt64", emscripten::optional_override([](
    const std::string& bytes1, const std::string& bytes2, uint64_t seed, const std::string& mode_str
  ) {
    tuple_intersection_int64 intersection(seed, tuple_intersection_policy<Summary>(convert_mode(mode_str)));
    intersection.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed));
    intersection.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed));
    const auto bytes = intersection.get_result().serialize();
    return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
  }));

  emscripten::function("tupleAnotBInt64", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) {
    auto bytes = tuple_a_not_b_int64(seed).compute(
      compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed),
      compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed)
    ).serialize();
    return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
  }));

  // Returns the three values produced by jaccard() as a VectorDouble, in the order the library returns them.
  emscripten::function("tupleInt64JaccardSimilarity", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) {
    const auto arr = tuple_jaccard_similarity_int64::jaccard(
      compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed),
      compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed),
      seed
    );
    return std::vector<double>{arr[0], arr[1], arr[2]};
  }));
}