tuple/tuple_sketch_int64.cpp (183 lines of code) (raw):

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#include <algorithm>
#include <cstdint>
#include <exception>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include <emscripten/bind.h>

#include <tuple_sketch.hpp>
#include <tuple_union.hpp>
#include <tuple_intersection.hpp>
#include <tuple_a_not_b.hpp>
#include <tuple_jaccard_similarity.hpp>
#include <theta_sketch.hpp>

// Summary stored per retained key, and the value type accepted on update.
using Summary = uint64_t;
using Update = uint64_t;

// How summaries are initialized and combined (see the policy classes below).
enum tuple_mode {SUM, MIN, MAX, ONE, NOP};

/**
 * Update policy whose behavior is selected at run time by a tuple_mode.
 *
 * create() yields the initial summary for a new key; update() folds an
 * incoming value into an existing summary.
 */
template<typename S, typename U>
class tuple_update_policy {
public:
  tuple_update_policy(tuple_mode mode): mode_(mode) {}

  /**
   * @return the starting summary: 1 for ONE, the largest S for MIN (so the
   *         first update wins), the smallest S for MAX, and 0 for SUM and NOP.
   */
  S create() const {
    switch (mode_) {
      case ONE: return 1;
      case MIN: return std::numeric_limits<S>::max();
      case MAX: return std::numeric_limits<S>::min();
      default: return 0;
    }
  }

  /**
   * Combines update into summary according to the mode.
   * NOP leaves the summary untouched.
   */
  void update(S& summary, const U& update) const {
    switch (mode_) {
      case SUM: summary += update; break;
      case MIN: summary = std::min<S>(summary, update); break;
      case MAX: summary = std::max<S>(summary, update); break;
      case ONE: summary = 1; break;
      case NOP: break;
    }
  }

private:
  tuple_mode mode_;
};

using update_tuple_sketch_int64 = datasketches::update_tuple_sketch<Summary, Update, tuple_update_policy<Summary, Update>>;
using compact_tuple_sketch_int64 = datasketches::compact_tuple_sketch<Summary>;

/**
 * Policy for combining two summaries of the same key, selected at run time
 * by a tuple_mode. NOP leaves the left-hand summary untouched.
 */
template<typename S>
class tuple_union_policy {
public:
  tuple_union_policy(tuple_mode mode): mode_(mode) {}

  void operator()(S& summary, const S& other) const {
    switch (mode_) {
      case SUM: summary += other; break;
      case MIN: summary = std::min(summary, other); break;
      case MAX: summary = std::max(summary, other); break;
      case ONE: summary = 1; break;
      case NOP: break;
    }
  }

private:
  tuple_mode mode_;
};

using tuple_union_int64 = datasketches::tuple_union<Summary, tuple_union_policy<Summary>>;

// Intersection combines matching summaries with the same rules as union.
template<typename S>
using tuple_intersection_policy = tuple_union_policy<S>;

using tuple_intersection_int64 = datasketches::tuple_intersection<Summary, tuple_intersection_policy<Summary>>;

using tuple_a_not_b_int64 = datasketches::tuple_a_not_b<Summary>;

// Policy that ignores both summaries; used where only the keys matter.
template<typename T>
struct no_op_policy {
  void operator()(T&, const T&) const {}
};

using tuple_jaccard_similarity_int64 = datasketches::tuple_jaccard_similarity<Summary, no_op_policy<Summary>, no_op_policy<Summary>>;

/**
 * Parses a mode name.
 *
 * @param mode_str one of "SUM", "MIN", "MAX", "ONE", "NOP"; the empty string means SUM
 * @return the corresponding tuple_mode
 * @throws std::invalid_argument for any other string
 */
tuple_mode convert_mode(const std::string& mode_str) {
  if (mode_str == "" || mode_str == "SUM") return SUM;
  if (mode_str == "MIN") return MIN;
  if (mode_str == "MAX") return MAX;
  if (mode_str == "ONE") return ONE;
  if (mode_str == "NOP") return NOP;
  throw std::invalid_argument("unrecognized mode " + mode_str);
}

// Handle to the JavaScript Uint8Array constructor.
const emscripten::val Uint8Array = emscripten::val::global("Uint8Array");

namespace {

/**
 * Invokes the JavaScript Uint8Array constructor on a typed memory view of the
 * given contiguous byte container and returns the resulting JS value.
 * NOTE(review): constructing from the view is expected to copy the bytes, so
 * the result should not alias the (usually local) buffer - confirm against
 * the Embind documentation.
 */
template<typename Bytes>
emscripten::val to_uint8_array(const Bytes& bytes) {
  return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data()));
}

} // namespace

EMSCRIPTEN_BINDINGS(tuple_sketch_int64) {
  emscripten::register_vector<double>("VectorDouble");

  // Assumes ptr is the address of an object derived from std::exception.
  emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) {
    return std::string(reinterpret_cast<std::exception*>(ptr)->what());
  }));

  emscripten::constant("DEFAULT_LG_K", datasketches::theta_constants::DEFAULT_LG_K);
  emscripten::constant("DEFAULT_SEED", datasketches::DEFAULT_SEED);

  emscripten::class_<update_tuple_sketch_int64>("update_tuple_sketch_int64")
    .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed, float p, const std::string& mode_str) {
      const auto policy = tuple_update_policy<Summary, Update>(convert_mode(mode_str));
      // Raw pointer is handed to Embind, which manages the bound object from here on.
      return new update_tuple_sketch_int64(
        update_tuple_sketch_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).set_p(p).build()
      );
    }))
    .function("updateString", emscripten::optional_override([](update_tuple_sketch_int64& self, const std::string& key, Update value) {
      self.update(key, value);
    }))
    .function("updateInt64", emscripten::optional_override([](update_tuple_sketch_int64& self, uint64_t key, Update value) {
      self.update(key, value);
    }))
    .function("serializeAsUint8Array", emscripten::optional_override([](const update_tuple_sketch_int64& self) {
      // The update sketch is compacted first; the compact form is what gets serialized.
      return to_uint8_array(self.compact().serialize());
    }))
    ;

  // All compact-sketch helpers are static: they take serialized bytes and deserialize per call.
  emscripten::class_<compact_tuple_sketch_int64>("compact_tuple_sketch_int64")
    .class_function("convertTheta", emscripten::optional_override([](const std::string& theta_sketch_bytes, uint64_t value, uint64_t seed) {
      // converting constructor does not currently take wrapped compact theta sketch
      const auto sketch = datasketches::compact_theta_sketch::deserialize(theta_sketch_bytes.data(), theta_sketch_bytes.size(), seed);
      return to_uint8_array(compact_tuple_sketch_int64(sketch, value).serialize());
    }))
    .class_function("getEstimate", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_estimate();
    }))
    .class_function("getEstimateAndBounds", emscripten::optional_override([](const std::string& sketch_bytes, uint8_t num_std_devs, uint64_t seed) {
      const auto sketch = compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed);
      auto result = emscripten::val::object();
      result.set("estimate", sketch.get_estimate());
      result.set("lower_bound", sketch.get_lower_bound(num_std_devs));
      result.set("upper_bound", sketch.get_upper_bound(num_std_devs));
      return result;
    }))
    .class_function("getSumEstimateAndBounds", emscripten::optional_override([](const std::string& sketch_bytes, uint8_t num_std_devs, uint64_t seed) {
      const auto sketch = compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed);
      uint64_t sum = 0;
      for (const auto& entry: sketch) sum += entry.second;
      // Scale the sum over retained entries by 1/theta.
      const double sum_estimate = static_cast<double>(sum) / sketch.get_theta();
      // The bounds on the sum reuse the relative error of the distinct-count
      // estimate; they are reported as 0 when that estimate is not positive.
      const double estimate = sketch.get_estimate();
      const bool has_estimate = estimate > 0;
      const double sum_lower_bound = has_estimate ? sum_estimate * sketch.get_lower_bound(num_std_devs) / estimate : 0.0;
      const double sum_upper_bound = has_estimate ? sum_estimate * sketch.get_upper_bound(num_std_devs) / estimate : 0.0;
      auto result = emscripten::val::object();
      result.set("sum_estimate", sum_estimate);
      result.set("sum_lower_bound", sum_lower_bound);
      result.set("sum_upper_bound", sum_upper_bound);
      return result;
    }))
    .class_function("getTheta", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_theta();
    }))
    .class_function("getNumRetained", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_num_retained();
    }))
    .class_function("toString", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) {
      return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).to_string();
    }))
    .class_function("filterLowHigh", emscripten::optional_override([](const std::string& sketch_bytes, int low, int high, uint64_t seed) {
      // Keeps entries whose summary lies in [low, high]. Summaries are unsigned
      // 64-bit, so the comparison is done in that domain: narrowing a summary
      // to int would let values above INT_MAX wrap around and match by accident.
      // A negative high can match nothing; a negative low behaves like 0.
      const bool range_reachable = high >= 0;
      const Summary lo = low > 0 ? static_cast<Summary>(low) : 0;
      const Summary hi = range_reachable ? static_cast<Summary>(high) : 0;
      const auto filtered = compact_tuple_sketch_int64::deserialize(
        sketch_bytes.data(), sketch_bytes.size(), seed
      ).filter([range_reachable, lo, hi](const Summary& v) {
        return range_reachable && v >= lo && v <= hi;
      });
      return to_uint8_array(filtered.serialize());
    }))
    ;

  emscripten::class_<tuple_union_int64>("tuple_union_int64")
    .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed, const std::string& mode_str) {
      const auto policy = tuple_union_policy<Summary>(convert_mode(mode_str));
      // Raw pointer is handed to Embind, which manages the bound object from here on.
      return new tuple_union_int64(tuple_union_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).build());
    }))
    .function("updateWithUpdateSketch", emscripten::optional_override([](tuple_union_int64& self, const update_tuple_sketch_int64& sketch) {
      self.update(sketch);
    }))
    .function("updateWithCompactSketch", emscripten::optional_override([](tuple_union_int64& self, const compact_tuple_sketch_int64& sketch) {
      self.update(sketch);
    }))
    .function("updateWithBytes", emscripten::optional_override([](tuple_union_int64& self, const std::string& bytes, uint64_t seed) {
      self.update(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed));
    }))
    .function("getResultAsUint8Array", emscripten::optional_override([](tuple_union_int64& self) {
      return to_uint8_array(self.get_result().serialize());
    }))
    ;

  // One-shot union of two serialized compact sketches; returns the serialized result.
  emscripten::function("tupleUnionInt64", emscripten::optional_override([](
    const std::string& bytes1, const std::string& bytes2, uint8_t lg_k, uint64_t seed, const std::string& mode_str
  ) {
    const auto policy = tuple_union_policy<Summary>(convert_mode(mode_str));
    auto u = tuple_union_int64(tuple_union_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).build());
    u.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed));
    u.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed));
    return to_uint8_array(u.get_result().serialize());
  }));

  // One-shot intersection of two serialized compact sketches; returns the serialized result.
  emscripten::function("tupleIntersectionInt64", emscripten::optional_override([](
    const std::string& bytes1, const std::string& bytes2, uint64_t seed, const std::string& mode_str
  ) {
    tuple_intersection_int64 intersection(seed, tuple_intersection_policy<Summary>(convert_mode(mode_str)));
    intersection.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed));
    intersection.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed));
    return to_uint8_array(intersection.get_result().serialize());
  }));

  // One-shot set difference (first minus second) of two serialized compact sketches.
  emscripten::function("tupleAnotBInt64", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) {
    const auto difference = tuple_a_not_b_int64(seed).compute(
      compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed),
      compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed)
    );
    return to_uint8_array(difference.serialize());
  }));

  // Returns the three values produced by jaccard() as a VectorDouble.
  emscripten::function("tupleInt64JaccardSimilarity", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) {
    const auto arr = tuple_jaccard_similarity_int64::jaccard(
      compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed),
      compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed),
      seed
    );
    return std::vector<double>{arr[0], arr[1], arr[2]};
  }));
}