theta/theta_sketch.cpp (109 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #include <strstream> #include <emscripten/bind.h> #include <theta_sketch.hpp> #include <theta_union.hpp> #include <theta_intersection.hpp> #include <theta_a_not_b.hpp> #include <theta_jaccard_similarity.hpp> using datasketches::update_theta_sketch; using datasketches::compact_theta_sketch; using datasketches::wrapped_compact_theta_sketch; using datasketches::theta_union; using datasketches::theta_intersection; using datasketches::theta_a_not_b; const emscripten::val Uint8Array = emscripten::val::global("Uint8Array"); EMSCRIPTEN_BINDINGS(theta_sketch) { emscripten::register_vector<double>("VectorDouble"); emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) { return std::string(reinterpret_cast<std::exception*>(ptr)->what()); })); emscripten::constant("DEFAULT_LG_K", datasketches::theta_constants::DEFAULT_LG_K); emscripten::constant("DEFAULT_SEED", datasketches::DEFAULT_SEED); emscripten::class_<update_theta_sketch>("update_theta_sketch") .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed, float p) { return new update_theta_sketch(update_theta_sketch::builder().set_lg_k(lg_k).set_seed(seed).set_p(p).build()); })) .function("updateString", emscripten::select_overload<void(const std::string&)>(&update_theta_sketch::update)) .function("updateInt64", emscripten::select_overload<void(uint64_t)>(&update_theta_sketch::update)) .function("serializeAsUint8ArrayCompressed", emscripten::optional_override([](const update_theta_sketch& self) { auto bytes = self.compact().serialize_compressed(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) ; emscripten::class_<compact_theta_sketch>("compact_theta_sketch") .class_function("getEstimateFromBytes", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { return wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed).get_estimate(); })) .class_function("getEstimateAndBoundsFromBytes", emscripten::optional_override([](const std::string& bytes, uint8_t num_std_devs, uint64_t seed) { const auto sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed); return std::vector<double>{sketch.get_estimate(), sketch.get_lower_bound(num_std_devs), sketch.get_upper_bound(num_std_devs)}; })) .class_function("toStringFromBytes", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { return wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed).to_string(); })) .class_function("getThetaFromBytes", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { return wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed).get_theta(); })) .class_function("getNumRetainedFromBytes", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { return wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed).get_num_retained(); })) .class_function("getMaxSerializedSizeBytes", &compact_theta_sketch::get_max_serialized_size_bytes) ; emscripten::class_<theta_union>("theta_union") .constructor(emscripten::optional_override([](uint8_t lg_k, uint64_t seed) { return new theta_union(theta_union::builder().set_lg_k(lg_k).set_seed(seed).build()); })) .function("updateWithUpdateSketch", emscripten::optional_override([](theta_union& self, const update_theta_sketch& sketch) { self.update(sketch); })) .function("updateWithCompactSketch", emscripten::optional_override([](theta_union& self, const compact_theta_sketch& sketch) { self.update(sketch); })) .function("updateWithBytes", emscripten::optional_override([](theta_union& self, const std::string& bytes, uint64_t seed) { self.update(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed)); })) .function("updateWithBuffer", emscripten::optional_override([](theta_union& self, intptr_t bytes, size_t size, uint64_t seed) { self.update(wrapped_compact_theta_sketch::wrap(reinterpret_cast<void*>(bytes), size, seed)); })) .function("getResultStreamCompressed", emscripten::optional_override([](theta_union& self, intptr_t bytes, size_t size) { std::strstream stream(reinterpret_cast<char*>(bytes), size); self.get_result().serialize_compressed(stream); return (int) stream.tellp(); })) .function("getResultAsUint8ArrayCompressed", emscripten::optional_override([](theta_union& self) { auto bytes = self.get_result().serialize_compressed(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) .class_function("computeWithBytesReturnCompressed", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint8_t lg_k, uint64_t seed) { auto u = theta_union::builder().set_lg_k(lg_k).set_seed(seed).build(); u.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed)); u.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed)); const auto bytes = u.get_result().serialize_compressed(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) ; emscripten::function("thetaIntersectionCompressed", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) { theta_intersection intersection(seed); intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed)); intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed)); const auto bytes = intersection.get_result().serialize_compressed(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) ; emscripten::function("thetaAnotBCompressed", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) { const auto bytes = theta_a_not_b(seed).compute( wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed), wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed) ).serialize_compressed(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) ; emscripten::function("thetaJaccardSimilarity", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) { const auto arr = datasketches::theta_jaccard_similarity::jaccard( wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed), wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed), seed ); return std::vector<double>{arr[0], arr[1], arr[2]}; })); }