Skip to content

Commit

Permalink
upgrade datasketches lib from 4.1.0 to 5.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
leoparente committed Jan 24, 2024
1 parent 9301d74 commit 16c0316
Show file tree
Hide file tree
Showing 22 changed files with 641 additions and 355 deletions.
5 changes: 3 additions & 2 deletions 3rd/datasketches/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ target_sources(common
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
${CMAKE_CURRENT_SOURCE_DIR}/include/optional.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
)
19 changes: 11 additions & 8 deletions 3rd/datasketches/common/include/common_defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,30 @@
#include <chrono>
#include <thread>

/// DataSketches namespace
namespace datasketches {

static const uint64_t DEFAULT_SEED = 9001;

enum resize_factor { X1 = 0, X2, X4, X8 };

template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;

// thread-safe random bit
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));
template<typename A> using string = std::basic_string<char, std::char_traits<char>, typename std::allocator_traits<A>::template rebind_alloc<char>>;

// common random declarations
// Shared random-number helpers. The objects below are declared `static`, so they
// have internal linkage: every translation unit that includes this header gets
// its own copies.
namespace random_utils {
// Entropy source; its only use visible here is seeding `rand` below.
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
// Per-thread 64-bit Mersenne Twister engine, seeded with one draw from `rd`.
static thread_local std::mt19937_64 rand(rd());
// Per-thread distribution producing doubles in [0.0, 1.0).
static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
}

// thread-safe random bit
// Per-thread engine that yields a single random bit per call (an
// independent_bits_engine of width 1 layered over std::mt19937).
// The seed combines the system clock's tick count since the epoch with a hash
// of the current thread id, then is narrowed to uint32_t.
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));

/**
 * Reseeds the `rand` engine with the given value.
 * NOTE(review): `rand` is presumably the thread_local std::mt19937_64 declared in
 * random_utils, in which case only the calling thread's engine is reseeded — confirm.
 * `random_bit` is not touched by this function.
 * @param s new seed value
 */
inline void override_seed(uint64_t s) {
rand.seed(s);
}
}

// utility function to hide unused compiler warning
// usually has no additional cost
Expand Down
2 changes: 0 additions & 2 deletions 3rd/datasketches/common/include/count_zeros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@

#include <cstdint>

#include <stdio.h>

namespace datasketches {

static const uint8_t byte_leading_zeros_table[256] = {
Expand Down
15 changes: 9 additions & 6 deletions 3rd/datasketches/common/include/kolmogorov_smirnov.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@

namespace datasketches {

/**
* Kolmogorov-Smirnov test for KLL or Quantiles sketches
*/
class kolmogorov_smirnov {
public:
/**
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @return the raw delta between two KLL quantile sketches
*/
template<typename Sketch>
Expand All @@ -39,8 +42,8 @@ class kolmogorov_smirnov {
* Adjusts the computed threshold by the error epsilons of the two given sketches.
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return the adjusted threshold to be compared with the raw delta
*/
Expand All @@ -52,8 +55,8 @@ class kolmogorov_smirnov {
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
* this will return false.
* @param sketch1 KLL sketch 1
* @param sketch2 KLL sketch 2
* @param sketch1 sketch 1
* @param sketch2 sketch 2
* @param p Target p-value. Typically .001 to .1, e.g., .05.
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
* reflect the same underlying distribution) using the provided p-value.
Expand Down
148 changes: 148 additions & 0 deletions 3rd/datasketches/common/include/optional.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef _OPTIONAL_HPP_
#define _OPTIONAL_HPP_

// This is a simplistic substitute for std::optional until we require C++17

#if (__cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L))
#include <optional>
using std::optional;
#else

#include <new>
#include <type_traits>
#include <utility>

namespace datasketches {

/**
 * Simplistic substitute for std::optional for pre-C++17 builds.
 *
 * Holds either no value or one value of type T in in-place storage (an
 * anonymous union member), with a flag recording whether the value is alive.
 * Dereferencing an empty optional is undefined behavior, as with std::optional;
 * check with operator bool first.
 *
 * @tparam T type of the contained value
 */
template<typename T>
class optional {
public:

  /// Creates an empty optional; no T is constructed.
  optional() noexcept: initialized_(false) {}

  /**
   * Creates an optional holding a copy of the given value.
   * @param value value to copy into the optional
   */
  optional(const T& value) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    new (&value_) T(value);
    initialized_ = true;
  }

  /**
   * Creates an optional holding a value moved from the given one.
   * @param value value to move into the optional
   */
  optional(T&& value) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    new (&value_) T(std::move(value));
    initialized_ = true;
  }

  /**
   * Conversion from an optional of a compatible type.
   * The result is empty if the source is empty, otherwise T is constructed from the source value.
   * @param other optional to convert from
   */
  template<typename TT>
  optional(const optional<TT>& other) noexcept(std::is_nothrow_constructible<T, TT>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /**
   * Copy constructor.
   * @param other optional to copy from
   */
  optional(const optional& other) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /**
   * Move constructor. The contained value (if any) is moved from, but the source
   * stays engaged, holding a moved-from T.
   * @param other optional to move from
   */
  optional(optional&& other) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
  }

  /// Destroys the contained value, if any.
  ~optional() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
  }

  /// @return true if a value is present
  explicit operator bool() const noexcept {
    return initialized_;
  }

  /**
   * Copy assignment.
   * Assigns into the existing value when both sides are engaged, constructs a new
   * value when only the source is engaged, and resets when only this is engaged.
   * @param other optional to copy from
   * @return reference to this
   */
  optional& operator=(const optional& other)
  noexcept(std::is_nothrow_copy_constructible<T>::value && std::is_nothrow_copy_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = other.value_;
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(other.value_);
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Move assignment; same case analysis as copy assignment, but moving the value.
   * @param other optional to move from
   * @return reference to this
   */
  optional& operator=(optional&& other)
  noexcept(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = std::move(other.value_);
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(std::move(other.value_));
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Constructs the contained value in place from the given arguments.
   * Any value already held is destroyed first, so re-emplacing does not leak it.
   * Arguments are perfectly forwarded, so rvalues are moved and move-only types work.
   * If the constructor of T throws, the optional is left empty.
   * @param args constructor arguments for T
   */
  template<typename... Args>
  void emplace(Args&&... args)
  noexcept(std::is_nothrow_constructible<T, Args...>::value && std::is_nothrow_destructible<T>::value) {
    // end the lifetime of the old value before reusing its storage
    reset();
    new (&value_) T(std::forward<Args>(args)...);
    initialized_ = true;
  }

  // Unchecked access to the contained value; the optional must be engaged.
  T& operator*() & noexcept { return value_; }
  const T& operator*() const & noexcept { return value_; }
  T&& operator*() && noexcept { return std::move(value_); }
  const T&& operator*() const && noexcept { return std::move(value_); }

  // Unchecked member access to the contained value; the optional must be engaged.
  T* operator->() noexcept { return &value_; }
  const T* operator->() const noexcept { return &value_; }

  /// Destroys the contained value (if any) and leaves the optional empty.
  void reset() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
    initialized_ = false;
  }

private:
  // Anonymous union gives correctly sized and aligned storage for T
  // without constructing or destroying it automatically.
  union {
    T value_;
  };
  bool initialized_;

  // for converting constructor
  template<typename TT> friend class optional;
};

} // namespace

#endif // C++17

#endif // _OPTIONAL_HPP_
97 changes: 95 additions & 2 deletions 3rd/datasketches/common/include/quantiles_sorted_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,129 @@

namespace datasketches {

/**
* Sorted view for quantiles sketches (REQ, KLL and Quantiles)
*/
template<
typename T,
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
typename Allocator
>
class quantiles_sorted_view {
public:
/// Entry type
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
using Container = std::vector<Entry, AllocEntry>;

/// @private
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);

/// @private
template<typename Iterator>
void add(Iterator begin, Iterator end, uint64_t weight);

/// @private
void convert_to_cummulative();

class const_iterator;

/**
* Iterator pointing to the first entry in the view.
* If the view is empty, the returned iterator must not be dereferenced or incremented.
* @return iterator pointing to the first entry
*/
const_iterator begin() const;

/**
* Iterator pointing to the past-the-end entry in the view.
* The past-the-end entry is the hypothetical entry that would follow the last entry.
* It does not point to any entry, and must not be dereferenced or incremented.
* @return iterator pointing to the past-the-end entry
*/
const_iterator end() const;

/// @return size of the view
size_t size() const;

/**
* Returns an approximation to the normalized rank of the given item.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param item to be ranked
* @param inclusive if true the weight of the given item is included into the rank.
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
* according to the Comparator.
*
* @return an approximate normalized rank of the given item (0 to 1 inclusive)
*/
double get_rank(const T& item, bool inclusive = true) const;

/**
* Quantile return type.
* This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
*/
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;

/**
* Returns an item from the sketch that is the best approximation to an item
* from the original stream with the given normalized rank.
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param rank of an item in the hypothetical sorted stream.
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
*
* @return approximate quantile associated with the given normalized rank
*/
quantile_return_type get_quantile(double rank, bool inclusive = true) const;

using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;

/**
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
* cumulative analog of the PMF, of the input stream given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a split point, then in CDF such items are
* included into the interval to the left of split point. Otherwise they are included into
* the interval to the right of split point.
*
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
* of the input stream given the split_points. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array. This can be viewed as array of ranks of the given split points plus one more value
* that is always 1.
*/
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;

/**
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
* given a set of split points (items).
*
* <p>If the view is empty this throws std::runtime_error.
*
* @param split_points an array of <i>m</i> unique, monotonically increasing items
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
*
* @param size the number of split points in the array
*
* @param inclusive if true the rank of an item includes its own weight, and therefore
* if the sketch contains items equal to a split point, then in PMF such items are
* included into the interval to the left of split point. Otherwise they are included into the interval
* to the right of split point.
*
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
*/
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;

private:
Expand Down Expand Up @@ -122,8 +214,6 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi
using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;

const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}

template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
const value_type operator*() const { return Base::operator*(); }

Expand All @@ -147,6 +237,9 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi

private:
Base begin;

friend class quantiles_sorted_view<T, C, A>;
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
};

} /* namespace datasketches */
Expand Down
Loading

0 comments on commit 16c0316

Please sign in to comment.