Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
distribution.h
Go to the documentation of this file.
1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
3
4#pragma once
5
6#include "../../namespaces.h"
7
8#include <functional>
9#include <optional>
10#include <ostream>
11#include <unordered_map>
12
14
15 template<typename T>
16 using Histogram = std::unordered_map<T, std::size_t>;
17
18 template<typename InputIt, typename GroupBy>
19 auto make_histogram(InputIt begin, InputIt end, GroupBy&& group_by) {
20 Histogram<std::invoke_result_t<GroupBy, decltype(*begin)>> histogram{};
21
22 while (begin != end) {
23 auto key{std::invoke(std::forward<GroupBy>(group_by), *begin)};
24
25 histogram.try_emplace(key, 0);
26 histogram[key] += 1;
27 ++begin;
28 }
29
30 return histogram;
31 }
32
33 template<typename T>
39
40 template<typename T>
41 inline std::ostream& operator<<(std::ostream& os, const DistributionError<T>& error) {
42 return os << "DistributionError{" <<
43 "The value { " << error.value <<
44 " } appear with a probability of { " << error.probability <<
45 " } while a probability of { " << error.expected_probability << " } was expected." <<
46 "}";
47 }
48
49 // REMARK: The following should really return an Either of unit/error
50 // but std::variant in C++ is both extremely unusable and comes with a
51 // strong overhead unless certain conditions are met.
52 // For this reason, we keep to the less intutitive optional error.
53
54 /*!
55 * Returns true when the given \a sequence approximately respects a
56 * given distribution.
57 *
58 * The \a sequence respects a given distribution when the count of
59 * each collection of values is a percentage of the total values that
60 * is near the percentage probability described by distribution.
61 *
62 * The values in \a sequence are collected according to \a group_by.
63 * \a group_by, given an element of \a sequence, should return a value
64 * of some type that represent the category of the inspected value.
65 * Values that have the same category share their count.
66 *
67 * The distribution that should be respected is given by \a
68 * probability_of. \a probability_of is a function that takes a
69 * category that was produced from a call to \a group_by and returns
70 * the expect probability, in percentage, of apperance for that
71 * category.
72 *
73 * The given probability is then compared to the one found by counting
74 * the element of \a sequence under \a group_by, to ensure that it
75 * matches.
76 *
77 * The margin of error for the comparison is given, in percentage
78 * points, by \a margin.
79 * The approximation uses an absolute comparison and scales the
80 * margin inversely based on the size of \a sequence, to account for the
81 * precision of the data set itself.
82 *
83 * When the distribution is not respected, a DistributionError is
84 * returned enclosed in an optional value.
85 * The error allows reports which the first category for which the
86 * comparison failed, along with its expected probability and the one
87 * that was actually inferred from \a sequence.
88 */
89 template<typename T, typename GroupBy, typename ProbabilityOf>
90 std::optional<DistributionError<T>> respects_distribution(std::vector<T>&& sequence, GroupBy&& group_by, ProbabilityOf&& probability_of, double margin = 33) {
91 std::size_t data_point_amount{sequence.size()};
92
93 // REMARK: We scale the margin based on the data set to allow for
94 // an easier change in downstream tests.
95 // The precision required for the approximation will vary
96 // depending on how many values we generate.
97 // The amount of values we generate depends on how much time we
98 // want the tests to take.
99 // This amount may change in the future. For example, as code is
100 // added and tests are added, we might need some expensive
101 // computations here and there.
102 // Sometimes, this will increase the test suite runtime without an
103 // obvious way of improving the performance of the underlying code
104 // to reduce it.
105 // In those cases, the total run time can be decreased by running
106 // less generations for battle-tested tests.
107 // If some code has not been changed for a long time, it will have
108 // had thousands of generations by that point, giving us a good
109 // degree of certainty of it not being bugged (for whatever bugs
110 // the tests account for).
111 // Then, running a certain amount of generation is not required
112 // anymore such that some of them can be optimized out.
113 // For tests like the one using this function, where our ability
114 // to test is always dependent on the amount of generations,
115 // changing the generated amount will mean that we will need to
116 // change our conditions too, potentially changing the meaning of
117 // the test.
118 // To take this into account, we perform a scaling on the
119 // condition itself, so that if the amount of data points that are
120 // generated changes, we do not generally have to change anything
121 // in the condition.
122 //
123 // For this case, we scale logarithmically_10 for the simple
124 // reason that we tend to generate values in power of tens,
125 // starting with the 100 values default that Quickcheck used.
126 //
127 // The default value for the margin on which the scaling is based,
128 // was chosen heuristically.
129 // As we expect generation under 10^3 to be generally meaningless
130 // for this kind of testing, the value was chosen so that it would
131 // start to normalize around that amount.
132 // Deviation of about 5-10% were identified trough various
133 // generations for an amount of data points near 1000, while a
134 // deviation of about 1-3% was identified with about 10000 values.
135 // With the chosen default value, the scaling approaches those
136 // percentage points with some margin of error.
137 //
138 // We expect up to a 10%, or a bit more, deviation to be suitable
139 // for our purposes, as it would still allow for a varied
140 // distribution in downstream consumers.
141 double scaled_margin{margin * (1.0/std::log10(data_point_amount))};
142
143 auto histogram{make_histogram(sequence.begin(), sequence.end(), std::forward<GroupBy>(group_by))};
144
145 for (auto& bin : histogram) {
146 auto [key, count] = bin;
147
148 double actual_percentage{percent_of(static_cast<double>(count), static_cast<double>(data_point_amount))};
149 double expected_percentage{std::invoke(std::forward<ProbabilityOf>(probability_of), key)};
150
151 if (!(actual_percentage == Approx(expected_percentage).margin(scaled_margin)))
152 return std::make_optional(DistributionError<T>{key, actual_percentage, expected_percentage});
153 }
154
155 return std::nullopt;
156 }
157
158} // end QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE
std::optional< DistributionError< T > > respects_distribution(std::vector< T > &&sequence, GroupBy &&group_by, ProbabilityOf &&probability_of, double margin=33)
Returns true when the given sequence approximately respects a given distribution.
auto make_histogram(InputIt begin, InputIt end, GroupBy &&group_by)
#define QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE
Definition namespaces.h:14