in src/datatrove/utils/stats.py [0:0]
def __add__(self, other):
    """Combine this stats object with ``other`` and return a new instance.

    ``other`` may be an instance of the same class or any value accepted by
    this class's ``from_dict`` constructor; in the latter case it is
    converted first. Neither operand is modified.

    The count and total are summed, min/max are the extremes of both
    operands, and the mean and ``_running_variance`` are merged with the
    pairwise update described at
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    The resulting unit is taken from ``self`` unless it is ``"doc"``, in
    which case ``other``'s unit is used.
    """
    cls = type(self)
    if not isinstance(other, cls):
        other = cls.from_dict(other)

    count = self.n + other.n
    if count > 0:
        # Pairwise merge of the two partial aggregates (see link in docstring).
        mean_gap = self.mean - other.mean
        merged_mean = (self.n * self.mean + other.n * other.mean) / count
        merged_running_variance = (
            self._running_variance + other._running_variance + (mean_gap * mean_gap * self.n * other.n) / count
        )
    else:
        # Nothing has been observed on either side: avoid dividing by zero.
        merged_mean = 0.0
        merged_running_variance = 0.0

    # "doc" is treated as the fallback unit, so any other unit on self wins.
    merged_unit = other.unit if self.unit == "doc" else self.unit

    return cls(
        total=self.total + other.total,
        n=count,
        mean=merged_mean,
        min=min(self.min, other.min),
        max=max(self.max, other.max),
        _running_variance=merged_running_variance,
        unit=merged_unit,
    )