in distant_supervision/synthetic_data_creator.py [0:0]
def _perform_subsample_by_count(self, rdd, subsample_count, *, tot_count=None):
    """Subsample ``rdd`` so that roughly ``subsample_count`` elements are requested.

    The target count is converted into a fraction of the total and handed to
    ``self._perform_subsample``. The fraction is deliberately inflated by
    ``extra_frac`` so the sampled result over-shoots the requested count.

    Args:
        rdd: Collection to subsample; must provide ``count()`` when
            ``tot_count`` is not supplied.
        subsample_count: Desired number of elements.
        tot_count: Optional pre-computed size of ``rdd``. When ``None`` it is
            obtained via ``rdd.count()``.

    Returns:
        ``rdd`` itself when it is empty or when the requested count is at
        least 99% of the total; otherwise whatever
        ``self._perform_subsample`` returns for the inflated fraction.
    """
    extra_frac = 2.0  # if extra_frac = 1.10, sample for 10% more data

    if tot_count is None:
        tot_count = rdd.count()

    # Nothing to subsample from; also avoids dividing by zero below.
    if tot_count == 0:
        return rdd

    frac = float(subsample_count) / tot_count

    # Asking for (nearly) everything: skip sampling altogether.
    if frac >= 0.99:
        return rdd

    # NOTE(review): for 0.5 <= frac < 0.99 the inflated fraction exceeds 1.0;
    # confirm that _perform_subsample tolerates fractions above 1.
    return self._perform_subsample(rdd, frac * extra_frac)