def _perform_subsample_by_count()

in distant_supervision/synthetic_data_creator.py [0:0]


    def _perform_subsample_by_count(self, rdd, subsample_count, *, tot_count=None):
        extra_frac = 2.0  # if extra_frac = 1.10, sample for 10% more data

        if tot_count is None:
            tot_count = rdd.count()
        frac = float(subsample_count) / tot_count

        if frac >= 0.99:
            return rdd

        return self._perform_subsample(rdd, frac * extra_frac)