in benchmarks/src/main/java/org/apache/omid/benchmarks/utils/ZipfianGenerator.java [221:264]
/**
 * Generates the next item of a Zipfian distribution over {@code itemcount} items, following
 * "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, SIGMOD 1994.
 *
 * <p>If {@code itemcount} differs from the item count the cached zeta value was computed for,
 * {@code zetan} and {@code eta} are refreshed while holding this object's monitor:
 * incrementally when the count grew, or from scratch (slow) when it shrank and
 * {@code allowitemcountdecrease} is set. When the count shrank and decreases are not allowed,
 * the cached values are used unchanged.
 *
 * <p>Every returned value is offset by {@code base}, including the two fast paths for the
 * most popular items.
 *
 * @param itemcount number of items to draw from
 * @return the next generated item, offset by {@code base}
 */
public long nextLong(long itemcount) {
    if (itemcount != countforzeta) {
        // zetan and eta depend on the item count, so they have to be recomputed.
        synchronized (this) {
            if (itemcount > countforzeta) {
                // Items were added: zetan can be extended incrementally, which is cheaper
                // than recomputing it from scratch.
                zetan = zeta(countforzeta, itemcount, theta, zetan);
                // NOTE(review): eta is derived from the field `items`, not from `itemcount`;
                // confirm this is intended when the item count changes.
                eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan);
            } else if ((itemcount < countforzeta) && (allowitemcountdecrease)) {
                // Items were removed: zetan has to be recomputed from scratch.
                // Note: for large item sets this is very slow, so avoid it.
                // TODO: a negative incremental computation is possible: subtract the zeta
                // sequence terms of the items that went away instead of starting over.
                System.err.println("WARNING: Recomputing Zipfian distribtion. This is slow and should be avoided. (itemcount=" + itemcount + " countforzeta=" + countforzeta + ")");
                zetan = zeta(itemcount, theta);
                eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan);
            }
        }
    }

    double u = random.nextDouble();
    double uz = u * zetan;

    // Fast paths for the two most popular items. They must carry the same `base` offset as
    // the general case below; returning the bare 0 / 1 would ignore the offset.
    // NOTE(review): these fast paths do not call setLastInt, so the recorded last value is
    // only refreshed by the general case; confirm whether that is intended.
    if (uz < 1.0) {
        return base;
    }
    if (uz < 1.0 + Math.pow(0.5, theta)) {
        return base + 1;
    }

    long ret = base + (long) ((itemcount) * Math.pow(eta * u - eta + 1, alpha));
    setLastInt((int) ret);
    return ret;
}