in torchbiggraph/train_cpu.py
import math

# ConfigSchema, EDGE_STORAGES and UNPARTITIONED are imported near the top of
# train_cpu.py; the module paths below are assumptions based on the layout of
# the torchbiggraph package, not lines copied from the file:
# from torchbiggraph.config import ConfigSchema
# from torchbiggraph.graph_storages import EDGE_STORAGES
# from torchbiggraph.types import UNPARTITIONED


def get_num_edge_chunks(config: ConfigSchema) -> int:
    """Return the number of chunks each bucket's edges are split into.

    Use the configured value if one is given; otherwise estimate it by
    dividing an estimated bucket size by config.max_edges_per_chunk.
    """
    if config.num_edge_chunks is not None:
        return config.num_edge_chunks

    max_edges_per_bucket = 0
    # We should check all edge paths, all lhs partitions and all rhs
    # partitions, but the combinatorial explosion could lead to thousands of
    # checks. Let's assume that edges are uniformly distributed among buckets
    # (this is not exactly the case: it's the entities that are uniformly
    # distributed among the partitions, and edge assignments to buckets are a
    # function of that, so, for example, very high-degree entities could skew
    # this), and use the size of bucket (0, 0) as an estimate of the average
    # bucket size. We still check all edge paths, as there could be semantic
    # differences between them which lead to different sizes.
    for edge_path in config.edge_paths:
        edge_storage = EDGE_STORAGES.make_instance(edge_path)
        max_edges_per_bucket = max(
            max_edges_per_bucket,
            edge_storage.get_number_of_edges(UNPARTITIONED, UNPARTITIONED),
        )
    return max(1, math.ceil(max_edges_per_bucket / config.max_edges_per_chunk))
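

# For intuition, a minimal standalone sketch of the chunk-count arithmetic in
# the return statement above. The bucket size and per-chunk cap below are
# made-up numbers for illustration, not library defaults.
max_edges_per_bucket = 250_000_000  # hypothetical estimate from bucket (0, 0)
max_edges_per_chunk = 100_000_000   # hypothetical config.max_edges_per_chunk

# Ceil-divide so a final, partially filled chunk still counts as a chunk;
# max(1, ...) guards the empty-bucket case (0 edges still yields 1 chunk).
num_edge_chunks = max(1, math.ceil(max_edges_per_bucket / max_edges_per_chunk))
print(num_edge_chunks)  # prints 3, i.e. ceil(250e6 / 100e6)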