mesh_tensorflow/transformer/heterogeneous_moe.py [39:61]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  def __init__(self,
               num_experts=16,
               loss_coef=1e-2,
               hidden_size=4096,
               group_size=1024,
               capacity_factor_train=1.25,
               capacity_factor_eval=2.0,
               use_second_place_loss=False,
               second_policy_train="random",
               second_policy_eval="random",
               second_threshold_train=0.2,
               second_threshold_eval=0.2,
               dropout_rate=0.0,
               activation="relu",
               moe_gating="top_2",
               min_expert_capacity=4,
               switch_policy_train="input_jitter",
               switch_policy_eval="input_jitter",
               switch_dropout=0.1,
               switch_temperature=1.0,
               switch_jitter=1e-2,
               ntlb_top_k=4,
               output_dim=None,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



mesh_tensorflow/transformer/moe.py [40:62]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  def __init__(self,
               num_experts=16,
               loss_coef=1e-2,
               hidden_size=4096,
               group_size=1024,
               capacity_factor_train=1.25,
               capacity_factor_eval=2.0,
               use_second_place_loss=False,
               second_policy_train="random",
               second_policy_eval="random",
               second_threshold_train=0.2,
               second_threshold_eval=0.2,
               dropout_rate=0.0,
               activation="relu",
               moe_gating="top_2",
               min_expert_capacity=4,
               switch_policy_train="input_jitter",
               switch_policy_eval="input_jitter",
               switch_dropout=0.1,
               switch_temperature=1.0,
               switch_jitter=1e-2,
               ntlb_top_k=4,
               output_dim=None,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



