dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py [403:551]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Hyperparameter overrides for the TF-models DLRM ranking trainer; this dict
  # is serialized into the --params_override flag of the train.py command below.
  # NOTE(review): boolean-like fields are spelled as the strings "true"/"false"
  # — presumably the trainer's params parser coerces them; confirm.
  params_override = {
      "runtime": {
          "distribution_strategy": "tpu",
          "mixed_precision_dtype": "mixed_bfloat16",
      },
      "task": {
          "use_synthetic_data": "false",
          "use_tf_record_reader": "true",
          "train_data": {
              "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/train/day_*/*",
              "global_batch_size": global_batch_size,
          },
          "validation_data": {
              "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/eval/day_*/*",
              "global_batch_size": global_batch_size,
          },
          "model": {
              "interaction": "multi_layer_dcn",
              "dcn_num_layers": 3,
              "dcn_low_rank_dim": 512,
              "num_dense_features": 13,
              "bottom_mlp": bottom_mlp,
              "embedding_dim": embedding_dim,
              "top_mlp": [1024, 1024, 512, 256, 1],
              # One vocabulary size per categorical feature (26 entries).
              "vocab_sizes": [
                  40000000,
                  39060,
                  17295,
                  7424,
                  20265,
                  3,
                  7122,
                  1543,
                  63,
                  40000000,
                  3067956,
                  405282,
                  10,
                  2209,
                  11938,
                  155,
                  4,
                  976,
                  14,
                  40000000,
                  40000000,
                  40000000,
                  590152,
                  12973,
                  108,
                  36,
              ],
              # Per-feature multi-hot lookup counts; parallel to vocab_sizes
              # (26 entries).
              "multi_hot_sizes": [
                  3,
                  2,
                  1,
                  2,
                  6,
                  1,
                  1,
                  1,
                  1,
                  7,
                  3,
                  8,
                  1,
                  6,
                  9,
                  5,
                  1,
                  1,
                  1,
                  12,
                  100,
                  27,
                  10,
                  3,
                  1,
                  1,
              ],
              "use_multi_hot": "true",
              "concat_dense": "false",
              "dcn_use_bias": "true",
              "max_ids_per_chip_per_sample": 128,
              "max_ids_per_table": 15000,
              "max_unique_ids_per_table": 4096,
              "initialize_tables_on_host": "false",
              "use_partial_tpu_embedding": "false",
              "size_threshold": 0,
          },
      },
      "trainer": {
          "use_orbit": "true",
          "validation_interval": 1000,
          "checkpoint_interval": 0,
          "validation_steps": 1000,
          "train_steps": train_steps,
          "optimizer_config": {
              "embedding_optimizer": "SGD",
              "lr_config": {
                  "decay_exp": 1.6,
                  "decay_start_steps": 150000,
                  "decay_steps": 136054,
                  "learning_rate": 30,
                  "warmup_steps": 8000,
              },
          },
      },
  }

  # NOTE(review): placeholder that is unconditionally overwritten with the
  # timestamped GCS path a few lines below — dead assignment.
  model_dir = "/tmp"

  params_override["trainer"]["pipeline_sparse_and_dense_execution"] = "true"
  # NOTE(review): tpu_id is assigned but never used in the visible code —
  # confirm whether the Airflow Variable lookup is still needed.
  tpu_id = Variable.get(benchmark_id, default_var=None)
  # TODO (ericlefort): Replace the model_dir with this line when the var is available
  # model_dir = metric_config.SshEnvVars.GCS_OUTPUT.value + f"/dlrm/v5p/{benchmark_id}"
  # Unix timestamp makes model_dir unique per run, so each run writes to a
  # fresh output directory.
  epoch = time.time()
  model_dir = f"{gcs_bucket.BASE_OUTPUT_DIR}/{test_owner.Team.SOLUTIONS_TEAM.value}/dlrm/{benchmark_id}/{epoch}"

  # Clean out the prior checkpoint if it exists
  # NOTE(review): no explicit cleanup actually happens here — the timestamped
  # model_dir above guarantees a fresh directory instead; confirm intent.
  # NOTE(review): interpolating the dict produces its Python repr (single-quoted
  # keys) inside a single-quoted shell argument — verify the quoting survives
  # and that the receiving flag parser accepts this format.
  run_model_cmds = (
      (
          f"cd /usr/share/tpu/models && {env_variable} &&"
          " python3 official/recommendation/ranking/train.py"
          f" --model_dir={model_dir} {extraFlags}"
          f" --params_override='{params_override}'"
      ),
  )

  # TPU VM test definition: TPU topology plus setup/run commands and timeout.
  job_test_config = test_config.TpuVmTest(
      test_config.Tpu(
          version=tpu_version,
          cores=tpu_cores,
          runtime_version=runtime_version,
          reserved=True,
          network=network,
          subnetwork=subnetwork,
      ),
      test_name=test_name,
      set_up_cmds=set_up_cmds,
      run_model_cmds=run_model_cmds,
      timeout=datetime.timedelta(minutes=time_out_in_min),
      task_owner=test_owner.CHANDRA_D,
  )

  return task.run_queued_resource_test(
      task_test_config=job_test_config,
      task_gcp_config=job_gcp_config,
      tpu_name_env_var=is_pod,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



dags/solutions_team/configs/tensorflow/solutionsteam_tf_release_supported_config.py [183:331]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Hyperparameter overrides for the TF-models DLRM ranking trainer; this dict
  # is serialized into the --params_override flag of the train.py command below.
  # NOTE(review): boolean-like fields are spelled as the strings "true"/"false"
  # — presumably the trainer's params parser coerces them; confirm.
  params_override = {
      "runtime": {
          "distribution_strategy": "tpu",
          "mixed_precision_dtype": "mixed_bfloat16",
      },
      "task": {
          "use_synthetic_data": "false",
          "use_tf_record_reader": "true",
          "train_data": {
              "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/train/day_*/*",
              "global_batch_size": global_batch_size,
          },
          "validation_data": {
              "input_path": "gs://zyc_dlrm/dataset/tb_tf_record_train_val/eval/day_*/*",
              "global_batch_size": global_batch_size,
          },
          "model": {
              "interaction": "multi_layer_dcn",
              "dcn_num_layers": 3,
              "dcn_low_rank_dim": 512,
              "num_dense_features": 13,
              "bottom_mlp": bottom_mlp,
              "embedding_dim": embedding_dim,
              "top_mlp": [1024, 1024, 512, 256, 1],
              # One vocabulary size per categorical feature (26 entries).
              "vocab_sizes": [
                  40000000,
                  39060,
                  17295,
                  7424,
                  20265,
                  3,
                  7122,
                  1543,
                  63,
                  40000000,
                  3067956,
                  405282,
                  10,
                  2209,
                  11938,
                  155,
                  4,
                  976,
                  14,
                  40000000,
                  40000000,
                  40000000,
                  590152,
                  12973,
                  108,
                  36,
              ],
              # Per-feature multi-hot lookup counts; parallel to vocab_sizes
              # (26 entries).
              "multi_hot_sizes": [
                  3,
                  2,
                  1,
                  2,
                  6,
                  1,
                  1,
                  1,
                  1,
                  7,
                  3,
                  8,
                  1,
                  6,
                  9,
                  5,
                  1,
                  1,
                  1,
                  12,
                  100,
                  27,
                  10,
                  3,
                  1,
                  1,
              ],
              "use_multi_hot": "true",
              "concat_dense": "false",
              "dcn_use_bias": "true",
              "max_ids_per_chip_per_sample": 128,
              "max_ids_per_table": 15000,
              "max_unique_ids_per_table": 4096,
              "initialize_tables_on_host": "false",
              "use_partial_tpu_embedding": "false",
              "size_threshold": 0,
          },
      },
      "trainer": {
          "use_orbit": "true",
          "validation_interval": 1000,
          "checkpoint_interval": 0,
          "validation_steps": 1000,
          "train_steps": train_steps,
          "optimizer_config": {
              "embedding_optimizer": "SGD",
              "lr_config": {
                  "decay_exp": 1.6,
                  "decay_start_steps": 150000,
                  "decay_steps": 136054,
                  "learning_rate": 30,
                  "warmup_steps": 8000,
              },
          },
      },
  }

  # NOTE(review): placeholder that is unconditionally overwritten with the
  # timestamped GCS path a few lines below — dead assignment.
  model_dir = "/tmp"

  params_override["trainer"]["pipeline_sparse_and_dense_execution"] = "true"
  # NOTE(review): tpu_id is assigned but never used in the visible code —
  # confirm whether the Airflow Variable lookup is still needed.
  tpu_id = Variable.get(benchmark_id, default_var=None)
  # TODO (ericlefort): Replace the model_dir with this line when the var is available
  # model_dir = metric_config.SshEnvVars.GCS_OUTPUT.value + f"/dlrm/v5p/{benchmark_id}"
  # Unix timestamp makes model_dir unique per run, so each run writes to a
  # fresh output directory.
  epoch = time.time()
  model_dir = f"{gcs_bucket.BASE_OUTPUT_DIR}/{test_owner.Team.SOLUTIONS_TEAM.value}/dlrm/{benchmark_id}/{epoch}"

  # Clean out the prior checkpoint if it exists
  # NOTE(review): no explicit cleanup actually happens here — the timestamped
  # model_dir above guarantees a fresh directory instead; confirm intent.
  # NOTE(review): interpolating the dict produces its Python repr (single-quoted
  # keys) inside a single-quoted shell argument — verify the quoting survives
  # and that the receiving flag parser accepts this format.
  run_model_cmds = (
      (
          f"cd /usr/share/tpu/models && {env_variable} &&"
          " python3 official/recommendation/ranking/train.py"
          f" --model_dir={model_dir} {extraFlags}"
          f" --params_override='{params_override}'"
      ),
  )

  # TPU VM test definition: TPU topology plus setup/run commands and timeout.
  job_test_config = test_config.TpuVmTest(
      test_config.Tpu(
          version=tpu_version,
          cores=tpu_cores,
          runtime_version=runtime_version,
          reserved=True,
          network=network,
          subnetwork=subnetwork,
      ),
      test_name=test_name,
      set_up_cmds=set_up_cmds,
      run_model_cmds=run_model_cmds,
      timeout=datetime.timedelta(minutes=time_out_in_min),
      task_owner=test_owner.CHANDRA_D,
  )

  return task.run_queued_resource_test(
      task_test_config=job_test_config,
      task_gcp_config=job_gcp_config,
      tpu_name_env_var=is_pod,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



