# do_inference()
#
# in src/engine/step4/model_dev/t5_evaluation.py [0:0]


def do_inference(args):
    """
    Run batched T5 inference over input questions (for parallel inference).

    Args:
        args (tuple): ``(model_test, df, device)`` — the model wrapper
            (exposing ``tokenizer``, ``hparams`` and ``model``), a dataframe
            with an ``unfolded_questions`` column, and the GPU device number.

    Returns:
        list[str]: The inferred SQL query templates, one per input question.
    """
    model_test, df, device = args

    # Prefix each question with the T5 translation task instruction.
    proc_input = [
        "translate English to SQL: %s" % input_text
        for input_text in df["unfolded_questions"]
    ]
    batch_size = 16
    model_test.eval()
    queries = []
    total = df.shape[0]
    try:
        with torch.no_grad():
            for i in range(0, len(proc_input), batch_size):
                print(f"{i}/{total} done")
                batch = proc_input[i : i + batch_size]
                inputs = model_test.tokenizer.batch_encode_plus(
                    batch,
                    max_length=model_test.hparams.max_input_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                )
                inputs = inputs.to(f"cuda:{device}")
                output = model_test.model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=model_test.hparams.max_output_length,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                )
                # Move generated token ids back to host memory immediately
                # so GPU memory is freed batch by batch.
                queries.extend(output.cpu().numpy().tolist())

        queries = model_test.tokenizer.batch_decode(queries)
        # Strip special tokens; [ / ] are mapped to < / > — presumably the
        # templates encode angle-bracket placeholders as square brackets
        # because the tokenizer reserves < / > — TODO confirm with training data.
        queries = [
            sql.replace("<pad>", "")
            .replace("</s>", "")
            .replace("[", "<")
            .replace("]", ">")
            .strip()
            for sql in queries
        ]
    finally:
        # Always move the model back to CPU, even if generation fails
        # mid-way. The original code had this call inside the try body with
        # a no-op `finally: pass`, so a failure leaked the model on the GPU.
        model_test.to("cpu")

    return queries