evaluator/evaluator.py:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from modelscope import Model
from modelscope.models.nlp.llama2 import Llama2Tokenizer


def load_models_tokenizer(args):
    """Load a HuggingFace causal LM and its tokenizer from args.checkpoint_path,
    configured for deterministic (greedy) evaluation."""
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path, use_fast=False, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path)
    model.generation_config.do_sample = False  # use greedy decoding
    model.generation_config.repetition_penalty = 1.0  # disable repetition penalty
    return model, tokenizer


def load_llama_models_tokenizer(args):
    """Load a Llama 2 model and tokenizer via ModelScope, placing the model on
    the GPU selected by args.gpu."""
    tokenizer = Llama2Tokenizer.from_pretrained(args.checkpoint_path)
    model = Model.from_pretrained(
        args.checkpoint_path, device_map=f"cuda:{args.gpu}"
    )
    model.generation_config.do_sample = False  # use greedy decoding
    model.generation_config.repetition_penalty = 1.0  # disable repetition penalty
    return model, tokenizer
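

# Usage sketch, not part of the evaluation harness itself. Assumptions:
# "path/to/checkpoint" is a hypothetical placeholder for a local or hub
# checkpoint, and `args` only needs the attributes the loaders above read
# (`checkpoint_path` for both, plus `gpu` for the ModelScope/Llama 2 loader).
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(checkpoint_path="path/to/checkpoint", gpu=0)

    # For a Llama 2 checkpoint served through ModelScope, use
    # load_llama_models_tokenizer(args) instead.
    model, tokenizer = load_models_tokenizer(args)

    # Greedy decoding is already enforced by the generation_config set above,
    # so repeated runs on the same prompt should be deterministic.
    inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=16)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))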