in torchserve/inf2/llama2/workspace/inf2_handler.py [0:0]
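# The imports below are not part of this excerpt; this is a sketch of what the
# surrounding file is assumed to pull in, inferred from the calls inside
# initialize(). self.handle is likewise assumed to be a
# ts.handler_utils.micro_batching.MicroBatching handle created in __init__.
#
#   import logging
#   import os
#
#   import torch_neuronx
#   from transformers import AutoConfig, LlamaTokenizer
#   from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
#   from transformers_neuronx.llama.model import LlamaForSampling
#   from ts.handler_utils.hf_batch_streamer import TextIteratorStreamerBatch
#
#   logger = logging.getLogger(__name__)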
def initialize(self, ctx):
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    model_checkpoint_dir = ctx.model_yaml_config.get("handler", {}).get(
        "model_checkpoint_dir", ""
    )
    model_checkpoint_path = f"{model_dir}/{model_checkpoint_dir}"

    os.environ["NEURONX_CACHE"] = "on"
    os.environ["NEURONX_DUMP_TO"] = f"{model_dir}/neuron_cache"
    os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
    # micro batching initialization
    micro_batching_parallelism = ctx.model_yaml_config.get(
        "micro_batching", {}
    ).get("parallelism", None)
    if micro_batching_parallelism:
        logger.info(
            f"Setting micro batching parallelism from model_config_yaml: {micro_batching_parallelism}"
        )
        self.handle.parallelism = micro_batching_parallelism

    micro_batch_size = ctx.model_yaml_config.get("micro_batching", {}).get(
        "micro_batch_size", 1
    )
    logger.info(f"Setting micro batching size: {micro_batch_size}")
    self.handle.micro_batch_size = micro_batch_size
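    # For illustration, the micro-batching settings above would be driven by a
    # model-config.yaml section like the following (key names match the .get()
    # calls; the concrete values here are made up, not the example's defaults):
    #
    #   micro_batching:
    #     micro_batch_size: 4
    #     parallelism:
    #       preprocess: 2
    #       inference: 1
    #       postprocess: 2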
    # settings for model compilation and loading
    amp = ctx.model_yaml_config.get("handler", {}).get("amp", "f32")
    tp_degree = ctx.model_yaml_config.get("handler", {}).get("tp_degree", 6)
    self.max_length = ctx.model_yaml_config.get("handler", {}).get("max_length", 50)
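    # The corresponding handler section of model-config.yaml would look roughly
    # like this (values are illustrative; model_checkpoint_dir is also read near
    # the top of this method):
    #
    #   handler:
    #     model_checkpoint_dir: "llama-2-13b-split"
    #     amp: "bf16"
    #     tp_degree: 6
    #     max_length: 50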
    # allocate "tp_degree" number of neuron cores to the worker process
    os.environ["NEURON_RT_NUM_CORES"] = str(tp_degree)
    try:
        num_neuron_cores_available = (
            torch_neuronx.xla_impl.data_parallel.device_count()
        )
        assert num_neuron_cores_available >= int(tp_degree)
    except (RuntimeError, AssertionError) as error:
        logger.error(
            f"The number of neuron cores required for tp_degree {tp_degree} "
            f"is not available: {error}"
        )
        raise error
    # Llama does not define a pad token, so reuse EOS for padding
    self.tokenizer = LlamaTokenizer.from_pretrained(model_checkpoint_path)
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.model = LlamaForSampling.from_pretrained(
        model_checkpoint_path,
        batch_size=self.handle.micro_batch_size,
        amp=amp,
        tp_degree=tp_degree,
    )
    logger.info("Starting to compile the model")
    # to_neuron() traces and compiles the model for the allocated NeuronCores;
    # the first run can take a while, later runs reuse the cache set up above
    self.model.to_neuron()
    logger.info("Model has been successfully compiled")
    # wrap the compiled model so it exposes the Hugging Face generate() API
    model_config = AutoConfig.from_pretrained(model_checkpoint_path)
    self.model = HuggingFaceGenerationModelAdapter(model_config, self.model)
    # streamer that fans generated tokens out per micro-batch for response streaming
    self.output_streamer = TextIteratorStreamerBatch(
        self.tokenizer,
        batch_size=self.handle.micro_batch_size,
        skip_special_tokens=True,
    )
    self.initialized = True
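
# For context, the inference path elsewhere in the handler is expected to drive
# the objects initialized above roughly like this hypothetical sketch (argument
# values are illustrative, not the file's actual code):
#
#   inputs = self.tokenizer(prompts, return_tensors="pt", padding=True)
#   self.model.generate(
#       inputs.input_ids,
#       max_length=self.max_length,
#       streamer=self.output_streamer,
#   )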