in phi3/olive/phi3.py [0:0]
def genai_run(prompt, model_path, max_length):
    """Generate a completion for ``prompt``, streaming it to stdout with timing stats.

    Loads a model through ``og`` (bound outside this function; presumably
    ``onnxruntime_genai`` -- confirm against the file's imports), tokenizes
    the prompt, then generates one token per loop iteration and prints each
    decoded token as soon as it is available. Pressing Ctrl+C stops the
    generation loop without propagating the exception; the summary line is
    still printed afterwards.

    Args:
        prompt: Text passed to ``tokenizer.encode`` and echoed before the
            generated output.
        model_path: Passed unchanged to ``og.Model``.
        max_length: Value used for the ``max_length`` search option.

    Returns:
        None. All results are written to stdout.
    """
    print("\nModel inference starts...")

    print("Loading model...")
    # perf_counter() is monotonic, so the measured intervals cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-run.
    load_started = time.perf_counter()
    model = og.Model(model_path)
    load_seconds = time.perf_counter() - load_started
    print(f"Model loaded in {load_seconds:.2f} seconds")

    print("Creating tokenizer...")
    tokenizer = og.Tokenizer(model)
    tokenizer_stream = tokenizer.create_stream()
    input_tokens = tokenizer.encode(prompt)
    # The generation clock starts after tokenization, so it covers generator
    # construction plus the whole decode loop.
    started_timestamp = time.perf_counter()

    print("Creating generator ...")
    params = og.GeneratorParams(model)
    # optimal search options for Phi3
    search_options = {
        "max_length": max_length,
        "top_k": 40,
        "top_p": 0.95,
        "temperature": 0.8,
        "repetition_penalty": 1.0,
    }
    params.set_search_options(**search_options)
    params.input_ids = input_tokens
    generator = og.Generator(model, params)
    print("Generator created")

    # Stays None until the first token has been produced; doubles as the
    # "no tokens generated" marker for the summary below.
    first_token_timestamp = None
    new_tokens = []

    print("\n", prompt)

    try:
        while not generator.is_done():
            generator.compute_logits()
            generator.generate_next_token()
            if first_token_timestamp is None:
                first_token_timestamp = time.perf_counter()

            new_token = generator.get_next_tokens()[0]
            # Flush on every token so the output appears incrementally.
            print(tokenizer_stream.decode(new_token), end="", flush=True)
            new_tokens.append(new_token)
    except KeyboardInterrupt:
        print(" --control+c pressed, aborting generation--")

    # Drop the generator reference before reporting (presumably so its
    # resources are released promptly -- confirm against the og API).
    del generator

    run_time = time.perf_counter() - started_timestamp
    if first_token_timestamp is None:
        print("\n\nNo tokens generated")
        return

    time_to_first = first_token_timestamp - started_timestamp
    print(
        "\n\n"
        f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)},"
        f" Time to first: {time_to_first:.2f}s,"
        f" New tokens per second: {len(new_tokens) / run_time:.2f} tps"
    )