tools/smollm_local_inference/mlc.py (12 lines of code) (raw):
from mlc_llm import MLCEngine

# Stream a chat completion from a local SmolLM2 model via MLC LLM.
# Model weights are fetched from Hugging Face on first run (q0f16 = unquantized fp16).
model = "HF://mlc-ai/SmolLM2-1.7B-Instruct-q0f16-MLC"
engine = MLCEngine(model)

# Run chat completion through the OpenAI-compatible streaming API.
# try/finally guarantees the engine is shut down even if generation
# raises partway through the stream.
try:
    for response in engine.chat.completions.create(
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        model=model,
        stream=True,
    ):
        for choice in response.choices:
            # Streamed chunks can carry delta.content == None (e.g. the
            # role-announcing first chunk or the final stop chunk) — skip
            # those instead of printing the literal string "None".
            if choice.delta.content is not None:
                print(choice.delta.content, end="", flush=True)
    print("\n")
finally:
    engine.terminate()