tools/smollm_local_inference/mlc.py (12 lines of code) (raw):

"""Stream a chat completion from SmolLM2 locally via the MLC-LLM engine."""

from mlc_llm import MLCEngine

# Create the engine from the quantized SmolLM2 weights hosted on Hugging Face.
model = "HF://mlc-ai/SmolLM2-1.7B-Instruct-q0f16-MLC"
engine = MLCEngine(model)

# Run chat completion in the OpenAI API style, streaming tokens as they arrive.
for response in engine.chat.completions.create(
    messages=[{"role": "user", "content": "What is the meaning of life?"}],
    model=model,
    stream=True,
):
    for choice in response.choices:
        # Streaming deltas can carry no content (e.g. the final chunk that
        # only reports a finish_reason) — skip them instead of printing "None".
        if choice.delta.content is not None:
            print(choice.delta.content, end="", flush=True)
print("\n")

# Release the engine's resources (background workers, model memory).
engine.terminate()