in arctic_inference/vllm/patches.py [0:0]
def apply_arctic_patches():
from transformers import AutoConfig
from arctic_inference.common.swiftkv import LlamaSwiftKVConfig
# Register SwiftKV model configurations to transformers.
AutoConfig.register("llama_swiftkv", LlamaSwiftKVConfig)
from vllm import ModelRegistry
#from arctic_inference.vllm.swiftkv import LlamaSwiftKVForCausalLM
# Register SwiftKV model definitions to vLLM.
ModelRegistry.register_model(
"LlamaSwiftKVForCausalLM",
"arctic_inference.vllm.swiftkv:LlamaSwiftKVForCausalLM")
# Register ArcticSpeculator models to vLLM.
from arctic_inference.vllm.spec_dec.arctic_speculator import (
ArcticMLPSpeculator, ArcticLSTMSpeculator)
ModelRegistry.register_model("ArcticMLPSpeculatorPreTrainedModel",
ArcticMLPSpeculator)
ModelRegistry.register_model("ArcticLSTMSpeculatorPreTrainedModel",
ArcticLSTMSpeculator)
# This name is currently used in corvo
ModelRegistry.register_model("MLPVariantSpeculatorPreTrainedModel",
ArcticLSTMSpeculator)
# Patches that make later patches work properly.
EngineCoreProcPatch.apply_patch()
WorkerBasePatch.apply_patch()
# Patches to vLLM arguments and configuration objects.
EngineArgsPatch.apply_patch()
AsyncEngineArgsPatch.apply_patch()
ParallelConfigPatch.apply_patch()
SpeculativeConfigPatch.apply_patch()
SpecDecodingStatsPatch.apply_patch()
SpecDecodingLoggingPatch.apply_patch()
VllmConfigPatch.apply_patch()
XgrammarBackendPatch.apply_patch()
MLPSpeculatorConfigPatch.apply_patch()
# Main optimization patches.
apply_shift_parallel_patches()