def get_quant_method_patch()

in arctic_inference/vllm/spec_dec/fp8.py [0:0]


    def get_quant_method_patch(self, layer: torch.nn.Module,
                               prefix: str) -> Optional["QuantizeMethodBase"]:
        """Pick the quantization method object to use for ``layer``.

        The layer is matched by type, in this order:

        * ``LinearBase``: ``UnquantizedLinearMethod()`` when
          ``is_layer_skipped(prefix, self.ignored_layers)`` is truthy,
          otherwise ``OriginalFp8LinearMethod(self)``.
        * ``FusedMoE``: ``Fp8MoEMethod(self)``.
        * ``Attention``: ``Fp8KVCacheMethod(self)``.
        * ``VocabParallelEmbedding``: ``Fp8LinearMethodEmbedding(self)``.

        Args:
            layer: The module for which a quantization method is requested.
            prefix: Name passed to ``is_layer_skipped`` together with
                ``self.ignored_layers``; only consulted for ``LinearBase``
                layers.

        Returns:
            The method object selected above, or ``None`` when ``layer``
            matches none of the handled types.
        """
        # NOTE(review): the "_patch" suffix suggests this replaces a
        # config's ``get_quant_method`` at runtime -- confirm at the site
        # where it is installed.
        from vllm.attention.layer import Attention  # Avoid circular import
        from vllm.model_executor.layers.vocab_parallel_embedding import (
            VocabParallelEmbedding)

        # Linear layers are the only ones that honour the ignore list.
        if isinstance(layer, LinearBase):
            skip_quantization = is_layer_skipped(prefix, self.ignored_layers)
            return (UnquantizedLinearMethod()
                    if skip_quantization else OriginalFp8LinearMethod(self))

        if isinstance(layer, FusedMoE):
            return Fp8MoEMethod(self)

        if isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)

        if isinstance(layer, VocabParallelEmbedding):
            return Fp8LinearMethodEmbedding(self)

        # No quantization method for any other layer type.
        return None