in chatlearn/models/vllm_module.py [0:0]
def build_scheduler(self):
    self.seq_counter = Counter()
    if CURRENT_VLLM_VERSION == VLLMVersion.v_0_3_0:
        # vLLM 0.3.0: a single Scheduler instance; on later calls only the
        # block manager is rebuilt from the current cache config.
        if self.scheduler is None:
            self.scheduler = Scheduler(self.scheduler_config, self.cache_config, None)
        else:
            BlockSpaceManagerImpl = get_block_manager_cls(None)
            self.scheduler.block_manager = BlockSpaceManagerImpl( # pylint: disable=abstract-class-instantiated
                block_size=self.cache_config.block_size,
                num_gpu_blocks=self.cache_config.num_gpu_blocks,
                num_cpu_blocks=self.cache_config.num_cpu_blocks,
                sliding_window=self.cache_config.sliding_window)
    elif CURRENT_VLLM_VERSION in [VLLMVersion.v_0_5_1, VLLMVersion.v_0_6_3]:
        if self.scheduler is None:
            # First call: build one Scheduler per pipeline-parallel stage.
            self.scheduler = [
                Scheduler(self.scheduler_config, self.cache_config, None,
                          self.parallel_config.pipeline_parallel_size)
                for _ in range(self.parallel_config.pipeline_parallel_size)
            ]

            def get_tokenizer_for_seq(sequence):
                tokenizer_group = self.get_tokenizer_group()
                assert tokenizer_group, ("tokenizer_group cannot be None, "
                                         "make sure skip_tokenizer_init is False")
                return tokenizer_group.get_lora_tokenizer(sequence.lora_request)

            tokenizer_for_seq = get_tokenizer_for_seq if CURRENT_VLLM_VERSION == VLLMVersion.v_0_6_3 \
                else self.get_tokenizer_for_seq
            self.output_processor = (
                SequenceGroupOutputProcessor.create_output_processor(
                    self.scheduler_config,
                    self.detokenizer,
                    self.scheduler,
                    self.seq_counter,
                    tokenizer_for_seq,
                    stop_checker=StopChecker(
                        self.scheduler_config.max_model_len,
                        tokenizer_for_seq,
                    ),
                ))
            if CURRENT_VLLM_VERSION == VLLMVersion.v_0_6_3:
                # vLLM 0.6.3 additionally keeps per-stage scheduler state,
                # mirroring LLMEngine's bookkeeping.
                self.input_preprocessor = InputPreprocessor(self.model_config,
                                                            self.tokenizer)
                self.cached_scheduler_outputs = [
                    SchedulerOutputState()
                    for _ in range(self.parallel_config.pipeline_parallel_size)
                ]
                self.scheduler_contexts = [
                    SchedulerContext(multi_step_stream_outputs=self.scheduler_config.multi_step_stream_outputs)
                    for _ in range(self.parallel_config.pipeline_parallel_size)
                ]
                self.use_cached_outputs = False
                self.process_request_outputs_callback = None
                self.tracer = None
        else:
            # Subsequent calls: keep the existing schedulers and only rebuild
            # their block managers from the (possibly updated) cache config.
            if CURRENT_VLLM_VERSION == VLLMVersion.v_0_6_3:
                version = "selfattn"
                if (self.scheduler_config.embedding_mode
                        or self.cache_config.is_attention_free):
                    version = "placeholder"
            else:
                version = "v1"
                if self.scheduler_config.use_v2_block_manager:
                    version = "v2"
                if self.scheduler_config.embedding_mode:
                    version = "embedding"
            BlockSpaceManagerImpl = get_block_manager_cls(version)
            # Cache blocks are split evenly across pipeline stages.
            num_gpu_blocks = self.cache_config.num_gpu_blocks
            if num_gpu_blocks:
                num_gpu_blocks //= self.pipeline_model_parallel_size()
            num_cpu_blocks = self.cache_config.num_cpu_blocks
            if num_cpu_blocks:
                num_cpu_blocks //= self.pipeline_model_parallel_size()
            for scheduler in self.scheduler:
                scheduler.block_manager = BlockSpaceManagerImpl( # pylint: disable=abstract-class-instantiated
                    block_size=self.cache_config.block_size,
                    num_gpu_blocks=num_gpu_blocks,
                    num_cpu_blocks=num_cpu_blocks,
                    sliding_window=self.cache_config.sliding_window,
                    enable_caching=self.cache_config.enable_prefix_caching)
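
For illustration only (not part of vllm_module.py): a minimal sketch of the per-stage block split applied in the rebuild branch above. The block counts and the pipeline size are hypothetical values, standing in for cache_config.num_gpu_blocks, cache_config.num_cpu_blocks, and the result of pipeline_model_parallel_size().

    # Hypothetical numbers, for illustration only -- not taken from the repo.
    pipeline_parallel_size = 4      # assumed result of pipeline_model_parallel_size()
    num_gpu_blocks = 8192           # assumed cache_config.num_gpu_blocks
    num_cpu_blocks = 1024           # assumed cache_config.num_cpu_blocks

    per_stage_gpu_blocks = num_gpu_blocks // pipeline_parallel_size  # 2048 blocks per stage
    per_stage_cpu_blocks = num_cpu_blocks // pipeline_parallel_size  # 256 blocks per stage

Each Scheduler in self.scheduler then gets a block manager sized with these per-stage counts, so the total KV-cache budget is shared across pipeline stages rather than duplicated.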