chatlearn/models/vllm_module.py
def log_metrics_stats(self, num_done_requests):
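    """Accumulate generation statistics for the latest vLLM step and log them periodically.

    Args:
        num_done_requests: number of requests that finished since the previous call.
    """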
    now = time.monotonic()
    self.num_done_requests += num_done_requests
    scheduler_list = self.scheduler if isinstance(self.scheduler, list) else [self.scheduler]
    avg_request_throughput = self.num_done_requests / (now - self.start_time)
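    # Attribute the tokens scheduled in this step either to prompt processing (prefill)
    # or to generation (decode), depending on whether this was a prompt run.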
    if self.scheduler_outputs.prompt_run:
        self.num_processed_prompt += self.scheduler_outputs.num_batched_tokens
    else:
        self.num_generated_tokens += self.scheduler_outputs.num_batched_tokens
    avg_generation_throughput = self.num_generated_tokens / (now - self.start_time)
    avg_prompt_throughput = self.num_processed_prompt / (now - self.start_time)
    self.forward_count += 1
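    # Track KV-cache block utilization as a running sum so the average over all
    # forward steps can be reported below.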
    total_num_gpu_blocks = self.cache_config.num_gpu_blocks
    num_free_gpu_blocks = sum(
        scheduler.block_manager.get_num_free_gpu_blocks() for scheduler in scheduler_list)
    num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
    self.gpu_cache_usage += num_used_gpu_blocks / total_num_gpu_blocks
    avg_gpu_cache_usage = self.gpu_cache_usage / self.forward_count
    total_num_cpu_blocks = self.cache_config.num_cpu_blocks
    if total_num_cpu_blocks > 0:
        num_free_cpu_blocks = sum(
            scheduler.block_manager.get_num_free_cpu_blocks() for scheduler in scheduler_list)
        num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
        cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
    else:
        cpu_cache_usage = 0.0
    self.cpu_cache_usage += cpu_cache_usage
    avg_cpu_cache_usage = self.cpu_cache_usage / self.forward_count
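    # Walk over the requests that finished since the previous call and record
    # per-batch maximum prompt/output lengths; these feed the static-batching
    # token estimate computed further down.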
    for idx in range(self.num_done_requests - num_done_requests, self.num_done_requests):
        output = self.outputs[idx]
        prompt_length = len(output.prompt_token_ids)
        output_length = len(output.outputs[0].token_ids)
        batch_index = int(output.request_id / self.scheduler_config.max_num_seqs)
        self.max_prompt_length_static_batching[batch_index] = max(
            self.max_prompt_length_static_batching[batch_index], prompt_length)
        self.max_output_length_static_batching[batch_index] = max(
            self.max_output_length_static_batching[batch_index], output_length)
        self.action_length += output_length
        self.action_max_length = max(self.action_max_length, output_length)
        self.action_min_length = min(self.action_min_length, output_length)
    action_length_mean = float(self.action_length / self.num_done_requests) if self.num_done_requests else 0.0
    for scheduler in scheduler_list:
        self.batch_size_stats += len(scheduler.running)
    avg_batch_size = self.batch_size_stats / self.forward_count
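    # Log at most once every _LOGGING_INTERVAL_SEC seconds, except when
    # self.num_requests is falsy, in which case the stats are logged unconditionally.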
    if not self.num_requests or (now - self.last_stats_time >= _LOGGING_INTERVAL_SEC):
        self.last_stats_time = now
        message = ""
        if not self.num_requests:
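            # Estimate how many prompt/output tokens a static-batching baseline would
            # have processed: each batch of up to max_num_seqs requests is padded to
            # the longest prompt/output observed within that batch.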
            batch_size = [self.scheduler_config.max_num_seqs
                          for _ in range(math.ceil(self.num_done_requests / self.scheduler_config.max_num_seqs))]
            if self.num_done_requests % self.scheduler_config.max_num_seqs:
                batch_size[-1] = self.num_done_requests % self.scheduler_config.max_num_seqs
            num_prompt_tokens_static_batching = sum(
                prompt_len * bs for prompt_len, bs in zip(self.max_prompt_length_static_batching, batch_size))
            num_output_tokens_static_batching = sum(
                output_length * bs for output_length, bs in zip(self.max_output_length_static_batching, batch_size))
message = f"num_processed_prompts_continuous_batching: {self.num_processed_prompt}, " \
f"num_processed_prompts_static_batching: {num_prompt_tokens_static_batching}, " \
f"num_processed_prompts_continuous_batching/num_processed_prompts_static_batching: \
{self.num_processed_prompt/num_prompt_tokens_static_batching:.1f}, " \
f"num_output_tokens_continuous_batching: {self.num_generated_tokens}, " \
f"num_output_tokens_static_batching: {num_output_tokens_static_batching}, " \
f"num_output_tokens_continuous_batching/num_output_tokens_static_batching: \
{self.num_generated_tokens/num_output_tokens_static_batching:.1f}, " \
self._logger.info(f"allready generate responses for {self.num_done_requests} reqs, "
f"avg_request_throughput: {avg_request_throughput:.1f} reqs/s, "
f"avg_prompt_throughput: {avg_prompt_throughput:.1f} tokens/s, "
f"avg_generation_throughput: {avg_generation_throughput:.1f} tokens/s, "
f"avg_batch_size: {avg_batch_size:.1f} reqs, "
f"avg_gpu_cache_usage: {avg_gpu_cache_usage * 100:.1f}%, "
f"avg_cpu_cache_usage {avg_cpu_cache_usage * 100:.1f}%, "
f"action_length_mean: {action_length_mean:.1f}, "
f"action_max_length: {self.action_max_length if self.num_done_requests else 'inf'}, "
f"action_min_length: {self.action_min_length if self.num_done_requests else '-inf'}, "
f"{message}")