in arctic_inference/vllm/model_runner.py [0:0]
def _update_suffix_cache(self, sampled_token_ids: list[list[int]]) -> None:
    """Push freshly sampled tokens into the suffix cache.

    Entry ``i`` of ``sampled_token_ids`` is paired with
    ``self.input_batch.req_ids[i]``. Every request id encountered is
    recorded as seen, even when its sampled list is empty (in which case
    nothing else is done for it).

    For a request with new tokens that is not yet in
    ``self._suffix_cache.active_requests``, any cached response stored
    under the same id is evicted first, and the request is then started
    with its prompt tokens, read from
    ``self.input_batch.token_ids_cpu[index, :num_prompt_tokens]``. The
    sampled tokens are then appended to the request's active response.

    Finally, every request still listed as active in the suffix cache but
    absent from this batch is stopped.

    Args:
        sampled_token_ids: Per-request lists of token ids sampled in this
            step, in the same order as ``self.input_batch.req_ids``.
    """
    cache = self._suffix_cache
    batch = self.input_batch
    present_ids = set()

    for slot, new_tokens in enumerate(sampled_token_ids):
        request_id = batch.req_ids[slot]
        present_ids.add(request_id)

        if new_tokens:
            row = batch.req_id_to_index[request_id]

            if request_id not in cache.active_requests:
                # A leftover cached response under this id must be dropped
                # before the request can be (re)started.
                if request_id in cache.cached_requests:
                    cache.evict_cached_response(request_id)
                prompt_len = batch.num_prompt_tokens[row]
                cache.start_request(request_id,
                                    batch.token_ids_cpu[row, :prompt_len])

            cache.add_active_response(request_id, new_tokens)

    # Snapshot the stale ids first: stopping a request may modify
    # ``active_requests`` while we are walking it.
    stale_ids = [
        request_id for request_id in cache.active_requests
        if request_id not in present_ids
    ]
    for request_id in stale_ids:
        cache.stop_request(request_id)