in arctic_inference/suffix_decoding/cache.py [0:0]
def start_request(self, req_id: Hashable, prompt_token_ids: Sequence[int]):
    """
    Register a new active request and index its prompt tokens.

    A fresh per-request suffix tree (constructed with
    `self._max_tree_depth`) is stored under `req_id` and extended with the
    prompt, so that later speculation for this request can draw on the
    prompt context. The tree stays registered until `stop_request` is
    called.

    When the global cache is enabled (`max_cached_requests != 0`), any
    response already cached under the same `req_id` is evicted first, and
    then a new sequence ID is allocated for the request through
    `_generate_seq_id`. That allocation is expected to trigger eviction of
    older cache entries (FIFO order) when the cache is full; the policy
    lives in `_generate_seq_id` and is not visible in this method.

    Args:
        req_id (Hashable): The request identifier. Must be a hashable value
            that uniquely identifies the request.
        prompt_token_ids (Sequence[int]): A sequence of token IDs
            representing the prompt of the request.

    Raises:
        ValueError: If a request with the same `req_id` is already active.
            A previously cached response for `req_id` does not raise; it
            is evicted and replaced.
    """
    active_trees = self._local_trees
    if req_id in active_trees:
        raise ValueError(f"Request '{req_id}' is already active")

    # Register the tree before feeding it the prompt, so the request counts
    # as active from this point on.
    prompt_tree = SuffixTree(self._max_tree_depth)
    active_trees[req_id] = prompt_tree
    # NOTE(review): the leading 0 is presumably the sequence ID used inside
    # the per-request tree -- confirm against SuffixTree.extend.
    prompt_tree.extend(0, prompt_token_ids)

    if self._max_cached_requests == 0:
        # Global cache is disabled; nothing more to set up.
        return

    # Global cache is enabled: drop any stale cached response for this
    # request before giving it a new slot.
    if req_id in self._req_to_seq_id:
        self.evict_cached_response(req_id)
    # Allocate a new seq_id for the request.
    self._generate_seq_id(req_id)