// arctic_inference/embedding/proto/inference.proto
// This schema is written against the proto3 language version.
syntax = "proto3";
// Every message and service declared in this file lives in the
// `arctic_inference` package.
package arctic_inference;
// gRPC service exposing prompt encoding together with abort, replica
// information, and health-check endpoints. All four RPCs are unary.
service InferenceService {
  // Encodes the prompt(s) carried by an EncodeRequest.
  rpc Encode(EncodeRequest) returns (EncodeResponse);

  // Aborts an ongoing generation; the target is named by
  // AbortRequest.request_id.
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Returns information about the replicas.
  rpc GetReplicaInfo(ReplicaInfoRequest) returns (ReplicaInfoResponse);

  // Health check.
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
}
// Request message for the Encode RPC.
//
// A prompt can be supplied either as a string (`prompts`) or as a list of
// token ids (`token_id_bytes_i32`).
message EncodeRequest {
  // Identifier of this request.
  string request_id = 1;

  // Number of prompts in this request.
  // NOTE(review): presumably equals the length of whichever prompt field is
  // populated -- confirm against the caller.
  int32 n_prompts = 2;

  // Prompts given as text.
  repeated string prompts = 3;

  // Prompts given as token ids, shipped as raw bytes to avoid
  // serialization/deserialization overhead.
  // NOTE(review): the `_i32` suffix suggests packed int32 ids with one bytes
  // entry per prompt; byte order is not stated in this file -- confirm.
  repeated bytes token_id_bytes_i32 = 4;

  // Name of the model.
  // NOTE(review): presumably selects the model used for encoding -- confirm.
  string model_name = 5;

  // Priority of the request.
  // NOTE(review): ordering semantics are not defined in this file.
  int32 priority = 6;
}
// Response message for the Encode RPC.
message EncodeResponse {
  // Identifier of the request.
  // NOTE(review): presumably mirrors EncodeRequest.request_id -- confirm.
  string request_id = 1;

  // Number of prompts.
  // NOTE(review): presumably the number of entries in
  // `embedding_bytes_fp32` -- confirm.
  int32 n_prompts = 2;

  // Dimension of the embedding.
  // NOTE(review): presumably the float32 count per embedding vector, i.e.
  // each bytes entry holds 4 * embedding_dim bytes -- confirm.
  int32 embedding_dim = 3;

  // Embedding vectors, each encoded as a sequence of 4-byte, little-endian
  // float32 values. Raw bytes are used to avoid
  // serialization/deserialization overhead.
  repeated bytes embedding_bytes_fp32 = 4;

  // Error message; empty on success.
  string error = 5;
}
// Request message for the Abort RPC: asks for a generation to be aborted.
message AbortRequest {
  // Identifier of the request to abort.
  string request_id = 1;
}
// Response message for the Abort RPC.
message AbortResponse {
  // Success flag for the abort request.
  bool success = 1;

  // Accompanying text message.
  // NOTE(review): content on success vs. failure is not defined in this file.
  string message = 2;
}
// Request message for the GetReplicaInfo RPC. It currently has no fields.
message ReplicaInfoRequest {
}
// Response message for the GetReplicaInfo RPC: per-replica details plus
// aggregate counts.
message ReplicaInfoResponse {
  // One entry per reported replica.
  repeated SingleReplicaInfoResponse replica_infos = 1;

  // Number of replicas.
  int32 n_replicas = 2;

  // Number of healthy replicas.
  int32 n_healthy_replicas = 3;

  // Error message; empty on success.
  string message = 4;
}
// Information about a single replica.
//
// NOTE(review): every descriptive field below is a free-form string; the
// expected format and allowed values are not defined in this file.
message SingleReplicaInfoResponse {
  // Name of the model.
  string model_name = 1;

  // Task of the replica.
  string task = 2;

  // Data type, as a string.
  string dtype = 3;

  // Readiness flag of the replica.
  bool ready = 4;

  // Parallel configuration, as a string.
  string parallel_config = 5;

  // Decoding configuration, as a string.
  string decoding_config = 6;

  // Scheduler configuration, as a string.
  string scheduler_config = 7;

  // LoRA configuration, as a string.
  string lora_config = 8;
}
// Request message for the HealthCheck RPC. It currently has no fields.
message HealthCheckRequest {
}
// Response message for the HealthCheck RPC.
message HealthCheckResponse {
  // Health flag reported by the check.
  bool healthy = 1;

  // Accompanying text message.
  // NOTE(review): content is not defined in this file.
  string message = 2;
}