scenarios/workload-genai/policies/fragments/rate-limiting/adaptive-rate-limiting.xml

<fragment>
    <!-- Global limit: 500 tokens per minute shared by all callers, tracked under the GlobalTokensLimit counter.
         The remaining-token count is exposed in a response header and stored in the globalRemainingTokens variable
         so the branches below can adapt their own limits to it. -->
    <azure-openai-token-limit counter-key="GlobalTokensLimit"
                              tokens-per-minute="500"
                              estimate-prompt-tokens="true"
                              remaining-tokens-header-name="x-apim-global-remaining-tokens"
                              remaining-tokens-variable-name="globalRemainingTokens"
                              tokens-consumed-header-name="x-apim-global-consumed-tokens" />
    <choose>
        <!-- Requests carrying the x-higher-limit header get the higher tier: a 300 TPM base,
             raised by 10% of whatever remains of the global token budget. -->
        <when condition="@(context.Request.Headers.ContainsKey("x-higher-limit"))">
            <set-variable name="higherLimit" value="@{
                var defaultHigherLimit = 300;
                if (context.Variables["globalRemainingTokens"] is int globalRemainingTokens && globalRemainingTokens > 0)
                {
                    defaultHigherLimit += (int)(globalRemainingTokens * 0.1);
                }
                return (int)defaultHigherLimit;
            }" />
            <azure-openai-token-limit counter-key="highRateLimitSvc"
                                      tokens-per-minute="@((int)context.Variables["higherLimit"])"
                                      estimate-prompt-tokens="true"
                                      remaining-tokens-header-name="x-apim-high-rate-remaining-tokens" />
        </when>
        <!-- All other requests get the lower tier: a 100 TPM base, raised the same way. -->
        <otherwise>
            <set-variable name="lowerRateLimit" value="@{
                var defaultLowerRateLimit = 100;
                if (context.Variables["globalRemainingTokens"] is int globalRemainingTokens && globalRemainingTokens > 0)
                {
                    defaultLowerRateLimit += (int)(globalRemainingTokens * 0.1);
                }
                return defaultLowerRateLimit;
            }" />
            <azure-openai-token-limit counter-key="lowRateLimitSvc"
                                      tokens-per-minute="@((int)context.Variables["lowerRateLimit"])"
                                      estimate-prompt-tokens="true"
                                      remaining-tokens-header-name="x-apim-low-rate-remaining-tokens" />
        </otherwise>
    </choose>
</fragment>
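
Usage note (not part of the file above): a policy fragment like this takes effect only when an API, product, or global policy references it with include-fragment. A minimal sketch follows; the fragment-id is an assumption inferred from the file name, since the actual id is whatever name the fragment was given when it was created in the APIM instance.

<policies>
    <inbound>
        <base />
        <!-- Assumed fragment-id, inferred from the file name; substitute the id used when the fragment was created. -->
        <include-fragment fragment-id="adaptive-rate-limiting" />
    </inbound>
    <backend><base /></backend>
    <outbound><base /></outbound>
    <on-error><base /></on-error>
</policies>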