scenarios/workload-genai/policies/fragments/rate-limiting/adaptive-rate-limiting.xml:
<fragment>
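    <!-- Global limit: cap total Azure OpenAI token consumption across all callers at 500 tokens per minute,
         surface the remaining allowance in a response header, and capture it in the globalRemainingTokens
         variable so the adaptive branches below can read it. -->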
<azure-openai-token-limit counter-key="GlobalTokensLimit"
tokens-per-minute="500"
estimate-prompt-tokens="true"
remaining-tokens-header-name="x-apim-global-remaining-tokens"
remaining-tokens-variable-name="globalRemainingTokens"
tokens-consumed-header-name="x-apim-global-consumed-tokens"/>
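    <!-- Adaptive per-service limits: callers sending the x-higher-limit header get a larger
         per-minute budget; all other callers fall through to the lower default. Both branches
         top up their base limit with 10% of the unused global allowance. -->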
    <choose>
        <when condition="@(context.Request.Headers.ContainsKey("x-higher-limit"))">
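            <!-- Compute the dynamic limit: base of 300 TPM plus 10% of the remaining global tokens. -->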
<set-variable name="higherLimit" value="@{
var defaultHigherLimit = 300;
if (context.Variables["globalRemainingTokens"] is int globalRemainingTokens && globalRemainingTokens > 0)
{
defaultHigherLimit += (int)(globalRemainingTokens * 0.1);
}
return (int)defaultHigherLimit;
}" />
<azure-openai-token-limit counter-key="highRateLimitSvc"
tokens-per-minute="@((int)context.Variables["higherLimit"])"
estimate-prompt-tokens="true"
remaining-tokens-header-name="x-apim-high-rate-remaining-tokens"/>
</when>
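        <!-- Default branch: base of 100 TPM plus 10% of the remaining global tokens. -->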
        <otherwise>
            <set-variable name="lowerRateLimit" value="@{
                var defaultLowerRateLimit = 100;
                if (context.Variables["globalRemainingTokens"] is int globalRemainingTokens && globalRemainingTokens > 0)
                {
                    defaultLowerRateLimit += (int)(globalRemainingTokens * 0.1);
                }
                return defaultLowerRateLimit;
            }" />
<azure-openai-token-limit counter-key="lowRateLimitSvc"
tokens-per-minute="@((int)context.Variables["lowerRateLimit"])"
estimate-prompt-tokens="true"
remaining-tokens-header-name="x-apim-low-rate-remaining-tokens"/>
</otherwise>
</choose>
</fragment>
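A fragment like this is not applied on its own; it has to be referenced from an API or operation policy with include-fragment. A minimal sketch, assuming the fragment was registered in APIM with the ID adaptive-rate-limiting (the actual fragment ID depends on how it is created in the service):

<policies>
    <inbound>
        <base />
        <!-- Apply the global and adaptive token limits before the request reaches the backend. -->
        <include-fragment fragment-id="adaptive-rate-limiting" />
    </inbound>
    <backend>
        <base />
    </backend>
    <outbound>
        <base />
    </outbound>
    <on-error>
        <base />
    </on-error>
</policies>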