scenarios/workload-genai/policies/genai-policy.xml (19 lines of code) (raw):
<policies>
<inbound>
    <!-- Apply the policies inherited from the parent scope first. -->
    <base />
    <!-- Backend selection: this fragment sets the backend to the load-balanced pool. -->
    <include-fragment fragment-id="simple-priority-weighted" />
    <!-- Rate limiting: enforces the token constraints defined inside the fragment; it does not set the backend. -->
    <!-- Other rate-limiting fragment variants can be used here as well. -->
    <include-fragment fragment-id="rate-limiting-by-tokens" />
    <!-- Usage tracking: records token consumption as custom metrics in Application Insights. -->
    <include-fragment fragment-id="usage-tracking-with-appinsights" />
</inbound>
<backend>
    <!-- Either <base /> or the explicit retry/forward-request block below must be present. -->
    <!-- <base /> is left disabled here because the retry block forwards the request itself. -->
    <!-- <base /> -->
    <!-- Retry when the backend answers 429 (Too Many Requests). -->
    <!-- Set count to the number of backends in the pool. -->
    <!-- buffer-request-body keeps the request body available so it can be sent again on a retry. -->
    <retry count="3"
           interval="1"
           first-fast-retry="true"
           condition="@(context.Response.StatusCode == 429)">
        <forward-request buffer-request-body="true" />
    </retry>
    <!-- End of the backend forwarding logic. -->
</backend>
<outbound>
    <!-- Apply the policies inherited from the parent scope. -->
    <base />
    <!-- Optional workaround: track token consumption through Event Hub. -->
    <!-- Uncomment the fragment below to enable it. -->
    <!-- <include-fragment fragment-id="usage-tracking-with-eventhub" /> -->
</outbound>
<on-error>
    <!-- No custom error handling here; defer to the policies inherited from the parent scope. -->
    <base />
</on-error>
</policies>