src/webgpu/util/device_pool.ts (293 lines of code) (raw):
import { SkipTestCase, TestCaseRecorder } from '../../common/framework/fixture.js';
import { attemptGarbageCollection } from '../../common/util/collect_garbage.js';
import { getGPU, getDefaultRequestAdapterOptions } from '../../common/util/navigator_gpu.js';
import {
assert,
raceWithRejectOnTimeout,
assertReject,
unreachable,
} from '../../common/util/util.js';
import { getDefaultLimits, kLimits } from '../capability_info.js';
// MUST_NOT_BE_IMPORTED_BY_DATA_CACHE
// This file should not be transitively imported by .cache.ts files
/**
* Handle for a pooled device: this is what `DevicePool.acquire()` resolves to and what
* `DevicePool.release()` accepts back.
*/
export interface DeviceProvider {
/** Adapter the device was created from. Cannot be reused; just for adapter info. */
readonly adapter: GPUAdapter;
/** The GPUDevice to use while the provider is acquired. */
readonly device: GPUDevice;
/**
* Declare that the device is expected to be lost with the given `reason`. When the device is
* released, a loss whose reason matches is not rethrown as an error.
*/
expectDeviceLost(reason: GPUDeviceLostReason): void;
}
/**
* Thrown when ending a test scope finds an unexpected validation or internal error.
* `DevicePool.release` keeps the device in the pool for this error type only.
*/
class TestFailedButDeviceReusable extends Error {}
/**
* Thrown by `DeviceHolder.create` when the adapter lacks a required feature.
* `DescriptorToHolderMap.getOrCreate` converts it into a `SkipTestCase`.
*/
class FeaturesNotSupported extends Error {}
/**
* Thrown when ending a test scope finds an unexpected out-of-memory error. The device is not
* reused, and `DevicePool.release` attempts a garbage collection after dropping it.
*/
export class TestOOMedShouldAttemptGC extends Error {}
/**
* DescriptorModifier lets you supply a function to select a device
* based on the limits/features available from the adapter.
* Devices are pooled based on a key, and that key is derived before
* an adapter is requested. That means you select the key without
* knowledge of what the adapter will provide. You do this by
* providing a keyModifier function that appends a suffix.
*
* For example: If your modifier adds all the limits, you might
* choose 'maxLimits' as your suffix
*
* ```js
* keyModifier(s: string) { return `${s}:maxLimits`; },
* ```
*
* If your modifier selects only `maxBindGroups` and `maxColorAttachments`
* then your suffix might be `maxBindGroups&maxColorAttachments`
*
* ```js
* keyModifier(s: string) { return `${s}:maxBindGroups&maxColorAttachments`; },
* ```
*/
export type DescriptorModifier = {
/** Maps the base pool key (derived from the canonical descriptor) to the key actually used. */
keyModifier(baseKey: string): string;
/**
* Called once the adapter is known, before feature support is checked and the device is
* requested. Returns the descriptor that will be passed to `requestDevice`.
* `desc` is `undefined` when the default descriptor was requested.
*/
descriptorModifier(
adapter: GPUAdapter,
desc: CanonicalDeviceDescriptor | undefined
): CanonicalDeviceDescriptor;
};
/**
 * Pool of GPUDevices that are shared between tests.
 *
 * A test calls `acquire()` to borrow a device (creating one if needed) and `release()` to hand
 * it back. Devices that end a test in a questionable state are dropped instead of being reused.
 */
export class DevicePool {
  /**
   * 'uninitialized' until the first `acquire()`; 'failed' if creating the default device failed
   * (in which case no further attempts are made); otherwise the map of pooled devices.
   */
  private holders: 'uninitialized' | 'failed' | DescriptorToHolderMap = 'uninitialized';

  /** Acquire a device from the pool and begin the error scopes. */
  async acquire(
    recorder: TestCaseRecorder,
    descriptor: UncanonicalizedDeviceDescriptor | undefined,
    descriptorModifier: DescriptorModifier | undefined
  ): Promise<DeviceProvider> {
    let failureDetail = '';

    if (this.holders === 'uninitialized') {
      // First use: create the default device up front as a check that WebGPU works at all.
      const freshMap = new DescriptorToHolderMap();
      this.holders = freshMap;
      try {
        await freshMap.getOrCreate(recorder, undefined, descriptorModifier);
      } catch (ex) {
        this.holders = 'failed';
        if (ex instanceof Error) {
          failureDetail = ` with ${ex.name} "${ex.message}"`;
        }
      }
    }

    assert(
      this.holders !== 'failed',
      `WebGPU device failed to initialize${failureDetail}; not retrying`
    );

    const acquired = await this.holders.getOrCreate(recorder, descriptor, descriptorModifier);
    assert(acquired.state === 'free', 'Device was in use on DevicePool.acquire');
    acquired.state = 'acquired';
    acquired.beginTestScope();
    return acquired;
  }

  /**
   * End the error scopes and check for errors.
   * Then, if the device seems reusable, release it back into the pool. Otherwise, drop it.
   */
  async release(holder: DeviceProvider): Promise<void> {
    assert(this.holders instanceof DescriptorToHolderMap, 'DevicePool got into a bad state');
    assert(holder instanceof DeviceHolder, 'DeviceProvider should always be a DeviceHolder');
    assert(holder.state === 'acquired', 'trying to release a device while already released');

    try {
      await holder.endTestScope();

      // (Hopefully if the device was lost, it has been reported by the time endErrorScopes()
      // has finished (or timed out). If not, it could cause a finite number of extra test
      // failures following this one (but should recover eventually).)
      assert(
        holder.lostInfo === undefined,
        `Device was unexpectedly lost. Reason: ${holder.lostInfo?.reason}, Message: ${holder.lostInfo?.message}`
      );
    } catch (ex) {
      // Only TestFailedButDeviceReusable lets the device stay in the pool; anything else forces
      // a new device to be created for the next test.
      const deviceIsReusable = ex instanceof TestFailedButDeviceReusable;
      if (!deviceIsReusable) {
        this.holders.delete(holder);

        const doomedDevice = holder.device;
        if ('destroy' in doomedDevice) {
          doomedDevice.destroy();
          // Wait for destruction (or actual device loss if any) to complete.
          await doomedDevice.lost;
        }

        // Release the (hopefully only) ref to the GPUDevice.
        holder.releaseGPUDevice();

        // Try to clean up, in case there are stray GPU resources in need of collection.
        if (ex instanceof TestOOMedShouldAttemptGC) {
          await attemptGarbageCollection();
        }
      }

      // The try block may have thrown because the device was lost (to force reinitialization).
      // If the test announced that loss - i.e. `expectedLostReason` matches the reported
      // `lostInfo.reason` - the error is suppressed; otherwise it propagates.
      const { expectedLostReason, lostInfo } = holder;
      const lossWasExpected =
        expectedLostReason !== undefined &&
        lostInfo !== undefined &&
        expectedLostReason === lostInfo.reason;
      if (lossWasExpected) {
        return;
      }
      throw ex;
    } finally {
      // Mark the holder as free so the device can be reused (if it's still in the pool).
      holder.state = 'free';
    }
  }
}
/**
 * Map from GPUDeviceDescriptor to DeviceHolder.
 *
 * Entries are kept in least-recently-used order (the Map's insertion order), and the map is
 * capped at a small number of devices.
 */
class DescriptorToHolderMap {
  /** Map keys that are known to be unsupported and can be rejected quickly. */
  private unsupported: Set<string> = new Set();
  /** Pooled holders by key; iteration order is least- to most-recently used. */
  private holders: Map<string, DeviceHolder> = new Map();

  /**
   * Deletes an item from the map by DeviceHolder value.
   * Hits `unreachable` if the holder is not in the map.
   */
  delete(holder: DeviceHolder): void {
    for (const [k, v] of this.holders) {
      if (v === holder) {
        this.holders.delete(k);
        return;
      }
    }
    unreachable("internal error: couldn't find DeviceHolder to delete");
  }

  /**
   * Gets a DeviceHolder from the map if it exists; otherwise, calls create() to create one,
   * inserts it, and returns it.
   *
   * If an `uncanonicalizedDescriptor` is provided, it is canonicalized and used as the map key.
   * If one is not provided, the map key is `""` (empty string).
   * If a `descriptorModifier` is provided, its `keyModifier` derives the final key.
   *
   * Throws SkipTestCase if devices with this descriptor are unsupported.
   */
  async getOrCreate(
    recorder: TestCaseRecorder,
    uncanonicalizedDescriptor: UncanonicalizedDeviceDescriptor | undefined,
    descriptorModifier: DescriptorModifier | undefined
  ): Promise<DeviceHolder> {
    const [descriptor, baseKey] = canonicalizeDescriptor(uncanonicalizedDescriptor);
    const key = descriptorModifier?.keyModifier(baseKey) || baseKey;

    // Quick-reject descriptors that are known to be unsupported already.
    if (this.unsupported.has(key)) {
      throw new SkipTestCase(
        `GPUDeviceDescriptor previously failed: ${JSON.stringify(descriptor)}`
      );
    }

    // Search for an existing device with the same descriptor.
    const existing = this.holders.get(key);
    if (existing) {
      // Move it to the end of the Map (most-recently-used).
      this.holders.delete(key);
      this.holders.set(key, existing);
      return existing;
    }

    // No existing item was found; add a new one.
    let created: DeviceHolder;
    try {
      created = await DeviceHolder.create(recorder, descriptor, descriptorModifier);
    } catch (ex) {
      if (ex instanceof FeaturesNotSupported) {
        // Remember the failure so later requests for the same key skip immediately.
        this.unsupported.add(key);
        throw new SkipTestCase(
          `GPUDeviceDescriptor not supported: ${JSON.stringify(descriptor)}\n${ex?.message ?? ''}`
        );
      }
      throw ex;
    }

    this.insertAndCleanUp(key, created);
    return created;
  }

  /**
   * Insert an entry, then remove the least-recently-used item if there are too many.
   *
   * The evicted holder's device is destroyed so its GPU resources are freed right away instead
   * of waiting for garbage collection. A holder that is still acquired is only removed from the
   * map; its device is left alone because a test is using it.
   */
  private insertAndCleanUp(key: string, value: DeviceHolder): void {
    this.holders.set(key, value);

    const kMaxEntries = 5;
    if (this.holders.size <= kMaxEntries) {
      return;
    }

    // The first entry in iteration order is the least recently used one.
    for (const [lruKey, lruHolder] of this.holders) {
      this.holders.delete(lruKey);
      // Same `destroy` feature-check as DevicePool.release uses.
      if (lruHolder.state === 'free' && 'destroy' in lruHolder.device) {
        lruHolder.device.destroy();
      }
      return;
    }
  }
}
/**
* Device descriptor as supplied by callers of the pool, before canonicalization.
* Features may be given as any iterable; limits are keyed by limit name.
*/
export type UncanonicalizedDeviceDescriptor = {
requiredFeatures?: Iterable<GPUFeatureName>;
requiredLimits?: Record<string, GPUSize32>;
};
/**
* Device descriptor with every remaining GPUDeviceDescriptor field required, as produced by
* `canonicalizeDescriptor` or returned from `DescriptorModifier.descriptorModifier`.
* `label` and the `nonGuaranteed*` fields are excluded.
*/
export type CanonicalDeviceDescriptor = Omit<
Required<GPUDeviceDescriptor>,
'label' | 'nonGuaranteedFeatures' | 'nonGuaranteedLimits'
>;
/**
 * Make a stringified map-key from a GPUDeviceDescriptor.
 * Tries to make sure all defaults are resolved, first - but it's okay if some are missed
 * (it just means some GPUDevice objects won't get deduplicated).
 *
 * This does **not** canonicalize `undefined` (the "default" descriptor) into a fully-qualified
 * GPUDeviceDescriptor. This is just because `undefined` is a common case and we want to use it
 * as a sanity check that WebGPU is working.
 *
 * @returns A `[descriptor, key]` pair: `[undefined, '']` for the default descriptor, otherwise
 * the canonical descriptor together with its JSON serialization.
 */
function canonicalizeDescriptor(
  desc: UncanonicalizedDeviceDescriptor | undefined
): [CanonicalDeviceDescriptor | undefined, string] {
  if (desc === undefined) {
    return [undefined, ''];
  }

  // Features: de-duplicated and sorted so equivalent requests produce the same key.
  const sortedUniqueFeatures = desc.requiredFeatures
    ? [...new Set(desc.requiredFeatures)].sort()
    : [];

  // MAINTENANCE_TODO: Remove cast when @webgpu/types includes compatibilityMode
  const adapterOptions = getDefaultRequestAdapterOptions() as unknown as {
    compatibilityMode?: boolean;
  };
  const featureLevel = adapterOptions?.compatibilityMode ? 'compatibility' : 'core';
  const defaultLimits = getDefaultLimits(featureLevel);

  // Limits: walked in kLimits order, keeping only values that are specified _and_ non-default.
  const nonDefaultLimits: Record<string, number> = {};
  const requestedLimits = desc.requiredLimits;
  if (requestedLimits) {
    for (const limit of kLimits) {
      const wanted = requestedLimits[limit];
      const fallback = defaultLimits[limit].default;
      if (wanted !== undefined && wanted !== fallback) {
        nonDefaultLimits[limit] = wanted;
      }
    }
  }

  // Type ensures every field is carried through.
  const canonical: CanonicalDeviceDescriptor = {
    requiredFeatures: sortedUniqueFeatures,
    requiredLimits: nonDefaultLimits,
    defaultQueue: {},
  };
  return [canonical, JSON.stringify(canonical)];
}
/**
 * Reports whether `adapter` offers every feature listed in `descriptor.requiredFeatures`.
 * The default descriptor (`undefined`) requires nothing, so it is always supported.
 */
function supportsFeature(
  adapter: GPUAdapter,
  descriptor: CanonicalDeviceDescriptor | undefined
): boolean {
  if (descriptor === undefined) {
    return true;
  }
  return [...descriptor.requiredFeatures].every(feature => adapter.features.has(feature));
}
/**
* DeviceHolder has two states:
* - 'free': Free to be used for a new test.
* - 'acquired': In use by a running test.
*/
type DeviceHolderState = 'free' | 'acquired';
/**
 * Holds a GPUDevice and tracks its state (free/acquired) and handles device loss.
 */
class DeviceHolder implements DeviceProvider {
  /** Adapter the device was created from. Cannot be reused; just for adapter info. */
  readonly adapter: GPUAdapter;
  /** The device. Will be cleared during cleanup if there were unexpected errors. */
  private _device: GPUDevice | undefined;

  /** Whether the device is in use by a test or not. */
  state: DeviceHolderState = 'free';
  /** initially undefined; becomes set when the device is lost */
  lostInfo?: GPUDeviceLostInfo;
  /** Set if the device is expected to be lost. Only valid for the current test scope. */
  expectedLostReason?: GPUDeviceLostReason;

  /**
   * Gets a device and creates a DeviceHolder.
   * If the device is lost, `lostInfo` on the returned holder gets set.
   *
   * Throws FeaturesNotSupported if the adapter lacks any feature the (possibly modified)
   * descriptor requires.
   */
  static async create(
    recorder: TestCaseRecorder,
    descriptor: CanonicalDeviceDescriptor | undefined,
    descriptorModifier: DescriptorModifier | undefined
  ): Promise<DeviceHolder> {
    const gpu = getGPU(recorder);
    const adapter = await gpu.requestAdapter();
    assert(adapter !== null, 'requestAdapter returned null');
    // Let the caller adjust the descriptor now that the adapter's capabilities are known.
    if (descriptorModifier) {
      descriptor = descriptorModifier.descriptorModifier(adapter, descriptor);
    }
    if (!supportsFeature(adapter, descriptor)) {
      throw new FeaturesNotSupported('One or more features are not supported');
    }
    // No trackForCleanup because we plan to reuse the device for the next test.
    // eslint-disable-next-line no-restricted-syntax
    const device = await adapter.requestDevice(descriptor);
    assert(device !== null, 'requestDevice returned null');

    return new DeviceHolder(adapter, device);
  }

  private constructor(adapter: GPUAdapter, device: GPUDevice) {
    this.adapter = adapter;
    this._device = device;
    // Record the loss info whenever the device is lost.
    void this._device.lost.then(ev => {
      this.lostInfo = ev;
    });
  }

  /** The held device. Asserts if it was already dropped via `releaseGPUDevice()`. */
  get device(): GPUDevice {
    assert(this._device !== undefined);
    return this._device;
  }

  /**
   * Push error scopes that surround test execution.
   *
   * Also clears any device-loss expectation left over from a previous test. Without this, a
   * test that called `expectDeviceLost()` but did not actually lose the device would leave the
   * expectation on the pooled holder, and it could mask an unexpected loss in a later test.
   */
  beginTestScope(): void {
    assert(this.state === 'acquired');
    this.expectedLostReason = undefined;
    this.device.pushErrorScope('validation');
    this.device.pushErrorScope('internal');
    this.device.pushErrorScope('out-of-memory');
  }

  /** Mark the DeviceHolder as expecting a device loss when the test scope ends. */
  expectDeviceLost(reason: GPUDeviceLostReason): void {
    assert(this.state === 'acquired');
    this.expectedLostReason = reason;
  }

  /**
   * Attempt to end test scopes: Check that there are no extra error scopes, and that no
   * otherwise-uncaptured errors occurred during the test. Time out if it takes too long.
   */
  endTestScope(): Promise<void> {
    assert(this.state === 'acquired');
    const kTimeout = 5000;

    // Time out if attemptEndTestScope (popErrorScope or onSubmittedWorkDone) never completes. If
    // this rejects, the device won't be reused, so it's OK that popErrorScope calls may not have
    // finished.
    //
    // This could happen due to a browser bug - e.g.,
    // as of this writing, on Chrome GPU process crash, popErrorScope just hangs.
    return raceWithRejectOnTimeout(this.attemptEndTestScope(), kTimeout, 'endTestScope timed out');
  }

  /**
   * Pops the three scopes pushed by `beginTestScope()` and throws if any captured an error:
   * TestOOMedShouldAttemptGC for out-of-memory, TestFailedButDeviceReusable for internal or
   * validation errors. Also verifies that the test left no extra scope on the stack.
   */
  private async attemptEndTestScope(): Promise<void> {
    let gpuValidationError: GPUError | null;
    let gpuInternalError: GPUError | null;
    let gpuOutOfMemoryError: GPUError | null;

    // Submit to the queue to attempt to force a GPU flush.
    this.device.queue.submit([]);

    try {
      // May reject if the device was lost.
      // Scopes pop in reverse of the push order used in beginTestScope().
      [gpuOutOfMemoryError, gpuInternalError, gpuValidationError] = await Promise.all([
        this.device.popErrorScope(),
        this.device.popErrorScope(),
        this.device.popErrorScope(),
      ]);
    } catch (ex) {
      assert(this.lostInfo !== undefined, 'popErrorScope failed; did beginTestScope get missed?');
      throw ex;
    }

    // Attempt to wait for the queue to be idle.
    if (this.device.queue.onSubmittedWorkDone) {
      await this.device.queue.onSubmittedWorkDone();
    }

    // One more pop must reject: if it succeeds, the test pushed a scope it never popped.
    await assertReject('OperationError', this.device.popErrorScope(), {
      allowMissingStack: true,
      message: 'There was an extra error scope on the stack after a test',
    });

    if (gpuOutOfMemoryError !== null) {
      assert(gpuOutOfMemoryError instanceof GPUOutOfMemoryError);
      // Don't allow the device to be reused; unexpected OOM could break the device.
      throw new TestOOMedShouldAttemptGC('Unexpected out-of-memory error occurred');
    }
    if (gpuInternalError !== null) {
      assert(gpuInternalError instanceof GPUInternalError);
      // Allow the device to be reused.
      throw new TestFailedButDeviceReusable(
        `Unexpected internal error occurred: ${gpuInternalError.message}`
      );
    }
    if (gpuValidationError !== null) {
      assert(gpuValidationError instanceof GPUValidationError);
      // Allow the device to be reused.
      throw new TestFailedButDeviceReusable(
        `Unexpected validation error occurred: ${gpuValidationError.message}`
      );
    }
  }

  /**
   * Release the ref to the GPUDevice. This should be the only ref held by the DevicePool or
   * GPUTest, so in theory it can get garbage collected.
   */
  releaseGPUDevice(): void {
    this._device = undefined;
  }
}