in trainers/catex.py [0:0]
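# NOTE: this excerpt assumes the usual CoOp-style imports at the top of the
# module (not shown here): torch, torch.nn as nn, clip, and a BPE tokenizer
# instance, typically _tokenizer = SimpleTokenizer() from clip.simple_tokenizer.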
def __init__(self, cfg, classnames, clip_model):
    super().__init__()
    n_cls = len(classnames)
    n_ctx = cfg.TRAINER.CATEX.N_CTX
    ctx_init = cfg.TRAINER.CATEX.CTX_INIT
    dtype = clip_model.dtype
    ctx_dim = clip_model.ln_final.weight.shape[0]
    clip_imsize = clip_model.visual.input_resolution
    cfg_imsize = cfg.INPUT.SIZE[0]
    assert cfg_imsize == clip_imsize, f"cfg_imsize ({cfg_imsize}) must equal clip_imsize ({clip_imsize})"
    self.adjust_cls_prompt = False
    self.cfg = cfg
    ctx_common = None
    if ctx_init and "ensemble" not in ctx_init:
        # use the given words to initialize the context vectors
        ctx_init = ctx_init.replace("_", " ")
        n_ctx = len(ctx_init.split(" "))
        prompt = clip.tokenize(ctx_init)
        with torch.no_grad():
            embedding = clip_model.token_embedding(prompt).type(dtype)
        # skip the SOS token at position 0 and keep the n_ctx context embeddings
        ctx_vectors = embedding[0, 1 : 1 + n_ctx, :]
        prompt_prefix = ctx_init
    else:
        # random initialization
        if cfg.TRAINER.CATEX.CSC:
            print("Initializing class-specific contexts")
            ctx_vectors = torch.empty(n_cls, n_ctx, ctx_dim, dtype=dtype)
        else:
            print("Initializing a generic context")
            ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=dtype)
        nn.init.normal_(ctx_vectors, std=0.02)
        prompt_prefix = " ".join(["X"] * n_ctx)
    print(f'Initial context: "{prompt_prefix}"')
    print(f"Number of context words (tokens): {n_ctx}")
    self.ctx = nn.Parameter(ctx_vectors)  # to be optimized
    self.ctx_cm = nn.Parameter(ctx_common) if ctx_common is not None else None  # ctx_common is never assigned above, so ctx_cm stays None
    if cfg.TRAINER.OOD_PROMPT:
        # learnable class-specific out-of-distribution (OOD) context vectors
        if cfg.TRAINER.OOD_PROMPT_NUM > 1:
            self.ctx_ood = []
            for _ in range(cfg.TRAINER.OOD_PROMPT_NUM):
                ctx_ood = torch.empty(n_cls, n_ctx, ctx_dim, dtype=dtype)
                nn.init.normal_(ctx_ood, std=0.02)
                self.ctx_ood.append(nn.Parameter(ctx_ood))
            self.ctx_ood = nn.ParameterList(self.ctx_ood)
        else:  # TODO: keep compatible with pre-trained weights
            ctx_ood = torch.empty(n_cls, n_ctx, ctx_dim, dtype=dtype)
            nn.init.normal_(ctx_ood, std=0.02)
            self.ctx_ood = nn.Parameter(ctx_ood)
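    # Build the full prompt strings "<prefix> <classname>." for every class and
    # embed them once with CLIP's frozen token embedding; downstream (e.g. in
    # forward(), as in CoOp) the learnable ctx vectors are spliced between the
    # cached prefix and suffix embeddings.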
    classnames = [name.replace("_", " ") for name in classnames]
    name_lens = [len(_tokenizer.encode(name)) for name in classnames]
    prompts = [prompt_prefix + " " + name + "." for name in classnames]
    tokenized_prompts = torch.cat([clip.tokenize(p) for p in prompts])
    with torch.no_grad():
        embedding = clip_model.token_embedding(tokenized_prompts).type(dtype)
    # These token vectors are saved by save_model(), but they should be
    # ignored in load_model() because we want to use the ones computed
    # from the current class names
    self.register_buffer("token_prefix", embedding[:, :1, :])  # SOS
    if self.adjust_cls_prompt:
        self.token_suffix = nn.Parameter(embedding[:, 1 + n_ctx :, :])
    else:
        self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :])  # CLS, EOS
    self.n_cls = n_cls
    self.n_ctx = n_ctx
    self.tokenized_prompts = tokenized_prompts  # torch.Tensor
    self.name_lens = name_lens
    self.class_token_position = cfg.TRAINER.CATEX.CLASS_TOKEN_POSITION
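
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not from the original file). It assumes
# this __init__ belongs to an nn.Module called PromptLearner here, and that
# cfg is a yacs CfgNode exposing exactly the nodes read above; the class name
# and the cfg construction below are assumptions, not the repo's actual setup.
# ---------------------------------------------------------------------------
# import clip
# from yacs.config import CfgNode as CN
#
# cfg = CN()
# cfg.INPUT = CN()
# cfg.INPUT.SIZE = (224, 224)                      # must match CLIP's input resolution
# cfg.TRAINER = CN()
# cfg.TRAINER.OOD_PROMPT = False                   # skip the OOD context branch
# cfg.TRAINER.OOD_PROMPT_NUM = 1
# cfg.TRAINER.CATEX = CN()
# cfg.TRAINER.CATEX.N_CTX = 16
# cfg.TRAINER.CATEX.CTX_INIT = ""                  # empty -> random initialization
# cfg.TRAINER.CATEX.CSC = False                    # generic (shared) context
# cfg.TRAINER.CATEX.CLASS_TOKEN_POSITION = "end"
#
# clip_model, _ = clip.load("ViT-B/16", device="cpu")
# classnames = ["golden retriever", "tabby cat", "container ship"]
# learner = PromptLearner(cfg, classnames, clip_model.float())
#
# # With CSC disabled, one context is shared across all classes:
# print(learner.ctx.shape)                # (n_ctx, ctx_dim), e.g. (16, 512)
# print(learner.token_prefix.shape)       # (n_cls, 1, ctx_dim) -- SOS embeddings
# print(learner.tokenized_prompts.shape)  # (n_cls, 77)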