# in extract.py [0:0]
def main(args):
    """Run the model over a FASTA file and save the requested outputs per sequence.

    Loads the model and alphabet from ``args.model_location``, batches the
    sequences of ``args.fasta_file``, runs the model under ``torch.no_grad()``
    and writes one ``<args.output_dir>/<label>.pt`` file per sequence with
    ``torch.save``.

    Attributes read from ``args``:
        model_location: passed to ``pretrained.load_model_and_alphabet``.
        fasta_file: passed to ``FastaBatchedDataset.from_file``.
        toks_per_batch: passed to ``dataset.get_batch_indices``.
        output_dir: path-like object supporting ``/`` and ``mkdir``; created
            if it does not exist.
        include: container of output names. The names checked here are
            ``"per_tok"``, ``"mean"``, ``"bos"`` and ``"contacts"``.
        repr_layers: iterable of integer layer indices. Negative indices are
            wrapped, so ``-1`` maps to ``model.num_layers``.
        truncate: if truthy, each token batch is cut to its first 1022
            positions before the forward pass.
        nogpu: if truthy, CUDA is not used even when it is available.

    Side effects:
        ``args.output_file`` is overwritten with the path of each file as it
        is written (kept from the original implementation).

    Raises:
        ValueError: if an entry of ``args.repr_layers`` lies outside
            ``[-(model.num_layers + 1), model.num_layers]``.
    """
    model, alphabet = pretrained.load_model_and_alphabet(args.model_location)
    model.eval()

    # Decide the device once instead of re-querying CUDA for every batch.
    use_gpu = torch.cuda.is_available() and not args.nogpu
    if use_gpu:
        model = model.cuda()
        print("Transferred model to GPU")

    dataset = FastaBatchedDataset.from_file(args.fasta_file)
    batches = dataset.get_batch_indices(args.toks_per_batch, extra_toks_per_seq=1)
    data_loader = torch.utils.data.DataLoader(
        dataset, collate_fn=alphabet.get_batch_converter(), batch_sampler=batches
    )
    print(f"Read {args.fasta_file} with {len(dataset)} sequences")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # The requested outputs do not change between sequences; test them once.
    include = args.include
    return_contacts = "contacts" in include
    want_per_tok = "per_tok" in include
    want_mean = "mean" in include
    want_bos = "bos" in include

    # Validate with an explicit exception rather than `assert`: asserts are
    # stripped under `python -O`, and the modulo below would then silently map
    # an out-of-range index onto a different layer.
    num_layers = model.num_layers
    bad_layers = [i for i in args.repr_layers if not -(num_layers + 1) <= i <= num_layers]
    if bad_layers:
        raise ValueError(
            f"repr_layers entries {bad_layers} are outside the valid range "
            f"[{-(num_layers + 1)}, {num_layers}]"
        )
    repr_layers = [(i + num_layers + 1) % (num_layers + 1) for i in args.repr_layers]

    # Number of token positions kept per sequence when args.truncate is set.
    max_tokens = 1022

    with torch.no_grad():
        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
            print(
                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
            )
            if use_gpu:
                toks = toks.to(device="cuda", non_blocking=True)

            # The model is trained on truncated sequences and passing longer ones in at
            # inference will cause an error. See https://github.com/facebookresearch/esm/issues/21
            if args.truncate:
                toks = toks[:, :max_tokens]

            out = model(toks, repr_layers=repr_layers, return_contacts=return_contacts)
            # Only the tensors that may be saved are copied back to the CPU.
            representations = {
                layer: t.to(device="cpu") for layer, t in out["representations"].items()
            }
            contacts = out["contacts"].to(device="cpu") if return_contacts else None

            for i, label in enumerate(labels):
                output_file = args.output_dir / f"{label}.pt"
                # Kept for backward compatibility: the original stored the path on args.
                args.output_file = output_file
                # A label may contain path separators, so its parent may not exist yet.
                output_file.parent.mkdir(parents=True, exist_ok=True)

                seq_len = len(strs[i])
                result = {"label": label}
                # Call clone on tensors to ensure tensors are not views into a larger representation
                # See https://github.com/pytorch/pytorch/issues/1995
                # Position 0 is stored separately under "bos"; the sequence's own
                # positions are 1..seq_len (slices past the tensor end are clamped).
                if want_per_tok:
                    result["representations"] = {
                        layer: t[i, 1 : seq_len + 1].clone()
                        for layer, t in representations.items()
                    }
                if want_mean:
                    result["mean_representations"] = {
                        layer: t[i, 1 : seq_len + 1].mean(0).clone()
                        for layer, t in representations.items()
                    }
                if want_bos:
                    result["bos_representations"] = {
                        layer: t[i, 0].clone() for layer, t in representations.items()
                    }
                if return_contacts:
                    result["contacts"] = contacts[i, :seq_len, :seq_len].clone()

                torch.save(result, output_file)