in torchbiggraph/train_gpu.py [0:0]
def run(self) -> None:
    """Entry point of the GPU worker subprocess.

    Performs one-time setup (CUDA device selection, optional user init
    hook, page-locking the shared embedding buffers), then serves
    training jobs received over ``self.worker_endpoint`` until the
    parent closes its end of the pipe.
    """
    # Each worker's heavy lifting happens on the GPU; a single intra-op
    # CPU thread avoids oversubscribing cores across many subprocesses.
    torch.set_num_threads(1)
    torch.cuda.set_device(self.my_device)
    if self.subprocess_init is not None:
        self.subprocess_init()
    # Close our inherited copy of the master end of the pipe so that
    # EOF is delivered on worker_endpoint when the parent goes away.
    self.master_endpoint.close()

    # Page-lock (pin) the shared embedding storages in place via
    # cudaHostRegister so host<->device transfers can use the fast
    # pinned-memory path. The cudart handle is loop-invariant, so look
    # it up once instead of once per storage (was inside the loop).
    cudart = torch.cuda.cudart()
    for s in self.embedding_storage_freelist:
        # Must already be in shared memory (visible to all processes).
        assert s.is_shared()
        res = cudart.cudaHostRegister(
            s.data_ptr(), s.size() * s.element_size(), 0
        )
        torch.cuda.check_error(res)
        assert s.is_pinned()
    logger.info(f"GPU subprocess {self.gpu_idx} up and running")

    # Serve jobs until the parent closes its end of the pipe, which
    # surfaces here as EOFError on recv().
    while True:
        try:
            job: SubprocessArgs = self.worker_endpoint.recv()
        except EOFError:
            break

        stats = self.do_one_job(
            lhs_types=job.lhs_types,
            rhs_types=job.rhs_types,
            lhs_part=job.lhs_part,
            rhs_part=job.rhs_part,
            lhs_subpart=job.lhs_subpart,
            rhs_subpart=job.rhs_subpart,
            next_lhs_subpart=job.next_lhs_subpart,
            next_rhs_subpart=job.next_rhs_subpart,
            model=job.model,
            trainer=job.trainer,
            all_embs=job.all_embs,
            subpart_slices=job.subpart_slices,
            subbuckets=job.subbuckets,
            batch_size=job.batch_size,
            lr=job.lr,
        )

        # Report the bucket's stats back to the parent, tagged with our
        # GPU index so it knows which worker finished.
        self.worker_endpoint.send(
            SubprocessReturn(gpu_idx=self.gpu_idx, stats=stats)
        )