src/nanotron/generation/decode.py [355:433]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                        elif sampler_type == SamplerType.BASIC:
                            sampler = BasicSampler(pg=parallel_context.tp_pg)
                        else:
                            raise NotImplementedError(f"Sampler type {sampler_type} is not implemented")

                        new_decoder_input_ids = sampler(sharded_logits=sharded_logits[:, -1, :])
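                        # `new_decoder_input_ids` holds one freshly sampled token id per sequence in the micro-batch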

                        # TODO @thomasw21: Handle this correctly, i.e. from some point after <eos> this should only generate masked tokens
                        # TODO @thomasw21: Actually I can probably build this directly on the next device, which would save some communication
                        new_decoder_input_mask = torch.ones(
                            size=(new_decoder_input_ids.shape[0], 1),
                            dtype=torch.bool,
                            device=new_decoder_input_ids.device,
                        )

                        # TODO @thomasw21: We need to have a stop condition.

                        # send the new tokens and their mask to the rank that owns the decoder input
                        if decoder_input_rank == decoder_logit_rank:
                            # It's the same rank so no need to do anything too fancy
                            all_new_decoder_input_ids_and_mask_same_rank.append(
                                (new_decoder_input_ids, new_decoder_input_mask)
                            )
                        else:
                            pipeline_state.register_send_activation(
                                new_decoder_input_ids, to_rank=decoder_input_rank, p2p=p2p
                            )
                            pipeline_state.register_send_activation(
                                new_decoder_input_mask, to_rank=decoder_input_rank, p2p=p2p
                            )
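                            # Both sends were only registered above; unless `is_max_nb_microbatches`, flush them explicitly once the last buffered state (`state_id == number_states_in_buffer - 1`) has been processed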
                            if not is_max_nb_microbatches and state_id == number_states_in_buffer - 1:
                                # Send new_decoder_input_ids AND new_decoder_input_mask
                                pipeline_state.run_communication()
                                pipeline_state.run_communication()

                    else:
                        assert isinstance(sharded_logits, TensorPointer)

                all_new_decoder_input_ids_and_mask: Iterable[
                    Tuple[Union[torch.LongTensor, TensorPointer], Union[torch.BoolTensor, TensorPointer]]
                ]
                if is_decoder_input_rank:
                    # We receive the tensors from `decoder_logit_rank` unless `decoder_input_rank` == `decoder_logit_rank`, in which case `all_new_decoder_input_ids_and_mask_same_rank` is already populated.
                    if decoder_input_rank == decoder_logit_rank:
                        # Since `decoder_input_rank` and `decoder_logit_rank` are the same rank, no communication is needed: the new input ids and masks were already appended to the local list.
                        assert len(all_new_decoder_input_ids_and_mask_same_rank) == number_states_in_buffer
                        all_new_decoder_input_ids_and_mask = all_new_decoder_input_ids_and_mask_same_rank
                    else:

                        def generator():
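                            # Lazily receive, for each buffered state, the two tensors sent by `decoder_logit_rank`: the new input ids first, then the matching mask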
                            for _ in range(number_states_in_buffer):
                                pipeline_state.register_recv_activation(from_rank=decoder_logit_rank, p2p=p2p)
                                pipeline_state.register_recv_activation(from_rank=decoder_logit_rank, p2p=p2p)
                                while len(pipeline_state.activations_buffer) < 2:
                                    pipeline_state.run_communication()
                                new_decoder_input_ids = pipeline_state.activations_buffer.popleft()
                                new_decoder_input_mask = pipeline_state.activations_buffer.popleft()
                                yield new_decoder_input_ids, new_decoder_input_mask

                        all_new_decoder_input_ids_and_mask = iter(generator())
                else:
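                    # Ranks that do not own the decoder input only carry placeholders pointing at `decoder_input_rank`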
                    all_new_decoder_input_ids_and_mask = (
                        (TensorPointer(group_rank=decoder_input_rank), TensorPointer(group_rank=decoder_input_rank))
                        for _ in range(number_states_in_buffer)
                    )

                # Create new decoder states
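                # Each new state reuses the existing store and appends the freshly sampled ids and mask to the running generation buffers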
                decoder_states = (
                    GenerationStates(
                        new_input_ids=new_decoder_input_ids_and_mask[0],
                        new_input_mask=new_decoder_input_ids_and_mask[1],
                        store=state.store,
                        generation_ids=state.generation_ids + [new_decoder_input_ids_and_mask[0]],
                        generation_mask=state.generation_mask + [new_decoder_input_ids_and_mask[1]],
                    )
                    for state, new_decoder_input_ids_and_mask in zip(
                        new_decoder_states, all_new_decoder_input_ids_and_mask
                    )
                )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



src/nanotron/generation/decode.py [673:751]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                        elif sampler_type == SamplerType.BASIC:
                            sampler = BasicSampler(pg=parallel_context.tp_pg)
                        else:
                            raise NotImplementedError(f"Sampler type {sampler_type} is not implemented")

                        new_decoder_input_ids = sampler(sharded_logits=sharded_logits[:, -1, :])
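                        # `new_decoder_input_ids` holds one freshly sampled token id per sequence in the micro-batch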

                        # TODO @thomasw21: Handle this correctly, i.e. from some point after <eos> this should only generate masked tokens
                        # TODO @thomasw21: Actually I can probably build this directly on the next device, which would save some communication
                        new_decoder_input_mask = torch.ones(
                            size=(new_decoder_input_ids.shape[0], 1),
                            dtype=torch.bool,
                            device=new_decoder_input_ids.device,
                        )

                        # TODO @thomasw21: We need to have a stop condition.

                        # send the new tokens and their mask to the rank that owns the decoder input
                        if decoder_input_rank == decoder_logit_rank:
                            # It's the same rank so no need to do anything too fancy
                            all_new_decoder_input_ids_and_mask_same_rank.append(
                                (new_decoder_input_ids, new_decoder_input_mask)
                            )
                        else:
                            pipeline_state.register_send_activation(
                                new_decoder_input_ids, to_rank=decoder_input_rank, p2p=p2p
                            )
                            pipeline_state.register_send_activation(
                                new_decoder_input_mask, to_rank=decoder_input_rank, p2p=p2p
                            )
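                            # Both sends were only registered above; unless `is_max_nb_microbatches`, flush them explicitly once the last buffered state (`state_id == number_states_in_buffer - 1`) has been processed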
                            if not is_max_nb_microbatches and state_id == number_states_in_buffer - 1:
                                # Send new_decoder_input_ids AND new_decoder_input_mask
                                pipeline_state.run_communication()
                                pipeline_state.run_communication()

                    else:
                        assert isinstance(sharded_logits, TensorPointer)

                all_new_decoder_input_ids_and_mask: Iterable[
                    Tuple[Union[torch.LongTensor, TensorPointer], Union[torch.BoolTensor, TensorPointer]]
                ]
                if is_decoder_input_rank:
                    # We receive the tensors from `decoder_logit_rank` unless `decoder_input_rank` == `decoder_logit_rank`, in which case `all_new_decoder_input_ids_and_mask_same_rank` is already populated.
                    if decoder_input_rank == decoder_logit_rank:
                        # Since `decoder_input_rank` and `decoder_logit_rank` are the same rank, no communication is needed: the new input ids and masks were already appended to the local list.
                        assert len(all_new_decoder_input_ids_and_mask_same_rank) == number_states_in_buffer
                        all_new_decoder_input_ids_and_mask = all_new_decoder_input_ids_and_mask_same_rank
                    else:

                        def generator():
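                            # Lazily receive, for each buffered state, the two tensors sent by `decoder_logit_rank`: the new input ids first, then the matching mask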
                            for _ in range(number_states_in_buffer):
                                pipeline_state.register_recv_activation(from_rank=decoder_logit_rank, p2p=p2p)
                                pipeline_state.register_recv_activation(from_rank=decoder_logit_rank, p2p=p2p)
                                while len(pipeline_state.activations_buffer) < 2:
                                    pipeline_state.run_communication()
                                new_decoder_input_ids = pipeline_state.activations_buffer.popleft()
                                new_decoder_input_mask = pipeline_state.activations_buffer.popleft()
                                yield new_decoder_input_ids, new_decoder_input_mask

                        all_new_decoder_input_ids_and_mask = iter(generator())
                else:
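                    # Ranks that do not own the decoder input only carry placeholders pointing at `decoder_input_rank`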
                    all_new_decoder_input_ids_and_mask = (
                        (TensorPointer(group_rank=decoder_input_rank), TensorPointer(group_rank=decoder_input_rank))
                        for _ in range(number_states_in_buffer)
                    )

                # Create new decoder states
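                # Each new state reuses the existing store and appends the freshly sampled ids and mask to the running generation buffers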
                decoder_states = (
                    GenerationStates(
                        new_input_ids=new_decoder_input_ids_and_mask[0],
                        new_input_mask=new_decoder_input_ids_and_mask[1],
                        store=state.store,
                        generation_ids=state.generation_ids + [new_decoder_input_ids_and_mask[0]],
                        generation_mask=state.generation_mask + [new_decoder_input_ids_and_mask[1]],
                    )
                    for state, new_decoder_input_ids_and_mask in zip(
                        new_decoder_states, all_new_decoder_input_ids_and_mask
                    )
                )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



