maga_transformer/cpp/kernels/layernorm_fp8_kernels.cu (20 lines):
- line 400: __syncthreads(); // TODO check where should we put sync
- line 403: __syncthreads(); // TODO check where should we put sync
- line 405: __syncthreads(); // TODO check where should we put sync
- line 491: __syncthreads(); // TODO check where should we put sync
- line 494: __syncthreads(); // TODO check where should we put sync
- line 496: __syncthreads(); // TODO check where should we put sync
- line 513: // TODO: implement T2 = half2
- line 560: __syncthreads(); // TODO check where should we put sync
- line 563: __syncthreads(); // TODO check where should we put sync
- line 565: __syncthreads(); // TODO check where should we put sync
- line 588: // TODO: implement T2 = half2
- line 630: __syncthreads(); // TODO check where should we put sync
- line 633: __syncthreads(); // TODO check where should we put sync
- line 635: __syncthreads(); // TODO check where should we put sync
- line 772: param.normed_output[blockIdx.x * param.n + i] = (T1)local_out; // TODO This conversion has bug
- line 776: __syncthreads(); // TODO check where should we put sync
- line 851: __syncthreads(); // TODO check where should we put sync
- line 977: // TODO (bhsueh) check the condition here
- line 1039: __syncthreads(); // TODO check where should we put sync
- line 1102: __syncthreads(); // TODO check where should we put sync

maga_transformer/cpp/cuda/cublas/cublasFP8MMWrapper.cc (9 lines):
- line 171: // TODO: Check that do we need to set these attributes
- line 172: // TODO: comment them for compiler first
- line 385: // TODO: Check that do we need to set these attributes
- line 386: // TODO: comment them for compiler first
- line 652: // TODO: Check that do we need to set these attributes
- line 653: // TODO: comment them for compiler first
- line 816: // TODO: Check that do we need to set these attributes
- line 817: // TODO: comment them for compiler first
- line 857: // (TODO Hongbinl)Not sure if the implementation makes sense

maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_kernels.inl (5 lines):
- line 71: // TODO Could linear search be better for small # experts
- line 383: // TODO For some reason Volta fails on GELU_taylor here with Warp Illegal Instruction.
- line 414: // TODO this is almost certainly faster as a linear scan
- line 611: // TODO Some of this setup could be cached
- line 898: // TODO: when bias_is_broadcast is false, fuse bias to gemm

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fused_moe_kernel_routine.cuh (4 lines):
- line 49: = params.ptr_fc1 + (2 * problem_index + 1) * N1 * K1; // TODO: we only focus on gated activation..
- line 51: = params.ptr_fc1 + 2 * problem_index * N1 * K1; // TODO: we only focus on gated activation..
- line 416: // (4.3) copy rf result to smem (TODO: maybe use forloop for better performance..)
- line 770: // (4.3) copy rf result to smem (TODO: maybe use forloop for better performance..)

maga_transformer/cpp/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.h (4 lines):
- line 1393: // TODO: with cyclic kv cache, we set it 0 for now (will optimize in the future)
- line 1582: // TODO: Use a better way to convert from T to float.
- line 1765: // TODO: Use a better way to convert from T to float.
- line 1882: // TODO: Use a better way to convert from T to float.
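Most of the layernorm_fp8_kernels.cu entries are the same recurring question about __syncthreads() placement. For reference, here is a minimal block-per-row layernorm sketch (plain fp32, hypothetical names, not the repository's FP8 kernel) showing where barriers are actually required: once inside the block reduction so warp partial sums become visible, and once after each broadcast value is written to shared memory, before any thread reads it.

```cuda
// Hypothetical sketch, not the repository's FP8 kernel: a block-per-row
// layernorm in plain fp32, with the barrier placement the TODOs ask about.
#include <cuda_runtime.h>

__inline__ __device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_xor_sync(0xffffffffu, val, offset);
    return val;
}

__inline__ __device__ float blockReduceSum(float val) {
    __shared__ float warp_sums[32];
    const int lane = threadIdx.x & 31;
    const int wid  = threadIdx.x >> 5;
    val = warpReduceSum(val);
    if (lane == 0) warp_sums[wid] = val;
    __syncthreads();  // needed: warp_sums must be visible before warp 0 reads it
    const int nwarps = (blockDim.x + 31) >> 5;
    val = (threadIdx.x < nwarps) ? warp_sums[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);
    return val;  // full sum valid in warp 0; broadcast via shared memory below
}

// One block per row; e.g. layernorm_rows<<<rows, 256>>>(in, out, n, 1e-6f);
__global__ void layernorm_rows(const float* __restrict__ in,
                               float* __restrict__ out, int n, float eps) {
    __shared__ float s_mean, s_inv_std;
    const float* row = in + (size_t)blockIdx.x * n;

    float sum = 0.0f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) sum += row[i];
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0) s_mean = sum / n;
    __syncthreads();  // needed: every thread reads s_mean next

    float var = 0.0f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        const float d = row[i] - s_mean;
        var += d * d;
    }
    var = blockReduceSum(var);
    if (threadIdx.x == 0) s_inv_std = rsqrtf(var / n + eps);
    __syncthreads();  // needed: every thread reads s_inv_std next

    for (int i = threadIdx.x; i < n; i += blockDim.x)
        out[(size_t)blockIdx.x * n + i] = (row[i] - s_mean) * s_inv_std;
}
```

Barriers anywhere else in this pattern (for example between two independent per-thread loops) are redundant, which is likely what those TODOs should resolve to.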
maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_kernels.cu (4 lines):
- line 767: // TODO we need to update the sparseMixerMask() function to mask all previous experts instead of just the most
- line 844: // TODO: fix DeviceRadixSort
- line 1269: // TODO Needs updated when gather/finalize fusion is integrated
- line 1295: // TODO Make quant 2 & 4 bigger for FP8 if we ever change to scaling per expert

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fused_moe_kernel.cuh (4 lines):
- line 137: // TODO: use cuda core gemm here
- line 149: // TODO: use cuda core gemm here
- line 155: // TODO: use cuda core gemm here
- line 160: // TODO: use cuda core gemm here

maga_transformer/cpp/devices/cuda_impl/CudaFfnLayer.cc (3 lines):
- line 195: // TODO: can use torch all gather unequal size to avoid copy
- line 206: // TODO: why this assertion?
- line 383: // TODO group_size

maga_transformer/openai/renderers/qwen_agent/llm/base.py (3 lines):
- line 114: raise NotImplementedError('Not implemented function_choice="none" yet.') # TODO:
- line 221: # TODO: Postprocessing may be incorrect if delta_stream=True.
- line 222: # TODO: Early break if truncated at stop words.

maga_transformer/models/minicpmv/resampler.py (3 lines):
- line 489: # TODO finish disentangling control flow so we don't do in-projections when statics are passed
- line 498: # TODO finish disentangling control flow so we don't do in-projections when statics are passed
- line 541: assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"

maga_transformer/cpp/kernels/no_aux_tc_kernels.cu (2 lines):
- line 534: // TODO: norm_node
- line 667: //@TODO: check if this default strategy is acceptable. Might need to leave it as nan array.

maga_transformer/cpp/kernels/sampling_penalty_kernels.cu (2 lines):
- line 25: // TODO Add half2 implementation
- line 126: // TODO: Add macro or device function to get MAX_T_VAL.

maga_transformer/cpp/api_server/HttpApiServer.cc (2 lines):
- line 26: // TODO: queueSize may interleave with controller :(
- line 414: // TODO:

maga_transformer/models/minicpmv/modeling_navit_siglip.py (2 lines):
- line 479: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 547: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.

maga_transformer/cpp/models/GptModel.cc (2 lines):
- line 289: // TODO: design better split strategy that consider the computational workload of each request
- line 830: // TODO: maybe move this layernorm to ffn layer

maga_transformer/cpp/rocm/rocmFmhaWrapper.cc (2 lines):
- line 70: scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
- line 103: mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); // TODO: we don't need x/y anymore
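The "Add half2 implementation" notes (sampling_penalty_kernels.cu above, gpt_kernels.cu below) refer to the usual fp16 vectorization: pack two halves into a half2 so each instruction processes two values. A minimal sketch, assuming a temperature-style scaling of fp16 logits; the kernel name and signature are hypothetical, only the half2 intrinsics are real CUDA API.

```cuda
// Hypothetical sketch of the half2 vectorization those TODOs ask for; the
// kernel name and the temperature use case are illustrative assumptions.
#include <cuda_fp16.h>

// Scales 2*n2 fp16 logits by inv_temperature, two values per instruction.
// The caller reinterprets an even-length half buffer as n2 half2 elements.
__global__ void apply_temperature_half2(half2* __restrict__ logits,
                                        int n2, float inv_temperature) {
    const half2 scale = __float2half2_rn(inv_temperature);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n2;
         i += gridDim.x * blockDim.x) {
        logits[i] = __hmul2(logits[i], scale);  // paired fp16 multiply
    }
}
```

An odd trailing element would need a scalar epilogue, which is most of the extra code such a change actually requires.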
maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_fp8_kernels.cu (2 lines):
- line 210: // // TODO this is almost certainly faster as a linear scan
- line 302: // // TODO this is almost certainly faster as a linear scan

maga_transformer/cpp/devices/rocm_impl/ROCmFfnLayer.cc (2 lines):
- line 36: // TODO: cuda version also not init this
- line 42: // TODO group_size

maga_transformer/cpp/devices/cuda_impl/CudaOps.cc (2 lines):
- line 131: // TODO: change this to use efficient cuda kernel
- line 441: // TODO: see if we should add overflow protection for offset

maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl (2 lines):
- line 74: // TODO Update once mixed input support is added
- line 161: // TODO Add mode for fused activation once CUTLASS adds support

maga_transformer/cpp/kernels/sampling_topk_kernels.cu (1 line):
- line 292: //@miji TODO: use block sum to make it faster

maga_transformer/cpp/devices/arm_impl/ArmDevice.cc (1 line):
- line 121: #define MAX_PRE_CALC_SEQ_LEN 1024 // TODO: get it from model config

maga_transformer/cpp/devices/arm_impl/ArmGemmKaiOp.cc (1 line):
- line 283: /* TODO

maga_transformer/cpp/trt_plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp (1 line):
- line 49: // TODO: add bf16 support

maga_transformer/openai/api_datatype.py (1 line):
- line 82: # TODO: maybe also implement Qwen Style function call.

maga_transformer/cpp/kernels/moe_topKSoftmax_kernels.cu (1 line):
- line 559: // TODO Could linear search be better for small # experts

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h (1 line):
- line 61: // TODO - Switch this to column major for weights since gemms should be more performant.

maga_transformer/cpp/devices/arm_impl/gemm_opt/ArmGemmKernel.h (1 line):
- line 219: p.do_act = 0; // TODO: change back to 1

maga_transformer/cpp/stream/GenerateStream.cc (1 line):
- line 158: // TODO: maybe need fix when context and reuse

maga_transformer/models/base_model.py (1 line):
- line 57: tokenizer: Any = None # TODO: remove this

maga_transformer/cpp/cuda/cublas/cublasMMWrapper.cc (1 line):
- line 285: // TODO: default cublas libs

maga_transformer/models/downstream_modules/classifier/classifier.py (1 line):
- line 54: #TODO test it

maga_transformer/cpp/rocm/quantizePreprocessors.cc (1 line):
- line 515: // TODO : feifei

maga_transformer/cpp/cutlass/cutlass_kernels/weightOnlyBatchedGemv/kernelDispatcher.h (1 line):
- line 27: // TODO:

maga_transformer/cpp/cutlass/cutlass_kernels/int8_gemm/int8_gemm_template.h (1 line):
- line 145: // TODO: handle that

maga_transformer/cpp/kernels/gpt_kernels.cu (1 line):
- line 435: // TODO Add half2 implementation

maga_transformer/cpp/disaggregate/rtpllm_master/tokenize/RemoteTokenizeModule.cpp (1 line):
- line 21: //TODO: request timeout

maga_transformer/cpp/devices/cuda_impl/CudaDeepEPFfnLayer.cc (1 line):
- line 24: // TODO: check if get right

maga_transformer/cpp/kernels/unfused_attention_kernels.cu (1 line):
- line 1220: // TODO: optimize this kernel?
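Several MoE entries (moe_kernels.inl line 71, moe_fp8_kernels.cu lines 210 and 302, moe_topKSoftmax_kernels.cu line 559) weigh a linear scan against a binary search for mapping a permuted row back to its expert. A hedged sketch of both options over a prefix-sum array follows; the array name and layout are assumptions, not the repository's actual data structure.

```cuda
// Hypothetical sketch of both strategies; first_row_of_expert is an assumed
// prefix-sum layout (entry e = first permuted row owned by expert e,
// entry 0 = 0), not the repository's actual data structure.
#include <cstdint>

// Largest e with first_row_of_expert[e] <= row, via binary search.
__device__ int findExpertBinary(const int64_t* first_row_of_expert,
                                int num_experts, int64_t row) {
    int lo = 0, hi = num_experts;  // invariant: first_row_of_expert[lo] <= row
    while (lo + 1 < hi) {
        const int mid = (lo + hi) / 2;
        if (first_row_of_expert[mid] <= row) lo = mid;
        else                                 hi = mid;
    }
    return lo;
}

// Same answer via a linear count; each iteration is a predicated add,
// so threads in a warp never diverge.
__device__ int findExpertLinear(const int64_t* first_row_of_expert,
                                int num_experts, int64_t row) {
    int expert = 0;
    for (int e = 1; e < num_experts; ++e)
        expert += (first_row_of_expert[e] <= row);
    return expert;
}
```

For a handful of experts the branch-free linear count typically wins, which is the intuition behind those TODOs; the binary search only pays off as the expert count grows.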
maga_transformer/cpp/cuda/cufmha/fmha_profiling_interface.h (1 line):
- line 153: // TODO: add print log

maga_transformer/cpp/kernels/activation_fp8_kernels.cu (1 line):
- line 266: // // TODO FP8

maga_transformer/cpp/devices/arm_impl/gemm_opt/ArmGemmThreadblock.cc (1 line):
- line 184: float* bias_ptr = p.bias_ptr + n; // TODO: handle float16_t bias

maga_transformer/cpp/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention.h (1 line):
- line 118: // The current timestep. TODO Check that do we only this param in cross attention?

maga_transformer/cpp/disaggregate/cache_store/MessagerClient.cpp (1 line):
- line 84: request_block_buffer, timeout_ms - 10, partition_count, partition_id); // TODO: 10 is message transfer time

maga_transformer/model_factory.py (1 line):
- line 95: #TODO: remove model_config, get all info from gpt_config

maga_transformer/cpp/rocm/rocmMoeWrapper.cc (1 line):
- line 60: // TODO: temporarily fixed scale type

maga_transformer/cpp/schedulers/FIFOScheduler.h (1 line):
- line 69: // TODO @wangyin support different beams run togather

maga_transformer/cpp/devices/Weights.h (1 line):
- line 142: // TODO: This Weights class might be refactor into a complete model description

maga_transformer/cpp/kernels/decoder_masked_multihead_attention_utils.h (1 line):
- line 3422: // FIXME:

maga_transformer/cpp/kernels/triton/aot_triton_kernel.bzl (1 line):
- line 213: # TODO: get value from --python_top=//:python310

maga_transformer/cpp/normal_engine/NormalGenerateStream.cc (1 line):
- line 162: //TODO: move it to better position

maga_transformer/cpp/cuda/reduce_kernel_utils.cuh (1 line):
- line 286: // TODO Add implementation here

maga_transformer/openai/renderers/qwen_agent/llm/qwen_dashscope.py (1 line):
- line 141: raise NotImplementedError('Not implemented function_choice="none" yet.') # TODO:

maga_transformer/cpp/api_server/TokenProcessor.h (1 line):
- line 37: // TODO: change to tokenizer wrapper

maga_transformer/cpp/devices/cuda_impl/CudaGemmOp.cc (1 line):
- line 164: // TODO: support it in ppu

maga_transformer/server/frontend_worker.py (1 line):
- line 97: #TODO temp fix sp with batch infer, will change request_id to str later

maga_transformer/cpp/kernels/unfused_attention_fp8_kernels.cu (1 line):
- line 1004: // TODO: bfloat162 computation ?

maga_transformer/cpp/devices/cuda_impl/CudaAttentionOp.cc (1 line):
- line 279: // TODO: refactor QBuffer to suppport view and return QBuffer

maga_transformer/cpp/disaggregate/rtpllm_master/cluster/PrefillLoadBalancer.cpp (1 line):
- line 111: //TODO: what happens if worker get error time? maybe use timedelta is better

maga_transformer/cpp/devices/cuda_impl/DeepEPDefs.h (1 line):
- line 192: // TODO: check which count will be ok

maga_transformer/cpp/trt_plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp (1 line):
- line 121: int const num_not_finished = num_rows; // TODO Take this as an input

maga_transformer/cpp/position_ids_generator/PositionIdsGenerator.h (1 line):
- line 23: // TODO: not same -> implement different interface here and BatchStreamProcessor

maga_transformer/cpp/api_server/InferenceService.cc (1 line):
- line 64: // TODO: adapter_name

maga_transformer/device/device_impl.py (1 line):
- line 316: # TODO: need add device infomation for selection

maga_transformer/cpp/devices/cuda_impl/CudaDeepEPLLFfnLayer.cc (1 line):
- line 62: // TODO: deep_ep_output might should be removed from output objects.
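The MessagerClient.cpp entry above flags a magic number: 10 ms is subtracted from the remaining timeout to budget for message transfer. A minimal sketch of the cleanup that TODO implies, naming the constant and clamping so the remaining budget never goes negative; the constant value, helper name, and clamping behavior are all assumptions, not the actual code.

```cpp
// Hypothetical sketch of the cleanup the TODO implies; the constant value,
// helper name, and clamping behavior are assumptions.
#include <algorithm>
#include <cstdint>

// The hard-coded "10" from the TODO, given a name and a single definition.
constexpr int64_t kMessageTransferMs = 10;

// Budget left for the remote side after reserving transfer overhead,
// clamped so a nearly expired deadline never becomes a negative timeout.
inline int64_t remainingTimeoutMs(int64_t timeout_ms) {
    return std::max<int64_t>(0, timeout_ms - kMessageTransferMs);
}
```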