maga_transformer/cpp/kernels/layernorm_fp8_kernels.cu (20 lines):
- line 400: __syncthreads(); // TODO check where should we put sync
- line 403: __syncthreads(); // TODO check where should we put sync
- line 405: __syncthreads(); // TODO check where should we put sync
- line 491: __syncthreads(); // TODO check where should we put sync
- line 494: __syncthreads(); // TODO check where should we put sync
- line 496: __syncthreads(); // TODO check where should we put sync
- line 513: // TODO: implement T2 = half2
- line 560: __syncthreads(); // TODO check where should we put sync
- line 563: __syncthreads(); // TODO check where should we put sync
- line 565: __syncthreads(); // TODO check where should we put sync
- line 588: // TODO: implement T2 = half2
- line 630: __syncthreads(); // TODO check where should we put sync
- line 633: __syncthreads(); // TODO check where should we put sync
- line 635: __syncthreads(); // TODO check where should we put sync
- line 772: param.normed_output[blockIdx.x * param.n + i] = (T1)local_out; // TODO This conversion has bug
- line 776: __syncthreads(); // TODO check where should we put sync
- line 851: __syncthreads(); // TODO check where should we put sync
- line 977: // TODO (bhsueh) check the condition here
- line 1039: __syncthreads(); // TODO check where should we put sync
- line 1102: __syncthreads(); // TODO check where should we put sync

maga_transformer/cpp/cuda/cublas/cublasFP8MMWrapper.cc (9 lines):
- line 171: // TODO: Check that do we need to set these attributes
- line 172: // TODO: comment them for compiler first
- line 385: // TODO: Check that do we need to set these attributes
- line 386: // TODO: comment them for compiler first
- line 652: // TODO: Check that do we need to set these attributes
- line 653: // TODO: comment them for compiler first
- line 816: // TODO: Check that do we need to set these attributes
- line 817: // TODO: comment them for compiler first
- line 857: // (TODO Hongbinl)Not sure if the implementation makes sense

maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_kernels.inl (5 lines):
- line 71: // TODO Could linear search be better for small # experts
- line 383: // TODO For some reason Volta fails on GELU_taylor here with Warp Illegal Instruction.
- line 414: // TODO this is almost certainly faster as a linear scan
- line 611: // TODO Some of this setup could be cached
- line 898: // TODO: when bias_is_broadcast is false, fuse bias to gemm

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fused_moe_kernel_routine.cuh (4 lines):
- line 49: = params.ptr_fc1 + (2 * problem_index + 1) * N1 * K1; // TODO: we only focus on gated activation..
- line 51: = params.ptr_fc1 + 2 * problem_index * N1 * K1; // TODO: we only focus on gated activation..
- line 416: // (4.3) copy rf result to smem (TODO: maybe use forloop for better performance..)
- line 770: // (4.3) copy rf result to smem (TODO: maybe use forloop for better performance..)

maga_transformer/cpp/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.h (4 lines):
- line 1393: // TODO: with cyclic kv cache, we set it 0 for now (will optimize in the future)
- line 1582: // TODO: Use a better way to convert from T to float.
- line 1765: // TODO: Use a better way to convert from T to float.
- line 1882: // TODO: Use a better way to convert from T to float.
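Most of the layernorm_fp8_kernels.cu entries are the same recurring question about __syncthreads() placement. For reference, here is a minimal block-per-row layernorm sketch (plain fp32, hypothetical names, not the repository's FP8 kernel) showing where barriers are actually required: once inside the block reduction so warp partial sums become visible, and once after each broadcast value is written to shared memory, before any thread reads it.

```cuda
// Hypothetical sketch, not the repository's FP8 kernel: a block-per-row
// layernorm in plain fp32, with the barrier placement the TODOs ask about.
#include <cuda_runtime.h>

__inline__ __device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_xor_sync(0xffffffffu, val, offset);
    return val;
}

__inline__ __device__ float blockReduceSum(float val) {
    __shared__ float warp_sums[32];
    const int lane = threadIdx.x & 31;
    const int wid  = threadIdx.x >> 5;
    val = warpReduceSum(val);
    if (lane == 0) warp_sums[wid] = val;
    __syncthreads();  // needed: warp_sums must be visible before warp 0 reads it
    const int nwarps = (blockDim.x + 31) >> 5;
    val = (threadIdx.x < nwarps) ? warp_sums[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);
    return val;  // full sum valid in warp 0; broadcast via shared memory below
}

// One block per row; e.g. layernorm_rows<<<rows, 256>>>(in, out, n, 1e-6f);
__global__ void layernorm_rows(const float* __restrict__ in,
                               float* __restrict__ out, int n, float eps) {
    __shared__ float s_mean, s_inv_std;
    const float* row = in + (size_t)blockIdx.x * n;

    float sum = 0.0f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) sum += row[i];
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0) s_mean = sum / n;
    __syncthreads();  // needed: every thread reads s_mean next

    float var = 0.0f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        const float d = row[i] - s_mean;
        var += d * d;
    }
    var = blockReduceSum(var);
    if (threadIdx.x == 0) s_inv_std = rsqrtf(var / n + eps);
    __syncthreads();  // needed: every thread reads s_inv_std next

    for (int i = threadIdx.x; i < n; i += blockDim.x)
        out[(size_t)blockIdx.x * n + i] = (row[i] - s_mean) * s_inv_std;
}
```

Barriers anywhere else in this pattern (for example between two independent per-thread loops) are redundant, which is likely what those TODOs should resolve to.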
maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_kernels.cu (4 lines):
- line 767: // TODO we need to update the sparseMixerMask() function to mask all previous experts instead of just the most
- line 844: // TODO: fix DeviceRadixSort
- line 1269: // TODO Needs updated when gather/finalize fusion is integrated
- line 1295: // TODO Make quant 2 & 4 bigger for FP8 if we ever change to scaling per expert

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fused_moe_kernel.cuh (4 lines):
- line 137: // TODO: use cuda core gemm here
- line 149: // TODO: use cuda core gemm here
- line 155: // TODO: use cuda core gemm here
- line 160: // TODO: use cuda core gemm here

maga_transformer/cpp/devices/cuda_impl/CudaFfnLayer.cc (3 lines):
- line 195: // TODO: can use torch all gather unequal size to avoid copy
- line 206: // TODO: why this assertion?
- line 383: // TODO group_size

maga_transformer/openai/renderers/qwen_agent/llm/base.py (3 lines):
- line 114: raise NotImplementedError('Not implemented function_choice="none" yet.') # TODO:
- line 221: # TODO: Postprocessing may be incorrect if delta_stream=True.
- line 222: # TODO: Early break if truncated at stop words.

maga_transformer/models/minicpmv/resampler.py (3 lines):
- line 489: # TODO finish disentangling control flow so we don't do in-projections when statics are passed
- line 498: # TODO finish disentangling control flow so we don't do in-projections when statics are passed
- line 541: assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"

maga_transformer/cpp/kernels/no_aux_tc_kernels.cu (2 lines):
- line 534: // TODO: norm_node
- line 667: //@TODO: check if this default strategy is acceptable. Might need to leave it as nan array.

maga_transformer/cpp/kernels/sampling_penalty_kernels.cu (2 lines):
- line 25: // TODO Add half2 implementation
- line 126: // TODO: Add macro or device function to get MAX_T_VAL.

maga_transformer/cpp/api_server/HttpApiServer.cc (2 lines):
- line 26: // TODO: queueSize may interleave with controller :(
- line 414: // TODO:

maga_transformer/models/minicpmv/modeling_navit_siglip.py (2 lines):
- line 479: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 547: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.

maga_transformer/cpp/models/GptModel.cc (2 lines):
- line 289: // TODO: design better split strategy that consider the computational workload of each request
- line 830: // TODO: maybe move this layernorm to ffn layer

maga_transformer/cpp/rocm/rocmFmhaWrapper.cc (2 lines):
- line 70: scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
- line 103: mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); // TODO: we don't need x/y anymore
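The "Add half2 implementation" notes (sampling_penalty_kernels.cu above, gpt_kernels.cu below) refer to the usual fp16 vectorization: pack two halves into a half2 so each instruction processes two values. A minimal sketch, assuming a temperature-style scaling of fp16 logits; the kernel name and signature are hypothetical, only the half2 intrinsics are real CUDA API.

```cuda
// Hypothetical sketch of the half2 vectorization those TODOs ask for; the
// kernel name and the temperature use case are illustrative assumptions.
#include <cuda_fp16.h>

// Scales 2*n2 fp16 logits by inv_temperature, two values per instruction.
// The caller reinterprets an even-length half buffer as n2 half2 elements.
__global__ void apply_temperature_half2(half2* __restrict__ logits,
                                        int n2, float inv_temperature) {
    const half2 scale = __float2half2_rn(inv_temperature);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n2;
         i += gridDim.x * blockDim.x) {
        logits[i] = __hmul2(logits[i], scale);  // paired fp16 multiply
    }
}
```

An odd trailing element would need a scalar epilogue, which is most of the extra code such a change actually requires.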
maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/moe_fp8_kernels.cu (2 lines):
- line 210: // // TODO this is almost certainly faster as a linear scan
- line 302: // // TODO this is almost certainly faster as a linear scan

maga_transformer/cpp/devices/rocm_impl/ROCmFfnLayer.cc (2 lines):
- line 36: // TODO: cuda version also not init this
- line 42: // TODO group_size

maga_transformer/cpp/devices/cuda_impl/CudaOps.cc (2 lines):
- line 131: // TODO: change this to use efficient cuda kernel
- line 441: // TODO: see if we should add overflow protection for offset

maga_transformer/cpp/cutlass/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl (2 lines):
- line 74: // TODO Update once mixed input support is added
- line 161: // TODO Add mode for fused activation once CUTLASS adds support

maga_transformer/cpp/kernels/sampling_topk_kernels.cu (1 line):
- line 292: //@miji TODO: use block sum to make it faster

maga_transformer/cpp/devices/arm_impl/ArmDevice.cc (1 line):
- line 121: #define MAX_PRE_CALC_SEQ_LEN 1024 // TODO: get it from model config

maga_transformer/cpp/devices/arm_impl/ArmGemmKaiOp.cc (1 line):
- line 283: /* TODO

maga_transformer/cpp/trt_plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp (1 line):
- line 49: // TODO: add bf16 support

maga_transformer/openai/api_datatype.py (1 line):
- line 82: # TODO: maybe also implement Qwen Style function call.

maga_transformer/cpp/kernels/moe_topKSoftmax_kernels.cu (1 line):
- line 559: // TODO Could linear search be better for small # experts

maga_transformer/cpp/cutlass/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h (1 line):
- line 61: // TODO - Switch this to column major for weights since gemms should be more performant.

maga_transformer/cpp/devices/arm_impl/gemm_opt/ArmGemmKernel.h (1 line):
- line 219: p.do_act = 0; // TODO: change back to 1

maga_transformer/cpp/stream/GenerateStream.cc (1 line):
- line 158: // TODO: maybe need fix when context and reuse

maga_transformer/models/base_model.py (1 line):
- line 57: tokenizer: Any = None # TODO: remove this

maga_transformer/cpp/cuda/cublas/cublasMMWrapper.cc (1 line):
- line 285: // TODO: default cublas libs

maga_transformer/models/downstream_modules/classifier/classifier.py (1 line):
- line 54: #TODO test it

maga_transformer/cpp/rocm/quantizePreprocessors.cc (1 line):
- line 515: // TODO : feifei

maga_transformer/cpp/cutlass/cutlass_kernels/weightOnlyBatchedGemv/kernelDispatcher.h (1 line):
- line 27: // TODO:

maga_transformer/cpp/cutlass/cutlass_kernels/int8_gemm/int8_gemm_template.h (1 line):
- line 145: // TODO: handle that

maga_transformer/cpp/kernels/gpt_kernels.cu (1 line):
- line 435: // TODO Add half2 implementation

maga_transformer/cpp/disaggregate/rtpllm_master/tokenize/RemoteTokenizeModule.cpp (1 line):
- line 21: //TODO: request timeout

maga_transformer/cpp/devices/cuda_impl/CudaDeepEPFfnLayer.cc (1 line):
- line 24: // TODO: check if get right

maga_transformer/cpp/kernels/unfused_attention_kernels.cu (1 line):
- line 1220: // TODO: optimize this kernel?
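Several MoE entries (moe_kernels.inl line 71, moe_fp8_kernels.cu lines 210 and 302, moe_topKSoftmax_kernels.cu line 559) weigh a linear scan against a binary search for mapping a permuted row back to its expert. A hedged sketch of both options over a prefix-sum array follows; the array name and layout are assumptions, not the repository's actual data structure.

```cuda
// Hypothetical sketch of both strategies; first_row_of_expert is an assumed
// prefix-sum layout (entry e = first permuted row owned by expert e,
// entry 0 = 0), not the repository's actual data structure.
#include <cstdint>

// Largest e with first_row_of_expert[e] <= row, via binary search.
__device__ int findExpertBinary(const int64_t* first_row_of_expert,
                                int num_experts, int64_t row) {
    int lo = 0, hi = num_experts;  // invariant: first_row_of_expert[lo] <= row
    while (lo + 1 < hi) {
        const int mid = (lo + hi) / 2;
        if (first_row_of_expert[mid] <= row) lo = mid;
        else                                 hi = mid;
    }
    return lo;
}

// Same answer via a linear count; each iteration is a predicated add,
// so threads in a warp never diverge.
__device__ int findExpertLinear(const int64_t* first_row_of_expert,
                                int num_experts, int64_t row) {
    int expert = 0;
    for (int e = 1; e < num_experts; ++e)
        expert += (first_row_of_expert[e] <= row);
    return expert;
}
```

For a handful of experts the branch-free linear count typically wins, which is the intuition behind those TODOs; the binary search only pays off as the expert count grows.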
maga_transformer/cpp/cuda/cufmha/fmha_profiling_interface.h (1 line):
- line 153: // TODO: add print log

maga_transformer/cpp/kernels/activation_fp8_kernels.cu (1 line):
- line 266: // // TODO FP8

maga_transformer/cpp/devices/arm_impl/gemm_opt/ArmGemmThreadblock.cc (1 line):
- line 184: float* bias_ptr = p.bias_ptr + n; // TODO: handle float16_t bias

maga_transformer/cpp/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention.h (1 line):
- line 118: // The current timestep. TODO Check that do we only this param in cross attention?

maga_transformer/cpp/disaggregate/cache_store/MessagerClient.cpp (1 line):
- line 84: request_block_buffer, timeout_ms - 10, partition_count, partition_id); // TODO: 10 is message transfer time

maga_transformer/model_factory.py (1 line):
- line 95: #TODO: remove model_config, get all info from gpt_config

maga_transformer/cpp/rocm/rocmMoeWrapper.cc (1 line):
- line 60: // TODO: temporarily fixed scale type

maga_transformer/cpp/schedulers/FIFOScheduler.h (1 line):
- line 69: // TODO @wangyin support different beams run togather

maga_transformer/cpp/devices/Weights.h (1 line):
- line 142: // TODO: This Weights class might be refactor into a complete model description

maga_transformer/cpp/kernels/decoder_masked_multihead_attention_utils.h (1 line):
- line 3422: // FIXME:

maga_transformer/cpp/kernels/triton/aot_triton_kernel.bzl (1 line):
- line 213: # TODO: get value from --python_top=//:python310

maga_transformer/cpp/normal_engine/NormalGenerateStream.cc (1 line):
- line 162: //TODO: move it to better position

maga_transformer/cpp/cuda/reduce_kernel_utils.cuh (1 line):
- line 286: // TODO Add implementation here

maga_transformer/openai/renderers/qwen_agent/llm/qwen_dashscope.py (1 line):
- line 141: raise NotImplementedError('Not implemented function_choice="none" yet.') # TODO:

maga_transformer/cpp/api_server/TokenProcessor.h (1 line):
- line 37: // TODO: change to tokenizer wrapper

maga_transformer/cpp/devices/cuda_impl/CudaGemmOp.cc (1 line):
- line 164: // TODO: support it in ppu

maga_transformer/server/frontend_worker.py (1 line):
- line 97: #TODO temp fix sp with batch infer, will change request_id to str later

maga_transformer/cpp/kernels/unfused_attention_fp8_kernels.cu (1 line):
- line 1004: // TODO: bfloat162 computation ?

maga_transformer/cpp/devices/cuda_impl/CudaAttentionOp.cc (1 line):
- line 279: // TODO: refactor QBuffer to suppport view and return QBuffer

maga_transformer/cpp/disaggregate/rtpllm_master/cluster/PrefillLoadBalancer.cpp (1 line):
- line 111: //TODO: what happens if worker get error time? maybe use timedelta is better

maga_transformer/cpp/devices/cuda_impl/DeepEPDefs.h (1 line):
- line 192: // TODO: check which count will be ok

maga_transformer/cpp/trt_plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp (1 line):
- line 121: int const num_not_finished = num_rows; // TODO Take this as an input

maga_transformer/cpp/position_ids_generator/PositionIdsGenerator.h (1 line):
- line 23: // TODO: not same -> implement different interface here and BatchStreamProcessor

maga_transformer/cpp/api_server/InferenceService.cc (1 line):
- line 64: // TODO: adapter_name

maga_transformer/device/device_impl.py (1 line):
- line 316: # TODO: need add device infomation for selection

maga_transformer/cpp/devices/cuda_impl/CudaDeepEPLLFfnLayer.cc (1 line):
- line 62: // TODO: deep_ep_output might should be removed from output objects.
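The MessagerClient.cpp entry above flags a magic number: 10 ms is subtracted from the remaining timeout to budget for message transfer. A minimal sketch of the cleanup that TODO implies, naming the constant and clamping so the remaining budget never goes negative; the constant value, helper name, and clamping behavior are all assumptions, not the actual code.

```cpp
// Hypothetical sketch of the cleanup the TODO implies; the constant value,
// helper name, and clamping behavior are assumptions.
#include <algorithm>
#include <cstdint>

// The hard-coded "10" from the TODO, given a name and a single definition.
constexpr int64_t kMessageTransferMs = 10;

// Budget left for the remote side after reserving transfer overhead,
// clamped so a nearly expired deadline never becomes a negative timeout.
inline int64_t remainingTimeoutMs(int64_t timeout_ms) {
    return std::max<int64_t>(0, timeout_ms - kMessageTransferMs);
}
```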