src/ExecuteKernelU8S8.cc (7 lines):
- line 56: // TODO: Have default slower path
- line 205: // TODO: Have default slower path
- line 263: // TODO: Have default slower path
- line 325: // TODO: avx512 path
- line 337: // TODO: Have default slower path
- line 347: // TODO: avx512 path
- line 359: // TODO: Have default slower path

src/GroupwiseConv.cc (4 lines):
- line 165: // TODO: Have default slower path
- line 724: // TODO: Have default slower path
- line 780: // TODO: Remove this when threading is supported.
- line 1287: // TODO: Have default slower path

fbgemm_gpu/src/split_embeddings_cache_cuda.cu (4 lines):
- line 76: // // TODO: do we care about 64-bit indices? Currently we just ignore.
- line 471: // FIXME: __any_sync with mask isn't supported by HIP yet.
- line 1195: // FIXME: __any_sync with mask isn't supported by HIP yet.
- line 1884: // FIXME: __ballot_sync with mask isn't supported by HIP yet.

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py (3 lines):
- line 872: # TODO: add in-place FloatToFused8BitRowwiseQuantized conversion
- line 930: TODO: populate the supported list of optimizers
- line 2278: FIXME: make it in-place fill.

src/RowWiseSparseAdagradFused.cc (3 lines):
- line 357: // TODO: need to do a tree-reduction to fully take advantage of
- line 846: // TODO: JIT
- line 890: // TODO: JIT

fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py (3 lines):
- line 20: # TODO: add per-feature based converter option (based on embedding_specs during inference)
- line 21: # TODO: optimize embedding pruning and quantization latency.
- line 80: # FIXME: How to view the PyTorch Tensor as a different type (e.g., uint8)

fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh (2 lines):
- line 907: // TODO: pass in dimension info and calculate qparams for rowwise integer
- line 1054: // TODO: replace uses in backward kernels with warp find qparams

src/Utils.cc (2 lines):
- line 480: // TODO: more smart ways for thread partitions considering the
- line 483: // TODO: when G == nthreads + 1, we'll have a big load imbalance because

src/PackWeightMatrixForGConv.cc (2 lines):
- line 56: // TODO: change to avx512 when avx512 support is available
- line 210: // TODO: Wrap this as a inverseTransposeConvWeights()?

fbgemm_gpu/src/cumem_utils.cu (2 lines):
- line 283: // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
- line 355: // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.

fbgemm_gpu/src/quantize_ops.cu (2 lines):
- line 167: // TODO: lift range_list into shared memory. However, when nrows is large,
- line 538: // TODO: torch check input is 2D

cmake/modules/FindMKL.cmake (2 lines):
- line 76: # TODO: diagnostic if dir does not exist
- line 88: # TODO: diagnostic if dir does not exist

fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu (2 lines):
- line 175: // TODO: increase code sharing (templates for accumulator_ty, accumulation, outputs per thread, etc?)
- line 528: // FIXME: __any_sync with mask isn't supported by HIP yet.

src/FbgemmI64.cc (2 lines):
- line 62: // TODO: need to tune
- line 444: // TODO: handle transpose during packing

include/fbgemm/Fbgemm.h (2 lines):
- line 1164: * TODO: if Aq_zero_point == 0, allow passing nullptr.
- line 1277: * TODO: if Aq_zero_point == 0, allow passing nullptr.

fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp (2 lines):
- line 195: // TODO: to parallelize, we should easily identify segments belong to
- line 349: // TODO: respect output_dtype

bench/RowwiseAdagradFusedBenchmark.cc (2 lines):
- line 28: // TODO: Add more inputs
- line 60: // TODO: check appropriate vals for g,h,w

src/FbgemmI8Spmdm.cc (2 lines):
- line 51: // TODO: fallback when AVX2 is not available
- line 310: // TODO: if not hyper sparse, transpose a block of A matrix as in SpMDM.

fbgemm_gpu/src/cumem_utils_host.cpp (1 line):
- line 43: // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.

fbgemm_gpu/include/fbgemm_gpu/quantize_ops_utils.h (1 line):
- line 28: // TODO: add a flag later to control whether underflow

fbgemm_gpu/src/jagged_tensor_ops_cpu.cpp (1 line):
- line 515: // TODO: Add option to pass in total_L

src/PackAWithRowOffset.cc (1 line):
- line 224: // TODO: Have default slower path

src/FbgemmI8Depthwise2DAvx2-inl.h (1 line):
- line 112: // TODO: short-circuit when B_zero_point is 0 or A_zero_point is 0

fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h (1 line):
- line 14: !ten->is_cuda(); // TODO: Should be a better way to do this

fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp (1 line):
- line 325: // TODO: use accessor for the following 3 parameters

src/SparseAdagrad.cc (1 line):
- line 281: // TODO: need to do a tree-reduction to fully take advantage of unrolling

src/PackAMatrix.cc (1 line):
- line 111: // TODO: should print warning because this path is not optimized yet

fbgemm_gpu/src/jagged_tensor_ops.cu (1 line):
- line 882: // TODO: use shared memory

fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp (1 line):
- line 125: // TODO: fp16 and weighted

bench/EmbeddingSpMDM8BitBenchmark.cc (1 line):
- line 41: // TODO: Add more inputs

src/EmbeddingSpMDMNBit.cc (1 line):
- line 217: // TODO: Make this tunable

fbgemm_gpu/bench/merge_embeddings_benchmark.py (1 line):
- line 343: # TODO: the result here is wrong. Once MixedDim version for FusedNBit quantization is done, switch to that.

include/fbgemm/FbgemmEmbedding.h (1 line):
- line 175: // TODO: add compressed_data_size and check array bound

fbgemm_gpu/src/sparse_ops.cu (1 line):
- line 2011: /* TODO: Remove the condition protecting the slow path because even when the

fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp (1 line):
- line 18: // FIXME: Enable merge_pooled_embeddings for HIP.

fbgemm_gpu/src/permute_pooled_embedding_ops.cu (1 line):
- line 38: // passs after D22767058. TODO: optimize and make sure pooled_embs is

include/fbgemm/QuantUtils.h (1 line):
- line 34: // TODO: T26263653 fix signed-integer-overflow undefined behavior

bench/EmbeddingSpMDMBenchmark.cc (1 line):
- line 32: // TODO: Add more inputs

fbgemm_gpu/setup.py (1 line):
- line 22: # TODO: ideally the version.py should be generated when setup is run

src/EmbeddingSpMDM.cc (1 line):
- line 231: // TODO: Make this tunable

bench/EmbeddingSpMDMNBitBenchmark.cc (1 line):
- line 43: // TODO: Add more inputs

bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc (1 line):
- line 43: // TODO: Add more inputs

bench/RowwiseAdagradBenchmark.cc (1 line):
- line 56: // TODO: check appropriate vals for g,h,w

src/PackAWithIm2Col.cc (1 line):
- line 738: // TODO: Have default slower path

bench/BenchUtils.h (1 line):
- line 148: // TODO: measure load imbalance

fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu (1 line):
- line 19: // TODO: optimization to use multiple warps per row.

src/FbgemmSparseDenseVectorInt8Avx512.cc (1 line):
- line 177: // TODO: unroll this loop?

bench/SparseAdagradBenchmark.cc (1 line):
- line 57: // TODO: check appropriate vals for g,h,w
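
The most frequent TODO in the list above is the missing scalar fallback ("Have default slower path") at the ISA-dispatch points in src/ExecuteKernelU8S8.cc, src/GroupwiseConv.cc, src/PackAWithRowOffset.cc, and src/PackAWithIm2Col.cc. Below is a minimal sketch of what such a fallback could look like; it only assumes the ISA-detection helpers fbgemm::fbgemmHasAvx512Support() and fbgemm::fbgemmHasAvx2Support() from fbgemm/Utils.h, while executeKernel and executeReferenceKernel are hypothetical placeholders, not existing FBGEMM functions.

    #include "fbgemm/Utils.h" // fbgemmHasAvx512Support(), fbgemmHasAvx2Support()

    // Hypothetical slow-but-portable scalar path; not part of FBGEMM today.
    void executeReferenceKernel(/* packed A, packed B, C, accumulation params */) {
      // plain C++ loops over the packed blocks would go here
    }

    void executeKernel(/* packed A, packed B, C, accumulation params */) {
      if (fbgemm::fbgemmHasAvx512Support()) {
        // dispatch to the AVX-512 micro-kernel (existing behavior)
      } else if (fbgemm::fbgemmHasAvx2Support()) {
        // dispatch to the AVX2 micro-kernel (existing behavior)
      } else {
        // The sites tagged "TODO: Have default slower path" currently assert
        // here; the TODO is to fall back to a scalar reference kernel instead.
        executeReferenceKernel();
      }
    }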
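The HIP FIXMEs in fbgemm_gpu/src/split_embeddings_cache_cuda.cu and fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu all note that the masked warp-vote intrinsics (__any_sync, __ballot_sync) are unavailable under HIP. A common workaround, sketched below under the assumption that every lane named in the mask is active, is a thin wrapper that falls back to the legacy mask-less vote on ROCm. warp_any and kFullMask are illustrative names, not FBGEMM APIs, and the ROCm guard macros may differ by toolchain version.

    #if defined(USE_ROCM) || defined(__HIP_PLATFORM_HCC__)
    #include <hip/hip_runtime.h>
    // HIP: only the legacy, mask-less vote intrinsics exist; they implicitly
    // operate on the whole wavefront, so the mask argument is dropped.
    __device__ __forceinline__ int warp_any(unsigned /*mask*/, int predicate) {
      return __any(predicate);
    }
    #else
    #include <cuda_runtime.h>
    __device__ __forceinline__ int warp_any(unsigned mask, int predicate) {
      return __any_sync(mask, predicate);
    }
    #endif

    constexpr unsigned kFullMask = 0xffffffffu; // all 32 lanes of a CUDA warp

    // Example use: record whether any lane in the warp saw a negative slot.
    __global__ void any_negative_slot(const int* slots, int n, int* found) {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
      const int pred = idx < n && slots[idx] < 0;
      if (warp_any(kFullMask, pred) && threadIdx.x % warpSize == 0) {
        atomicExch(found, 1);
      }
    }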
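Similarly, the cumem_utils FIXMEs ("some advanced cudaMemAdvise flags are not supported by HIP") concern hints on unified/managed memory. For reference, a host-side sketch of the CUDA advice flags in question is below; the report does not show which flags FBGEMM actually passes, so the specific choices here are illustrative only.

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      const size_t bytes = 1 << 20;
      float* buf = nullptr;
      int device = 0;
      cudaGetDevice(&device);

      // Managed (unified) allocation reachable from both host and device.
      cudaMallocManaged(&buf, bytes);

      // Advice flags like these are the "advanced cudaMemAdvise flags" the
      // FIXMEs refer to; HIP does not support all of them.
      cudaMemAdvise(buf, bytes, cudaMemAdviseSetReadMostly, device);
      cudaMemAdvise(buf, bytes, cudaMemAdviseSetAccessedBy, device);
      cudaMemAdvise(buf, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

      cudaFree(buf);
      std::printf("done\n");
      return 0;
    }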