candle-core/src/cpu_backend/mod.rs (12 lines):
- line 17: // TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
- line 388: // TODO: Specialized implementation for the case 2*sz?
- line 418: // TODO: Specialized implementation for the case 2*h, 2*w?
- line 811: // TODO: Avoid making this copy if `inp` already has the appropriate layout.
- line 887: // TODO: provide specialized kernels for the common use cases.
- line 950: // TODO: provide specialized kernels for the common use cases.
- line 1039: // TODO: Avoid making this copy if `inp` already has the appropriate layout.
- line 1104: // TODO: Avoid making this copy if `inp` already has the appropriate layout.
- line 1196: // TODO: Avoid making this copy if `inp` already has the appropriate layout.
- line 1715: // TODO: find a way around the quadratic number of cases below.
- line 2008: // TODO: Have some generic map for functions that apply on num_traits::Float elements.
- line 2033: // TODO: Have some generic map for functions that apply on num_traits::Float elements.

candle-onnx/src/eval.rs (7 lines):
- line 335: // TODO: Validate node.input for each operator.
- line 399: // TODO: Check that there is at most a single -1 or 0, handle other neg values.
- line 674: // TODO: Provide an op to handle the ONNX generalized gather op ideally in a
- line 1024: // TODO: support sparse_value etc.
- line 1298: // TODO: Handle empty set
- line 1338: // TODO: This version is only compatible with ReduceMean V13 and below.
- line 1407: // TODO: Handle empty set

candle-kernels/src/reduce.cu (6 lines):
- line 8: // TODO: Maybe add some fast_sum_f16_f32 variant that not only accumulate in f32
- line 34: // TODO: Fast version for the contiguous case.
- line 312: // TODO: Fast version for the contiguous case.
- line 350: // TODO: Fast version for the contiguous case.
- line 392: // TODO: Fast version for the contiguous case.
- line 440: // TODO: Fast version for the contiguous case.

candle-transformers/src/models/t5.rs (6 lines):
- line 490: // TODO: Use flash_attn.
- line 562: // TODO: position_bias_masked?
- line 705: // TODO: Cache masks
- line 720: // TODO: clamp for f16?
- line 723: // TODO: clamp for f16?
- line 726: // TODO: clamp for f16?

candle-transformers/src/models/quantized_t5.rs (6 lines):
- line 361: // TODO: Use flash_attn.
- line 432: // TODO: position_bias_masked?
- line 575: // TODO: Cache masks
- line 590: // TODO: clamp for f16?
- line 593: // TODO: clamp for f16?
- line 596: // TODO: clamp for f16?
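
The `clamp for f16?` entries in t5.rs and quantized_t5.rs above likely refer to the usual fp16 overflow guard: keeping hidden states inside the finite f16 range between blocks so later matmuls do not blow up to inf. Below is a minimal sketch of what such a guard could look like with candle's public `Tensor::clamp`; the helper name and the headroom factor are illustrative, not part of the candle codebase.

```rust
use candle_core::{DType, Result, Tensor};

// Illustrative only: clamp hidden states to the finite f16 range so that
// subsequent matmuls do not overflow to +/-inf when running in half precision.
fn clamp_for_f16(xs: &Tensor) -> Result<Tensor> {
    if xs.dtype() != DType::F16 {
        return Ok(xs.clone());
    }
    // 65504 is the largest finite f16 value; back off a little for headroom.
    let max = 65504.0f32 * 0.999;
    xs.clamp(-max, max)
}
```
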
candle-core/src/quantized/k_quants.rs (4 lines):
- line 1855: // TODO: Do not make this copy if the DotType is f32.
- line 1856: // TODO: Pre-allocate this.
- line 1948: // TODO: vectorize
- line 1959: // TODO: vectorize

candle-pyo3/src/lib.rs (4 lines):
- line 194: // TODO: Something similar to this should probably be a part of candle core.
- line 234: // TODO: Handle arbitrary input dtype and shape.
- line 311: "TODO: conversion to PyObject is not handled for rank {n}"
- line 316: // TODO: Handle arbitrary shapes.

candle-core/src/tensor.rs (4 lines):
- line 551: // TODO: Also make an inplace version or a pre-allocated? This could be tricky
- line 1378: // TODO: Avoid concretising the broadcasted matrixes via contiguous.
- line 1556: // TODO: Maybe we want to add a more efficient implementation at some point.
- line 2210: // TODO: Avoid passing through the cpu storage here, especially if the gpu ids

candle-transformers/src/models/segment_anything/sam.rs (3 lines):
- line 308: // TODO:
- line 318: // TODO: Return to the original image frame.
- line 343: // TODO: remove duplicates

candle-core/src/safetensors.rs (3 lines):
- line 59: // TODO: Avoid the unwrap here.
- line 80: // TODO: Avoid the unwrap here.
- line 232: // TODO: This makes an unnecessary copy when the tensor is on the cpu.

candle-transformers/src/models/bert.rs (3 lines):
- line 193: // TODO: Proper absolute positions?
- line 419: // TODO: Support cross-attention?
- line 421: // TODO: Support something similar to `apply_chunking_to_forward`?

candle-flash-attn/kernels/flash_fwd_kernel.h (3 lines):
- line 341: // TODO: when we have key_padding_mask we'll need to Check_inf
- line 1014: // TODO: allocate enough smem for sOaccum
- line 1284: // TODO: Should check if this is using vectorized store, but it seems pretty fast

candle-kernels/src/conv.cu (3 lines):
- line 416: // TODO: Improve this.
- line 468: // TODO: Improve this.
- line 523: // TODO: Improve this.

candle-transformers/src/models/metavoice.rs (2 lines):
- line 476: // TODO: causal mask
- line 656: // TODO: speaker embs.

candle-pyo3/py_src/candle/models/bert.py (2 lines):
- line 125: # TODO: Support cross-attention?
- line 127: # TODO: Support something similar to `apply_chunking_to_forward`?

candle-nn/src/ops.rs (2 lines):
- line 63: // FIXME: using `candle::map_dtype` causes compilation errors.
- line 243: // TODO: Should we have a specialized op for this?

candle-core/src/cpu_backend/utils.rs (2 lines):
- line 122: // TODO: Maybe we want to avoid going through the layout twice.
- line 151: // TODO: Maybe we want to avoid going through the layout twice.

candle-core/src/cuda_backend/mod.rs (2 lines):
- line 315: // TODO: Maybe use grid_y if the output is too large?
- line 316: // TODO: Specialized implementation when reducing on no or all dimensions or when

candle-flash-attn/kernels/kernel_traits.h (2 lines):
- line 231: // TODO: generalize to other values of kBlockN
- line 232: // TODO: what should be the Swizzle here? 3 is faster than 1, and 1 is faster than 2

candle-transformers/src/models/clip/text_model.rs (2 lines):
- line 61: // TODO rewrite to be more similar to https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L142
- line 301: // TODO: rewrite to newer version

candle-flash-attn/src/lib.rs (2 lines):
- line 83: // TODO: Handle head sizes that are not a multiple of 8 via some padding.
- line 535: // TODO: Handle head sizes that are not a multiple of 8 via some padding.

candle-core/src/device.rs (2 lines):
- line 336: // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
- line 374: // TODO: Remove the special case if we start supporting generating f16/bf16 directly.

candle-core/src/custom_op.rs (2 lines):
- line 433: // TODO: support more dtypes.
- line 469: // TODO: support more dtypes.

candle-core/src/cuda_backend/device.rs (2 lines):
- line 340: // TODO: Add support for F16 and BF16 though this is likely to require some upstream
- line 374: // TODO: Add support for F16 and BF16 though this is likely to require some upstream

candle-transformers/src/models/distilbert.rs (2 lines):
- line 261: // TODO: Support cross-attention?
- line 263: // TODO: Support something similar to `apply_chunking_to_forward`?
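
The two candle-flash-attn/src/lib.rs entries above (lines 83 and 535) note that head sizes that are not a multiple of 8 are not handled yet. One way a caller can work around this today is to zero-pad the head dimension before the kernel call and slice the output back afterwards. The sketch below is a generic illustration of that idea; the wrapper name and the `attn` closure are placeholders, not candle-flash-attn API.

```rust
use candle_core::{Result, Tensor, D};

// Hedged sketch: pad the head dimension of q/k/v up to a multiple of 8, run an
// attention function that requires the alignment, then slice the output back.
fn attn_with_padded_head_dim<F>(q: &Tensor, k: &Tensor, v: &Tensor, attn: F) -> Result<Tensor>
where
    F: Fn(&Tensor, &Tensor, &Tensor) -> Result<Tensor>,
{
    let head_dim = q.dim(D::Minus1)?;
    let pad = (8 - head_dim % 8) % 8;
    if pad == 0 {
        return attn(q, k, v);
    }
    // Zero columns do not change the q.k^T dot products, and the padded value
    // columns only produce extra output columns that are dropped below.
    let q = q.pad_with_zeros(D::Minus1, 0, pad)?;
    let k = k.pad_with_zeros(D::Minus1, 0, pad)?;
    let v = v.pad_with_zeros(D::Minus1, 0, pad)?;
    attn(&q, &k, &v)?.narrow(D::Minus1, 0, head_dim)
}
```
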
candle-transformers/src/models/bigcode.rs (2 lines):
- line 225: // TODO: we could trim the tensors to MAX_SEQ_LEN so that this would work for
- line 266: // TODO: Add cross-attention?

candle-core/src/quantized/neon.rs (2 lines):
- line 17: // TODO: dotprod
- line 563: // TODO: dotprod

candle-transformers/src/models/mimi/conv.rs (2 lines):
- line 560: // TODO: We should ensure for the seed to be constant when running these tests.
- line 610: // TODO: We should ensure for the seed to be constant when running these tests.

candle-core/src/convert.rs (1 line):
- line 118: // TODO: Avoid using a buffer when data is already on the CPU.

candle-pyo3/stub.py (1 line):
- line 159: # TODO it would be interesting to add the setter maybe ?

candle-transformers/src/models/debertav2.rs (1 line):
- line 877: // TODO: In order to fully test ConvLayer a model needs to be found has a configuration where `conv_kernel_size` exists and is > 0

candle-metal-kernels/src/conv.metal (1 line):
- line 183: // TODO: Improve this.

candle-transformers/src/models/wuerstchen/paella_vq.rs (1 line):
- line 192: // TODO: quantizer if we want to support `force_not_quantize=False`.

candle-core/src/streaming.rs (1 line):
- line 132: // TODO: Should we also have a flush method?

candle-transformers/src/models/llava/mod.rs (1 line):
- line 244: //TODO: process of multiple images/ new line

candle-transformers/src/models/gemma2.rs (1 line):
- line 32: // TODO: Handle the sliding window in the attention mask.

candle-transformers/src/models/mimi/quantization.rs (1 line):
- line 115: // TODO: avoid repeating this.

candle-transformers/src/models/falcon.rs (1 line):
- line 281: // TODO: we could trim the tensors to MAX_SEQ_LEN so that this would work for

candle-transformers/src/models/mimi/transformer.rs (1 line):
- line 363: // TODO: Maybe use bias_ff here?

candle-transformers/src/models/encodec.rs (1 line):
- line 538: // TODO: Apply dilations!

candle-core/src/pickle.rs (1 line):
- line 367: // TODO: have a separate ordered dict and a separate default dict.

candle-metal-kernels/src/lib.rs (1 line):
- line 1646: /// - final type != bf16 (TODO maybe just template this kernel too?)

candle-pyo3/py_src/candle/nn/module.py (1 line):
- line 257: # TODO: Remove `args` and the parsing logic when BC allows.

candle-core/src/backend.rs (1 line):
- line 135: // TODO: Make the usize generic and part of a generic DeviceLocation.

candle-book/src/lib.rs (1 line):
- line 100: // TODO: Implement from_buffer_iterator so we can skip the extra CPU alloc.

candle-wasm-examples/whisper/src/worker.rs (1 line):
- line 213: // TODO: Besides suppress tokens, we should apply the heuristics from

candle-core/src/npy.rs (1 line):
- line 199: // TODO: Add the possibility to read directly to a device?

candle-pyo3/py_src/candle/models/llama.py (1 line):
- line 99: # TODO: maybe repeat k/v here if we start supporting MQA.

candle-transformers/src/models/stella_en_v5.rs (1 line):
- line 201: // TODO: re-visit this

candle-examples/src/bs1770.rs (1 line):
- line 389: // TODO: Should this branch be marked cold?

candle-nn/src/conv.rs (1 line):
- line 244: // TODO: support groups.
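
The llama.py entry above (`maybe repeat k/v here if we start supporting MQA`) points at the standard multi-/grouped-query attention trick of repeating each key/value head so the tensors line up with the query heads. A hedged Rust sketch of that idea against candle-core, assuming the usual `(batch, n_kv_heads, seq_len, head_dim)` layout; the helper name is illustrative.

```rust
use candle_core::{Result, Tensor};

// Illustrative sketch: repeat each kv head n_rep times along the head axis so
// a model with fewer kv heads than query heads can reuse a plain attention op.
fn repeat_kv(xs: Tensor, n_rep: usize) -> Result<Tensor> {
    if n_rep == 1 {
        return Ok(xs);
    }
    let (b, n_kv_heads, seq_len, head_dim) = xs.dims4()?;
    // Concatenating copies along the sequence axis keeps the data contiguous;
    // the reshape then folds the copies into the head axis, so each kv head
    // ends up duplicated n_rep consecutive times.
    Tensor::cat(&vec![&xs; n_rep], 2)?.reshape((b, n_kv_heads * n_rep, seq_len, head_dim))
}
```
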
candle-transformers/src/models/stable_diffusion/attention.rs (1 line):
- line 554: // TODO: revert the call to force_contiguous once the three matmul kernels have been

candle-kernels/src/compatibility.cuh (1 line):
- line 7: // FIXME: the minimum compute capabilities are just guesses since the table is not specific enough

candle-kernels/src/indexing.cu (1 line):
- line 2: // TODO: proper error reporting when ids are larger than v_size.

candle-pyo3/py_src/candle/nn/linear.py (1 line):
- line 82: # TODO: Do actual initialization here: e.g. kaiming_uniform or xavier_uniform

candle-kernels/src/cuda_utils.cuh (1 line):
- line 5: // TODO: This is often used to check that the data is contiguous so that

candle-transformers/src/models/llama2_c_weights.rs (1 line):
- line 113: // TODO: As of 2023-08-04, gemm is slower than expected when multiplying a matrix of

candle-core/src/quantized/ggml_file.rs (1 line):
- line 215: // TODO: Mmap version to avoid copying the data around?
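
The ggml_file.rs entry above asks whether an mmap-based loader could avoid copying weight data. candle already uses `memmap2` for memory-mapped safetensors loading, so the basic building block could look like the sketch below; the helper name is illustrative, and whether a borrowed mapping fits the ggml loader's ownership model is exactly what the TODO is asking.

```rust
use std::fs::File;
use std::path::Path;

use memmap2::Mmap;

// Illustrative only: memory-map a weight file instead of reading it into a
// freshly allocated Vec<u8>.
fn mmap_weights(path: &Path) -> std::io::Result<Mmap> {
    let file = File::open(path)?;
    // Safety: the mapping is only valid while the underlying file is not
    // truncated or modified by another process.
    unsafe { Mmap::map(&file) }
}
```

The returned `Mmap` derefs to `&[u8]`, so a truly zero-copy path would also need the quantized tensor types to borrow rather than own their data, which is the non-trivial part.
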