faiss/gpu/impl/PQCodeLoad.cuh (12 lines):
	- line 86: // FIXME: this is a non-coalesced, unaligned, non-vectorized load
	- line 92: // FIXME: this is also slow, since we have to recover the
	- line 129: // FIXME: this is a non-coalesced, unaligned, non-vectorized load
	- line 157: // FIXME: this is a non-coalesced, unaligned, non-vectorized load
	- line 174: // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
	- line 195: // FIXME: this is a non-coalesced, unaligned, non-vectorized load
	- line 214: // FIXME: this is a non-coalesced load
	- line 232: // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
	- line 259: // FIXME: this is a non-coalesced load
	- line 283: // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
	- line 316: // FIXME: this is a non-coalesced load
	- line 346: // FIXME: this is a non-coalesced load


faiss/gpu/utils/MergeNetworkBlock.cuh (4 lines):
	- line 66: // FIXME: is this a CUDA 9 compiler bug?
	- line 97: // FIXME: is this a CUDA 9 compiler bug?
	- line 153: // FIXME: is this a CUDA 9 compiler bug?
	- line 189: // FIXME: is this a CUDA 9 compiler bug?


faiss/gpu/impl/PQCodeDistances-inl.cuh (4 lines):
	- line 103: // FIXME: investigate loading separately, so we don't need this
	- line 137: // FIXME: try always making this centroid id 0 so we can
	- line 621: // FIXME: tune
	- line 631: // FIXME: probably impractical for large # of dims?


faiss/gpu/GpuIndexIVF.cu (4 lines):
	- line 56: // FIXME: inherit our same device
	- line 104: // FIXME: inherit our same device
	- line 108: // FIXME: 2 different float16 options?
	- line 111: // FIXME: 2 different float16 options?


faiss/gpu/utils/MergeNetworkWarp.cuh (4 lines):
	- line 272: // FIXME: compiler doesn't like this expression? compiler bug?
	- line 302: // FIXME: compiler doesn't like this expression? compiler bug?
	- line 359: // FIXME: compiler doesn't like this expression? compiler bug?
	- line 389: // FIXME: compiler doesn't like this expression? compiler bug?


faiss/gpu/impl/BroadcastSum.cu (3 lines):
	- line 32: // FIXME: if we have exact multiples, don't need this
	- line 141: // FIXME: if we have exact multiples, don't need this
	- line 216: // FIXME: speed up


faiss/gpu/impl/IVFAppend.cu (3 lines):
	- line 98: // FIXME: there could be overflow here, but where should we check this?
	- line 288: // FIXME: stride with threads instead of single thread
	- line 366: // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?


faiss/gpu/impl/PQScanMultiPassPrecomputed.cu (2 lines):
	- line 54: // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
	- line 609: // FIXME: we should adjust queryTileSize to deal with this, since


faiss/python/__init__.py (2 lines):
	- line 1134: # TODO: once deprecated classes are removed, remove the dict and just use .lower() below
	- line 1182: # TODO check class name


faiss/gpu/StandardGpuResources.cpp (2 lines):
	- line 437: // FIXME: as of CUDA 11, a memory allocation error appears to be
	- line 458: // FIXME: as of CUDA 11, a memory allocation error appears to be


faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh (2 lines):
	- line 48: // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
	- line 592: // FIXME: we should adjust queryTileSize to deal with this, since


faiss/gpu/impl/IVFPQ.cu (2 lines):
	- line 322: // FIXME: why are we doing this?
	- line 575: // FIXME: we might ultimately be calling this function with inputs


faiss/utils/partitioning.cpp (2 lines):
	- line 183: // FIXME avoid a second pass over the array to sample the threshold
	- line 821: /// FIXME when MSB of uint16 is set


faiss/IndexIVFPQ.cpp (2 lines):
	- line 199: // TODO: parallelize?
	- line 318: // TODO: parallelize?


faiss/gpu/impl/IVFFlatScan.cu (2 lines):
	- line 69: // FIXME: why does getLaneId() not work when we write out below!?!?!
	- line 401: // FIXME: we should adjust queryTileSize to deal with this, since


faiss/utils/simdlib_neon.h (2 lines):
	- line 10: // TODO: Support big endian (currently supporting only little endian)
	- line 490: // TODO find a better name


faiss/gpu/GpuIndexIVFPQ.cu (2 lines):
	- line 245: // FIXME jhj convert to _n version
	- line 317: // FIXME: GPUize more of this


faiss/IndexIVFPQFastScan.cpp (2 lines):
	- line 260: // TODO should not need stable
	- line 265: // TODO parallelize


faiss/gpu/utils/HostTensor-inl.cuh (2 lines):
	- line 215: // FIXME: type-specific abs()
	- line 220: // FIXME: type-specific abs


faiss/gpu/GpuIndexFlat.cu (2 lines):
	- line 254: // FIXME jhj: kernel for copy
	- line 272: // FIXME jhj: kernel for copy


faiss/utils/simdlib.h (1 line):
	- line 30: // FIXME: make a SSE version


faiss/gpu/utils/StackDeviceMemory.cpp (1 line):
	- line 70: // FIXME: make sure there are no outstanding memory allocations?


faiss/invlists/OnDiskInvertedLists.cpp (1 line):
	- line 561: // TODO shrink global storage if needed


faiss/IndexIVF.cpp (1 line):
	- line 1158: : dis > radius; // TODO templatize to remove this test


faiss/IndexShards.cpp (1 line):
	- line 150: // FIXME: assumes that nothing is currently running on the sub-indexes, which is


faiss/IndexIVFAdditiveQuantizer.cpp (1 line):
	- line 202: // TODO find a way to provide the nprobes together to do a matmul


contrib/torch_utils.py (1 line):
	- line 398: # FIXME: no rev_swig_ptr equivalent for torch.Tensor, just convert


faiss/gpu/GpuIndexIVFFlat.cu (1 line):
	- line 158: // FIXME: GPUize more of this


contrib/exhaustive_search.py (1 line):
	- line 217: # TODO: all result manipulations are in python, should move to C++ if perf


faiss/utils/hamming-inl.h (1 line):
	- line 59: nbit -= 8; // TODO remove nbit


faiss/utils/simdlib_avx2.h (1 line):
	- line 212: // TODO find a better name


faiss/gpu/utils/MatrixMult-inl.cuh (1 line):
	- line 51: // FIXME: some weird CUDA 11 bug? where cublasSgemmEx on


faiss/python/python_callbacks.cpp (1 line):
	- line 52: // TODO check nb of bytes written


faiss/gpu/utils/DeviceUtils.cu (1 line):
	- line 125: // FIXME: what to use for managed memory?


faiss/gpu/GpuDistance.cu (1 line):
	- line 139: // FIXME: convert to int32_t everywhere?


faiss/gpu/utils/Limits.cuh (1 line):
	- line 21: // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity


faiss/gpu/GpuIndexIVFScalarQuantizer.cu (1 line):
	- line 204: // FIXME: GPUize more of this


faiss/impl/ScalarQuantizer.cpp (1 line):
	- line 595: // TODO just do a quickselect


faiss/impl/index_write.cpp (1 line):
	- line 71: * TODO: in this file, the read functions that encounter errors may


faiss/gpu/utils/Tensor-inl.cuh (1 line):
	- line 552: // FIXME: maybe also consider offset in bytes? multiply by sizeof(T)?


faiss/utils/simdlib_emulated.h (1 line):
	- line 242: // TODO find a better name


faiss/gpu/impl/Distance.cu (1 line):
	- line 107: // FIXME: optimize with a dedicated kernel


faiss/gpu/impl/IVFFlat.cu (1 line):
	- line 256: // FIXME: we might ultimately be calling this function with inputs


faiss/gpu/impl/IVFInterleaved.cuh (1 line):
	- line 68: // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?


faiss/IndexReplicas.cpp (1 line):
	- line 158: // FIXME: assumes that nothing is currently running on the sub-indexes, which is


faiss/gpu/impl/IVFBase.cu (1 line):
	- line 455: // FIXME: really this can be into pinned memory and a true async


faiss/gpu/utils/ThrustAllocator.cuh (1 line):
	- line 49: // FIXME: we cannot use temporary memory for new requests because


faiss/gpu/impl/Distance.cuh (1 line):
	- line 21: /// FIXME: the output distances must fit in GPU memory


faiss/IVFlib.cpp (1 line):
	- line 53: // TODO: check as thoroughly for other index types


faiss/impl/simd_result_handlers.h (1 line):
	- line 130: if (with_id_map) { // FIXME test on q_map instead


faiss/gpu/impl/L2Select.cu (1 line):
	- line 45: // FIXME: if we have exact multiples, don't need this


faiss/gpu/GpuIndex.cu (1 line):
	- line 37: // FIXME: parameterize based on algorithm need


faiss/impl/AdditiveQuantizer.cpp (1 line):
	- line 261: // TODO: make tree of partial sums