path # lines of code bench/EmbeddingQuantizeBenchmark.cc 89 bench/TransposeBenchmark.cc 49 bench/GEMMsTunableBenchmark.cc 286 bench/Depthwise3DBenchmark.cc 188 bench/BenchUtils.cc 182 bench/RequantizeBenchmark.cc 123 bench/BenchUtils.h 415 bench/GEMMsBenchmark.cc 260 bench/PackedRequantizeAcc16Benchmark.cc 359 bench/SparseDenseMMFP32Benchmark.cc 93 bench/SparseDenseMMInt8Benchmark.cc 112 bench/I8SpmdmBenchmark.cc 172 bench/PackedRequantizeAcc32Benchmark.cc 262 bench/DepthwiseBenchmark.cc 305 bench/EmbeddingIndexRemappingBenchmark.cc 134 bench/AlignedVec.h 86 bench/FP16Benchmark.cc 40 bench/I64Benchmark.cc 115 bench/EmbeddingSpMDMNBitBenchmark.cc 278 bench/ConvertBenchmark.cc 51 bench/RowOffsetBenchmark.cc 37 bench/ConvUnifiedBenchmark.cc 449 bench/GroupwiseConvRequantizeBenchmark.cc 413 bench/Im2ColFusedRequantizeBenchmark.cc 296 bench/RowwiseAdagradFusedBenchmark.cc 162 bench/RowwiseAdagradBenchmark.cc 189 bench/EmbeddingSpMDMBenchmark.cc 293 bench/PackedFloatInOutBenchmark.cc 242 bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc 309 bench/EmbeddingSpMDM8BitBenchmark.cc 282 bench/SparseAdagradBenchmark.cc 190 fbgemm_gpu/bench/split_embeddings_cache_benchmark.py 448 fbgemm_gpu/bench/jagged_tensor_benchmark.py 58 fbgemm_gpu/bench/merge_embeddings_benchmark.py 440 fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py 137 fbgemm_gpu/bench/bench_utils.py 34 fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py 1916 fbgemm_gpu/bench/scripts/batch_benchmark_run.py 73 fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu 287 fbgemm_gpu/bench/sparse_ops_benchmark.py 52 fbgemm_gpu/bench/quantize_ops_benchmark.py 186 fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py 242 fbgemm_gpu/fbgemm_gpu/split_embedding_configs.py 88 fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py 51 fbgemm_gpu/fbgemm_gpu/enums.py 10 fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py 145 fbgemm_gpu/fbgemm_gpu/__init__.py 4 fbgemm_gpu/fbgemm_gpu/uvm.py 19 fbgemm_gpu/fbgemm_gpu/batched_unary_embeddings_ops.py 61 fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py 1963 fbgemm_gpu/version.py 1 fbgemm_gpu/include/fbgemm_gpu/input_combine.h 16 fbgemm_gpu/include/fbgemm_gpu/cub_namespace_postfix.cuh 14 fbgemm_gpu/include/fbgemm_gpu/layout_transform_ops.cuh 105 fbgemm_gpu/include/fbgemm_gpu/cpu_utils.h 42 fbgemm_gpu/include/fbgemm_gpu/embedding_common.h 54 fbgemm_gpu/include/fbgemm_gpu/quantize_ops_utils.h 55 fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh 139 fbgemm_gpu/include/fbgemm_gpu/cub_namespace_prefix.cuh 12 fbgemm_gpu/include/fbgemm_gpu/quantize_ops_gpu.h 14 fbgemm_gpu/include/fbgemm_gpu/embedding_backward_template_helpers.cuh 98 fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh 35 fbgemm_gpu/include/fbgemm_gpu/quantize_ops.cuh 27 fbgemm_gpu/include/fbgemm_gpu/cuda_utils.cuh 14 fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh 1948 fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embs_function.h 1 fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h 16 fbgemm_gpu/include/fbgemm_gpu/enum_utils.h 57 fbgemm_gpu/include/fbgemm_gpu/merge_pooled_embeddings.h 7 fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h 317 fbgemm_gpu/include/fbgemm_gpu/bench_utils.cuh 65 fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h 214 fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h 141 fbgemm_gpu/pyproject.toml 18 fbgemm_gpu/codegen/embedding_bounds_check.cu 138 fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp 200 fbgemm_gpu/codegen/embedding_backward_dense_host.cpp 363 fbgemm_gpu/codegen/embedding_backward_split_template.cu 1085 fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp 526 fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp 191 fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp 467 fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp 380 fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu 274 fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp 98 fbgemm_gpu/codegen/embedding_bounds_check_host.cpp 16 fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu 917 fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp 359 fbgemm_gpu/codegen/embedding_forward_split_template.cu 450 fbgemm_gpu/codegen/embedding_forward_split_cpu.h 58 fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp 164 fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp 180 fbgemm_gpu/codegen/embedding_backward_code_generator.py 531 fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh 29 fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp 340 fbgemm_gpu/codegen/lookup_args.py 39 fbgemm_gpu/setup.py 144 fbgemm_gpu/src/histogram_binning_calibration_ops.cu 388 fbgemm_gpu/src/split_table_batched_embeddings.cpp 101 fbgemm_gpu/src/sparse_ops_gpu.cpp 157 fbgemm_gpu/src/sparse_ops.cu 1953 fbgemm_gpu/src/input_combine_cpu.cpp 278 fbgemm_gpu/src/sparse_ops_cpu.cpp 1683 fbgemm_gpu/src/layout_transform_ops_cpu.cpp 63 fbgemm_gpu/src/quantize_ops.cu 666 fbgemm_gpu/src/cumem_utils.cu 305 fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp 341 fbgemm_gpu/src/cumem_utils_host.cpp 34 fbgemm_gpu/src/merge_pooled_embeddings_cpu.cpp 33 fbgemm_gpu/src/jagged_tensor_ops_cpu.cpp 967 fbgemm_gpu/src/layout_transform_ops_gpu.cpp 16 fbgemm_gpu/src/jagged_tensor_ops.cu 1150 fbgemm_gpu/src/split_embeddings_cache_cuda.cu 1803 fbgemm_gpu/src/permute_pooled_embedding_ops_cpu.cpp 1 fbgemm_gpu/src/cumem_utils.h 18 fbgemm_gpu/src/quantize_ops_cpu.cpp 303 fbgemm_gpu/src/quantize_ops_gpu.cpp 35 fbgemm_gpu/src/permute_pooled_embedding_ops_gpu.cpp 132 fbgemm_gpu/src/split_embeddings_utils.cu 284 fbgemm_gpu/src/cpu_utils.cpp 145 fbgemm_gpu/src/layout_transform_ops.cu 129 fbgemm_gpu/src/permute_pooled_embedding_ops.cu 74 cmake/modules/FindMKL.cmake 252 cmake/modules/FindSphinx.cmake 8 include/fbgemm/ConvUtils.h 161 include/fbgemm/Utils.h 214 include/fbgemm/FbgemmSparse.h 135 include/fbgemm/QuantUtilsAvx512.h 21 include/fbgemm/FbgemmFPCommon.h 190 include/fbgemm/spmmUtilsAvx2.h 28 include/fbgemm/FbgemmConvert.h 56 include/fbgemm/spmmUtils.h 44 include/fbgemm/Types.h 116 include/fbgemm/FbgemmBuild.h 60 include/fbgemm/FbgemmI8DepthwiseAvx2.h 63 include/fbgemm/FbgemmI64.h 18 include/fbgemm/FbgemmEmbedding.h 234 include/fbgemm/PackingTraits-inl.h 298 include/fbgemm/QuantUtils.h 224 include/fbgemm/QuantUtilsAvx2.h 111 include/fbgemm/FbgemmI8Spmdm.h 77 include/fbgemm/OutputProcessing-inl.h 282 include/fbgemm/UtilsAvx2.h 51 include/fbgemm/FbgemmPackMatrixB.h 200 include/fbgemm/Fbgemm.h 804 include/fbgemm/FbgemmFP16.h 40 include/fbgemm/FbgemmI8DirectconvAvx2.h 33 defs.bzl 118 src/QuantUtilsAvx512.cc 356 src/FbgemmI8DepthwisePerChannelQuantAvx2.cc 91 src/ExecuteKernel.cc 1 src/QuantUtils.cc 706 src/GenerateKernelDirectConvU8S8S32ACC32.cc 491 src/UtilsAvx2.cc 269 src/spmmUtilsAvx2.cc 271 src/FbgemmSparseDenseInt8Avx512.cc 506 src/RowWiseSparseAdagradFused.cc 823 src/FbgemmSparseDenseInt8Avx2.cc 221 src/GenerateKernelU8S8S32ACC16Avx512.cc 256 src/FbgemmI8Depthwise3DAvx2.cc 989 src/FbgemmFP16UKernelsAvx2.cc 886 src/OptimizedKernelsAvx2.cc 299 src/FbgemmFP16UKernelsAvx512_256.cc 2247 src/PackWeightsForDirectConv.cc 412 src/FbgemmI8Spmdm.cc 296 src/FbgemmSparseDenseAvx512.cc 130 src/TransposeUtilsAvx2.h 343 src/FbgemmI8Depthwise2DAvx2-inl.h 656 src/GenerateKernel.h 99 src/spmmUtils.cc 292 src/GroupwiseConvAcc32Avx512.cc 206 src/FbgemmBfloat16Convert.cc 56 src/FbgemmFP16UKernelsAvx512.cc 3130 src/ExecuteKernelU8S8.h 50 src/PackWeightMatrixForGConv.cc 189 src/Utils.cc 397 src/GenerateI8Depthwise.h 30 src/EmbeddingSpMDM.cc 1251 src/PackBMatrix.cc 264 src/FbgemmFP16UKernelsAvx2.h 14 src/QuantUtilsAvx2.cc 1761 src/FbgemmBfloat16ConvertAvx2.cc 42 src/FbgemmFloat16ConvertAvx512.cc 54 src/FbgemmConv.cc 434 src/FbgemmFP16UKernelsIntrinsicAvx2.cc 86 src/FbgemmFP16UKernelsAvx512.h 22 src/FbgemmBfloat16ConvertAvx512.cc 38 src/SparseAdagrad.cc 842 src/OptimizedKernelsAvx2.h 19 src/Fbgemm.cc 510 src/FbgemmFPCommon.cc 260 src/PackDepthwiseConvMatrixAvx2.cc 113 src/FbgemmFP16UKernelsIntrinsicAvx512_256.cc 92 src/GenerateKernelU8S8S32ACC16Avx512VNNI.cc 27 src/FbgemmFloat16ConvertAvx2.cc 54 src/ExecuteKernelU8S8.cc 486 src/GenerateKernelU8S8S32ACC32Avx512VNNI.cc 271 src/CodeGenHelpers.h 167 src/GenerateI8Depthwise.cc 483 src/FbgemmFloat16Convert.cc 74 src/GenerateKernelU8S8S32ACC32.cc 308 src/GenerateKernelU8S8S32ACC16.cc 262 src/FbgemmI8DepthwiseAvx2.cc 158 src/RefImplementations.cc 1755 src/GroupwiseConvAcc32Avx2.cc 204 src/PackWeightsForConv.cc 172 src/EmbeddingSpMDMAvx512.cc 521 src/FbgemmFP16UKernelsIntrinsicAvx512.cc 111 src/codegen_fp16fp32.cc 461 src/RefImplementations.h 259 src/PackAWithRowOffset.cc 201 src/ExecuteKernel.h 5 src/FbgemmSparseDenseVectorInt8Avx512.cc 217 src/GroupwiseConv.h 256 src/GroupwiseConv.cc 938 src/EmbeddingSpMDMNBit.cc 1170 src/FbgemmI64.cc 409 src/PackAWithIm2Col.cc 645 src/EmbeddingSpMDMAvx2.cc 132 src/CodeCache.h 96 src/FbgemmFP16UKernelsAvx512_256.h 16 src/UtilsAvx512.cc 845 src/FbgemmSparseDense.cc 253 src/TransposeUtils.cc 74 src/FbgemmFP16.cc 119 src/PackAWithQuantRowOffset.cc 217 src/PackAMatrix.cc 153 src/ExecuteKernelGeneric.h 49 src/FbgemmSparseDenseAvx2.cc 117 src/DirectConv.h 148 src/PackMatrix.cc 116 src/MaskAvx2.h 39 src/TransposeUtils.h 31 src/GenerateKernel.cc 15 src/FbgemmI8DepthwiseAvx2-inl.h 487