pkgs/python-modules/awq-inference-engine/default.nix

{
  lib,
  stdenv,
  fetchFromGitHub,
  buildPythonPackage,
  autoAddDriverRunpath,
  cmake,
  git,
  ninja,
  packaging,
  psutil,
  which,
  cudaPackages,
  torch,
}:

buildPythonPackage rec {
  pname = "awq-inference-engine";
  version = "0.1.0-dev";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "llm-awq";
    rev = "bd1dc2d5254345cc76ab71894651fb821275bdd4";
    hash = "sha256-vNe8Nhnzrux3ysi+1d2E8CuJJ3uPxpATdSTlywC6p+s=";
  };

  # The CUDA kernels live in a subdirectory of the upstream repository.
  sourceRoot = "${src.name}/awq/kernels";

  # Build with the CUDA-compatible stdenv so the host compiler matches
  # what nvcc supports.
  stdenv = cudaPackages.backendStdenv;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcusolver
    libcusparse
  ];

  nativeBuildInputs = [
    autoAddDriverRunpath
    cmake
    ninja
    which
  ];

  dependencies = [
    torch
    packaging
  ];

  env =
    let
      # Only supports compute capability 8.0 or later.
      cudaCapabilities = lib.filter (v: lib.versionAtLeast v "8.0") torch.cudaCapabilities;
    in
    {
      CUDA_HOME = lib.getDev cudaPackages.cuda_nvcc;
      TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" cudaCapabilities;
    };

  propagatedBuildInputs = [ torch ];

  # cmake/ninja are used for parallel builds, but we don't want the
  # cmake configure hook to kick in.
  dontUseCmakeConfigure = true;

  # We don't have any tests in this package (yet).
  doCheck = false;

  # Let the extension build use all cores granted to the builder.
  preBuild = ''
    export MAX_JOBS=$NIX_BUILD_CORES
  '';

  pythonImportsCheck = [ "awq_inference_engine" ];

  meta = with lib; {
    description = "Activation-aware Weight Quantization (AWQ) kernels";
    license = licenses.mit;
    platforms = platforms.linux;
  };
}
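
# A minimal consumption sketch, not part of the original file: it assumes
# this derivation is exposed through a nixpkgs overlay. The overlay file
# name (overlay.nix) and the relative path below are hypothetical; only
# `pythonPackagesExtensions` and `callPackage` are standard nixpkgs
# mechanisms. Arguments missing from the Python package set (such as
# cudaPackages and autoAddDriverRunpath) fall back to the top-level
# package set via callPackage.
#
#   # overlay.nix
#   final: prev: {
#     pythonPackagesExtensions = prev.pythonPackagesExtensions ++ [
#       (python-final: python-prev: {
#         awq-inference-engine =
#           python-final.callPackage ./pkgs/python-modules/awq-inference-engine { };
#       })
#     ];
#   }
#
# With the overlay applied, the kernels are importable from Python as
# `import awq_inference_engine`, matching pythonImportsCheck above.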