# Build configuration for the paged_attention kernel extension.
[general]
name = "paged_attention"
universal = false

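# Torch extension sources (C++ bindings that expose the compiled kernels to PyTorch).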
[torch]
src = [
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h"
]

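# Shared device-utility sources, built for the CUDA backend.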
[kernel.cuda_utils]
backend = "cuda"
src = [
  "cuda-utils/cuda_utils.h",
  "cuda-utils/cuda_utils_kernels.cu",
]
depends = []

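# ROCm variant of the utility sources; rocm-archs lists the AMD gfx targets to build for.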
[kernel.cuda_utils_rocm]
backend = "rocm"
rocm-archs = [
  "gfx906",
  "gfx908",
  "gfx90a",
  "gfx940",
  "gfx941",
  "gfx942",
  "gfx1030",
  "gfx1100",
  "gfx1101",
]
src = [
  "cuda-utils/cuda_utils.h",
  "cuda-utils/cuda_utils_kernels.cu",
]
depends = ["torch"]

[kernel.paged_attention]
backend = "cuda"
src = [
  "cuda-utils/cuda_utils.h",
  "paged-attention/attention/attention_dtypes.h",
  "paged-attention/attention/attention_generic.cuh",
  "paged-attention/attention/attention_kernels.cuh",
  "paged-attention/attention/attention_utils.cuh",
  "paged-attention/attention/dtype_bfloat16.cuh",
  "paged-attention/attention/dtype_float16.cuh",
  "paged-attention/attention/dtype_float32.cuh",
  "paged-attention/attention/dtype_fp8.cuh",
  "paged-attention/attention/paged_attention_v1.cu",
  "paged-attention/attention/paged_attention_v2.cu",
  "paged-attention/cache_kernels.cu",
  "paged-attention/cuda_compat.h",
  "paged-attention/dispatch_utils.h",
  "paged-attention/quantization/fp8/amd/quant_utils.cuh",
  "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]

[kernel.paged_attention_rocm]
backend = "rocm"
rocm-archs = [
  "gfx906",
  "gfx908",
  "gfx90a",
  "gfx940",
  "gfx941",
  "gfx942",
  "gfx1030",
  "gfx1100",
  "gfx1101",
]
src = [
  "cuda-utils/cuda_utils.h",
  "paged-attention/attention/attention_dtypes.h",
  "paged-attention/attention/attention_generic.cuh",
  "paged-attention/attention/attention_kernels.cuh",
  "paged-attention/attention/attention_utils.cuh",
  "paged-attention/attention/dtype_bfloat16.cuh",
  "paged-attention/attention/dtype_float16.cuh",
  "paged-attention/attention/dtype_float32.cuh",
  "paged-attention/attention/dtype_fp8.cuh",
  "paged-attention/attention/paged_attention_v1.cu",
  "paged-attention/attention/paged_attention_v2.cu",
  "paged-attention/cache_kernels.cu",
  "paged-attention/cuda_compat.h",
  "paged-attention/dispatch_utils.h",
  "paged-attention/quantization/fp8/amd/quant_utils.cuh",
  "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]

[kernel.paged_attention_metal]
backend = "metal"
src = [
  "paged-attention-metal/attention/paged_attention.metal",
  "paged-attention-metal/cache/copy_blocks.metal",
  "paged-attention-metal/cache/reshape_and_cache.metal",
  "paged-attention-metal/convert_fp8.metal",
  "paged-attention-metal/float8.metal",
  "paged-attention-metal/utils.metal",
  "paged-attention-metal/paged_attention.mm",
  "paged-attention-metal/cache.mm",
  "paged-attention-metal/convert_fp8.mm",
  "paged-attention-metal/device.mm",
]
include = ["."]
depends = ["torch"]