llm: introduce k/v context quantization (VRAM improvements) (#6279)

Sam
2024-12-04 10:57:19 +11:00
committed by GitHub
parent 2b82c5a8a1
commit 1bdab9fdb1
10 changed files with 147 additions and 21 deletions


@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int {
	return coreCount
}

// FlashAttentionSupported returns true only if every GPU in the list
// supports flash attention (Metal, CUDA with driver major version >= 7, or ROCm).
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"
		if !supportsFA {
			return false
		}
	}
	return true
}
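
This helper presumably gates the new quantized k/v cache on flash attention being available on every detected GPU. Below is a minimal, self-contained sketch of how the check behaves; the GpuInfo struct fields here are stand-ins inferred from the diff above, not the repo's full type definitions.

package main

import "fmt"

// Stand-in types inferred from the diff above (illustration only).
type GpuInfo struct {
	Library     string // "metal", "cuda", or "rocm"
	DriverMajor int    // CUDA driver major version
}

type GpuInfoList []GpuInfo

// Same logic as the FlashAttentionSupported method in the diff.
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"
		if !supportsFA {
			return false
		}
	}
	return true
}

func main() {
	// Mixed system: one CUDA GPU on a new enough driver, one on an old driver.
	gpus := GpuInfoList{
		{Library: "cuda", DriverMajor: 8},
		{Library: "cuda", DriverMajor: 6},
	}
	// Prints "false": a single unsupported GPU disqualifies the whole list.
	fmt.Println(gpus.FlashAttentionSupported())
}

Note the all-or-nothing design: because the k/v cache is shared across devices when a model is split, one GPU without flash attention support is enough to disable the feature for the whole list.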