Use flash attention flag for now (#4580)

* put flash attention behind flag for now

* add test

* remove print

* up timeout for scheduler tests
Author: Jeffrey Morgan
Date: 2024-05-22 21:52:09 -07:00
Committed by: GitHub
Parent: 73630a7e85
Commit: 38255d2af1
4 changed files with 19 additions and 6 deletions


@@ -200,20 +200,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	flashAttnSupported := true
+	flashAttnEnabled := envconfig.FlashAttention
 
 	// partial offloading does not support flash attention
-	if uint64(opts.NumGPU) < ggml.KV().BlockCount() + 1 {
-		flashAttnSupported = false
+	if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+		flashAttnEnabled = false
 	}
 
 	// only cuda (compute capability 7+) and metal support flash attention
 	for _, g := range gpus {
 		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-			flashAttnSupported = false
+			flashAttnEnabled = false
 		}
 	}
 
-	if flashAttnSupported {
+	if flashAttnEnabled {
 		params = append(params, "--flash-attn")
 	}
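
For context, the patched logic makes flash attention strictly opt-in: envconfig.FlashAttention turns it on, and it is switched back off when the model is only partially offloaded or when any GPU is neither Metal nor CUDA with driver major version 7 or higher. Below is a minimal standalone sketch of that decision, not ollama's actual code: the gpuInfo type and flashAttentionArgs helper are hypothetical stand-ins for gpu.GpuInfoList and the server parameter assembly, and the OLLAMA_FLASH_ATTENTION variable name is an assumption about how envconfig.FlashAttention is populated.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// gpuInfo is a hypothetical stand-in for the fields of gpu.GpuInfo used here.
type gpuInfo struct {
	Library     string // e.g. "metal", "cuda", "rocm"
	DriverMajor int
}

// flashAttentionFromEnv reads the opt-in flag; OLLAMA_FLASH_ATTENTION is an
// assumed variable name. Unset or unparseable values leave the feature off.
func flashAttentionFromEnv() bool {
	v, err := strconv.ParseBool(os.Getenv("OLLAMA_FLASH_ATTENTION"))
	return err == nil && v
}

// flashAttentionArgs mirrors the gating in the diff: start from the opt-in
// flag, then disable it when offloading is partial or any GPU lacks support.
func flashAttentionArgs(optIn bool, numGPULayers, blockCount uint64, gpus []gpuInfo) []string {
	enabled := optIn

	// partial offloading does not support flash attention
	if numGPULayers < blockCount+1 {
		enabled = false
	}

	// only cuda (compute capability 7+) and metal support flash attention
	for _, g := range gpus {
		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
			enabled = false
		}
	}

	if enabled {
		return []string{"--flash-attn"}
	}
	return nil
}

func main() {
	gpus := []gpuInfo{{Library: "cuda", DriverMajor: 8}}
	// Fully offloaded (33 layers for a 32-block model) with the flag set: enabled.
	fmt.Println(flashAttentionArgs(true, 33, 32, gpus))
	// Same hardware with the flag unset: the new default keeps it off.
	fmt.Println(flashAttentionArgs(flashAttentionFromEnv(), 33, 32, gpus))
}

The net effect of the change is that users who have not set the flag keep the previous behavior, while the existing hardware checks still protect users who do opt in.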