Use flash attention flag for now (#4580)

* put flash attention behind flag for now

* add test

* remove print

* up timeout for scheduler tests
Author: Jeffrey Morgan
Date: 2024-05-22 21:52:09 -07:00
Committed by: GitHub
Parent: 73630a7e85
Commit: 38255d2af1
4 changed files with 19 additions and 6 deletions


@@ -200,20 +200,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	flashAttnSupported := true
+	flashAttnEnabled := envconfig.FlashAttention
 
 	// partial offloading does not support flash attention
-	if uint64(opts.NumGPU) < ggml.KV().BlockCount() + 1 {
-		flashAttnSupported = false
+	if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+		flashAttnEnabled = false
 	}
 
 	// only cuda (compute capability 7+) and metal support flash attention
 	for _, g := range gpus {
 		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-			flashAttnSupported = false
+			flashAttnEnabled = false
 		}
 	}
 
-	if flashAttnSupported {
+	if flashAttnEnabled {
 		params = append(params, "--flash-attn")
 	}
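
For context, the patched logic makes flash attention strictly opt-in: envconfig.FlashAttention turns it on, and it is switched back off when the model is only partially offloaded or when any GPU is neither Metal nor CUDA with driver major version 7 or higher. Below is a minimal standalone sketch of that decision, not ollama's actual code: the gpuInfo type and flashAttentionArgs helper are hypothetical stand-ins for gpu.GpuInfoList and the server parameter assembly, and the OLLAMA_FLASH_ATTENTION variable name is an assumption about how envconfig.FlashAttention is populated.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// gpuInfo is a hypothetical stand-in for the fields of gpu.GpuInfo used here.
type gpuInfo struct {
	Library     string // e.g. "metal", "cuda", "rocm"
	DriverMajor int
}

// flashAttentionFromEnv reads the opt-in flag; OLLAMA_FLASH_ATTENTION is an
// assumed variable name. Unset or unparseable values leave the feature off.
func flashAttentionFromEnv() bool {
	v, err := strconv.ParseBool(os.Getenv("OLLAMA_FLASH_ATTENTION"))
	return err == nil && v
}

// flashAttentionArgs mirrors the gating in the diff: start from the opt-in
// flag, then disable it when offloading is partial or any GPU lacks support.
func flashAttentionArgs(optIn bool, numGPULayers, blockCount uint64, gpus []gpuInfo) []string {
	enabled := optIn

	// partial offloading does not support flash attention
	if numGPULayers < blockCount+1 {
		enabled = false
	}

	// only cuda (compute capability 7+) and metal support flash attention
	for _, g := range gpus {
		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
			enabled = false
		}
	}

	if enabled {
		return []string{"--flash-attn"}
	}
	return nil
}

func main() {
	gpus := []gpuInfo{{Library: "cuda", DriverMajor: 8}}
	// Fully offloaded (33 layers for a 32-block model) with the flag set: enabled.
	fmt.Println(flashAttentionArgs(true, 33, 32, gpus))
	// Same hardware with the flag unset: the new default keeps it off.
	fmt.Println(flashAttentionArgs(flashAttentionFromEnv(), 33, 32, gpus))
}

The net effect of the change is that users who have not set the flag keep the previous behavior, while the existing hardware checks still protect users who do opt in.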