Use flash attention flag for now (#4580)
* put flash attention behind flag for now
* add test
* remove print
* up timeout for scheduler tests
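For context on the first bullet: the diff below gates the --flash-attn server argument on envconfig.FlashAttention, but the hunk does not show how that value is populated. A minimal sketch, assuming the flag is read from an OLLAMA_FLASH_ATTENTION environment variable (that variable name, the helper, and the package layout are assumptions for illustration, not taken from this diff):

package envconfig

import (
	"os"
	"strconv"
)

// FlashAttention reports whether the user explicitly opted in to flash
// attention. Unset or unparsable values default to false, so the feature
// stays off unless requested. OLLAMA_FLASH_ATTENTION is an assumed name
// used here for illustration only.
var FlashAttention = boolFromEnv("OLLAMA_FLASH_ATTENTION")

// boolFromEnv parses a boolean environment variable, treating anything
// missing or malformed as false.
func boolFromEnv(key string) bool {
	v, ok := os.LookupEnv(key)
	if !ok {
		return false
	}
	b, err := strconv.ParseBool(v)
	if err != nil {
		return false
	}
	return b
}

Under that assumption, opting in would look like setting OLLAMA_FLASH_ATTENTION=1 before starting the server, and leaving it unset keeps flash attention off, which is the point of the change: previously it defaulted to on whenever the hardware checks passed.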
@@ -200,20 +200,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	flashAttnSupported := true
+	flashAttnEnabled := envconfig.FlashAttention
 
 	// partial offloading does not support flash attention
-	if uint64(opts.NumGPU) < ggml.KV().BlockCount() + 1 {
-		flashAttnSupported = false
+	if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+		flashAttnEnabled = false
 	}
 
 	// only cuda (compute capability 7+) and metal support flash attention
 	for _, g := range gpus {
 		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-			flashAttnSupported = false
+			flashAttnEnabled = false
 		}
 	}
-	if flashAttnSupported {
+	if flashAttnEnabled {
 		params = append(params, "--flash-attn")
 	}
 
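The PR description also mentions adding a test, which is not part of this hunk. As a rough, hypothetical sketch only, the gating decision above could be pulled into a pure helper and exercised table-driven style; flashAttnEnabled, gpuInfo, and all values here are illustrative stand-ins, not code from the repository:

package llm

import "testing"

// gpuInfo is a stand-in for the fields of gpu.GpuInfo that the check uses.
type gpuInfo struct {
	Library     string
	DriverMajor int
}

// flashAttnEnabled mirrors the diff's decision: the user must opt in, the
// model must be fully offloaded, and every GPU must be metal or cuda with
// a sufficiently new driver (the diff's proxy for compute capability 7+).
func flashAttnEnabled(optIn bool, gpuLayers, blockCount uint64, gpus []gpuInfo) bool {
	enabled := optIn
	if gpuLayers < blockCount+1 {
		enabled = false // partial offloading does not support flash attention
	}
	for _, g := range gpus {
		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
			enabled = false
		}
	}
	return enabled
}

func TestFlashAttnGating(t *testing.T) {
	cuda := []gpuInfo{{Library: "cuda", DriverMajor: 8}}
	cases := []struct {
		name              string
		optIn             bool
		gpuLayers, blocks uint64
		gpus              []gpuInfo
		want              bool
	}{
		{"off by default", false, 33, 32, cuda, false},
		{"on when opted in and fully offloaded", true, 33, 32, cuda, true},
		{"partial offload disables it", true, 16, 32, cuda, false},
		{"old cuda driver disables it", true, 33, 32, []gpuInfo{{Library: "cuda", DriverMajor: 6}}, false},
		{"metal is allowed", true, 33, 32, []gpuInfo{{Library: "metal"}}, true},
	}
	for _, c := range cases {
		if got := flashAttnEnabled(c.optIn, c.gpuLayers, c.blocks, c.gpus); got != c.want {
			t.Errorf("%s: got %v, want %v", c.name, got, c.want)
		}
	}
}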