server: Reduce gpt-oss context length for small VRAM GPUs
gpt-oss works best with a context length of at least 8k. However, for GPUs with a limited amount of VRAM, there is a significant performance hit at this increased context length. In these cases, we switch to the Ollama default of 4k.
@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {
 
 var useClient2 = experimentEnabled("client2")
 
+// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
+// reduced context length on some models
+var lowVRAMThreshold uint64 = 20 * format.GibiByte
+
 var mode string = gin.DebugMode
 
 type Server struct {
 	addr  net.Addr
 	sched *Scheduler
+	lowVRAM bool
 }
 
 func init() {
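The threshold is plain byte arithmetic: assuming format.GibiByte is the usual binary gibibyte (2^30 bytes), the cutoff is 20 GiB of total (not free) VRAM. A minimal standalone sketch of the same computation, using a local constant in place of the format package:

package main

import "fmt"

// GibiByte mirrors the assumed value of format.GibiByte: 2^30 bytes.
const GibiByte uint64 = 1 << 30

func main() {
	// Same value as lowVRAMThreshold in the patch: 20 GiB.
	threshold := 20 * GibiByte
	fmt.Printf("low VRAM threshold: %d bytes (%.0f GiB)\n",
		threshold, float64(threshold)/float64(GibiByte))
	// Output: low VRAM threshold: 21474836480 bytes (20 GiB)
}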
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}
 
-	// This model requires a minimum context to function effectively
-	if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+	// This model is much more capable with a larger context, so set that
+	// unless it would penalize performance too much
+	if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}
 
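The effect of the new gate is easy to state: gpt-oss models get their context length raised to at least 8192 tokens, unless the server is in low VRAM mode, in which case the requested (or default 4k) value stands. A standalone sketch of that decision, where effectiveNumCtx is a hypothetical helper name, not a function in the patch:

package main

import (
	"fmt"
	"slices"
)

// effectiveNumCtx renders the patched check in scheduleRunner as a pure
// function: raise the context length to at least 8192 for gpt-oss models,
// but only when the server is not in low VRAM mode.
func effectiveNumCtx(requested int, families []string, lowVRAM bool) int {
	if !lowVRAM && slices.Contains(families, "gptoss") {
		return max(requested, 8192)
	}
	return requested
}

func main() {
	fmt.Println(effectiveNumCtx(4096, []string{"gptoss"}, false)) // 8192
	fmt.Println(effectiveNumCtx(4096, []string{"gptoss"}, true))  // 4096: low VRAM keeps the default
}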
@@ -1382,6 +1389,15 @@ func Serve(ln net.Listener) error {
 	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()
 
+	var totalVRAM uint64
+	for _, gpu := range gpus {
+		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
+	}
+	if totalVRAM < lowVRAMThreshold {
+		s.lowVRAM = true
+		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+	}
+
 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
 	// otherwise error out quickly
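The detection pass runs once at startup: it sums TotalMemory across all discovered GPUs, net of the configured per-GPU overhead, and compares the sum against the threshold. A standalone sketch of the same accounting, where gpuInfo is a stand-in for the discover package's GPU type and lowVRAM is a hypothetical helper:

package main

import "fmt"

const gibiByte uint64 = 1 << 30

// gpuInfo stands in for the single field of the discovered GPU info used here.
type gpuInfo struct {
	TotalMemory uint64
}

// lowVRAM reports whether the summed VRAM (total, not free, minus a fixed
// per-GPU overhead) falls below the 20 GiB threshold from the patch.
func lowVRAM(gpus []gpuInfo, overhead uint64) bool {
	var total uint64
	for _, gpu := range gpus {
		total += gpu.TotalMemory - overhead
	}
	return total < 20*gibiByte
}

func main() {
	// Example: two 8 GiB GPUs with no configured overhead sum to 16 GiB,
	// which is under the threshold, so low VRAM mode would engage.
	gpus := []gpuInfo{{TotalMemory: 8 * gibiByte}, {TotalMemory: 8 * gibiByte}}
	fmt.Println(lowVRAM(gpus, 0)) // true
}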