mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 08:17:03 +00:00
Only set default keep_alive on initial model load
This change fixes the handling of keep_alive so that if client request omits the setting, we only set this on initial load. Once the model is loaded, if new requests leave this unset, we'll keep whatever keep_alive was there.
This commit is contained in:
@@ -24,7 +24,7 @@ type LlmRequest struct {
|
||||
model *Model
|
||||
opts api.Options
|
||||
origNumCtx int // Track the initial ctx request
|
||||
sessionDuration time.Duration
|
||||
sessionDuration *api.Duration
|
||||
successCh chan *runnerRef
|
||||
errCh chan error
|
||||
schedAttempts uint
|
||||
@@ -75,7 +75,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
||||
}
|
||||
|
||||
// context must be canceled to decrement ref count and release the runner
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
@@ -389,7 +389,9 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
runner.sessionDuration = pending.sessionDuration
|
||||
if pending.sessionDuration != nil {
|
||||
runner.sessionDuration = pending.sessionDuration.Duration
|
||||
}
|
||||
pending.successCh <- runner
|
||||
go func() {
|
||||
<-pending.ctx.Done()
|
||||
@@ -402,6 +404,10 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
|
||||
if numParallel < 1 {
|
||||
numParallel = 1
|
||||
}
|
||||
sessionDuration := envconfig.KeepAlive
|
||||
if req.sessionDuration != nil {
|
||||
sessionDuration = req.sessionDuration.Duration
|
||||
}
|
||||
llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
|
||||
if err != nil {
|
||||
// some older models are not compatible with newer versions of llama.cpp
|
||||
@@ -419,7 +425,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
|
||||
modelPath: req.model.ModelPath,
|
||||
llama: llama,
|
||||
Options: &req.opts,
|
||||
sessionDuration: req.sessionDuration,
|
||||
sessionDuration: sessionDuration,
|
||||
gpus: gpus,
|
||||
estimatedVRAM: llama.EstimatedVRAM(),
|
||||
estimatedTotal: llama.EstimatedTotal(),
|
||||
|
||||
Reference in New Issue
Block a user