Don't clamp ctx size in PredictServerFit (#4317)

* don't clamp ctx size in `PredictServerFit`
* enforce a minimum context size of 4
* remove context warning
2025-12-10 07:46:59 +00:00 · 2024-05-10 10:17:12 -07:00
parent 7e2bceceee
commit bb6fd02298
3 changed files with 6 additions and 19 deletions
--- a/server/sched.go
+++ b/server/sched.go
@@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 	// allocate a large enough kv cache for all parallel requests
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
 	opts.NumCtx = opts.NumCtx * envconfig.NumParallel

 	req := &LlmRequest{