Don't clamp ctx size in PredictServerFit (#4317)

* don't clamp ctx size in `PredictServerFit`

* enforce a minimum context size of 4

* remove context warning
This commit is contained in:
Jeffrey Morgan
2024-05-10 10:17:12 -07:00
committed by GitHub
parent 7e2bceceee
commit bb6fd02298
3 changed files with 6 additions and 19 deletions

View File

@@ -61,6 +61,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
// allocate a large enough kv cache for all parallel requests
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
opts.NumCtx = opts.NumCtx * envconfig.NumParallel
req := &LlmRequest{