mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Don't clamp ctx size in PredictServerFit (#4317)
* dont clamp ctx size in `PredictServerFit` * minimum 4 context * remove context warning
This commit is contained in:
@@ -12,17 +12,8 @@ import (
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
var estimatedVRAM uint64
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
// Split up the GPUs by type and try them
|
||||
var estimatedVRAM uint64
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
||||
@@ -77,15 +77,7 @@ func LoadModel(model string) (*GGML, error) {
|
||||
// The gpu list must be a single family.
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
|
||||
var err error
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
cpuRunner := ""
|
||||
var cpuRunner string
|
||||
var estimatedVRAM uint64
|
||||
var estimatedTotal uint64
|
||||
var systemMemory uint64
|
||||
|
||||
Reference in New Issue
Block a user