server/internal/client: use chunksums for concurrent blob verification (#9746)

Replace large-chunk blob downloads with parallel small-chunk verification to solve timeout and performance issues. Registry users experienced progressively slowing download speeds as large-chunk transfers aged, often timing out completely. The previous approach downloaded blobs in a few large chunks but required a separate, single-threaded pass to read the entire blob back from disk for verification after download completion. This change uses the new chunksums API to fetch many smaller chunk+digest pairs, allowing concurrent downloads and immediate verification as each chunk arrives. Chunks are written directly to their final positions, eliminating the entire separate verification pass. The result is more reliable downloads that maintain speed throughout the transfer process and significantly faster overall completion, especially over unstable connections or with large blobs.
2025-12-12 16:57:04 +00:00 · 2025-03-13 22:18:29 -07:00
parent 4ea4d2b189
commit eb2b22b042
8 changed files with 441 additions and 291 deletions
--- a/server/internal/registry/server.go
+++ b/server/internal/registry/server.go
@@ -1,6 +1,5 @@
-// Package registry provides an http.Handler for handling local Ollama API
-// requests for performing tasks related to the ollama.com model registry and
-// the local disk cache.
+// Package registry implements an http.Handler for handling local Ollama API
+// model management requests. See [Local] for details.
 package registry

 import (
@@ -10,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"maps"
 	"net/http"
 	"sync"
 	"time"
@@ -18,16 +18,11 @@ import (
 	"github.com/ollama/ollama/server/internal/client/ollama"
 )

-// Local is an http.Handler for handling local Ollama API requests for
-// performing tasks related to the ollama.com model registry combined with the
-// local disk cache.
+// Local implements an http.Handler for handling local Ollama API model
+// management requests, such as pushing, pulling, and deleting models.
 //
-// It is not concern of Local, or this package, to handle model creation, which
-// proceeds any registry operations for models it produces.
-//
-// NOTE: The package built for dealing with model creation should use
-// [DefaultCache] to access the blob store and not attempt to read or write
-// directly to the blob disk cache.
+// It can be arranged for all unknown requests to be passed through to a
+// fallback handler, if one is provided.
 type Local struct {
 	Client *ollama.Registry // required
 	Logger *slog.Logger     // required
@@ -63,6 +58,7 @@ func (e serverError) Error() string {
 var (
 	errMethodNotAllowed = &serverError{405, "method_not_allowed", "method not allowed"}
 	errNotFound         = &serverError{404, "not_found", "not found"}
+	errModelNotFound    = &serverError{404, "not_found", "model not found"}
 	errInternalError    = &serverError{500, "internal_error", "internal server error"}
 )

@@ -175,8 +171,16 @@ func (s *Local) serveHTTP(rec *statusCodeRecorder, r *http.Request) {
 }

 type params struct {
-	DeprecatedName string `json:"name"`  // Use [params.model]
-	Model          string `json:"model"` // Use [params.model]
+	// DeprecatedName is the name of the model to push, pull, or delete,
+	// but is deprecated. New clients should use [Model] instead.
+	//
+	// Use [model()] to get the model name for both old and new API requests.
+	DeprecatedName string `json:"name"`
+
+	// Model is the name of the model to push, pull, or delete.
+	//
+	// Use [model()] to get the model name for both old and new API requests.
+	Model string `json:"model"`

 	// AllowNonTLS is a flag that indicates a client using HTTP
 	// is doing so, deliberately.
@@ -189,9 +193,18 @@ type params struct {
 	// confusing flags such as this.
 	AllowNonTLS bool `json:"insecure"`

-	// ProgressStream is a flag that indicates the client is expecting a stream of
-	// progress updates.
-	ProgressStream bool `json:"stream"`
+	// Stream, if true, will make the server send progress updates in a
+	// streaming of JSON objects. If false, the server will send a single
+	// JSON object with the final status as "success", or an error object
+	// if an error occurred.
+	//
+	// Unfortunately, this API was designed to be a bit awkward. Stream is
+	// defined to default to true if not present, so we need a way to check
+	// if the client decisively it to false. So, we use a pointer to a
+	// bool. Gross.
+	//
+	// Use [stream()] to get the correct value for this field.
+	Stream *bool `json:"stream"`
 }

 // model returns the model name for both old and new API requests.
@@ -199,6 +212,13 @@ func (p params) model() string {
 	return cmp.Or(p.Model, p.DeprecatedName)
 }

+func (p params) stream() bool {
+	if p.Stream == nil {
+		return true
+	}
+	return *p.Stream
+}
+
 func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
 	if r.Method != "DELETE" {
 		return errMethodNotAllowed
@@ -212,16 +232,16 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error {
 		return err
 	}
 	if !ok {
-		return &serverError{404, "not_found", "model not found"}
+		return errModelNotFound
 	}
-	if s.Prune == nil {
-		return nil
+	if s.Prune != nil {
+		return s.Prune()
 	}
-	return s.Prune()
+	return nil
 }

 type progressUpdateJSON struct {
-	Status    string      `json:"status"`
+	Status    string      `json:"status,omitempty,omitzero"`
 	Digest    blob.Digest `json:"digest,omitempty,omitzero"`
 	Total     int64       `json:"total,omitempty,omitzero"`
 	Completed int64       `json:"completed,omitempty,omitzero"`
@@ -237,6 +257,17 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 		return err
 	}

+	enc := json.NewEncoder(w)
+	if !p.stream() {
+		if err := s.Client.Pull(r.Context(), p.model()); err != nil {
+			if errors.Is(err, ollama.ErrModelNotFound) {
+				return errModelNotFound
+			}
+			return err
+		}
+		return enc.Encode(progressUpdateJSON{Status: "success"})
+	}
+
 	maybeFlush := func() {
 		fl, _ := w.(http.Flusher)
 		if fl != nil {
@@ -246,69 +277,67 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error {
 	defer maybeFlush()

 	var mu sync.Mutex
-	enc := json.NewEncoder(w)
-	enc.Encode(progressUpdateJSON{Status: "pulling manifest"})
+	progress := make(map[*ollama.Layer]int64)

-	ctx := ollama.WithTrace(r.Context(), &ollama.Trace{
-		Update: func(l *ollama.Layer, n int64, err error) {
-			mu.Lock()
-			defer mu.Unlock()
+	progressCopy := make(map[*ollama.Layer]int64, len(progress))
+	pushUpdate := func() {
+		defer maybeFlush()

-			// TODO(bmizerany): coalesce these updates; writing per
-			// update is expensive
+		// TODO(bmizerany): This scales poorly with more layers due to
+		// needing to flush out them all in one big update. We _could_
+		// just flush on the changed ones, or just track the whole
+		// download. Needs more thought. This is fine for now.
+		mu.Lock()
+		maps.Copy(progressCopy, progress)
+		mu.Unlock()
+		for l, n := range progress {
 			enc.Encode(progressUpdateJSON{
 				Digest:    l.Digest,
-				Status:    "pulling",
 				Total:     l.Size,
 				Completed: n,
 			})
+		}
+	}
+
+	t := time.NewTicker(time.Hour) // "unstarted" timer
+	start := sync.OnceFunc(func() {
+		pushUpdate()
+		t.Reset(100 * time.Millisecond)
+	})
+	ctx := ollama.WithTrace(r.Context(), &ollama.Trace{
+		Update: func(l *ollama.Layer, n int64, err error) {
+			if n > 0 {
+				start() // flush initial state
+			}
+			mu.Lock()
+			progress[l] = n
+			mu.Unlock()
 		},
 	})

 	done := make(chan error, 1)
 	go func() {
-		// TODO(bmizerany): continue to support non-streaming responses
 		done <- s.Client.Pull(ctx, p.model())
 	}()

-	func() {
-		t := time.NewTicker(100 * time.Millisecond)
-		defer t.Stop()
-		for {
-			select {
-			case <-t.C:
-				mu.Lock()
-				maybeFlush()
-				mu.Unlock()
-			case err := <-done:
-				if err != nil {
-					var status string
-					if errors.Is(err, ollama.ErrModelNotFound) {
-						status = fmt.Sprintf("error: model %q not found", p.model())
-						enc.Encode(progressUpdateJSON{Status: status})
-					} else {
-						status = fmt.Sprintf("error: %v", err)
-						enc.Encode(progressUpdateJSON{Status: status})
-					}
-					return
+	for {
+		select {
+		case <-t.C:
+			pushUpdate()
+		case err := <-done:
+			pushUpdate()
+			if err != nil {
+				var status string
+				if errors.Is(err, ollama.ErrModelNotFound) {
+					status = fmt.Sprintf("error: model %q not found", p.model())
+				} else {
+					status = fmt.Sprintf("error: %v", err)
 				}
-
-				// These final updates are not strictly necessary, because they have
-				// already happened at this point. Our pull handler code used to do
-				// these steps after, not during, the pull, and they were slow, so we
-				// wanted to provide feedback to users what was happening. For now, we
-				// keep them to not jar users who are used to seeing them. We can phase
-				// them out with a new and nicer UX later. One without progress bars
-				// and digests that no one cares about.
-				enc.Encode(progressUpdateJSON{Status: "verifying layers"})
-				enc.Encode(progressUpdateJSON{Status: "writing manifest"})
-				enc.Encode(progressUpdateJSON{Status: "success"})
-				return
+				enc.Encode(progressUpdateJSON{Status: status})
 			}
+			return nil
 		}
-	}()
-
-	return nil
+	}
 }

 func decodeUserJSON[T any](r io.Reader) (T, error) {