Add support for new models and fix GitHub issues

- Add Gemma3n model support with text generation capabilities
- Add new CUDA mean operations for improved performance
- Add macOS documentation and performance tests
- Update LLAMA patches for ROCm/CUDA compatibility
- Fix various model conversion and processing issues
- Update CI workflows and build configurations
- Add library model tests and Shakespeare test data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Shang Chieh Tseng
2025-07-20 00:12:36 +08:00
parent 1fa71c2670
commit cbcbc9ae07
81 changed files with 132316 additions and 747 deletions

View File

@@ -59,7 +59,7 @@ type DiskCache struct {
 	testHookBeforeFinalWrite func(f *os.File)
 }
 
-// PutString is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))).
+// PutBytes is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))).
 func PutBytes[S string | []byte](c *DiskCache, d Digest, data S) error {
 	return c.Put(d, bytes.NewReader([]byte(data)), int64(len(data)))
 }
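For orientation, here is a minimal usage sketch of the generic PutBytes helper above. The import path, the Open constructor, and DigestFromBytes are assumptions inferred from the surrounding cache code, not confirmed by this diff:

```go
package main

import (
	"fmt"
	"log"

	// Assumed import path for the DiskCache/PutBytes package shown above.
	"github.com/ollama/ollama/server/internal/cache/blob"
)

func main() {
	// Open a disk-backed cache rooted at a local directory (assumed constructor).
	c, err := blob.Open("/tmp/blob-cache")
	if err != nil {
		log.Fatal(err)
	}

	data := []byte("hello, cache")
	d := blob.DigestFromBytes(data) // assumed digest helper

	// The type parameter S lets callers pass either a string or a []byte.
	if err := blob.PutBytes(c, d, data); err != nil {
		log.Fatal(err)
	}
	fmt.Println("stored blob", d)
}
```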

View File

@@ -231,6 +231,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
 	// do not quantize relative position bias (T5)
 	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
+	quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")
+
 	newType := fsggml.TensorType(t.Kind)
 	if quantize {
 		// get more optimal quantization type based on the tensor shape, layer, etc.
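The added line extends the existing guard pattern: quantization is vetoed for any tensor whose name matches a known-sensitive substring, here Gemma3n's per-layer token embeddings alongside the T5 relative position bias. A self-contained sketch of that pattern (the function name is hypothetical):

```go
package main

import (
	"fmt"
	"strings"
)

// shouldQuantize mirrors the guard chain above: start from a base decision
// and veto quantization for any tensor whose name matches a skip list.
func shouldQuantize(name string, quantize bool) bool {
	skip := []string{
		"attn_rel_b.weight",           // T5 relative position bias
		"per_layer_token_embd.weight", // Gemma3n per-layer token embeddings
	}
	for _, s := range skip {
		quantize = quantize && !strings.Contains(name, s)
	}
	return quantize
}

func main() {
	fmt.Println(shouldQuantize("blk.0.per_layer_token_embd.weight", true)) // false
	fmt.Println(shouldQuantize("blk.0.attn_q.weight", true))               // true
}
```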

View File

@@ -1404,6 +1404,9 @@ func (s *Server) PsHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
+		if v.Options != nil {
+			mr.ContextLength = v.Options.NumCtx / v.numParallel
+		}
 
 		// The scheduler waits to set expiresAt, so if a model is loading it's
 		// possible that it will be set to the unix epoch. For those cases, just
 		// calculate the time w/ the sessionDuration instead.
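The new field reports a per-slot context window: NumCtx is the total context the model was loaded with, and each of the numParallel request slots gets an equal share. A worked example of the arithmetic (values hypothetical):

```go
package main

import "fmt"

func main() {
	numCtx := 8192   // total context the model was loaded with
	numParallel := 2 // concurrent request slots sharing that context

	// Mirrors mr.ContextLength = v.Options.NumCtx / v.numParallel above.
	contextLength := numCtx / numParallel
	fmt.Println(contextLength) // 4096
}
```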

View File

@@ -57,9 +57,7 @@ type Scheduler struct {
 var defaultModelsPerGPU = 3
 
 // Default automatic value for parallel setting
-// Model will still need to fit in VRAM. If this setting won't fit
-// we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 1
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
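With defaultParallel lowered from 2 to 1, a loaded model serves one request at a time unless configured otherwise. A sketch of how such a default is typically overridden at startup; the OLLAMA_NUM_PARALLEL variable exists upstream, but this fork's exact resolution logic is not shown in the diff:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

var defaultParallel = 1

// resolveParallel returns the user override if OLLAMA_NUM_PARALLEL is set
// and valid, otherwise the compiled-in default (assumed resolution order).
func resolveParallel() int {
	if v := os.Getenv("OLLAMA_NUM_PARALLEL"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n
		}
	}
	return defaultParallel
}

func main() {
	fmt.Println("parallel slots:", resolveParallel())
}
```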
@@ -191,7 +189,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}
 
 			// Load model for fitting
-			ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
+			ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
 			if err != nil {
 				pending.errCh <- err
 				break
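The second argument to llm.LoadModel appears to bound how much metadata is decoded when a model is opened just for fit estimation; passing 1024 instead of 0 caps large GGUF arrays (such as tokenizer vocabularies) rather than reading them unbounded. A generic sketch of that capped-decode idea; the real signature and semantics live in the llm package and are assumptions here:

```go
package main

import "fmt"

// readArray returns at most maxArraySize elements of a metadata array,
// mirroring the cap passed as LoadModel's second argument. Treating a
// non-positive cap as a conservative default is an assumption.
func readArray(values []string, maxArraySize int) []string {
	if maxArraySize <= 0 {
		maxArraySize = 1024
	}
	if len(values) > maxArraySize {
		return values[:maxArraySize]
	}
	return values
}

func main() {
	vocab := make([]string, 50000)           // e.g. a large tokenizer vocabulary
	fmt.Println(len(readArray(vocab, 1024))) // 1024
}
```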