mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 16:57:04 +00:00
Add support for new models and fix GitHub issues
- Add Gemma3n model support with text generation capabilities
- Add new CUDA mean operations for improved performance
- Add macOS documentation and performance tests
- Update LLAMA patches for ROCm/CUDA compatibility
- Fix various model conversion and processing issues
- Update CI workflows and build configurations
- Add library model tests and Shakespeare test data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2
server/internal/cache/blob/cache.go
vendored
2
server/internal/cache/blob/cache.go
vendored
@@ -59,7 +59,7 @@ type DiskCache struct {
|
||||
testHookBeforeFinalWrite func(f *os.File)
|
||||
}
|
||||
|
||||
// PutString is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))).
|
||||
// PutBytes is a convenience function for c.Put(d, bytes.NewReader([]byte(data)), int64(len(data))).
|
||||
func PutBytes[S string | []byte](c *DiskCache, d Digest, data S) error {
|
||||
return c.Put(d, bytes.NewReader([]byte(data)), int64(len(data)))
|
||||
}
|
||||
|
||||
@@ -231,6 +231,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
|
||||
|
||||
quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")
|
||||
|
||||
newType := fsggml.TensorType(t.Kind)
|
||||
if quantize {
|
||||
// get more optimal quantization type based on the tensor shape, layer, etc.
|
||||
|
||||
@@ -1404,6 +1404,9 @@ func (s *Server) PsHandler(c *gin.Context) {
|
||||
Details: modelDetails,
|
||||
ExpiresAt: v.expiresAt,
|
||||
}
|
||||
if v.Options != nil {
|
||||
mr.ContextLength = v.Options.NumCtx / v.numParallel
|
||||
}
|
||||
// The scheduler waits to set expiresAt, so if a model is loading it's
|
||||
// possible that it will be set to the unix epoch. For those cases, just
|
||||
// calculate the time w/ the sessionDuration instead.
|
||||
|
||||
@@ -57,9 +57,7 @@ type Scheduler struct {
|
||||
var defaultModelsPerGPU = 3
|
||||
|
||||
// Default automatic value for parallel setting
|
||||
// Model will still need to fit in VRAM. If this setting won't fit
|
||||
// we'll back off down to 1 to try to get it to fit
|
||||
var defaultParallel = 2
|
||||
var defaultParallel = 1
|
||||
|
||||
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
|
||||
|
||||
@@ -191,7 +189,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
}
|
||||
|
||||
// Load model for fitting
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
|
||||
if err != nil {
|
||||
pending.errCh <- err
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user