Add support for new models and fix GitHub issues

- Add Gemma3n model support with text generation capabilities
- Add new CUDA mean operations for improved performance
- Add macOS documentation and performance tests
- Update llama.cpp patches for ROCm/CUDA compatibility
- Fix various model conversion and processing issues
- Update CI workflows and build configurations
- Add library model tests and Shakespeare test data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date:   2025-07-20 00:12:36 +08:00
Parent: 1fa71c2670
Commit: cbcbc9ae07

81 changed files with 132316 additions and 747 deletions


@@ -57,9 +57,7 @@ type Scheduler struct {
 var defaultModelsPerGPU = 3

 // Default automatic value for parallel setting
-// Model will still need to fit in VRAM. If this setting won't fit
-// we'll back off down to 1 to try to get it to fit
-var defaultParallel = 2
+var defaultParallel = 1

 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
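
This hunk lowers the default parallel request count from 2 to 1 and drops the comment describing the VRAM back-off: if the requested parallel value will not fit, the scheduler retries with 1. A minimal sketch of that back-off idea follows — `fitsInVRAM` and `pickParallel` are hypothetical helpers for illustration, not the scheduler's actual API:

```go
package main

import "fmt"

// fitsInVRAM is a hypothetical stand-in for the scheduler's real fit check:
// it reports whether the model, replicated numParallel times, fits in VRAM.
func fitsInVRAM(modelSizeBytes, vramBytes uint64, numParallel int) bool {
	return modelSizeBytes*uint64(numParallel) <= vramBytes
}

// pickParallel mirrors the back-off the removed comment described: start at
// the default, and fall back to 1 if the default does not fit.
func pickParallel(modelSizeBytes, vramBytes uint64, defaultParallel int) int {
	if fitsInVRAM(modelSizeBytes, vramBytes, defaultParallel) {
		return defaultParallel
	}
	return 1
}

func main() {
	const gib = uint64(1 << 30)
	fmt.Println(pickParallel(5*gib, 8*gib, 2)) // 10 GiB > 8 GiB: back off to 1
	fmt.Println(pickParallel(3*gib, 8*gib, 2)) // 6 GiB <= 8 GiB: keep 2
}
```

Lowering the default to 1 presumably trades per-model request concurrency for a smaller default VRAM footprint.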
@@ -191,7 +189,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		}

 		// Load model for fitting
-		ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
+		ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
 		if err != nil {
 			pending.errCh <- err
 			break
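
This hunk changes the second argument of `llm.LoadModel` from 0 to 1024. If this fork keeps upstream Ollama's semantics, that parameter (`maxArraySize`) caps how many elements of each GGUF metadata array are decoded, with 0 meaning "use the default" and a negative value meaning "decode everything", so passing 1024 pins the cap explicitly. A hedged usage sketch; the import path and the semantics above are assumptions carried over from upstream:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llm" // assumes the upstream module path
)

func main() {
	// Decode only the GGUF header/metadata, capping each metadata array
	// (e.g. tokenizer tables) at 1024 elements. Upstream documents 0 as
	// "use the default" and negative as "decode all arrays"; this fork's
	// defaults may differ.
	ggml, err := llm.LoadModel("/path/to/model.gguf", 1024)
	if err != nil {
		log.Fatal(err)
	}
	_ = ggml // metadata alone is enough to estimate VRAM fit before loading weights
}
```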