Increase performance for Gemma3n models on NVIDIA GPUs by enabling CUDA Graph execution (#11525)

* Enable CUDA Graphs for gemma3n.

Similar to https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph from
llama.cpp, which requires different workaround checks.
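
For intuition, here is a minimal, self-contained Go sketch of the kind of
compatibility heuristic such workarounds feed into: before replaying a
previously captured CUDA graph, the backend checks that the freshly built
compute graph still matches the captured one. All types and names below are
hypothetical illustrations, not ollama's or ggml's actual API.

// Hypothetical sketch, not ollama's or ggml's actual API.
package main

import "fmt"

// node is a toy compute-graph node: an op name plus tensor dimensions.
type node struct {
	op    string
	shape [4]int
}

// canReplay reports whether a CUDA graph captured for prev is safe to replay
// for cur: same node count, same ops, same shapes. Any mismatch forces a
// re-capture or a fallback to regular stream execution. Ops that rewrite
// their inputs in place (like the old residual path in Predict below) need
// extra special-case checks here, which is what this commit removes.
func canReplay(prev, cur []node) bool {
	if len(prev) != len(cur) {
		return false
	}
	for i := range cur {
		if prev[i].op != cur[i].op || prev[i].shape != cur[i].shape {
			return false
		}
	}
	return true
}

func main() {
	prev := []node{{"MUL_MAT", [4]int{256, 256, 4, 1}}, {"ADD", [4]int{256, 256, 4, 1}}}
	cur := []node{{"MUL_MAT", [4]int{256, 256, 4, 1}}, {"ADD", [4]int{256, 256, 4, 1}}}
	fmt.Println(canReplay(prev, cur)) // true: replay the captured CUDA graph
}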

* Remove the residual check by reshaping differently in the gemma3n model.

This should make the heuristics more robust; see the worked
equivalence after the diff below.

Author:    Oliver Simons
Date:      2025-07-29 21:37:06 +02:00
Committer: GitHub
Parent:    c116a7523d
Commit:    ea85e27bbd

5 changed files with 67 additions and 10 deletions


@@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions
 	coefficients := a.PredictionCoefficient.Forward(ctx, modalities)
 	coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2))
 
-	hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-	predictions := coefficients.Mulmat(ctx, hiddenStates)
-	predictions = predictions.Add(ctx, hiddenStates)
-	return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx))
+	predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	return predictions.Add(ctx, hiddenStates)
 }
 
 func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor {
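
Why the reorder is safe, as a quick sanity check: write P for the
Permute(1, 2, 0, 3) reindexing, P⁻¹ for its inverse Permute(2, 0, 1, 3),
h for hiddenStates, and y = Mulmat(coefficients, P(h)). A permutation only
reindexes elements, so it distributes over the elementwise Add:

    P⁻¹(y + P(h)) = P⁻¹(y) + P⁻¹(P(h)) = P⁻¹(y) + h

The left-hand side is the removed code (add the residual in the permuted
layout, then permute back); the right-hand side is the new code (permute
back first, then add h in its original layout). The outputs are identical,
but on the new path hiddenStates is never reassigned, so the residual
connection keeps a stable tensor layout and the CUDA Graph heuristics no
longer need a special-case residual check.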