mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-09 23:37:06 +00:00
Increase performance for Gemma3n models on NVIDIA GPUs by enabling CUDA Graph execution (#11525)
* Enable CUDA Graphs for gemma3n. Similar to https://github.com/ggml-org/llama.cpp/pull/14741, though ollama has a slightly different model graph than llama.cpp, which requires different workaround checks.
* Remove residual check by reshaping differently in gemma3n model. This should make the heuristics more robust.
This commit is contained in:
@@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions
|
||||
coefficients := a.PredictionCoefficient.Forward(ctx, modalities)
|
||||
coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2))
|
||||
|
||||
hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
predictions := coefficients.Mulmat(ctx, hiddenStates)
|
||||
predictions = predictions.Add(ctx, hiddenStates)
|
||||
return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
|
||||
predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx))
|
||||
predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
|
||||
return predictions.Add(ctx, hiddenStates)
|
||||
}
|
||||
|
||||
func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor {
|
||||
|
||||
Reference in New Issue
Block a user