feat: qwen3 dense and sparse models (#10708)

* feat: qwen3 dense
* feat: qwen3moe
* fix llama4 moe
This commit is contained in:
Michael Yang
2025-05-21 10:21:07 -07:00
committed by GitHub
parent 139f84cf21
commit e0ed984cde
5 changed files with 258 additions and 1 deletions

View File

@@ -82,7 +82,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
-nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
+nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
}
return nextStates