From 69b2fe9282323a57cd3557bed9b598b465d1b3a6 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 21 May 2025 09:39:20 -0700
Subject: [PATCH] fix: qwen25vl assign samebatch in multimodal input (#10789)

Setting SameBatch on the vision start token is problematic because that
token is shared with other inputs that also use images. This causes the
input to be cached, so the runner never sees SameBatch; the value may
also be incorrect, since it can belong to a different image.

Assigning SameBatch to the image token resolves this by ensuring it is
attached to the input that actually corresponds to the image. Not
setting SameBatch correctly may cause panics during inference, since
images are no longer guaranteed to be in the same batch.
---
 model/models/qwen25vl/model.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index 7de9b6eb..32cca560 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -121,13 +121,14 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 		patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
 
 		// First add the vision start token
-		result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 1})
+		result = append(result, input.Input{Token: visionStartToken})
 
 		// Add the image token with the multimodal tensor data at the first position
 		result = append(result, input.Input{
 			Token:          imageToken,
 			Multimodal:     inp.Multimodal,
 			MultimodalHash: inp.MultimodalHash,
+			SameBatch:      patchesPerChunk,
 		})
 
 		// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
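
For reference, below is a minimal, self-contained sketch (not the actual ollama code) of how the token sequence for one image chunk is assembled after this change: SameBatch is carried by the image token itself rather than by the preceding vision start token. The simplified Input struct, the placeholderToken parameter, and the numeric token IDs are illustrative stand-ins for input.Input and the model's real special tokens.

package main

import "fmt"

// Input is a simplified stand-in for ollama's input.Input, keeping only the
// fields relevant to this patch.
type Input struct {
	Token          int32
	Multimodal     any // image tensor data in the real code
	MultimodalHash uint64
	SameBatch      int // roughly: how many following inputs should land in the same batch
}

// buildImageInputs mirrors the post-fix shape of PostTokenize for a single
// image chunk: the vision start token carries no SameBatch, while the image
// token requests that its patchesPerChunk positions stay in one batch.
func buildImageInputs(visionStartToken, imageToken, placeholderToken int32, patchesPerChunk int, hash uint64, tensor any) []Input {
	var result []Input

	// Vision start token: no SameBatch, so caching and sharing it across
	// prompts that contain images is harmless.
	result = append(result, Input{Token: visionStartToken})

	// Image token carries the tensor data and SameBatch, tying the batching
	// requirement to the input that actually corresponds to this image.
	result = append(result, Input{
		Token:          imageToken,
		Multimodal:     tensor,
		MultimodalHash: hash,
		SameBatch:      patchesPerChunk,
	})

	// Placeholder tokens for the remaining positions (in the real model these
	// may simply be repeated image/pad tokens).
	for i := 1; i < patchesPerChunk; i++ {
		result = append(result, Input{Token: placeholderToken})
	}

	return result
}

func main() {
	// Arbitrary token IDs and patch count, purely for illustration.
	for i, in := range buildImageInputs(1001, 1002, 1002, 4, 0xfeed, nil) {
		fmt.Printf("%d: token=%d sameBatch=%d\n", i, in.Token, in.SameBatch)
	}
}

Keeping SameBatch off the shared vision start token means a cached copy of that token no longer hides the batching hint from the runner, which is the failure mode the commit message describes.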