connect vision to text

This commit is contained in:
Michael Yang
2025-04-17 15:46:55 -07:00
committed by Michael Yang
parent 178761aef3
commit 470af8ab89
2 changed files with 80 additions and 4 deletions

View File

@@ -195,7 +195,17 @@ func newTextModel(c fs.Config) *TextModel {
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
-hiddenStates := m.TokenEmbedding.Forward(ctx, inputs)
+hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
+for _, mi := range batch.Multimodal {
+f32s := mi.Multimodal.(*chunk).floats()
+img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
+if err != nil {
+panic(err)
+}
+ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
+}
for i, layer := range m.Layers {
cache.SetLayer(i)