Merge pull request #5473 from ollama/mxyng/environ

fix: environ lookup
This commit is contained in:
Michael Yang
2024-07-31 10:18:05 -07:00
committed by GitHub
27 changed files with 547 additions and 515 deletions

View File

@@ -45,14 +45,7 @@ func TestUnicodeModelDir(t *testing.T) {
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
oldModelsDir := os.Getenv("OLLAMA_MODELS")
if oldModelsDir == "" {
defer os.Unsetenv("OLLAMA_MODELS")
} else {
defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
}
err = os.Setenv("OLLAMA_MODELS", modelDir)
require.NoError(t, err)
t.Setenv("OLLAMA_MODELS", modelDir)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()

View File

@@ -5,14 +5,16 @@ package integration
import (
"context"
"log/slog"
"os"
"strconv"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
func TestMultiModelConcurrency(t *testing.T) {
@@ -106,13 +108,16 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if vram == "" {
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
max, err := strconv.ParseUint(vram, 10, 64)
require.NoError(t, err)
const MB = uint64(1024 * 1024)
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
@@ -121,83 +126,82 @@ func TestMultiModelStress(t *testing.T) {
smallModels := []model{
{
name: "orca-mini",
size: 2992 * MB,
size: 2992 * format.MebiByte,
},
{
name: "phi",
size: 2616 * MB,
size: 2616 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * MB,
size: 2364 * format.MebiByte,
},
{
name: "stable-code:3b",
size: 2608 * MB,
size: 2608 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * MB,
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "llama2",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "orca-mini:7b",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * MB,
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
// TODO - uncomment this once #3565 is merged and this is rebased on it
// {
// name: "codellama:7b",
// size: 5118 * MB,
// },
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * MB,
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * MB,
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case max < 10000*MB:
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case max < 30000*MB:
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
@@ -226,15 +230,15 @@ func TestMultiModelStress(t *testing.T) {
}
var wg sync.WaitGroup
consumed := uint64(256 * MB) // Assume some baseline usage
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
if i > 1 && consumed > max {
slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
if i > 1 && consumed > vram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {

View File

@@ -5,7 +5,6 @@ package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
@@ -14,8 +13,10 @@ import (
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)
func TestMaxQueue(t *testing.T) {
@@ -27,13 +28,10 @@ func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
threadCount = maxQueue
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
}
req := api.GenerateRequest{