wip go engine

Co-authored-by: Patrick Devine <pdevine@sonic.net>

Author: Jeffrey Morgan
Date: 2023-07-03 15:22:44 -04:00
commit 76cb60d496 (parent 172274b809)
39 changed files with 1377 additions and 1 deletions


@@ -0,0 +1,9 @@
from ollama.model import models
from ollama.engine import generate, load, unload

__all__ = [
    'models',
    'generate',
    'load',
    'unload',
]
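
For orientation, a minimal sketch of how the package-level API re-exported above could be used once the engine and model modules below are in place; the model name "orca" and the prompt are illustrative assumptions, not part of this commit.

import ollama

# list models already cached under ~/.ollama/models
for name in ollama.models():
    print(name)

# stream a completion; each output dict carries choices[0]["text"]
for output in ollama.generate("orca", "Why is the sky blue?"):
    print(output["choices"][0]["text"], end="", flush=True)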


@@ -0,0 +1,4 @@
from ollama.cmd import cli

if __name__ == '__main__':
    cli.main()


python/ollama/cmd/cli.py Normal file

@@ -0,0 +1,192 @@
import os
import sys
from argparse import ArgumentParser, HelpFormatter, PARSER

from yaspin import yaspin

from ollama import model, engine
from ollama.cmd import server


class CustomHelpFormatter(HelpFormatter):
    """
    This class is used to customize the way the argparse help text is displayed.
    We specifically override the _format_action method to exclude the line that
    shows all the subparser command options in the help text. This line is typically
    in the form "{serve,models,pull,run}".
    """

    def _format_action(self, action):
        # get the original help text
        parts = super()._format_action(action)
        if action.nargs == PARSER:
            # remove the unwanted first line
            parts = "\n".join(parts.split("\n")[1:])
        return parts
def main():
    parser = ArgumentParser(
        description='Ollama: Run any large language model on any machine.',
        formatter_class=CustomHelpFormatter,
    )

    # create the models home if it doesn't exist
    os.makedirs(model.MODELS_CACHE_PATH, exist_ok=True)

    subparsers = parser.add_subparsers(
        title='commands',
    )

    list_parser = subparsers.add_parser(
        "models",
        description="List all available models stored locally.",
        help="List all available models stored locally.",
    )
    list_parser.set_defaults(fn=list_models)

    search_parser = subparsers.add_parser(
        "search",
        description="Search for compatible models that Ollama can run.",
        help="Search for compatible models that Ollama can run. Usage: search [model]",
    )
    search_parser.add_argument(
        "query",
        nargs="?",
        help="Optional name of the model to search for.",
    )
    search_parser.set_defaults(fn=search)

    pull_parser = subparsers.add_parser(
        "pull",
        description="Download a specified model from a remote source.",
        help="Download a specified model from a remote source. Usage: pull [model]",
    )
    pull_parser.add_argument("model", help="Name of the model to download.")
    pull_parser.set_defaults(fn=pull)

    run_parser = subparsers.add_parser(
        "run",
        description="Run a model and submit prompts.",
        help="Run a model and submit prompts. Usage: run [model] [prompt]",
    )
    run_parser.add_argument("model", help="Name of the model to run.")
    run_parser.add_argument(
        "prompt",
        nargs="?",
        help="Optional prompt for the model; interactive mode is enabled when not specified.",
    )
    run_parser.set_defaults(fn=run)

    server.set_parser(
        subparsers.add_parser(
            "serve",
            description="Start a persistent server to interact with models via the API.",
            help="Start a persistent server to interact with models via the API.",
        )
    )

    args = parser.parse_args()
    args = vars(args)

    try:
        fn = args.pop("fn")
        fn(**args)
    except KeyboardInterrupt:
        pass
    except KeyError:
        # no subcommand was provided
        parser.print_help()
    except Exception as e:
        print(e)
def list_models(*args, **kwargs):
    for m in model.models(*args, **kwargs):
        print(m)


def generate(*args, **kwargs):
    if prompt := kwargs.get("prompt"):
        print(">>>", prompt, flush=True)
        generate_oneshot(*args, **kwargs)
        return

    if sys.stdin.isatty():
        return generate_interactive(*args, **kwargs)

    return generate_batch(*args, **kwargs)


def generate_oneshot(*args, **kwargs):
    print(flush=True)

    spinner = yaspin()
    spinner.start()
    spinner_running = True
    try:
        for output in engine.generate(model_name=kwargs.pop('model'), *args, **kwargs):
            choices = output.get("choices", [])
            if len(choices) > 0:
                if spinner_running:
                    spinner.stop()
                    spinner_running = False
                    print("\r", end="")  # move cursor back to beginning of line again
                print(choices[0].get("text", ""), end="", flush=True)
    except Exception:
        spinner.stop()
        raise

    # end with a new line
    print(flush=True)
    print(flush=True)
def generate_interactive(*args, **kwargs):
    while True:
        print(">>> ", end="", flush=True)
        line = sys.stdin.readline()
        if not line:
            # EOF (e.g. Ctrl-D): exit the prompt loop cleanly
            return

        kwargs.update({"prompt": line})
        generate_oneshot(*args, **kwargs)


def generate_batch(*args, **kwargs):
    for line in sys.stdin:
        print(">>> ", line, end="", flush=True)
        kwargs.update({"prompt": line})
        generate_oneshot(*args, **kwargs)
def search(*args, **kwargs):
    try:
        model_names = model.search_directory(*args, **kwargs)
        if len(model_names) == 0:
            print("No models found.")
            return
        elif len(model_names) == 1:
            print(f"Found {len(model_names)} available model:")
        else:
            print(f"Found {len(model_names)} available models:")

        for model_name in model_names:
            print(model_name.lower())
    except Exception:
        print("Failed to fetch available models, check your network connection")


def pull(*args, **kwargs):
    try:
        model.pull(model_name=kwargs.pop('model'), *args, **kwargs)
        print("Up to date.")
    except Exception as e:
        print(f"An error occurred: {e}")


def run(*args, **kwargs):
    try:
        name = model.pull(model_name=kwargs.pop('model'), *args, **kwargs)
        kwargs.update({"model": name})
        print(f"Running {name}...")
        generate(*args, **kwargs)
    except Exception as e:
        print(f"An error occurred: {e}")


@@ -0,0 +1,94 @@
import json

import aiohttp_cors
from aiohttp import web

from ollama import engine


def set_parser(parser):
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", default=7734)
    parser.set_defaults(fn=serve)


def serve(*args, **kwargs):
    app = web.Application()

    cors = aiohttp_cors.setup(
        app,
        defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_headers="*",
            )
        },
    )

    app.add_routes(
        [
            web.post("/load", load),
            web.post("/unload", unload),
            web.post("/generate", generate),
        ]
    )

    for route in app.router.routes():
        cors.add(route)

    app.update(
        {
            "models": {},
        }
    )

    web.run_app(app, **kwargs)


async def load(request):
    body = await request.json()

    name = body.get("model")
    if not name:
        raise web.HTTPBadRequest()

    kwargs = {
        "models": request.app.get("models"),
    }

    engine.load(name, **kwargs)
    return web.Response()


async def unload(request):
    body = await request.json()

    name = body.get("model")
    if not name:
        raise web.HTTPBadRequest()

    engine.unload(name, models=request.app.get("models"))
    return web.Response()


async def generate(request):
    body = await request.json()

    name = body.get("model")
    if not name:
        raise web.HTTPBadRequest()

    prompt = body.get("prompt")
    if not prompt:
        raise web.HTTPBadRequest()

    response = web.StreamResponse()
    await response.prepare(request)

    kwargs = {
        "models": request.app.get("models"),
    }

    for output in engine.generate(name, prompt, **kwargs):
        output = json.dumps(output).encode('utf-8')
        await response.write(output)
        await response.write(b"\n")

    return response
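
To illustrate the API this server exposes, a minimal client sketch under stated assumptions: the server is running with the default host and port above, "orca" resolves to a locally cached model, and the requests library (already used by the model module below) is available.

import json

import requests

# /generate streams newline-delimited JSON objects, one per generated chunk
response = requests.post(
    "http://127.0.0.1:7734/generate",
    json={"model": "orca", "prompt": "Why is the sky blue?"},
    stream=True,
)
for line in response.iter_lines():
    if line:
        output = json.loads(line)
        print(output["choices"][0]["text"], end="", flush=True)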

python/ollama/engine.py Normal file

@@ -0,0 +1,121 @@
import os
import sys
from os import path
from contextlib import contextmanager

from thefuzz import process
from llama_cpp import Llama
from ctransformers import AutoModelForCausalLM

import ollama.prompt
from ollama.model import MODELS_CACHE_PATH


@contextmanager
def suppress(file):
    # temporarily redirect a file descriptor (stdout/stderr) to /dev/null
    original = os.dup(file.fileno())
    with open(os.devnull, "w") as devnull:
        os.dup2(devnull.fileno(), file.fileno())
        try:
            yield
        finally:
            # restore the original descriptor even if the body raises
            os.dup2(original, file.fileno())
            os.close(original)
def generate(model_name, prompt, models={}, *args, **kwargs):
    model = load(model_name, models=models)
    inputs = ollama.prompt.template(model_name, prompt)
    return model.generate(inputs, *args, **kwargs)


def load(model_name, models={}):
    if not models.get(model_name, None):
        model_path = path.expanduser(model_name)
        if not path.exists(model_path):
            model_path = str(MODELS_CACHE_PATH / (model_name + ".bin"))

        runners = {
            model_type: cls
            for cls in [LlamaCppRunner, CtransformerRunner]
            for model_type in cls.model_types()
        }

        # fuzzy-match the model path against known model types to pick a runner
        for match, _ in process.extract(model_path, runners.keys(), limit=len(runners)):
            try:
                model = runners.get(match)
                runner = model(model_path, match)
                models.update({model_name: runner})
                return runner
            except Exception:
                pass

        raise Exception("failed to load model", model_path, model_name)

    # already loaded: return the cached runner
    return models.get(model_name)


def unload(model_name, models={}):
    if model_name in models:
        models.pop(model_name)
class LlamaCppRunner:
    def __init__(self, model_path, model_type):
        try:
            with suppress(sys.stderr), suppress(sys.stdout):
                self.model = Llama(model_path, verbose=False, n_gpu_layers=1, seed=-1)
        except Exception:
            raise Exception("Failed to load model", model_path, model_type)

    @staticmethod
    def model_types():
        return [
            'llama',
            'orca',
            'vicuna',
            'ultralm',
        ]

    def generate(self, prompt, *args, **kwargs):
        if "max_tokens" not in kwargs:
            kwargs.update({"max_tokens": 512})

        if "stop" not in kwargs:
            kwargs.update({"stop": ["Q:"]})

        if "stream" not in kwargs:
            kwargs.update({"stream": True})

        with suppress(sys.stderr):
            for output in self.model(prompt, *args, **kwargs):
                yield output
class CtransformerRunner:
    def __init__(self, model_path, model_type):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, model_type=model_type, local_files_only=True
        )

    @staticmethod
    def model_types():
        return [
            'falcon',
            'mpt',
            'starcoder',
        ]

    def generate(self, prompt, *args, **kwargs):
        if "max_new_tokens" not in kwargs:
            kwargs.update({"max_new_tokens": 512})

        if "stop" not in kwargs:
            kwargs.update({"stop": ["User"]})

        if "stream" not in kwargs:
            kwargs.update({"stream": True})

        for output in self.model(prompt, *args, **kwargs):
            yield {
                'choices': [
                    {
                        'text': output,
                    },
                ],
            }
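
A minimal sketch of driving the engine module directly, bypassing the CLI and server; the model name is an illustrative assumption and must resolve to a file in the models cache or to a local path.

from ollama import engine

models = {}  # cache of loaded runners, keyed by model name
engine.load("orca", models=models)
for output in engine.generate("orca", "Why is the sky blue?", models=models):
    print(output["choices"][0]["text"], end="", flush=True)
engine.unload("orca", models=models)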

python/ollama/model.py Normal file

@@ -0,0 +1,157 @@
import requests
import validators
from pathlib import Path
from os import path, walk
from urllib.parse import urlsplit, urlunsplit
from tqdm import tqdm

MODELS_MANIFEST = 'https://ollama.ai/api/models'
MODELS_CACHE_PATH = Path.home() / '.ollama' / 'models'


def models(*args, **kwargs):
    for _, _, files in walk(MODELS_CACHE_PATH):
        for file in files:
            base, ext = path.splitext(file)
            if ext == '.bin':
                yield base


# search the directory and return all models which contain the search term as a substring,
# or all models if no search term is provided
def search_directory(query):
    response = requests.get(MODELS_MANIFEST)
    response.raise_for_status()
    directory = response.json()

    model_names = []
    for model_info in directory:
        if not query or query.lower() in model_info.get('name', '').lower():
            model_names.append(model_info.get('name'))

    return model_names


# get the url of the model from our curated directory
def get_url_from_directory(model):
    response = requests.get(MODELS_MANIFEST)
    response.raise_for_status()
    directory = response.json()
    for model_info in directory:
        if model_info.get('name').lower() == model.lower():
            return model_info.get('url')
    return model
def download_from_repo(url, file_name):
    parts = urlsplit(url)
    path_parts = parts.path.split('/tree/')

    if len(path_parts) == 1:
        location = path_parts[0]
        branch = 'main'
    else:
        location, branch = path_parts

    location = location.strip('/')
    if file_name == '':
        file_name = path.basename(location).lower()

    download_url = urlunsplit(
        (
            'https',
            parts.netloc,
            f'/api/models/{location}/tree/{branch}',
            parts.query,
            parts.fragment,
        )
    )
    response = requests.get(download_url)
    response.raise_for_status()
    json_response = response.json()

    download_url, file_size = find_bin_file(json_response, location, branch)
    return download_file(download_url, file_name, file_size)


def find_bin_file(json_response, location, branch):
    download_url = None
    file_size = 0
    for file_info in json_response:
        if file_info.get('type') == 'file' and file_info.get('path').endswith('.bin'):
            f_path = file_info.get('path')
            download_url = (
                f'https://huggingface.co/{location}/resolve/{branch}/{f_path}'
            )
            file_size = file_info.get('size')

    if download_url is None:
        raise Exception('No model found')

    return download_url, file_size
def download_file(download_url, file_name, file_size):
    local_filename = MODELS_CACHE_PATH / str(file_name + '.bin')

    first_byte = path.getsize(local_filename) if path.exists(local_filename) else 0

    if first_byte >= file_size:
        return local_filename

    print(f'Pulling {file_name}...')

    header = {'Range': f'bytes={first_byte}-'} if first_byte != 0 else {}

    response = requests.get(download_url, headers=header, stream=True)
    response.raise_for_status()

    total_size = int(response.headers.get('content-length', 0)) + first_byte

    with open(local_filename, 'ab' if first_byte else 'wb') as file, tqdm(
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
        initial=first_byte,
        ascii=' ==',
        bar_format='Downloading [{bar}] {percentage:3.2f}% {rate_fmt}{postfix}',
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)

    return local_filename
def pull(model_name, *args, **kwargs):
    # check the remote model location and see if it needs to be downloaded
    url = model_name
    file_name = ""

    if not validators.url(url) and not url.startswith('huggingface.co'):
        try:
            url = get_url_from_directory(model_name)
        except Exception:
            # may not have been able to check the remote directory, return now
            return model_name

        if url is model_name:
            # this is not a model from our directory, so we can't check the remote
            maybe_existing_model_location = MODELS_CACHE_PATH / str(model_name + '.bin')
            if path.exists(model_name) or path.exists(maybe_existing_model_location):
                # a file on the filesystem is being specified
                return model_name
            raise Exception("unknown model")
        else:
            # this is a model from our directory, check the remote
            file_name = model_name

    if not (url.startswith('http://') or url.startswith('https://')):
        url = f'https://{url}'

    if not validators.url(url):
        if model_name in models(MODELS_CACHE_PATH):
            # the model is already downloaded, and specified by name
            return model_name
        raise Exception(f'Unknown model {model_name}')

    local_filename = download_from_repo(url, file_name)

    return local_filename
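
For completeness, a minimal sketch of the pull/list flow implemented above; "orca" is an illustrative assumption and must exist in the remote directory at MODELS_MANIFEST, be a Hugging Face repo URL, or already be a local .bin file.

from ollama import model

local_path = model.pull("orca")  # downloads into ~/.ollama/models when needed
print(local_path)

for name in model.models():  # base names of every cached *.bin file
    print(name)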

python/ollama/prompt.py Normal file

@@ -0,0 +1,12 @@
from os import path
from difflib import get_close_matches

from jinja2 import Environment, PackageLoader


def template(name, prompt):
    environment = Environment(loader=PackageLoader(__name__, 'templates'))
    best_templates = get_close_matches(
        path.basename(name), environment.list_templates(), n=1, cutoff=0
    )
    template = environment.get_template(best_templates.pop())
    return template.render(prompt=prompt)
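
A short sketch of how the template lookup above behaves: the closest-named template is chosen with get_close_matches, so a model file such as ~/.ollama/models/orca.bin would pick up an orca template if one is shipped in the templates directory (an assumption here).

from ollama import prompt

rendered = prompt.template("orca", "Why is the sky blue?")
print(rendered)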


@@ -0,0 +1,8 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{ prompt }}
### Response:


@@ -0,0 +1,3 @@
A helpful assistant who helps the user with any questions asked.
User: {{ prompt }}
Assistant:


@@ -0,0 +1,5 @@
### Instruction:
{{ prompt }}
### Response:


@@ -0,0 +1,5 @@
### Instruction:
{{ prompt }}
### Response:


@@ -0,0 +1,4 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request. Be concise. Once the request is completed, include no other text.
### Instruction:
{{ prompt }}
### Response:


@@ -0,0 +1 @@
{{ prompt }}


@@ -0,0 +1,7 @@
### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
{{ prompt }}
### Response:


@@ -0,0 +1,2 @@
### Human: {{ prompt }}
### Assistant:


@@ -0,0 +1,4 @@
{{ prompt }}


@@ -0,0 +1,2 @@
USER: {{ prompt }}
ASSISTANT:


@@ -0,0 +1,4 @@
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
USER: {{ prompt }}
ASSISTANT:


@@ -0,0 +1,5 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request
### Instruction: {{ prompt }}
### Response:


@@ -0,0 +1,2 @@
{{ prompt }}
### Response: