Convert ollama to the new model

2025-12-17 07:22:35 +00:00 · 2024-11-17 15:19:55 -08:00 · 2024-11-17 15:19:55 -08:00 · a061f3f8c1
commit a061f3f8c1
parent 028530546f
14 changed files with 379 additions and 113 deletions
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -4,62 +4,19 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import List
+from typing import Any, Dict

 from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig


 DEFAULT_OLLAMA_PORT = 11434


 class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
+    port: int

    @classmethod
-    def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
-        return [
-            DockerComposeServiceConfig(
-                service_name="ollama",
-                image="ollama/ollama:latest",
-                volumes=["$HOME/.ollama:/root/.ollama"],
-                devices=["nvidia.com/gpu=all"],
-                deploy={
-                    "resources": {
-                        "reservations": {
-                            "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                        }
-                    }
-                },
-                runtime="nvidia",
-                ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-                healthcheck={
-                    "test": ["CMD", "curl", "-f", "http://ollama:11434"],
-                    "interval": "10s",
-                    "timeout": "5s",
-                    "retries": 5,
-                },
-            ),
-            DockerComposeServiceConfig(
-                service_name="ollama-init",
-                image="ollama/ollama",
-                depends_on={"ollama": {"condition": "service_healthy"}},
-                environment={
-                    "OLLAMA_HOST": "ollama",
-                    "OLLAMA_MODELS": "${OLLAMA_MODELS}",
-                },
-                volumes=["ollama_data:/root/.ollama"],
-                entrypoint=(
-                    'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
-                    "until curl -s http://ollama:11434 > /dev/null; do"
-                    "attempt=$((attempt + 1));"
-                    "if [ $attempt -ge $max_attempts ]; then"
-                    'echo "Timeout waiting for Ollama server";'
-                    "exit 1;"
-                    "fi;"
-                    'echo "Attempt $attempt: Server not ready yet...";'
-                    "sleep 5;"
-                    "done'"
-                ),
-            ),
-        ]
+    def sample_run_config(
+        cls, port_str: str = str(DEFAULT_OLLAMA_PORT)
+    ) -> Dict[str, Any]:
+        """Build a sample run-config mapping for the Ollama provider.
+
+        `port_str` is stored unchanged under the "port" key. It defaults to
+        DEFAULT_OLLAMA_PORT rendered as a string, and callers may pass any
+        other string instead.
+        """
+        sample_config: Dict[str, Any] = {"port": port_str}
+        return sample_config
--- a/llama_stack/templates/ollama/__init__.py
+++ b/llama_stack/templates/ollama/__init__.py
@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .ollama import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@ -1,12 +1,19 @@
+version: '2'
 name: ollama
 distribution_spec:
-  description: Use ollama for running LLM inference
+  description: Use (an external) Ollama server for running LLM inference
+  docker_image: llamastack/distribution-ollama:test-0.0.52rc3
  providers:
-    inference: remote::ollama
+    inference:
+    - remote::ollama
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@ -0,0 +1,131 @@
+# Ollama Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
+
+{%- if docker_compose_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in docker_compose_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{%- if default_models %}
+### Models
+
+The following models are configured by default:
+{% for model in default_models %}
+- `{{ model.model_id }}`
+{% endfor %}
+{% endif %}
+
+## Using Docker Compose
+
+You can use `docker compose` to start an Ollama server and connect it to the Llama Stack server in a single command.
+
+```bash
+$ cd distributions/{{ name }}; docker compose up
+```
+
+You will see output similar to the following:
+```bash
+[ollama]               | [GIN] 2024/10/18 - 21:19:41 | 200 |     226.841µs |             ::1 | GET      "/api/ps"
+[ollama]               | [GIN] 2024/10/18 - 21:19:42 | 200 |      60.908µs |             ::1 | GET      "/api/ps"
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+[llamastack] | Resolved 12 providers
+[llamastack] |  inner-inference => ollama0
+[llamastack] |  models => __routing_table__
+[llamastack] |  inference => __autorouted__
+```
+
+To kill the server
+```bash
+docker compose down
+```
+
+## Starting Ollama and Llama Stack separately
+
+If you wish to spin up an Ollama server separately and connect it to Llama Stack, use the following commands.
+
+#### Start Ollama server
+- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
+
+**Via Docker**
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+**Via CLI**
+```bash
+ollama run <model_id>
+```
+
+#### Start Llama Stack server pointing to Ollama server
+
+**Via Conda**
+
+```bash
+llama stack build --template ollama --image-type conda
+llama stack run run.yaml
+```
+
+**Via Docker**
+```
+docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
+```
+
+Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
+```yaml
+inference:
+  - provider_id: ollama0
+    provider_type: remote::ollama
+    config:
+      url: http://127.0.0.1:11434
+```
+
+### (Optional) Update Model Serving Configuration
+
+#### Downloading model via Ollama
+
+You can use ollama for managing model downloads.
+
+```bash
+ollama pull llama3.1:8b-instruct-fp16
+ollama pull llama3.1:70b-instruct-fp16
+```
+
+> [!NOTE]
+> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py) for the supported Ollama models.
+
+
+To serve a new model with `ollama`
+```bash
+ollama run <model_name>
+```
+
+To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
+```
+$ ollama ps
+
+NAME                         ID              SIZE     PROCESSOR    UNTIL
+llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
+```
+
+To verify that the model served by ollama is correctly connected to Llama Stack server
+```bash
+$ llama-stack-client models list
+----------------------+----------------------+---------------+-----------------------------------------------+
+| identifier           | llama_model          | provider_id   | metadata                                      |
+======================+======================+===============+===============================================+
+| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
+----------------------+----------------------+---------------+-----------------------------------------------+
+```
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    """Build the distribution template for the Ollama distribution.
+
+    The template declares one inference provider (id "ollama", type
+    "remote::ollama") whose port is read from ${env.OLLAMA_PORT}, and two
+    run configurations:
+
+    * "run.yaml" registers only the inference model.
+    * "run-with-safety.yaml" registers the inference and safety models and
+      a shield named after ${env.SAFETY_MODEL}.
+
+    Returns:
+        The DistributionTemplate describing providers, default models, run
+        configs and the documented environment variables.
+    """
+    providers = {
+        "inference": ["remote::ollama"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="ollama",
+        provider_type="remote::ollama",
+        config=OllamaImplConfig.sample_run_config(
+            port_str="${env.OLLAMA_PORT}",
+        ),
+    )
+
+    # Both models are served by the single Ollama provider declared above, so
+    # their provider_id must match that provider's id exactly; no provider
+    # named "ollama-inference" or "ollama-safety" is declared here.
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id=inference_provider.provider_id,
+    )
+    safety_model = ModelInput(
+        model_id="${env.SAFETY_MODEL}",
+        provider_id=inference_provider.provider_id,
+    )
+
+    return DistributionTemplate(
+        name="ollama",
+        distro_type="self_hosted",
+        description="Use (an external) Ollama server for running LLM inference",
+        docker_image="llamastack/distribution-ollama:test-0.0.52rc3",
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=[inference_model, safety_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+            "run-with-safety.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [
+                        inference_provider,
+                    ]
+                },
+                default_models=[
+                    inference_model,
+                    safety_model,
+                ],
+                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+            ),
+        },
+        docker_compose_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the Ollama server",
+            ),
+            # 11434 is the port an Ollama server listens on by default.
+            "OLLAMA_PORT": (
+                "11434",
+                "Port of the Ollama server",
+            ),
+            "SAFETY_MODEL": (
+                "meta-llama/Llama-Guard-3-1B",
+                "Name of the safety (Llama-Guard) model to use",
+            ),
+        },
+    )
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@ -2,7 +2,7 @@ version: '2'
 name: tgi
 distribution_spec:
  description: Use (an external) TGI server for running LLM inference
-  docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
+  docker_image: llamastack/distribution-tgi:test-0.0.52rc3
  providers:
    inference:
    - remote::tgi
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
        name="tgi",
        distro_type="self_hosted",
        description="Use (an external) TGI server for running LLM inference",
-        docker_image="llamastack/distribution-remote-tgi:test-0.0.52rc3",
+        docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],