Convert ollama to the new model

Ashwin Bharambe 2024-11-17 15:19:55 -08:00
parent 028530546f
commit a061f3f8c1
14 changed files with 379 additions and 113 deletions


@@ -0,0 +1,63 @@
version: '2'
built_at: 2024-11-17 15:19:07.405618
image_name: ollama
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
conda_env: null
apis:
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: ollama
provider_type: remote::ollama
config:
port: ${env.OLLAMA_PORT}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: ollama-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
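
The `${env.VAR}` and `${env.VAR:default}` references in this file are resolved from the environment when the stack starts, with the part after the `:` serving as a fallback. As a rough sketch, the configuration above could be launched as follows, using the default values documented further down in this commit; the file name `run-with-safety.yaml` is assumed from the template code later in this diff:

```bash
# Illustrative values: these are the defaults listed in the distribution docs below.
OLLAMA_PORT=14343 \
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llama stack run run-with-safety.yaml
```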


@@ -1,14 +1,12 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
built_at: 2024-11-17 15:19:07.395495
image_name: ollama
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
conda_env: null
apis:
- shields
- telemetry
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
@@ -16,32 +14,42 @@ providers:
- provider_id: ollama
provider_type: remote::ollama
config:
url: ${env.LLAMA_INFERENCE_OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
excluded_categories: []
port: ${env.OLLAMA_PORT}
memory:
- provider_id: meta0
provider_type: inline::meta-reference
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:/home/ashwin/.llama/runtime}/kvstore.db
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
models:
- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:55.487270
built_at: 2024-11-17 15:19:07.405727
image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: vllm-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:55.476058
built_at: 2024-11-17 15:19:07.395327
image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: vllm-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:56.991119
built_at: 2024-11-17 15:19:09.184709
image_name: tgi
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: tgi-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:56.975663
built_at: 2024-11-17 15:19:09.156305
image_name: tgi
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: tgi-inference


@@ -2,33 +2,40 @@
The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
| **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
Provider Configuration
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ API       ┃ Provider(s)                                             ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ agents    │ `inline::meta-reference`                                │
│ inference │ `remote::ollama`                                        │
│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
│ safety    │ `inline::llama-guard`                                   │
│ telemetry │ `inline::meta-reference`                                │
└───────────┴─────────────────────────────────────────────────────────┘
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `OLLAMA_PORT`: Port of the Ollama server (default: `14343`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
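For example, one way to provide these before starting the distribution (a sketch using the default values listed above; adjust them to your setup):
```bash
export LLAMASTACK_PORT=5001
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export OLLAMA_PORT=14343
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```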
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose
You can use `docker compose` to start an Ollama server and connect it with the Llama Stack server in a single command.
### Docker: Start the Distribution (Single Node, regular desktop machine)
> [!NOTE]
> This will start an Ollama server in CPU-only mode; please see the [Ollama Documentation](https://github.com/ollama/ollama) for more on serving models on CPU.
```bash
$ cd distributions/ollama; docker compose up
```
### Docker: Start the Distribution (Single Node with NVIDIA GPUs)
> [!NOTE]
> This assumes you have a GPU available, and will start an Ollama server with access to that GPU.
```bash
$ cd distributions/ollama-gpu; docker compose up
```
You will see output similar to the following:
```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
@@ -71,7 +78,7 @@ ollama run <model_id>
```bash
llama stack build --template ollama --image-type conda
llama stack run ./gpu/run.yaml
llama stack run run.yaml
```
**Via Docker**


@@ -4,62 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from typing import Any, Dict
from llama_stack.distribution.datatypes import RemoteProviderConfig
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
DEFAULT_OLLAMA_PORT = 11434
class OllamaImplConfig(RemoteProviderConfig):
port: int = DEFAULT_OLLAMA_PORT
port: int
@classmethod
def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
return [
DockerComposeServiceConfig(
service_name="ollama",
image="ollama/ollama:latest",
volumes=["$HOME/.ollama:/root/.ollama"],
devices=["nvidia.com/gpu=all"],
deploy={
"resources": {
"reservations": {
"devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
}
}
},
runtime="nvidia",
ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
healthcheck={
"test": ["CMD", "curl", "-f", "http://ollama:11434"],
"interval": "10s",
"timeout": "5s",
"retries": 5,
},
),
DockerComposeServiceConfig(
service_name="ollama-init",
image="ollama/ollama",
depends_on={"ollama": {"condition": "service_healthy"}},
environment={
"OLLAMA_HOST": "ollama",
"OLLAMA_MODELS": "${OLLAMA_MODELS}",
},
volumes=["ollama_data:/root/.ollama"],
entrypoint=(
'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
"until curl -s http://ollama:11434 > /dev/null; do"
"attempt=$((attempt + 1));"
"if [ $attempt -ge $max_attempts ]; then"
'echo "Timeout waiting for Ollama server";'
"exit 1;"
"fi;"
'echo "Attempt $attempt: Server not ready yet...";'
"sleep 5;"
"done'"
),
),
]
def sample_run_config(
cls, port_str: str = str(DEFAULT_OLLAMA_PORT)
) -> Dict[str, Any]:
return {"port": port_str}


@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .ollama import get_distribution_template # noqa: F401


@@ -1,12 +1,19 @@
version: '2'
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
description: Use (an external) Ollama server for running LLM inference
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
providers:
inference: remote::ollama
inference:
- remote::ollama
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda


@@ -0,0 +1,131 @@
# Ollama Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
{%- if docker_compose_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{%- if default_models %}
### Models
The following models are configured by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose
You can use `docker compose` to start an Ollama server and connect it with the Llama Stack server in a single command.
```bash
$ cd distributions/{{ name }}; docker compose up
```
You will see output similar to the following:
```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
[llamastack] | Resolved 12 providers
[llamastack] | inner-inference => ollama0
[llamastack] | models => __routing_table__
[llamastack] | inference => __autorouted__
```
To kill the server
```bash
docker compose down
```
## Starting Ollama and Llama Stack separately
If you wish to spin up an Ollama server separately and connect it with Llama Stack, use the following commands.
#### Start Ollama server
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
**Via Docker**
```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
**Via CLI**
```bash
ollama run <model_id>
```
#### Start Llama Stack server pointing to Ollama server
**Via Conda**
```bash
llama stack build --template ollama --image-type conda
llama stack run run.yaml
```
**Via Docker**
```
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct Ollama endpoint, e.g.
```yaml
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
```
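To sanity-check that the endpoint is reachable before starting the stack, a quick probe might look like this (a sketch; substitute the host and port your Ollama server actually listens on):
```bash
# A healthy Ollama server answers plain HTTP requests on its root path.
curl http://127.0.0.1:14343
```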
### (Optional) Update Model Serving Configuration
#### Downloading model via Ollama
You can use `ollama` to manage model downloads.
```bash
ollama pull llama3.1:8b-instruct-fp16
ollama pull llama3.1:70b-instruct-fp16
```
> [!NOTE]
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
To serve a new model with `ollama`
```bash
ollama run <model_name>
```
To make sure that the model is being served correctly, run `ollama ps` to list the models currently being served by ollama.
```
$ ollama ps
NAME                         ID              SIZE     PROCESSOR    UNTIL
llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
```
To verify that the model served by ollama is correctly connected to the Llama Stack server:
```bash
$ llama-stack-client models list
+----------------------+----------------------+---------------+-----------------------------------------------+
| identifier           | llama_model          | provider_id   | metadata                                      |
+======================+======================+===============+===============================================+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
+----------------------+----------------------+---------------+-----------------------------------------------+
```


@@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::ollama"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="ollama",
provider_type="remote::ollama",
config=OllamaImplConfig.sample_run_config(
port_str="${env.OLLAMA_PORT}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="ollama-inference",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="ollama-safety",
)
return DistributionTemplate(
name="ollama",
distro_type="self_hosted",
description="Use (an external) Ollama server for running LLM inference",
docker_image="llamastack/distribution-ollama:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
docker_compose_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the TGI server",
),
"OLLAMA_PORT": (
"14343",
"Port of the Ollama server",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use",
),
},
)
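
For orientation, this template is exercised by the CLI flow already shown in the documentation above; a sketch, with the run file names taken from the `run_configs` keys:

```bash
llama stack build --template ollama --image-type conda
llama stack run run.yaml               # inference provider only
llama stack run run-with-safety.yaml   # also registers the ${env.SAFETY_MODEL} shield
```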


@@ -2,7 +2,7 @@ version: '2'
name: tgi
distribution_spec:
description: Use (an external) TGI server for running LLM inference
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
providers:
inference:
- remote::tgi


@@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
name="tgi",
distro_type="self_hosted",
description="Use (an external) TGI server for running LLM inference",
docker_image="llamastack/distribution-remote-tgi:test-0.0.52rc3",
docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],