diff --git a/distributions/ollama/run-with-safety.yaml b/distributions/ollama/run-with-safety.yaml
new file mode 100644
index 000000000..4582ab800
--- /dev/null
+++ b/distributions/ollama/run-with-safety.yaml
@@ -0,0 +1,63 @@
+version: '2'
+built_at: 2024-11-17 15:19:07.405618
+image_name: ollama
+docker_image: llamastack/distribution-ollama:test-0.0.52rc3
+conda_env: null
+apis:
+- telemetry
+- agents
+- memory
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: ollama
+    provider_type: remote::ollama
+    config:
+      port: ${env.OLLAMA_PORT}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: ollama-inference
+  provider_model_id: null
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL}
+  provider_id: ollama-safety
+  provider_model_id: null
+shields:
+- params: null
+  shield_id: ${env.SAFETY_MODEL}
+  provider_id: null
+  provider_shield_id: null
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/distributions/ollama/run.yaml b/distributions/ollama/run.yaml
index d09fa0e05..b6d411614 100644
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@@ -1,14 +1,12 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
+built_at: 2024-11-17 15:19:07.395495
+image_name: ollama
+docker_image: llamastack/distribution-ollama:test-0.0.52rc3
+conda_env: null
 apis:
-- shields
+- telemetry
 - agents
-- models
 - memory
-- memory_banks
 - inference
 - safety
 providers:
@@ -16,32 +14,42 @@ providers:
   - provider_id: ollama
     provider_type: remote::ollama
     config:
-      url: ${env.LLAMA_INFERENCE_OLLAMA_URL:http://127.0.0.1:11434}
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
+      port: ${env.OLLAMA_PORT}
   memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:/home/ashwin/.llama/runtime}/kvstore.db
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
 models:
-  - model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.2-3B-Instruct}
-    provider_id: ollama
-  - model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
-    provider_id: ollama
-shields:
-  - shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: ollama-inference
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml
index d3e2ffcdc..43eb955d7 100644
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@@ -1,14 +1,14 @@
 version: '2'
-built_at: 2024-11-17 14:48:55.487270
+built_at: 2024-11-17 15:19:07.405727
 image_name: remote-vllm
 docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- safety
-- agents
 - telemetry
+- agents
 - memory
 - inference
+- safety
 providers:
   inference:
   - provider_id: vllm-inference
diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml
index 21e58fbd8..4552e6571 100644
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@@ -1,14 +1,14 @@
 version: '2'
-built_at: 2024-11-17 14:48:55.476058
+built_at: 2024-11-17 15:19:07.395327
 image_name: remote-vllm
 docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- safety
-- agents
 - telemetry
+- agents
 - memory
 - inference
+- safety
 providers:
   inference:
   - provider_id: vllm-inference
diff --git a/distributions/tgi/run-with-safety.yaml b/distributions/tgi/run-with-safety.yaml
index 504bf3e33..d8a4619f6 100644
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@@ -1,14 +1,14 @@
 version: '2'
-built_at: 2024-11-17 14:48:56.991119
+built_at: 2024-11-17 15:19:09.184709
 image_name: tgi
-docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
+docker_image: llamastack/distribution-tgi:test-0.0.52rc3
 conda_env: null
 apis:
-- safety
-- agents
 - telemetry
+- agents
 - memory
 - inference
+- safety
 providers:
   inference:
   - provider_id: tgi-inference
diff --git a/distributions/tgi/run.yaml b/distributions/tgi/run.yaml
index 8c45832e1..1d01c8ea8 100644
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@@ -1,14 +1,14 @@
 version: '2'
-built_at: 2024-11-17 14:48:56.975663
+built_at: 2024-11-17 15:19:09.156305
 image_name: tgi
-docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
+docker_image: llamastack/distribution-tgi:test-0.0.52rc3
 conda_env: null
 apis:
-- safety
-- agents
 - telemetry
+- agents
 - memory
 - inference
+- safety
 providers:
   inference:
   - provider_id: tgi-inference
diff --git a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
index 37bef9536..37c5851ab 100644
--- a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
@@ -2,33 +2,40 @@
 
 The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
 
-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
-| **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
+| API | Provider(s) |
+|-----------|----------------------------------------------------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::ollama` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `inline::llama-guard` |
+| telemetry | `inline::meta-reference` |
+
+You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `OLLAMA_PORT`: Port of the Ollama server (default: `14343`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+
+### Models
+
+The following models are configured by default:
+- `${env.INFERENCE_MODEL}`
+- `${env.SAFETY_MODEL}`
+
 ## Using Docker Compose
 
 You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command.
 
-### Docker: Start the Distribution (Single Node regular Desktop machine)
-
-> [!NOTE]
-> This will start an ollama server with CPU only, please see [Ollama Documentations](https://github.com/ollama/ollama) for serving models on CPU only.
-
 ```bash
 $ cd distributions/ollama; docker compose up
 ```
 
-### Docker: Start a Distribution (Single Node with nvidia GPUs)
-
-> [!NOTE]
-> This assumes you have access to GPU to start a Ollama server with access to your GPU.
-
-```bash
-$ cd distributions/ollama-gpu; docker compose up
-```
-
 You will see outputs similar to following ---
 ```bash
 [ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 |     226.841µs | ::1 | GET      "/api/ps"
@@ -71,7 +78,7 @@ ollama run
 
 ```bash
 llama stack build --template ollama --image-type conda
-llama stack run ./gpu/run.yaml
+llama stack run run.yaml
 ```
 
 **Via Docker**
diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py
index 5bf8b98e7..09900ecf2 100644
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@@ -4,62 +4,19 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
+from typing import Any, Dict
 
 from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
 
 DEFAULT_OLLAMA_PORT = 11434
 
 
 class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
+    port: int
 
     @classmethod
-    def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
-        return [
-            DockerComposeServiceConfig(
-                service_name="ollama",
-                image="ollama/ollama:latest",
-                volumes=["$HOME/.ollama:/root/.ollama"],
-                devices=["nvidia.com/gpu=all"],
-                deploy={
-                    "resources": {
-                        "reservations": {
-                            "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                        }
-                    }
-                },
-                runtime="nvidia",
-                ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-                healthcheck={
-                    "test": ["CMD", "curl", "-f", "http://ollama:11434"],
-                    "interval": "10s",
-                    "timeout": "5s",
-                    "retries": 5,
-                },
-            ),
-            DockerComposeServiceConfig(
-                service_name="ollama-init",
-                image="ollama/ollama",
-                depends_on={"ollama": {"condition": "service_healthy"}},
-                environment={
-                    "OLLAMA_HOST": "ollama",
-                    "OLLAMA_MODELS": "${OLLAMA_MODELS}",
-                },
-                volumes=["ollama_data:/root/.ollama"],
-                entrypoint=(
-                    'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
-                    "until curl -s http://ollama:11434 > /dev/null; do"
-                    "attempt=$((attempt + 1));"
-                    "if [ $attempt -ge $max_attempts ]; then"
-                    'echo "Timeout waiting for Ollama server";'
-                    "exit 1;"
-                    "fi;"
-                    'echo "Attempt $attempt: Server not ready yet...";'
-                    "sleep 5;"
-                    "done'"
-                ),
-            ),
-        ]
+    def sample_run_config(
+        cls, port_str: str = str(DEFAULT_OLLAMA_PORT)
+    ) -> Dict[str, Any]:
+        return {"port": port_str}
diff --git a/llama_stack/templates/ollama/__init__.py b/llama_stack/templates/ollama/__init__.py
new file mode 100644
index 000000000..3a2c40f27
--- /dev/null
+++ b/llama_stack/templates/ollama/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .ollama import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
index 8cab877ea..45ab2a6e5 100644
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@@ -1,12 +1,19 @@
+version: '2'
 name: ollama
 distribution_spec:
-  description: Use ollama for running LLM inference
+  description: Use (an external) Ollama server for running LLM inference
+  docker_image: llamastack/distribution-ollama:test-0.0.52rc3
   providers:
-    inference: remote::ollama
+    inference:
+    - remote::ollama
     memory:
     - inline::faiss
     - remote::chromadb
     - remote::pgvector
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md
new file mode 100644
index 000000000..2121a4fd6
--- /dev/null
+++ b/llama_stack/templates/ollama/doc_template.md
@@ -0,0 +1,131 @@
+# Ollama Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
+
+{%- if docker_compose_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in docker_compose_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{%- if default_models %}
+### Models
+
+The following models are configured by default:
+{% for model in default_models %}
+- `{{ model.model_id }}`
+{% endfor %}
+{% endif %}
+
+## Using Docker Compose
+
+You can use `docker compose` to start an Ollama server and connect with the Llama Stack server in a single command.
+
+```bash
+$ cd distributions/{{ name }}; docker compose up
+```
+
+You will see output similar to the following:
+```bash
+[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 |     226.841µs | ::1 | GET      "/api/ps"
+[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 |      60.908µs | ::1 | GET      "/api/ps"
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+[llamastack] | Resolved 12 providers
+[llamastack] | inner-inference => ollama0
+[llamastack] | models => __routing_table__
+[llamastack] | inference => __autorouted__
+```
+
+To kill the server
+```bash
+docker compose down
+```
+
+## Starting Ollama and Llama Stack separately
+
+If you wish to separately spin up an Ollama server and connect it to Llama Stack, use the following commands.
+
+#### Start Ollama server
+- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
+
+**Via Docker**
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+**Via CLI**
+```bash
+ollama run <model_id>
+```
+
+#### Start Llama Stack server pointing to Ollama server
+
+**Via Conda**
+
+```bash
+llama stack build --template ollama --image-type conda
+llama stack run run.yaml
+```
+
+**Via Docker**
+```
+docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
+```
+
+Make sure the inference provider in your `run.yaml` file points to the correct Ollama endpoint, e.g.
+```yaml
+inference:
+  - provider_id: ollama0
+    provider_type: remote::ollama
+    config:
+      port: 14343
+```
+
+### (Optional) Update Model Serving Configuration
+
+#### Downloading model via Ollama
+
+You can use Ollama for managing model downloads.
+
+```bash
+ollama pull llama3.1:8b-instruct-fp16
+ollama pull llama3.1:70b-instruct-fp16
+```
+
+> [!NOTE]
+> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py) for the supported Ollama models.
+
+To serve a new model with `ollama`
+```bash
+ollama run <model_id>
+```
+
+To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by Ollama.
+```
+$ ollama ps
+
+NAME                         ID              SIZE     PROCESSOR    UNTIL
+llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
+```
+
+To verify that the model served by Ollama is correctly connected to the Llama Stack server, run:
+```bash
+$ llama-stack-client models list
++----------------------+----------------------+---------------+-----------------------------------------------+
+| identifier           | llama_model          | provider_id   | metadata                                      |
++======================+======================+===============+===============================================+
+| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
++----------------------+----------------------+---------------+-----------------------------------------------+
+```
diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py
new file mode 100644
index 000000000..0c45f8dc1
--- /dev/null
+++ b/llama_stack/templates/ollama/ollama.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::ollama"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="ollama",
+        provider_type="remote::ollama",
+        config=OllamaImplConfig.sample_run_config(
+            port_str="${env.OLLAMA_PORT}",
+        ),
+    )
+
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="ollama-inference",
+    )
+    safety_model = ModelInput(
+        model_id="${env.SAFETY_MODEL}",
+        provider_id="ollama-safety",
+    )
+
+    return DistributionTemplate(
+        name="ollama",
+        distro_type="self_hosted",
+        description="Use (an external) Ollama server for running LLM inference",
+        docker_image="llamastack/distribution-ollama:test-0.0.52rc3",
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=[inference_model, safety_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+            "run-with-safety.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [
+                        inference_provider,
+                    ]
+                },
+                default_models=[
+                    inference_model,
+                    safety_model,
+                ],
+                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+            ),
+        },
+        docker_compose_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the Ollama server",
+            ),
+            "OLLAMA_PORT": (
+                "14343",
+                "Port of the Ollama server",
+            ),
+            "SAFETY_MODEL": (
+                "meta-llama/Llama-Guard-3-1B",
+                "Name of the safety (Llama-Guard) model to use",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml
index 9817d90c7..5f44c2d86 100644
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@@ -2,7 +2,7 @@ version: '2'
 name: tgi
 distribution_spec:
   description: Use (an external) TGI server for running LLM inference
-  docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
+  docker_image: llamastack/distribution-tgi:test-0.0.52rc3
   providers:
     inference:
     - remote::tgi
diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py
index 0987bc0b8..cee848fc8 100644
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
         name="tgi",
         distro_type="self_hosted",
         description="Use (an external) TGI server for running LLM inference",
-        docker_image="llamastack/distribution-remote-tgi:test-0.0.52rc3",
+        docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         default_models=[inference_model, safety_model],
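
A note on the `${env.VAR}` / `${env.VAR:default}` placeholders that appear throughout the generated run configs above (for example `port: ${env.OLLAMA_PORT}` and the `SQLITE_STORE_DIR` paths): they are resolved from the process environment when the stack loads the config, with the text after `:` serving as a fallback. The snippet below is a minimal, self-contained sketch of that substitution convention; it is not the actual llama_stack resolver, and the helper name `resolve_env_placeholders` is made up for illustration.

```python
import os
import re

# Matches ${env.VAR} and ${env.VAR:default} as used in the generated run.yaml files.
_ENV_PATTERN = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    """Expand ${env.VAR} / ${env.VAR:default} using the current environment (illustrative sketch)."""

    def _substitute(match: re.Match) -> str:
        var, default = match.group(1), match.group(2)
        resolved = os.environ.get(var, default)
        if resolved is None:
            raise ValueError(f"environment variable {var} is not set and has no default")
        return resolved

    return _ENV_PATTERN.sub(_substitute, value)


if __name__ == "__main__":
    os.environ.setdefault("OLLAMA_PORT", "11434")
    # Prints "port: 11434"
    print(resolve_env_placeholders("port: ${env.OLLAMA_PORT}"))
    # Falls back to the default when SQLITE_STORE_DIR is unset.
    print(resolve_env_placeholders(
        "db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db"
    ))
```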