Convert ollama to the new model

Ashwin Bharambe 2024-11-17 15:19:55 -08:00
parent 028530546f
commit a061f3f8c1
14 changed files with 379 additions and 113 deletions


@@ -0,0 +1,63 @@
version: '2'
built_at: 2024-11-17 15:19:07.405618
image_name: ollama
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
conda_env: null
apis:
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: ollama
provider_type: remote::ollama
config:
port: ${env.OLLAMA_PORT}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: ollama-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
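
The `${env.VAR}` and `${env.VAR:default}` references in this file are resolved from the environment when the stack starts, with the part after the `:` serving as a fallback. As a rough sketch, the configuration above could be launched as follows, using the default values documented further down in this commit; the file name `run-with-safety.yaml` is assumed from the template code later in this diff:

```bash
# Illustrative values: these are the defaults listed in the distribution docs below.
OLLAMA_PORT=14343 \
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llama stack run run-with-safety.yaml
```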


@@ -1,14 +1,12 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
built_at: 2024-11-17 15:19:07.395495
image_name: ollama
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
conda_env: null
apis:
- shields
- telemetry
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
@@ -16,32 +14,42 @@ providers:
- provider_id: ollama
provider_type: remote::ollama
config:
url: ${env.LLAMA_INFERENCE_OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
excluded_categories: []
port: ${env.OLLAMA_PORT}
memory:
- provider_id: meta0
provider_type: inline::meta-reference
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:/home/ashwin/.llama/runtime}/kvstore.db
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
models:
- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:55.487270
built_at: 2024-11-17 15:19:07.405727
image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: vllm-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:55.476058
built_at: 2024-11-17 15:19:07.395327
image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: vllm-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:56.991119
built_at: 2024-11-17 15:19:09.184709
image_name: tgi
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: tgi-inference


@@ -1,14 +1,14 @@
version: '2'
built_at: 2024-11-17 14:48:56.975663
built_at: 2024-11-17 15:19:09.156305
image_name: tgi
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null
apis:
- safety
- agents
- telemetry
- agents
- memory
- inference
- safety
providers:
inference:
- provider_id: tgi-inference


@@ -2,33 +2,40 @@
The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
| **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
Provider Configuration
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ API       ┃ Provider(s)                                             ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ agents    │ `inline::meta-reference`                                │
│ inference │ `remote::ollama`                                        │
│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
│ safety    │ `inline::llama-guard`                                   │
│ telemetry │ `inline::meta-reference`                                │
└───────────┴─────────────────────────────────────────────────────────┘
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `OLLAMA_PORT`: Port of the Ollama server (default: `14343`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
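For example, one way to provide these before starting the distribution (a sketch using the default values listed above; adjust them to your setup):
```bash
export LLAMASTACK_PORT=5001
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export OLLAMA_PORT=14343
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```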
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose
You can use `docker compose` to start an Ollama server and connect it with the Llama Stack server in a single command.
### Docker: Start the Distribution (Single Node, regular desktop machine)
> [!NOTE]
> This will start an Ollama server in CPU-only mode; please see the [Ollama Documentation](https://github.com/ollama/ollama) for more on serving models on CPU.
```bash
$ cd distributions/ollama; docker compose up
```
### Docker: Start the Distribution (Single Node with NVIDIA GPUs)
> [!NOTE]
> This assumes you have a GPU available, and will start an Ollama server with access to that GPU.
```bash
$ cd distributions/ollama-gpu; docker compose up
```
You will see output similar to the following:
```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
@@ -71,7 +78,7 @@ ollama run <model_id>
```bash
llama stack build --template ollama --image-type conda
llama stack run ./gpu/run.yaml
llama stack run run.yaml
```
**Via Docker**


@@ -4,62 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from typing import Any, Dict
from llama_stack.distribution.datatypes import RemoteProviderConfig
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
DEFAULT_OLLAMA_PORT = 11434
class OllamaImplConfig(RemoteProviderConfig):
port: int = DEFAULT_OLLAMA_PORT
port: int
@classmethod
def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
return [
DockerComposeServiceConfig(
service_name="ollama",
image="ollama/ollama:latest",
volumes=["$HOME/.ollama:/root/.ollama"],
devices=["nvidia.com/gpu=all"],
deploy={
"resources": {
"reservations": {
"devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
}
}
},
runtime="nvidia",
ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
healthcheck={
"test": ["CMD", "curl", "-f", "http://ollama:11434"],
"interval": "10s",
"timeout": "5s",
"retries": 5,
},
),
DockerComposeServiceConfig(
service_name="ollama-init",
image="ollama/ollama",
depends_on={"ollama": {"condition": "service_healthy"}},
environment={
"OLLAMA_HOST": "ollama",
"OLLAMA_MODELS": "${OLLAMA_MODELS}",
},
volumes=["ollama_data:/root/.ollama"],
entrypoint=(
'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
"until curl -s http://ollama:11434 > /dev/null; do"
"attempt=$((attempt + 1));"
"if [ $attempt -ge $max_attempts ]; then"
'echo "Timeout waiting for Ollama server";'
"exit 1;"
"fi;"
'echo "Attempt $attempt: Server not ready yet...";'
"sleep 5;"
"done'"
),
),
]
def sample_run_config(
cls, port_str: str = str(DEFAULT_OLLAMA_PORT)
) -> Dict[str, Any]:
return {"port": port_str}


@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .ollama import get_distribution_template # noqa: F401


@@ -1,12 +1,19 @@
version: '2'
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
description: Use (an external) Ollama server for running LLM inference
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
providers:
inference: remote::ollama
inference:
- remote::ollama
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda


@@ -0,0 +1,131 @@
# Ollama Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
{%- if docker_compose_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{%- if default_models %}
### Models
The following models are configured by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose
You can use `docker compose` to start an Ollama server and connect it with the Llama Stack server in a single command.
```bash
$ cd distributions/{{ name }}; docker compose up
```
You will see output similar to the following:
```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
[llamastack] | Resolved 12 providers
[llamastack] | inner-inference => ollama0
[llamastack] | models => __routing_table__
[llamastack] | inference => __autorouted__
```
To kill the server
```bash
docker compose down
```
## Starting Ollama and Llama Stack separately
If you wish to spin up an Ollama server separately and connect it with Llama Stack, use the following commands.
#### Start Ollama server
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
**Via Docker**
```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
**Via CLI**
```bash
ollama run <model_id>
```
#### Start Llama Stack server pointing to Ollama server
**Via Conda**
```bash
llama stack build --template ollama --image-type conda
llama stack run run.yaml
```
**Via Docker**
```
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct Ollama endpoint, e.g.
```yaml
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
```
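To sanity-check that the endpoint is reachable before starting the stack, a quick probe might look like this (a sketch; substitute the host and port your Ollama server actually listens on):
```bash
# A healthy Ollama server answers plain HTTP requests on its root path.
curl http://127.0.0.1:14343
```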
### (Optional) Update Model Serving Configuration
#### Downloading model via Ollama
You can use `ollama` to manage model downloads.
```bash
ollama pull llama3.1:8b-instruct-fp16
ollama pull llama3.1:70b-instruct-fp16
```
> [!NOTE]
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
To serve a new model with `ollama`
```bash
ollama run <model_name>
```
To make sure that the model is being served correctly, run `ollama ps` to list the models currently being served by ollama.
```
$ ollama ps
NAME                         ID              SIZE     PROCESSOR    UNTIL
llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
```
To verify that the model served by ollama is correctly connected to the Llama Stack server:
```bash
$ llama-stack-client models list
+----------------------+----------------------+---------------+-----------------------------------------------+
| identifier           | llama_model          | provider_id   | metadata                                      |
+======================+======================+===============+===============================================+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
+----------------------+----------------------+---------------+-----------------------------------------------+
```


@@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::ollama"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="ollama",
provider_type="remote::ollama",
config=OllamaImplConfig.sample_run_config(
port_str="${env.OLLAMA_PORT}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="ollama-inference",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="ollama-safety",
)
return DistributionTemplate(
name="ollama",
distro_type="self_hosted",
description="Use (an external) Ollama server for running LLM inference",
docker_image="llamastack/distribution-ollama:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
docker_compose_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the TGI server",
),
"OLLAMA_PORT": (
"14343",
"Port of the Ollama server",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use",
),
},
)
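
For orientation, this template is exercised by the CLI flow already shown in the documentation above; a sketch, with the run file names taken from the `run_configs` keys:

```bash
llama stack build --template ollama --image-type conda
llama stack run run.yaml               # inference provider only
llama stack run run-with-safety.yaml   # also registers the ${env.SAFETY_MODEL} shield
```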


@@ -2,7 +2,7 @@ version: '2'
name: tgi
distribution_spec:
description: Use (an external) TGI server for running LLM inference
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
providers:
inference:
- remote::tgi


@@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
name="tgi",
distro_type="self_hosted",
description="Use (an external) TGI server for running LLM inference",
docker_image="llamastack/distribution-remote-tgi:test-0.0.52rc3",
docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],