mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 07:22:35 +00:00
Convert ollama to the new model
This commit is contained in:
parent
028530546f
commit
a061f3f8c1
14 changed files with 379 additions and 113 deletions
|
|
@ -4,62 +4,19 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import List
|
||||
from typing import Any, Dict
|
||||
|
||||
from llama_stack.distribution.datatypes import RemoteProviderConfig
|
||||
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
|
||||
|
||||
|
||||
DEFAULT_OLLAMA_PORT = 11434
|
||||
|
||||
|
||||
class OllamaImplConfig(RemoteProviderConfig):
|
||||
port: int = DEFAULT_OLLAMA_PORT
|
||||
port: int
|
||||
|
||||
@classmethod
|
||||
def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
|
||||
return [
|
||||
DockerComposeServiceConfig(
|
||||
service_name="ollama",
|
||||
image="ollama/ollama:latest",
|
||||
volumes=["$HOME/.ollama:/root/.ollama"],
|
||||
devices=["nvidia.com/gpu=all"],
|
||||
deploy={
|
||||
"resources": {
|
||||
"reservations": {
|
||||
"devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
|
||||
}
|
||||
}
|
||||
},
|
||||
runtime="nvidia",
|
||||
ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
|
||||
healthcheck={
|
||||
"test": ["CMD", "curl", "-f", "http://ollama:11434"],
|
||||
"interval": "10s",
|
||||
"timeout": "5s",
|
||||
"retries": 5,
|
||||
},
|
||||
),
|
||||
DockerComposeServiceConfig(
|
||||
service_name="ollama-init",
|
||||
image="ollama/ollama",
|
||||
depends_on={"ollama": {"condition": "service_healthy"}},
|
||||
environment={
|
||||
"OLLAMA_HOST": "ollama",
|
||||
"OLLAMA_MODELS": "${OLLAMA_MODELS}",
|
||||
},
|
||||
volumes=["ollama_data:/root/.ollama"],
|
||||
entrypoint=(
|
||||
'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
|
||||
"until curl -s http://ollama:11434 > /dev/null; do"
|
||||
"attempt=$((attempt + 1));"
|
||||
"if [ $attempt -ge $max_attempts ]; then"
|
||||
'echo "Timeout waiting for Ollama server";'
|
||||
"exit 1;"
|
||||
"fi;"
|
||||
'echo "Attempt $attempt: Server not ready yet...";'
|
||||
"sleep 5;"
|
||||
"done'"
|
||||
),
|
||||
),
|
||||
]
|
||||
def sample_run_config(
|
||||
cls, port_str: str = str(DEFAULT_OLLAMA_PORT)
|
||||
) -> Dict[str, Any]:
|
||||
return {"port": port_str}
|
||||
|
|
|
|||
7
llama_stack/templates/ollama/__init__.py
Normal file
7
llama_stack/templates/ollama/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .ollama import get_distribution_template # noqa: F401
|
||||
|
|
@ -1,12 +1,19 @@
|
|||
version: '2'
|
||||
name: ollama
|
||||
distribution_spec:
|
||||
description: Use ollama for running LLM inference
|
||||
description: Use (an external) Ollama server for running LLM inference
|
||||
docker_image: llamastack/distribution-ollama:test-0.0.52rc3
|
||||
providers:
|
||||
inference: remote::ollama
|
||||
inference:
|
||||
- remote::ollama
|
||||
memory:
|
||||
- inline::faiss
|
||||
- remote::chromadb
|
||||
- remote::pgvector
|
||||
safety: inline::llama-guard
|
||||
agents: inline::meta-reference
|
||||
telemetry: inline::meta-reference
|
||||
safety:
|
||||
- inline::llama-guard
|
||||
agents:
|
||||
- inline::meta-reference
|
||||
telemetry:
|
||||
- inline::meta-reference
|
||||
image_type: conda
|
||||
|
|
|
|||
131
llama_stack/templates/ollama/doc_template.md
Normal file
131
llama_stack/templates/ollama/doc_template.md
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
# Ollama Distribution
|
||||
|
||||
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
|
||||
|
||||
{{ providers_table }}
|
||||
|
||||
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
|
||||
|
||||
{%- if docker_compose_env_vars %}
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
|
||||
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
{%- if default_models %}
|
||||
### Models
|
||||
|
||||
The following models are configured by default:
|
||||
{% for model in default_models %}
|
||||
- `{{ model.model_id }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
## Using Docker Compose
|
||||
|
||||
You can use `docker compose` to start an Ollama server and connect it to the Llama Stack server in a single command.
|
||||
|
||||
```bash
|
||||
$ cd distributions/{{ name }}; docker compose up
|
||||
```
|
||||
|
||||
You will see output similar to the following:
|
||||
```bash
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
|
||||
[llamastack] | Resolved 12 providers
|
||||
[llamastack] | inner-inference => ollama0
|
||||
[llamastack] | models => __routing_table__
|
||||
[llamastack] | inference => __autorouted__
|
||||
```
|
||||
|
||||
To kill the server
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
||||
|
||||
## Starting Ollama and Llama Stack separately
|
||||
|
||||
If you wish to start an Ollama server separately and connect it to Llama Stack, use the following commands.
|
||||
|
||||
#### Start Ollama server
|
||||
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
|
||||
|
||||
**Via Docker**
|
||||
```bash
|
||||
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||
```
|
||||
|
||||
**Via CLI**
|
||||
```bash
|
||||
ollama run <model_id>
|
||||
```
|
||||
|
||||
#### Start Llama Stack server pointing to Ollama server
|
||||
|
||||
**Via Conda**
|
||||
|
||||
```bash
|
||||
llama stack build --template ollama --image-type conda
|
||||
llama stack run run.yaml
|
||||
```
|
||||
|
||||
**Via Docker**
|
||||
```
|
||||
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
|
||||
```
|
||||
|
||||
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: ollama0
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
url: http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
#### Downloading model via Ollama
|
||||
|
||||
You can use ollama for managing model downloads.
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1:8b-instruct-fp16
|
||||
ollama pull llama3.1:70b-instruct-fp16
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> Please check [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py) for the list of supported Ollama models.
|
||||
|
||||
|
||||
To serve a new model with `ollama`
|
||||
```bash
|
||||
ollama run <model_name>
|
||||
```
|
||||
|
||||
To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
|
||||
```
|
||||
$ ollama ps
|
||||
|
||||
NAME ID SIZE PROCESSOR UNTIL
|
||||
llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now
|
||||
```
|
||||
|
||||
To verify that the model served by ollama is correctly connected to Llama Stack server
|
||||
```bash
|
||||
$ llama-stack-client models list
|
||||
+----------------------+----------------------+---------------+-----------------------------------------------+
|
||||
| identifier | llama_model | provider_id | metadata |
|
||||
+======================+======================+===============+===============================================+
|
||||
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
|
||||
+----------------------+----------------------+---------------+-----------------------------------------------+
|
||||
```
|
||||
86
llama_stack/templates/ollama/ollama.py
Normal file
86
llama_stack/templates/ollama/ollama.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
|
||||
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
|
||||
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
|
||||
|
||||
|
||||
def get_distribution_template() -> DistributionTemplate:
    """Build the distribution template for the Ollama distribution.

    The template declares which provider types the distribution is built
    with, and describes two generated run configurations:

    * ``run.yaml`` -- the Ollama inference provider with only the inference
      model registered.
    * ``run-with-safety.yaml`` -- the same provider with both the inference
      and the safety model registered, plus a shield whose id is taken from
      ``${env.SAFETY_MODEL}``.

    Returns:
        The ``DistributionTemplate`` describing the ``ollama`` distribution.
    """
    # Single source of truth for the provider id: the models below must refer
    # to the provider that is actually declared in the run configs.
    ollama_provider_id = "ollama"

    providers = {
        "inference": ["remote::ollama"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id=ollama_provider_id,
        provider_type="remote::ollama",
        config=OllamaImplConfig.sample_run_config(
            port_str="${env.OLLAMA_PORT}",
        ),
    )

    # Both models are served by the one Ollama provider declared above.
    # (Previously they pointed at "ollama-inference" / "ollama-safety", ids
    # that no provider in the generated run configs carries.)
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id=ollama_provider_id,
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id=ollama_provider_id,
    )

    return DistributionTemplate(
        name="ollama",
        distro_type="self_hosted",
        description="Use (an external) Ollama server for running LLM inference",
        docker_image="llamastack/distribution-ollama:test-0.0.52rc3",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                    ]
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        # Each entry maps an environment variable to (default value, description).
        docker_compose_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model loaded into the Ollama server",
            ),
            "OLLAMA_PORT": (
                # Ollama's default listening port is 11434.
                "11434",
                "Port of the Ollama server",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Name of the safety (Llama-Guard) model to use",
            ),
        },
    )
|
||||
|
|
@ -2,7 +2,7 @@ version: '2'
|
|||
name: tgi
|
||||
distribution_spec:
|
||||
description: Use (an external) TGI server for running LLM inference
|
||||
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
|
||||
docker_image: llamastack/distribution-tgi:test-0.0.52rc3
|
||||
providers:
|
||||
inference:
|
||||
- remote::tgi
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
|
|||
name="tgi",
|
||||
distro_type="self_hosted",
|
||||
description="Use (an external) TGI server for running LLM inference",
|
||||
docker_image="llamastack/distribution-remote-tgi:test-0.0.52rc3",
|
||||
docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
|
||||
template_path=Path(__file__).parent / "doc_template.md",
|
||||
providers=providers,
|
||||
default_models=[inference_model, safety_model],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue