llama-stack-mirror/llama_stack/templates/ollama/ollama.py
Ben Browning 8ede67b809 More work on file_search verification test
This gets the file_search verification test working against ollama,
fireworks, and api.openai.com. We don't have the entirety of the
vector store API implemented in Llama Stack yet, so this still has a
bit of a hack to switch between using only OpenAI-compatible APIs and
using the LlamaStackClient to insert content into our vector stores.

Outside of actually inserting file contents, the rest of the test
works the same and uses only the OpenAI client for all of these providers.

How to run the tests:

Ollama (sometimes flaky with a small model):

```
ollama run llama3.2:3b

INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv \
  --env OLLAMA_URL="http://0.0.0.0:11434"

pytest -sv \
  'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct
```

Fireworks via Llama Stack:

```
llama stack run llama_stack/templates/fireworks/run.yaml

pytest -sv \
  'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-3.3-70B-Instruct
```

OpenAI directly:

```
pytest -sv \
  'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \
  --base-url=https://api.openai.com/v1 \
  --model gpt-4o
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-06-13 09:36:04 -04:00

169 lines
6.1 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import (
ModelInput,
Provider,
ShieldInput,
ToolGroupInput,
)
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
    """Assemble the ``DistributionTemplate`` for the ``ollama`` distribution.

    The template declares the provider types available per API and two run
    configurations:

    * ``run.yaml`` -- inference, vector_io, files and post_training overrides
      with the inference and embedding models registered.
    * ``run-with-safety.yaml`` -- the same overrides plus ``llama-guard`` and
      ``code-scanner`` safety providers, the safety model, and two shields.

    Returns:
        The populated ``DistributionTemplate``.
    """
    name = "ollama"
    # Every inline provider below gets its sample config from this same path.
    distro_dir = f"~/.llama/distributions/{name}"

    # Provider types offered for each API of this distribution.
    providers = {
        "inference": ["remote::ollama"],
        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "files": ["inline::localfs"],
        "post_training": ["inline::huggingface"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "inline::rag-runtime",
            "remote::model-context-protocol",
            "remote::wolfram-alpha",
        ],
    }

    # Concrete provider instances shared by both run configurations.
    ollama_inference = Provider(
        provider_id="ollama",
        provider_type="remote::ollama",
        config=OllamaImplConfig.sample_run_config(),
    )
    faiss_vector_io = Provider(
        provider_id="faiss",
        provider_type="inline::faiss",
        config=FaissVectorIOConfig.sample_run_config(distro_dir),
    )
    localfs_files = Provider(
        provider_id="meta-reference-files",
        provider_type="inline::localfs",
        config=LocalfsFilesImplConfig.sample_run_config(distro_dir),
    )
    hf_post_training = Provider(
        provider_id="huggingface",
        provider_type="inline::huggingface",
        config=HuggingFacePostTrainingConfig.sample_run_config(distro_dir),
    )

    def common_overrides() -> dict:
        # Return a fresh dict with fresh lists on every call so the two run
        # configurations never share mutable containers.
        return {
            "inference": [ollama_inference],
            "vector_io": [faiss_vector_io],
            "files": [localfs_files],
            "post_training": [hf_post_training],
        }

    # Models: the two LLM ids are resolved from the environment placeholders.
    llm_model = ModelInput(model_id="${env.INFERENCE_MODEL}", provider_id="ollama")
    guard_model = ModelInput(model_id="${env.SAFETY_MODEL}", provider_id="ollama")
    embedder = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id="ollama",
        provider_model_id="all-minilm:latest",
        model_type=ModelType.embedding,
        metadata={"embedding_dimension": 384},
    )

    # The same tool-group list object is handed to both run configurations.
    tool_groups = [
        ToolGroupInput(toolgroup_id=group_id, provider_id=provider_id)
        for group_id, provider_id in (
            ("builtin::websearch", "tavily-search"),
            ("builtin::rag", "rag-runtime"),
            ("builtin::wolfram_alpha", "wolfram-alpha"),
        )
    ]

    basic_run = RunConfigSettings(
        provider_overrides=common_overrides(),
        default_models=[llm_model, embedder],
        default_tool_groups=tool_groups,
    )

    # NOTE(review): "inline::code-scanner" is used here but is not listed under
    # providers["safety"] above -- confirm that omission is intentional.
    safety_overrides = common_overrides()
    safety_overrides["safety"] = [
        Provider(
            provider_id="llama-guard",
            provider_type="inline::llama-guard",
            config={},
        ),
        Provider(
            provider_id="code-scanner",
            provider_type="inline::code-scanner",
            config={},
        ),
    ]
    safety_run = RunConfigSettings(
        provider_overrides=safety_overrides,
        default_models=[llm_model, guard_model, embedder],
        default_shields=[
            ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="llama-guard"),
            ShieldInput(shield_id="CodeScanner", provider_id="code-scanner"),
        ],
        default_tool_groups=tool_groups,
    )

    # Each entry pairs a value string with a human-readable description.
    env_vars = {
        "LLAMA_STACK_PORT": (
            "8321",
            "Port for the Llama Stack distribution server",
        ),
        "OLLAMA_URL": (
            "http://127.0.0.1:11434",
            "URL of the Ollama server",
        ),
        "INFERENCE_MODEL": (
            "meta-llama/Llama-3.2-3B-Instruct",
            "Inference model loaded into the Ollama server",
        ),
        "SAFETY_MODEL": (
            "meta-llama/Llama-Guard-3-1B",
            "Safety model loaded into the Ollama server",
        ),
    }

    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Use (an external) Ollama server for running LLM inference",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        run_configs={
            "run.yaml": basic_run,
            "run-with-safety.yaml": safety_run,
        },
        run_config_env_vars=env_vars,
    )