Merge branch 'main' into chromadb_openai_compatible

2025-12-23 05:12:26 +00:00 · 2025-07-23 15:15:49 -04:00 · 2025-07-23 15:15:49 -04:00 · 56d01cee69
commit 56d01cee69
parent 0c24d0cc41 427136bb63
10 changed files with 639 additions and 10 deletions
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Annotated, Any, Literal, Protocol
 from pydantic import BaseModel, Field, field_validator
@ -88,7 +88,7 @@ class RAGQueryGenerator(Enum):
@json_schema_type
-class RAGSearchMode(Enum):
+class RAGSearchMode(StrEnum):
    """
    Search modes for RAG query retrieval:
    - VECTOR: Uses vector similarity search for semantic matching
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -336,7 +336,7 @@ $CONTAINER_BINARY build \
  "$BUILD_CONTEXT_DIR"
 # clean up tmp/configs
-rm -f "$BUILD_CONTEXT_DIR/run.yaml"
+rm -rf "$BUILD_CONTEXT_DIR/run.yaml" "$TEMP_DIR"
 set +x
 echo "Success!"
--- a/llama_stack/templates/dell/init.py
+++ b/llama_stack/templates/dell/init.py
@ -0,0 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from .dell import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/dell/build.yaml
+++ b/llama_stack/templates/dell/build.yaml
@ -0,0 +1,35 @@
 version: 2
 distribution_spec:
  description: Dell's distribution of Llama Stack. TGI inference via Dell's custom
    container
  providers:
    inference:
    - remote::tgi
    - inline::sentence-transformers
    vector_io:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
    - inline::rag-runtime
 image_type: conda
 additional_pip_packages:
 - aiosqlite
 - sqlalchemy[asyncio]
--- a/llama_stack/templates/dell/dell.py
+++ b/llama_stack/templates/dell/dell.py
@ -0,0 +1,142 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import (
    ModelInput,
    Provider,
    ShieldInput,
    ToolGroupInput,
 )
 from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
 )
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::tgi", "inline::sentence-transformers"],
        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "inline::rag-runtime",
        ],
    }
    name = "dell"
    inference_provider = Provider(
        provider_id="tgi0",
        provider_type="remote::tgi",
        config={
            "url": "${env.DEH_URL}",
        },
    )
    safety_inference_provider = Provider(
        provider_id="tgi1",
        provider_type="remote::tgi",
        config={
            "url": "${env.DEH_SAFETY_URL}",
        },
    )
    embedding_provider = Provider(
        provider_id="sentence-transformers",
        provider_type="inline::sentence-transformers",
        config=SentenceTransformersInferenceConfig.sample_run_config(),
    )
    chromadb_provider = Provider(
        provider_id="chromadb",
        provider_type="remote::chromadb",
        config={
            "url": "${env.CHROMA_URL}",
        },
    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="tgi0",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="tgi1",
    )
    embedding_model = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id="sentence-transformers",
        model_type=ModelType.embedding,
        metadata={
            "embedding_dimension": 384,
        },
    )
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="brave-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
    ]
    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Dell's distribution of Llama Stack. TGI inference via Dell's custom container",
        container_image=None,
        providers=providers,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider, embedding_provider],
                    "vector_io": [chromadb_provider],
                },
                default_models=[inference_model, embedding_model],
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        safety_inference_provider,
                        embedding_provider,
                    ],
                    "vector_io": [chromadb_provider],
                },
                default_models=[inference_model, safety_model, embedding_model],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
                default_tool_groups=default_tool_groups,
            ),
        },
        run_config_env_vars={
            "DEH_URL": (
                "http://0.0.0.0:8181",
                "URL for the Dell inference server",
            ),
            "DEH_SAFETY_URL": (
                "http://0.0.0.0:8282",
                "URL for the Dell safety inference server",
            ),
            "CHROMA_URL": (
                "http://localhost:6601",
                "URL for the Chroma server",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model loaded into the TGI server",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Name of the safety (Llama-Guard) model to use",
            ),
        },
    )
--- a/llama_stack/templates/dell/doc_template.md
+++ b/llama_stack/templates/dell/doc_template.md
@ -0,0 +1,178 @@
 ---
 orphan: true
 ---
 # Dell Distribution of Llama Stack
 ```{toctree}
 :maxdepth: 2
 :hidden:
 self
 ```
 The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
 {{ providers_table }}
 You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
 {% if run_config_env_vars %}
 ### Environment Variables
 The following environment variables can be configured:
 {% for var, (default_value, description) in run_config_env_vars.items() %}
 - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
 {% endfor %}
 {% endif %}
 ## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
 NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
 ```bash
 export INFERENCE_PORT=8181
 export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
 export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 export CHROMADB_HOST=localhost
 export CHROMADB_PORT=6601
 export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
 export CUDA_VISIBLE_DEVICES=0
 export LLAMA_STACK_PORT=8321
 docker run --rm -it \
  --pull always \
  --network host \
  -v $HOME/.cache/huggingface:/data \
  -e HF_TOKEN=$HF_TOKEN \
  -p $INFERENCE_PORT:$INFERENCE_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference \
  --dtype bfloat16 \
  --usage-stats off \
  --sharded false \
  --cuda-memory-fraction 0.7 \
  --model-id $INFERENCE_MODEL \
  --port $INFERENCE_PORT --hostname 0.0.0.0
 ```
 If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
 ```bash
 export SAFETY_INFERENCE_PORT=8282
 export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export CUDA_VISIBLE_DEVICES=1
 docker run --rm -it \
  --pull always \
  --network host \
  -v $HOME/.cache/huggingface:/data \
  -e HF_TOKEN=$HF_TOKEN \
  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference \
  --dtype bfloat16 \
  --usage-stats off \
  --sharded false \
  --cuda-memory-fraction 0.7 \
  --model-id $SAFETY_MODEL \
  --hostname 0.0.0.0 \
  --port $SAFETY_INFERENCE_PORT
 ```
 ## Dell distribution relies on ChromaDB for vector database usage
 You can start a chroma-db easily using docker.
 ```bash
 # This is where the indices are persisted
 mkdir -p $HOME/chromadb
 podman run --rm -it \
  --network host \
  --name chromadb \
  -v $HOME/chromadb:/chroma/chroma \
  -e IS_PERSISTENT=TRUE \
  chromadb/chroma:latest \
  --port $CHROMADB_PORT \
  --host $CHROMADB_HOST
 ```
 ## Running Llama Stack
 Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
 ### Via Docker
 This method allows you to get started quickly without having to build the distribution code.
 ```bash
 docker run -it \
  --pull always \
  --network host \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
  # NOTE: mount the llama-stack directory if testing local changes else not needed
  -v /home/hjshah/git/llama-stack:/app/llama-stack-source \
  # localhost/distribution-dell:dev if building / testing locally
  llamastack/distribution-{{ name }}\
  --port $LLAMA_STACK_PORT  \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env CHROMA_URL=$CHROMA_URL
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 # You need a local checkout of llama-stack to run this, get it using
 # git clone https://github.com/meta-llama/llama-stack.git
 cd /path/to/llama-stack
 export SAFETY_INFERENCE_PORT=8282
 export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 docker run \
  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-{{ name }} \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
  --env CHROMA_URL=$CHROMA_URL
 ```
 ### Via Conda
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 llama stack build --template {{ name }} --image-type conda
 llama stack run {{ name }}
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env CHROMA_URL=$CHROMA_URL
 ```
 If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
  --env CHROMA_URL=$CHROMA_URL
 ```
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@ -0,0 +1,131 @@
 version: 2
 image_name: dell
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - safety
 - scoring
 - telemetry
 - tool_runtime
 - vector_io
 providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: ${env.DEH_URL}
  - provider_id: tgi1
    provider_type: remote::tgi
    config:
      url: ${env.DEH_SAFETY_URL}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMA_URL}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
 inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: tgi0
  model_type: llm
 - metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: tgi1
  model_type: llm
 - metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
 shields:
 - shield_id: ${env.SAFETY_MODEL}
 vector_dbs: []
 datasets: []
 scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
  provider_id: brave-search
 - toolgroup_id: builtin::rag
  provider_id: rag-runtime
 server:
  port: 8321
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@ -0,0 +1,122 @@
 version: 2
 image_name: dell
 apis:
 - agents
 - datasetio
 - eval
 - inference
 - safety
 - scoring
 - telemetry
 - tool_runtime
 - vector_io
 providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: ${env.DEH_URL}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMA_URL}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db
 inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: tgi0
  model_type: llm
 - metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
  provider_id: brave-search
 - toolgroup_id: builtin::rag
  provider_id: rag-runtime
 server:
  port: 8321
--- a/tests/unit/rag/test_rag_query.py
+++ b/tests/unit/rag/test_rag_query.py
@ -65,7 +65,15 @@ class TestRagQuery:
            RAGQueryConfig(mode="invalid_mode")
    async def test_query_accepts_valid_modes(self):
-        RAGQueryConfig()  # Test default (vector)
+        default_config = RAGQueryConfig()  # Test default (vector)
-        RAGQueryConfig(mode="vector")  # Test vector
+        assert default_config.mode == "vector"
-        RAGQueryConfig(mode="keyword")  # Test keyword
+        vector_config = RAGQueryConfig(mode="vector")  # Test vector
-        RAGQueryConfig(mode="hybrid")  # Test hybrid
+        assert vector_config.mode == "vector"
        keyword_config = RAGQueryConfig(mode="keyword")  # Test keyword
        assert keyword_config.mode == "keyword"
        hybrid_config = RAGQueryConfig(mode="hybrid")  # Test hybrid
        assert hybrid_config.mode == "hybrid"
        # Test that invalid mode raises an error
        with pytest.raises(ValueError):
            RAGQueryConfig(mode="wrong_mode")
--- a/tests/unit/server/test_access_control.py
+++ b/tests/unit/server/test_access_control.py
@ -285,9 +285,15 @@ async def test_access_policy(mock_get_authenticated_user, test_setup_with_access
            "projects": ["foo", "bar"],
        },
    )
-    await routing_table.register_model("model-1", provider_id="test_provider")
+    await routing_table.register_model(
-    await routing_table.register_model("model-2", provider_id="test_provider")
+        "model-1", provider_model_id="test_provider/model-1", provider_id="test_provider"
-    await routing_table.register_model("model-3", provider_id="test_provider")
+    )
    await routing_table.register_model(
        "model-2", provider_model_id="test_provider/model-2", provider_id="test_provider"
    )
    await routing_table.register_model(
        "model-3", provider_model_id="test_provider/model-3", provider_id="test_provider"
    )
    model = await routing_table.get_model("model-1")
    assert model.identifier == "model-1"
    model = await routing_table.get_model("model-2")