Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-23 18:34:01 +00:00)

Merge branch 'main' into rm-faiss-file

Commit c30e343fbb

9 changed files with 102 additions and 4 deletions
docs/_static/llama-stack-spec.html (vendored, 13 changes)

@@ -14796,7 +14796,8 @@
       "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
     },
     "mode": {
-      "type": "string",
+      "$ref": "#/components/schemas/RAGSearchMode",
+      "default": "vector",
       "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
     },
     "ranker": {
@@ -14831,6 +14832,16 @@
       }
     }
   },
+  "RAGSearchMode": {
+    "type": "string",
+    "enum": [
+      "vector",
+      "keyword",
+      "hybrid"
+    ],
+    "title": "RAGSearchMode",
+    "description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
+  },
   "RRFRanker": {
     "type": "object",
     "properties": {
docs/_static/llama-stack-spec.yaml (vendored, 14 changes)

@@ -10346,7 +10346,8 @@ components:
         content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
         {chunk.content}\nMetadata: {metadata}\n"
       mode:
-        type: string
+        $ref: '#/components/schemas/RAGSearchMode'
+        default: vector
         description: >-
           Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
           "vector".
@@ -10373,6 +10374,17 @@ components:
       mapping:
         default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
         llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
+    RAGSearchMode:
+      type: string
+      enum:
+        - vector
+        - keyword
+        - hybrid
+      title: RAGSearchMode
+      description: >-
+        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
+        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
+        - HYBRID: Combines both vector and keyword search for better results
    RRFRanker:
      type: object
      properties:
@@ -145,6 +145,10 @@ $ llama stack build --template starter
 ...
 You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
 ```
 
+```{tip}
+The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
+```
+
 :::
 :::{tab-item} Building from Scratch
@@ -2,6 +2,10 @@
 
 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
 
+```{note}
+The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
+```
+
 ```{dropdown} 👋 Click here for a Sample Configuration File
 
 ```yaml
docs/source/distributions/customizing_run_yaml.md (new file, 40 lines)

@@ -0,0 +1,40 @@
+# Customizing run.yaml Files
+
+The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
+
+## Key Points
+
+- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
+- **Customization expected**: Update URLs, credentials, models, and settings for your environment
+- **Version control separately**: Keep customized configs in your own repository
+- **Environment-specific**: Create different configurations for dev, staging, production
+
+## What You Can Customize
+
+You can customize:
+- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
+- **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
+- **Storage paths**: Move from `/tmp/` to production directories
+- **Authentication**: Add API keys, SSL, timeouts
+- **Models**: Different model sizes for dev vs prod
+- **Database settings**: Switch from SQLite to PostgreSQL
+- **Tool configurations**: Add custom tools and integrations
+
+## Best Practices
+
+- Use environment variables for secrets and environment-specific values
+- Create separate `run.yaml` files for different environments (dev, staging, prod)
+- Document your changes with comments
+- Test configurations before deployment
+- Keep your customized configs in version control
+
+Example structure:
+```
+your-project/
+├── configs/
+│   ├── dev-run.yaml
+│   ├── prod-run.yaml
+└── README.md
+```
+
+The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
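The "use environment variables for secrets" practice in the new doc can be made concrete with a small helper. The sketch below is illustrative only: the `${env.NAME}` placeholder syntax and the `load_run_yaml` helper are assumptions, not part of this diff, and it assumes PyYAML is installed.

```python
# Hypothetical sketch: expand ${env.NAME} placeholders in a run.yaml
# before loading it, so secrets stay out of version control.
import os
import re

import yaml  # assumes PyYAML is installed


def load_run_yaml(path: str) -> dict:
    """Read a run.yaml file, substituting ${env.NAME} with os.environ['NAME']."""
    with open(path) as f:
        text = f.read()

    def _lookup(match: re.Match) -> str:
        # Fail loudly on missing variables rather than silently inserting "".
        name = match.group(1)
        if name not in os.environ:
            raise KeyError(f"run.yaml references unset environment variable: {name}")
        return os.environ[name]

    return yaml.safe_load(re.sub(r"\$\{env\.(\w+)\}", _lookup, text))
```

Resolving placeholders at load time lets the `dev-run.yaml` and `prod-run.yaml` files from the example structure share the same shape while each environment supplies its own values.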
@@ -9,6 +9,7 @@ This section provides an overview of the distributions available in Llama Stack.
 
 importing_as_library
 configuration
+customizing_run_yaml
 list_of_distributions
 kubernetes_deployment
 building_distro
@@ -54,7 +54,7 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
 You can use Python to build and run the Llama Stack server, which is useful for testing and development.
 
 Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
-which defines the providers and their settings.
+which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
 Now let's build and run the Llama Stack config for Ollama.
 We use `starter` as template. By default all providers are disabled, this requires enable ollama by passing environment variables.
@@ -87,6 +87,20 @@ class RAGQueryGenerator(Enum):
     custom = "custom"
 
 
+@json_schema_type
+class RAGSearchMode(Enum):
+    """
+    Search modes for RAG query retrieval:
+    - VECTOR: Uses vector similarity search for semantic matching
+    - KEYWORD: Uses keyword-based search for exact matching
+    - HYBRID: Combines both vector and keyword search for better results
+    """
+
+    VECTOR = "vector"
+    KEYWORD = "keyword"
+    HYBRID = "hybrid"
+
+
 @json_schema_type
 class DefaultRAGQueryGeneratorConfig(BaseModel):
     type: Literal["default"] = "default"
@@ -128,7 +142,7 @@ class RAGQueryConfig(BaseModel):
     max_tokens_in_context: int = 4096
     max_chunks: int = 5
     chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: str | None = None
+    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
     ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode
 
     @field_validator("chunk_template")
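The net effect of the change above is that `RAGQueryConfig.mode` is a typed enum defaulting to vector search instead of an unvalidated optional string. A minimal sketch of the resulting behavior, inferred from the new unit tests below; it assumes `RAGSearchMode` is importable from the same module as `RAGQueryConfig` (the diff defines them side by side):

```python
# Quick sketch, not part of the diff: exercising the new typed field.
from llama_stack.apis.tools.rag_tool import RAGQueryConfig, RAGSearchMode

# The default is now vector search rather than an unset string.
config = RAGQueryConfig()
assert config.mode == RAGSearchMode.VECTOR

# Pydantic validates enum fields by value, so plain strings still work.
hybrid = RAGQueryConfig(mode="hybrid")
assert hybrid.mode == RAGSearchMode.HYBRID

# Values outside vector/keyword/hybrid now fail at construction time.
try:
    RAGQueryConfig(mode="invalid_mode")
except ValueError:  # pydantic's ValidationError subclasses ValueError
    print("invalid modes are rejected")
```

This is exactly the behavior the added tests in the next file pin down.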
@@ -8,6 +8,7 @@ from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 
+from llama_stack.apis.tools.rag_tool import RAGQueryConfig
 from llama_stack.apis.vector_io import (
     Chunk,
     ChunkMetadata,
@@ -58,3 +59,14 @@ class TestRagQuery:
         )
         assert expected_metadata_string in result.content[1].text
         assert result.content is not None
+
+    async def test_query_raises_incorrect_mode(self):
+        with pytest.raises(ValueError):
+            RAGQueryConfig(mode="invalid_mode")
+
+    @pytest.mark.asyncio
+    async def test_query_accepts_valid_modes(self):
+        RAGQueryConfig()  # Test default (vector)
+        RAGQueryConfig(mode="vector")  # Test vector
+        RAGQueryConfig(mode="keyword")  # Test keyword
+        RAGQueryConfig(mode="hybrid")  # Test hybrid