Merge branch 'main' into content-extension

2025-12-18 19:19:47 +00:00 · 2025-09-07 12:38:35 -06:00 · 2025-09-07 12:38:35 -06:00 · 354ed48598
commit 354ed48598
parent 4c1f187c71 78cab5331a
227 changed files with 21224 additions and 10798 deletions
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@ -5,10 +5,10 @@
 # the root directory of this source tree.

 from enum import StrEnum
-from typing import Annotated, Literal, Protocol, runtime_checkable
+from typing import Annotated, ClassVar, Literal, Protocol, runtime_checkable

 from fastapi import File, Form, Response, UploadFile
-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from llama_stack.apis.common.responses import Order
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -49,6 +49,23 @@ class OpenAIFileObject(BaseModel):
    purpose: OpenAIFilePurpose


+@json_schema_type
+class ExpiresAfter(BaseModel):
+    """
+    Control expiration of uploaded files.
+
+    Params:
+     - anchor, must be "created_at"
+     - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+    """
+
+    MIN: ClassVar[int] = 3600  # 1 hour
+    MAX: ClassVar[int] = 2592000  # 30 days
+
+    anchor: Literal["created_at"]
+    seconds: int = Field(..., ge=3600, le=2592000)
+
+
@json_schema_type
 class ListOpenAIFileResponse(BaseModel):
    """
@ -92,6 +109,9 @@ class Files(Protocol):
        self,
        file: Annotated[UploadFile, File()],
        purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
+        # TODO: expires_after is producing strange openapi spec, params are showing up as a required w/ oneOf being null
    ) -> OpenAIFileObject:
        """
        Upload a file that can be used across various endpoints.
@ -99,6 +119,7 @@ class Files(Protocol):
        The file upload should be a multipart form request with:
        - file: The File object (not file name) to be uploaded.
        - purpose: The intended purpose of the uploaded file.
+        - expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).

        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
        :param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune").
--- a/llama_stack/core/resolver.py
+++ b/llama_stack/core/resolver.py
@ -284,7 +284,15 @@ async def instantiate_providers(
        if provider.provider_id is None:
            continue

-        deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        try:
+            deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        except KeyError as e:
+            missing_api = e.args[0]
+            raise RuntimeError(
+                f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
+                f"required dependency '{missing_api.value}' is not available. "
+                f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
+            ) from e
        for a in provider.spec.optional_api_dependencies:
            if a in impls:
                deps[a] = impls[a]
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@ -527,7 +527,7 @@ class InferenceRouter(Inference):

        # Store the response with the ID that will be returned to the client
        if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))

        if self.telemetry:
            metrics = self._construct_metrics(
@ -755,7 +755,7 @@ class InferenceRouter(Inference):
                            choices_data[idx] = {
                                "content_parts": [],
                                "tool_calls_builder": {},
-                                "finish_reason": None,
+                                "finish_reason": "stop",
                                "logprobs_content_parts": [],
                            }
                        current_choice_data = choices_data[idx]
@ -855,4 +855,4 @@ class InferenceRouter(Inference):
                    object="chat.completion",
                )
                logger.debug(f"InferenceRouter.completion_response: {final_response}")
-                await self.store.store_chat_completion(final_response, messages)
+                asyncio.create_task(self.store.store_chat_completion(final_response, messages))
--- a/llama_stack/core/routing_tables/vector_dbs.py
+++ b/llama_stack/core/routing_tables/vector_dbs.py
@ -52,7 +52,6 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
        provider_vector_db_id: str | None = None,
        vector_db_name: str | None = None,
    ) -> VectorDB:
-        provider_vector_db_id = provider_vector_db_id or vector_db_id
        if provider_id is None:
            if len(self.impls_by_provider_id) > 0:
                provider_id = list(self.impls_by_provider_id.keys())[0]
@ -69,14 +68,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
            raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
        if "embedding_dimension" not in model.metadata:
            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+
+        provider = self.impls_by_provider_id[provider_id]
+        logger.warning(
+            "VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
+        )
+        vector_store = await provider.openai_create_vector_store(
+            name=vector_db_name or vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=model.metadata["embedding_dimension"],
+            provider_id=provider_id,
+            provider_vector_db_id=provider_vector_db_id,
+        )
+
+        vector_store_id = vector_store.id
+        actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
+        logger.warning(
+            f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
+        )
+
        vector_db_data = {
-            "identifier": vector_db_id,
+            "identifier": vector_store_id,
            "type": ResourceType.vector_db.value,
            "provider_id": provider_id,
-            "provider_resource_id": provider_vector_db_id,
+            "provider_resource_id": actual_provider_vector_db_id,
            "embedding_model": embedding_model,
            "embedding_dimension": model.metadata["embedding_dimension"],
-            "vector_db_name": vector_db_name,
+            "vector_db_name": vector_store.name,
        }
        vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
        await self.register_object(vector_db)
--- a/llama_stack/core/server/server.py
+++ b/llama_stack/core/server/server.py
@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
            },
        )
    elif isinstance(exc, ConflictError):
-        return HTTPException(status_code=409, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
    elif isinstance(exc, ResourceNotFoundError):
-        return HTTPException(status_code=404, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
    elif isinstance(exc, ValueError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
    elif isinstance(exc, BadRequestError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
    elif isinstance(exc, PermissionError | AccessDeniedError):
        return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+    elif isinstance(exc, ConnectionError | httpx.ConnectError):
+        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
    elif isinstance(exc, NotImplementedError):
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@ -105,12 +105,12 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):

        method = getattr(impls[api], register_method)
        for obj in objects:
-            logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
-
-            # Do not register models on disabled providers
-            if hasattr(obj, "provider_id") and (not obj.provider_id or obj.provider_id == "__disabled__"):
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
-                continue
+            if hasattr(obj, "provider_id"):
+                # Do not register models on disabled providers
+                if not obj.provider_id or obj.provider_id == "__disabled__":
+                    logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
+                    continue
+                logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")

            # we want to maintain the type information in arguments to method.
            # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
--- a/llama_stack/distributions/ci-tests/ci_tests.py
+++ b/llama_stack/distributions/ci-tests/ci_tests.py
@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut


 def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "ci-tests"
-    template.name = name
+    template = get_starter_distribution_template(name="ci-tests")
    template.description = "CI tests for Llama Stack"

    return template
--- a/llama_stack/distributions/ci-tests/run.yaml
+++ b/llama_stack/distributions/ci-tests/run.yaml
@ -89,28 +89,28 @@ providers:
    config:
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
-      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
@ -121,15 +121,15 @@ providers:
      password: ${env.PGVECTOR_PASSWORD:=}
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
      metadata_store:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@ -134,6 +134,11 @@ models:
  provider_id: nvidia
  provider_model_id: meta/llama-3.3-70b-instruct
  model_type: llm
+- metadata: {}
+  model_id: nvidia/vila
+  provider_id: nvidia
+  provider_model_id: nvidia/vila
+  model_type: llm
 - metadata:
    embedding_dimension: 2048
    context_length: 8192
--- a/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/llama_stack/distributions/open-benchmark/open_benchmark.py
@ -43,7 +43,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
            "openai",
            [
                ProviderModelEntry(
-                    provider_model_id="openai/gpt-4o",
+                    provider_model_id="gpt-4o",
                    model_type=ModelType.llm,
                )
            ],
@ -53,7 +53,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
            "anthropic",
            [
                ProviderModelEntry(
-                    provider_model_id="anthropic/claude-3-5-sonnet-latest",
+                    provider_model_id="claude-3-5-sonnet-latest",
                    model_type=ModelType.llm,
                )
            ],
@ -206,13 +206,6 @@ def get_distribution_template() -> DistributionTemplate:
                uri="huggingface://datasets/llamastack/math_500?split=test",
            ),
        ),
-        DatasetInput(
-            dataset_id="bfcl",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
-            ),
-        ),
        DatasetInput(
            dataset_id="ifeval",
            purpose=DatasetPurpose.eval_messages_answer,
@ -250,11 +243,6 @@ def get_distribution_template() -> DistributionTemplate:
            dataset_id="math_500",
            scoring_functions=["basic::regex_parser_math_response"],
        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-bfcl",
-            dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
-        ),
        BenchmarkInput(
            benchmark_id="meta-reference-ifeval",
            dataset_id="ifeval",
--- a/llama_stack/distributions/open-benchmark/run.yaml
+++ b/llama_stack/distributions/open-benchmark/run.yaml
@ -136,14 +136,14 @@ inference_store:
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/inference_store.db
 models:
 - metadata: {}
-  model_id: openai/gpt-4o
+  model_id: gpt-4o
  provider_id: openai
-  provider_model_id: openai/gpt-4o
+  provider_model_id: gpt-4o
  model_type: llm
 - metadata: {}
-  model_id: anthropic/claude-3-5-sonnet-latest
+  model_id: claude-3-5-sonnet-latest
  provider_id: anthropic
-  provider_model_id: anthropic/claude-3-5-sonnet-latest
+  provider_model_id: claude-3-5-sonnet-latest
  model_type: llm
 - metadata: {}
  model_id: gemini/gemini-1.5-flash
@ -188,12 +188,6 @@ datasets:
    uri: huggingface://datasets/llamastack/math_500?split=test
  metadata: {}
  dataset_id: math_500
- purpose: eval/messages-answer
-  source:
-    type: uri
-    uri: huggingface://datasets/llamastack/bfcl_v3?split=train
-  metadata: {}
-  dataset_id: bfcl
 - purpose: eval/messages-answer
  source:
    type: uri
@ -228,11 +222,6 @@ benchmarks:
  - basic::regex_parser_math_response
  metadata: {}
  benchmark_id: meta-reference-math-500
- dataset_id: bfcl
-  scoring_functions:
-  - basic::bfcl
-  metadata: {}
-  benchmark_id: meta-reference-bfcl
 - dataset_id: ifeval
  scoring_functions:
  - basic::ifeval
--- a/llama_stack/distributions/starter-gpu/run.yaml
+++ b/llama_stack/distributions/starter-gpu/run.yaml
@ -89,28 +89,28 @@ providers:
    config:
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
-      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
@ -121,15 +121,15 @@ providers:
      password: ${env.PGVECTOR_PASSWORD:=}
      kvstore:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
      metadata_store:
        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
--- a/llama_stack/distributions/starter-gpu/starter_gpu.py
+++ b/llama_stack/distributions/starter-gpu/starter_gpu.py
@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut


 def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "starter-gpu"
-    template.name = name
+    template = get_starter_distribution_template(name="starter-gpu")
    template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."

    template.providers["post_training"] = [
--- a/llama_stack/distributions/starter/starter.py
+++ b/llama_stack/distributions/starter/starter.py
@ -99,9 +99,8 @@ def get_remote_inference_providers() -> list[Provider]:
    return inference_providers


-def get_distribution_template() -> DistributionTemplate:
+def get_distribution_template(name: str = "starter") -> DistributionTemplate:
    remote_inference_providers = get_remote_inference_providers()
-    name = "starter"

    providers = {
        "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
--- a/llama_stack/providers/inline/batches/reference/batches.py
+++ b/llama_stack/providers/inline/batches/reference/batches.py
@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):

        # TODO: set expiration time for garbage collection

-        if endpoint not in ["/v1/chat/completions"]:
+        if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
            raise ValueError(
-                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint",
+                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
            )

        if completion_window != "24h":
@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
                            )
                            valid = False

-                        for param, expected_type, type_string in [
-                            ("model", str, "a string"),
-                            # messages is specific to /v1/chat/completions
-                            # we could skip validating messages here and let inference fail. however,
-                            # that would be a very expensive way to find out messages is wrong.
-                            ("messages", list, "an array"),  # TODO: allow messages to be a string?
-                        ]:
+                        if batch.endpoint == "/v1/chat/completions":
+                            required_params = [
+                                ("model", str, "a string"),
+                                # messages is specific to /v1/chat/completions
+                                # we could skip validating messages here and let inference fail. however,
+                                # that would be a very expensive way to find out messages is wrong.
+                                ("messages", list, "an array"),  # TODO: allow messages to be a string?
+                            ]
+                        else:  # /v1/completions
+                            required_params = [
+                                ("model", str, "a string"),
+                                ("prompt", str, "a string"),  # TODO: allow prompt to be a list of strings??
+                            ]
+
+                        for param, expected_type, type_string in required_params:
                            if param not in body:
                                errors.append(
                                    BatchError(
@ -591,20 +599,37 @@ class ReferenceBatchesImpl(Batches):

        try:
            # TODO(SECURITY): review body for security issues
-            request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
-            chat_response = await self.inference_api.openai_chat_completion(**request.body)
+            if request.url == "/v1/chat/completions":
+                request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
+                chat_response = await self.inference_api.openai_chat_completion(**request.body)

-            # this is for mypy, we don't allow streaming so we'll get the right type
-            assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
-            return {
-                "id": request_id,
-                "custom_id": request.custom_id,
-                "response": {
-                    "status_code": 200,
-                    "request_id": request_id,  # TODO: should this be different?
-                    "body": chat_response.model_dump_json(),
-                },
-            }
+                # this is for mypy, we don't allow streaming so we'll get the right type
+                assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
+                return {
+                    "id": request_id,
+                    "custom_id": request.custom_id,
+                    "response": {
+                        "status_code": 200,
+                        "request_id": request_id,  # TODO: should this be different?
+                        "body": chat_response.model_dump_json(),
+                    },
+                }
+            else:  # /v1/completions
+                completion_response = await self.inference_api.openai_completion(**request.body)
+
+                # this is for mypy, we don't allow streaming so we'll get the right type
+                assert hasattr(completion_response, "model_dump_json"), (
+                    "Completion response must have model_dump_json method"
+                )
+                return {
+                    "id": request_id,
+                    "custom_id": request.custom_id,
+                    "response": {
+                        "status_code": 200,
+                        "request_id": request_id,
+                        "body": completion_response.model_dump_json(),
+                    },
+                }
        except Exception as e:
            logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
            return {
--- a/llama_stack/providers/inline/files/localfs/files.py
+++ b/llama_stack/providers/inline/files/localfs/files.py
@ -86,11 +86,16 @@ class LocalfsFilesImpl(Files):
        self,
        file: Annotated[UploadFile, File()],
        purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
    ) -> OpenAIFileObject:
        """Upload a file that can be used across various endpoints."""
        if not self.sql_store:
            raise RuntimeError("Files provider not initialized")

+        if expires_after_anchor is not None or expires_after_seconds is not None:
+            raise NotImplementedError("File expiration is not supported by this provider")
+
        file_id = self._generate_file_id()
        file_path = self._get_file_path(file_id)

--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@ -22,7 +22,6 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 )

 from .config import BasicScoringConfig
-from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
 from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
 from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
@ -37,7 +36,6 @@ FIXED_FNS = [
    SubsetOfScoringFn,
    RegexParserScoringFn,
    RegexParserMathResponseScoringFn,
-    BFCLScoringFn,
    IfEvalScoringFn,
    DocVQAScoringFn,
 ]
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import re
-from typing import Any
-
-from llama_stack.apis.scoring import ScoringResultRow
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-
-from ..utils.bfcl.ast_parser import decode_ast
-from ..utils.bfcl.checker import ast_checker, is_empty_output
-from .fn_defs.bfcl import bfcl
-
-
-def postprocess(x: dict[str, Any], test_category: str) -> dict[str, Any]:
-    contain_func_call = False
-    error = None
-    error_type = None
-    checker_result = {}
-    try:
-        prediction = decode_ast(x["generated_answer"], x["language"]) or ""
-        contain_func_call = True
-        # if not is_function_calling_format_output(prediction):
-        if is_empty_output(prediction):
-            contain_func_call = False
-            error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability."
-            error_type = "ast_decoder:decoder_wrong_output_format"
-        else:
-            checker_result = ast_checker(
-                json.loads(x["function"]),
-                prediction,
-                json.loads(x["ground_truth"]),
-                x["language"],
-                test_category=test_category,
-                model_name="",
-            )
-    except Exception as e:
-        prediction = ""
-        error = f"Invalid syntax. Failed to decode AST. {str(e)}"
-        error_type = "ast_decoder:decoder_failed"
-    return {
-        "prediction": prediction,
-        "contain_func_call": contain_func_call,
-        "valid": checker_result.get("valid", False),
-        "error": error or checker_result.get("error", ""),
-        "error_type": error_type or checker_result.get("error_type", ""),
-    }
-
-
-def gen_valid(x: dict[str, Any]) -> dict[str, float]:
-    return {"valid": x["valid"]}
-
-
-def gen_relevance_acc(x: dict[str, Any]) -> dict[str, float]:
-    # This function serves for both relevance and irrelevance tests, which share the exact opposite logic.
-    # If `test_category` is "irrelevance", the model is expected to output no function call.
-    # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`).
-    # If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call.
-    acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"]
-    return {"valid": float(acc)}
-
-
-class BFCLScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn for BFCL
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            bfcl.identifier: bfcl,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "bfcl",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
-        score_result = postprocess(input_row, test_category)
-        if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
-            score = gen_relevance_acc(score_result)["valid"]
-        else:
-            score = gen_valid(score_result)["valid"]
-        return {
-            "score": float(score),
-        }
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py
@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.common.type_system import NumberType
-from llama_stack.apis.scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    ScoringFn,
-)
-
-bfcl = ScoringFn(
-    identifier="basic::bfcl",
-    description="BFCL complex scoring",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="bfcl",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/init.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
@ -1,296 +0,0 @@
-# ruff: noqa
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import ast
-
-from .tree_sitter import get_parser
-
-
-def parse_java_function_call(source_code):
-    if not source_code.endswith(";"):
-        source_code += ";"  # Necessary for the parser not to register an error
-    parser = get_parser("java")
-    tree = parser.parse(bytes(source_code, "utf8"))
-    root_node = tree.root_node
-
-    if root_node.has_error:
-        raise Exception("Error parsing java the source code.")
-
-    def get_text(node):
-        """Returns the text represented by the node."""
-        return source_code[node.start_byte : node.end_byte]
-
-    def traverse_node(node, nested=False):
-        if node.type == "string_literal":
-            if nested:
-                return get_text(node)
-            # Strip surrounding quotes from string literals
-            return get_text(node)[1:-1]
-        elif node.type == "character_literal":
-            if nested:
-                return get_text(node)
-            # Strip surrounding single quotes from character literals
-            return get_text(node)[1:-1]
-        """Traverse the node to collect texts for complex structures."""
-        if node.type in [
-            "identifier",
-            "class_literal",
-            "type_identifier",
-            "method_invocation",
-        ]:
-            return get_text(node)
-        elif node.type == "array_creation_expression":
-            # Handle array creation expression specifically
-            type_node = node.child_by_field_name("type")
-            value_node = node.child_by_field_name("value")
-            type_text = traverse_node(type_node, True)
-            value_text = traverse_node(value_node, True)
-            return f"new {type_text}[]{value_text}"
-        elif node.type == "object_creation_expression":
-            # Handle object creation expression specifically
-            type_node = node.child_by_field_name("type")
-            arguments_node = node.child_by_field_name("arguments")
-            type_text = traverse_node(type_node, True)
-            if arguments_node:
-                # Process each argument carefully, avoiding unnecessary punctuation
-                argument_texts = []
-                for child in arguments_node.children:
-                    if child.type not in [
-                        ",",
-                        "(",
-                        ")",
-                    ]:  # Exclude commas and parentheses
-                        argument_text = traverse_node(child, True)
-                        argument_texts.append(argument_text)
-                arguments_text = ", ".join(argument_texts)
-                return f"new {type_text}({arguments_text})"
-            else:
-                return f"new {type_text}()"
-        elif node.type == "set":
-            # Handling sets specifically
-            items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
-            return "{" + ", ".join(items) + "}"
-
-        elif node.child_count > 0:
-            return "".join(traverse_node(child, True) for child in node.children)
-        else:
-            return get_text(node)
-
-    def extract_arguments(args_node):
-        arguments = {}
-        for child in args_node.children:
-            if child.type == "assignment_expression":
-                # For named parameters
-                name_node, value_node = child.children[0], child.children[2]
-                name = get_text(name_node)
-                value = traverse_node(value_node)
-                if name in arguments:
-                    if not isinstance(arguments[name], list):
-                        arguments[name] = [arguments[name]]
-                    arguments[name].append(value)
-                else:
-                    arguments[name] = value
-                # arguments.append({'name': name, 'value': value})
-            elif child.type in ["identifier", "class_literal", "set"]:
-                # For unnamed parameters and handling sets
-                value = traverse_node(child)
-                if None in arguments:
-                    if not isinstance(arguments[None], list):
-                        arguments[None] = [arguments[None]]
-                    arguments[None].append(value)
-                else:
-                    arguments[None] = value
-        return arguments
-
-    def traverse(node):
-        if node.type == "method_invocation":
-            # Extract the function name and its arguments
-            method_name = get_text(node.child_by_field_name("name"))
-            class_name_node = node.child_by_field_name("object")
-            if class_name_node:
-                class_name = get_text(class_name_node)
-                function_name = f"{class_name}.{method_name}"
-            else:
-                function_name = method_name
-            arguments_node = node.child_by_field_name("arguments")
-            if arguments_node:
-                arguments = extract_arguments(arguments_node)
-                for key, value in arguments.items():
-                    if isinstance(value, list):
-                        raise Exception("Error: Multiple arguments with the same name are not supported.")
-                return [{function_name: arguments}]
-
-        else:
-            for child in node.children:
-                result = traverse(child)
-                if result:
-                    return result
-
-    result = traverse(root_node)
-    return result if result else {}
-
-
-def parse_javascript_function_call(source_code):
-    if not source_code.endswith(";"):
-        source_code += ";"  # Necessary for the parser not to register an error
-    parser = get_parser("javascript")
-    # Parse the source code
-    tree = parser.parse(bytes(source_code, "utf8"))
-    root_node = tree.root_node
-    if root_node.has_error:
-        raise Exception("Error js parsing the source code.")
-
-    # Function to recursively extract argument details
-    def extract_arguments(node):
-        args = {}
-        for child in node.children:
-            if child.type == "assignment_expression":
-                # Extract left (name) and right (value) parts of the assignment
-                name = child.children[0].text.decode("utf-8")
-                value = child.children[2].text.decode("utf-8")
-                if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
-                    value = value[1:-1]  # Trim the quotation marks
-                if name in args:
-                    if not isinstance(args[name], list):
-                        args[name] = [args[name]]
-                    args[name].append(value)
-                else:
-                    args[name] = value
-
-            elif child.type == "identifier" or child.type == "true":
-                # Handle non-named arguments and boolean values
-                value = child.text.decode("utf-8")
-                if None in args:
-                    if not isinstance(args[None], list):
-                        args[None] = [args[None]]
-                    args[None].append(value)
-                else:
-                    args[None] = value
-        return args
-
-    # Find the function call and extract its name and arguments
-    if root_node.type == "program":
-        for child in root_node.children:
-            if child.type == "expression_statement":
-                for sub_child in child.children:
-                    if sub_child.type == "call_expression":
-                        function_name = sub_child.children[0].text.decode("utf8")
-                        arguments_node = sub_child.children[1]
-                        parameters = extract_arguments(arguments_node)
-                        for key, value in parameters.items():
-                            if isinstance(value, list):
-                                raise Exception("Error: Multiple arguments with the same name are not supported.")
-                        result = [{function_name: parameters}]
-                        return result
-
-
-def ast_parse(input_str, language="Python"):
-    if language == "Python":
-        cleaned_input = input_str.strip("[]'")
-        parsed = ast.parse(cleaned_input, mode="eval")
-        extracted = []
-        if isinstance(parsed.body, ast.Call):
-            extracted.append(resolve_ast_call(parsed.body))
-        else:
-            for elem in parsed.body.elts:
-                extracted.append(resolve_ast_call(elem))
-        return extracted
-    elif language == "Java":
-        return parse_java_function_call(input_str[1:-1])  # Remove the [ and ] from the string
-    elif language == "JavaScript":
-        return parse_javascript_function_call(input_str[1:-1])
-    else:
-        raise NotImplementedError(f"Unsupported language: {language}")
-
-
-def resolve_ast_call(elem):
-    # Handle nested attributes for deeply nested module paths
-    func_parts = []
-    func_part = elem.func
-    while isinstance(func_part, ast.Attribute):
-        func_parts.append(func_part.attr)
-        func_part = func_part.value
-    if isinstance(func_part, ast.Name):
-        func_parts.append(func_part.id)
-    func_name = ".".join(reversed(func_parts))
-    args_dict = {}
-    # Parse when args are simply passed as an unnamed dictionary arg
-    for arg in elem.args:
-        if isinstance(arg, ast.Dict):
-            for key, value in zip(arg.keys, arg.values):
-                if isinstance(key, ast.Constant):
-                    arg_name = key.value
-                output = resolve_ast_by_type(value)
-                args_dict[arg_name] = output
-    for arg in elem.keywords:
-        output = resolve_ast_by_type(arg.value)
-        args_dict[arg.arg] = output
-    return {func_name: args_dict}
-
-
-def resolve_ast_by_type(value):
-    if isinstance(value, ast.Constant):
-        if value.value is Ellipsis:
-            output = "..."
-        else:
-            output = value.value
-    elif isinstance(value, ast.UnaryOp):
-        output = -value.operand.value
-    elif isinstance(value, ast.List):
-        output = [resolve_ast_by_type(v) for v in value.elts]
-    elif isinstance(value, ast.Dict):
-        output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
-    elif isinstance(value, ast.NameConstant):  # Added this condition to handle boolean values
-        output = value.value
-    elif isinstance(value, ast.BinOp):  # Added this condition to handle function calls as arguments
-        output = eval(ast.unparse(value))
-    elif isinstance(value, ast.Name):
-        output = value.id
-    elif isinstance(value, ast.Call):
-        if len(value.keywords) == 0:
-            output = ast.unparse(value)
-        else:
-            output = resolve_ast_call(value)
-    elif isinstance(value, ast.Tuple):
-        output = tuple(resolve_ast_by_type(v) for v in value.elts)
-    elif isinstance(value, ast.Lambda):
-        output = eval(ast.unparse(value.body[0].value))
-    elif isinstance(value, ast.Ellipsis):
-        output = "..."
-    elif isinstance(value, ast.Subscript):
-        try:
-            output = ast.unparse(value.body[0].value)
-        except:
-            output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]"
-    else:
-        raise Exception(f"Unsupported AST type: {type(value)}")
-    return output
-
-
-def decode_ast(result, language="Python"):
-    func = result
-    func = func.replace("\n", "")  # remove new line characters
-    if not func.startswith("["):
-        func = "[" + func
-    if not func.endswith("]"):
-        func = func + "]"
-    decoded_output = ast_parse(func, language)
-    return decoded_output
-
-
-def decode_execute(result):
-    func = result
-    func = func.replace("\n", "")  # remove new line characters
-    if not func.startswith("["):
-        func = "[" + func
-    if not func.endswith("]"):
-        func = func + "]"
-    decode_output = ast_parse(func)
-    execution_list = []
-    for function_call in decode_output:
-        for key, value in function_call.items():
-            execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})")
-    return execution_list
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
@ -1,989 +0,0 @@
-# ruff: noqa
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import json
-import re
-import time
-from typing import Any
-
-# Comment out for now until we actually use the rest checker in evals
-# import requests  # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function.
-
-
-class NoAPIKeyError(Exception):
-    def __init__(self):
-        self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
-        super().__init__(self.message)
-
-
-REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2
-
-
-JAVA_TYPE_CONVERSION = {
-    "byte": int,
-    "short": int,
-    "integer": int,
-    "float": float,
-    "double": float,
-    "long": int,
-    "boolean": bool,
-    "char": str,
-    "Array": list,
-    "ArrayList": list,
-    "Set": set,
-    "HashMap": dict,
-    "Hashtable": dict,
-    "Queue": list,  # this can be `queue.Queue` as well, for simplicity we check with list
-    "Stack": list,
-    "String": str,
-    "any": str,
-}
-
-JS_TYPE_CONVERSION = {
-    "String": str,
-    "integer": int,
-    "float": float,
-    "Bigint": int,
-    "Boolean": bool,
-    "dict": dict,
-    "array": list,
-    "any": str,
-}
-
-# We switch to conditional import for the following two imports to avoid unnecessary installations.
-# User doesn't need to setup the tree-sitter packages if they are not running the test for that language.
-# from js_type_converter import js_type_converter
-# from java_type_converter import java_type_converter
-
-PYTHON_TYPE_MAPPING = {
-    "string": str,
-    "integer": int,
-    "float": float,
-    "boolean": bool,
-    "array": list,
-    "tuple": list,
-    "dict": dict,
-    "any": str,
-}
-
-# This is the list of types that we need to recursively check its values
-PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
-
-
-NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
-
-
-#### Helper functions for AST ####
-def find_description(func_descriptions, name):
-    if type(func_descriptions) == list:
-        for func_description in func_descriptions:
-            if func_description["name"] == name:
-                return func_description
-        return None
-    else:
-        # it is a dict, there is only one function
-        return func_descriptions
-
-
-def get_possible_answer_type(possible_answer: list):
-    for answer in possible_answer:
-        if answer != "":  # Optional parameter
-            return type(answer)
-    return None
-
-
-def type_checker(
-    param: str,
-    value,
-    possible_answer: list,
-    expected_type_description: str,
-    expected_type_converted,
-    nested_type_converted,
-):
-    # NOTE: This type checker only supports nested type checking for one level deep.
-    # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex.
-
-    result: Any = {
-        "valid": True,
-        "error": [],
-        "is_variable": False,
-        "error_type": "type_error:simple",
-    }
-
-    is_variable = False
-    # check for the case where a variable is used instead of a actual value.
-    # use the type in possible_answer as the expected type
-    possible_answer_type = get_possible_answer_type(possible_answer)
-    # if possible_answer only contains optional parameters, we can't determine the type
-    if possible_answer_type != None:
-        # we are being precise here.
-        # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer
-        if possible_answer_type != expected_type_converted:
-            is_variable = True
-
-    # value is the same type as in function description
-    if type(value) == expected_type_converted:
-        # We don't need to do recursive check for simple types
-        if nested_type_converted == None:
-            result["is_variable"] = is_variable
-            return result
-        else:
-            for possible_answer_item in possible_answer:
-                flag = True  # Each parameter should match to at least one possible answer type.
-                # Here, we assume that each item should be the same type. We could also relax it.
-                if type(possible_answer_item) == list:
-                    for value_item in value:
-                        checker_result = type_checker(
-                            param,
-                            value_item,
-                            possible_answer_item,
-                            str(nested_type_converted),
-                            nested_type_converted,
-                            None,
-                        )
-                        if not checker_result["valid"]:
-                            flag = False
-                            break
-
-                if flag:
-                    return {"valid": True, "error": [], "is_variable": is_variable}
-
-            result["valid"] = False
-            result["error"] = [
-                f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}."
-            ]
-            result["error_type"] = "type_error:nested"
-
-    # value is not as expected, check for the case where a variable is used instead of a actual value
-    # use the type in possible_answer as the expected type
-    possible_answer_type = get_possible_answer_type(possible_answer)
-    # if possible_answer only contains optional parameters, we can't determine the type
-    if possible_answer_type != None:
-        # we are being precise here.
-        # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer
-        if type(value) == possible_answer_type:
-            result["is_variable"] = True
-            return result
-
-    result["valid"] = False
-    result["error"].append(
-        f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
-    )
-    result["error_type"] = "type_error:simple"
-    return result
-
-
-def standardize_string(input_string: str):
-    # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase
-    # It will also convert all the single quotes to double quotes
-    # This is used to compare the model output with the possible answers
-    # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024
-    regex_string = r"[ \,\.\/\-\_\*\^]"
-    return re.sub(regex_string, "", input_string).lower().replace("'", '"')
-
-
-def string_checker(param: str, model_output: str, possible_answer: list):
-    standardize_possible_answer = []
-    standardize_model_output = standardize_string(model_output)
-    for i in range(len(possible_answer)):
-        if type(possible_answer[i]) == str:
-            standardize_possible_answer.append(standardize_string(possible_answer[i]))
-
-    if standardize_model_output not in standardize_possible_answer:
-        return {
-            "valid": False,
-            "error": [
-                f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
-            ],
-            "error_type": "value_error:string",
-        }
-
-    return {"valid": True, "error": []}
-
-
-def list_checker(param: str, model_output: list, possible_answer: list):
-    # Convert the tuple to a list
-
-    standardize_model_output = list(model_output)
-
-    # If the element in the list is a string, we need to standardize it
-    for i in range(len(standardize_model_output)):
-        if type(standardize_model_output[i]) == str:
-            standardize_model_output[i] = standardize_string(model_output[i])
-
-    standardize_possible_answer: Any = []
-    # We also need to standardize the possible answers
-    for i in range(len(possible_answer)):
-        standardize_possible_answer.append([])
-        for j in range(len(possible_answer[i])):
-            if type(possible_answer[i][j]) == str:
-                standardize_possible_answer[i].append(standardize_string(possible_answer[i][j]))
-            else:
-                standardize_possible_answer[i].append(possible_answer[i][j])
-
-    if standardize_model_output not in standardize_possible_answer:
-        return {
-            "valid": False,
-            "error": [
-                f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}."
-            ],
-            "error_type": "value_error:list/tuple",
-        }
-
-    return {"valid": True, "error": []}
-
-
-def dict_checker(param: str, model_output: dict, possible_answers: list):
-    # This function works for simple dictionaries, but not dictionaries with nested dictionaries.
-    # The current dataset only contains simple dictionaries, so this is sufficient.
-
-    result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
-    for i in range(len(possible_answers)):
-        if possible_answers[i] == "":
-            continue
-
-        result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
-
-        flag = True
-
-        possible_answer = possible_answers[i]
-        # possible_anwer is a single dictionary
-
-        for key, value in model_output.items():
-            if key not in possible_answer:
-                result["valid"] = False
-                result["error"].append(f"Unexpected dict key parameter: '{key}'.")  # type: ignore[attr-defined]
-                result["error_type"] = "value_error:dict_key"
-                flag = False
-                break
-
-            standardize_value = value
-            # If the value is a string, we need to standardize it
-            if type(value) == str:
-                standardize_value = standardize_string(value)
-
-            # We also need to standardize the possible answers if they are string
-            standardize_possible_answer = []
-            for i in range(len(possible_answer[key])):
-                if type(possible_answer[key][i]) == str:
-                    standardize_possible_answer.append(standardize_string(possible_answer[key][i]))
-                else:
-                    standardize_possible_answer.append(possible_answer[key][i])
-
-            if standardize_value not in standardize_possible_answer:
-                result["valid"] = False
-                result["error"].append(  # type: ignore[attr-defined]
-                    f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}."
-                )
-                result["error_type"] = "value_error:dict_value"
-                flag = False
-                break
-
-        for key, value in possible_answer.items():
-            if key not in model_output and "" not in value:
-                result["valid"] = False
-                result["error"].append(f"Missing dict key parameter: '{key}'.")  # type: ignore[attr-defined]
-                result["error_type"] = "value_error:dict_key"
-                flag = False
-                break
-
-        if flag:
-            return {"valid": True, "error": []}
-
-    return result
-
-
-def list_dict_checker(param: str, model_output: list, possible_answers: list):
-    # This function takes in a list of dictionaries and checks if each dictionary is valid
-    # The order of the dictionaries in the list must match the order of the possible answers
-
-    result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"}
-
-    for answer_index in range(len(possible_answers)):
-        flag = True  # True means so far, all dictionaries are valid
-
-        # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers
-        if len(model_output) != len(possible_answers[answer_index]):
-            result["valid"] = False
-            result["error"] = ["Wrong number of dictionaries in the list."]
-            result["error_type"] = "value_error:list_dict_count"
-            flag = False
-            continue
-
-        for dict_index in range(len(model_output)):
-            result = dict_checker(
-                param,
-                model_output[dict_index],
-                [possible_answers[answer_index][dict_index]],
-            )
-            if not result["valid"]:
-                flag = False
-                break
-        if flag:
-            return {"valid": True, "error": []}
-
-    return result
-
-
-def simple_function_checker(
-    func_description: dict,
-    model_output: dict,
-    possible_answer: dict,
-    language: str,
-    model_name: str,
-):
-    possible_answer = list(possible_answer.values())[0]
-    # Extract function name and parameters details
-    func_name = func_description["name"]
-    param_details = func_description["parameters"]["properties"]
-    required_params = func_description["parameters"]["required"]
-
-    # Initialize a result dictionary
-    result = {
-        "valid": True,
-        "error": [],
-        "error_type": "simple_function_checker:unclear",
-    }
-
-    # Check if function name matches
-    if func_name not in model_output:
-        result["valid"] = False
-        result["error"].append(  # type: ignore[attr-defined]
-            f"Function name {repr(func_name)} not found in model output."
-        )
-        result["error_type"] = "simple_function_checker:wrong_func_name"
-        return result
-
-    model_params = model_output[func_name]
-
-    # Check for required parameters in model output
-    for param in required_params:
-        if param not in model_params:
-            result["valid"] = False
-            result["error"].append(f"Missing required parameter: {repr(param)}.")  # type: ignore[attr-defined]
-            result["error_type"] = "simple_function_checker:missing_required"
-            return result
-
-    # Validate types and values for each parameter in model output
-    for param, value in model_params.items():
-        if param not in param_details or param not in possible_answer:
-            result["valid"] = False
-            result["error"].append(f"Unexpected parameter: {repr(param)}.")  # type: ignore[attr-defined]
-            result["error_type"] = "simple_function_checker:unexpected_param"
-            return result
-
-        full_param_details = param_details[param]
-        expected_type_description = full_param_details["type"]  # This is a string
-        is_variable = False
-        nested_type_converted = None
-
-        if language == "Java":
-            from evals.utils.bfcl.java_type_converter import java_type_converter
-
-            expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description]
-
-            if expected_type_description in JAVA_TYPE_CONVERSION:
-                if type(value) != str:
-                    result["valid"] = False
-                    result["error"].append(  # type: ignore[attr-defined]
-                        f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
-                    )
-                    result["error_type"] = "type_error:java"
-                    return result
-
-                if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
-                    nested_type = param_details[param]["items"]["type"]
-                    nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
-                    value = java_type_converter(value, expected_type_description, nested_type)
-                else:
-                    value = java_type_converter(value, expected_type_description)
-
-        elif language == "JavaScript":
-            from evals.utils.bfcl.js_type_converter import js_type_converter
-
-            expected_type_converted = JS_TYPE_CONVERSION[expected_type_description]
-
-            if expected_type_description in JS_TYPE_CONVERSION:
-                if type(value) != str:
-                    result["valid"] = False
-                    result["error"].append(  # type: ignore[attr-defined]
-                        f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
-                    )
-                    result["error_type"] = "type_error:js"
-                    return result
-
-                if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
-                    nested_type = param_details[param]["items"]["type"]
-                    nested_type_converted = JS_TYPE_CONVERSION[nested_type]
-                    value = js_type_converter(value, expected_type_description, nested_type)
-                else:
-                    value = js_type_converter(value, expected_type_description)
-
-        elif language == "Python":
-            expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description]
-            if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST:
-                nested_type = param_details[param]["items"]["type"]
-                nested_type_converted = PYTHON_TYPE_MAPPING[nested_type]
-
-        # We convert all tuple value to list when the expected type is tuple.
-        # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load().
-        # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future.
-        if expected_type_description == "tuple" and type(value) == tuple:
-            value = list(value)
-
-        # Allow python auto conversion from int to float
-        if language == "Python" and expected_type_description == "float" and type(value) == int:
-            value = float(value)
-
-        # Type checking
-        # In fact, we only check for Python here.
-        # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct.
-        type_check_result = type_checker(
-            param,
-            value,
-            possible_answer[param],
-            expected_type_description,
-            expected_type_converted,
-            nested_type_converted,
-        )
-        is_variable = type_check_result["is_variable"]
-        if not type_check_result["valid"]:
-            return type_check_result
-
-        # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable.
-        # We can just treat the variable as a string and use the normal flow.
-        if not is_variable:
-            # Special handle for dictionaries
-            if expected_type_converted == dict:
-                result = dict_checker(param, value, possible_answer[param])
-                if not result["valid"]:
-                    return result
-                continue
-
-            # Special handle for list of dictionaries
-            elif expected_type_converted == list and nested_type_converted == dict:
-                result = list_dict_checker(param, value, possible_answer[param])
-                if not result["valid"]:
-                    return result
-                continue
-
-            # Special handle for strings
-            elif expected_type_converted == str:
-                # We don't check for case sensitivity for string, as long as it's not a variable
-                result = string_checker(param, value, possible_answer[param])
-                if not result["valid"]:
-                    return result
-                continue
-
-            elif expected_type_converted == list:
-                result = list_checker(param, value, possible_answer[param])
-                if not result["valid"]:
-                    return result
-                continue
-
-        # Check if the value is within the possible answers
-        if value not in possible_answer[param]:
-            result["valid"] = False
-            result["error"].append(  # type: ignore[attr-defined]
-                f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
-            )
-            result["error_type"] = "value_error:others"
-            return result
-
-    # Check for optional parameters not provided but allowed
-    for param in possible_answer:
-        if param not in model_params and "" not in possible_answer[param]:
-            result["valid"] = False
-            result["error"].append(  # type: ignore[attr-defined]
-                f"Optional parameter {repr(param)} not provided and not marked as optional."
-            )
-            result["error_type"] = "simple_function_checker:missing_optional"
-            return result
-
-    return result
-
-
-def parallel_function_checker_enforce_order(
-    func_descriptions: list,
-    model_output: list,
-    possible_answers: dict,
-    language: str,
-    model_name: str,
-):
-    if len(model_output) != len(possible_answers):
-        return {
-            "valid": False,
-            "error": ["Wrong number of functions."],
-            "error_type": "parallel_function_checker_enforce_order:wrong_count",
-        }
-
-    func_name_list = list(possible_answers.keys())
-    possible_answers_list = []
-
-    for key, value in possible_answers.items():
-        possible_answers_list.append({key: value})
-
-    for i in range(len(possible_answers_list)):
-        func_description = find_description(func_descriptions, func_name_list[i])
-
-        result = simple_function_checker(
-            func_description,
-            model_output[i],
-            possible_answers_list[i],
-            language,
-            model_name,
-        )
-        if not result["valid"]:
-            return result
-
-    return {"valid": True, "error": []}
-
-
-def parallel_function_checker_no_order(
-    func_descriptions: list,
-    model_output: list,
-    possible_answers: list,
-    language: str,
-    model_name: str,
-):
-    if len(model_output) != len(possible_answers):
-        return {
-            "valid": False,
-            "error": ["Wrong number of functions."],
-            "error_type": "parallel_function_checker_no_order:wrong_count",
-        }
-
-    matched_indices = []
-
-    # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer
-    # It must be this way because we need ground truth to fetch the correct function description
-    for i in range(len(possible_answers)):
-        # possible_answers[i] is a dictionary with only one key
-        func_name_expected = list(possible_answers[i].keys())[0]
-        func_description = find_description(func_descriptions, func_name_expected)
-
-        all_errors = []
-
-        for index in range(len(model_output)):
-            if index in matched_indices:
-                continue
-
-            result = simple_function_checker(
-                func_description,
-                model_output[index],
-                possible_answers[i],
-                language,
-                model_name,
-            )
-
-            if result["valid"]:
-                matched_indices.append(index)
-                break
-            else:
-                all_errors.append(
-                    {
-                        f"Model Result Index {index}": {
-                            "sub_error": result["error"],
-                            "sub_error_type": result["error_type"],
-                            "model_output_item": model_output[index],
-                            "possible_answer_item": possible_answers[i],
-                        }
-                    }
-                )
-
-        if not result["valid"]:
-            considered_indices = [i for i in range(len(model_output)) if i not in matched_indices]
-            all_errors.insert(
-                0,
-                f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
-            )
-            return {
-                "valid": False,
-                "error": all_errors,
-                "error_type": "parallel_function_checker_no_order:cannot_find_match",
-            }
-
-    return {"valid": True, "error": []}
-
-
-def multiple_function_checker(
-    func_descriptions: list,
-    model_output: list,
-    possible_answers: list,
-    language: str,
-    model_name: str,
-):
-    if len(model_output) != len(possible_answers):
-        return {
-            "valid": False,
-            "error": ["Wrong number of functions."],
-            "error_type": "multiple_function_checker:wrong_count",
-        }
-
-    # possible_answers is a list of only one dictionary with only one key
-    func_name_expected = list(possible_answers[0].keys())[0]
-    func_description = find_description(func_descriptions, func_name_expected)
-    return simple_function_checker(
-        func_description,
-        model_output[0],
-        possible_answers[0],
-        language,
-        model_name,
-    )
-
-
-def patten_matcher(exec_output, expected_result, function_call, is_sanity_check):
-    result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
-
-    if type(exec_output) != type(expected_result):
-        return {
-            "valid": False,
-            "error": [
-                f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}."
-            ],
-            "error_type": "executable_checker:wrong_result_type",
-            "model_executed_output": exec_output,
-        }
-    if type(exec_output) == dict:
-        # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one.
-        # This happens when the key is a timestamp or a random number.
-        if is_sanity_check:
-            if len(exec_output) != len(expected_result):
-                return {
-                    "valid": False,
-                    "error": [
-                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
-                    ],
-                    "error_type": "executable_checker:wrong_result_type:dict_length",
-                    "model_executed_output": exec_output,
-                }
-            else:
-                return result
-
-        for key, value in expected_result.items():
-            if key not in exec_output:
-                return {
-                    "valid": False,
-                    "error": [
-                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output."
-                    ],
-                    "error_type": "executable_checker:wrong_result_type:dict_key_not_found",
-                    "model_executed_output": exec_output,
-                }
-        for key, value in exec_output.items():
-            if key not in expected_result:
-                return {
-                    "valid": False,
-                    "error": [
-                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output."
-                    ],
-                    "error_type": "executable_checker:wrong_result_type:dict_extra_key",
-                    "model_executed_output": exec_output,
-                }
-    if type(exec_output) == list:
-        if len(exec_output) != len(expected_result):
-            return {
-                "valid": False,
-                "error": [
-                    f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
-                ],
-                "error_type": "executable_checker:wrong_result_type:list_length",
-                "model_executed_output": exec_output,
-            }
-    return result
-
-
-#### Helper functions for Exec ####
-def executable_checker_simple(
-    function_call: str,
-    expected_result,
-    expected_result_type: str,
-    is_sanity_check=False,
-):
-    result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
-
-    exec_dict: Any = {}
-
-    try:
-        exec(
-            "from executable_python_function import *" + "\nresult=" + function_call,
-            exec_dict,
-        )
-        exec_output = exec_dict["result"]
-    except NoAPIKeyError as e:
-        raise e
-    except Exception as e:
-        result["valid"] = False
-        result["error"].append(  # type: ignore[attr-defined]
-            f"Error in execution: {repr(function_call)}. Error: {str(e)}"
-        )
-        result["error_type"] = "executable_checker:execution_error"
-        return result
-
-    # We need to special handle the case where the execution result is a tuple and convert it to a list
-    # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json
-    if isinstance(exec_output, tuple):
-        exec_output = list(exec_output)
-
-    if expected_result_type == "exact_match":
-        if exec_output != expected_result:
-            result["valid"] = False
-            result["error"].append(  # type: ignore[attr-defined]
-                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}."
-            )
-            result["error_type"] = "executable_checker:wrong_result"
-            result["model_executed_output"] = exec_output
-            return result
-
-    elif expected_result_type == "real_time_match":
-        # Allow for 5% difference
-        if (type(expected_result) == float or type(expected_result) == int) and (
-            type(exec_output) == float or type(exec_output) == int
-        ):
-            if not (
-                expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
-                <= exec_output
-                <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
-            ):
-                result["valid"] = False
-                result["error"].append(  # type: ignore[attr-defined]
-                    f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed."
-                )
-                result["error_type"] = "executable_checker:wrong_result_real_time"
-                result["model_executed_output"] = exec_output
-                return result
-        else:
-            result["valid"] = False
-            result["error"].append(  # type: ignore[attr-defined]
-                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria."
-            )
-            result["error_type"] = "executable_checker:wrong_result_real_time"
-            result["model_executed_output"] = exec_output
-            return result
-
-    else:
-        # structural match
-        pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
-        if not pattern_match_result["valid"]:
-            return pattern_match_result
-
-    return result
-
-
-def executable_checker_parallel_no_order(
-    decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
-):
-    if len(decoded_result) != len(expected_exec_result):
-        return {
-            "valid": False,
-            "error": [
-                f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}."
-            ],
-            "error_type": "value_error:exec_result_count",
-        }
-
-    matched_indices = []
-    for i in range(len(expected_exec_result)):
-        all_errors = []
-        for index in range(len(decoded_result)):
-            if index in matched_indices:
-                continue
-
-            result = executable_checker_simple(
-                decoded_result[index],
-                expected_exec_result[i],
-                expected_exec_result_type[i],
-                False,
-            )
-
-            if result["valid"]:
-                matched_indices.append(index)
-                break
-            else:
-                all_errors.append(
-                    {
-                        f"Model Result Index {index}": {
-                            "sub_error": result["error"],
-                            "sub_error_type": result["error_type"],
-                            "model_executed_output": (
-                                result["model_executed_output"] if "model_executed_output" in result else None
-                            ),
-                        }
-                    }
-                )
-
-        if not result["valid"]:
-            considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices]
-            all_errors.insert(
-                0,
-                f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
-            )
-            return {
-                "valid": False,
-                "error": all_errors,
-                "error_type": "executable_checker:cannot_find_match",
-            }
-
-    return {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
-
-
-#### Main function ####
-def executable_checker_rest(func_call, idx):
-    # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used.
-    EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl"  # Ground truth file for v5 for rest execution
-    with open(EVAL_GROUND_TRUTH_PATH, "r") as f:
-        EVAL_GROUND_TRUTH = f.readlines()
-    if "https://geocode.maps.co" in func_call:
-        time.sleep(2)
-    if "requests_get" in func_call:
-        func_call = func_call.replace("requests_get", "requests.get")
-    try:
-        response = eval(func_call)
-    except Exception as e:
-        return {
-            "valid": False,
-            "error": [f"Execution failed. {str(e)}"],
-            "error_type": "executable_checker_rest:execution_error",
-        }
-
-    try:
-        if response.status_code == 200:
-            eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
-            try:
-                if isinstance(eval_GT_json, dict):
-                    if isinstance(response.json(), dict):
-                        if set(eval_GT_json.keys()) == set(response.json().keys()):
-                            return {"valid": True, "error": [], "error_type": ""}
-                        return {
-                            "valid": False,
-                            "error": ["Key inconsistency"],
-                            "error_type": "executable_checker_rest:wrong_key",
-                        }
-                    return {
-                        "valid": False,
-                        "error": [f"Expected dictionary, but got {type(response.json())}"],
-                        "error_type": "executable_checker_rest:wrong_type",
-                    }
-
-                elif isinstance(eval_GT_json, list):
-                    if isinstance(response.json(), list):
-                        if len(eval_GT_json) != len(response.json()):
-                            return {
-                                "valid": False,
-                                "error": [f"Response list length inconsistency."],
-                                "error_type": "value_error:exec_result_rest_count",
-                            }
-
-                        else:
-                            for i in range(len(eval_GT_json)):
-                                if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()):
-                                    return {
-                                        "valid": False,
-                                        "error": [f"Key inconsistency"],
-                                        "error_type": "executable_checker_rest:wrong_key",
-                                    }
-
-                            return {"valid": True, "error": []}
-                    else:
-                        return {
-                            "valid": False,
-                            "error": [f"Expected list, but got {type(response.json())}"],
-                            "error_type": "executable_checker_rest:wrong_type",
-                        }
-                return {
-                    "valid": False,
-                    "error": [f"Expected dict or list, but got {type(response.json())}"],
-                    "error_type": "executable_checker_rest:wrong_type",
-                }
-            except Exception as e:
-                return {
-                    "valid": False,
-                    "error": [
-                        f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}"
-                    ],
-                    "error_type": "executable_checker_rest:response_format_error",
-                }
-        else:
-            return {
-                "valid": False,
-                "error": [f"Execution result status code is not 200, got {response.status_code}"],
-                "error_type": "executable_checker_rest:wrong_status_code",
-            }
-    except Exception as e:
-        return {
-            "valid": False,
-            "error": [f"Cannot get status code of the response. Error: {str(e)}"],
-            "error_type": "executable_checker_rest:cannot_get_status_code",
-        }
-
-
-def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
-    if "parallel" in test_category:
-        return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
-
-    elif "multiple" in test_category:
-        return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
-
-    else:
-        if len(model_output) != 1:
-            return {
-                "valid": False,
-                "error": ["Wrong number of functions."],
-                "error_type": "simple_function_checker:wrong_count",
-            }
-
-        return simple_function_checker(
-            func_description[0],
-            model_output[0],
-            possible_answer[0],
-            language,
-            model_name,
-        )
-
-
-def exec_checker(decoded_result: list, func_description: dict, test_category: str):
-    if "multiple" in test_category or "parallel" in test_category:
-        return executable_checker_parallel_no_order(
-            decoded_result,
-            func_description["execution_result"],
-            func_description["execution_result_type"],
-        )
-
-    else:
-        if len(decoded_result) != 1:
-            return {
-                "valid": False,
-                "error": ["Wrong number of functions."],
-                "error_type": "simple_exec_checker:wrong_count",
-            }
-        return executable_checker_simple(
-            decoded_result[0],
-            func_description["execution_result"][0],
-            func_description["execution_result_type"][0],
-            False,
-        )
-
-
-def is_empty_output(decoded_output):
-    # This function is a patch to the ast decoder for relevance detection
-    # Sometimes the ast decoder will parse successfully, but the input doens't really have a function call
-    # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct)
-    if not is_function_calling_format_output(decoded_output):
-        return True
-    if len(decoded_output) == 0:
-        return True
-    if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
-        return True
-
-
-def is_function_calling_format_output(decoded_output):
-    # Ensure the output is a list of dictionaries
-    if type(decoded_output) == list:
-        for item in decoded_output:
-            if type(item) != dict:
-                return False
-        return True
-    return False
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
@ -1,40 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Tree-sitter changes its API with unfortunate frequency. Modules that need it should
-import it from here so that we can centrally manage things as necessary.
-"""
-
-# These currently work with tree-sitter 0.23.0
-# NOTE: Don't import tree-sitter or any of the language modules in the main module
-# because not all environments have them. Import lazily inside functions where needed.
-
-import importlib
-import typing
-
-if typing.TYPE_CHECKING:
-    import tree_sitter
-
-
-def get_language(language: str) -> "tree_sitter.Language":
-    import tree_sitter
-
-    language_module_name = f"tree_sitter_{language}"
-    try:
-        language_module = importlib.import_module(language_module_name)
-    except ModuleNotFoundError as exc:
-        raise ValueError(
-            f"Language {language} is not found. Please install the tree-sitter-{language} package."
-        ) from exc
-    return tree_sitter.Language(language_module.language())
-
-
-def get_parser(language: str, **kwargs) -> "tree_sitter.Parser":
-    import tree_sitter
-
-    lang = get_language(language)
-    return tree_sitter.Parser(lang, **kwargs)
--- a/llama_stack/providers/inline/tool_runtime/rag/init.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/init.py
@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
 async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
    from .memory import MemoryToolRuntimeImpl

-    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference])
+    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
    await impl.initialize()
    return impl
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@ -5,10 +5,15 @@
 # the root directory of this source tree.

 import asyncio
+import base64
+import io
+import mimetypes
 import secrets
 import string
 from typing import Any

+import httpx
+from fastapi import UploadFile
 from pydantic import TypeAdapter

 from llama_stack.apis.common.content_types import (
@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
    InterleavedContentItem,
    TextContentItem,
 )
+from llama_stack.apis.files import Files, OpenAIFilePurpose
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.tools import (
    ListToolDefsResponse,
@ -30,13 +36,18 @@ from llama_stack.apis.tools import (
    ToolParameter,
    ToolRuntime,
 )
-from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
+from llama_stack.apis.vector_io import (
+    QueryChunksResponse,
+    VectorIO,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
+)
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.vector_store import (
    content_from_doc,
-    make_overlapped_chunks,
+    parse_data_url,
 )

 from .config import RagToolRuntimeConfig
@ -55,10 +66,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        config: RagToolRuntimeConfig,
        vector_io_api: VectorIO,
        inference_api: Inference,
+        files_api: Files,
    ):
        self.config = config
        self.vector_io_api = vector_io_api
        self.inference_api = inference_api
+        self.files_api = files_api

    async def initialize(self):
        pass
@ -78,27 +91,50 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        vector_db_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
-        chunks = []
+        if not documents:
+            return
+
        for doc in documents:
-            content = await content_from_doc(doc)
-            # TODO: we should add enrichment here as URLs won't be added to the metadata by default
-            chunks.extend(
-                make_overlapped_chunks(
-                    doc.document_id,
-                    content,
-                    chunk_size_in_tokens,
-                    chunk_size_in_tokens // 4,
-                    doc.metadata,
+            if isinstance(doc.content, URL):
+                if doc.content.uri.startswith("data:"):
+                    parts = parse_data_url(doc.content.uri)
+                    file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode()
+                    mime_type = parts["mimetype"]
+                else:
+                    async with httpx.AsyncClient() as client:
+                        response = await client.get(doc.content.uri)
+                        file_data = response.content
+                        mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
+            else:
+                content_str = await content_from_doc(doc)
+                file_data = content_str.encode("utf-8")
+                mime_type = doc.mime_type or "text/plain"
+
+            file_extension = mimetypes.guess_extension(mime_type) or ".txt"
+            filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
+
+            file_obj = io.BytesIO(file_data)
+            file_obj.name = filename
+
+            upload_file = UploadFile(file=file_obj, filename=filename)
+
+            created_file = await self.files_api.openai_upload_file(
+                file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
+            )
+
+            chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig(
+                    max_chunk_size_tokens=chunk_size_in_tokens,
+                    chunk_overlap_tokens=chunk_size_in_tokens // 4,
                )
            )

-        if not chunks:
-            return
-
-        await self.vector_io_api.insert_chunks(
-            chunks=chunks,
-            vector_db_id=vector_db_id,
-        )
+            await self.vector_io_api.openai_attach_file_to_vector_store(
+                vector_store_id=vector_db_id,
+                file_id=created_file.id,
+                attributes=doc.metadata,
+                chunking_strategy=chunking_strategy,
+            )

    async def query(
        self,
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import (
    RERANKER_TYPE_RRF,
-    RERANKER_TYPE_WEIGHTED,
    ChunkForDeletion,
    EmbeddingIndex,
    VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator

 logger = get_logger(name=__name__, category="vector_io")

@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
    return connection


-def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
-    """Normalize scores to [0,1] range using min-max normalization."""
-    if not scores:
-        return {}
-    min_score = min(scores.values())
-    max_score = max(scores.values())
-    score_range = max_score - min_score
-    if score_range > 0:
-        return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
-    return dict.fromkeys(scores, 1.0)
-
-
-def _weighted_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    alpha: float = 0.5,
-) -> dict[str, float]:
-    """ReRanker that uses weighted average of scores."""
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    normalized_vector_scores = _normalize_scores(vector_scores)
-    normalized_keyword_scores = _normalize_scores(keyword_scores)
-
-    return {
-        doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
-        + ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
-        for doc_id in all_ids
-    }
-
-
-def _rrf_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    impact_factor: float = 60.0,
-) -> dict[str, float]:
-    """ReRanker that uses Reciprocal Rank Fusion."""
-    # Convert scores to ranks
-    vector_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-    keyword_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    rrf_scores = {}
-    for doc_id in all_ids:
-        vector_rank = vector_ranks.get(doc_id, float("inf"))
-        keyword_rank = keyword_ranks.get(doc_id, float("inf"))
-        # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
-        rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
-    return rrf_scores
-
-
 def _make_sql_identifier(name: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)

@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
            for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
        }

-        # Combine scores using the specified reranker
-        if reranker_type == RERANKER_TYPE_WEIGHTED:
-            alpha = reranker_params.get("alpha", 0.5)
-            combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
-        else:
-            # Default to RRF for None, RRF, or any unknown types
-            impact_factor = reranker_params.get("impact_factor", 60.0)
-            combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )

        # Sort by combined score and get top k results
        sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
            adapter=AdapterSpec(
                adapter_type="fireworks",
                pip_packages=[
-                    "fireworks-ai",
+                    "fireworks-ai<=0.17.16",
                ],
                module="llama_stack.providers.remote.inference.fireworks",
                config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@ -207,7 +207,7 @@ def available_providers() -> list[ProviderSpec]:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="gemini",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.gemini",
                config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
                provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -248,7 +248,7 @@ Available Models:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="groq",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.groq",
                config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
                provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -270,7 +270,7 @@ Available Models:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="sambanova",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.sambanova",
                config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
                provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -292,7 +292,7 @@ Available Models:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="watsonx",
-                pip_packages=["ibm_watson_machine_learning"],
+                pip_packages=["ibm_watsonx_ai"],
                module="llama_stack.providers.remote.inference.watsonx",
                config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
                provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",
--- a/llama_stack/providers/registry/tool_runtime.py
+++ b/llama_stack/providers/registry/tool_runtime.py
@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
            ],
            module="llama_stack.providers.inline.tool_runtime.rag",
            config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
-            api_dependencies=[Api.vector_io, Api.inference],
+            api_dependencies=[Api.vector_io, Api.inference, Api.files],
            description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
        ),
        remote_provider_spec(
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@ -404,6 +404,60 @@ That means you'll get fast and efficient vector retrieval.
 - Easy to use
 - Fully integrated with Llama Stack

+There are three implementations of search for PGVectoIndex available:
+
+1. Vector Search:
+- How it works:
+  - Uses PostgreSQL's vector extension (pgvector) to perform similarity search
+  - Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
+  - Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
+
+-Characteristics:
+  - Semantic understanding - finds documents similar in meaning even if they don't share keywords
+  - Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
+  - Best for: Finding conceptually related content, handling synonyms, cross-language search
+
+2. Keyword Search
+- How it works:
+  - Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
+  - Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
+  - Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
+
+- Characteristics:
+  - Lexical matching - finds exact keyword matches and variations
+  - Uses GIN (Generalized Inverted Index) for fast text search performance
+  - Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
+  - Best for: Exact term matching, proper names, technical terms, Boolean-style queries
+
+3. Hybrid Search
+- How it works:
+  - Combines both vector and keyword search results
+  - Runs both searches independently, then merges results using configurable reranking
+
+- Two reranking strategies available:
+    - Reciprocal Rank Fusion (RRF) - (default: 60.0)
+    - Weighted Average - (default: 0.5)
+
+- Characteristics:
+  - Best of both worlds: semantic understanding + exact matching
+  - Documents appearing in both searches get boosted scores
+  - Configurable balance between semantic and lexical matching
+  - Best for: General-purpose search where you want both precision and recall
+
+4. Database Schema
+The PGVector implementation stores data optimized for all three search types:
+CREATE TABLE vector_store_xxx (
+    id TEXT PRIMARY KEY,
+    document JSONB,                    -- Original document
+    embedding vector(dimension),        -- For vector search
+    content_text TEXT,                 -- Raw text content
+    tokenized_content TSVECTOR          -- For keyword search
+);
+
+-- Indexes for performance
+CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content);  -- Keyword search
+-- Vector index created automatically by pgvector
+
 ## Usage

 To use PGVector in your Llama Stack project, follow these steps:
@ -412,6 +466,25 @@ To use PGVector in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
 3. Start storing and querying vectors.

+## This is an example how you can set up your environment for using PGVector
+
+1. Export env vars:
+```bash
+export ENABLE_PGVECTOR=true
+export PGVECTOR_HOST=localhost
+export PGVECTOR_PORT=5432
+export PGVECTOR_DB=llamastack
+export PGVECTOR_USER=llamastack
+export PGVECTOR_PASSWORD=llamastack
+```
+
+2. Create DB:
+```bash
+psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
+psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
+psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
+```
+
 ## Installation

 You can install PGVector using docker:
@ -449,6 +522,7 @@ Weaviate supports:
 - Metadata filtering
 - Multi-modal retrieval

+
 ## Usage

 To use Weaviate in your Llama Stack project, follow these steps:
--- a/llama_stack/providers/remote/files/s3/init.py
+++ b/llama_stack/providers/remote/files/s3/init.py
@ -6,15 +6,14 @@

 from typing import Any

-from llama_stack.core.datatypes import Api
+from llama_stack.core.datatypes import AccessRule, Api

 from .config import S3FilesImplConfig


-async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any]):
+async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any], policy: list[AccessRule] | None = None):
    from .files import S3FilesImpl

-    # TODO: authorization policies and user separation
-    impl = S3FilesImpl(config)
+    impl = S3FilesImpl(config, policy or [])
    await impl.initialize()
    return impl
--- a/llama_stack/providers/remote/files/s3/files.py
+++ b/llama_stack/providers/remote/files/s3/files.py
@ -4,9 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import time
 import uuid
-from typing import Annotated
+from datetime import UTC, datetime
+from typing import Annotated, Any

 import boto3
 from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
@ -15,14 +15,17 @@ from fastapi import File, Form, Response, UploadFile
 from llama_stack.apis.common.errors import ResourceNotFoundError
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.files import (
+    ExpiresAfter,
    Files,
    ListOpenAIFileResponse,
    OpenAIFileDeleteResponse,
    OpenAIFileObject,
    OpenAIFilePurpose,
 )
+from llama_stack.core.datatypes import AccessRule
 from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
-from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl
+from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
+from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl

 from .config import S3FilesImplConfig

@ -83,22 +86,85 @@ async def _create_bucket_if_not_exists(client: boto3.client, config: S3FilesImpl
            raise RuntimeError(f"Failed to access S3 bucket '{config.bucket_name}': {e}") from e


+def _make_file_object(
+    *,
+    id: str,
+    filename: str,
+    purpose: str,
+    bytes: int,
+    created_at: int,
+    expires_at: int,
+    **kwargs: Any,  # here to ignore any additional fields, e.g. extra fields from AuthorizedSqlStore
+) -> OpenAIFileObject:
+    """
+    Construct an OpenAIFileObject and normalize expires_at.
+
+    If expires_at is greater than the max we treat it as no-expiration and
+    return None for expires_at.
+
+    The OpenAI spec says expires_at type is Integer, but the implementation
+    will return None for no expiration.
+    """
+    obj = OpenAIFileObject(
+        id=id,
+        filename=filename,
+        purpose=OpenAIFilePurpose(purpose),
+        bytes=bytes,
+        created_at=created_at,
+        expires_at=expires_at,
+    )
+
+    if obj.expires_at is not None and obj.expires_at > (obj.created_at + ExpiresAfter.MAX):
+        obj.expires_at = None  # type: ignore
+
+    return obj
+
+
 class S3FilesImpl(Files):
    """S3-based implementation of the Files API."""

-    # TODO: implement expiration, for now a silly offset
-    _SILLY_EXPIRATION_OFFSET = 100 * 365 * 24 * 60 * 60
-
-    def __init__(self, config: S3FilesImplConfig) -> None:
+    def __init__(self, config: S3FilesImplConfig, policy: list[AccessRule]) -> None:
        self._config = config
+        self.policy = policy
        self._client: boto3.client | None = None
-        self._sql_store: SqlStore | None = None
+        self._sql_store: AuthorizedSqlStore | None = None
+
+    def _now(self) -> int:
+        """Return current UTC timestamp as int seconds."""
+        return int(datetime.now(UTC).timestamp())
+
+    async def _get_file(self, file_id: str, return_expired: bool = False) -> dict[str, Any]:
+        where: dict[str, str | dict] = {"id": file_id}
+        if not return_expired:
+            where["expires_at"] = {">": self._now()}
+        if not (row := await self.sql_store.fetch_one("openai_files", policy=self.policy, where=where)):
+            raise ResourceNotFoundError(file_id, "File", "files.list()")
+        return row
+
+    async def _delete_file(self, file_id: str) -> None:
+        """Delete a file from S3 and the database."""
+        try:
+            self.client.delete_object(
+                Bucket=self._config.bucket_name,
+                Key=file_id,
+            )
+        except ClientError as e:
+            if e.response["Error"]["Code"] != "NoSuchKey":
+                raise RuntimeError(f"Failed to delete file from S3: {e}") from e
+
+        await self.sql_store.delete("openai_files", where={"id": file_id})
+
+    async def _delete_if_expired(self, file_id: str) -> None:
+        """If the file exists and is expired, delete it."""
+        if row := await self._get_file(file_id, return_expired=True):
+            if (expires_at := row.get("expires_at")) and expires_at <= self._now():
+                await self._delete_file(file_id)

    async def initialize(self) -> None:
        self._client = _create_s3_client(self._config)
        await _create_bucket_if_not_exists(self._client, self._config)

-        self._sql_store = sqlstore_impl(self._config.metadata_store)
+        self._sql_store = AuthorizedSqlStore(sqlstore_impl(self._config.metadata_store))
        await self._sql_store.create_table(
            "openai_files",
            {
@ -121,7 +187,7 @@ class S3FilesImpl(Files):
        return self._client

    @property
-    def sql_store(self) -> SqlStore:
+    def sql_store(self) -> AuthorizedSqlStore:
        assert self._sql_store is not None, "Provider not initialized"
        return self._sql_store

@ -129,27 +195,47 @@ class S3FilesImpl(Files):
        self,
        file: Annotated[UploadFile, File()],
        purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
    ) -> OpenAIFileObject:
        file_id = f"file-{uuid.uuid4().hex}"

        filename = getattr(file, "filename", None) or "uploaded_file"

-        created_at = int(time.time())
-        expires_at = created_at + self._SILLY_EXPIRATION_OFFSET
+        created_at = self._now()
+
+        expires_after = None
+        if expires_after_anchor is not None or expires_after_seconds is not None:
+            # we use ExpiresAfter to validate input
+            expires_after = ExpiresAfter(
+                anchor=expires_after_anchor,  # type: ignore[arg-type]
+                seconds=expires_after_seconds,  # type: ignore[arg-type]
+            )
+
+        # the default is no expiration.
+        # to implement no expiration we set an expiration beyond the max.
+        # we'll hide this fact from users when returning the file object.
+        expires_at = created_at + ExpiresAfter.MAX * 42
+        # the default for BATCH files is 30 days, which happens to be the expiration max.
+        if purpose == OpenAIFilePurpose.BATCH:
+            expires_at = created_at + ExpiresAfter.MAX
+
+        if expires_after is not None:
+            expires_at = created_at + expires_after.seconds
+
        content = await file.read()
        file_size = len(content)

-        await self.sql_store.insert(
-            "openai_files",
-            {
-                "id": file_id,
-                "filename": filename,
-                "purpose": purpose.value,
-                "bytes": file_size,
-                "created_at": created_at,
-                "expires_at": expires_at,
-            },
-        )
+        entry: dict[str, Any] = {
+            "id": file_id,
+            "filename": filename,
+            "purpose": purpose.value,
+            "bytes": file_size,
+            "created_at": created_at,
+            "expires_at": expires_at,
+        }
+
+        await self.sql_store.insert("openai_files", entry)

        try:
            self.client.put_object(
@ -163,14 +249,7 @@ class S3FilesImpl(Files):

            raise RuntimeError(f"Failed to upload file to S3: {e}") from e

-        return OpenAIFileObject(
-            id=file_id,
-            filename=filename,
-            purpose=purpose,
-            bytes=file_size,
-            created_at=created_at,
-            expires_at=expires_at,
-        )
+        return _make_file_object(**entry)

    async def openai_list_files(
        self,
@ -183,29 +262,20 @@ class S3FilesImpl(Files):
        if not order:
            order = Order.desc

-        where_conditions = {}
+        where_conditions: dict[str, Any] = {"expires_at": {">": self._now()}}
        if purpose:
            where_conditions["purpose"] = purpose.value

        paginated_result = await self.sql_store.fetch_all(
            table="openai_files",
-            where=where_conditions if where_conditions else None,
+            policy=self.policy,
+            where=where_conditions,
            order_by=[("created_at", order.value)],
            cursor=("id", after) if after else None,
            limit=limit,
        )

-        files = [
-            OpenAIFileObject(
-                id=row["id"],
-                filename=row["filename"],
-                purpose=OpenAIFilePurpose(row["purpose"]),
-                bytes=row["bytes"],
-                created_at=row["created_at"],
-                expires_at=row["expires_at"],
-            )
-            for row in paginated_result.data
-        ]
+        files = [_make_file_object(**row) for row in paginated_result.data]

        return ListOpenAIFileResponse(
            data=files,
@ -216,41 +286,20 @@ class S3FilesImpl(Files):
        )

    async def openai_retrieve_file(self, file_id: str) -> OpenAIFileObject:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
-
-        return OpenAIFileObject(
-            id=row["id"],
-            filename=row["filename"],
-            purpose=OpenAIFilePurpose(row["purpose"]),
-            bytes=row["bytes"],
-            created_at=row["created_at"],
-            expires_at=row["expires_at"],
-        )
+        await self._delete_if_expired(file_id)
+        row = await self._get_file(file_id)
+        return _make_file_object(**row)

    async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
-
-        try:
-            self.client.delete_object(
-                Bucket=self._config.bucket_name,
-                Key=row["id"],
-            )
-        except ClientError as e:
-            if e.response["Error"]["Code"] != "NoSuchKey":
-                raise RuntimeError(f"Failed to delete file from S3: {e}") from e
-
-        await self.sql_store.delete("openai_files", where={"id": file_id})
-
+        await self._delete_if_expired(file_id)
+        _ = await self._get_file(file_id)  # raises if not found
+        await self._delete_file(file_id)
        return OpenAIFileDeleteResponse(id=file_id, deleted=True)

    async def openai_retrieve_file_content(self, file_id: str) -> Response:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
+        await self._delete_if_expired(file_id)
+
+        row = await self._get_file(file_id)

        try:
            response = self.client.get_object(
@ -261,7 +310,7 @@ class S3FilesImpl(Files):
            content = response["Body"].read()
        except ClientError as e:
            if e.response["Error"]["Code"] == "NoSuchKey":
-                await self.sql_store.delete("openai_files", where={"id": file_id})
+                await self._delete_file(file_id)
                raise ResourceNotFoundError(file_id, "File", "files.list()") from e
            raise RuntimeError(f"Failed to download file from S3: {e}") from e

--- a/llama_stack/providers/remote/inference/gemini/gemini.py
+++ b/llama_stack/providers/remote/inference/gemini/gemini.py
@ -5,12 +5,13 @@
 # the root directory of this source tree.

 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import GeminiConfig
 from .models import MODEL_ENTRIES


-class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
+class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
    def __init__(self, config: GeminiConfig) -> None:
        LiteLLMOpenAIMixin.__init__(
            self,
@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
        )
        self.config = config

+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self):
+        return "https://generativelanguage.googleapis.com/v1beta/openai/"
+
    async def initialize(self) -> None:
        await super().initialize()

--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -4,30 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from collections.abc import AsyncIterator
-from typing import Any

-from openai import AsyncOpenAI
-
-from llama_stack.apis.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChoiceDelta,
-    OpenAIChunkChoice,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
-    OpenAISystemMessageParam,
-)
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import (
-    prepare_openai_completion_params,
-)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .models import MODEL_ENTRIES


-class GroqInferenceAdapter(LiteLLMOpenAIMixin):
+class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
    _config: GroqConfig

    def __init__(self, config: GroqConfig):
@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
        )
        self.config = config

+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        return f"{self.config.url}/openai/v1"
+
    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
-
-    def _get_openai_client(self) -> AsyncOpenAI:
-        return AsyncOpenAI(
-            base_url=f"{self.config.url}/openai/v1",
-            api_key=self.get_api_key(),
-        )
-
-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        model_obj = await self.model_store.get_model(model)
-
-        # Groq does not support json_schema response format, so we need to convert it to json_object
-        if response_format and response_format.type == "json_schema":
-            response_format.type = "json_object"
-            schema = response_format.json_schema.get("schema", {})
-            response_format.json_schema = None
-            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
-            if messages and messages[0].role == "system":
-                messages[0].content = messages[0].content + json_instructions
-            else:
-                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
-
-        # Groq returns a 400 error if tools are provided but none are called
-        # So, set tool_choice to "required" to attempt to force a call
-        if tools and (not tool_choice or tool_choice == "auto"):
-            tool_choice = "required"
-
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
-        )
-
-        # Groq does not support streaming requests that set response_format
-        fake_stream = False
-        if stream and response_format:
-            params["stream"] = False
-            fake_stream = True
-
-        response = await self._get_openai_client().chat.completions.create(**params)
-
-        if fake_stream:
-            chunk_choices = []
-            for choice in response.choices:
-                delta = OpenAIChoiceDelta(
-                    content=choice.message.content,
-                    role=choice.message.role,
-                    tool_calls=choice.message.tool_calls,
-                )
-                chunk_choice = OpenAIChunkChoice(
-                    delta=delta,
-                    finish_reason=choice.finish_reason,
-                    index=choice.index,
-                    logprobs=None,
-                )
-                chunk_choices.append(chunk_choice)
-            chunk = OpenAIChatCompletionChunk(
-                id=response.id,
-                choices=chunk_choices,
-                object="chat.completion.chunk",
-                created=response.created,
-                model=response.model,
-            )
-
-            async def _fake_stream_generator():
-                yield chunk
-
-            return _fake_stream_generator()
-        else:
-            return response
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@ -41,10 +41,10 @@ client.initialize()

 ### Create Completion

-> Note on Completion API
->
-> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` does not support the ```completion``` method, while the locally deployed NIM does.
+The following example shows how to create a completion for an NVIDIA NIM.

+> [!NOTE]
+> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.

 ```python
 response = client.inference.completion(
@ -60,6 +60,8 @@ print(f"Response: {response.content}")

 ### Create Chat Completion

+The following example shows how to create a chat completion for an NVIDIA NIM.
+
 ```python
 response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
 ```

 ### Tool Calling Example ###
+
+The following example shows how to do tool calling for an NVIDIA NIM.
+
 ```python
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition

@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
 ```

 ### Structured Output Example
+
+The following example shows how to do structured output for an NVIDIA NIM.
+
 ```python
 from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType

@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
 ```

 ### Create Embeddings
-> Note on OpenAI embeddings compatibility
->
+
+The following example shows how to create embeddings for an NVIDIA NIM.
+
+> [!NOTE]
 > NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.

 ```python
@ -160,4 +170,42 @@ response = client.inference.embeddings(
    task_type="query",
 )
 print(f"Embeddings: {response.embeddings}")
-```
+```
+
+### Vision Language Models Example
+
+The following example shows how to run vision inference by using an NVIDIA NIM.
+
+```python
+def load_image_as_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        img_bytes = image_file.read()
+        return base64.b64encode(img_bytes).decode("utf-8")
+
+
+image_path = {path_to_the_image}
+demo_image_b64 = load_image_as_base64(image_path)
+
+vlm_response = client.inference.chat_completion(
+    model_id="nvidia/vila",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": {
+                        "data": demo_image_b64,
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": "Please describe what you see in this image in detail.",
+                },
+            ],
+        }
+    ],
+)
+
+print(f"VLM Response: {vlm_response.completion_message.content}")
+```
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@ -55,6 +55,10 @@ MODEL_ENTRIES = [
        "meta/llama-3.3-70b-instruct",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
+    ProviderModelEntry(
+        provider_model_id="nvidia/vila",
+        model_type=ModelType.llm,
+    ),
    # NeMo Retriever Text Embedding models -
    #
    # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -118,10 +118,10 @@ class OllamaInferenceAdapter(

    async def initialize(self) -> None:
        logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
-        health_response = await self.health()
-        if health_response["status"] == HealthStatus.ERROR:
+        r = await self.health()
+        if r["status"] == HealthStatus.ERROR:
            logger.warning(
-                "Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
+                f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
            )

    async def should_refresh_models(self) -> bool:
@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
            ),
            Model(
                identifier="nomic-embed-text",
-                provider_resource_id="nomic-embed-text",
+                provider_resource_id="nomic-embed-text:latest",
                provider_id=provider_id,
                metadata={
                    "embedding_dimension": 768,
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -4,13 +4,26 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES


-class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    """
+    SambaNova Inference Adapter for Llama Stack.
+
+    Note: The inheritance order is important here. OpenAIMixin must come before
+    LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
+    is used instead of LiteLLMOpenAIMixin.check_model_availability().
+
+    - OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
+    - LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
+    """
+
    def __init__(self, config: SambaNovaImplConfig):
        self.config = config
        self.environment_available_models = []
@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
            download_images=True,  # SambaNova requires base64 image encoding
            json_schema_strict=False,  # SambaNova doesn't support strict=True yet
        )
+
+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """
+        Get the base URL for OpenAI mixin.
+
+        :return: The SambaNova base URL
+        """
+        return self.config.url
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -7,8 +7,8 @@
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any

-from ibm_watson_machine_learning.foundation_models import Model
-from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
+from ibm_watsonx_ai.foundation_models import Model
+from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
 from openai import AsyncOpenAI

 from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import heapq
 from typing import Any

 import psycopg2
@ -23,6 +24,9 @@ from llama_stack.apis.vector_io import (
 )
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
 from llama_stack.providers.utils.kvstore import kvstore_impl
 from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
@ -31,6 +35,7 @@ from llama_stack.providers.utils.memory.vector_store import (
    EmbeddingIndex,
    VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator, sanitize_collection_name

 from .config import PGVectorVectorIOConfig

@ -72,25 +77,63 @@ def load_models(cur, cls):


 class PGVectorIndex(EmbeddingIndex):
-    def __init__(self, vector_db: VectorDB, dimension: int, conn, kvstore: KVStore | None = None):
-        self.conn = conn
-        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-            # Sanitize the table name by replacing hyphens with underscores
-            # SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
-            # when created with patterns like "test-vector-db-{uuid4()}"
-            sanitized_identifier = vector_db.identifier.replace("-", "_")
-            self.table_name = f"vector_store_{sanitized_identifier}"
-            self.kvstore = kvstore
+    # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
+    PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
+        "L2": "<->",
+        "L1": "<+>",
+        "COSINE": "<=>",
+        "INNER_PRODUCT": "<#>",
+        "HAMMING": "<~>",
+        "JACCARD": "<%>",
+    }

-            cur.execute(
-                f"""
-                CREATE TABLE IF NOT EXISTS {self.table_name} (
-                    id TEXT PRIMARY KEY,
-                    document JSONB,
-                    embedding vector({dimension})
+    def __init__(
+        self,
+        vector_db: VectorDB,
+        dimension: int,
+        conn: psycopg2.extensions.connection,
+        kvstore: KVStore | None = None,
+        distance_metric: str = "COSINE",
+    ):
+        self.vector_db = vector_db
+        self.dimension = dimension
+        self.conn = conn
+        self.kvstore = kvstore
+        self.check_distance_metric_availability(distance_metric)
+        self.distance_metric = distance_metric
+        self.table_name = None
+
+    async def initialize(self) -> None:
+        try:
+            with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+                # Sanitize the table name by replacing hyphens with underscores
+                # SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
+                # when created with patterns like "test-vector-db-{uuid4()}"
+                sanitized_identifier = sanitize_collection_name(self.vector_db.identifier)
+                self.table_name = f"vs_{sanitized_identifier}"
+
+                cur.execute(
+                    f"""
+                    CREATE TABLE IF NOT EXISTS {self.table_name} (
+                        id TEXT PRIMARY KEY,
+                        document JSONB,
+                        embedding vector({self.dimension}),
+                        content_text TEXT,
+                        tokenized_content TSVECTOR
+                    )
+                """
                )
-            """
-            )
+
+                # Create GIN index for full-text search performance
+                cur.execute(
+                    f"""
+                    CREATE INDEX IF NOT EXISTS {self.table_name}_content_gin_idx
+                    ON {self.table_name} USING GIN(tokenized_content)
+                """
+                )
+        except Exception as e:
+            log.exception(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}")
+            raise RuntimeError(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}") from e

    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
@ -99,29 +142,49 @@ class PGVectorIndex(EmbeddingIndex):

        values = []
        for i, chunk in enumerate(chunks):
+            content_text = interleaved_content_as_str(chunk.content)
            values.append(
                (
                    f"{chunk.chunk_id}",
                    Json(chunk.model_dump()),
                    embeddings[i].tolist(),
+                    content_text,
+                    content_text,  # Pass content_text twice - once for content_text column, once for to_tsvector function. Eg. to_tsvector(content_text) = tokenized_content
                )
            )

        query = sql.SQL(
            f"""
-        INSERT INTO {self.table_name} (id, document, embedding)
+        INSERT INTO {self.table_name} (id, document, embedding, content_text, tokenized_content)
        VALUES %s
-        ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding, document = EXCLUDED.document
+        ON CONFLICT (id) DO UPDATE SET
+            embedding = EXCLUDED.embedding,
+            document = EXCLUDED.document,
+            content_text = EXCLUDED.content_text,
+            tokenized_content = EXCLUDED.tokenized_content
    """
        )
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-            execute_values(cur, query, values, template="(%s, %s, %s::vector)")
+            execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")

    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+        """
+        Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
+
+        Args:
+            embedding: The query embedding vector
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        pgvector_search_function = self.get_pgvector_search_function()
+
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(
                f"""
-            SELECT document, embedding <-> %s::vector AS distance
+            SELECT document, embedding {pgvector_search_function} %s::vector AS distance
            FROM {self.table_name}
            ORDER BY distance
            LIMIT %s
@ -147,7 +210,40 @@ class PGVectorIndex(EmbeddingIndex):
        k: int,
        score_threshold: float,
    ) -> QueryChunksResponse:
-        raise NotImplementedError("Keyword search is not supported in PGVector")
+        """
+        Performs keyword-based search using PostgreSQL's full-text search with ts_rank scoring.
+
+        Args:
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            # Use plainto_tsquery to handle user input safely and ts_rank for relevance scoring
+            cur.execute(
+                f"""
+            SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
+            FROM {self.table_name}
+            WHERE tokenized_content @@ plainto_tsquery('english', %s)
+            ORDER BY score DESC
+            LIMIT %s
+        """,
+                (query_string, query_string, k),
+            )
+            results = cur.fetchall()
+
+            chunks = []
+            scores = []
+            for doc, score in results:
+                if score < score_threshold:
+                    continue
+                chunks.append(Chunk(**doc))
+                scores.append(float(score))
+
+            return QueryChunksResponse(chunks=chunks, scores=scores)

    async def query_hybrid(
        self,
@ -158,7 +254,59 @@ class PGVectorIndex(EmbeddingIndex):
        reranker_type: str,
        reranker_params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
-        raise NotImplementedError("Hybrid search is not supported in PGVector")
+        """
+        Hybrid search combining vector similarity and keyword search using configurable reranking.
+
+        Args:
+            embedding: The query embedding vector
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+            reranker_type: Type of reranker to use ("rrf" or "weighted")
+            reranker_params: Parameters for the reranker
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        # Get results from both search methods
+        vector_response = await self.query_vector(embedding, k, score_threshold)
+        keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+        # Convert responses to score dictionaries using chunk_id
+        vector_scores = {
+            chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
+        }
+        keyword_scores = {
+            chunk.chunk_id: score
+            for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
+        }
+
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )
+
+        # Efficient top-k selection because it only tracks the k best candidates it's seen so far
+        top_k_items = heapq.nlargest(k, combined_scores.items(), key=lambda x: x[1])
+
+        # Filter by score threshold
+        filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
+
+        # Create a map of chunk_id to chunk for both responses
+        chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
+
+        # Use the map to look up chunks by their IDs
+        chunks = []
+        scores = []
+        for doc_id, score in filtered_items:
+            if doc_id in chunk_map:
+                chunks.append(chunk_map[doc_id])
+                scores.append(score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)

    async def delete(self):
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
@ -170,6 +318,25 @@ class PGVectorIndex(EmbeddingIndex):
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s)", (chunk_ids,))

+    def get_pgvector_search_function(self) -> str:
+        return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
+
+    def check_distance_metric_availability(self, distance_metric: str) -> None:
+        """Check if the distance metric is supported by PGVector.
+
+        Args:
+            distance_metric: The distance metric to check
+
+        Raises:
+            ValueError: If the distance metric is not supported
+        """
+        if distance_metric not in self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION:
+            supported_metrics = list(self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION.keys())
+            raise ValueError(
+                f"Distance metric '{distance_metric}' is not supported by PGVector. "
+                f"Supported metrics are: {', '.join(supported_metrics)}"
+            )
+

 class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
    def __init__(
@ -185,8 +352,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
        self.files_api = files_api
        self.kvstore: KVStore | None = None
        self.vector_db_store = None
-        self.openai_vector_store: dict[str, dict[str, Any]] = {}
-        self.metadatadata_collection_name = "openai_vector_stores_metadata"
+        self.openai_vector_stores: dict[str, dict[str, Any]] = {}
+        self.metadata_collection_name = "openai_vector_stores_metadata"

    async def initialize(self) -> None:
        log.info(f"Initializing PGVector memory adapter with config: {self.config}")
@ -233,9 +400,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
        upsert_models(self.conn, [(vector_db.identifier, vector_db)])

        # Create and cache the PGVector index table for the vector DB
+        pgvector_index = PGVectorIndex(
+            vector_db=vector_db, dimension=vector_db.embedding_dimension, conn=self.conn, kvstore=self.kvstore
+        )
+        await pgvector_index.initialize()
        index = VectorDBWithIndex(
            vector_db,
-            index=PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn, kvstore=self.kvstore),
+            index=pgvector_index,
            inference_api=self.inference_api,
        )
        self.cache[vector_db.identifier] = index
@ -272,8 +443,15 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
        if vector_db_id in self.cache:
            return self.cache[vector_db_id]

+        if self.vector_db_store is None:
+            raise VectorStoreNotFoundError(vector_db_id)
+
        vector_db = await self.vector_db_store.get_vector_db(vector_db_id)
+        if not vector_db:
+            raise VectorStoreNotFoundError(vector_db_id)
+
        index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn)
+        await index.initialize()
        self.cache[vector_db_id] = VectorDBWithIndex(vector_db, index, self.inference_api)
        return self.cache[vector_db_id]

--- a/llama_stack/providers/utils/bedrock/config.py
+++ b/llama_stack/providers/utils/bedrock/config.py
@ -4,53 +4,55 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import os
+
 from pydantic import BaseModel, Field


 class BedrockBaseConfig(BaseModel):
    aws_access_key_id: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
        description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
    )
    aws_secret_access_key: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
        description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
    )
    aws_session_token: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
        description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
    )
    region_name: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
        description="The default AWS Region to use, for example, us-west-1 or us-west-2."
        "Default use environment variable: AWS_DEFAULT_REGION",
    )
    profile_name: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_PROFILE"),
        description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
    )
    total_max_attempts: int | None = Field(
-        default=None,
+        default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
        description="An integer representing the maximum number of attempts that will be made for a single request, "
        "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
    )
    retry_mode: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
        description="A string representing the type of retries Boto3 will perform."
        "Default use environment variable: AWS_RETRY_MODE",
    )
    connect_timeout: float | None = Field(
-        default=60,
+        default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
        description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
        "The default is 60 seconds.",
    )
    read_timeout: float | None = Field(
-        default=60,
+        default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
        description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
        "The default is 60 seconds.",
    )
    session_ttl: int | None = Field(
-        default=3600,
+        default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
        description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
    )

--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import base64
 import struct
 from typing import TYPE_CHECKING
@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self.model_store.get_model(model_id)
-        embedding_model = self._load_sentence_transformer_model(model.provider_resource_id)
-        embeddings = embedding_model.encode(
-            [interleaved_content_as_str(content) for content in contents], show_progress_bar=False
+        embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
+        embeddings = await asyncio.to_thread(
+            embedding_model.encode,
+            [interleaved_content_as_str(content) for content in contents],
+            show_progress_bar=False,
        )
        return EmbeddingsResponse(embeddings=embeddings)

@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:

        # Get the model and generate embeddings
        model_obj = await self.model_store.get_model(model)
-        embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id)
-        embeddings = embedding_model.encode(input_list, show_progress_bar=False)
+        embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
+        embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)

        # Convert embeddings to the requested format
        data = []
@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
            usage=usage,
        )

-    def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
+    async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
        global EMBEDDING_MODELS

        loaded_model = EMBEDDING_MODELS.get(model)
@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
            return loaded_model

        log.info(f"Loading sentence transformer for {model}...")
-        from sentence_transformers import SentenceTransformer

-        loaded_model = SentenceTransformer(model)
+        def _load_model():
+            from sentence_transformers import SentenceTransformer
+
+            return SentenceTransformer(model)
+
+        loaded_model = await asyncio.to_thread(_load_model)
        EMBEDDING_MODELS[model] = loaded_model
        return loaded_model
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@ -294,12 +294,12 @@ class VectorDBWithIndex:
                _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)

        if chunks_to_embed:
-            resp = await self.inference_api.embeddings(
+            resp = await self.inference_api.openai_embeddings(
                self.vector_db.embedding_model,
                [c.content for c in chunks_to_embed],
            )
-            for c, embedding in zip(chunks_to_embed, resp.embeddings, strict=False):
-                c.embedding = embedding
+            for c, data in zip(chunks_to_embed, resp.data, strict=False):
+                c.embedding = data.embedding

        embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
        await self.index.add_chunks(chunks, embeddings)
@ -334,8 +334,8 @@ class VectorDBWithIndex:
        if mode == "keyword":
            return await self.index.query_keyword(query_string, k, score_threshold)

-        embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
-        query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+        embeddings_response = await self.inference_api.openai_embeddings(self.vector_db.embedding_model, [query_string])
+        query_vector = np.array(embeddings_response.data[0].embedding, dtype=np.float32)
        if mode == "hybrid":
            return await self.index.query_hybrid(
                query_vector, query_string, k, score_threshold, reranker_type, reranker_params
--- a/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py
+++ b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py
@ -23,6 +23,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.ext.asyncio.engine import AsyncEngine
+from sqlalchemy.sql.elements import ColumnElement

 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.log import get_logger
@ -43,6 +44,30 @@ TYPE_MAPPING: dict[ColumnType, Any] = {
 }


+def _build_where_expr(column: ColumnElement, value: Any) -> ColumnElement:
+    """Return a SQLAlchemy expression for a where condition.
+
+    `value` may be a simple scalar (equality) or a mapping like {">": 123}.
+    The returned expression is a SQLAlchemy ColumnElement usable in query.where(...).
+    """
+    if isinstance(value, Mapping):
+        if len(value) != 1:
+            raise ValueError(f"Operator mapping must have a single operator, got: {value}")
+        op, operand = next(iter(value.items()))
+        if op == "==" or op == "=":
+            return column == operand
+        if op == ">":
+            return column > operand
+        if op == "<":
+            return column < operand
+        if op == ">=":
+            return column >= operand
+        if op == "<=":
+            return column <= operand
+        raise ValueError(f"Unsupported operator '{op}' in where mapping")
+    return column == value
+
+
 class SqlAlchemySqlStoreImpl(SqlStore):
    def __init__(self, config: SqlAlchemySqlStoreConfig):
        self.config = config
@ -111,7 +136,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):

            if where:
                for key, value in where.items():
-                    query = query.where(table_obj.c[key] == value)
+                    query = query.where(_build_where_expr(table_obj.c[key], value))

            if where_sql:
                query = query.where(text(where_sql))
@ -222,7 +247,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
        async with self.async_session() as session:
            stmt = self.metadata.tables[table].update()
            for key, value in where.items():
-                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+                stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
            await session.execute(stmt, data)
            await session.commit()

@ -233,7 +258,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
        async with self.async_session() as session:
            stmt = self.metadata.tables[table].delete()
            for key, value in where.items():
-                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+                stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
            await session.execute(stmt)
            await session.commit()

--- a/llama_stack/providers/utils/tools/mcp.py
+++ b/llama_stack/providers/utils/tools/mcp.py
@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
                    raise AuthenticationRequiredError(exc) from exc
            if i == len(connection_strategies) - 1:
                raise
+        except* httpx.ConnectError as eg:
+            # Connection refused, server down, network unreachable
+            if i == len(connection_strategies) - 1:
+                error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
+                logger.error(f"MCP connection error: {error_msg}")
+                raise ConnectionError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
+        except* httpx.TimeoutException as eg:
+            # Request timeout, server too slow
+            if i == len(connection_strategies) - 1:
+                error_msg = f"MCP server at {endpoint} timed out"
+                logger.error(f"MCP timeout error: {error_msg}")
+                raise TimeoutError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
+        except* httpx.RequestError as eg:
+            # DNS resolution failures, network errors, invalid URLs
+            if i == len(connection_strategies) - 1:
+                # Get the first exception's message for the error string
+                exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
+                error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
+                logger.error(f"MCP network error: {error_msg}")
+                raise ConnectionError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
        except* McpError:
            if i < len(connection_strategies) - 1:
                logger.warning(
--- a/llama_stack/providers/utils/vector_io/vector_utils.py
+++ b/llama_stack/providers/utils/vector_io/vector_utils.py
@ -37,3 +37,122 @@ def sanitize_collection_name(name: str, weaviate_format=False) -> str:
    else:
        s = proper_case(re.sub(r"[^a-zA-Z0-9]", "", name))
    return s
+
+
+class WeightedInMemoryAggregator:
+    @staticmethod
+    def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
+        """
+        Normalize scores to 0-1 range using min-max normalization.
+
+        Args:
+            scores: dictionary of scores with document IDs as keys and scores as values
+
+        Returns:
+            Normalized scores with document IDs as keys and normalized scores as values
+        """
+        if not scores:
+            return {}
+        min_score, max_score = min(scores.values()), max(scores.values())
+        score_range = max_score - min_score
+        if score_range > 0:
+            return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
+        return dict.fromkeys(scores, 1.0)
+
+    @staticmethod
+    def weighted_rerank(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        alpha: float = 0.5,
+    ) -> dict[str, float]:
+        """
+        Rerank via weighted average of scores.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            alpha: weight factor between 0 and 1 (default: 0.5)
+                   0 = keyword only, 1 = vector only, 0.5 = equal weight
+
+        Returns:
+            All unique document IDs with weighted combined scores
+        """
+        all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+        normalized_vector_scores = WeightedInMemoryAggregator._normalize_scores(vector_scores)
+        normalized_keyword_scores = WeightedInMemoryAggregator._normalize_scores(keyword_scores)
+
+        # Weighted formula: score = (1-alpha) * keyword_score + alpha * vector_score
+        # alpha=0 means keyword only, alpha=1 means vector only
+        return {
+            doc_id: ((1 - alpha) * normalized_keyword_scores.get(doc_id, 0.0))
+            + (alpha * normalized_vector_scores.get(doc_id, 0.0))
+            for doc_id in all_ids
+        }
+
+    @staticmethod
+    def rrf_rerank(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        impact_factor: float = 60.0,
+    ) -> dict[str, float]:
+        """
+        Rerank via Reciprocal Rank Fusion.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            impact_factor: impact factor for RRF (default: 60.0)
+
+        Returns:
+            All unique document IDs with RRF combined scores
+        """
+
+        # Convert scores to ranks
+        vector_ranks = {
+            doc_id: i + 1
+            for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
+        }
+        keyword_ranks = {
+            doc_id: i + 1
+            for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
+        }
+
+        all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+        rrf_scores = {}
+        for doc_id in all_ids:
+            vector_rank = vector_ranks.get(doc_id, float("inf"))
+            keyword_rank = keyword_ranks.get(doc_id, float("inf"))
+
+            # RRF formula: score = 1/(k + r) where k is impact_factor (default: 60.0) and r is the rank
+            rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
+        return rrf_scores
+
+    @staticmethod
+    def combine_search_results(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        reranker_type: str = "rrf",
+        reranker_params: dict[str, float] | None = None,
+    ) -> dict[str, float]:
+        """
+        Combine vector and keyword search results using specified reranking strategy.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            reranker_type: type of reranker to use (default: RERANKER_TYPE_RRF)
+            reranker_params: parameters for the reranker
+
+        Returns:
+            All unique document IDs with combined scores
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        if reranker_type == "weighted":
+            alpha = reranker_params.get("alpha", 0.5)
+            return WeightedInMemoryAggregator.weighted_rerank(vector_scores, keyword_scores, alpha)
+        else:
+            # Default to RRF for None, RRF, or any unknown types
+            impact_factor = reranker_params.get("impact_factor", 60.0)
+            return WeightedInMemoryAggregator.rrf_rerank(vector_scores, keyword_scores, impact_factor)
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
 CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
 CompletionChoice.model_rebuild()

+REPO_ROOT = Path(__file__).parent.parent.parent
+DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
+

 class InferenceMode(StrEnum):
    LIVE = "live"
@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict


 def get_inference_mode() -> InferenceMode:
-    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower())
+    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())


 def setup_inference_recording():
@ -60,28 +63,18 @@ def setup_inference_recording():
    to increase their reliability and reduce reliance on expensive, external services.

    Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
-    Calls to the /models endpoint are not currently trapped. We probably need to add support for this.

-    Two environment variables are required:
-    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'.
-    - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in.
+    Two environment variables are supported:
+    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
+    - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.

-    The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to
-    quickly find the correct recording for a given request. The JSON files are used to store the request and response
-    bodies.
+    The recordings are stored as JSON files.
    """
    mode = get_inference_mode()
-
-    if mode not in InferenceMode:
-        raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")
-
    if mode == InferenceMode.LIVE:
        return None

-    if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ:
-        raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
-    storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
-
+    storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
    return inference_recording(mode=mode, storage_dir=storage_dir)


@ -134,8 +127,8 @@ class ResponseStorage:
    def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
        """Store a request/response pair."""
        # Generate unique response filename
-        response_file = f"{request_hash[:12]}.json"
-        response_path = self.responses_dir / response_file
+        short_hash = request_hash[:12]
+        response_file = f"{short_hash}.json"

        # Serialize response body if needed
        serialized_response = dict(response)
@ -147,6 +140,14 @@ class ResponseStorage:
                # Handle single response
                serialized_response["body"] = _serialize_response(serialized_response["body"])

+        # If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
+        endpoint = request.get("endpoint")
+        if endpoint in ("/api/tags", "/v1/models"):
+            digest = _model_identifiers_digest(endpoint, response)
+            response_file = f"models-{short_hash}-{digest}.json"
+
+        response_path = self.responses_dir / response_file
+
        # Save response to JSON file
        with open(response_path, "w") as f:
            json.dump({"request": request, "response": serialized_response}, f, indent=2)
@ -161,19 +162,85 @@ class ResponseStorage:
        if not response_path.exists():
            return None

-        with open(response_path) as f:
-            data = json.load(f)
+        return _recording_from_file(response_path)

-        # Deserialize response body if needed
-        if "response" in data and "body" in data["response"]:
-            if isinstance(data["response"]["body"], list):
-                # Handle streaming responses
-                data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
+    def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
+        results: list[dict[str, Any]] = []
+        for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
+            data = _recording_from_file(path)
+            results.append(data)
+        return results
+
+
+def _recording_from_file(response_path) -> dict[str, Any]:
+    with open(response_path) as f:
+        data = json.load(f)
+
+    # Deserialize response body if needed
+    if "response" in data and "body" in data["response"]:
+        if isinstance(data["response"]["body"], list):
+            # Handle streaming responses
+            data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
+        else:
+            # Handle single response
+            data["response"]["body"] = _deserialize_response(data["response"]["body"])
+
+    return cast(dict[str, Any], data)
+
+
+def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
+    def _extract_model_identifiers():
+        """Extract a stable set of identifiers for model-list endpoints.
+
+        Supported endpoints:
+        - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
+        - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
+        Returns a list of unique identifiers or None if structure doesn't match.
+        """
+        body = response["body"]
+        if endpoint == "/api/tags":
+            items = body.get("models")
+            idents = [m.model for m in items]
+        else:
+            items = body.get("data")
+            idents = [m.id for m in items]
+        return sorted(set(idents))
+
+    identifiers = _extract_model_identifiers()
+    return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
+
+
+def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
+    """Return a single, unioned recording for supported model-list endpoints."""
+    seen: dict[str, dict[str, Any]] = {}
+    for rec in records:
+        body = rec["response"]["body"]
+        if endpoint == "/api/tags":
+            items = body.models
+        elif endpoint == "/v1/models":
+            items = body.data
+        else:
+            items = []
+
+        for m in items:
+            if endpoint == "/v1/models":
+                key = m.id
            else:
-                # Handle single response
-                data["response"]["body"] = _deserialize_response(data["response"]["body"])
+                key = m.model
+            seen[key] = m

-        return cast(dict[str, Any], data)
+    ordered = [seen[k] for k in sorted(seen.keys())]
+    canonical = records[0]
+    canonical_req = canonical.get("request", {})
+    if isinstance(canonical_req, dict):
+        canonical_req["endpoint"] = endpoint
+    if endpoint == "/v1/models":
+        body = {"data": ordered, "object": "list"}
+    else:
+        from ollama import ListResponse
+
+        body = ListResponse(models=ordered)
+    return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}


 async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
@ -195,8 +262,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        raise ValueError(f"Unknown client type: {client_type}")

    url = base_url.rstrip("/") + endpoint
-
-    # Normalize request for matching
    method = "POST"
    headers = {}
    body = kwargs
@ -204,7 +269,12 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
    request_hash = normalize_request(method, url, headers, body)

    if _current_mode == InferenceMode.REPLAY:
-        recording = _current_storage.find_recording(request_hash)
+        # Special handling for model-list endpoints: return union of all responses
+        if endpoint in ("/api/tags", "/v1/models"):
+            records = _current_storage._model_list_responses(request_hash[:12])
+            recording = _combine_model_list_responses(endpoint, records)
+        else:
+            recording = _current_storage.find_recording(request_hash)
        if recording:
            response_body = recording["response"]["body"]

@ -274,12 +344,14 @@ def patch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

    # Store original methods for both OpenAI and Ollama clients
    _original_methods = {
        "chat_completions_create": AsyncChatCompletions.create,
        "completions_create": AsyncCompletions.create,
        "embeddings_create": AsyncEmbeddings.create,
+        "models_list": AsyncModels.list,
        "ollama_generate": OllamaAsyncClient.generate,
        "ollama_chat": OllamaAsyncClient.chat,
        "ollama_embed": OllamaAsyncClient.embed,
@ -304,10 +376,16 @@ def patch_inference_clients():
            _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
        )

+    async def patched_models_list(self, *args, **kwargs):
+        return await _patched_inference_method(
+            _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
+        )
+
    # Apply OpenAI patches
    AsyncChatCompletions.create = patched_chat_completions_create
    AsyncCompletions.create = patched_completions_create
    AsyncEmbeddings.create = patched_embeddings_create
+    AsyncModels.list = patched_models_list

    # Create patched methods for Ollama client
    async def patched_ollama_generate(self, *args, **kwargs):
@ -361,11 +439,13 @@ def unpatch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

    # Restore OpenAI client methods
    AsyncChatCompletions.create = _original_methods["chat_completions_create"]
    AsyncCompletions.create = _original_methods["completions_create"]
    AsyncEmbeddings.create = _original_methods["embeddings_create"]
+    AsyncModels.list = _original_methods["models_list"]

    # Restore Ollama client methods if they were patched
    OllamaAsyncClient.generate = _original_methods["ollama_generate"]
@ -379,16 +459,10 @@ def unpatch_inference_clients():


@contextmanager
-def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]:
+def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
    """Context manager for inference recording/replaying."""
    global _current_mode, _current_storage

-    # Set defaults
-    if storage_dir is None:
-        storage_dir_path = Path.home() / ".llama" / "recordings"
-    else:
-        storage_dir_path = Path(storage_dir)
-
    # Store previous state
    prev_mode = _current_mode
    prev_storage = _current_storage
@ -397,7 +471,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
        _current_mode = mode

        if mode in ["record", "replay"]:
-            _current_storage = ResponseStorage(storage_dir_path)
+            if storage_dir is None:
+                raise ValueError("storage_dir is required for record and replay modes")
+            _current_storage = ResponseStorage(Path(storage_dir))
            patch_inference_clients()

        yield
--- a/llama_stack/ui/package-lock.json
+++ b/llama_stack/ui/package-lock.json
@ -14,11 +14,11 @@
        "@radix-ui/react-select": "^2.2.5",
        "@radix-ui/react-separator": "^1.1.7",
        "@radix-ui/react-slot": "^1.2.3",
-        "@radix-ui/react-tooltip": "^1.2.6",
+        "@radix-ui/react-tooltip": "^1.2.8",
        "class-variance-authority": "^0.7.1",
        "clsx": "^2.1.1",
-        "framer-motion": "^11.18.2",
-        "llama-stack-client": "^0.2.19",
+        "framer-motion": "^12.23.12",
+        "llama-stack-client": "^0.2.20",
        "lucide-react": "^0.510.0",
        "next": "15.3.3",
        "next-auth": "^4.24.11",
@ -39,16 +39,16 @@
        "@testing-library/jest-dom": "^6.8.0",
        "@testing-library/react": "^16.3.0",
        "@types/jest": "^29.5.14",
-        "@types/node": "^20",
+        "@types/node": "^24",
        "@types/react": "^19",
        "@types/react-dom": "^19",
        "eslint": "^9",
-        "eslint-config-next": "15.3.2",
+        "eslint-config-next": "15.5.2",
        "eslint-config-prettier": "^10.1.8",
        "eslint-plugin-prettier": "^5.5.4",
        "jest": "^29.7.0",
        "jest-environment-jsdom": "^29.7.0",
-        "prettier": "3.5.3",
+        "prettier": "3.6.2",
        "tailwindcss": "^4",
        "ts-node": "^10.9.2",
        "tw-animate-css": "^1.2.9",
@ -1854,9 +1854,9 @@
      "integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw=="
    },
    "node_modules/@next/eslint-plugin-next": {
-      "version": "15.3.2",
-      "resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.3.2.tgz",
-      "integrity": "sha512-ijVRTXBgnHT33aWnDtmlG+LJD+5vhc9AKTJPquGG5NKXjpKNjc62woIhFtrAcWdBobt8kqjCoaJ0q6sDQoX7aQ==",
+      "version": "15.5.2",
+      "resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.5.2.tgz",
+      "integrity": "sha512-lkLrRVxcftuOsJNhWatf1P2hNVfh98k/omQHrCEPPriUypR6RcS13IvLdIrEvkm9AH2Nu2YpR5vLqBuy6twH3Q==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
@ -2861,29 +2861,6 @@
        }
      }
    },
-    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-visually-hidden": {
-      "version": "1.2.3",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
-      "integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
-      "license": "MIT",
-      "dependencies": {
-        "@radix-ui/react-primitive": "2.1.3"
-      },
-      "peerDependencies": {
-        "@types/react": "*",
-        "@types/react-dom": "*",
-        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
-        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
-      },
-      "peerDependenciesMeta": {
-        "@types/react": {
-          "optional": true
-        },
-        "@types/react-dom": {
-          "optional": true
-        }
-      }
-    },
    "node_modules/@radix-ui/react-separator": {
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-separator/-/react-separator-1.1.7.tgz",
@ -2949,23 +2926,23 @@
      }
    },
    "node_modules/@radix-ui/react-tooltip": {
-      "version": "1.2.6",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.6.tgz",
-      "integrity": "sha512-zYb+9dc9tkoN2JjBDIIPLQtk3gGyz8FMKoqYTb8EMVQ5a5hBcdHPECrsZVI4NpPAUOixhkoqg7Hj5ry5USowfA==",
+      "version": "1.2.8",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.8.tgz",
+      "integrity": "sha512-tY7sVt1yL9ozIxvmbtN5qtmH2krXcBCfjEiCgKGLqunJHvgvZG2Pcl2oQ3kbcZARb1BGEHdkLzcYGO8ynVlieg==",
      "license": "MIT",
      "dependencies": {
-        "@radix-ui/primitive": "1.1.2",
+        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
-        "@radix-ui/react-dismissable-layer": "1.1.9",
+        "@radix-ui/react-dismissable-layer": "1.1.11",
        "@radix-ui/react-id": "1.1.1",
-        "@radix-ui/react-popper": "1.2.6",
-        "@radix-ui/react-portal": "1.1.8",
-        "@radix-ui/react-presence": "1.1.4",
-        "@radix-ui/react-primitive": "2.1.2",
-        "@radix-ui/react-slot": "1.2.2",
+        "@radix-ui/react-popper": "1.2.8",
+        "@radix-ui/react-portal": "1.1.9",
+        "@radix-ui/react-presence": "1.1.5",
+        "@radix-ui/react-primitive": "2.1.3",
+        "@radix-ui/react-slot": "1.2.3",
        "@radix-ui/react-use-controllable-state": "1.2.2",
-        "@radix-ui/react-visually-hidden": "1.2.2"
+        "@radix-ui/react-visually-hidden": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@ -2982,21 +2959,162 @@
        }
      }
    },
-    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-slot": {
-      "version": "1.2.2",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz",
-      "integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==",
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/primitive": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
+      "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
+      "license": "MIT"
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
+      "version": "1.1.7",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
+      "integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
      "license": "MIT",
      "dependencies": {
-        "@radix-ui/react-compose-refs": "1.1.2"
+        "@radix-ui/react-primitive": "2.1.3"
      },
      "peerDependencies": {
        "@types/react": "*",
-        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
+      "version": "1.1.11",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
+      "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
+      "license": "MIT",
+      "dependencies": {
+        "@radix-ui/primitive": "1.1.3",
+        "@radix-ui/react-compose-refs": "1.1.2",
+        "@radix-ui/react-primitive": "2.1.3",
+        "@radix-ui/react-use-callback-ref": "1.1.1",
+        "@radix-ui/react-use-escape-keydown": "1.1.1"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
+      "version": "1.2.8",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
+      "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
+      "license": "MIT",
+      "dependencies": {
+        "@floating-ui/react-dom": "^2.0.0",
+        "@radix-ui/react-arrow": "1.1.7",
+        "@radix-ui/react-compose-refs": "1.1.2",
+        "@radix-ui/react-context": "1.1.2",
+        "@radix-ui/react-primitive": "2.1.3",
+        "@radix-ui/react-use-callback-ref": "1.1.1",
+        "@radix-ui/react-use-layout-effect": "1.1.1",
+        "@radix-ui/react-use-rect": "1.1.1",
+        "@radix-ui/react-use-size": "1.1.1",
+        "@radix-ui/rect": "1.1.1"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
+      "version": "1.1.9",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
+      "integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@radix-ui/react-primitive": "2.1.3",
+        "@radix-ui/react-use-layout-effect": "1.1.1"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-presence": {
+      "version": "1.1.5",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
+      "integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@radix-ui/react-compose-refs": "1.1.2",
+        "@radix-ui/react-use-layout-effect": "1.1.1"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-primitive": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
+      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@radix-ui/react-slot": "1.2.3"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
        }
      }
    },
@ -3137,12 +3255,35 @@
      }
    },
    "node_modules/@radix-ui/react-visually-hidden": {
-      "version": "1.2.2",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.2.tgz",
-      "integrity": "sha512-ORCmRUbNiZIv6uV5mhFrhsIKw4UX/N3syZtyqvry61tbGm4JlgQuSn0hk5TwCARsCjkcnuRkSdCE3xfb+ADHew==",
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
+      "integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
      "license": "MIT",
      "dependencies": {
-        "@radix-ui/react-primitive": "2.1.2"
+        "@radix-ui/react-primitive": "2.1.3"
+      },
+      "peerDependencies": {
+        "@types/react": "*",
+        "@types/react-dom": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
+        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@radix-ui/react-visually-hidden/node_modules/@radix-ui/react-primitive": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
+      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@ -3910,12 +4051,12 @@
      "license": "MIT"
    },
    "node_modules/@types/node": {
-      "version": "20.17.47",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.47.tgz",
-      "integrity": "sha512-3dLX0Upo1v7RvUimvxLeXqwrfyKxUINk0EAM83swP2mlSUcwV73sZy8XhNz8bcZ3VbsfQyC/y6jRdL5tgCNpDQ==",
+      "version": "24.3.0",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.3.0.tgz",
+      "integrity": "sha512-aPTXCrfwnDLj4VvXrm+UUCQjNEvJgNA8s5F1cvwQU+3KNltTOkBm1j30uNLyqqPNe7gE3KFzImYoZEfLhp4Yow==",
      "license": "MIT",
      "dependencies": {
-        "undici-types": "~6.19.2"
+        "undici-types": "~7.10.0"
      }
    },
    "node_modules/@types/node-fetch": {
@ -6433,13 +6574,13 @@
      }
    },
    "node_modules/eslint-config-next": {
-      "version": "15.3.2",
-      "resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.3.2.tgz",
-      "integrity": "sha512-FerU4DYccO4FgeYFFglz0SnaKRe1ejXQrDb8kWUkTAg036YWi+jUsgg4sIGNCDhAsDITsZaL4MzBWKB6f4G1Dg==",
+      "version": "15.5.2",
+      "resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.5.2.tgz",
+      "integrity": "sha512-3hPZghsLupMxxZ2ggjIIrat/bPniM2yRpsVPVM40rp8ZMzKWOJp2CGWn7+EzoV2ddkUr5fxNfHpF+wU1hGt/3g==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
-        "@next/eslint-plugin-next": "15.3.2",
+        "@next/eslint-plugin-next": "15.5.2",
        "@rushstack/eslint-patch": "^1.10.3",
        "@typescript-eslint/eslint-plugin": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
        "@typescript-eslint/parser": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
@ -7268,13 +7409,13 @@
      }
    },
    "node_modules/framer-motion": {
-      "version": "11.18.2",
-      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.18.2.tgz",
-      "integrity": "sha512-5F5Och7wrvtLVElIpclDT0CBzMVg3dL22B64aZwHtsIY8RB4mXICLrkajK4G9R+ieSAGcgrLeae2SeUTg2pr6w==",
+      "version": "12.23.12",
+      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-12.23.12.tgz",
+      "integrity": "sha512-6e78rdVtnBvlEVgu6eFEAgG9v3wLnYEboM8I5O5EXvfKC8gxGQB8wXJdhkMy10iVcn05jl6CNw7/HTsTCfwcWg==",
      "license": "MIT",
      "dependencies": {
-        "motion-dom": "^11.18.1",
-        "motion-utils": "^11.18.1",
+        "motion-dom": "^12.23.12",
+        "motion-utils": "^12.23.6",
        "tslib": "^2.4.0"
      },
      "peerDependencies": {
@ -10006,9 +10147,9 @@
      "license": "MIT"
    },
    "node_modules/llama-stack-client": {
-      "version": "0.2.19",
-      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.19.tgz",
-      "integrity": "sha512-sDuAhUdEGlERZ3jlMUzPXcQTgMv/pGbDrPX0ifbE5S+gr7Q+7ohuQYrIXe+hXgIipFjq+y4b2c5laZ76tmAyEA==",
+      "version": "0.2.20",
+      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz",
+      "integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==",
      "license": "MIT",
      "dependencies": {
        "@types/node": "^18.11.18",
@ -11184,18 +11325,18 @@
      }
    },
    "node_modules/motion-dom": {
-      "version": "11.18.1",
-      "resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-11.18.1.tgz",
-      "integrity": "sha512-g76KvA001z+atjfxczdRtw/RXOM3OMSdd1f4DL77qCTF/+avrRJiawSG4yDibEQ215sr9kpinSlX2pCTJ9zbhw==",
+      "version": "12.23.12",
+      "resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-12.23.12.tgz",
+      "integrity": "sha512-RcR4fvMCTESQBD/uKQe49D5RUeDOokkGRmz4ceaJKDBgHYtZtntC/s2vLvY38gqGaytinij/yi3hMcWVcEF5Kw==",
      "license": "MIT",
      "dependencies": {
-        "motion-utils": "^11.18.1"
+        "motion-utils": "^12.23.6"
      }
    },
    "node_modules/motion-utils": {
-      "version": "11.18.1",
-      "resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-11.18.1.tgz",
-      "integrity": "sha512-49Kt+HKjtbJKLtgO/LKj9Ld+6vw9BjH5d9sc40R/kVyH8GLAXgT42M2NnuPcJNuA3s9ZfZBUcwIgpmZWGEE+hA==",
+      "version": "12.23.6",
+      "resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-12.23.6.tgz",
+      "integrity": "sha512-eAWoPgr4eFEOFfg2WjIsMoqJTW6Z8MTUCgn/GZ3VRpClWBdnbjryiA3ZSNLyxCTmCQx4RmYX6jX1iWHbenUPNQ==",
      "license": "MIT"
    },
    "node_modules/ms": {
@ -12083,9 +12224,9 @@
      }
    },
    "node_modules/prettier": {
-      "version": "3.5.3",
-      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
-      "integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
+      "version": "3.6.2",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz",
+      "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
      "dev": true,
      "license": "MIT",
      "bin": {
@ -13986,9 +14127,9 @@
      }
    },
    "node_modules/undici-types": {
-      "version": "6.19.8",
-      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz",
-      "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==",
+      "version": "7.10.0",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.10.0.tgz",
+      "integrity": "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==",
      "license": "MIT"
    },
    "node_modules/unified": {
--- a/llama_stack/ui/package.json
+++ b/llama_stack/ui/package.json
@ -19,11 +19,11 @@
    "@radix-ui/react-select": "^2.2.5",
    "@radix-ui/react-separator": "^1.1.7",
    "@radix-ui/react-slot": "^1.2.3",
-    "@radix-ui/react-tooltip": "^1.2.6",
+    "@radix-ui/react-tooltip": "^1.2.8",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
-    "framer-motion": "^11.18.2",
-    "llama-stack-client": "^0.2.19",
+    "framer-motion": "^12.23.12",
+    "llama-stack-client": "^0.2.20",
    "lucide-react": "^0.510.0",
    "next": "15.3.3",
    "next-auth": "^4.24.11",
@ -44,16 +44,16 @@
    "@testing-library/jest-dom": "^6.8.0",
    "@testing-library/react": "^16.3.0",
    "@types/jest": "^29.5.14",
-    "@types/node": "^20",
+    "@types/node": "^24",
    "@types/react": "^19",
    "@types/react-dom": "^19",
    "eslint": "^9",
-    "eslint-config-next": "15.3.2",
+    "eslint-config-next": "15.5.2",
    "eslint-config-prettier": "^10.1.8",
    "eslint-plugin-prettier": "^5.5.4",
    "jest": "^29.7.0",
    "jest-environment-jsdom": "^29.7.0",
-    "prettier": "3.5.3",
+    "prettier": "3.6.2",
    "tailwindcss": "^4",
    "ts-node": "^10.9.2",
    "tw-animate-css": "^1.2.9",