Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-04 10:10:36 +00:00)

Merge remote-tracking branch 'upstream/main' into api-pkg

Signed-off-by: Charlie Doern <cdoern@redhat.com>

Commit d6b915ce0a
48 changed files with 1990 additions and 425 deletions
@@ -1,6 +1,7 @@
 {
     "default": [
         {"suite": "base", "setup": "ollama"},
+        {"suite": "base", "setup": "ollama-postgres", "allowed_clients": ["server"], "stack_config": "server:ci-tests::run-with-postgres-store.yaml"},
         {"suite": "vision", "setup": "ollama-vision"},
         {"suite": "responses", "setup": "gpt"},
         {"suite": "base-vllm-subset", "setup": "vllm"}
@@ -233,10 +233,21 @@ def instantiate_llama_stack_client(session):
         raise ValueError("You must specify either --stack-config or LLAMA_STACK_CONFIG")

     # Handle server:<config_name> format or server:<config_name>:<port>
+    # Also handles server:<distro>::<run_file.yaml> format
     if config.startswith("server:"):
-        parts = config.split(":")
-        config_name = parts[1]
-        port = int(parts[2]) if len(parts) > 2 else int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
+        # Strip the "server:" prefix first
+        config_part = config[7:]  # len("server:") == 7
+
+        # Check for :: (distro::runfile format)
+        if "::" in config_part:
+            config_name = config_part
+            port = int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
+        else:
+            # Single colon format: either <name> or <name>:<port>
+            parts = config_part.split(":")
+            config_name = parts[0]
+            port = int(parts[1]) if len(parts) > 1 else int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
+
         base_url = f"http://localhost:{port}"

         force_restart = os.environ.get("LLAMA_STACK_TEST_FORCE_SERVER_RESTART") == "1"
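For reference, a minimal standalone sketch of the parsing behaviour above, showing the three accepted --stack-config forms (the 8321 fallback port is an assumption for illustration; the real code uses DEFAULT_PORT and the LLAMA_STACK_PORT env var):

import os

DEFAULT_PORT = 8321  # assumed default, for illustration only


def parse_server_config(config: str) -> tuple[str, int]:
    """Return (config_name, port) for a "server:..." test config string."""
    config_part = config[len("server:"):]
    if "::" in config_part:
        # distro::run_file.yaml form: the whole remainder is the config name
        return config_part, int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
    parts = config_part.split(":")
    port = int(parts[1]) if len(parts) > 1 else int(os.environ.get("LLAMA_STACK_PORT", DEFAULT_PORT))
    return parts[0], port


# parse_server_config("server:ci-tests")                                -> ("ci-tests", 8321)
# parse_server_config("server:ci-tests:8322")                           -> ("ci-tests", 8322)
# parse_server_config("server:ci-tests::run-with-postgres-store.yaml")  -> ("ci-tests::run-with-postgres-store.yaml", 8321)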
@@ -323,7 +334,13 @@ def require_server(llama_stack_client):
 @pytest.fixture(scope="session")
 def openai_client(llama_stack_client, require_server):
     base_url = f"{llama_stack_client.base_url}/v1"
-    return OpenAI(base_url=base_url, api_key="fake")
+    client = OpenAI(base_url=base_url, api_key="fake", max_retries=0, timeout=30.0)
+    yield client
+    # Cleanup: close HTTP connections
+    try:
+        client.close()
+    except Exception:
+        pass


 @pytest.fixture(params=["openai_client", "client_with_models"])
tests/integration/recordings/README.md (generated, 4 changes)
@@ -2,6 +2,10 @@

 This directory contains recorded inference API responses used for deterministic testing without requiring live API access.

+For more information, see the
+[docs](https://llamastack.github.io/docs/contributing/testing/record-replay).
+This README provides more technical information.
+
 ## Structure

 - `responses/` - JSON files containing request/response pairs for inference operations
@@ -115,7 +115,15 @@ def openai_client(base_url, api_key, provider):
         client = LlamaStackAsLibraryClient(config, skip_logger_removal=True)
         return client

-    return OpenAI(
+    client = OpenAI(
         base_url=base_url,
         api_key=api_key,
+        max_retries=0,
+        timeout=30.0,
     )
+    yield client
+    # Cleanup: close HTTP connections
+    try:
+        client.close()
+    except Exception:
+        pass
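Both openai_client fixture changes above follow the same pattern: return becomes yield so pytest runs teardown after the session and the client's pooled HTTP connections are closed explicitly, while max_retries=0 and timeout=30.0 keep a hung server from stalling the whole suite. A self-contained sketch of that yield-with-teardown pattern (the fake client class below is a stand-in for OpenAI(...), not from this diff):

import pytest


class _FakeClient:
    """Stand-in for OpenAI(...) so the sketch is self-contained."""

    def close(self) -> None:
        pass


@pytest.fixture(scope="session")
def example_client():
    client = _FakeClient()
    yield client        # tests run while the fixture is suspended here
    try:
        client.close()  # teardown: release pooled HTTP connections
    except Exception:
        pass


def test_uses_example_client(example_client):
    assert example_client is not None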
@@ -65,8 +65,14 @@ class TestConversationResponses:
         conversation_items = openai_client.conversations.items.list(conversation.id)
         assert len(conversation_items.data) >= 4  # 2 user + 2 assistant messages

+    @pytest.mark.timeout(60, method="thread")
     def test_conversation_context_loading(self, openai_client, text_model_id):
-        """Test that conversation context is properly loaded for responses."""
+        """Test that conversation context is properly loaded for responses.
+
+        Note: 60s timeout added due to CI-specific deadlock in pytest/OpenAI client/httpx
+        after running 25+ tests. Hangs before first HTTP request is made. Works fine locally.
+        Investigation needed: connection pool exhaustion or event loop state issue.
+        """
         conversation = openai_client.conversations.create(
             items=[
                 {"type": "message", "role": "user", "content": "My name is Alice. I like to eat apples."},
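The timeout marker added above comes from the pytest-timeout plugin; method="thread" uses a watchdog thread rather than SIGALRM, so it still fires when the test is stuck in a blocking call, as the new docstring describes. A minimal usage sketch (assuming pytest-timeout is installed; the test body is illustrative):

import time

import pytest


@pytest.mark.timeout(60, method="thread")  # watchdog thread aborts the run if this exceeds 60s
def test_finishes_quickly():
    time.sleep(1)  # a real test would issue its HTTP requests here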
@@ -71,6 +71,26 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
             "embedding_model": "ollama/nomic-embed-text:v1.5",
         },
     ),
+    "ollama-postgres": Setup(
+        name="ollama-postgres",
+        description="Server-mode tests with Postgres-backed persistence",
+        env={
+            "OLLAMA_URL": "http://0.0.0.0:11434",
+            "SAFETY_MODEL": "ollama/llama-guard3:1b",
+            "POSTGRES_HOST": "127.0.0.1",
+            "POSTGRES_PORT": "5432",
+            "POSTGRES_DB": "llamastack",
+            "POSTGRES_USER": "llamastack",
+            "POSTGRES_PASSWORD": "llamastack",
+            "LLAMA_STACK_LOGGING": "openai_responses=info",
+        },
+        defaults={
+            "text_model": "ollama/llama3.2:3b-instruct-fp16",
+            "embedding_model": "sentence-transformers/nomic-embed-text-v1.5",
+            "safety_model": "ollama/llama-guard3:1b",
+            "safety_shield": "llama-guard",
+        },
+    ),
     "vllm": Setup(
         name="vllm",
         description="vLLM provider with a text model",
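The ollama-postgres setup only exports environment variables; the server started from run-with-postgres-store.yaml is expected to read them. As a hypothetical illustration (the helper below is not from this commit), variables like these typically assemble into a standard libpq-style connection URL:

import os


def postgres_url_from_env() -> str:
    # Mirrors the env block of the ollama-postgres Setup above.
    host = os.environ.get("POSTGRES_HOST", "127.0.0.1")
    port = os.environ.get("POSTGRES_PORT", "5432")
    db = os.environ.get("POSTGRES_DB", "llamastack")
    user = os.environ.get("POSTGRES_USER", "llamastack")
    password = os.environ.get("POSTGRES_PASSWORD", "llamastack")
    return f"postgresql://{user}:{password}@{host}:{port}/{db}"


# -> postgresql://llamastack:llamastack@127.0.0.1:5432/llamastack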
@@ -8,6 +8,7 @@ import time
 from io import BytesIO

 import pytest
+from llama_stack_api.apis.files import ExpiresAfter
 from llama_stack_api.apis.vector_io import Chunk
 from llama_stack_client import BadRequestError
 from openai import BadRequestError as OpenAIBadRequestError
@@ -1604,3 +1605,97 @@ def test_openai_vector_store_embedding_config_from_metadata(

     assert "metadata_config_store" in store_names
     assert "consistent_config_store" in store_names
+
+
+@vector_provider_wrapper
+def test_openai_vector_store_file_contents_with_extra_query(
+    compat_client_with_empty_stores, client_with_models, embedding_model_id, embedding_dimension, vector_io_provider_id
+):
+    """Test that vector store file contents endpoint supports extra_query parameter."""
+    skip_if_provider_doesnt_support_openai_vector_stores(client_with_models)
+    compat_client = compat_client_with_empty_stores
+
+    # Create a vector store
+    vector_store = compat_client.vector_stores.create(
+        name="test_extra_query_store",
+        extra_body={
+            "embedding_model": embedding_model_id,
+            "provider_id": vector_io_provider_id,
+        },
+    )
+
+    # Create and attach a file
+    test_content = b"This is test content for extra_query validation."
+    with BytesIO(test_content) as file_buffer:
+        file_buffer.name = "test_extra_query.txt"
+        file = compat_client.files.create(
+            file=file_buffer,
+            purpose="assistants",
+            expires_after=ExpiresAfter(anchor="created_at", seconds=86400),
+        )
+
+    file_attach_response = compat_client.vector_stores.files.create(
+        vector_store_id=vector_store.id,
+        file_id=file.id,
+        extra_body={"embedding_model": embedding_model_id},
+    )
+    assert file_attach_response.status == "completed"
+
+    # Wait for processing
+    time.sleep(2)
+
+    # Test that extra_query parameter is accepted and processed
+    content_with_extra_query = compat_client.vector_stores.files.content(
+        vector_store_id=vector_store.id,
+        file_id=file.id,
+        extra_query={"include_embeddings": True, "include_metadata": True},
+    )
+
+    # Test without extra_query for comparison
+    content_without_extra_query = compat_client.vector_stores.files.content(
+        vector_store_id=vector_store.id,
+        file_id=file.id,
+    )
+
+    # Validate that both calls succeed
+    assert content_with_extra_query is not None
+    assert content_without_extra_query is not None
+    assert len(content_with_extra_query.data) > 0
+    assert len(content_without_extra_query.data) > 0
+
+    # Validate that extra_query parameter is processed correctly
+    # Both should have the embedding/metadata fields available (may be None based on flags)
+    first_chunk_with_flags = content_with_extra_query.data[0]
+    first_chunk_without_flags = content_without_extra_query.data[0]
+
+    # The key validation: extra_query fields are present in the response
+    # Handle both dict and object responses (different clients may return different formats)
+    def has_field(obj, field):
+        if isinstance(obj, dict):
+            return field in obj
+        else:
+            return hasattr(obj, field)
+
+    # Validate that all expected fields are present in both responses
+    expected_fields = ["embedding", "chunk_metadata", "metadata", "text"]
+    for field in expected_fields:
+        assert has_field(first_chunk_with_flags, field), f"Field '{field}' missing from response with extra_query"
+        assert has_field(first_chunk_without_flags, field), f"Field '{field}' missing from response without extra_query"
+
+    # Validate content is the same
+    def get_field(obj, field):
+        if isinstance(obj, dict):
+            return obj[field]
+        else:
+            return getattr(obj, field)
+
+    assert get_field(first_chunk_with_flags, "text") == test_content.decode("utf-8")
+    assert get_field(first_chunk_without_flags, "text") == test_content.decode("utf-8")
+
+    with_flags_embedding = get_field(first_chunk_with_flags, "embedding")
+    without_flags_embedding = get_field(first_chunk_without_flags, "embedding")
+
+    # Validate that embeddings are included when requested and excluded when not requested
+    assert with_flags_embedding is not None, "Embeddings should be included when include_embeddings=True"
+    assert len(with_flags_embedding) > 0, "Embedding should be a non-empty list"
+    assert without_flags_embedding is None, "Embeddings should not be included when include_embeddings=False"
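The new test leans on the OpenAI Python client's generic pass-through arguments: extra_body adds fields to the JSON request body and extra_query adds URL query parameters that the generated SDK does not model, which is how the Llama Stack-specific embedding_model, include_embeddings and include_metadata flags reach the server. A condensed sketch of the same call (base URL, API key and IDs are placeholders for illustration):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="fake")  # assumed local server

contents = client.vector_stores.files.content(
    vector_store_id="vs_123",   # placeholder IDs
    file_id="file-abc123",
    extra_query={"include_embeddings": True, "include_metadata": True},
)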