Merge branch 'main' into prompt-api

2025-12-17 07:22:35 +00:00 · 2025-09-06 21:53:34 -06:00 · 2025-09-06 21:53:34 -06:00 · 60361b910c
commit 60361b910c
parent 574dffbe38 78cab5331a
8 changed files with 112 additions and 138 deletions
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -5,6 +5,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
 | API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
--- a/.github/workflows/conformance.yml
+++ b/.github/workflows/conformance.yml
@ -0,0 +1,57 @@
 # API Conformance Tests
 # This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
 # It runs schema validation and OpenAPI diff checks to catch breaking changes early
 name: API Conformance Tests
 run-name: Run the API Conformance test suite on the changes.
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
      - '!llama_stack/ui/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - '.github/workflows/conformance.yml' # This workflow itself
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  # Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
  cancel-in-progress: true
 jobs:
  # Job to check if API schema changes maintain backward compatibility
  check-schema-compatibility:
    runs-on: ubuntu-latest
    steps:
      # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
      # This ensures consistent behavior between local testing and CI
      - name: Checkout PR Code
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
      # Check out the revision to compare against into ./base.
      #  - pull_request events: the PR's base branch (usually main).
      #  - push events: there is no pull_request payload, so fall back to the commit that was
      #    at the tip of the branch before the push (github.event.before). Without the fallback
      #    the ref is empty, checkout defaults to the pushed commit, and the diff step would
      #    compare the spec against itself.
      - name: Checkout Base Branch
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          ref: ${{ github.event.pull_request.base.ref || github.event.before }}
          path: 'base'
      # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
      # NOTE(review): the installer script is fetched from the `main` branch and piped straight to `sh`,
      # so the oasdiff version is not pinned the way the checkout action above is pinned to a commit SHA.
      # Consider pinning to a tagged release so CI results stay reproducible.
      - name: Install oasdiff
        run: |
          curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
      # Run oasdiff to detect breaking changes in the API specification.
      # This step fails when incompatible changes are detected (--fail-on ERR), which blocks the merge.
      # Only the OpenAI-compatible, vector-io and vector-dbs routes are checked. oasdiff documents
      # --match-path as a single regular expression, so the three prefixes are folded into one
      # alternation rather than passed as repeated flags (a repeated single-value flag would
      # normally keep only its last value).
      - name: Run OpenAPI Breaking Change Diff
        run: |
          oasdiff breaking --fail-on ERR \
            base/docs/_static/llama-stack-spec.yaml \
            docs/_static/llama-stack-spec.yaml \
            --match-path '^/v1/(openai/v1|vector-io|vector-dbs)'
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -207,7 +207,7 @@ def available_providers() -> list[ProviderSpec]:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="gemini",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.gemini",
                config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
                provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -248,7 +248,7 @@ Available Models:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="groq",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.groq",
                config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
                provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -270,7 +270,7 @@ Available Models:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="sambanova",
-                pip_packages=["litellm"],
+                pip_packages=["litellm", "openai"],
                module="llama_stack.providers.remote.inference.sambanova",
                config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
                provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
--- a/llama_stack/providers/remote/inference/gemini/gemini.py
+++ b/llama_stack/providers/remote/inference/gemini/gemini.py
@ -5,12 +5,13 @@
 # the root directory of this source tree.
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from .config import GeminiConfig
 from .models import MODEL_ENTRIES
-class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
+class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
    def __init__(self, config: GeminiConfig) -> None:
        LiteLLMOpenAIMixin.__init__(
            self,
@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
        )
        self.config = config
    get_api_key = LiteLLMOpenAIMixin.get_api_key
    def get_base_url(self) -> str:
        """Return the fixed base URL of Gemini's OpenAI-style endpoint.

        The value is a constant; it does not depend on ``self.config``.
        """
        return "https://generativelanguage.googleapis.com/v1beta/openai/"
    async def initialize(self) -> None:
        await super().initialize()
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -4,30 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from collections.abc import AsyncIterator
 from typing import Any
 from openai import AsyncOpenAI
 from llama_stack.apis.inference import (
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAIChoiceDelta,
    OpenAIChunkChoice,
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
    OpenAISystemMessageParam,
 )
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import (
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
    prepare_openai_completion_params,
 )
 from .models import MODEL_ENTRIES
-class GroqInferenceAdapter(LiteLLMOpenAIMixin):
+class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
    _config: GroqConfig
    def __init__(self, config: GroqConfig):
@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
        )
        self.config = config
    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
    get_api_key = LiteLLMOpenAIMixin.get_api_key
    def get_base_url(self) -> str:
        """Return the configured Groq URL with the ``/openai/v1`` path appended."""
        return f"{self.config.url}/openai/v1"
    async def initialize(self) -> None:
        """Initialize the adapter by delegating to the next ``initialize`` in the MRO."""
        await super().initialize()
    async def shutdown(self) -> None:
        """Shut the adapter down by delegating to the next ``shutdown`` in the MRO."""
        await super().shutdown()
    def _get_openai_client(self) -> AsyncOpenAI:
        """Build an ``AsyncOpenAI`` client for Groq's ``/openai/v1`` endpoint.

        A new client object is constructed on every call; the API key is read
        through ``self.get_api_key()`` at construction time.

        :return: A freshly constructed ``AsyncOpenAI`` client.
        """
        return AsyncOpenAI(
            base_url=f"{self.config.url}/openai/v1",
            api_key=self.get_api_key(),
        )
    async def openai_chat_completion(
        self,
        model: str,
        messages: list[OpenAIMessageParam],
        frequency_penalty: float | None = None,
        function_call: str | dict[str, Any] | None = None,
        functions: list[dict[str, Any]] | None = None,
        logit_bias: dict[str, float] | None = None,
        logprobs: bool | None = None,
        max_completion_tokens: int | None = None,
        max_tokens: int | None = None,
        n: int | None = None,
        parallel_tool_calls: bool | None = None,
        presence_penalty: float | None = None,
        response_format: OpenAIResponseFormatParam | None = None,
        seed: int | None = None,
        stop: str | list[str] | None = None,
        stream: bool | None = None,
        stream_options: dict[str, Any] | None = None,
        temperature: float | None = None,
        tool_choice: str | dict[str, Any] | None = None,
        tools: list[dict[str, Any]] | None = None,
        top_logprobs: int | None = None,
        top_p: float | None = None,
        user: str | None = None,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        """Send an OpenAI-style chat completion request to Groq, applying Groq-specific workarounds.

        The request is adjusted in three ways before it is sent:

        * A ``json_schema`` response format is downgraded to ``json_object`` and the schema is
          moved into the system prompt as plain-text instructions.
        * When ``tools`` are supplied and ``tool_choice`` is unset or ``"auto"``, ``tool_choice``
          is forced to ``"required"``.
        * When both ``stream`` and ``response_format`` are set, the request is sent non-streaming
          and the full response is re-wrapped as a single chunk.

        All other parameters are forwarded unchanged to ``prepare_openai_completion_params``.

        :param model: Identifier looked up in ``self.model_store``; the resolved model's
            ``provider_resource_id`` is what is sent to Groq.
        :param messages: Conversation messages. May be modified in place (see note below).
        :param response_format: Requested response format. May be modified in place (see note below).
        :return: The completion response, or an async generator yielding exactly one chunk when
            the streaming request had to be emulated.
        """
        model_obj = await self.model_store.get_model(model)
        # Groq does not support json_schema response format, so we need to convert it to json_object
        # NOTE(review): this branch mutates the caller's `response_format` and `messages` objects
        # in place rather than working on copies.
        if response_format and response_format.type == "json_schema":
            response_format.type = "json_object"
            schema = response_format.json_schema.get("schema", {})
            response_format.json_schema = None
            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
            if messages and messages[0].role == "system":
                # assumes the system message content is a plain str — TODO confirm; a non-str
                # content value would not support `+` with a str here.
                messages[0].content = messages[0].content + json_instructions
            else:
                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
        # Groq returns a 400 error if tools are provided but none are called
        # So, set tool_choice to "required" to attempt to force a call
        if tools and (not tool_choice or tool_choice == "auto"):
            tool_choice = "required"
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )
        # Groq does not support streaming requests that set response_format
        fake_stream = False
        if stream and response_format:
            params["stream"] = False
            fake_stream = True
        response = await self._get_openai_client().chat.completions.create(**params)
        if fake_stream:
            # Re-wrap the complete (non-streamed) response as one chunk so the caller still
            # receives an async iterator, as it asked for with stream=True.
            chunk_choices = []
            for choice in response.choices:
                delta = OpenAIChoiceDelta(
                    content=choice.message.content,
                    role=choice.message.role,
                    tool_calls=choice.message.tool_calls,
                )
                chunk_choice = OpenAIChunkChoice(
                    delta=delta,
                    finish_reason=choice.finish_reason,
                    index=choice.index,
                    logprobs=None,
                )
                chunk_choices.append(chunk_choice)
            chunk = OpenAIChatCompletionChunk(
                id=response.id,
                choices=chunk_choices,
                object="chat.completion.chunk",
                created=response.created,
                model=response.model,
            )
            # Single-item async generator: yields the one synthesized chunk and then finishes.
            async def _fake_stream_generator():
                yield chunk
            return _fake_stream_generator()
        else:
            return response
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -4,13 +4,26 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES
-class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
    """
    SambaNova Inference Adapter for Llama Stack.
    Note: The inheritance order is important here. OpenAIMixin must come before
    LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
    is used instead of LiteLLMOpenAIMixin.check_model_availability().
    - OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
    - LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
    """
    def __init__(self, config: SambaNovaImplConfig):
        self.config = config
        self.environment_available_models = []
@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
            download_images=True,  # SambaNova requires base64 image encoding
            json_schema_strict=False,  # SambaNova doesn't support strict=True yet
        )
    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
    get_api_key = LiteLLMOpenAIMixin.get_api_key
    def get_base_url(self) -> str:
        """
        Get the base URL for OpenAI mixin.

        The value is returned exactly as stored in ``self.config.url``; no path
        suffix is appended.

        :return: The SambaNova base URL
        """
        return self.config.url
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -37,6 +37,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        "remote::sambanova",
        "remote::tgi",
        "remote::vertexai",
        # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
        # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
        "remote::groq",
        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -63,6 +67,12 @@ def skip_if_doesnt_support_n(client_with_models, model_id):
    if provider.provider_type in (
        "remote::sambanova",
        "remote::ollama",
        # https://console.groq.com/docs/openai#currently-unsupported-openai-features
        # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
        "remote::groq",
        # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
        # current model', 'status': 'INVALID_ARGUMENT'}}]
        "remote::gemini",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
--- a/tests/unit/providers/inference/test_inference_client_caching.py
+++ b/tests/unit/providers/inference/test_inference_client_caching.py
@ -33,8 +33,7 @@ def test_groq_provider_openai_client_caching():
        with request_provider_data_context(
            {"x-llamastack-provider-data": json.dumps({inference_adapter.provider_data_api_key_field: api_key})}
        ):
-            openai_client = inference_adapter._get_openai_client()
+            assert inference_adapter.client.api_key == api_key
            assert openai_client.api_key == api_key
 def test_openai_provider_openai_client_caching():