From d6c3b363904eedd9c5323594603dc4f2d5b581eb Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 15:22:20 -0400 Subject: [PATCH 1/7] chore: update the gemini inference impl to use openai-python for openai-compat functions (#3351) # What does this PR do? update the Gemini inference provider to use openai-python for the openai-compat endpoints partially addresses #3349, does not address /inference/completion or /inference/chat-completion ## Test Plan ci --- llama_stack/providers/registry/inference.py | 2 +- llama_stack/providers/remote/inference/gemini/gemini.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 50956f58c..1bb3c3147 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -207,7 +207,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="gemini", - pip_packages=["litellm"], + pip_packages=["litellm", "openai"], module="llama_stack.providers.remote.inference.gemini", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", diff --git a/llama_stack/providers/remote/inference/gemini/gemini.py b/llama_stack/providers/remote/inference/gemini/gemini.py index b6048eff7..569227fdd 100644 --- a/llama_stack/providers/remote/inference/gemini/gemini.py +++ b/llama_stack/providers/remote/inference/gemini/gemini.py @@ -5,12 +5,13 @@ # the root directory of this source tree. from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from .config import GeminiConfig from .models import MODEL_ENTRIES -class GeminiInferenceAdapter(LiteLLMOpenAIMixin): +class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): def __init__(self, config: GeminiConfig) -> None: LiteLLMOpenAIMixin.__init__( self, @@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin): ) self.config = config + get_api_key = LiteLLMOpenAIMixin.get_api_key + + def get_base_url(self): + return "https://generativelanguage.googleapis.com/v1beta/openai/" + async def initialize(self) -> None: await super().initialize() From 4c28544c04ab96c5c4d188dda3a53dc2eab0415d Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 15:22:44 -0400 Subject: [PATCH 2/7] chore(gemini, tests): add skips for n and completions, gemini api does not support them (#3350) # What does this PR do? 
the gemini api endpoints do not support the n param or completions ## Test Plan ci --- tests/integration/inference/test_openai_completion.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index bb447b3c1..099032578 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -37,6 +37,7 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) "remote::sambanova", "remote::tgi", "remote::vertexai", + "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") @@ -63,6 +64,9 @@ def skip_if_doesnt_support_n(client_with_models, model_id): if provider.provider_type in ( "remote::sambanova", "remote::ollama", + # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the + # current model', 'status': 'INVALID_ARGUMENT'}}] + "remote::gemini", ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.") From bf02cd846fdc39db80291746e06ca547e5afdbdb Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 15:25:13 -0400 Subject: [PATCH 3/7] chore: update the sambanova inference impl to use openai-python for openai-compat functions (#3345) # What does this PR do? update SambaNova inference provider to use OpenAIMixin for openai-compat endpoints ## Test Plan ``` $ SAMBANOVA_API_KEY=... uv run llama stack build --image-type venv --providers inference=remote::sambanova --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model sambanova/Meta-Llama-3.3-70B-Instruct tests/integration/inference -k 'not store' ... FAILED tests/integration/inference/test_text_inference.py::test_text_chat_completion_tool_calling_tools_not_in_request[txt=sambanova/Meta-Llama-3.3-70B-Instruct-inference:chat_completion:tool_calling_tools_absent-True] - AttributeError: 'NoneType' object has no attribute 'delta' FAILED tests/integration/inference/test_text_inference.py::test_text_chat_completion_tool_calling_tools_not_in_request[txt=sambanova/Meta-Llama-3.3-70B-Instruct-inference:chat_completion:tool_calling_tools_absent-False] - llama_stack_client.InternalServerError: Error code: 500 - {'detail': 'Internal server error: An une... =========== 2 failed, 16 passed, 68 skipped, 8 deselected, 3 xfailed, 13 warnings in 15.85s ============ ``` the two failures also exist before this change. they are part of the deprecated inference.chat_completion tests that flow through litellm. they can be resolved later. 
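For reviewers less familiar with the mixin layering, a minimal sketch of why the base-class order matters (stand-in classes only, not the actual Llama Stack mixins): Python resolves `check_model_availability()` to the first base in the MRO that defines it, so listing the OpenAI-style mixin first makes the live `/v1/models` check win over LiteLLM's static-registry check.

```python
# Illustrative only -- stand-in classes, not the real OpenAIMixin / LiteLLMOpenAIMixin.
class OpenAIMixinSketch:
    async def check_model_availability(self, model: str) -> bool:
        # stands in for: query the provider's /v1/models endpoint
        return True


class LiteLLMMixinSketch:
    async def check_model_availability(self, model: str) -> bool:
        # stands in for: consult LiteLLM's static model registry
        return False


class AdapterSketch(OpenAIMixinSketch, LiteLLMMixinSketch):
    pass


# The first listed base wins the method resolution:
print([c.__name__ for c in AdapterSketch.__mro__])
# ['AdapterSketch', 'OpenAIMixinSketch', 'LiteLLMMixinSketch', 'object']
```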
--- llama_stack/providers/registry/inference.py | 2 +- .../remote/inference/sambanova/sambanova.py | 26 ++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 1bb3c3147..7a95fd089 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -270,7 +270,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="sambanova", - pip_packages=["litellm"], + pip_packages=["litellm", "openai"], module="llama_stack.providers.remote.inference.sambanova", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index 96469acac..ee3b0f648 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -4,13 +4,26 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. + from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from .config import SambaNovaImplConfig from .models import MODEL_ENTRIES -class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): +class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): + """ + SambaNova Inference Adapter for Llama Stack. + + Note: The inheritance order is important here. OpenAIMixin must come before + LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability() + is used instead of LiteLLMOpenAIMixin.check_model_availability(). + + - OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists + - LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM + """ + def __init__(self, config: SambaNovaImplConfig): self.config = config self.environment_available_models = [] @@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): download_images=True, # SambaNova requires base64 image encoding json_schema_strict=False, # SambaNova doesn't support strict=True yet ) + + # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin + get_api_key = LiteLLMOpenAIMixin.get_api_key + + def get_base_url(self) -> str: + """ + Get the base URL for OpenAI mixin. + + :return: The SambaNova base URL + """ + return self.config.url From 9252d9fc018487ef7bd6ac400ed329cd5db1e8c4 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 15:35:30 -0400 Subject: [PATCH 4/7] chore(groq test): skip with_n tests for groq, it is not supported server-side (#3346) # What does this PR do? 
skip the with_n test for groq, because it isn't supported by the provider's service see https://console.groq.com/docs/openai#currently-unsupported-openai-features Co-authored-by: raghotham --- tests/integration/inference/test_openai_completion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 099032578..2043f6aeb 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -64,6 +64,9 @@ def skip_if_doesnt_support_n(client_with_models, model_id): if provider.provider_type in ( "remote::sambanova", "remote::ollama", + # https://console.groq.com/docs/openai#currently-unsupported-openai-features + # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}} + "remote::groq", # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the # current model', 'status': 'INVALID_ARGUMENT'}}] "remote::gemini", From ecd9d8dc1a70ea70ea06253869392afac3abdb40 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Sat, 6 Sep 2025 15:40:33 -0400 Subject: [PATCH 5/7] test: introduce api conformance test (#3257) # What does this PR do? this test runs on each PR and uses a new conformance workflow to compare the base (main) branch openapi spec to the one on this PR if one of our "stable" APIs change, the test will fail. this workflow uses `oasdiff` to identify breaking changes for paths we want to ensure comptability for. specifically this is using `oasdiff breaking` with `--match-path` which only checks breaking changes for the specified paths. As a follow up to this, we can add an optional way to make it so that it is OK to make these change if properly documented or in a changelog or something. or by using a label on the PR to override the failing test. related to #3237 ## Test Plan conformance test should pass given there are no changes Signed-off-by: Charlie Doern --- .github/workflows/README.md | 1 + .github/workflows/conformance.yml | 57 +++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 .github/workflows/conformance.yml diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 2e0df58b8..059bb873f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -5,6 +5,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl | Name | File | Purpose | | ---- | ---- | ------- | | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md | +| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. 
| | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script | | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication | | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore | diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml new file mode 100644 index 000000000..2433b0203 --- /dev/null +++ b/.github/workflows/conformance.yml @@ -0,0 +1,57 @@ +# API Conformance Tests +# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations +# It runs schema validation and OpenAPI diff checks to catch breaking changes early + +name: API Conformance Tests + +run-name: Run the API Conformance test suite on the changes. + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + types: [opened, synchronize, reopened] + paths: + - 'llama_stack/**' + - '!llama_stack/ui/**' + - 'tests/**' + - 'uv.lock' + - 'pyproject.toml' + - '.github/workflows/conformance.yml' # This workflow itself + +concurrency: + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} + # Cancel in-progress runs when new commits are pushed to avoid wasting CI resources + cancel-in-progress: true + +jobs: + # Job to check if API schema changes maintain backward compatibility + check-schema-compatibility: + runs-on: ubuntu-latest + steps: + # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act` + # This ensures consistent behavior between local testing and CI + - name: Checkout PR Code + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + + # Checkout the base branch to compare against (usually main) + # This allows us to diff the current changes against the previous state + - name: Checkout Base Branch + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + ref: ${{ github.event.pull_request.base.ref }} + path: 'base' + + # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs. + - name: Install oasdiff + run: | + curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh + + # Run oasdiff to detect breaking changes in the API specification + # This step will fail if incompatible changes are detected, preventing breaking changes from being merged + - name: Run OpenAPI Breaking Change Diff + run: | + oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \ + --match-path '^/v1/vector-io' \ + --match-path '^/v1/vector-dbs' From d23607483fc8ca63ea15db5a95f672aec249d2e1 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 18:36:27 -0400 Subject: [PATCH 6/7] chore: update the groq inference impl to use openai-python for openai-compat functions (#3348) # What does this PR do? update Groq inference provider to use OpenAIMixin for openai-compat endpoints changes on api.groq.com - - json_schema is now supported for specific models, see https://console.groq.com/docs/structured-outputs#supported-models - response_format with streaming is now supported for models that support response_format - groq no longer returns a 400 error if tools are provided and tool_choice is not "required" ## Test Plan ``` $ GROQ_API_KEY=... 
uv run llama stack build --image-type venv --providers inference=remote::groq --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model groq/llama-3.3-70b-versatile tests/integration/inference/test_openai_completion.py -k 'not store' ... SKIPPED [3] tests/integration/inference/test_openai_completion.py:44: Model groq/llama-3.3-70b-versatile hosted by remote::groq doesn't support OpenAI completions. SKIPPED [3] tests/integration/inference/test_openai_completion.py:94: Model groq/llama-3.3-70b-versatile hosted by remote::groq doesn't support vllm extra_body parameters. SKIPPED [4] tests/integration/inference/test_openai_completion.py:73: Model groq/llama-3.3-70b-versatile hosted by remote::groq doesn't support n param. SKIPPED [1] tests/integration/inference/test_openai_completion.py:100: Model groq/llama-3.3-70b-versatile hosted by remote::groq doesn't support chat completion calls with base64 encoded files. ======================= 8 passed, 11 skipped, 8 deselected, 2 warnings in 5.13s ======================== ``` --------- Co-authored-by: raghotham --- llama_stack/providers/registry/inference.py | 2 +- .../providers/remote/inference/groq/groq.py | 139 +----------------- .../test_inference_client_caching.py | 3 +- 3 files changed, 10 insertions(+), 134 deletions(-) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 7a95fd089..4176f85a6 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -248,7 +248,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="groq", - pip_packages=["litellm"], + pip_packages=["litellm", "openai"], module="llama_stack.providers.remote.inference.groq", config_class="llama_stack.providers.remote.inference.groq.GroqConfig", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index fd7212de4..888953af0 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -4,30 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from collections.abc import AsyncIterator -from typing import Any -from openai import AsyncOpenAI - -from llama_stack.apis.inference import ( - OpenAIChatCompletion, - OpenAIChatCompletionChunk, - OpenAIChoiceDelta, - OpenAIChunkChoice, - OpenAIMessageParam, - OpenAIResponseFormatParam, - OpenAISystemMessageParam, -) from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin -from llama_stack.providers.utils.inference.openai_compat import ( - prepare_openai_completion_params, -) +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from .models import MODEL_ENTRIES -class GroqInferenceAdapter(LiteLLMOpenAIMixin): +class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): _config: GroqConfig def __init__(self, config: GroqConfig): @@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin): ) self.config = config + # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin + get_api_key = LiteLLMOpenAIMixin.get_api_key + + def get_base_url(self) -> str: + return f"{self.config.url}/openai/v1" + async def initialize(self): await super().initialize() async def shutdown(self): await super().shutdown() - - def _get_openai_client(self) -> AsyncOpenAI: - return AsyncOpenAI( - base_url=f"{self.config.url}/openai/v1", - api_key=self.get_api_key(), - ) - - async def openai_chat_completion( - self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, - ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: - model_obj = await self.model_store.get_model(model) - - # Groq does not support json_schema response format, so we need to convert it to json_object - if response_format and response_format.type == "json_schema": - response_format.type = "json_object" - schema = response_format.json_schema.get("schema", {}) - response_format.json_schema = None - json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}" - if messages and messages[0].role == "system": - messages[0].content = messages[0].content + json_instructions - else: - messages.insert(0, OpenAISystemMessageParam(content=json_instructions)) - - # Groq returns a 400 error if tools are provided but none are called - # So, set tool_choice to "required" to attempt to force a call - if tools and (not tool_choice or tool_choice == "auto"): - tool_choice = "required" - - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - 
max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) - - # Groq does not support streaming requests that set response_format - fake_stream = False - if stream and response_format: - params["stream"] = False - fake_stream = True - - response = await self._get_openai_client().chat.completions.create(**params) - - if fake_stream: - chunk_choices = [] - for choice in response.choices: - delta = OpenAIChoiceDelta( - content=choice.message.content, - role=choice.message.role, - tool_calls=choice.message.tool_calls, - ) - chunk_choice = OpenAIChunkChoice( - delta=delta, - finish_reason=choice.finish_reason, - index=choice.index, - logprobs=None, - ) - chunk_choices.append(chunk_choice) - chunk = OpenAIChatCompletionChunk( - id=response.id, - choices=chunk_choices, - object="chat.completion.chunk", - created=response.created, - model=response.model, - ) - - async def _fake_stream_generator(): - yield chunk - - return _fake_stream_generator() - else: - return response diff --git a/tests/unit/providers/inference/test_inference_client_caching.py b/tests/unit/providers/inference/test_inference_client_caching.py index b371cf907..f4b3201e9 100644 --- a/tests/unit/providers/inference/test_inference_client_caching.py +++ b/tests/unit/providers/inference/test_inference_client_caching.py @@ -33,8 +33,7 @@ def test_groq_provider_openai_client_caching(): with request_provider_data_context( {"x-llamastack-provider-data": json.dumps({inference_adapter.provider_data_api_key_field: api_key})} ): - openai_client = inference_adapter._get_openai_client() - assert openai_client.api_key == api_key + assert inference_adapter.client.api_key == api_key def test_openai_provider_openai_client_caching(): From 78cab5331a78d27b084a04c0d8e302a51d5d2c4d Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 6 Sep 2025 19:21:55 -0400 Subject: [PATCH 7/7] chore(groq test): skip completions tests for groq, api is not supported server-side (#3347) # What does this PR do? skip /v1/completions tests on groq, endpoint is not supported Co-authored-by: raghotham --- tests/integration/inference/test_openai_completion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 2043f6aeb..aee75b21d 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -37,6 +37,9 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) "remote::sambanova", "remote::tgi", "remote::vertexai", + # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos, + # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}} + "remote::groq", "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")