mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-14 17:16:09 +00:00
Some checks failed
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 43s
Unit Tests / unit-tests (3.12) (push) Failing after 45s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 2s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 4s
Integration Tests / discover-tests (push) Successful in 6s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 7s
Pre-commit / pre-commit (push) Successful in 2m8s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / generate-matrix (push) Successful in 5s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 11s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 12s
Test Llama Stack Build / build-single-provider (push) Failing after 7s
Python Package Build Test / build (3.13) (push) Failing after 5s
Python Package Build Test / build (3.12) (push) Failing after 7s
Unit Tests / unit-tests (3.13) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 13s
Test External Providers / test-external-providers (venv) (push) Failing after 7s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 12s
Update ReadTheDocs / update-readthedocs (push) Failing after 6s
Integration Tests / test-matrix (push) Failing after 6s
Test Llama Stack Build / build (push) Failing after 4s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 12s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 16s
# What does this PR do? Some of our inference providers support passthrough authentication via `x-llamastack-provider-data` header values. This fixes the providers that support passthrough auth to not cache their clients to the backend providers (mostly OpenAI client instances) so that the client connecting to Llama Stack has to provide those auth values on each and every request. ## Test Plan I added some unit tests to ensure we're not caching clients across requests for all the fixed providers in this PR. ``` uv run pytest -sv tests/unit/providers/inference/test_inference_client_caching.py ``` I also ran some of our OpenAI compatible API integration tests for each of the changed providers, just to ensure they still work. Note that these providers don't actually pass all these tests (for unrelated reasons due to quirks of the Groq and Together SaaS services), but enough of the tests passed to confirm the clients are still working as intended. ### Together ``` ENABLE_TOGETHER="together" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "together/meta-llama/Llama-3.1-8B-Instruct" ``` ### OpenAI ``` ENABLE_OPENAI="openai" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "openai/gpt-4o-mini" ``` ### Groq ``` ENABLE_GROQ="groq" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "groq/meta-llama/Llama-3.1-8B-Instruct" ``` --------- Signed-off-by: Ben Browning <bbrownin@redhat.com>
160 lines
5.8 KiB
Python
160 lines
5.8 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
from collections.abc import AsyncIterator
|
|
from typing import Any
|
|
|
|
from openai import AsyncOpenAI
|
|
|
|
from llama_stack.apis.inference import (
|
|
OpenAIChatCompletion,
|
|
OpenAIChatCompletionChunk,
|
|
OpenAIChoiceDelta,
|
|
OpenAIChunkChoice,
|
|
OpenAIMessageParam,
|
|
OpenAIResponseFormatParam,
|
|
OpenAISystemMessageParam,
|
|
)
|
|
from llama_stack.providers.remote.inference.groq.config import GroqConfig
|
|
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
|
|
from llama_stack.providers.utils.inference.openai_compat import (
|
|
prepare_openai_completion_params,
|
|
)
|
|
|
|
from .models import MODEL_ENTRIES
|
|
|
|
|
|
class GroqInferenceAdapter(LiteLLMOpenAIMixin):
    """Inference adapter for Groq's OpenAI-compatible endpoint.

    Credentials may arrive either from static config (``config.api_key``) or
    per-request via provider data (``groq_api_key``), which is why the OpenAI
    client is deliberately built fresh on every request instead of cached.
    """

    _config: GroqConfig

    def __init__(self, config: GroqConfig):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="groq_api_key",
        )
        self.config = config

    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()

    def _get_openai_client(self) -> AsyncOpenAI:
        """Return a new client for each call.

        NOT cached: ``get_api_key()`` may resolve a per-request key from
        provider data, so reusing a client would leak one caller's
        credentials into another caller's requests.
        """
        return AsyncOpenAI(
            base_url=f"{self.config.url}/openai/v1",
            api_key=self.get_api_key(),
        )

    async def openai_chat_completion(
        self,
        model: str,
        messages: list[OpenAIMessageParam],
        frequency_penalty: float | None = None,
        function_call: str | dict[str, Any] | None = None,
        functions: list[dict[str, Any]] | None = None,
        logit_bias: dict[str, float] | None = None,
        logprobs: bool | None = None,
        max_completion_tokens: int | None = None,
        max_tokens: int | None = None,
        n: int | None = None,
        parallel_tool_calls: bool | None = None,
        presence_penalty: float | None = None,
        response_format: OpenAIResponseFormatParam | None = None,
        seed: int | None = None,
        stop: str | list[str] | None = None,
        stream: bool | None = None,
        stream_options: dict[str, Any] | None = None,
        temperature: float | None = None,
        tool_choice: str | dict[str, Any] | None = None,
        tools: list[dict[str, Any]] | None = None,
        top_logprobs: int | None = None,
        top_p: float | None = None,
        user: str | None = None,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        """OpenAI-compatible chat completion, with Groq-specific workarounds.

        Workarounds applied before forwarding to Groq:
        - ``json_schema`` response format is downgraded to ``json_object``
          with the schema inlined as a system-message instruction (Groq does
          not support ``json_schema``).
        - ``tool_choice`` is forced to ``"required"`` when tools are given
          (Groq 400s if tools are provided but none are called).
        - Streaming plus ``response_format`` is emulated by a non-streaming
          call wrapped in a single-chunk async generator (Groq does not
          support that combination).
        """
        model_obj = await self.model_store.get_model(model)

        # Groq does not support json_schema response format; convert it to
        # json_object and inject the schema as a system instruction.
        if response_format and response_format.type == "json_schema":
            # Work on copies so the caller's objects are never mutated —
            # they may be reused for retries or sent to other providers.
            response_format = response_format.model_copy(deep=True)
            messages = list(messages)

            response_format.type = "json_object"
            schema = response_format.json_schema.get("schema", {})
            response_format.json_schema = None
            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
            if messages and messages[0].role == "system":
                # Append to a copy of the existing system message rather
                # than mutating the caller's message object in place.
                system_msg = messages[0].model_copy(deep=True)
                system_msg.content = system_msg.content + json_instructions
                messages[0] = system_msg
            else:
                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))

        # Groq returns a 400 error if tools are provided but none are called
        # So, set tool_choice to "required" to attempt to force a call
        if tools and (not tool_choice or tool_choice == "auto"):
            tool_choice = "required"

        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id.replace("groq/", ""),
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        # Groq does not support streaming requests that set response_format;
        # make a non-streaming call and fake the stream afterwards.
        fake_stream = False
        if stream and response_format:
            params["stream"] = False
            fake_stream = True

        response = await self._get_openai_client().chat.completions.create(**params)

        if fake_stream:
            # Repackage the complete (non-streamed) response as a single
            # chat.completion.chunk so the caller still receives a stream.
            chunk_choices = []
            for choice in response.choices:
                delta = OpenAIChoiceDelta(
                    content=choice.message.content,
                    role=choice.message.role,
                    tool_calls=choice.message.tool_calls,
                )
                chunk_choice = OpenAIChunkChoice(
                    delta=delta,
                    finish_reason=choice.finish_reason,
                    index=choice.index,
                    logprobs=None,
                )
                chunk_choices.append(chunk_choice)
            chunk = OpenAIChatCompletionChunk(
                id=response.id,
                choices=chunk_choices,
                object="chat.completion.chunk",
                created=response.created,
                model=response.model,
            )

            async def _fake_stream_generator():
                yield chunk

            return _fake_stream_generator()
        else:
            return response