mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
Merge remote-tracking branch 'origin/main' into resp_branching
This commit is contained in:
commit
1536ae0333
144 changed files with 62682 additions and 51560 deletions
|
@ -4,14 +4,11 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
|
||||
import litellm
|
||||
|
||||
from llama_stack.apis.common.content_types import (
|
||||
InterleavedContent,
|
||||
)
|
||||
from llama_stack.apis.inference import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
|
@ -62,7 +59,7 @@ class LiteLLMOpenAIMixin(
|
|||
self,
|
||||
litellm_provider_name: str,
|
||||
api_key_from_config: str | None,
|
||||
provider_data_api_key_field: str,
|
||||
provider_data_api_key_field: str | None = None,
|
||||
model_entries: list[ProviderModelEntry] | None = None,
|
||||
openai_compat_api_base: str | None = None,
|
||||
download_images: bool = False,
|
||||
|
@ -73,7 +70,7 @@ class LiteLLMOpenAIMixin(
|
|||
|
||||
:param model_entries: The model entries to register.
|
||||
:param api_key_from_config: The API key to use from the config.
|
||||
:param provider_data_api_key_field: The field in the provider data that contains the API key.
|
||||
:param provider_data_api_key_field: The field in the provider data that contains the API key (optional).
|
||||
:param litellm_provider_name: The name of the provider, used for model lookups.
|
||||
:param openai_compat_api_base: The base URL for OpenAI compatibility, or None if not using OpenAI compatibility.
|
||||
:param download_images: Whether to download images and convert to base64 for message conversion.
|
||||
|
@ -108,17 +105,6 @@ class LiteLLMOpenAIMixin(
|
|||
else model_id
|
||||
)
|
||||
|
||||
async def completion(
    self,
    model_id: str,
    content: InterleavedContent,
    sampling_params: SamplingParams | None = None,
    response_format: ResponseFormat | None = None,
    stream: bool | None = False,
    logprobs: LogProbConfig | None = None,
) -> AsyncGenerator:
    """Reject text-completion requests.

    None of the arguments are inspected; awaiting this coroutine always fails.

    :raises NotImplementedError: unconditionally.
    """
    unsupported_message = "LiteLLM does not support completion requests"
    raise NotImplementedError(unsupported_message)
|
||||
|
||||
async def chat_completion(
|
||||
self,
|
||||
model_id: str,
|
||||
|
|
|
@ -63,7 +63,7 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
|
|||
model_entries: list[ProviderModelEntry] | None = None,
|
||||
allowed_models: list[str] | None = None,
|
||||
):
|
||||
self.allowed_models = allowed_models
|
||||
self.allowed_models = allowed_models if allowed_models else []
|
||||
|
||||
self.alias_to_provider_id_map = {}
|
||||
self.provider_id_to_llama_model_map = {}
|
||||
|
|
|
@ -103,8 +103,6 @@ from llama_stack.apis.inference import (
|
|||
JsonSchemaResponseFormat,
|
||||
Message,
|
||||
OpenAIChatCompletion,
|
||||
OpenAICompletion,
|
||||
OpenAICompletionChoice,
|
||||
OpenAIEmbeddingData,
|
||||
OpenAIMessageParam,
|
||||
OpenAIResponseFormatParam,
|
||||
|
@ -1229,76 +1227,6 @@ async def prepare_openai_completion_params(**params):
|
|||
return completion_params
|
||||
|
||||
|
||||
class OpenAICompletionToLlamaStackMixin:
    """Mixin that emulates an OpenAI-style text completion on top of ``self.completion``.

    The host class must provide ``completion(model_id=..., content=..., sampling_params=...)``
    whose result exposes ``content`` and ``stop_reason`` attributes.
    """

    async def openai_completion(
        self,
        model: str,
        prompt: str | list[str] | list[int] | list[list[int]],
        best_of: int | None = None,
        echo: bool | None = None,
        frequency_penalty: float | None = None,
        logit_bias: dict[str, float] | None = None,
        logprobs: bool | None = None,
        max_tokens: int | None = None,
        n: int | None = None,
        presence_penalty: float | None = None,
        seed: int | None = None,
        stop: str | list[str] | None = None,
        stream: bool | None = None,
        stream_options: dict[str, Any] | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
        suffix: str | None = None,
    ) -> OpenAICompletion:
        """Build an ``OpenAICompletion`` by calling ``self.completion`` once per prompt, ``n`` times.

        Only ``model``, ``prompt``, ``n``, ``stream``, ``max_tokens``, ``temperature`` and
        ``top_p`` influence the result; the remaining parameters are accepted but ignored.

        :param model: Model identifier, forwarded as ``model_id`` and echoed in the response.
        :param prompt: A single prompt or a list of prompts; a non-list value is treated as
            a one-element batch.
        :param n: Number of completions to generate per prompt; ``None`` or ``0`` means 1.
        :param stream: Must be falsy; streaming is not supported by this emulation.
        :returns: An ``OpenAICompletion`` with one choice per (repetition, prompt) pair,
            indexed in generation order.
        :raises ValueError: If ``stream`` is truthy.
        """
        # Imported locally so this block does not rely on a module-level import.
        import inspect

        if stream:
            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")

        # This is a pretty hacky way to emulate completions -
        # basically just de-batches them...
        prompts = prompt if isinstance(prompt, list) else [prompt]

        sampling_params = _convert_openai_sampling_params(
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

        choices = []
        # "n" is the number of completions to generate per prompt
        n = n or 1
        for _ in range(n):
            # and we may have multiple prompts, if batching was used
            for single_prompt in prompts:
                result = self.completion(
                    model_id=model,
                    content=single_prompt,
                    sampling_params=sampling_params,
                )
                # An ``async def completion`` returns a coroutine that must be awaited
                # before ``content`` / ``stop_reason`` can be read. A plain (non-awaitable)
                # result is used as-is so synchronous implementations keep working.
                if inspect.isawaitable(result):
                    result = await result

                choices.append(
                    OpenAICompletionChoice(
                        index=len(choices),
                        text=result.content,
                        finish_reason=_convert_stop_reason_to_openai_finish_reason(result.stop_reason),
                    )
                )

        return OpenAICompletion(
            id=f"cmpl-{uuid.uuid4()}",
            choices=choices,
            created=int(time.time()),
            model=model,
            object="text_completion",
        )
|
||||
|
||||
|
||||
class OpenAIChatCompletionToLlamaStackMixin:
|
||||
async def openai_chat_completion(
|
||||
self,
|
||||
|
|
|
@ -24,6 +24,7 @@ from llama_stack.apis.inference import (
|
|||
OpenAIResponseFormatParam,
|
||||
)
|
||||
from llama_stack.apis.models import ModelType
|
||||
from llama_stack.core.request_headers import NeedsRequestProviderData
|
||||
from llama_stack.log import get_logger
|
||||
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
|
||||
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
|
||||
|
@ -32,7 +33,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import localize_image_
|
|||
logger = get_logger(name=__name__, category="providers::utils")
|
||||
|
||||
|
||||
class OpenAIMixin(ModelRegistryHelper, ABC):
|
||||
class OpenAIMixin(ModelRegistryHelper, NeedsRequestProviderData, ABC):
|
||||
"""
|
||||
Mixin class that provides OpenAI-specific functionality for inference providers.
|
||||
This class handles direct OpenAI API calls using the AsyncOpenAI client.
|
||||
|
@ -69,6 +70,9 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
|
|||
# List of allowed models for this provider, if empty all models allowed
|
||||
allowed_models: list[str] = []
|
||||
|
||||
# Optional field name in provider data to look for API key, which takes precedence
|
||||
provider_data_api_key_field: str | None = None
|
||||
|
||||
@abstractmethod
|
||||
def get_api_key(self) -> str:
|
||||
"""
|
||||
|
@ -111,9 +115,28 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
|
|||
|
||||
Uses the abstract methods get_api_key() and get_base_url() which must be
|
||||
implemented by child classes.
|
||||
|
||||
Users can also provide the API key via the provider data header, which
|
||||
is used instead of any config API key.
|
||||
"""
|
||||
|
||||
api_key = self.get_api_key()
|
||||
|
||||
if self.provider_data_api_key_field:
|
||||
provider_data = self.get_request_provider_data()
|
||||
if provider_data and getattr(provider_data, self.provider_data_api_key_field, None):
|
||||
api_key = getattr(provider_data, self.provider_data_api_key_field)
|
||||
|
||||
if not api_key: # TODO: let get_api_key return None
|
||||
raise ValueError(
|
||||
"API key is not set. Please provide a valid API key in the "
|
||||
"provider data header, e.g. x-llamastack-provider-data: "
|
||||
f'{{"{self.provider_data_api_key_field}": "<API_KEY>"}}, '
|
||||
"or in the provider config."
|
||||
)
|
||||
|
||||
return AsyncOpenAI(
|
||||
api_key=self.get_api_key(),
|
||||
api_key=api_key,
|
||||
base_url=self.get_base_url(),
|
||||
**self.get_extra_client_params(),
|
||||
)
|
||||
|
|
|
@ -229,28 +229,6 @@ async def convert_image_content_to_url(
|
|||
return base64.b64encode(content).decode("utf-8")
|
||||
|
||||
|
||||
async def completion_request_to_prompt(request: CompletionRequest) -> str:
    """Render a completion request as a prompt string.

    The request's content is first passed through
    ``augment_content_with_response_format_prompt`` and written back onto the
    request (the caller's object is mutated), then the request is converted with
    ``convert_request_to_raw``. The resulting content is encoded with ``ChatFormat``
    and the tokens are decoded back into text.

    :param request: The completion request; its ``content`` attribute is overwritten.
    :returns: The decoded prompt text.
    """
    request.content = augment_content_with_response_format_prompt(request.response_format, request.content)
    raw_request = await convert_request_to_raw(request)

    chat_format = ChatFormat(tokenizer=Tokenizer.get_instance())
    encoded = chat_format.encode_content(raw_request.content)
    return chat_format.tokenizer.decode(encoded.tokens)
|
||||
|
||||
|
||||
async def completion_request_to_prompt_model_input_info(
    request: CompletionRequest,
) -> tuple[str, int]:
    """Render a completion request as a prompt string and report its token count.

    The request's content is first passed through
    ``augment_content_with_response_format_prompt`` and written back onto the
    request (the caller's object is mutated), then the request is converted with
    ``convert_request_to_raw`` and its content is encoded with ``ChatFormat``.

    :param request: The completion request; its ``content`` attribute is overwritten.
    :returns: A ``(prompt_text, token_count)`` pair, where ``prompt_text`` is the
        decoded token sequence and ``token_count`` is its length.
    """
    request.content = augment_content_with_response_format_prompt(request.response_format, request.content)
    raw_request = await convert_request_to_raw(request)

    chat_format = ChatFormat(tokenizer=Tokenizer.get_instance())
    encoded = chat_format.encode_content(raw_request.content)

    prompt_text = chat_format.tokenizer.decode(encoded.tokens)
    token_count = len(encoded.tokens)
    return (prompt_text, token_count)
|
||||
|
||||
|
||||
def augment_content_with_response_format_prompt(response_format, content):
|
||||
if fmt_prompt := response_format_prompt(response_format):
|
||||
if isinstance(content, list):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue