mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-25 21:57:45 +00:00
Some checks failed
Coverage Badge / unit-tests (push) Failing after 3s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 6s
Python Package Build Test / build (3.12) (push) Failing after 3s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 6s
Integration Tests / discover-tests (push) Successful in 7s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 6s
Python Package Build Test / build (3.13) (push) Failing after 2s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 6s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 5s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 11s
Test External Providers / test-external-providers (venv) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 12s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 9s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 17s
Unit Tests / unit-tests (3.13) (push) Failing after 12s
Update ReadTheDocs / update-readthedocs (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 16s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 18s
Integration Tests / test-matrix (push) Failing after 18s
Pre-commit / pre-commit (push) Successful in 1m14s
# What does this PR do? add an `OpenAIMixin` for use by inference providers who remote endpoints support an OpenAI compatible API. use is demonstrated by refactoring - OpenAIInferenceAdapter - NVIDIAInferenceAdapter (adds embedding support) - LlamaCompatInferenceAdapter ## Test Plan existing unit and integration tests
257 lines
9.3 KiB
Python
257 lines
9.3 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
import logging
|
|
import warnings
|
|
from collections.abc import AsyncIterator
|
|
|
|
from openai import APIConnectionError, BadRequestError
|
|
|
|
from llama_stack.apis.common.content_types import (
|
|
InterleavedContent,
|
|
InterleavedContentItem,
|
|
TextContentItem,
|
|
)
|
|
from llama_stack.apis.inference import (
|
|
ChatCompletionRequest,
|
|
ChatCompletionResponse,
|
|
ChatCompletionResponseStreamChunk,
|
|
CompletionRequest,
|
|
CompletionResponse,
|
|
CompletionResponseStreamChunk,
|
|
EmbeddingsResponse,
|
|
EmbeddingTaskType,
|
|
Inference,
|
|
LogProbConfig,
|
|
Message,
|
|
ResponseFormat,
|
|
SamplingParams,
|
|
TextTruncation,
|
|
ToolChoice,
|
|
ToolConfig,
|
|
)
|
|
from llama_stack.models.llama.datatypes import ToolDefinition, ToolPromptFormat
|
|
from llama_stack.providers.utils.inference.model_registry import (
|
|
ModelRegistryHelper,
|
|
)
|
|
from llama_stack.providers.utils.inference.openai_compat import (
|
|
convert_openai_chat_completion_choice,
|
|
convert_openai_chat_completion_stream,
|
|
)
|
|
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
|
|
from llama_stack.providers.utils.inference.prompt_adapter import content_has_media
|
|
|
|
from . import NVIDIAConfig
|
|
from .models import MODEL_ENTRIES
|
|
from .openai_utils import (
|
|
convert_chat_completion_request,
|
|
convert_completion_request,
|
|
convert_openai_completion_choice,
|
|
convert_openai_completion_stream,
|
|
)
|
|
from .utils import _is_nvidia_hosted
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper):
|
|
"""
|
|
NVIDIA Inference Adapter for Llama Stack.
|
|
|
|
Note: The inheritance order is important here. OpenAIMixin must come before
|
|
ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
|
|
is used instead of ModelRegistryHelper.check_model_availability(). It also
|
|
must come before Inference to ensure that OpenAIMixin methods are available
|
|
in the Inference interface.
|
|
|
|
- OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
|
|
- ModelRegistryHelper.check_model_availability() just returns False and shows a warning
|
|
"""
|
|
|
|
def __init__(self, config: NVIDIAConfig) -> None:
|
|
# TODO(mf): filter by available models
|
|
ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
|
|
|
|
logger.info(f"Initializing NVIDIAInferenceAdapter({config.url})...")
|
|
|
|
if _is_nvidia_hosted(config):
|
|
if not config.api_key:
|
|
raise RuntimeError(
|
|
"API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
|
|
)
|
|
# elif self._config.api_key:
|
|
#
|
|
# we don't raise this warning because a user may have deployed their
|
|
# self-hosted NIM with an API key requirement.
|
|
#
|
|
# warnings.warn(
|
|
# "API key is not required for self-hosted NVIDIA NIM. "
|
|
# "Consider removing the api_key from the configuration."
|
|
# )
|
|
|
|
self._config = config
|
|
|
|
def get_api_key(self) -> str:
|
|
"""
|
|
Get the API key for OpenAI mixin.
|
|
|
|
:return: The NVIDIA API key
|
|
"""
|
|
return self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"
|
|
|
|
def get_base_url(self) -> str:
|
|
"""
|
|
Get the base URL for OpenAI mixin.
|
|
|
|
:return: The NVIDIA API base URL
|
|
"""
|
|
return f"{self._config.url}/v1" if self._config.append_api_version else self._config.url
|
|
|
|
async def completion(
|
|
self,
|
|
model_id: str,
|
|
content: InterleavedContent,
|
|
sampling_params: SamplingParams | None = None,
|
|
response_format: ResponseFormat | None = None,
|
|
stream: bool | None = False,
|
|
logprobs: LogProbConfig | None = None,
|
|
) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
|
|
if sampling_params is None:
|
|
sampling_params = SamplingParams()
|
|
if content_has_media(content):
|
|
raise NotImplementedError("Media is not supported")
|
|
|
|
# ToDo: check health of NeMo endpoints and enable this
|
|
# removing this health check as NeMo customizer endpoint health check is returning 404
|
|
# await check_health(self._config) # this raises errors
|
|
|
|
provider_model_id = await self._get_provider_model_id(model_id)
|
|
request = convert_completion_request(
|
|
request=CompletionRequest(
|
|
model=provider_model_id,
|
|
content=content,
|
|
sampling_params=sampling_params,
|
|
response_format=response_format,
|
|
stream=stream,
|
|
logprobs=logprobs,
|
|
),
|
|
n=1,
|
|
)
|
|
|
|
try:
|
|
response = await self.client.completions.create(**request)
|
|
except APIConnectionError as e:
|
|
raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
|
|
|
|
if stream:
|
|
return convert_openai_completion_stream(response)
|
|
else:
|
|
# we pass n=1 to get only one completion
|
|
return convert_openai_completion_choice(response.choices[0])
|
|
|
|
async def embeddings(
|
|
self,
|
|
model_id: str,
|
|
contents: list[str] | list[InterleavedContentItem],
|
|
text_truncation: TextTruncation | None = TextTruncation.none,
|
|
output_dimension: int | None = None,
|
|
task_type: EmbeddingTaskType | None = None,
|
|
) -> EmbeddingsResponse:
|
|
if any(content_has_media(content) for content in contents):
|
|
raise NotImplementedError("Media is not supported")
|
|
|
|
#
|
|
# Llama Stack: contents = list[str] | list[InterleavedContentItem]
|
|
# ->
|
|
# OpenAI: input = str | list[str]
|
|
#
|
|
# we can ignore str and always pass list[str] to OpenAI
|
|
#
|
|
flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
|
|
input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
|
|
provider_model_id = await self._get_provider_model_id(model_id)
|
|
|
|
extra_body = {}
|
|
|
|
if text_truncation is not None:
|
|
text_truncation_options = {
|
|
TextTruncation.none: "NONE",
|
|
TextTruncation.end: "END",
|
|
TextTruncation.start: "START",
|
|
}
|
|
extra_body["truncate"] = text_truncation_options[text_truncation]
|
|
|
|
if output_dimension is not None:
|
|
extra_body["dimensions"] = output_dimension
|
|
|
|
if task_type is not None:
|
|
task_type_options = {
|
|
EmbeddingTaskType.document: "passage",
|
|
EmbeddingTaskType.query: "query",
|
|
}
|
|
extra_body["input_type"] = task_type_options[task_type]
|
|
|
|
try:
|
|
response = await self.client.embeddings.create(
|
|
model=provider_model_id,
|
|
input=input,
|
|
extra_body=extra_body,
|
|
)
|
|
except BadRequestError as e:
|
|
raise ValueError(f"Failed to get embeddings: {e}") from e
|
|
|
|
#
|
|
# OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
|
|
# ->
|
|
# Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
|
|
#
|
|
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
|
|
|
|
async def chat_completion(
|
|
self,
|
|
model_id: str,
|
|
messages: list[Message],
|
|
sampling_params: SamplingParams | None = None,
|
|
response_format: ResponseFormat | None = None,
|
|
tools: list[ToolDefinition] | None = None,
|
|
tool_choice: ToolChoice | None = ToolChoice.auto,
|
|
tool_prompt_format: ToolPromptFormat | None = None,
|
|
stream: bool | None = False,
|
|
logprobs: LogProbConfig | None = None,
|
|
tool_config: ToolConfig | None = None,
|
|
) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
|
|
if sampling_params is None:
|
|
sampling_params = SamplingParams()
|
|
if tool_prompt_format:
|
|
warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring", stacklevel=2)
|
|
|
|
# await check_health(self._config) # this raises errors
|
|
|
|
provider_model_id = await self._get_provider_model_id(model_id)
|
|
request = await convert_chat_completion_request(
|
|
request=ChatCompletionRequest(
|
|
model=provider_model_id,
|
|
messages=messages,
|
|
sampling_params=sampling_params,
|
|
response_format=response_format,
|
|
tools=tools,
|
|
stream=stream,
|
|
logprobs=logprobs,
|
|
tool_config=tool_config,
|
|
),
|
|
n=1,
|
|
)
|
|
|
|
try:
|
|
response = await self.client.chat.completions.create(**request)
|
|
except APIConnectionError as e:
|
|
raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
|
|
|
|
if stream:
|
|
return convert_openai_chat_completion_stream(response, enable_incremental_tool_calls=False)
|
|
else:
|
|
# we pass n=1 to get only one completion
|
|
return convert_openai_chat_completion_choice(response.choices[0])
|