diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index b2ad060fb..7a2697327 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -13,7 +13,6 @@ from llama_stack.apis.inference import (
     OpenAIEmbeddingUsage,
 )
 from llama_stack.log import get_logger
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from . import NVIDIAConfig
@@ -22,7 +21,9 @@ from .utils import _is_nvidia_hosted
 logger = get_logger(name=__name__, category="inference::nvidia")
 
 
-class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper):
+class NVIDIAInferenceAdapter(OpenAIMixin):
+    config: NVIDIAConfig
+
     """
     NVIDIA Inference Adapter for Llama Stack.
 
@@ -36,27 +37,12 @@ class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper):
     - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """
 
-    def __init__(self, config: NVIDIAConfig) -> None:
-        """Initialize the NVIDIA inference adapter with configuration."""
-        # Initialize ModelRegistryHelper with empty model entries since NVIDIA uses dynamic model discovery
-        ModelRegistryHelper.__init__(self, model_entries=[], allowed_models=config.allowed_models)
-        self.config = config
-
     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
     embedding_model_metadata: dict[str, dict[str, int]] = {
-        "nvidia/llama-3.2-nv-embedqa-1b-v2": {
-            "embedding_dimension": 2048,
-            "context_length": 8192,
-        },
+        "nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192},
         "nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024},
-        "nvidia/nv-embedqa-mistral-7b-v2": {
-            "embedding_dimension": 512,
-            "context_length": 4096,
-        },
-        "snowflake/arctic-embed-l": {
-            "embedding_dimension": 512,
-            "context_length": 1024,
-        },
+        "nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096},
+        "snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024},
     }
 
     async def initialize(self) -> None:
@@ -109,7 +95,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, ModelRegistryHelper):
         response = await self.client.embeddings.create(
             model=await self._get_provider_model_id(model),
             input=input,
-            encoding_format=(encoding_format if encoding_format is not None else NOT_GIVEN),
+            encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN,
             dimensions=dimensions if dimensions is not None else NOT_GIVEN,
             user=user if user is not None else NOT_GIVEN,
             extra_body=extra_body,
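
Note: a minimal usage sketch of the refactored adapter, not part of the diff. It assumes OpenAIMixin is a pydantic-style base class, so that with the custom __init__ removed the adapter is constructed via keyword arguments matching its declared fields (the new `config: NVIDIAConfig` annotation), and it assumes NVIDIAConfig accepts a `url` keyword; both are assumptions, not confirmed by the diff itself.

    import asyncio

    from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
    from llama_stack.providers.remote.inference.nvidia.nvidia import NVIDIAInferenceAdapter

    async def main() -> None:
        # Hypothetical construction: no custom __init__ remains, so the
        # declared `config` field is supplied as a keyword argument.
        config = NVIDIAConfig(url="http://localhost:8000")  # assumed field name
        adapter = NVIDIAInferenceAdapter(config=config)
        await adapter.initialize()

    asyncio.run(main())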