mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 14:39:47 +00:00
Fix OpenAI embeddings issue for asymmetric embedding NIMs
This commit is contained in:
parent
eb07a0f86a
commit
85cae08e79
2 changed files with 59 additions and 1 deletions
|
|
@ -77,6 +77,10 @@ print(f"Response: {response.completion_message.content}")
|
||||||
```
|
```
|
||||||
|
|
||||||
### Create Embeddings
|
### Create Embeddings
|
||||||
|
> Note on OpenAI embeddings compatibility
|
||||||
|
>
|
||||||
|
> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
response = client.inference.embeddings(
|
response = client.inference.embeddings(
|
||||||
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
|
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ import logging
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncIterator
|
from collections.abc import AsyncIterator
|
||||||
|
|
||||||
from openai import APIConnectionError, BadRequestError
|
from openai import NOT_GIVEN, APIConnectionError, BadRequestError
|
||||||
|
|
||||||
from llama_stack.apis.common.content_types import (
|
from llama_stack.apis.common.content_types import (
|
||||||
InterleavedContent,
|
InterleavedContent,
|
||||||
|
|
@ -27,6 +27,9 @@ from llama_stack.apis.inference import (
|
||||||
Inference,
|
Inference,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
OpenAIEmbeddingData,
|
||||||
|
OpenAIEmbeddingsResponse,
|
||||||
|
OpenAIEmbeddingUsage,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
SamplingParams,
|
SamplingParams,
|
||||||
TextTruncation,
|
TextTruncation,
|
||||||
|
|
@ -210,6 +213,57 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper):
|
||||||
#
|
#
|
||||||
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
|
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
|
||||||
|
|
||||||
|
async def openai_embeddings(
    self,
    model: str,
    input: str | list[str],
    encoding_format: str | None = "float",
    dimensions: int | None = None,
    user: str | None = None,
) -> OpenAIEmbeddingsResponse:
    """
    Create embeddings through the OpenAI-compatible endpoint of NVIDIA NIM.

    The standard OpenAI embeddings API has no "input_type" field, yet NVIDIA
    NIM asymmetric embedding models require one. Every request issued here
    therefore carries ``input_type="query"`` in ``extra_body`` and a warning
    is logged to make that default visible. For passage embeddings, use the
    embeddings API with ``task_type='document'`` instead.

    :param model: Model identifier; resolved with ``_get_provider_model_id``
        before the request is sent.
    :param input: Text (or list of texts) to embed.
    :param encoding_format: Forwarded as-is, or as ``NOT_GIVEN`` when ``None``.
    :param dimensions: Forwarded as-is, or as ``NOT_GIVEN`` when ``None``.
    :param user: Forwarded as-is, or as ``NOT_GIVEN`` when ``None``.
    :returns: An ``OpenAIEmbeddingsResponse`` built from the provider
        response; each entry's ``index`` is its position in the returned data.
    """
    logger.warning(
        "NVIDIA OpenAI-compatible embeddings: defaulting to input_type='query'. "
        "For passage embeddings, use the embeddings API with task_type='document'."
    )

    # Asymmetric NIM models reject requests that lack an input_type.
    nim_extra_body: dict[str, object] = {"input_type": "query"}

    provider_model_id = await self._get_provider_model_id(model)
    response = await self.client.embeddings.create(
        model=provider_model_id,
        input=input,
        # Optional arguments left as None are omitted from the request.
        encoding_format=NOT_GIVEN if encoding_format is None else encoding_format,
        dimensions=NOT_GIVEN if dimensions is None else dimensions,
        user=NOT_GIVEN if user is None else user,
        extra_body=nim_extra_body,
    )

    embeddings = [
        OpenAIEmbeddingData(embedding=item.embedding, index=position)
        for position, item in enumerate(response.data)
    ]
    token_usage = OpenAIEmbeddingUsage(
        prompt_tokens=response.usage.prompt_tokens,
        total_tokens=response.usage.total_tokens,
    )

    return OpenAIEmbeddingsResponse(data=embeddings, model=response.model, usage=token_usage)
|
||||||
|
|
||||||
async def chat_completion(
|
async def chat_completion(
|
||||||
self,
|
self,
|
||||||
model_id: str,
|
model_id: str,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue