mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-21 01:15:10 +00:00
fix: fix ``openai_embeddings
`` for asymmetric embedding NIMs (#3205)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / generate-matrix (push) Successful in 5s
Python Package Build Test / build (3.13) (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 9s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 12s
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 11s
Unit Tests / unit-tests (3.12) (push) Failing after 13s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 16s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 20s
Vector IO Integration Tests / test-matrix (push) Failing after 19s
Test External API and Providers / test-external (venv) (push) Failing after 18s
Python Package Build Test / build (3.12) (push) Failing after 49s
Test Llama Stack Build / build (push) Failing after 54s
UI Tests / ui-tests (22) (push) Failing after 1m26s
Pre-commit / pre-commit (push) Successful in 2m24s
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / generate-matrix (push) Successful in 5s
Python Package Build Test / build (3.13) (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 9s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 12s
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 11s
Unit Tests / unit-tests (3.12) (push) Failing after 13s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 16s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 20s
Vector IO Integration Tests / test-matrix (push) Failing after 19s
Test External API and Providers / test-external (venv) (push) Failing after 18s
Python Package Build Test / build (3.12) (push) Failing after 49s
Test Llama Stack Build / build (push) Failing after 54s
UI Tests / ui-tests (22) (push) Failing after 1m26s
Pre-commit / pre-commit (push) Successful in 2m24s
# What does this PR do? NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. This PR adds the `input_type="query"` as default and updates the documentation to suggest using the `embedding` API for passage embeddings. <!-- If resolving an issue, uncomment and update the line below --> Resolves #2892 ## Test Plan ``` pytest -s -v tests/integration/inference/test_openai_embeddings.py --stack-config="inference=nvidia" --embedding-model="nvidia/llama-3.2-nv-embedqa-1b-v2" --env NVIDIA_API_KEY={nvidia_api_key} --env NVIDIA_BASE_URL="https://integrate.api.nvidia.com" ```
This commit is contained in:
parent
3f8df167f3
commit
55e9959f62
2 changed files with 59 additions and 1 deletions
|
@ -77,6 +77,10 @@ print(f"Response: {response.completion_message.content}")
|
||||||
```
|
```
|
||||||
|
|
||||||
### Create Embeddings
|
### Create Embeddings
|
||||||
|
> Note on OpenAI embeddings compatibility
|
||||||
|
>
|
||||||
|
> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
response = client.inference.embeddings(
|
response = client.inference.embeddings(
|
||||||
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
|
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncIterator
|
from collections.abc import AsyncIterator
|
||||||
|
|
||||||
from openai import APIConnectionError, BadRequestError
|
from openai import NOT_GIVEN, APIConnectionError, BadRequestError
|
||||||
|
|
||||||
from llama_stack.apis.common.content_types import (
|
from llama_stack.apis.common.content_types import (
|
||||||
InterleavedContent,
|
InterleavedContent,
|
||||||
|
@ -26,6 +26,9 @@ from llama_stack.apis.inference import (
|
||||||
Inference,
|
Inference,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
OpenAIEmbeddingData,
|
||||||
|
OpenAIEmbeddingsResponse,
|
||||||
|
OpenAIEmbeddingUsage,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
SamplingParams,
|
SamplingParams,
|
||||||
TextTruncation,
|
TextTruncation,
|
||||||
|
@ -210,6 +213,57 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper):
|
||||||
#
|
#
|
||||||
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
|
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
|
||||||
|
|
||||||
|
async def openai_embeddings(
|
||||||
|
self,
|
||||||
|
model: str,
|
||||||
|
input: str | list[str],
|
||||||
|
encoding_format: str | None = "float",
|
||||||
|
dimensions: int | None = None,
|
||||||
|
user: str | None = None,
|
||||||
|
) -> OpenAIEmbeddingsResponse:
|
||||||
|
"""
|
||||||
|
OpenAI-compatible embeddings for NVIDIA NIM.
|
||||||
|
|
||||||
|
Note: NVIDIA NIM asymmetric embedding models require an "input_type" field not present in the standard OpenAI embeddings API.
|
||||||
|
We default this to "query" to ensure requests succeed when using the
|
||||||
|
OpenAI-compatible endpoint. For passage embeddings, use the embeddings API with
|
||||||
|
`task_type='document'`.
|
||||||
|
"""
|
||||||
|
extra_body: dict[str, object] = {"input_type": "query"}
|
||||||
|
logger.warning(
|
||||||
|
"NVIDIA OpenAI-compatible embeddings: defaulting to input_type='query'. "
|
||||||
|
"For passage embeddings, use the embeddings API with task_type='document'."
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await self.client.embeddings.create(
|
||||||
|
model=await self._get_provider_model_id(model),
|
||||||
|
input=input,
|
||||||
|
encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN,
|
||||||
|
dimensions=dimensions if dimensions is not None else NOT_GIVEN,
|
||||||
|
user=user if user is not None else NOT_GIVEN,
|
||||||
|
extra_body=extra_body,
|
||||||
|
)
|
||||||
|
|
||||||
|
data = []
|
||||||
|
for i, embedding_data in enumerate(response.data):
|
||||||
|
data.append(
|
||||||
|
OpenAIEmbeddingData(
|
||||||
|
embedding=embedding_data.embedding,
|
||||||
|
index=i,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
usage = OpenAIEmbeddingUsage(
|
||||||
|
prompt_tokens=response.usage.prompt_tokens,
|
||||||
|
total_tokens=response.usage.total_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
return OpenAIEmbeddingsResponse(
|
||||||
|
data=data,
|
||||||
|
model=response.model,
|
||||||
|
usage=usage,
|
||||||
|
)
|
||||||
|
|
||||||
async def chat_completion(
|
async def chat_completion(
|
||||||
self,
|
self,
|
||||||
model_id: str,
|
model_id: str,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue