fix: NVIDIA embedding results in InternalServerError (#1851)

Closes #1819 

## Test Plan

```bash
pytest -v tests/integration/inference/test_embedding.py  --stack-config=http://localhost:5002 --embedding-model=nvidia/llama-3.2-nv-embedqa-1b-v2
=============================================================================== test session starts ================================================================================
platform linux -- Python 3.10.0, pytest-8.3.5, pluggy-1.5.0 -- /home/ubuntu/miniconda/envs/nvidia-1/bin/python
cachedir: .pytest_cache
rootdir: /home/ubuntu/llama-stack
configfile: pyproject.toml
plugins: anyio-4.9.0
collected 23 items                                                                                                                                                                 

tests/integration/inference/test_embedding.py::test_embedding_text[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-list[string]] PASSED                                                [  4%]
tests/integration/inference/test_embedding.py::test_embedding_text[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-list[text]] PASSED                                                  [  8%]
tests/integration/inference/test_embedding.py::test_embedding_image[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-list[url,base64]] XFAIL (nvidia/llama-3.2-nv-embedqa-1b-v2 doe...) [ 13%]
tests/integration/inference/test_embedding.py::test_embedding_image[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-list[url,string,base64,text]] XFAIL (nvidia/llama-3.2-nv-embed...) [ 17%]
tests/integration/inference/test_embedding.py::test_embedding_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-end] PASSED                                              [ 21%]
tests/integration/inference/test_embedding.py::test_embedding_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-start] PASSED                                            [ 26%]
tests/integration/inference/test_embedding.py::test_embedding_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-short-end] PASSED                                             [ 30%]
tests/integration/inference/test_embedding.py::test_embedding_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-short-start] PASSED                                           [ 34%]
tests/integration/inference/test_embedding.py::test_embedding_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-text-None] PASSED                                  [ 39%]
tests/integration/inference/test_embedding.py::test_embedding_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-text-none] PASSED                                  [ 43%]
tests/integration/inference/test_embedding.py::test_embedding_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-str-None] PASSED                                   [ 47%]
tests/integration/inference/test_embedding.py::test_embedding_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-long-str-none] PASSED                                   [ 52%]
tests/integration/inference/test_embedding.py::test_embedding_output_dimension[emb=nvidia/llama-3.2-nv-embedqa-1b-v2] PASSED                                                 [ 56%]
tests/integration/inference/test_embedding.py::test_embedding_task_type[emb=nvidia/llama-3.2-nv-embedqa-1b-v2] PASSED                                                        [ 60%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-None] PASSED                                             [ 65%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-none] PASSED                                             [ 69%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-end] PASSED                                              [ 73%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-start] PASSED                                            [ 78%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-NONE] PASSED                                       [ 82%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-END] PASSED                                        [ 86%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-START] PASSED                                      [ 91%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-left] PASSED                                       [ 95%]
tests/integration/inference/test_embedding.py::test_embedding_text_truncation_error[emb=nvidia/llama-3.2-nv-embedqa-1b-v2-right] PASSED                                      [100%]

===================================================================== 21 passed, 2 xfailed, 1 warning in 7.18s =====================================================================
```

[//]: # (## Documentation)

cc: @dglogo @mattf @sumitb
This commit is contained in:
Rashmi Pawar 2025-04-01 17:01:29 +05:30 committed by GitHub
parent 0a895c70d1
commit c169c164b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -205,7 +205,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
extra_body["input_type"] = task_type_options[task_type]
try:
response = await self._client.embeddings.create(
response = await self._get_client(model).embeddings.create(
model=model,
input=input,
extra_body=extra_body,