mirror of
				https://github.com/meta-llama/llama-stack.git
				synced 2025-10-25 17:11:12 +00:00 
			
		
		
		
	
		
			Some checks failed
		
		
	
	SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
				
			Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
				
			SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
				
			Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
				
			Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
				
			Python Package Build Test / build (3.13) (push) Failing after 1s
				
			Test Llama Stack Build / generate-matrix (push) Successful in 4s
				
			Test Llama Stack Build / build-custom-container-distribution (push) Failing after 3s
				
			Python Package Build Test / build (3.12) (push) Failing after 2s
				
			Test Llama Stack Build / build-single-provider (push) Failing after 4s
				
			Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
				
			Test External API and Providers / test-external (venv) (push) Failing after 5s
				
			Unit Tests / unit-tests (3.12) (push) Failing after 5s
				
			Test Llama Stack Build / build (push) Failing after 4s
				
			Unit Tests / unit-tests (3.13) (push) Failing after 5s
				
			Vector IO Integration Tests / test-matrix (push) Failing after 9s
				
			API Conformance Tests / check-schema-compatibility (push) Successful in 16s
				
			UI Tests / ui-tests (22) (push) Successful in 33s
				
			Pre-commit / pre-commit (push) Successful in 1m33s
				
			# What does this PR do? <!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. --> Previously, the NVIDIA inference provider implemented a custom `openai_embeddings` method with a hardcoded `input_type="query"` parameter, which is required by NVIDIA asymmetric embedding models ([https://github.com/llamastack/llama-stack/pull/3205](https://github.com/llamastack/llama-stack/pull/3205)). An `extra_body` parameter was recently added to the embeddings API ([https://github.com/llamastack/llama-stack/pull/3794](https://github.com/llamastack/llama-stack/pull/3794)). This PR therefore updates the NVIDIA inference provider to use the base `OpenAIMixin.openai_embeddings` method instead, and to pass `input_type` through the `extra_body` parameter for asymmetric embedding models. <!-- If resolving an issue, uncomment and update the line below --> <!-- Closes #[issue-number] --> ## Test Plan <!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* --> Run the following command once for each ```embedding_model```: ```nvidia/llama-3.2-nv-embedqa-1b-v2```, ```nvidia/nv-embedqa-e5-v5```, ```nvidia/nv-embedqa-mistral-7b-v2```, and ```snowflake/arctic-embed-l```. ``` pytest -s -v tests/integration/inference/test_openai_embeddings.py --stack-config="inference=nvidia" --embedding-model={embedding_model} --env NVIDIA_API_KEY={nvidia_api_key} --env NVIDIA_BASE_URL="https://integrate.api.nvidia.com" --inference-mode=record ```
		
			
				
	
	
		
			386 lines
		
	
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			386 lines
		
	
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright (c) Meta Platforms, Inc. and affiliates.
 | |
| # All rights reserved.
 | |
| #
 | |
| # This source code is licensed under the terms described in the LICENSE file in
 | |
| # the root directory of this source tree.
 | |
| 
 | |
| import base64
 | |
| import struct
 | |
| 
 | |
| import pytest
 | |
| from openai import OpenAI
 | |
| 
 | |
| from llama_stack.core.library_client import LlamaStackAsLibraryClient
 | |
| 
 | |
# Embedding models, keyed by provider type, that the helpers in this module
# treat as asymmetric (requests for them carry an extra `input_type` field).
ASYMMETRIC_EMBEDDING_MODELS_BY_PROVIDER: dict[str, list[str]] = {
    "remote::nvidia": [
        "nvidia/llama-3.2-nv-embedqa-1b-v2",
        "nvidia/nv-embedqa-e5-v5",
        "nvidia/nv-embedqa-mistral-7b-v2",
        "snowflake/arctic-embed-l",
    ],
}
 | |
| 
 | |
| 
 | |
def decode_base64_to_floats(base64_string: str) -> list[float]:
    """Decode a base64 payload into the list of float32 values it packs."""
    raw = base64.b64decode(base64_string)
    # Every float32 takes four bytes, so the byte length fixes the value count.
    layout = f"{len(raw) // 4}f"
    return [*struct.unpack(layout, raw)]
 | |
| 
 | |
| 
 | |
def provider_from_model(client_with_models, model_id):
    """Return the provider entry that serves ``model_id``.

    The model may be referenced either by its ``identifier`` or by its
    ``provider_resource_id``. When one key appears as both, the
    ``provider_resource_id`` match takes precedence.

    Args:
        client_with_models: Client exposing ``models.list()`` and ``providers.list()``.
        model_id: Model identifier or provider resource id to look up.

    Returns:
        The entry from ``providers.list()`` whose ``provider_id`` equals the
        matched model's ``provider_id``.

    Raises:
        KeyError: If the model, or its provider, is not in the listings.
    """
    # Fetch the model listing once and reuse it for both lookup keys, instead of
    # issuing a second identical ``models.list()`` call.
    listed_models = list(client_with_models.models.list())
    models = {m.identifier: m for m in listed_models}
    models.update({m.provider_resource_id: m for m in listed_models})
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    return providers[provider_id]
 | |
| 
 | |
| 
 | |
def is_asymmetric_model(client_with_models, model_id):
    """Report whether ``model_id`` is listed as asymmetric for its provider type."""
    kind = provider_from_model(client_with_models, model_id).provider_type
    asymmetric_models = ASYMMETRIC_EMBEDDING_MODELS_BY_PROVIDER.get(kind)
    if asymmetric_models is None:
        # No asymmetric models are registered for this provider type.
        return False
    return model_id in asymmetric_models
 | |
| 
 | |
| 
 | |
def get_extra_body_for_model(client_with_models, model_id, input_type="query"):
    """Build the ``extra_body`` for an embeddings request, or ``None`` if none applies.

    Only asymmetric models served by a ``remote::nvidia`` provider get a
    payload, which carries the given ``input_type``.
    """
    if is_asymmetric_model(client_with_models, model_id):
        served_by = provider_from_model(client_with_models, model_id)
        if served_by.provider_type == "remote::nvidia":
            return {"input_type": input_type}
    return None
 | |
| 
 | |
| 
 | |
def skip_if_model_doesnt_support_user_param(client, model_id):
    """Skip the running test when the model's provider type is listed as rejecting `user`."""
    provider = provider_from_model(client, model_id)
    rejecting_provider_types = (
        "remote::together",  # service returns 400
        "remote::fireworks",  # service returns 400 malformed input
    )
    if provider.provider_type not in rejecting_provider_types:
        return
    pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support user param.")
 | |
| 
 | |
| 
 | |
def skip_if_model_doesnt_support_encoding_format_base64(client, model_id):
    """Skip the running test when base64 embeddings are listed as unsupported for the model."""
    provider = provider_from_model(client, model_id)

    # Provider types for which every model is skipped.
    ignores_encoding_format = provider.provider_type in (
        "remote::databricks",  # param silently ignored, always returns floats
        "remote::fireworks",  # param silently ignored, always returns list of floats
        "remote::ollama",  # param silently ignored, always returns list of floats
    )
    # NVIDIA is skipped only for these specific models.
    nvidia_model_without_base64 = provider.provider_type == "remote::nvidia" and model_id in [
        "nvidia/nv-embedqa-e5-v5",
        "nvidia/nv-embedqa-mistral-7b-v2",
        "snowflake/arctic-embed-l",
    ]

    if ignores_encoding_format or nvidia_model_without_base64:
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support encoding_format='base64'.")
 | |
| 
 | |
| 
 | |
def skip_if_model_doesnt_support_variable_dimensions(client_with_models, model_id):
    """Skip the running test when the model is listed as not accepting a `dimensions` value."""
    provider = provider_from_model(client_with_models, model_id)
    provider_type = provider.provider_type

    if provider_type in (
        "remote::together",  # returns 400
        "inline::sentence-transformers",
        # Error code: 400 - {'error_code': 'BAD_REQUEST', 'message': 'Bad request: json: unknown field "dimensions"\n'}
        "remote::databricks",
        "remote::watsonx",  # openai.BadRequestError: Error code: 400 - {'detail': "litellm.UnsupportedParamsError: watsonx does not support parameters: {'dimensions': 384}
    ):
        # Skipped for every model of these provider types.
        fixed_dimensions = True
    elif provider_type == "remote::openai":
        # For OpenAI, only model ids containing "text-embedding-3" are exercised.
        fixed_dimensions = "text-embedding-3" not in model_id
    elif provider_type == "remote::nvidia":
        fixed_dimensions = model_id in [
            "nvidia/nv-embedqa-e5-v5",
            "nvidia/nv-embedqa-mistral-7b-v2",
            "snowflake/arctic-embed-l",
        ]
    else:
        fixed_dimensions = False

    if fixed_dimensions:
        pytest.skip(
            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
        )
 | |
| 
 | |
| 
 | |
@pytest.fixture(params=["openai_client", "llama_stack_client"])
def compat_client(request, client_with_models):
    """Resolve the client fixture named by the current param.

    The ``openai_client`` variant is skipped when ``client_with_models`` is a
    ``LlamaStackAsLibraryClient``.
    """
    fixture_name = request.param
    uses_library_client = isinstance(client_with_models, LlamaStackAsLibraryClient)
    if fixture_name == "openai_client" and uses_library_client:
        pytest.skip("OpenAI client tests not supported with library client")
    return request.getfixturevalue(fixture_name)
 | |
| 
 | |
| 
 | |
def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
    """Skip the running test when the model's provider type is listed as lacking OpenAI embeddings."""
    provider = provider_from_model(client, model_id)
    provider_types_without_embeddings = (
        "inline::meta-reference",
        "remote::bedrock",
        "remote::cerebras",
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
    )
    if provider.provider_type not in provider_types_without_embeddings:
        return
    pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")
 | |
| 
 | |
| 
 | |
@pytest.fixture
def openai_client(client_with_models):
    """Build an OpenAI SDK client pointed at the ``/v1`` path of the stack's base URL.

    The API key passed to the SDK is the placeholder string ``"fake"``.
    """
    # NOTE(review): base_url may end with "/" (to be confirmed for the client in
    # use); strip it so the joined URL never contains "//v1".
    stack_url = str(client_with_models.base_url).rstrip("/")
    return OpenAI(base_url=f"{stack_url}/v1", api_key="fake")
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_single_string(compat_client, client_with_models, embedding_model_id):
    """Embed one string and validate the structure of the list response."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    request_kwargs = {
        "model": embedding_model_id,
        "input": "Hello, world!",
        "encoding_format": "float",
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    assert response.object == "list"

    # The reported model may be provider-scoped (e.g., sentence-transformers/nomic-ai/nomic-embed-text-v1.5)
    assert response.model == embedding_model_id or response.model.endswith(f"/{embedding_model_id}")
    assert len(response.data) == 1

    item = response.data[0]
    assert item.object == "embedding"
    assert item.index == 0
    assert isinstance(item.embedding, list)
    assert len(item.embedding) > 0
    assert all(isinstance(x, float) for x in item.embedding)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_multiple_strings(compat_client, client_with_models, embedding_model_id):
    """Embed a batch of strings and validate one embedding per input, in order."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    input_texts = ["Hello, world!", "How are you today?", "This is a test."]
    request_kwargs = {
        "model": embedding_model_id,
        "input": input_texts,
        "encoding_format": "float",
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    assert response.object == "list"

    # The reported model may be provider-scoped (e.g., sentence-transformers/nomic-ai/nomic-embed-text-v1.5)
    assert response.model == embedding_model_id or response.model.endswith(f"/{embedding_model_id}")
    assert len(response.data) == len(input_texts)

    for position, item in enumerate(response.data):
        assert item.object == "embedding"
        assert item.index == position
        vector = item.embedding
        assert isinstance(vector, list)
        assert len(vector) > 0
        assert all(isinstance(x, float) for x in vector)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_with_encoding_format_float(compat_client, client_with_models, embedding_model_id):
    """Request ``encoding_format="float"`` and expect a list of Python floats."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    extra_body = get_extra_body_for_model(client_with_models, embedding_model_id)
    response = compat_client.embeddings.create(
        model=embedding_model_id,
        input="Test encoding format",
        encoding_format="float",
        extra_body=extra_body,
    )

    assert response.object == "list"
    assert len(response.data) == 1

    vector = response.data[0].embedding
    assert isinstance(vector, list)
    assert all(isinstance(x, float) for x in vector)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_with_dimensions(compat_client, client_with_models, embedding_model_id):
    """Send a custom ``dimensions`` value and expect a non-empty embedding back."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    request_kwargs = {
        "model": embedding_model_id,
        "input": "Test dimensions parameter",
        "dimensions": 16,
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    assert response.object == "list"
    assert len(response.data) == 1

    # The exact length is deliberately not asserted: not all models honor custom dimensions.
    vector = response.data[0].embedding
    assert isinstance(vector, list)
    assert len(vector) > 0
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_with_user_parameter(compat_client, client_with_models, embedding_model_id):
    """Send a ``user`` value with the request and expect a non-empty embedding back."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_user_param(client_with_models, embedding_model_id)

    request_kwargs = {
        "model": embedding_model_id,
        "input": "Test user parameter",
        "user": "test-user-123",
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    assert response.object == "list"
    assert len(response.data) == 1

    vector = response.data[0].embedding
    assert isinstance(vector, list)
    assert len(vector) > 0
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_empty_list_error(compat_client, client_with_models, embedding_model_id):
    """An empty input list must make the embeddings call raise."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    # Everything, including building extra_body, stays inside the raises block.
    with pytest.raises(Exception):  # noqa: B017
        extra_body = get_extra_body_for_model(client_with_models, embedding_model_id)
        compat_client.embeddings.create(model=embedding_model_id, input=[], extra_body=extra_body)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_invalid_model_error(compat_client, client_with_models, embedding_model_id):
    """An unknown model id must make the embeddings call raise."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    # Everything, including building extra_body, stays inside the raises block.
    with pytest.raises(Exception):  # noqa: B017
        extra_body = get_extra_body_for_model(client_with_models, embedding_model_id)
        compat_client.embeddings.create(model="invalid-model-id", input="Test text", extra_body=extra_body)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_different_inputs_different_outputs(compat_client, client_with_models, embedding_model_id):
    """Two distinct texts must yield same-length but unequal embeddings."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)

    texts = ("This is the first text", "This is completely different content")
    extra_body = get_extra_body_for_model(client_with_models, embedding_model_id)

    # Issue both requests first, then read the vectors out of the responses.
    responses = [
        compat_client.embeddings.create(
            model=embedding_model_id,
            input=text,
            encoding_format="float",
            extra_body=extra_body,
        )
        for text in texts
    ]
    embedding1, embedding2 = (reply.data[0].embedding for reply in responses)

    assert len(embedding1) == len(embedding2)
    # Different inputs are expected to map to different vectors
    assert embedding1 != embedding2
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_with_encoding_format_base64(compat_client, client_with_models, embedding_model_id):
    """Request a base64 embedding with fixed dimensions and decode it back to floats."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    dimensions = 12
    request_kwargs = {
        "model": embedding_model_id,
        "input": "Test base64 encoding format",
        "encoding_format": "base64",
        "dimensions": dimensions,
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    # Response envelope
    assert response.object == "list"
    assert len(response.data) == 1

    # A base64 response carries the embedding as a string rather than a list
    item = response.data[0]
    assert item.object == "embedding"
    assert item.index == 0
    assert isinstance(item.embedding, str)

    # Decoding also proves the payload is valid base64
    embedding_floats = decode_base64_to_floats(item.embedding)

    # The decoded vector must have exactly the requested length
    assert len(embedding_floats) == dimensions, f"Got embedding length {len(embedding_floats)}, expected {dimensions}"
    assert all(isinstance(x, float) for x in embedding_floats)
 | |
| 
 | |
| 
 | |
def test_openai_embeddings_base64_batch_processing(compat_client, client_with_models, embedding_model_id):
    """Request base64 embeddings for a batch and check they decode to equal-length vectors."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)

    input_texts = ["First text for base64", "Second text for base64", "Third text for base64"]
    request_kwargs = {
        "model": embedding_model_id,
        "input": input_texts,
        "encoding_format": "base64",
        "extra_body": get_extra_body_for_model(client_with_models, embedding_model_id),
    }
    response = compat_client.embeddings.create(**request_kwargs)

    # Response envelope
    assert response.object == "list"

    # The reported model may be provider-scoped (e.g., sentence-transformers/nomic-ai/nomic-embed-text-v1.5)
    assert response.model == embedding_model_id or response.model.endswith(f"/{embedding_model_id}")
    assert len(response.data) == len(input_texts)

    # Check every item of the batch and record its decoded length
    vector_lengths = []
    for position, item in enumerate(response.data):
        assert item.object == "embedding"
        assert item.index == position

        # A base64 response carries the embedding as a string rather than a list
        assert isinstance(item.embedding, str)
        decoded = decode_base64_to_floats(item.embedding)
        assert len(decoded) > 0
        assert all(isinstance(x, float) for x in decoded)
        vector_lengths.append(len(decoded))

    # Every vector in the batch must share one dimensionality
    assert len(set(vector_lengths)) <= 1
 |