Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)

Merge branch 'main' into hide-non-openai-inference-apis

Commit cb534281c8
714 changed files with 123149 additions and 54618 deletions
tests/integration/inference/test_openai_completion.py

@@ -5,11 +5,26 @@
 # the root directory of this source tree.
 
+import time
+import unicodedata
+
 import pytest
 
 from ..test_cases.test_case import TestCase
 
 
+def _normalize_text(text: str) -> str:
+    """
+    Normalize Unicode text by removing diacritical marks for comparison.
+
+    The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
+    in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
+    Latin spelling. The test is failing because it's doing a simple case-insensitive string search
+    for "sol" but the actual response contains the diacritical mark.
+    """
+    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
+
+
 def provider_from_model(client_with_models, model_id):
     models = {m.identifier: m for m in client_with_models.models.list()}
     models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
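For reference, a standalone sketch of what the new _normalize_text helper does with the "sōl" case described in its docstring (plain Python, not part of this commit):

import unicodedata

def _normalize_text(text: str) -> str:
    # NFD decomposes "ō" into "o" plus a combining macron; encoding to ASCII
    # with errors="ignore" drops the combining mark, and lower() folds case.
    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()

assert _normalize_text("Sōl") == "sol"
assert "sol" in _normalize_text("The Latin name of the Sun is Sōl")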
@@ -25,7 +40,6 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         "inline::sentence-transformers",
         "inline::vllm",
         "remote::bedrock",
-        "remote::cerebras",
         "remote::databricks",
         # Technically Nvidia does support OpenAI completions, but none of their hosted models
         # support both completions and chat completions endpoint and all the Llama models are
@@ -33,8 +47,17 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         "remote::nvidia",
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
         "remote::vertexai",
+        # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
+        # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
+        "remote::groq",
+        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
+        "remote::anthropic",  # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
+        "remote::azure",  # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
+        # does not work with the specified model, gpt-5-mini. Please choose different model and try
+        # again. You can learn more about which models can be used with each operation here:
+        # https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
         "remote::watsonx",  # return 404 when hitting the /openai/v1 endpoint
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@@ -56,6 +79,31 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
         pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
 
 
+def skip_if_doesnt_support_n(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::sambanova",
+        "remote::ollama",
+        # https://console.groq.com/docs/openai#currently-unsupported-openai-features
+        # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
+        "remote::groq",
+        # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
+        # current model', 'status': 'INVALID_ARGUMENT'}}]
+        "remote::gemini",
+        # https://docs.anthropic.com/en/api/openai-sdk#simple-fields
+        "remote::anthropic",
+        "remote::vertexai",
+        # Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
+        # the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
+        "remote::tgi",  # TGI ignores n param silently
+        "remote::together",  # `n` > 1 is not supported when streaming tokens. Please disable `stream`
+        # Error code 400 - {'message': '"n" > 1 is not currently supported', 'type': 'invalid_request_error', 'param': 'n', 'code': 'wrong_api_format'}
+        "remote::cerebras",
+        "remote::databricks",  # Bad request: parameter "n" must be equal to 1 for streaming mode
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
+
+
 def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
     provider = provider_from_model(client_with_models, model_id)
     if provider.provider_type in (
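For context, a minimal request exercising the n parameter that this helper gates, sketched against the compat_client and text_model_id fixtures used throughout this file:

response = compat_client.chat.completions.create(
    model=text_model_id,
    messages=[{"role": "user", "content": "What's the name of the Sun in latin?"}],
    n=2,
    stream=True,
)
# Providers listed above either reject n > 1 outright or silently ignore it.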
@@ -63,10 +111,10 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode
         "inline::sentence-transformers",
         "inline::vllm",
         "remote::bedrock",
-        "remote::cerebras",
         "remote::databricks",
+        "remote::cerebras",
         "remote::runpod",
         "remote::tgi",
         "remote::watsonx",  # watsonx returns 404 when hitting the /openai/v1 endpoint
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")
@@ -130,7 +178,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
     assert len(response.choices) > 0
     choice = response.choices[0]
     assert len(choice.text) > 5
-    assert "france" in choice.text.lower()
+    normalized_text = _normalize_text(choice.text)
+    assert "france" in normalized_text
 
 
 @pytest.mark.parametrize(
@@ -221,7 +270,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
     )
     message_content = response.choices[0].message.content.lower().strip()
     assert len(message_content) > 0
-    assert expected.lower() in message_content
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text(message_content)
+    assert normalized_expected in normalized_content
 
 
 @pytest.mark.parametrize(
@@ -245,10 +296,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
     )
     streamed_content = []
     for chunk in response:
-        if chunk.choices[0].delta.content:
+        # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
+        if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
             streamed_content.append(chunk.choices[0].delta.content.lower().strip())
     assert len(streamed_content) > 0
-    assert expected.lower() in "".join(streamed_content)
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text("".join(streamed_content))
+    assert normalized_expected in normalized_content
 
 
 @pytest.mark.parametrize(
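A self-contained illustration of why the added guard matters, using toy stand-ins for the stream chunk objects (hypothetical shapes, not the real client types):

from dataclasses import dataclass, field

@dataclass
class Delta:
    content: str | None = None

@dataclass
class Choice:
    delta: Delta = field(default_factory=Delta)

@dataclass
class Chunk:
    choices: list[Choice] = field(default_factory=list)

# Azure-style stream: the first chunk carries an empty choices list, so
# indexing chunk.choices[0] unconditionally would raise IndexError.
stream = [Chunk(), Chunk(choices=[Choice(Delta("Sol"))]), Chunk(choices=[Choice(Delta(None))])]
collected = [c.choices[0].delta.content for c in stream if c.choices and c.choices[0].delta.content]
assert collected == ["Sol"]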
@@ -260,10 +314,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
 )
 def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
-
-    provider = provider_from_model(client_with_models, text_model_id)
-    if provider.provider_type == "remote::ollama":
-        pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
+    skip_if_doesnt_support_n(client_with_models, text_model_id)
 
     tc = TestCase(test_case)
     question = tc["question"]
@@ -284,8 +335,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
                     streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
                 )
     assert len(streamed_content) == 2
+    normalized_expected = _normalize_text(expected)
     for i, content in streamed_content.items():
-        assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
+        normalized_content = _normalize_text(content)
+        assert normalized_expected in normalized_content, (
+            f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
+        )
 
 
 @pytest.mark.parametrize(
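The dict-keyed-by-choice.index accumulation above, reduced to a toy example with stand-in (index, text) tuples instead of real chunks:

deltas = [(0, "so"), (1, "s"), (0, "l"), (1, "ol")]  # interleaved n=2 stream
streamed_content: dict[int, str] = {}
for index, text in deltas:
    streamed_content[index] = streamed_content.get(index, "") + text
assert streamed_content == {0: "sol", 1: "sol"}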
@@ -315,16 +370,23 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
         content = ""
         response_id = None
         for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                 response_id = chunk.id
-            if chunk.choices[0].delta.content:
+            if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
                 content += chunk.choices[0].delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
 
-    responses = client.chat.completions.list(limit=1000)
-    assert response_id in [r.id for r in responses.data]
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+    assert tries < 10, f"Response {response_id} not found after 1 second"
 
     retrieved_response = client.chat.completions.retrieve(response_id)
     assert retrieved_response.id == response_id
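The inline retry loop could also be factored into a helper; a hypothetical sketch (wait_for_stored_response is not a function in this file):

import time

def wait_for_stored_response(client, response_id: str, retries: int = 10, delay: float = 0.1) -> bool:
    """Poll the stored completions list until response_id appears or retries run out."""
    for _ in range(retries):
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            return True
        time.sleep(delay)
    return False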
@@ -379,15 +441,28 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
         content = ""
         response_id = None
         for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                 response_id = chunk.id
-            if delta := chunk.choices[0].delta:
-                if delta.content:
-                    content += delta.content
+            if chunk.choices and len(chunk.choices) > 0:
+                if delta := chunk.choices[0].delta:
+                    if delta.content:
+                        content += delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
 
+    # wait for the response to be stored
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+
+    assert tries < 10, f"Response {response_id} not found after 1 second"
+
     responses = client.chat.completions.list(limit=1000)
     assert response_id in [r.id for r in responses.data]
@@ -441,4 +516,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
         stream=False,
     )
     message_content = response.choices[0].message.content.lower().strip()
-    assert "hello world" in message_content
+    normalized_content = _normalize_text(message_content)
+    assert "hello world" in normalized_content

tests/integration/inference/test_openai_embeddings.py
@@ -29,9 +29,40 @@ def provider_from_model(client_with_models, model_id):
     return providers[provider_id]
 
 
-def skip_if_model_doesnt_support_variable_dimensions(model_id):
-    if "text-embedding-3" not in model_id:
-        pytest.skip("{model_id} does not support variable output embedding dimensions")
+def skip_if_model_doesnt_support_user_param(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # service returns 400
+        "remote::fireworks",  # service returns 400 malformed input
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support user param.")
+
+
+def skip_if_model_doesnt_support_encoding_format_base64(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::databricks",  # param silently ignored, always returns floats
+        "remote::fireworks",  # param silently ignored, always returns list of floats
+        "remote::ollama",  # param silently ignored, always returns list of floats
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support encoding_format='base64'.")
+
+
+def skip_if_model_doesnt_support_variable_dimensions(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::together",  # returns 400
+        "inline::sentence-transformers",
+        # Error code: 400 - {'error_code': 'BAD_REQUEST', 'message': 'Bad request: json: unknown field "dimensions"\n'}
+        "remote::databricks",
+    ):
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
+    if provider.provider_type == "remote::openai" and "text-embedding-3" not in model_id:
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
 
 
 @pytest.fixture(params=["openai_client", "llama_stack_client"])
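How the new skip helpers are meant to compose, as a hypothetical test following this file's fixture conventions (the test name and inputs are illustrative, not part of this commit):

def test_openai_embeddings_base64_with_dimensions(compat_client, client_with_models, embedding_model_id):
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    response = compat_client.embeddings.create(
        model=embedding_model_id,
        input="Hello, base64 world",
        dimensions=8,
        encoding_format="base64",
    )
    assert isinstance(response.data[0].embedding, str)  # base64 payload, not a list of floats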
@@ -47,7 +78,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
         "inline::meta-reference",
         "remote::bedrock",
         "remote::cerebras",
-        "remote::databricks",
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
@@ -92,6 +122,7 @@ def test_openai_embeddings_multiple_strings(compat_client, client_with_models, e
     response = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_texts,
+        encoding_format="float",
     )
 
     assert response.object == "list"
@@ -127,7 +158,7 @@ def test_openai_embeddings_with_encoding_format_float(compat_client, client_with
 def test_openai_embeddings_with_dimensions(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with custom dimensions parameter."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)
 
     input_text = "Test dimensions parameter"
     dimensions = 16
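What the updated helper now gates, as a minimal check of the dimensions parameter (a sketch assuming the fixtures above, mirroring test_openai_embeddings_with_dimensions):

response = compat_client.embeddings.create(
    model=embedding_model_id,
    input="Test dimensions parameter",
    dimensions=16,
    encoding_format="float",
)
assert len(response.data[0].embedding) == 16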
@@ -148,6 +179,7 @@ def test_openai_embeddings_with_dimensions(compat_client, client_with_models, em
 def test_openai_embeddings_with_user_parameter(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with user parameter."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_user_param(client_with_models, embedding_model_id)
 
     input_text = "Test user parameter"
     user_id = "test-user-123"
@@ -196,11 +228,13 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
     response1 = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_text1,
+        encoding_format="float",
     )
 
     response2 = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_text2,
+        encoding_format="float",
     )
 
     embedding1 = response1.data[0].embedding
@@ -214,7 +248,8 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
 def test_openai_embeddings_with_encoding_format_base64(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with base64 encoding format."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)
 
     input_text = "Test base64 encoding format"
     dimensions = 12
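For context on what the base64 assertions receive: OpenAI-compatible servers return each embedding as base64-encoded little-endian float32 bytes, which decode like this (standalone sketch):

import base64
import struct

def decode_base64_embedding(payload: str) -> list[float]:
    raw = base64.b64decode(payload)
    return list(struct.unpack(f"<{len(raw) // 4}f", raw))  # little-endian float32

# Round-trip check with float32-exact values.
vec = [0.25, -1.0, 3.5]
encoded = base64.b64encode(struct.pack(f"<{len(vec)}f", *vec)).decode("ascii")
assert decode_base64_embedding(encoded) == vec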
@@ -247,6 +282,7 @@ def test_openai_embeddings_base64_batch_processing(compat_client, client_with_mo
 def test_openai_embeddings_base64_batch_processing(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with base64 encoding for batch processing."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
 
     input_texts = ["First text for base64", "Second text for base64", "Third text for base64"]
 
@@ -255,7 +291,6 @@ def test_openai_embeddings_base64_batch_processing(compat_client, client_with_mo
         input=input_texts,
         encoding_format="base64",
     )
-
     # Validate response structure
     assert response.object == "list"
     assert response.model == embedding_model_id