feat: add embedding and dynamic model support to Together inference adapter (#3458)

# What does this PR do? adds embedding and dynamic model support to Together inference adapter - updated to use OpenAIMixin - workarounds for Together api quirks - recordings for together suite when subdirs=inference,pattern=openai ## Test Plan ``` $ TOGETHER_API_KEY=_NONE_ ./scripts/integration-tests.sh --stack-config server:ci-tests --setup together --subdirs inference --pattern openai ... tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:completion:sanity] instantiating llama_stack_client Port 8321 is already in use, assuming server is already running... llama_stack_client instantiated in 0.121s PASSED [ 2%] tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:completion:suffix] SKIPPED [ 4%] tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:completion:sanity] PASSED [ 6%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-1] SKIPPED [ 8%] tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free] SKIPPED [ 10%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:non_streaming_01] PASSED [ 12%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_01] PASSED [ 14%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_01] SKIPPED [ 17%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-True] PASSED [ 19%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-True] PASSED [ 21%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free] SKIPPED [ 23%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_single_string[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 25%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_multiple_strings[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 27%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 29%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 31%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_user_parameter[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 34%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_empty_list_error[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 36%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_invalid_model_error[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 38%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 40%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_base64[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 42%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_base64_batch_processing[openai_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 44%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-0] SKIPPED [ 46%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:non_streaming_02] PASSED [ 48%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_02] PASSED [ 51%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_02] SKIPPED [ 53%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-False] PASSED [ 55%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-False] PASSED [ 57%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_single_string[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 59%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_multiple_strings[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 61%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 63%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 65%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_user_parameter[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 68%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_empty_list_error[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 70%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_invalid_model_error[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 72%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] PASSED [ 74%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_base64[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 76%] tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_base64_batch_processing[llama_stack_client-emb=together/togethercomputer/m2-bert-80M-32k-retrieval] SKIPPED [ 78%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:non_streaming_01] PASSED [ 80%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_01] PASSED [ 82%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_01] SKIPPED [ 85%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-True] PASSED [ 87%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-True] PASSED [ 89%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:non_streaming_02] PASSED [ 91%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_02] PASSED [ 93%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-inference:chat_completion:streaming_02] SKIPPED [ 95%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-False] PASSED [ 97%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free-False] PASSED [100%] ============================================ 30 passed, 17 skipped, 50 deselected, 4 warnings in 21.96s ============================================= ```
2025-12-03 18:00:36 +00:00 · 2025-09-16 14:53:41 -04:00 · 2025-09-16 14:53:41 -04:00 · 49d4a5cc84
commit 49d4a5cc84
parent 3defdf7d3a
20 changed files with 9229 additions and 180 deletions
--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@ -29,9 +29,35 @@ def provider_from_model(client_with_models, model_id):
    return providers[provider_id]


-def skip_if_model_doesnt_support_variable_dimensions(model_id):
-    if "text-embedding-3" not in model_id:
-        pytest.skip("{model_id} does not support variable output embedding dimensions")
+def skip_if_model_doesnt_support_user_param(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # service returns 400
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support user param.")
+
+
+def skip_if_model_doesnt_support_encoding_format_base64(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # param silently ignored, always returns floats
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support encoding_format='base64'.")
+
+
+def skip_if_model_doesnt_support_variable_dimensions(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::together",  # returns 400
+        "inline::sentence-transformers",
+    ):
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
+    if provider.provider_type == "remote::openai" and "text-embedding-3" not in model_id:
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )


@pytest.fixture(params=["openai_client", "llama_stack_client"])
@ -92,6 +118,7 @@ def test_openai_embeddings_multiple_strings(compat_client, client_with_models, e
    response = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_texts,
+        encoding_format="float",
    )

    assert response.object == "list"
@ -127,7 +154,7 @@ def test_openai_embeddings_with_encoding_format_float(compat_client, client_with
 def test_openai_embeddings_with_dimensions(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with custom dimensions parameter."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    input_text = "Test dimensions parameter"
    dimensions = 16
@ -148,6 +175,7 @@ def test_openai_embeddings_with_dimensions(compat_client, client_with_models, em
 def test_openai_embeddings_with_user_parameter(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with user parameter."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_user_param(client_with_models, embedding_model_id)

    input_text = "Test user parameter"
    user_id = "test-user-123"
@ -196,11 +224,13 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
    response1 = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_text1,
+        encoding_format="float",
    )

    response2 = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_text2,
+        encoding_format="float",
    )

    embedding1 = response1.data[0].embedding
@ -214,7 +244,8 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
 def test_openai_embeddings_with_encoding_format_base64(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with base64 encoding format."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    input_text = "Test base64 encoding format"
    dimensions = 12
@ -247,6 +278,7 @@ def test_openai_embeddings_with_encoding_format_base64(compat_client, client_wit
 def test_openai_embeddings_base64_batch_processing(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with base64 encoding for batch processing."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)

    input_texts = ["First text for base64", "Second text for base64", "Third text for base64"]