Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-04 04:04:14 +00:00
chore(apis): unpublish deprecated /v1/inference apis (#3297)
# What does this PR do?

Unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations remain available for internal use; for example, the agents API still uses chat-completion.
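For callers migrating off the unpublished routes, the replacement endpoints are OpenAI-compatible and can be reached with the stock `openai` client. The sketch below is illustrative only and is not part of this diff; the server URL, port, API key, and model id are assumptions, not values taken from this PR.

```python
from openai import OpenAI

# Point the standard OpenAI client at the Llama Stack server's
# OpenAI-compatible prefix (hypothetical local server URL and credentials).
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Replaces the unpublished POST /v1/inference/completion
completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed model id
    prompt="Hello, world!",
    stream=False,
)
print(completion.choices[0].text)

# Replaces the unpublished POST /v1/inference/chat-completion
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed model id
    messages=[{"role": "user", "content": "Say hello."}],
)
print(chat.choices[0].message.content)
```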
This commit is contained in:
parent 60484c5c4e
commit 53b15725b6

23 changed files with 3134 additions and 1347 deletions
@@ -9,6 +9,7 @@ import time
import unicodedata

import pytest
from pydantic import BaseModel

from ..test_cases.test_case import TestCase
@@ -62,6 +63,14 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")


def skip_if_doesnt_support_completions_logprobs(client_with_models, model_id):
    provider_type = provider_from_model(client_with_models, model_id).provider_type
    if provider_type in (
        "remote::ollama",  # logprobs is ignored
    ):
        pytest.skip(f"Model {model_id} hosted by {provider_type} doesn't support /v1/completions logprobs.")


def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
    # To test `fim` (fill in the middle) completion, we need to use a model that supports suffix.
    # Use this to specifically test this API functionality.
@@ -205,28 +214,6 @@ def test_openai_completion_streaming(llama_stack_client, client_with_models, tex
    assert len(content_str) > 10


@pytest.mark.parametrize(
    "prompt_logprobs",
    [
        1,
        0,
    ],
)
def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "Hello, world!"
    response = llama_stack_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        prompt_logprobs=prompt_logprobs,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.prompt_logprobs) > 0


def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)
@@ -518,3 +505,214 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
    message_content = response.choices[0].message.content.lower().strip()
    normalized_content = _normalize_text(message_content)
    assert "hello world" in normalized_content


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:stop_sequence",
    ],
)
def test_openai_completion_stop_sequence(client_with_models, openai_client, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)

    tc = TestCase(test_case)

    response = openai_client.completions.create(
        model=text_model_id,
        prompt=tc["content"],
        stop="1963",
        stream=False,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert "1963" not in choice.text

    response = openai_client.completions.create(
        model=text_model_id,
        prompt=tc["content"],
        stop=["blathering", "1963"],
        stream=False,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert "1963" not in choice.text


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:log_probs",
    ],
)
def test_openai_completion_logprobs(client_with_models, openai_client, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)

    tc = TestCase(test_case)

    response = openai_client.completions.create(
        model=text_model_id,
        prompt=tc["content"],
        logprobs=5,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert choice.text, "Response text should not be empty"
    assert choice.logprobs, "Logprobs should not be empty"
    logprobs = choice.logprobs
    assert logprobs.token_logprobs, "Response tokens should not be empty"
    assert len(logprobs.tokens) == len(logprobs.token_logprobs)
    assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
    for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
        assert logprobs.top_logprobs[i][token] == prob
        assert len(logprobs.top_logprobs[i]) == 5


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:log_probs",
    ],
)
def test_openai_completion_logprobs_streaming(client_with_models, openai_client, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)

    tc = TestCase(test_case)

    response = openai_client.completions.create(
        model=text_model_id,
        prompt=tc["content"],
        logprobs=3,
        stream=True,
        max_tokens=5,
    )
    for chunk in response:
        # use the per-chunk choice; a streamed response has no top-level .choices
        choice = chunk.choices[0]
        if choice.text:  # if there's a token, we expect logprobs
            assert choice.logprobs, "Logprobs should not be empty"
            logprobs = choice.logprobs
            assert logprobs.token_logprobs, "Response tokens should not be empty"
            assert len(logprobs.tokens) == len(logprobs.token_logprobs)
            assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
            for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
                assert logprobs.top_logprobs[i][token] == prob
                assert len(logprobs.top_logprobs[i]) == 3
        else:  # no token, no logprobs
            assert not choice.logprobs, "Logprobs should be empty"


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:tool_calling",
    ],
)
def test_openai_chat_completion_with_tools(openai_client, text_model_id, test_case):
    tc = TestCase(test_case)

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=tc["messages"],
        tools=tc["tools"],
        tool_choice="auto",
        stream=False,
    )
    assert len(response.choices) == 1
    assert len(response.choices[0].message.tool_calls) == 1
    tool_call = response.choices[0].message.tool_calls[0]
    assert tool_call.function.name == tc["tools"][0]["function"]["name"]
    assert "location" in tool_call.function.arguments
    assert tc["expected"]["location"] in tool_call.function.arguments


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:tool_calling",
    ],
)
def test_openai_chat_completion_with_tools_and_streaming(openai_client, text_model_id, test_case):
    tc = TestCase(test_case)

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=tc["messages"],
        tools=tc["tools"],
        tool_choice="auto",
        stream=True,
    )
    # Accumulate tool calls from streaming chunks
    tool_calls = []
    for chunk in response:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            for i, tc_delta in enumerate(chunk.choices[0].delta.tool_calls):
                while len(tool_calls) <= i:
                    tool_calls.append({"function": {"name": "", "arguments": ""}})
                if tc_delta.function and tc_delta.function.name:
                    tool_calls[i]["function"]["name"] = tc_delta.function.name
                if tc_delta.function and tc_delta.function.arguments:
                    tool_calls[i]["function"]["arguments"] += tc_delta.function.arguments
    assert len(tool_calls) == 1
    tool_call = tool_calls[0]
    assert tool_call["function"]["name"] == tc["tools"][0]["function"]["name"]
    assert "location" in tool_call["function"]["arguments"]
    assert tc["expected"]["location"] in tool_call["function"]["arguments"]


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:tool_calling",
    ],
)
def test_openai_chat_completion_with_tool_choice_none(openai_client, text_model_id, test_case):
    tc = TestCase(test_case)

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=tc["messages"],
        tools=tc["tools"],
        tool_choice="none",
        stream=False,
    )
    assert len(response.choices) == 1
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls is None or len(tool_calls) == 0


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:structured_output",
    ],
)
def test_openai_chat_completion_structured_output(openai_client, text_model_id, test_case):
    # Note: Skip condition may need adjustment for OpenAI client
    class AnswerFormat(BaseModel):
        first_name: str
        last_name: str
        year_of_birth: int

    tc = TestCase(test_case)

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=tc["messages"],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "AnswerFormat",
                "schema": AnswerFormat.model_json_schema(),
            },
        },
        stream=False,
    )
    print(response.choices[0].message.content)
    answer = AnswerFormat.model_validate_json(response.choices[0].message.content)
    expected = tc["expected"]
    assert answer.first_name == expected["first_name"]
    assert answer.last_name == expected["last_name"]
    assert answer.year_of_birth == expected["year_of_birth"]