forked from phoenix-oss/llama-stack-mirror
feat: Support "stop" parameter in remote:vLLM (#1715)
# What does this PR do? This adds support for "stop" parameter: https://platform.openai.com/docs/api-reference/completions/create#completions-create-stop ## Test Plan ``` tests/integration/inference/test_text_inference.py::test_text_completion_non_streaming[txt=8B-inference:completion:sanity] PASSED [ 5%] tests/integration/inference/test_text_inference.py::test_text_completion_streaming[txt=8B-inference:completion:sanity] PASSED [ 11%] tests/integration/inference/test_text_inference.py::test_text_completion_stop_sequence[txt=8B-inference:completion:stop_sequence] PASSED [ 16%] tests/integration/inference/test_text_inference.py::test_text_completion_log_probs_non_streaming[txt=8B-inference:completion:log_probs] PASSED [ 22%] tests/integration/inference/test_text_inference.py::test_text_completion_log_probs_streaming[txt=8B-inference:completion:log_probs] PASSED [ 27%] tests/integration/inference/test_text_inference.py::test_text_completion_structured_output[txt=8B-inference:completion:structured_output] PASSED [ 33%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_non_streaming[txt=8B-inference:chat_completion:non_streaming_01] PASSED [ 38%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_non_streaming[txt=8B-inference:chat_completion:non_streaming_02] PASSED [ 44%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling[txt=8B-inference:chat_completion:ttft] ^TPASSED [ 50%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_streaming[txt=8B-inference:chat_completion:streaming_01] PASSED [ 55%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_streaming[txt=8B-inference:chat_completion:streaming_02] PASSED [ 61%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_with_tool_calling_and_non_streaming[txt=8B-inference:chat_completion:tool_calling] PASSED [ 66%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_with_tool_calling_and_streaming[txt=8B-inference:chat_completion:tool_calling] PASSED [ 72%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_with_tool_choice_required[txt=8B-inference:chat_completion:tool_calling] PASSED [ 77%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_with_tool_choice_none[txt=8B-inference:chat_completion:tool_calling] PASSED [ 83%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_structured_output[txt=8B-inference:chat_completion:structured_output] PASSED [ 88%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_tool_calling_tools_not_in_request[txt=8B-inference:chat_completion:tool_calling_tools_absent-True] PASSED [ 94%] tests/integration/inference/test_text_inference.py::test_text_chat_completion_tool_calling_tools_not_in_request[txt=8B-inference:chat_completion:tool_calling_tools_absent-False] PASSED [100%] =============================================================== 18 passed, 3 warnings in 755.79s (0:12:35) =============================================================== ``` --------- Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
This commit is contained in:
parent
9ff82036f7
commit
441016bee8
6 changed files with 79 additions and 4 deletions
19
docs/_static/llama-stack-spec.html
vendored
19
docs/_static/llama-stack-spec.html
vendored
|
@ -4053,22 +4053,33 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"strategy": {
|
||||
"$ref": "#/components/schemas/SamplingStrategy"
|
||||
"$ref": "#/components/schemas/SamplingStrategy",
|
||||
"description": "The sampling strategy."
|
||||
},
|
||||
"max_tokens": {
|
||||
"type": "integer",
|
||||
"default": 0
|
||||
"default": 0,
|
||||
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
|
||||
},
|
||||
"repetition_penalty": {
|
||||
"type": "number",
|
||||
"default": 1.0
|
||||
"default": 1.0,
|
||||
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
|
||||
},
|
||||
"stop": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"strategy"
|
||||
],
|
||||
"title": "SamplingParams"
|
||||
"title": "SamplingParams",
|
||||
"description": "Sampling parameters."
|
||||
},
|
||||
"SamplingStrategy": {
|
||||
"oneOf": [
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue