feat: Support "stop" parameter in remote:vLLM

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Yuan Tang 2025-03-19 22:41:34 -04:00
parent f369871083
commit a1da09e166
4 changed files with 34 additions and 0 deletions


@@ -201,6 +201,7 @@ class SamplingParams(BaseModel):
     max_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
+    stop: Optional[List[str]] = None

 class CheckpointQuantizationFormat(Enum):
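
For context, a minimal sketch of what the new field enables on the caller side; the simplified model below stands in for the full SamplingParams class and is not part of the diff:

```python
from typing import List, Optional

from pydantic import BaseModel


# Simplified stand-in for the SamplingParams model above (illustration only).
class SamplingParams(BaseModel):
    max_tokens: Optional[int] = 0
    repetition_penalty: Optional[float] = 1.0
    stop: Optional[List[str]] = None


# Generation halts before any of these strings would be emitted.
params = SamplingParams(max_tokens=50, stop=["1963", "\n\n"])
print(params.stop)  # ['1963', '\n\n']
```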


@@ -147,6 +147,9 @@ def get_sampling_options(params: SamplingParams) -> dict:
         if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
             options["repeat_penalty"] = params.repetition_penalty

+        if params.stop is not None:
+            options["stop"] = params.stop
+
     return options
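
To illustrate where these options end up, here is a hedged sketch of forwarding the resulting dict to a vLLM server's OpenAI-compatible completions endpoint; the endpoint URL and model name are placeholders, and the call uses the standard openai client rather than this repository's adapter:

```python
# Illustration only: not code from this commit.
from openai import OpenAI

# A locally running vLLM server exposes an OpenAI-compatible API.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

options = {"max_tokens": 50, "stop": ["1963"]}  # as built by get_sampling_options
response = client.completions.create(
    model="placeholder-model",
    prompt="Michael Jordan was born in",
    max_tokens=options["max_tokens"],
    stop=options["stop"],  # the pass-through this commit adds
)
print(response.choices[0].text)
```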


@@ -99,6 +99,30 @@ def test_text_completion_streaming(client_with_models, text_model_id, test_case):
     assert len(content_str) > 10


+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:stop_sequence",
+    ],
+)
+def test_text_completion_stop_sequence(client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+    response = client_with_models.inference.completion(
+        content=tc["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params={
+            "max_tokens": 50,
+            "stop": ["1963"],
+        },
+    )
+    streamed_content = [chunk.delta for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert "1963" not in content_str
+
+
 @pytest.mark.parametrize(
     "test_case",
     [

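The streaming test above drains the chunk deltas before asserting; for comparison, a non-streaming variant might look like the sketch below (a hypothetical addition, assuming the same fixtures and that the non-streaming response exposes the full text as .content):

```python
def test_text_completion_stop_sequence_non_streaming(client_with_models, text_model_id):
    # Hypothetical variant, not part of this commit: same stop-sequence check
    # without streaming, reading the full completion from response.content.
    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
    response = client_with_models.inference.completion(
        content="Return the exact same sentence: Michael Jordan was born in 1963",
        stream=False,
        model_id=text_model_id,
        sampling_params={
            "max_tokens": 50,
            "stop": ["1963"],
        },
    )
    assert "1963" not in response.content.lower()
```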

@@ -10,6 +10,12 @@
             "expected": "1963"
         }
     },
+    "stop_sequence": {
+        "data": {
+            "content": "Return the exact same sentence: Michael Jordan was born in 1963",
+            "expected": "Michael Jordan was born in"
+        }
+    },
     "streaming": {
         "data": {
             "content": "Roses are red,"