From 10920cc0f57028404ccbdf5ba4f7d58aedfec6cc Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Sat, 15 Feb 2025 17:37:28 -0800
Subject: [PATCH] Update llama_stack/providers/inline/inference/vllm/config.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sébastien Han
---
 llama_stack/providers/inline/inference/vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py
index f362f7e2f..0e85c9a48 100644
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -26,7 +26,7 @@ class VLLMConfig(BaseModel):
         description="Maximum number of tokens to generate.",
     )
     max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
-    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation")
+    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
     enforce_eager: bool = Field(
         default=False,
         description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
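
Note: the patch only adds a trailing period to the max_num_seqs field description. As a minimal sketch of where that description surfaces (assuming Pydantic v2; the VLLMConfig below is trimmed to the fields visible in the hunk, not the full class):

    from pydantic import BaseModel, Field

    # Trimmed illustration: only the fields shown in the diff hunk are kept here;
    # the real VLLMConfig in llama_stack defines additional fields.
    class VLLMConfig(BaseModel):
        max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
        max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
        enforce_eager: bool = Field(
            default=False,
            description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
        )

    # The corrected description is what appears in field metadata and generated schemas/docs:
    print(VLLMConfig.model_fields["max_num_seqs"].description)
    # -> Maximum parallel batch size for generation.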