Update llama_stack/providers/inline/inference/vllm/config.py

Co-authored-by: Sébastien Han <seb@redhat.com>
commit 10920cc0f5 (parent bb024daf21)
Author: Fred Reiss, 2025-02-15 17:37:28 -08:00 (committed by Ashwin Bharambe)


@@ -26,7 +26,7 @@ class VLLMConfig(BaseModel):
         description="Maximum number of tokens to generate.",
     )
     max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
-    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation")
+    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
     enforce_eager: bool = Field(
         default=False,
         description="Whether to use eager mode for inference (otherwise cuda graphs are used).",