Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-12 04:50:39 +00:00
Update llama_stack/providers/inline/inference/vllm/config.py
Co-authored-by: Sébastien Han <seb@redhat.com>
parent bb024daf21
commit 10920cc0f5
1 changed file with 1 addition and 1 deletion
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -26,7 +26,7 @@ class VLLMConfig(BaseModel):
         description="Maximum number of tokens to generate.",
     )
     max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
-    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation")
+    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
     enforce_eager: bool = Field(
         default=False,
         description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
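
For context, a minimal sketch of how the VLLMConfig fields touched by this hunk fit together and how the config instantiates with its defaults. Only the defaults and description strings visible in the diff are taken from the source; the name and default of the first field (max_tokens) fall outside the hunk's context and are assumptions for illustration.

from pydantic import BaseModel, Field


class VLLMConfig(BaseModel):
    # Field name and default assumed for illustration; only its
    # description line appears in the diff context above.
    max_tokens: int = Field(
        default=4096,
        description="Maximum number of tokens to generate.",
    )
    # The remaining fields are reproduced from the hunk.
    max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
    enforce_eager: bool = Field(
        default=False,
        description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
    )


# With no overrides, the defaults apply.
cfg = VLLMConfig()
print(cfg.max_num_seqs)   # 4
print(cfg.enforce_eager)  # False

The commit itself only adds the trailing period to the max_num_seqs description, making the field descriptions consistently punctuated.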