From 59211067d1085cb60d92b11955b713e97e6c4623 Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Sat, 25 Jan 2025 19:00:08 -0800
Subject: [PATCH] Remove unneeded config parameter

---
 .../providers/inline/inference/vllm/config.py | 34 +++++--------------
 1 file changed, 8 insertions(+), 26 deletions(-)

diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py
index 5921b132d..f362f7e2f 100644
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -4,20 +4,19 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field
 
-from llama_stack.providers.utils.inference import supported_inference_models
 from llama_stack.schema_utils import json_schema_type
 
 
 @json_schema_type
 class VLLMConfig(BaseModel):
-    """Configuration for the vLLM inference provider."""
+    """Configuration for the vLLM inference provider.
+
+    Note that the model name is no longer part of this static configuration.
+    You can bind an instance of this provider to a specific model with the
+    ``models.register()`` API call."""
 
-    model: str = Field(
-        default="Llama3.2-3B-Instruct",
-        description="Model descriptor from `llama model list`",
-    )
     tensor_parallel_size: int = Field(
         default=1,
         description="Number of tensor parallel replicas (number of GPUs to use).",
@@ -26,12 +25,8 @@ class VLLMConfig(BaseModel):
         default=4096,
         description="Maximum number of tokens to generate.",
     )
-    max_model_len: int = Field(
-        default=4096, description="Maximum context length to use during serving."
-    )
-    max_num_seqs: int = Field(
-        default=4, description="Maximum parallel batch size for generation"
-    )
+    max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
+    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation")
     enforce_eager: bool = Field(
         default=False,
         description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
@@ -47,7 +42,6 @@ class VLLMConfig(BaseModel):
     @classmethod
     def sample_run_config(cls):
         return {
-            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
             "max_tokens": "${env.MAX_TOKENS:4096}",
             "max_model_len": "${env.MAX_MODEL_LEN:4096}",
@@ -55,15 +49,3 @@ class VLLMConfig(BaseModel):
             "enforce_eager": "${env.ENFORCE_EAGER:False}",
             "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.3}",
         }
-
-    @field_validator("model")
-    @classmethod
-    def validate_model(cls, model: str) -> str:
-        permitted_models = supported_inference_models()
-
-        descriptors = [m.descriptor() for m in permitted_models]
-        repos = [m.huggingface_repo for m in permitted_models]
-        if model not in (descriptors + repos):
-            model_list = "\n\t".join(repos)
-            raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]")
-        return model
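
Usage note (not part of the patch): with the static `model` field gone, the inline vLLM
provider is bound to a model at runtime via the models API mentioned in the new docstring.
The sketch below shows one way that registration might look from the llama-stack-client
Python SDK; the base URL, the provider id ("vllm"), the model id, and the exact keyword
names are assumptions and may differ in your deployment or SDK version.

    from llama_stack_client import LlamaStackClient

    # Connect to a running Llama Stack server (URL assumed).
    client = LlamaStackClient(base_url="http://localhost:8321")

    # Bind the inline vLLM provider instance to a concrete model at runtime.
    # This replaces the `model` field that this patch removes from VLLMConfig.
    client.models.register(
        model_id="meta-llama/Llama-3.2-3B-Instruct",  # assumed model identifier
        provider_id="vllm",  # provider id as assigned in your run.yaml (assumed)
    )

    # The registered model can then be used for inference as usual.
    response = client.inference.chat_completion(
        model_id="meta-llama/Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response)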