Make vLLM inference better

Tests still don't pass completely (some hang), so there may be threading issues.
2025-12-03 18:00:36 +00:00 · 2024-10-24 22:30:49 -07:00 · 2024-10-24 22:30:49 -07:00 · 70d59b0f5d
commit 70d59b0f5d
parent cb43caa2c3
2 changed files with 84 additions and 84 deletions
--- a/llama_stack/providers/impls/vllm/config.py
+++ b/llama_stack/providers/impls/vllm/config.py
@ -15,13 +15,24 @@ class VLLMConfig(BaseModel):
    """Configuration for the vLLM inference provider."""
    model: str = Field(
-        default="Llama3.1-8B-Instruct",
+        default="Llama3.2-3B-Instruct",
        description="Model descriptor from `llama model list`",
    )
    tensor_parallel_size: int = Field(
        default=1,
        description="Number of tensor parallel replicas (number of GPUs to use).",
    )
    max_tokens: int = Field(
        default=4096,
        description="Maximum number of tokens to generate.",
    )
    enforce_eager: bool = Field(
        default=False,
        description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
    )
    gpu_memory_utilization: float = Field(
        default=0.3,
    )
    @field_validator("model")
    @classmethod
--- a/llama_stack/providers/impls/vllm/vllm.py
+++ b/llama_stack/providers/impls/vllm/vllm.py
@ -7,11 +7,12 @@
 import logging
 import os
 import uuid
-from typing import Any, AsyncGenerator
+from typing import AsyncGenerator, Optional
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import *  # noqa: F403
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.sku_list import resolve_model
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams
 from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
@ -40,74 +41,15 @@ def _random_uuid() -> str:
    return str(uuid.uuid4().hex)
-def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams:
+class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
    """Convert sampling params to vLLM sampling params."""
    if sampling_params is None:
        return VLLMSamplingParams()
    # TODO convert what I saw in my first test ... but surely there's more to do here
    kwargs = {
        "temperature": sampling_params.temperature,
    }
    if sampling_params.top_k >= 1:
        kwargs["top_k"] = sampling_params.top_k
    if sampling_params.top_p:
        kwargs["top_p"] = sampling_params.top_p
    if sampling_params.max_tokens >= 1:
        kwargs["max_tokens"] = sampling_params.max_tokens
    if sampling_params.repetition_penalty > 0:
        kwargs["repetition_penalty"] = sampling_params.repetition_penalty
    return VLLMSamplingParams(**kwargs)
 class VLLMInferenceImpl(ModelRegistryHelper, Inference):
    """Inference implementation for vLLM."""
    HF_MODEL_MAPPINGS = {
        # TODO: seems like we should be able to build this table dynamically ...
        "Llama3.1-8B": "meta-llama/Llama-3.1-8B",
        "Llama3.1-70B": "meta-llama/Llama-3.1-70B",
        "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
        "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
        "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
        "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
        "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
        "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
        "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
        "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
        "Llama3.2-1B": "meta-llama/Llama-3.2-1B",
        "Llama3.2-3B": "meta-llama/Llama-3.2-3B",
        "Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
        "Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
        "Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
        "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
        "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
        "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
        "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
        "Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
        "Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
        "Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
        "Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
        "Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
        "Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
    }
    def __init__(self, config: VLLMConfig):
        Inference.__init__(self)
        ModelRegistryHelper.__init__(
            self,
            stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
        )
        self.config = config
        self.engine = None
-
+        self.formatter = ChatFormat(Tokenizer.get_instance())
        tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(tokenizer)
    async def initialize(self):
        """Initialize the vLLM inference adapter."""
        log.info("Initializing vLLM inference adapter")
        # Disable usage stats reporting. This would be a surprising thing for most
@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
        if "VLLM_NO_USAGE_STATS" not in os.environ:
            os.environ["VLLM_NO_USAGE_STATS"] = "1"
-        hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model)
+        model = resolve_model(self.config.model)
        if model is None:
            raise ValueError(f"Unknown model {self.config.model}")
        if model.huggingface_repo is None:
            raise ValueError(f"Model {self.config.model} needs a huggingface repo")
        # TODO -- there are a ton of options supported here ...
-        engine_args = AsyncEngineArgs()
+        engine_args = AsyncEngineArgs(
-        engine_args.model = hf_model
+            model=model.huggingface_repo,
-        # We will need a new config item for this in the future if model support is more broad
+            tokenizer=model.huggingface_repo,
-        # than it is today (llama only)
+            tensor_parallel_size=self.config.tensor_parallel_size,
-        engine_args.tokenizer = hf_model
+            enforce_eager=self.config.enforce_eager,
-        engine_args.tensor_parallel_size = self.config.tensor_parallel_size
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
            guided_decoding_backend="lm-format-enforcer",
        )
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
        if self.engine:
            self.engine.shutdown_background_loop()
    async def register_model(self, model: ModelDef) -> None:
        """Reject any attempt to register a model at runtime.

        Raises:
            ValueError: always; this provider does not accept new models
                once it is running.
        """
        message = "You cannot dynamically add a model to a running vllm instance"
        raise ValueError(message)
    async def list_models(self) -> List[ModelDef]:
        """Return a one-element list describing the model named in the config.

        Both the identifier and the llama_model of the returned ModelDef are
        taken from ``self.config.model``.
        """
        configured = self.config.model
        served = ModelDef(identifier=configured, llama_model=configured)
        return [served]
    def _sampling_params(
        self, sampling_params: Optional[SamplingParams]
    ) -> VLLMSamplingParams:
        """Translate request sampling params into vLLM sampling params.

        Args:
            sampling_params: Request-level sampling settings, or None to use
                only the provider's configured token limit.

        Returns:
            A VLLMSamplingParams whose ``max_tokens`` falls back to
            ``self.config.max_tokens`` when the request does not set one.
        """
        if sampling_params is None:
            return VLLMSamplingParams(max_tokens=self.config.max_tokens)

        # TODO convert what I saw in my first test ... but surely there's more to do here
        kwargs = {
            "temperature": sampling_params.temperature,
            "max_tokens": self.config.max_tokens,
        }
        # Falsy values (None or 0) are treated as "not set", so the
        # corresponding keyword is simply not forwarded.
        if sampling_params.top_k:
            kwargs["top_k"] = sampling_params.top_k
        if sampling_params.top_p:
            kwargs["top_p"] = sampling_params.top_p
        if sampling_params.max_tokens:
            kwargs["max_tokens"] = sampling_params.max_tokens

        # Guard against None before comparing: `None > 0` raises TypeError.
        # NOTE(review): the field appears to be declared Optional upstream --
        # confirm against the llama_models SamplingParams definition.
        repetition_penalty = sampling_params.repetition_penalty
        if repetition_penalty is not None and repetition_penalty > 0:
            kwargs["repetition_penalty"] = repetition_penalty

        return VLLMSamplingParams(**kwargs)
    async def completion(
        self,
        model: str,
        content: InterleavedTextMedia,
-        sampling_params: Any | None = ...,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        stream: bool | None = False,
+        response_format: Optional[ResponseFormat] = None,
-        logprobs: LogProbConfig | None = None,
+        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> CompletionResponse | CompletionResponseStreamChunk:
        log.info("vLLM completion")
        messages = [UserMessage(content=content)]
@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
    async def chat_completion(
        self,
        model: str,
-        messages: list[Message],
+        messages: List[Message],
-        sampling_params: Any | None = ...,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: list[ToolDefinition] | None = ...,
+        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: ToolChoice | None = ...,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = ...,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: bool | None = False,
+        response_format: Optional[ResponseFormat] = None,
-        logprobs: LogProbConfig | None = None,
+        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
        log.info("vLLM chat completion")
@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
        request_id = _random_uuid()
        prompt = chat_completion_request_to_prompt(request, self.formatter)
-        vllm_sampling_params = _vllm_sampling_params(request.sampling_params)
+        vllm_sampling_params = self._sampling_params(request.sampling_params)
        results_generator = self.engine.generate(
            prompt, vllm_sampling_params, request_id
        )
@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
        self, request: ChatCompletionRequest, results_generator: AsyncGenerator
    ) -> AsyncGenerator:
        async def _generate_and_convert_to_openai_compat():
            cur = []
            async for chunk in results_generator:
                if not chunk.outputs:
                    log.warning("Empty chunk received")
                    continue
-                text = "".join([output.text for output in chunk.outputs])
+                output = chunk.outputs[-1]
                new_tokens = output.token_ids[len(cur) :]
                text = self.formatter.tokenizer.decode(new_tokens)
                cur.extend(new_tokens)
                choice = OpenAICompatCompletionChoice(
-                    finish_reason=chunk.outputs[-1].stop_reason,
+                    finish_reason=output.finish_reason,
                    text=text,
                )
                yield OpenAICompatCompletionResponse(