diff --git a/llama_stack/providers/inline/inference/vllm/openai_utils.py b/llama_stack/providers/inline/inference/vllm/openai_utils.py
index a525bdf87..c59261d2c 100644
--- a/llama_stack/providers/inline/inference/vllm/openai_utils.py
+++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py
@@ -7,7 +7,6 @@
 from typing import List, Optional
 
 import vllm
-
 from llama_models.llama3.api.datatypes import BuiltinTool, ToolDefinition
 
 from llama_stack.apis.inference import (
@@ -23,7 +22,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
     get_sampling_options,
 )
 
-
 ###############################################################################
 # This file contains OpenAI compatibility code that is currently only used
 # by the inline vLLM connector. Some or all of this code may be moved to a
@@ -77,8 +75,7 @@ def _llama_stack_tools_to_openai_tools(
         parameters = {
             "type": "object",  # Mystery value that shows up in OpenAI docs
             "properties": {
-                k: {"type": v.param_type, "description": v.description}
-                for k, v in t.parameters.items()
+                k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items()
             },
             "required": required_params,
         }
@@ -88,11 +85,7 @@ def _llama_stack_tools_to_openai_tools(
         )
 
         # Every tool definition is double-boxed in a ChatCompletionToolsParam
-        result.append(
-            vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(
-                function=function_def
-            )
-        )
+        result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def))
 
     return result
 
@@ -113,9 +106,7 @@ async def llama_stack_chat_completion_to_openai_chat_completion_dict(
 
     converted_messages = [
         # This mystery async call makes the parent function also be async
-        await convert_message_to_openai_dict(
-            _merge_context_into_content(m), download=True
-        )
+        await convert_message_to_openai_dict(_merge_context_into_content(m), download=True)
         for m in request.messages
     ]
     converted_tools = _llama_stack_tools_to_openai_tools(request.tools)
@@ -123,11 +114,7 @@ async def llama_stack_chat_completion_to_openai_chat_completion_dict(
     # Llama will try to use built-in tools with no tool catalog, so don't enable
    # tool choice unless at least one tool is enabled.
     converted_tool_choice = "none"
-    if (
-        request.tool_choice == ToolChoice.auto
-        and request.tools is not None
-        and len(request.tools) > 0
-    ):
+    if request.tool_choice == ToolChoice.auto and request.tools is not None and len(request.tools) > 0:
         converted_tool_choice = "auto"
 
     # TODO: Figure out what to do with the tool_prompt_format argument.
@@ -143,13 +130,8 @@ async def llama_stack_chat_completion_to_openai_chat_completion_dict(
     # API will handle correctly. Two wrongs make a right...
     if "repeat_penalty" in sampling_options:
         del sampling_options["repeat_penalty"]
-    if (
-        request.sampling_params.repetition_penalty is not None
-        and request.sampling_params.repetition_penalty != 1.0
-    ):
-        sampling_options["repetition_penalty"] = (
-            request.sampling_params.repetition_penalty
-        )
+    if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0:
+        sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty
 
     # Convert a single response format into four different parameters, per
     # the OpenAI spec
@@ -162,10 +144,7 @@ async def llama_stack_chat_completion_to_openai_chat_completion_dict(
     elif isinstance(request.response_format, GrammarResponseFormat):
         guided_decoding_options["guided_grammar"] = request.response_format.bnf
     else:
-        raise TypeError(
-            f"ResponseFormat object is of unexpected "
-            f"subtype '{type(request.response_format)}'"
-        )
+        raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'")
 
     logprob_options = dict()
     if request.logprobs is not None:
diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 2c6bcbf82..639728278 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -20,14 +20,13 @@ from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import (
     SamplingParams,
     StopReason,
+    ToolCall,
     ToolDefinition,
     ToolPromptFormat,
     TopKSamplingStrategy,
     TopPSamplingStrategy,
 )
 from llama_models.llama3.api.tokenizer import Tokenizer
-
-# We deep-import the names that don't conflict with Llama Stack names
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -35,6 +34,7 @@ from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingM
 
 from llama_stack.apis.common.content_types import (
     InterleavedContent,
+    InterleavedContentItem,
     TextDelta,
     ToolCallDelta,
 )
@@ -54,9 +54,10 @@ from llama_stack.apis.inference import (
     LogProbConfig,
     Message,
     ResponseFormat,
+    TextTruncation,
     TokenLogProbs,
-    ToolCall,
     ToolChoice,
+    ToolConfig,
 )
 from llama_stack.apis.models import Model
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
@@ -254,7 +255,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         logger.debug(f"In register_model({model})")
 
         # First attempt to interpret the model coordinates as a Llama model name
-        resolved_llama_model = resolve_model(model.provider_model_id)
+        resolved_llama_model = llama_models.sku_list.resolve_model(model.provider_model_id)
         if resolved_llama_model is not None:
             # Load from Hugging Face repo into default local cache dir
             model_id_for_vllm = resolved_llama_model.huggingface_repo
@@ -277,16 +278,12 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             else:
                 # Model already loaded
                 logger.info(
-                    f"Requested id {model} resolves to {model_id_for_vllm}, "
-                    f"which is already loaded. Continuing."
+                    f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing."
                 )
                 self.model_ids.add(model.model_id)
                 return model
 
-        logger.info(
-            f"Requested id {model} resolves to {model_id_for_vllm}. Loading "
-            f"{model_id_for_vllm}."
-        )
+        logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.")
         if is_meta_llama_model:
             logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.")
         self.is_meta_llama_model = is_meta_llama_model
@@ -425,7 +422,8 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         tool_prompt_format: Optional[ToolPromptFormat] = None,
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        tool_config: Optional[ToolConfig] = None,
+    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
         if model_id not in self.model_ids:
             raise ValueError(
                 f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index cdce5510d..8a15ff016 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -15,11 +15,12 @@ providers:
   - provider_id: vllm
     provider_type: inline::vllm
     config:
-      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
       tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
       max_tokens: ${env.MAX_TOKENS:4096}
+      max_model_len: ${env.MAX_MODEL_LEN:4096}
+      max_num_seqs: ${env.MAX_NUM_SEQS:4}
       enforce_eager: ${env.ENFORCE_EAGER:False}
-      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.3}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
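
Note on the vllm-gpu template change above: a minimal sketch, assuming the template's config keys map onto the vLLM engine arguments of the same names, of what the new defaults amount to when handed to vLLM's AsyncEngineArgs (which vllm.py imports). The model id below is a placeholder for illustration only; this patch removes the hard-coded model from the template, and the provider resolves the real id when a model is registered.

    from vllm.engine.arg_utils import AsyncEngineArgs

    # Placeholder model id; the inline provider resolves the actual Hugging Face
    # repo from the registered model id at register_model() time.
    engine_args = AsyncEngineArgs(
        model="meta-llama/Llama-3.2-3B-Instruct",
        tensor_parallel_size=1,       # ${env.TENSOR_PARALLEL_SIZE:1}
        max_model_len=4096,           # ${env.MAX_MODEL_LEN:4096}
        max_num_seqs=4,               # ${env.MAX_NUM_SEQS:4}
        enforce_eager=False,          # ${env.ENFORCE_EAGER:False}
        gpu_memory_utilization=0.3,   # ${env.GPU_MEMORY_UTILIZATION:0.3}
    )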