Mirror of https://github.com/meta-llama/llama-stack.git
Synced 2025-07-28 15:02:37 +00:00
Working
This commit is contained in:
parent cdadf0f87d
commit 7bbce6394a
2 changed files with 10 additions and 3 deletions
@@ -10,7 +10,6 @@ from llama_models.schema_utils import json_schema_type

 from pydantic import BaseModel, Field


 # TODO: Any other engine configs
 @json_schema_type
 class VLLMImplConfig(BaseModel):
     url: Optional[str] = Field(
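The hunk above truncates the Field(...) call on the config's url attribute. As a minimal sketch of the surrounding class, assuming a None default and a descriptive string (both hypothetical; neither appears in this diff):

from typing import Optional

from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field


@json_schema_type
class VLLMImplConfig(BaseModel):
    url: Optional[str] = Field(
        default=None,  # assumed default; the diff cuts off the Field(...) call
        description="URL of a running vLLM server",  # hypothetical wording
    )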
@@ -29,7 +29,8 @@ from .config import VLLMImplConfig

 # Reference: https://docs.vllm.ai/en/latest/models/supported_models.html
 VLLM_SUPPORTED_MODELS = {
-    "Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3-70B-Instruct",
     "Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
 }
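For context on how this table is used: the adapter class (see the next hunk) inherits ModelRegistryHelper, which maps the Llama Stack alias on the left to the Hugging Face repo id that vLLM serves on the right. A minimal sketch of that lookup, with resolve_hf_repo as a hypothetical stand-in for the helper's logic:

VLLM_SUPPORTED_MODELS = {
    "Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
}


def resolve_hf_repo(alias: str) -> str:
    # Hypothetical helper: ModelRegistryHelper performs this lookup in the
    # real adapter; reduced here to a plain dict access for illustration.
    try:
        return VLLM_SUPPORTED_MODELS[alias]
    except KeyError:
        raise ValueError(f"Unknown model alias: {alias}") from None


assert resolve_hf_repo("Llama3.1-70B-Instruct") == "meta-llama/Meta-Llama-3.1-70B-Instruct"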
@@ -48,7 +49,14 @@ class VLLMInferenceAdapter(ModelRegistryHelper, Inference):

     async def shutdown(self) -> None:
         pass

-    def completion(self, request: CompletionRequest) -> AsyncGenerator:
+    def completion(
+        self,
+        model: str,
+        content: InterleavedTextMedia,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
         raise NotImplementedError()

     def chat_completion(
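This hunk replaces the single CompletionRequest argument with the flattened request fields used elsewhere in the inference API. A hedged caller-side sketch of the new shape; adapter and SamplingParams are assumed to come from the llama-stack inference API, and since the body still raises NotImplementedError in this commit, the call only illustrates the signature:

# Hypothetical call site; nothing below actually runs inference yet.
response = adapter.completion(
    model="Llama3.1-8B-Instruct",
    content="Write a haiku about GPUs.",
    sampling_params=SamplingParams(),  # defaults, per the new signature
    stream=False,
    logprobs=None,
)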