mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-29 15:23:51 +00:00
Address comments
This commit is contained in:
parent
24f64915b6
commit
660983b72d
1 changed files with 24 additions and 14 deletions
|
@ -6,15 +6,14 @@
|
||||||
from typing import AsyncGenerator
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
from llama_models.llama3.api.chat_format import ChatFormat
|
from llama_models.llama3.api.chat_format import ChatFormat
|
||||||
|
|
||||||
from llama_models.llama3.api.datatypes import Message
|
from llama_models.llama3.api.datatypes import Message
|
||||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||||
|
from llama_models.sku_list import all_registered_models
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
from llama_stack.apis.inference import * # noqa: F403
|
from llama_stack.apis.inference import * # noqa: F403
|
||||||
|
|
||||||
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
|
|
||||||
from llama_stack.providers.utils.inference.openai_compat import (
|
from llama_stack.providers.utils.inference.openai_compat import (
|
||||||
get_sampling_options,
|
get_sampling_options,
|
||||||
process_chat_completion_response,
|
process_chat_completion_response,
|
||||||
|
@ -27,19 +26,15 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
|
||||||
from .config import VLLMImplConfig
|
from .config import VLLMImplConfig
|
||||||
|
|
||||||
|
|
||||||
# Reference: https://docs.vllm.ai/en/latest/models/supported_models.html
|
class VLLMInferenceAdapter(Inference):
|
||||||
VLLM_SUPPORTED_MODELS = {
|
model_id: str
|
||||||
"Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class VLLMInferenceAdapter(ModelRegistryHelper, Inference):
|
|
||||||
def __init__(self, config: VLLMImplConfig) -> None:
|
def __init__(self, config: VLLMImplConfig) -> None:
|
||||||
ModelRegistryHelper.__init__(
|
self.huggingface_repo_to_llama_model_id = {
|
||||||
self, stack_to_provider_models_map=VLLM_SUPPORTED_MODELS
|
model.huggingface_repo: model.descriptor()
|
||||||
)
|
for model in all_registered_models()
|
||||||
|
if model.huggingface_repo
|
||||||
|
}
|
||||||
self.config = config
|
self.config = config
|
||||||
self.formatter = ChatFormat(Tokenizer.get_instance())
|
self.formatter = ChatFormat(Tokenizer.get_instance())
|
||||||
|
|
||||||
|
@ -49,6 +44,19 @@ class VLLMInferenceAdapter(ModelRegistryHelper, Inference):
|
||||||
async def shutdown(self) -> None:
|
async def shutdown(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
async def list_models(self) -> List[ModelDef]:
|
||||||
|
repo = self.model_id
|
||||||
|
identifier = self.huggingface_repo_to_llama_model_id[repo]
|
||||||
|
return [
|
||||||
|
ModelDef(
|
||||||
|
identifier=identifier,
|
||||||
|
llama_model=identifier,
|
||||||
|
metadata={
|
||||||
|
"huggingface_repo": repo,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
def completion(
|
def completion(
|
||||||
self,
|
self,
|
||||||
model: str,
|
model: str,
|
||||||
|
@ -99,6 +107,8 @@ class VLLMInferenceAdapter(ModelRegistryHelper, Inference):
|
||||||
) -> AsyncGenerator:
|
) -> AsyncGenerator:
|
||||||
params = self._get_params(request)
|
params = self._get_params(request)
|
||||||
|
|
||||||
|
# TODO: Can we use client.completions.acreate() or maybe there is another way to directly create an async
|
||||||
|
# generator so this wrapper is not necessary?
|
||||||
async def _to_async_generator():
|
async def _to_async_generator():
|
||||||
s = client.completions.create(**params)
|
s = client.completions.create(**params)
|
||||||
for chunk in s:
|
for chunk in s:
|
||||||
|
@ -112,7 +122,7 @@ class VLLMInferenceAdapter(ModelRegistryHelper, Inference):
|
||||||
|
|
||||||
def _get_params(self, request: ChatCompletionRequest) -> dict:
|
def _get_params(self, request: ChatCompletionRequest) -> dict:
|
||||||
return {
|
return {
|
||||||
"model": self.map_to_provider_model(request.model),
|
"model": request.model,
|
||||||
"prompt": chat_completion_request_to_prompt(request, self.formatter),
|
"prompt": chat_completion_request_to_prompt(request, self.formatter),
|
||||||
"stream": request.stream,
|
"stream": request.stream,
|
||||||
**get_sampling_options(request),
|
**get_sampling_options(request),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue