# What does this PR do?

Move around bits. This makes the copies from llama-models _much_ easier to maintain and ensures we don't entangle meta-reference-specific tidbits into llama-models code even by accident.

Also, kills the meta-reference-quantized-gpu distro and rolls quantization deps into meta-reference-gpu.

## Test Plan

```
LLAMA_MODELS_DEBUG=1 \
with-proxy llama stack run meta-reference-gpu \
  --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --env INFERENCE_CHECKPOINT_DIR=<DIR> \
  --env MODEL_PARALLEL_SIZE=4 \
  --env QUANTIZATION_TYPE=fp8_mixed
```

Start a server with and without quantization. Point integration tests to it using:

```
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config http://localhost:8321 --text-model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
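For the "without quantization" run mentioned above, dropping the QUANTIZATION_TYPE env var from the same command should be all that's needed (a sketch, not separately verified):

```
LLAMA_MODELS_DEBUG=1 \
with-proxy llama stack run meta-reference-gpu \
  --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --env INFERENCE_CHECKPOINT_DIR=<DIR> \
  --env MODEL_PARALLEL_SIZE=4
```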
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List, Optional

import vllm

from llama_stack.apis.inference import (
    ChatCompletionRequest,
    GrammarResponseFormat,
    JsonSchemaResponseFormat,
    Message,
    ToolChoice,
    ToolDefinition,
    UserMessage,
)
from llama_stack.models.llama.datatypes import BuiltinTool
from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
)

###############################################################################
# This file contains OpenAI compatibility code that is currently only used
# by the inline vLLM connector. Some or all of this code may be moved to a
# central location at a later date.


def _merge_context_into_content(message: Message) -> Message:  # type: ignore
    """
    Merge the ``context`` field of a Llama Stack ``Message`` object into
    the content field for compatibility with OpenAI-style APIs.

    Generates a content string that emulates the current behavior
    of ``llama_models.llama3.api.chat_format.encode_message()``.

    :param message: Message that may include ``context`` field

    :returns: A version of ``message`` with any context merged into the
        ``content`` field.
    """
    if not isinstance(message, UserMessage):  # Separate type check for linter
        return message
    if message.context is None:
        return message
    return UserMessage(
        role=message.role,
        # Emulate llama_models.llama3.api.chat_format.encode_message()
        content=message.content + "\n\n" + message.context,
        context=None,
    )


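# Illustrative sketch of the merge above (comments only, not executed); field
# names follow the UserMessage class imported at the top of this file:
#
#   msg = UserMessage(
#       content="What is the capital of France?",
#       context="Paris has been the capital of France since 987.",
#   )
#   merged = _merge_context_into_content(msg)
#   # merged.content == "What is the capital of France?\n\n"
#   #                   "Paris has been the capital of France since 987."
#   # merged.context is None

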
def _llama_stack_tools_to_openai_tools(
    tools: Optional[List[ToolDefinition]] = None,
) -> List[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]:
    """
    Convert the list of available tools from Llama Stack's format to vLLM's
    version of OpenAI's format.
    """
    if tools is None:
        return []

    result = []
    for t in tools:
        if isinstance(t.tool_name, BuiltinTool):
            raise NotImplementedError("Built-in tools not yet implemented")
        if t.parameters is None:
            parameters = None
        else:  # if t.parameters is not None
            # Convert the "required" flags to a list of required params
            required_params = [k for k, v in t.parameters.items() if v.required]
            parameters = {
                "type": "object",  # Mystery value that shows up in OpenAI docs
                "properties": {
                    k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items()
                },
                "required": required_params,
            }

        function_def = vllm.entrypoints.openai.protocol.FunctionDefinition(
            name=t.tool_name, description=t.description, parameters=parameters
        )

        # Every tool definition is double-boxed in a ChatCompletionToolsParam
        result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def))
    return result


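# Illustrative sketch of the tool conversion above (comments only, not
# executed). ToolParamDefinition is assumed to be the parameter type carried
# by ToolDefinition; the exact import path may differ:
#
#   tool = ToolDefinition(
#       tool_name="get_weather",
#       description="Fetch the current weather for a city",
#       parameters={
#           "city": ToolParamDefinition(param_type="string", description="City name", required=True),
#       },
#   )
#   [converted] = _llama_stack_tools_to_openai_tools([tool])
#   # converted.function.name == "get_weather"
#   # converted.function.parameters["required"] == ["city"]

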
async def llama_stack_chat_completion_to_openai_chat_completion_dict(
    request: ChatCompletionRequest,
) -> dict:
    """
    Convert a chat completion request in Llama Stack format into an
    equivalent set of arguments to pass to an OpenAI-compatible
    chat completions API.

    :param request: Bundled request parameters in Llama Stack format.

    :returns: Dictionary of key-value pairs to use as an initializer
        for a dataclass or to be converted directly to JSON and sent
        over the wire.
    """

    converted_messages = [
        # This mystery async call makes the parent function also be async
        await convert_message_to_openai_dict(_merge_context_into_content(m), download=True)
        for m in request.messages
    ]
    converted_tools = _llama_stack_tools_to_openai_tools(request.tools)

    # Llama will try to use built-in tools with no tool catalog, so don't enable
    # tool choice unless at least one tool is enabled.
    converted_tool_choice = "none"
    if (
        request.tool_config is not None
        and request.tool_config.tool_choice == ToolChoice.auto
        and request.tools is not None
        and len(request.tools) > 0
    ):
        converted_tool_choice = "auto"

    # TODO: Figure out what to do with the tool_prompt_format argument.
    # Other connectors appear to drop it quietly.

    # Use Llama Stack shared code to translate sampling parameters.
    sampling_options = get_sampling_options(request.sampling_params)

    # get_sampling_options() translates repetition penalties to an option that
    # OpenAI's APIs don't know about.
    # vLLM's OpenAI-compatible API also handles repetition penalties wrong.
    # For now, translate repetition penalties into a format that vLLM's broken
    # API will handle correctly. Two wrongs make a right...
    if "repeat_penalty" in sampling_options:
        del sampling_options["repeat_penalty"]
    if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0:
        sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty

    # Convert a single response format into four different parameters, per
    # the OpenAI spec
    guided_decoding_options = dict()
    if request.response_format is None:
        # Use defaults
        pass
    elif isinstance(request.response_format, JsonSchemaResponseFormat):
        guided_decoding_options["guided_json"] = request.response_format.json_schema
    elif isinstance(request.response_format, GrammarResponseFormat):
        guided_decoding_options["guided_grammar"] = request.response_format.bnf
    else:
        raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'")

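    # Illustrative mapping (comments only, not executed): a response format of
    #   JsonSchemaResponseFormat(json_schema={"type": "object", "properties": {...}})
    # becomes {"guided_json": {"type": "object", "properties": {...}}} above,
    # which vLLM's guided-decoding layer consumes directly.
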
    logprob_options = dict()
    if request.logprobs is not None:
        logprob_options["logprobs"] = request.logprobs.top_k

    # Marshal together all the arguments for a ChatCompletionRequest
    request_options = {
        "model": request.model,
        "messages": converted_messages,
        "tools": converted_tools,
        "tool_choice": converted_tool_choice,
        "stream": request.stream,
        **sampling_options,
        **guided_decoding_options,
        **logprob_options,
    }

    return request_options
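

# Illustrative usage sketch (comments only, not executed); the calling
# connector is assumed to build the ChatCompletionRequest and to forward the
# resulting dict to vLLM's OpenAI-compatible entry point:
#
#   openai_kwargs = await llama_stack_chat_completion_to_openai_chat_completion_dict(request)
#   # openai_kwargs carries "model", "messages", "tools", "tool_choice", and
#   # "stream", plus any sampling, guided-decoding, and logprob options, e.g.
#   # vllm.entrypoints.openai.protocol.ChatCompletionRequest(**openai_kwargs)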