# What does this PR do?

Fixes an error when running a provider that uses `openai_compat.py`:

```python
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/llamastack-vllm/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/ubuntu/miniconda3/envs/llamastack-vllm/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/distribution/server/server.py", line 426, in <module>
    main()
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/distribution/server/server.py", line 349, in main
    impls = asyncio.run(construct_stack(config))
  File "/home/ubuntu/miniconda3/envs/llamastack-vllm/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/home/ubuntu/miniconda3/envs/llamastack-vllm/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/distribution/stack.py", line 207, in construct_stack
    impls = await resolve_impls(
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/distribution/resolver.py", line 239, in resolve_impls
    impl = await instantiate_provider(
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/distribution/resolver.py", line 330, in instantiate_provider
    impl = await fn(*args)
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/providers/remote/inference/vllm/__init__.py", line 11, in get_adapter_impl
    from .vllm import VLLMInferenceAdapter
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/providers/remote/inference/vllm/vllm.py", line 39, in <module>
    from llama_stack.providers.utils.inference.openai_compat import (
  File "/home/ubuntu/us-south-2/llama-stack/llama_stack/providers/utils/inference/openai_compat.py", line 11, in <module>
    from llama_models.llama3.api.datatypes import (
ImportError: cannot import name 'GreedySamplingStrategy' from 'llama_models.llama3.api.datatypes' (/home/ubuntu/miniconda3/envs/llamastack-vllm/lib/python3.10/site-packages/llama_models/llama3/api/datatypes.py)
++ error_handler 61
++ echo 'Error occurred in script at line: 61'
Error occurred in script at line: 61
++ exit 1
```

## Test Plan

```bash
conda create --name llamastack-vllm python=3.10
conda activate llamastack-vllm

# To sync with the current llama-models repo
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models

export INFERENCE_MODEL=unsloth/Llama-3.3-70B-Instruct-bnb-4bit && \
pip install -e . && \
llama stack build --template remote-vllm --image-type conda && \
llama stack run ./distributions/remote-vllm/run.yaml \
  --port 5000 \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://localhost:8000
```

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
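For quick reference, the change that resolves the `ImportError` is visible at the top of the file below: the sampling datatypes are now imported from `llama_models.datatypes` rather than `llama_models.llama3.api.datatypes`, which (per the traceback above) no longer exports `GreedySamplingStrategy` in the current llama-models repo. A minimal sketch of the corrected import, with the module path and symbol list taken from the file below:

```python
# Corrected import in llama_stack/providers/utils/inference/openai_compat.py
from llama_models.datatypes import (
    GreedySamplingStrategy,
    SamplingParams,
    TopKSamplingStrategy,
    TopPSamplingStrategy,
)
```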
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import AsyncGenerator, Dict, List, Optional

from llama_models.datatypes import (
    GreedySamplingStrategy,
    SamplingParams,
    TopKSamplingStrategy,
    TopPSamplingStrategy,
)

from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import StopReason
from pydantic import BaseModel

from llama_stack.apis.common.content_types import (
    ImageContentItem,
    TextContentItem,
    TextDelta,
    ToolCallDelta,
    ToolCallParseStatus,
)

from llama_stack.apis.inference import (
    ChatCompletionResponse,
    ChatCompletionResponseEvent,
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    CompletionMessage,
    CompletionResponse,
    CompletionResponseStreamChunk,
    Message,
    TokenLogProbs,
)

from llama_stack.providers.utils.inference.prompt_adapter import (
    convert_image_content_to_url,
)


class OpenAICompatCompletionChoiceDelta(BaseModel):
    content: str


class OpenAICompatLogprobs(BaseModel):
    text_offset: Optional[List[int]] = None

    token_logprobs: Optional[List[float]] = None

    tokens: Optional[List[str]] = None

    top_logprobs: Optional[List[Dict[str, float]]] = None


class OpenAICompatCompletionChoice(BaseModel):
    finish_reason: Optional[str] = None
    text: Optional[str] = None
    delta: Optional[OpenAICompatCompletionChoiceDelta] = None
    logprobs: Optional[OpenAICompatLogprobs] = None


class OpenAICompatCompletionResponse(BaseModel):
    choices: List[OpenAICompatCompletionChoice]


def get_sampling_strategy_options(params: SamplingParams) -> dict:
    options = {}
    if isinstance(params.strategy, GreedySamplingStrategy):
        options["temperature"] = 0.0
    elif isinstance(params.strategy, TopPSamplingStrategy):
        options["temperature"] = params.strategy.temperature
        options["top_p"] = params.strategy.top_p
    elif isinstance(params.strategy, TopKSamplingStrategy):
        options["top_k"] = params.strategy.top_k
    else:
        raise ValueError(f"Unsupported sampling strategy: {params.strategy}")

    return options


def get_sampling_options(params: SamplingParams) -> dict:
    options = {}
    if params:
        options.update(get_sampling_strategy_options(params))
        if params.max_tokens:
            options["max_tokens"] = params.max_tokens

        if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
            options["repeat_penalty"] = params.repetition_penalty

    return options


def text_from_choice(choice) -> str:
    if hasattr(choice, "delta") and choice.delta:
        return choice.delta.content

    if hasattr(choice, "message"):
        return choice.message.content

    return choice.text


def get_stop_reason(finish_reason: str) -> StopReason:
    if finish_reason in ["stop", "eos"]:
        return StopReason.end_of_turn
    elif finish_reason == "eom":
        return StopReason.end_of_message
    elif finish_reason == "length":
        return StopReason.out_of_tokens

    return StopReason.out_of_tokens


def convert_openai_completion_logprobs(
    logprobs: Optional[OpenAICompatLogprobs],
) -> Optional[List[TokenLogProbs]]:
    if not logprobs:
        return None
    return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]


def process_completion_response(
    response: OpenAICompatCompletionResponse, formatter: ChatFormat
) -> CompletionResponse:
    choice = response.choices[0]
    # drop suffix <eot_id> if present and return stop reason as end of turn
    if choice.text.endswith("<|eot_id|>"):
        return CompletionResponse(
            stop_reason=StopReason.end_of_turn,
            content=choice.text[: -len("<|eot_id|>")],
            logprobs=convert_openai_completion_logprobs(choice.logprobs),
        )
    # drop suffix <eom_id> if present and return stop reason as end of message
    if choice.text.endswith("<|eom_id|>"):
        return CompletionResponse(
            stop_reason=StopReason.end_of_message,
            content=choice.text[: -len("<|eom_id|>")],
            logprobs=convert_openai_completion_logprobs(choice.logprobs),
        )
    return CompletionResponse(
        stop_reason=get_stop_reason(choice.finish_reason),
        content=choice.text,
        logprobs=convert_openai_completion_logprobs(choice.logprobs),
    )


def process_chat_completion_response(
    response: OpenAICompatCompletionResponse, formatter: ChatFormat
) -> ChatCompletionResponse:
    choice = response.choices[0]

    raw_message = formatter.decode_assistant_message_from_content(
        text_from_choice(choice), get_stop_reason(choice.finish_reason)
    )
    return ChatCompletionResponse(
        completion_message=CompletionMessage(
            content=raw_message.content,
            stop_reason=raw_message.stop_reason,
            tool_calls=raw_message.tool_calls,
        ),
        logprobs=None,
    )


async def process_completion_stream_response(
    stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
) -> AsyncGenerator:
    stop_reason = None

    async for chunk in stream:
        choice = chunk.choices[0]
        finish_reason = choice.finish_reason

        text = text_from_choice(choice)
        if text == "<|eot_id|>":
            stop_reason = StopReason.end_of_turn
            text = ""
            continue
        elif text == "<|eom_id|>":
            stop_reason = StopReason.end_of_message
            text = ""
            continue
        yield CompletionResponseStreamChunk(
            delta=text,
            stop_reason=stop_reason,
            logprobs=convert_openai_completion_logprobs(choice.logprobs),
        )
        if finish_reason:
            if finish_reason in ["stop", "eos", "eos_token"]:
                stop_reason = StopReason.end_of_turn
            elif finish_reason == "length":
                stop_reason = StopReason.out_of_tokens
            break

    yield CompletionResponseStreamChunk(
        delta="",
        stop_reason=stop_reason,
    )


async def process_chat_completion_stream_response(
    stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
) -> AsyncGenerator:
    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.start,
            delta=TextDelta(text=""),
        )
    )

    buffer = ""
    ipython = False
    stop_reason = None

    async for chunk in stream:
        choice = chunk.choices[0]
        finish_reason = choice.finish_reason

        if finish_reason:
            if stop_reason is None and finish_reason in ["stop", "eos", "eos_token"]:
                stop_reason = StopReason.end_of_turn
            elif stop_reason is None and finish_reason == "length":
                stop_reason = StopReason.out_of_tokens
            break

        text = text_from_choice(choice)
        if not text:
            # Sometimes you get empty chunks from providers
            continue

        # check if its a tool call ( aka starts with <|python_tag|> )
        if not ipython and text.startswith("<|python_tag|>"):
            ipython = True
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=ToolCallDelta(
                        content="",
                        parse_status=ToolCallParseStatus.started,
                    ),
                )
            )
            buffer += text
            continue

        if text == "<|eot_id|>":
            stop_reason = StopReason.end_of_turn
            text = ""
            continue
        elif text == "<|eom_id|>":
            stop_reason = StopReason.end_of_message
            text = ""
            continue

        if ipython:
            buffer += text
            delta = ToolCallDelta(
                content=text,
                parse_status=ToolCallParseStatus.in_progress,
            )

            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=delta,
                    stop_reason=stop_reason,
                )
            )
        else:
            buffer += text
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=TextDelta(text=text),
                    stop_reason=stop_reason,
                )
            )

    # parse tool calls and report errors
    message = formatter.decode_assistant_message_from_content(buffer, stop_reason)
    parsed_tool_calls = len(message.tool_calls) > 0
    if ipython and not parsed_tool_calls:
        yield ChatCompletionResponseStreamChunk(
            event=ChatCompletionResponseEvent(
                event_type=ChatCompletionResponseEventType.progress,
                delta=ToolCallDelta(
                    content="",
                    parse_status=ToolCallParseStatus.failed,
                ),
                stop_reason=stop_reason,
            )
        )

    for tool_call in message.tool_calls:
        yield ChatCompletionResponseStreamChunk(
            event=ChatCompletionResponseEvent(
                event_type=ChatCompletionResponseEventType.progress,
                delta=ToolCallDelta(
                    content=tool_call,
                    parse_status=ToolCallParseStatus.succeeded,
                ),
                stop_reason=stop_reason,
            )
        )

    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.complete,
            delta=TextDelta(text=""),
            stop_reason=stop_reason,
        )
    )


async def convert_message_to_openai_dict(
    message: Message, download: bool = False
) -> dict:
    async def _convert_content(content) -> dict:
        if isinstance(content, ImageContentItem):
            return {
                "type": "image_url",
                "image_url": {
                    "url": await convert_image_content_to_url(
                        content, download=download
                    ),
                },
            }
        else:
            text = content.text if isinstance(content, TextContentItem) else content
            assert isinstance(text, str)
            return {"type": "text", "text": text}

    if isinstance(message.content, list):
        content = [await _convert_content(c) for c in message.content]
    else:
        content = [await _convert_content(message.content)]

    return {
        "role": message.role,
        "content": content,
    }
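

# ---------------------------------------------------------------------------
# Illustrative usage (a sketch, not part of the module): how a provider
# adapter might map request-level SamplingParams onto OpenAI-compatible
# request options using the helpers above. The datatype field names mirror
# the imports at the top of this file; the surrounding request payload
# ("model", "prompt") is a hypothetical example.
#
#   from llama_models.datatypes import SamplingParams, TopPSamplingStrategy
#
#   params = SamplingParams(
#       strategy=TopPSamplingStrategy(temperature=0.7, top_p=0.9),
#       max_tokens=256,
#   )
#   request = {
#       "model": "my-model",              # hypothetical model id
#       "prompt": "Hello",                # hypothetical prompt
#       **get_sampling_options(params),   # -> temperature, top_p, max_tokens
#   }
# ---------------------------------------------------------------------------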