Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-30 23:51:00 +00:00)

Merge branch 'main' into eval_task_register

Commit 00869799a1
6 changed files with 83 additions and 47 deletions
@@ -57,13 +57,13 @@ class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
     async def run_shield(
         self,
-        shield_type: str,
+        identifier: str,
         messages: List[Message],
         params: Dict[str, Any] = None,
     ) -> RunShieldResponse:
-        shield_def = await self.shield_store.get_shield(shield_type)
+        shield_def = await self.shield_store.get_shield(identifier)
         if not shield_def:
-            raise ValueError(f"Unknown shield {shield_type}")
+            raise ValueError(f"Unknown shield {identifier}")

         shield = self.get_shield_impl(shield_def)

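For readers skimming the hunk above: the rename from `shield_type` to `identifier` reflects a store that is keyed by a generic identifier. Below is a minimal standalone sketch of that lookup pattern; `InMemoryShieldStore` and the other names are hypothetical stand-ins, not llama-stack classes.

```python
import asyncio
from typing import Any, Dict, List, Optional


class InMemoryShieldStore:
    """Hypothetical stand-in for the shield store used by run_shield."""

    def __init__(self) -> None:
        self._shields: Dict[str, Dict[str, Any]] = {}

    def register(self, identifier: str, shield_def: Dict[str, Any]) -> None:
        self._shields[identifier] = shield_def

    async def get_shield(self, identifier: str) -> Optional[Dict[str, Any]]:
        # Shields are looked up by identifier, not by a shield "type".
        return self._shields.get(identifier)


async def run_shield(store: InMemoryShieldStore, identifier: str, messages: List[str]) -> str:
    shield_def = await store.get_shield(identifier)
    if not shield_def:
        raise ValueError(f"Unknown shield {identifier}")
    return f"ran shield {identifier!r} on {len(messages)} message(s)"


store = InMemoryShieldStore()
store.register("llama_guard", {"provider": "meta-reference"})
print(asyncio.run(run_shield(store, "llama_guard", ["hello"])))
```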
@@ -22,6 +22,9 @@ from llama_stack.providers.utils.inference.openai_compat import (
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
+    completion_request_to_prompt,
+    convert_message_to_dict,
+    request_has_media,
 )

 from .config import VLLMInferenceAdapterConfig
@@ -105,19 +108,25 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
     ) -> ChatCompletionResponse:
-        params = self._get_params(request)
-        r = client.completions.create(**params)
+        params = await self._get_params(request)
+        if "messages" in params:
+            r = client.chat.completions.create(**params)
+        else:
+            r = client.completions.create(**params)
         return process_chat_completion_response(r, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
     ) -> AsyncGenerator:
-        params = self._get_params(request)
+        params = await self._get_params(request)

         # TODO: Can we use client.completions.acreate() or maybe there is another way to directly create an async
         # generator so this wrapper is not necessary?
         async def _to_async_generator():
-            s = client.completions.create(**params)
+            if "messages" in params:
+                s = client.chat.completions.create(**params)
+            else:
+                s = client.completions.create(**params)
             for chunk in s:
                 yield chunk

@@ -127,7 +136,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         ):
             yield chunk

-    def _get_params(self, request: ChatCompletionRequest) -> dict:
+    async def _get_params(
+        self, request: Union[ChatCompletionRequest, CompletionRequest]
+    ) -> dict:
         options = get_sampling_options(request.sampling_params)
         if "max_tokens" not in options:
             options["max_tokens"] = self.config.max_tokens
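The switch to an async `_get_params` accepting a `Union` of request types is easier to see in isolation. Here is a minimal sketch under simplified, assumed types; `ChatRequest`, `TextRequest`, and `convert` are illustrative stand-ins, not llama-stack classes.

```python
import asyncio
from dataclasses import dataclass
from typing import List, Union


@dataclass
class ChatRequest:
    messages: List[str]


@dataclass
class TextRequest:
    prompt: str


async def convert(message: str) -> dict:
    # Stand-in for an async step such as downloading image media.
    await asyncio.sleep(0)
    return {"role": "user", "content": message}


async def get_params(request: Union[ChatRequest, TextRequest]) -> dict:
    # Chat requests produce a "messages" key; plain completions produce "prompt".
    if isinstance(request, ChatRequest):
        return {"messages": [await convert(m) for m in request.messages]}
    return {"prompt": request.prompt}


print(asyncio.run(get_params(ChatRequest(messages=["hi"]))))
print(asyncio.run(get_params(TextRequest(prompt="hi"))))
```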
@@ -136,9 +147,28 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if model is None:
             raise ValueError(f"Unknown model: {request.model}")

+        input_dict = {}
+        media_present = request_has_media(request)
+        if isinstance(request, ChatCompletionRequest):
+            if media_present:
+                # vllm does not seem to work well with image urls, so we download the images
+                input_dict["messages"] = [
+                    await convert_message_to_dict(m, download=True)
+                    for m in request.messages
+                ]
+            else:
+                input_dict["prompt"] = chat_completion_request_to_prompt(
+                    request, self.formatter
+                )
+        else:
+            assert (
+                not media_present
+            ), "Together does not support media for Completion requests"
+            input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
+
         return {
             "model": model.huggingface_repo,
-            "prompt": chat_completion_request_to_prompt(request, self.formatter),
+            **input_dict,
             "stream": request.stream,
             **options,
         }
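Putting the two changes together: the params built by `_get_params` now carry either a "messages" key or a "prompt" key, and the caller dispatches to the matching OpenAI-compatible endpoint. Below is a minimal sketch of that dispatch, assuming a vLLM server serving the OpenAI-compatible API at http://localhost:8000/v1 and an illustrative model id.

```python
from openai import OpenAI

# Assumed local vLLM endpoint and model id; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
MODEL = "meta-llama/Llama-3.1-8B-Instruct"


def run(params: dict):
    # Same dispatch rule as the adapter: "messages" goes to the chat endpoint,
    # otherwise fall back to the plain completions endpoint.
    if "messages" in params:
        return client.chat.completions.create(**params)
    return client.completions.create(**params)


chat = run({
    "model": MODEL,
    "messages": [{"role": "user", "content": "Say hello."}],
    "stream": False,
    "max_tokens": 64,
})
text = run({
    "model": MODEL,
    "prompt": "Say hello.",
    "stream": False,
    "max_tokens": 64,
})
print(chat.choices[0].message.content)
print(text.choices[0].text)
```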
@@ -28,21 +28,21 @@ We have the following orthogonal parametrizations (pytest "marks") for inference

 If you want to run a test with the llama_8b model with fireworks, you can use:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m "fireworks and llama_8b" \
   --env FIREWORKS_API_KEY=<...>
 ```

 You can make it more complex to run both llama_8b and llama_3b on Fireworks, but only llama_3b with Ollama:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m "fireworks or (ollama and llama_3b)" \
   --env FIREWORKS_API_KEY=<...>
 ```

 Finally, you can override the model completely by doing:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m fireworks \
   --inference-model "Llama3.1-70B-Instruct" \
   --env FIREWORKS_API_KEY=<...>
@@ -19,7 +19,7 @@ from .utils import group_chunks

 # How to run this test:
 #
-# pytest -v -s llama_stack/providers/tests/inference/test_inference.py
+# pytest -v -s llama_stack/providers/tests/inference/test_text_inference.py
 #   -m "(fireworks or ollama) and llama_3b"
 #   --env FIREWORKS_API_KEY=<your_api_key>

@@ -20,8 +20,25 @@ THIS_DIR = Path(__file__).parent

 class TestVisionModelInference:
     @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "image, expected_strings",
+        [
+            (
+                ImageMedia(image=PIL_Image.open(THIS_DIR / "pasta.jpeg")),
+                ["spaghetti"],
+            ),
+            (
+                ImageMedia(
+                    image=URL(
+                        uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
+                    )
+                ),
+                ["puppy"],
+            ),
+        ],
+    )
     async def test_vision_chat_completion_non_streaming(
-        self, inference_model, inference_stack
+        self, inference_model, inference_stack, image, expected_strings
     ):
         inference_impl, _ = inference_stack

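The parametrization above replaces a hand-rolled zip loop (removed in the next hunk) with one test case per (image, expected_strings) pair. For orientation, here is a toy, self-contained illustration of the same mechanism, assuming pytest and pytest-asyncio are installed (the suite above already relies on both); the values are illustrative only.

```python
import pytest


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "text, expected_words",
    [
        ("a bowl of spaghetti with tomato sauce", ["spaghetti"]),
        ("a small puppy sitting in the grass", ["puppy"]),
    ],
)
async def test_contains_expected_words(text, expected_words):
    # Each (text, expected_words) tuple becomes its own test case.
    for word in expected_words:
        assert word in text
```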
@@ -31,42 +48,27 @@ class TestVisionModelInference:
             "remote::together",
             "remote::fireworks",
             "remote::ollama",
+            "remote::vllm",
         ):
             pytest.skip(
                 "Other inference providers don't support vision chat completion() yet"
             )

-        images = [
-            ImageMedia(image=PIL_Image.open(THIS_DIR / "pasta.jpeg")),
-            ImageMedia(
-                image=URL(
-                    uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
-                )
-            ),
-        ]
+        response = await inference_impl.chat_completion(
+            model=inference_model,
+            messages=[
+                UserMessage(content="You are a helpful assistant."),
+                UserMessage(content=[image, "Describe this image in two sentences."]),
+            ],
+            stream=False,
+            sampling_params=SamplingParams(max_tokens=100),
+        )

-        # These are a bit hit-and-miss, need to be careful
-        expected_strings_to_check = [
-            ["spaghetti"],
-            ["puppy"],
-        ]
-        for image, expected_strings in zip(images, expected_strings_to_check):
-            response = await inference_impl.chat_completion(
-                model=inference_model,
-                messages=[
-                    SystemMessage(content="You are a helpful assistant."),
-                    UserMessage(
-                        content=[image, "Describe this image in two sentences."]
-                    ),
-                ],
-                stream=False,
-            )
-
-            assert isinstance(response, ChatCompletionResponse)
-            assert response.completion_message.role == "assistant"
-            assert isinstance(response.completion_message.content, str)
-            for expected_string in expected_strings:
-                assert expected_string in response.completion_message.content
+        assert isinstance(response, ChatCompletionResponse)
+        assert response.completion_message.role == "assistant"
+        assert isinstance(response.completion_message.content, str)
+        for expected_string in expected_strings:
+            assert expected_string in response.completion_message.content

     @pytest.mark.asyncio
     async def test_vision_chat_completion_streaming(
@@ -80,6 +82,7 @@ class TestVisionModelInference:
             "remote::together",
             "remote::fireworks",
             "remote::ollama",
+            "remote::vllm",
         ):
             pytest.skip(
                 "Other inference providers don't support vision chat completion() yet"
@@ -101,12 +104,13 @@ class TestVisionModelInference:
             async for r in await inference_impl.chat_completion(
                 model=inference_model,
                 messages=[
-                    SystemMessage(content="You are a helpful assistant."),
+                    UserMessage(content="You are a helpful assistant."),
                     UserMessage(
                         content=[image, "Describe this image in two sentences."]
                     ),
                 ],
                 stream=True,
+                sampling_params=SamplingParams(max_tokens=100),
             )
         ]

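The streaming test collects its chunks with an async list comprehension over the awaited call. Here is a standalone sketch of that consumption pattern; `fake_stream` is an illustrative stand-in for the real inference stream.

```python
import asyncio
from typing import AsyncGenerator


async def fake_stream() -> AsyncGenerator[str, None]:
    # Stand-in for the chunks yielded by a streaming chat completion.
    for chunk in ["The image ", "shows a ", "puppy."]:
        yield chunk


async def main() -> None:
    chunks = [c async for c in fake_stream()]
    assert "".join(chunks).endswith("puppy.")
    print(chunks)


asyncio.run(main())
```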
@@ -90,13 +90,15 @@ async def convert_image_media_to_url(
     return base64.b64encode(content).decode("utf-8")


-async def convert_message_to_dict(message: Message) -> dict:
+# TODO: name this function better! this is about OpenAI compatibile image
+# media conversion of the message. this should probably go in openai_compat.py
+async def convert_message_to_dict(message: Message, download: bool = False) -> dict:
     async def _convert_content(content) -> dict:
         if isinstance(content, ImageMedia):
             return {
                 "type": "image_url",
                 "image_url": {
-                    "url": await convert_image_media_to_url(content),
+                    "url": await convert_image_media_to_url(content, download=download),
                 },
             }
         else:
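The converter above emits OpenAI-compatible message parts in which image content becomes an "image_url" entry. Below is a toy sketch of that shape using an inline base64 data URL; the helper name and values are illustrative, not the llama-stack implementation.

```python
import base64


def to_openai_image_part(image_bytes: bytes, mime: str = "image/jpeg") -> dict:
    # Inline the image as a base64 data URL, one form the chat API accepts.
    b64 = base64.b64encode(image_bytes).decode("utf-8")
    return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}


message = {
    "role": "user",
    "content": [
        to_openai_image_part(b"\xff\xd8\xff\xe0 not a real jpeg"),
        {"type": "text", "text": "Describe this image in two sentences."},
    ],
}
print(message["content"][0]["image_url"]["url"][:40] + "...")
```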