Merge branch 'main' into eval_task_register

Xi Yan 2024-11-06 16:34:22 -08:00
commit 00869799a1
6 changed files with 83 additions and 47 deletions

View file

@@ -57,13 +57,13 @@ class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):

     async def run_shield(
         self,
-        shield_type: str,
+        identifier: str,
         messages: List[Message],
         params: Dict[str, Any] = None,
     ) -> RunShieldResponse:
-        shield_def = await self.shield_store.get_shield(shield_type)
+        shield_def = await self.shield_store.get_shield(identifier)
         if not shield_def:
-            raise ValueError(f"Unknown shield {shield_type}")
+            raise ValueError(f"Unknown shield {identifier}")
         shield = self.get_shield_impl(shield_def)
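
The rename above means callers now resolve shields by `identifier` rather than `shield_type`. Below is a minimal sketch of that lookup-and-fail-fast flow, using illustrative stand-ins (`ShieldDef`, `InMemoryShieldStore`) rather than the real llama_stack classes:

```python
import asyncio
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


# Stand-ins for the real llama_stack types; names here are illustrative only.
@dataclass
class ShieldDef:
    identifier: str
    shield_type: str


class InMemoryShieldStore:
    """Toy shield store keyed by identifier, mimicking the lookup in the diff."""

    def __init__(self, shields: List[ShieldDef]) -> None:
        self._shields = {s.identifier: s for s in shields}

    async def get_shield(self, identifier: str) -> Optional[ShieldDef]:
        return self._shields.get(identifier)


async def run_shield(
    store: InMemoryShieldStore,
    identifier: str,
    messages: List[str],
    params: Dict[str, Any] = None,
) -> str:
    # Same control flow as the patched run_shield: resolve by identifier,
    # fail loudly if the identifier is unknown.
    shield_def = await store.get_shield(identifier)
    if not shield_def:
        raise ValueError(f"Unknown shield {identifier}")
    return f"ran {shield_def.shield_type} on {len(messages)} message(s)"


if __name__ == "__main__":
    store = InMemoryShieldStore(
        [ShieldDef(identifier="llama_guard", shield_type="llama_guard")]
    )
    print(asyncio.run(run_shield(store, "llama_guard", ["hello"])))
```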

View file

@@ -22,6 +22,9 @@ from llama_stack.providers.utils.inference.openai_compat import (
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
+    completion_request_to_prompt,
+    convert_message_to_dict,
+    request_has_media,
 )

 from .config import VLLMInferenceAdapterConfig

@@ -105,19 +108,25 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
     ) -> ChatCompletionResponse:
-        params = self._get_params(request)
-        r = client.completions.create(**params)
+        params = await self._get_params(request)
+        if "messages" in params:
+            r = client.chat.completions.create(**params)
+        else:
+            r = client.completions.create(**params)
         return process_chat_completion_response(r, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
     ) -> AsyncGenerator:
-        params = self._get_params(request)
+        params = await self._get_params(request)

         # TODO: Can we use client.completions.acreate() or maybe there is another way to directly create an async
         # generator so this wrapper is not necessary?
         async def _to_async_generator():
-            s = client.completions.create(**params)
+            if "messages" in params:
+                s = client.chat.completions.create(**params)
+            else:
+                s = client.completions.create(**params)
             for chunk in s:
                 yield chunk

@@ -127,7 +136,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         ):
             yield chunk

-    def _get_params(self, request: ChatCompletionRequest) -> dict:
+    async def _get_params(
+        self, request: Union[ChatCompletionRequest, CompletionRequest]
+    ) -> dict:
         options = get_sampling_options(request.sampling_params)
         if "max_tokens" not in options:
             options["max_tokens"] = self.config.max_tokens

@@ -136,9 +147,28 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if model is None:
             raise ValueError(f"Unknown model: {request.model}")

+        input_dict = {}
+        media_present = request_has_media(request)
+        if isinstance(request, ChatCompletionRequest):
+            if media_present:
+                # vllm does not seem to work well with image urls, so we download the images
+                input_dict["messages"] = [
+                    await convert_message_to_dict(m, download=True)
+                    for m in request.messages
+                ]
+            else:
+                input_dict["prompt"] = chat_completion_request_to_prompt(
+                    request, self.formatter
+                )
+        else:
+            assert (
+                not media_present
+            ), "Together does not support media for Completion requests"
+            input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
+
         return {
             "model": model.huggingface_repo,
-            "prompt": chat_completion_request_to_prompt(request, self.formatter),
+            **input_dict,
             "stream": request.stream,
             **options,
         }
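
The vLLM adapter change above makes `_get_params` build either a `messages` payload (OpenAI chat completions, with images downloaded and inlined) or a rendered `prompt` payload (plain completions), and the callers branch on which key is present. Below is a minimal standalone sketch of that dispatch against an OpenAI-compatible vLLM server; the base URL and model name are placeholders for your own deployment, not values taken from this commit.

```python
from typing import Any, Dict

from openai import OpenAI  # vLLM exposes an OpenAI-compatible HTTP API

# Assumed local vLLM endpoint; adjust the URL and model to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")


def build_params(use_chat: bool) -> Dict[str, Any]:
    # Mirrors the spirit of the patched _get_params: a "messages" key selects
    # the chat endpoint, otherwise a rendered "prompt" goes to plain completions.
    if use_chat:
        return {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": "Say hello in one word."}],
            "stream": False,
        }
    return {
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "prompt": "Say hello in one word.",
        "stream": False,
    }


def complete(params: Dict[str, Any]) -> str:
    # Same dispatch as the patched _nonstream_chat_completion.
    if "messages" in params:
        r = client.chat.completions.create(**params)
        return r.choices[0].message.content
    r = client.completions.create(**params)
    return r.choices[0].text


if __name__ == "__main__":
    print(complete(build_params(use_chat=True)))
```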

View file

@@ -28,21 +28,21 @@ We have the following orthogonal parametrizations (pytest "marks") for inference
 If you want to run a test with the llama_8b model with fireworks, you can use:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m "fireworks and llama_8b" \
   --env FIREWORKS_API_KEY=<...>
 ```

 You can make it more complex to run both llama_8b and llama_3b on Fireworks, but only llama_3b with Ollama:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m "fireworks or (ollama and llama_3b)" \
   --env FIREWORKS_API_KEY=<...>
 ```

 Finally, you can override the model completely by doing:
 ```bash
-pytest -s -v llama_stack/providers/tests/inference/test_inference.py \
+pytest -s -v llama_stack/providers/tests/inference/test_text_inference.py \
   -m fireworks \
   --inference-model "Llama3.1-70B-Instruct" \
   --env FIREWORKS_API_KEY=<...>
 ```

View file

@@ -19,7 +19,7 @@ from .utils import group_chunks

 # How to run this test:
 #
-# pytest -v -s llama_stack/providers/tests/inference/test_inference.py
+# pytest -v -s llama_stack/providers/tests/inference/test_text_inference.py
 #   -m "(fireworks or ollama) and llama_3b"
 #   --env FIREWORKS_API_KEY=<your_api_key>

View file

@@ -20,8 +20,25 @@ THIS_DIR = Path(__file__).parent

 class TestVisionModelInference:
     @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "image, expected_strings",
+        [
+            (
+                ImageMedia(image=PIL_Image.open(THIS_DIR / "pasta.jpeg")),
+                ["spaghetti"],
+            ),
+            (
+                ImageMedia(
+                    image=URL(
+                        uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
+                    )
+                ),
+                ["puppy"],
+            ),
+        ],
+    )
     async def test_vision_chat_completion_non_streaming(
-        self, inference_model, inference_stack
+        self, inference_model, inference_stack, image, expected_strings
     ):
         inference_impl, _ = inference_stack

@@ -31,42 +48,27 @@ class TestVisionModelInference:
             "remote::together",
             "remote::fireworks",
             "remote::ollama",
+            "remote::vllm",
         ):
             pytest.skip(
                 "Other inference providers don't support vision chat completion() yet"
             )

-        images = [
-            ImageMedia(image=PIL_Image.open(THIS_DIR / "pasta.jpeg")),
-            ImageMedia(
-                image=URL(
-                    uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
-                )
-            ),
-        ]
-
-        # These are a bit hit-and-miss, need to be careful
-        expected_strings_to_check = [
-            ["spaghetti"],
-            ["puppy"],
-        ]
-        for image, expected_strings in zip(images, expected_strings_to_check):
-            response = await inference_impl.chat_completion(
-                model=inference_model,
-                messages=[
-                    SystemMessage(content="You are a helpful assistant."),
-                    UserMessage(
-                        content=[image, "Describe this image in two sentences."]
-                    ),
-                ],
-                stream=False,
-            )
-
-            assert isinstance(response, ChatCompletionResponse)
-            assert response.completion_message.role == "assistant"
-            assert isinstance(response.completion_message.content, str)
-            for expected_string in expected_strings:
-                assert expected_string in response.completion_message.content
+        response = await inference_impl.chat_completion(
+            model=inference_model,
+            messages=[
+                UserMessage(content="You are a helpful assistant."),
+                UserMessage(content=[image, "Describe this image in two sentences."]),
+            ],
+            stream=False,
+            sampling_params=SamplingParams(max_tokens=100),
+        )
+
+        assert isinstance(response, ChatCompletionResponse)
+        assert response.completion_message.role == "assistant"
+        assert isinstance(response.completion_message.content, str)
+        for expected_string in expected_strings:
+            assert expected_string in response.completion_message.content

     @pytest.mark.asyncio
     async def test_vision_chat_completion_streaming(

@@ -80,6 +82,7 @@ class TestVisionModelInference:
             "remote::together",
             "remote::fireworks",
             "remote::ollama",
+            "remote::vllm",
         ):
             pytest.skip(
                 "Other inference providers don't support vision chat completion() yet"

@@ -101,12 +104,13 @@ class TestVisionModelInference:
             async for r in await inference_impl.chat_completion(
                 model=inference_model,
                 messages=[
-                    SystemMessage(content="You are a helpful assistant."),
+                    UserMessage(content="You are a helpful assistant."),
                     UserMessage(
                         content=[image, "Describe this image in two sentences."]
                     ),
                 ],
                 stream=True,
+                sampling_params=SamplingParams(max_tokens=100),
             )
         ]
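
The vision-test change above lifts the hard-coded image/expected-string pairs out of the test body and into `@pytest.mark.parametrize`, so each image runs as its own test case instead of iterating inside a single test. Here is a tiny self-contained sketch of the same pattern, using a toy `describe` function instead of the real inference stack:

```python
import pytest


def describe(name: str) -> str:
    # Toy stand-in for the model call.
    return f"a photo of {name}"


# Each (input, expected_substrings) pair becomes its own test case,
# reported and failed independently by pytest.
@pytest.mark.parametrize(
    "name, expected_strings",
    [
        ("spaghetti on a plate", ["spaghetti"]),
        ("a small puppy", ["puppy"]),
    ],
)
def test_describe_mentions_subject(name, expected_strings):
    description = describe(name)
    for expected in expected_strings:
        assert expected in description
```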

View file

@@ -90,13 +90,15 @@ async def convert_image_media_to_url(
         return base64.b64encode(content).decode("utf-8")


-async def convert_message_to_dict(message: Message) -> dict:
+# TODO: name this function better! this is about OpenAI compatibile image
+# media conversion of the message. this should probably go in openai_compat.py
+async def convert_message_to_dict(message: Message, download: bool = False) -> dict:
     async def _convert_content(content) -> dict:
         if isinstance(content, ImageMedia):
             return {
                 "type": "image_url",
                 "image_url": {
-                    "url": await convert_image_media_to_url(content),
+                    "url": await convert_image_media_to_url(content, download=download),
                 },
             }
         else:
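
The new `download` flag makes the message-conversion helper inline the image bytes (as a base64 data URL) instead of passing the original URL through, which is what the vLLM adapter now requests because it handles raw image URLs poorly. The sketch below illustrates that choice with a hypothetical helper built on the standard library; it is not the actual `convert_image_media_to_url` implementation.

```python
import base64
import mimetypes
from urllib.request import urlopen


def image_url_entry(url: str, download: bool = False) -> dict:
    """Build an OpenAI-style image_url entry, optionally inlining the bytes.

    Illustrative stand-in for the patched helper: with download=False the URL
    is passed through unchanged; with download=True the image is fetched and
    embedded as a base64 data URL.
    """
    if not download:
        return {"type": "image_url", "image_url": {"url": url}}

    # Fetch the image and guess a content type for the data URL.
    with urlopen(url) as resp:  # example only; only fetch URLs you trust
        content = resp.read()
        content_type = (
            resp.headers.get("Content-Type")
            or mimetypes.guess_type(url)[0]
            or "image/jpeg"
        )
    encoded = base64.b64encode(content).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{content_type};base64,{encoded}"},
    }


if __name__ == "__main__":
    # Pass-through case; no network access needed.
    print(image_url_entry("https://example.com/dog.jpg")["image_url"]["url"])
```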