Use inference APIs for executing Llama Guard (#121)

We should use Inference APIs to execute Llama Guard instead of directly needing to use HuggingFace modeling related code. The actual inference consideration is handled by Inference.
This commit is contained in:
Ashwin Bharambe 2024-09-28 15:40:06 -07:00 committed by GitHub
parent 6236634d84
commit 0a3999a9a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 167 additions and 204 deletions

View file

@ -16,6 +16,8 @@ from llama_models.llama3.prompt_templates import (
)
from llama_models.sku_list import resolve_model
from llama_stack.providers.utils.inference import supported_inference_models
def augment_messages_for_tools(request: ChatCompletionRequest) -> List[Message]:
"""Reads chat completion request and augments the messages to handle tools.
@ -27,8 +29,8 @@ def augment_messages_for_tools(request: ChatCompletionRequest) -> List[Message]:
cprint(f"Could not resolve model {request.model}", color="red")
return request.messages
if model.model_family not in [ModelFamily.llama3_1, ModelFamily.llama3_2]:
cprint(f"Model family {model.model_family} not llama 3_1 or 3_2", color="red")
if model.descriptor() not in supported_inference_models():
cprint(f"Unsupported inference model? {model.descriptor()}", color="red")
return request.messages
if model.model_family == ModelFamily.llama3_1 or (