Use inference APIs for executing Llama Guard (#121)

We should use the Inference APIs to execute Llama Guard instead of depending directly on HuggingFace modeling-related code. The actual inference considerations are handled by the Inference API.
2025-12-06 10:37:22 +00:00 · 2024-09-28 15:40:06 -07:00 · 2024-09-28 15:40:06 -07:00 · 0a3999a9a4
commit 0a3999a9a4
parent 6236634d84
9 changed files with 167 additions and 204 deletions
--- a/llama_stack/apis/inference/client.py
+++ b/llama_stack/apis/inference/client.py
@ -13,7 +13,6 @@ import httpx

 from llama_models.llama3.api.datatypes import ImageMedia, URL

-from PIL import Image as PIL_Image
 from pydantic import BaseModel

 from llama_models.llama3.api import *  # noqa: F403
@ -120,13 +119,9 @@ async def run_main(host: str, port: int, stream: bool):
 async def run_mm_main(host: str, port: int, stream: bool, path: str):
    client = InferenceClient(f"http://{host}:{port}")

-    with open(path, "rb") as f:
-        img = PIL_Image.open(f).convert("RGB")
-
    message = UserMessage(
        content=[
            ImageMedia(image=URL(uri=f"file://{path}")),
-            # ImageMedia(image=img),
            "Describe this image in two sentences",
        ],
    )