feat: use /v1/chat/completions for safety model inference (#3591)

# What does this PR do?

Migrate the Safety API implementation from the legacy /inference/chat-completion
endpoint to /v1/chat/completions.

## Test Plan

CI with recorded inference responses.

---------

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
This commit is contained in:
Matthew Farrellee 2025-09-30 14:01:44 -04:00 committed by GitHub
parent cb33f45c11
commit 2de4e6c900
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 1630 additions and 4 deletions

View file

@ -290,13 +290,13 @@ class LlamaGuardShield:
else:
shield_input_message = self.build_text_shield_input(messages)
# TODO: llama-stack inference protocol has issues with non-streaming inference code
response = await self.inference_api.chat_completion(
model_id=self.model,
response = await self.inference_api.openai_chat_completion(
model=self.model,
messages=[shield_input_message],
stream=False,
temperature=0.0, # default is 1, which is too high for safety
)
content = response.completion_message.content
content = response.choices[0].message.content
content = content.strip()
return self.get_shield_response(content)