feat: use /v1/chat/completions for safety model inference (#3591)

# What does this PR do?

Migrate the Safety API implementation from the legacy /inference/chat-completion
endpoint to /v1/chat/completions.

## Test Plan

CI with recorded inference responses.

---------

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
This commit is contained in:
Matthew Farrellee 2025-09-30 14:01:44 -04:00 committed by GitHub
parent cb33f45c11
commit 2de4e6c900
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 1630 additions and 4 deletions

View file

@ -290,13 +290,13 @@ class LlamaGuardShield:
else:
shield_input_message = self.build_text_shield_input(messages)
# TODO: llama-stack inference protocol has issues with non-streaming inference code
response = await self.inference_api.chat_completion(
model_id=self.model,
response = await self.inference_api.openai_chat_completion(
model=self.model,
messages=[shield_input_message],
stream=False,
temperature=0.0, # default is 1, which is too high for safety
)
content = response.completion_message.content
content = response.choices[0].message.content
content = content.strip()
return self.get_shield_response(content)