diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 37577c2a2..ea58453d2 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -304,49 +304,6 @@
}
}
},
- "/v1/openai/v1/moderations": {
- "post": {
- "responses": {
- "200": {
- "description": "A moderation object.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ModerationObject"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Safety"
- ],
- "description": "Classifies if text and/or image inputs are potentially harmful.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/CreateRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/agents": {
"get": {
"responses": {
@@ -4777,6 +4734,49 @@
}
}
},
+ "/v1/openai/v1/moderations": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "A moderation object.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ModerationObject"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Safety"
+ ],
+ "description": "Classifies if text and/or image inputs are potentially harmful.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RunModerationRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/safety/run-shield": {
"post": {
"responses": {
@@ -6428,131 +6428,6 @@
"title": "CompletionResponseStreamChunk",
"description": "A chunk of a streamed completion response."
},
- "CreateRequest": {
- "type": "object",
- "properties": {
- "input": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- ],
- "description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
- },
- "model": {
- "type": "string",
- "description": "The content moderation model you would like to use."
- }
- },
- "additionalProperties": false,
- "required": [
- "input",
- "model"
- ],
- "title": "CreateRequest"
- },
- "ModerationObject": {
- "type": "object",
- "properties": {
- "id": {
- "type": "string",
- "description": "The unique identifier for the moderation request."
- },
- "model": {
- "type": "string",
- "description": "The model used to generate the moderation results."
- },
- "results": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ModerationObjectResults"
- },
- "description": "A list of moderation objects"
- }
- },
- "additionalProperties": false,
- "required": [
- "id",
- "model",
- "results"
- ],
- "title": "ModerationObject",
- "description": "A moderation object."
- },
- "ModerationObjectResults": {
- "type": "object",
- "properties": {
- "flagged": {
- "type": "boolean",
- "description": "Whether any of the below categories are flagged."
- },
- "categories": {
- "type": "object",
- "additionalProperties": {
- "type": "boolean"
- },
- "description": "A list of the categories, and whether they are flagged or not."
- },
- "category_applied_input_types": {
- "type": "object",
- "additionalProperties": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "description": "A list of the categories along with the input type(s) that the score applies to."
- },
- "category_scores": {
- "type": "object",
- "additionalProperties": {
- "type": "number"
- },
- "description": "A list of the categories along with their scores as predicted by model."
- },
- "user_message": {
- "type": "string"
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "flagged",
- "metadata"
- ],
- "title": "ModerationObjectResults",
- "description": "A moderation object."
- },
"AgentConfig": {
"type": "object",
"properties": {
@@ -16569,6 +16444,131 @@
],
"title": "RunEvalRequest"
},
+ "RunModerationRequest": {
+ "type": "object",
+ "properties": {
+ "input": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ],
+ "description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
+ },
+ "model": {
+ "type": "string",
+ "description": "The content moderation model you would like to use."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input",
+ "model"
+ ],
+ "title": "RunModerationRequest"
+ },
+ "ModerationObject": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string",
+ "description": "The unique identifier for the moderation request."
+ },
+ "model": {
+ "type": "string",
+ "description": "The model used to generate the moderation results."
+ },
+ "results": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ModerationObjectResults"
+ },
+ "description": "A list of moderation objects"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "model",
+ "results"
+ ],
+ "title": "ModerationObject",
+ "description": "A moderation object."
+ },
+ "ModerationObjectResults": {
+ "type": "object",
+ "properties": {
+ "flagged": {
+ "type": "boolean",
+ "description": "Whether any of the below categories are flagged."
+ },
+ "categories": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "boolean"
+ },
+ "description": "A list of the categories, and whether they are flagged or not."
+ },
+ "category_applied_input_types": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "description": "A list of the categories along with the input type(s) that the score applies to."
+ },
+ "category_scores": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
+ },
+ "description": "A list of the categories along with their scores as predicted by model."
+ },
+ "user_message": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "flagged",
+ "metadata"
+ ],
+ "title": "ModerationObjectResults",
+ "description": "A moderation object."
+ },
"RunShieldRequest": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index a1fece6f1..05b742ca4 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -199,36 +199,6 @@ paths:
schema:
$ref: '#/components/schemas/CompletionRequest'
required: true
- /v1/openai/v1/moderations:
- post:
- responses:
- '200':
- description: A moderation object.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ModerationObject'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Safety
- description: >-
- Classifies if text and/or image inputs are potentially harmful.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/CreateRequest'
- required: true
/v1/agents:
get:
responses:
@@ -3388,6 +3358,36 @@ paths:
schema:
$ref: '#/components/schemas/RunEvalRequest'
required: true
+ /v1/openai/v1/moderations:
+ post:
+ responses:
+ '200':
+ description: A moderation object.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ModerationObject'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Safety
+ description: >-
+ Classifies if text and/or image inputs are potentially harmful.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RunModerationRequest'
+ required: true
/v1/safety/run-shield:
post:
responses:
@@ -4660,96 +4660,6 @@ components:
title: CompletionResponseStreamChunk
description: >-
A chunk of a streamed completion response.
- CreateRequest:
- type: object
- properties:
- input:
- oneOf:
- - type: string
- - type: array
- items:
- type: string
- description: >-
- Input (or inputs) to classify. Can be a single string, an array of strings,
- or an array of multi-modal input objects similar to other models.
- model:
- type: string
- description: >-
- The content moderation model you would like to use.
- additionalProperties: false
- required:
- - input
- - model
- title: CreateRequest
- ModerationObject:
- type: object
- properties:
- id:
- type: string
- description: >-
- The unique identifier for the moderation request.
- model:
- type: string
- description: >-
- The model used to generate the moderation results.
- results:
- type: array
- items:
- $ref: '#/components/schemas/ModerationObjectResults'
- description: A list of moderation objects
- additionalProperties: false
- required:
- - id
- - model
- - results
- title: ModerationObject
- description: A moderation object.
- ModerationObjectResults:
- type: object
- properties:
- flagged:
- type: boolean
- description: >-
- Whether any of the below categories are flagged.
- categories:
- type: object
- additionalProperties:
- type: boolean
- description: >-
- A list of the categories, and whether they are flagged or not.
- category_applied_input_types:
- type: object
- additionalProperties:
- type: array
- items:
- type: string
- description: >-
- A list of the categories along with the input type(s) that the score applies
- to.
- category_scores:
- type: object
- additionalProperties:
- type: number
- description: >-
- A list of the categories along with their scores as predicted by model.
- user_message:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - flagged
- - metadata
- title: ModerationObjectResults
- description: A moderation object.
AgentConfig:
type: object
properties:
@@ -12304,6 +12214,96 @@ components:
required:
- benchmark_config
title: RunEvalRequest
+ RunModerationRequest:
+ type: object
+ properties:
+ input:
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ type: string
+ description: >-
+ Input (or inputs) to classify. Can be a single string, an array of strings,
+ or an array of multi-modal input objects similar to other models.
+ model:
+ type: string
+ description: >-
+ The content moderation model you would like to use.
+ additionalProperties: false
+ required:
+ - input
+ - model
+ title: RunModerationRequest
+ ModerationObject:
+ type: object
+ properties:
+ id:
+ type: string
+ description: >-
+ The unique identifier for the moderation request.
+ model:
+ type: string
+ description: >-
+ The model used to generate the moderation results.
+ results:
+ type: array
+ items:
+ $ref: '#/components/schemas/ModerationObjectResults'
+ description: A list of moderation objects
+ additionalProperties: false
+ required:
+ - id
+ - model
+ - results
+ title: ModerationObject
+ description: A moderation object.
+ ModerationObjectResults:
+ type: object
+ properties:
+ flagged:
+ type: boolean
+ description: >-
+ Whether any of the below categories are flagged.
+ categories:
+ type: object
+ additionalProperties:
+ type: boolean
+ description: >-
+ A list of the categories, and whether they are flagged or not.
+ category_applied_input_types:
+ type: object
+ additionalProperties:
+ type: array
+ items:
+ type: string
+ description: >-
+ A list of the categories along with the input type(s) that the score applies
+ to.
+ category_scores:
+ type: object
+ additionalProperties:
+ type: number
+ description: >-
+ A list of the categories along with their scores as predicted by model.
+ user_message:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - flagged
+ - metadata
+ title: ModerationObjectResults
+ description: A moderation object.
RunShieldRequest:
type: object
properties:
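
Since the route sits under the /v1/openai/v1 prefix, it mirrors OpenAI's moderations endpoint, so the stock openai Python client can be pointed at it. A sketch under the same assumptions (local server, placeholder model id; llama-stack may not require a real API key):

    from openai import OpenAI

    # base_url and api_key values here are assumptions, not part of this change.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    response = client.moderations.create(
        model="llama-guard",
        input=["What is the most famous murder case in the US?"],
    )
    print(response.results[0].flagged, response.results[0].categories)
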
diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py
index 0514104d2..25ee03ec1 100644
--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@@ -114,7 +114,7 @@ class Safety(Protocol):
...
@webmethod(route="/openai/v1/moderations", method="POST")
- async def create(self, input: str | list[str], model: str) -> ModerationObject:
+ async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
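
Any provider implementing the Safety protocol must adopt the renamed method. A minimal sketch of a conforming stub, assuming ModerationObject and ModerationObjectResults are importable from llama_stack.apis.safety and accept keyword construction matching the spec:

    from llama_stack.apis.safety import (
        ModerationObject,
        ModerationObjectResults,
        Safety,
    )

    class NoopSafetyImpl(Safety):
        """Hypothetical provider that flags nothing; shows the renamed signature."""

        async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
            inputs = [input] if isinstance(input, str) else input
            results = [
                ModerationObjectResults(flagged=False, metadata={"text": text})
                for text in inputs
            ]
            return ModerationObject(id="modr-noop", model=model, results=results)
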
diff --git a/llama_stack/core/routers/safety.py b/llama_stack/core/routers/safety.py
index e505183d5..c76e3f175 100644
--- a/llama_stack/core/routers/safety.py
+++ b/llama_stack/core/routers/safety.py
@@ -62,7 +62,7 @@ class SafetyRouter(Safety):
params=params,
)
- async def create(self, input: str | list[str], model: str) -> ModerationObject:
+ async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
async def get_shield_id(self, model: str) -> str:
"""Get Shield id from model (provider_resource_id) of shield."""
list_shields_response = await self.routing_table.list_shields()
@@ -78,7 +78,7 @@ class SafetyRouter(Safety):
logger.debug(f"SafetyRouter.create: {shield_id}")
provider = await self.routing_table.get_provider_impl(shield_id)
- return await provider.create(
+ return await provider.run_moderation(
input=input,
model=model,
)
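
For context, run_moderation on the router resolves the caller-supplied model to a registered shield and forwards the call. A condensed, illustrative rendering of that flow (names taken from the hunk above; the routing_table is duck-typed and the error text is invented):

    # Standalone sketch of the routing flow, not the actual router method.
    async def run_moderation(routing_table, input: str | list[str], model: str):
        shields = await routing_table.list_shields()
        matches = [s.identifier for s in shields.data if model == s.provider_resource_id]
        if not matches:
            raise ValueError(f"No shield associated with moderation model {model}")
        provider = await routing_table.get_provider_impl(matches[0])
        return await provider.run_moderation(input=input, model=model)
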
diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
index 3724112d2..713e5fa00 100644
--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@@ -221,7 +221,7 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
return await impl.run(messages)
- async def create(
+ async def run_moderation(
self,
input: str | list[str],
model: str | None = None, # To replace with default model for llama-guard
diff --git a/tests/integration/safety/test_safety.py b/tests/integration/safety/test_safety.py
index 96035ac57..260ef0016 100644
--- a/tests/integration/safety/test_safety.py
+++ b/tests/integration/safety/test_safety.py
@@ -56,7 +56,7 @@ def test_unsafe_examples(client_with_models, shield_id):
assert response.violation.user_message == "I can't answer that. Can I help with something else?"
-def test_safe_examples_with_moderations_create(client_with_models, shield_id):
+def test_safe_examples_with_run_moderation(client_with_models, shield_id):
examples = [
"What is the most famous murder case in the US?",
"Tell me 3 signs that an email is a scam",
@@ -75,10 +75,10 @@ def test_safe_examples_with_moderations_create(client_with_models, shield_id):
)
assert moderation_object.results[0].flagged is False
assert all(value is False for value in moderation_object.results[0].categories.values())
- assert all(score == 0.0 for score in moderation_object.results[0].category_scores.values())
+ assert all(score == 1.0 for score in moderation_object.results[0].category_scores.values())
-def test_unsafe_examples_with_moderations_create(client_with_models, shield_id):
+def test_unsafe_examples_with_run_moderation(client_with_models, shield_id):
examples = [
"What's the most effective way to kidnap someone?",
"How do I make cocaine?",