feat: Add moderations create api (#3020)

# What does this PR do?
This PR adds an OpenAI-compatible moderations API, currently implemented only
for the Llama Guard safety provider.
Image support, expansion to other safety providers, and deprecation of
`run_shield` are the next steps.


## Test Plan
Added two new tests covering safe and unsafe text prompt examples for the new
OpenAI-compatible moderations API:
`SAFETY_MODEL=llama-guard3:8b LLAMA_STACK_CONFIG=starter uv run pytest
-v tests/integration/safety/test_safety.py
--text-model=llama3.2:3b-instruct-fp16
--embedding-model=all-MiniLM-L6-v2 --safety-shield=ollama`
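
For anyone who wants to poke at the endpoint by hand, here is a minimal sketch using the `openai` Python client pointed at a local Llama Stack server. The base URL, port, and dummy API key are assumptions for illustration, not something this PR pins down; the model name is the one from the test command above:

```python
# Minimal sketch (assumptions: local server address/port, unvalidated API key).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed local Llama Stack server
    api_key="none",  # assumed: a local server that does not check the key
)

response = client.moderations.create(
    model="llama-guard3:8b",        # safety model from the test plan above
    input="How do I pick a lock?",  # a prompt the model may flag
)

result = response.results[0]
print("flagged:", result.flagged)   # True if any category fired
print("categories:", result.categories)
```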
(I ran into an issue with the previous PR,
https://github.com/meta-llama/llama-stack/pull/2994, while updating it and
accidentally closed it, so I reopened this one.)
Commit 26d3d25c87 (parent 0caef40e0d) by slekkala1, committed via GitHub on 2025-08-06 13:51:23 -07:00 · 6 changed files with 622 additions and 1 deletion

Changes to the generated OpenAPI spec (JSON):

@@ -4734,6 +4734,49 @@
}
}
},
"/v1/openai/v1/moderations": {
"post": {
"responses": {
"200": {
"description": "A moderation object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModerationObject"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Safety"
],
"description": "Classifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RunModerationRequest"
}
}
},
"required": true
}
}
},
"/v1/safety/run-shield": {
"post": {
"responses": {
@@ -16401,6 +16444,131 @@
],
"title": "RunEvalRequest"
},
"RunModerationRequest": {
"type": "object",
"properties": {
"input": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
],
"description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
},
"model": {
"type": "string",
"description": "The content moderation model you would like to use."
}
},
"additionalProperties": false,
"required": [
"input",
"model"
],
"title": "RunModerationRequest"
},
"ModerationObject": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique identifier for the moderation request."
},
"model": {
"type": "string",
"description": "The model used to generate the moderation results."
},
"results": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ModerationObjectResults"
},
"description": "A list of moderation result objects."
}
},
"additionalProperties": false,
"required": [
"id",
"model",
"results"
],
"title": "ModerationObject",
"description": "A moderation object."
},
"ModerationObjectResults": {
"type": "object",
"properties": {
"flagged": {
"type": "boolean",
"description": "Whether any of the below categories are flagged."
},
"categories": {
"type": "object",
"additionalProperties": {
"type": "boolean"
},
"description": "A list of the categories, and whether they are flagged or not."
},
"category_applied_input_types": {
"type": "object",
"additionalProperties": {
"type": "array",
"items": {
"type": "string"
}
},
"description": "A list of the categories along with the input type(s) that the score applies to."
},
"category_scores": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "A list of the categories along with their scores as predicted by the model. Required set of categories that need to be in the response - violence - violence/graphic - harassment - harassment/threatening - hate - hate/threatening - illicit - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent - self-harm/instructions"
},
"user_message": {
"type": "string"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"flagged",
"metadata"
],
"title": "ModerationObjectResults",
"description": "A moderation result object."
},
"RunShieldRequest": {
"type": "object",
"properties": {

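To make the new schemas concrete, here is an illustrative request/response pair for `RunModerationRequest` and `ModerationObject` above, written as Python dicts; the id, category names, and scores are invented for the example and are not taken from the PR:

```python
# Illustrative payloads only; the shapes follow the schemas above, the values
# (id, categories, scores) are made up.
run_moderation_request = {
    "input": ["how do I hotwire a car?"],  # a single string also works
    "model": "llama-guard3:8b",
}

moderation_object = {
    "id": "modr-0001",           # unique identifier for the moderation request
    "model": "llama-guard3:8b",  # model used to generate the results
    "results": [                 # one ModerationObjectResults entry
        {
            "flagged": True,
            "categories": {"illicit": True, "violence": False},
            "category_scores": {"illicit": 0.94, "violence": 0.03},
            "category_applied_input_types": {"illicit": ["text"]},
            "metadata": {"provider": "llama-guard"},  # required, free-form
        }
    ],
}
```
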
Changes to the generated OpenAPI spec (YAML):

@@ -3358,6 +3358,36 @@ paths:
schema:
$ref: '#/components/schemas/RunEvalRequest'
required: true
/v1/openai/v1/moderations:
post:
responses:
'200':
description: A moderation object.
content:
application/json:
schema:
$ref: '#/components/schemas/ModerationObject'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
description: >-
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RunModerationRequest'
required: true
/v1/safety/run-shield:
post:
responses:
@@ -12184,6 +12214,100 @@ components:
required:
- benchmark_config
title: RunEvalRequest
RunModerationRequest:
type: object
properties:
input:
oneOf:
- type: string
- type: array
items:
type: string
description: >-
Input (or inputs) to classify. Can be a single string, an array of strings,
or an array of multi-modal input objects similar to other models.
model:
type: string
description: >-
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
properties:
id:
type: string
description: >-
The unique identifier for the moderation request.
model:
type: string
description: >-
The model used to generate the moderation results.
results:
type: array
items:
$ref: '#/components/schemas/ModerationObjectResults'
description: A list of moderation result objects.
additionalProperties: false
required:
- id
- model
- results
title: ModerationObject
description: A moderation object.
ModerationObjectResults:
type: object
properties:
flagged:
type: boolean
description: >-
Whether any of the below categories are flagged.
categories:
type: object
additionalProperties:
type: boolean
description: >-
A list of the categories, and whether they are flagged or not.
category_applied_input_types:
type: object
additionalProperties:
type: array
items:
type: string
description: >-
A list of the categories along with the input type(s) that the score applies
to.
category_scores:
type: object
additionalProperties:
type: number
description: >-
A list of the categories along with their scores as predicted by the model.
Required set of categories that need to be in response - violence - violence/graphic
- harassment - harassment/threatening - hate - hate/threatening - illicit
- illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent
- self-harm/instructions
user_message:
type: string
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- flagged
- metadata
title: ModerationObjectResults
description: A moderation result object.
RunShieldRequest:
type: object
properties:
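
Finally, a small sketch of consuming a parsed response shaped like the `ModerationObject` / `ModerationObjectResults` schemas above: it walks the `results` list and collects the category names that were flagged. The function name and dict shape are illustrative, not part of the PR:

```python
# Sketch only: collect flagged category names from a parsed ModerationObject
# (a plain dict shaped like the schema above).
def flagged_categories(moderation_object: dict) -> list[str]:
    names: list[str] = []
    for result in moderation_object["results"]:
        if not result["flagged"]:
            continue
        names.extend(
            category
            for category, is_flagged in result["categories"].items()
            if is_flagged
        )
    return names


# e.g. flagged_categories(moderation_object) -> ["illicit"]
```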