# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from collections.abc import AsyncIterator
from typing import Annotated, Any, Literal, Protocol, runtime_checkable

from pydantic import BaseModel

from llama_stack_api.common.responses import Order
from llama_stack_api.schema_utils import ExtraBodyField, json_schema_type, webmethod
from llama_stack_api.version import LLAMA_STACK_API_V1

from .openai_responses import (
    ListOpenAIResponseInputItem,
    ListOpenAIResponseObject,
    OpenAIDeleteResponseObject,
    OpenAIResponseInput,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
    OpenAIResponsePrompt,
    OpenAIResponseText,
)


@json_schema_type
class ResponseGuardrailSpec(BaseModel):
    """Specification for a guardrail to apply during response generation.

    Production-focused configuration allowing safety, moderation, and policy controls.

    Fields
    ------
    type: Identifier for the guardrail implementation (e.g. 'llama-guard', 'content-filter').
    description: Human-readable explanation / purpose.
    enabled: Whether enforcement is active (default True).
    severity: Severity classification for violations (info | warn | block).
    action: Action taken when a violation occurs (flag | block | redact | annotate). If omitted, the provider default applies.
    policy_id: Optional external policy/reference identifier to map violations to organizational policy.
    version: Optional version of this guardrail configuration (for audit/rollback).
    categories: List of safety/moderation categories this guardrail targets (e.g. ['violence', 'self-harm']).
    thresholds: Per-category numeric thresholds (e.g. {'violence': 0.8}). Semantics depend on the provider.
    max_violations: If set, cap on the number of violations before early termination.
    config: Provider/model-specific free-form settings (nesting allowed).
    tags: Arbitrary labels to assist analytics/telemetry and routing.
    metadata: Arbitrary supplemental structured metadata for downstream logging.
    """

    type: str
    description: str | None = None
    enabled: bool = True
    severity: Literal["info", "warn", "block"] | None = None
    action: Literal["flag", "block", "redact", "annotate"] | None = None
    policy_id: str | None = None
    version: str | None = None
    categories: list[str] | None = None
    thresholds: dict[str, float] | None = None
    max_violations: int | None = None
    config: dict[str, Any] | None = None
    tags: list[str] | None = None
    metadata: dict[str, Any] | None = None

    model_config = {
        "extra": "forbid",
        "title": "ResponseGuardrailSpec",
    }

    @classmethod
    def _non_empty(cls, value: str, field_name: str) -> str:
        # Internal helper: reject empty or whitespace-only strings.
        if not value or not value.strip():
            raise ValueError(f"{field_name} cannot be empty")
        return value

    @classmethod
    def validate(cls, value: Any) -> "ResponseGuardrailSpec":
        # Pydantic v2 performs validation via model validators; this shim keeps
        # direct calls working by delegating to model_validate.
        if isinstance(value, cls):
            return value
        return cls.model_validate(value)

    def normalized(self) -> "ResponseGuardrailSpec":
        """Return a normalized copy (e.g., lower-cased categories, stripped whitespace)."""
        if not self.categories:
            return self
        return self.model_copy(update={"categories": [c.strip().lower() for c in self.categories]})


ResponseGuardrail = str | ResponseGuardrailSpec
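

# Illustrative sketch (not part of the published API): one way a caller might coerce
# the `ResponseGuardrail` union -- either a bare guardrail identifier string or a full
# spec -- into normalized ResponseGuardrailSpec objects. The helper name and the
# treatment of bare strings as identifiers are assumptions for demonstration only.
def _example_coerce_guardrails(guardrails: list[ResponseGuardrail]) -> list[ResponseGuardrailSpec]:
    specs: list[ResponseGuardrailSpec] = []
    for guardrail in guardrails:
        if isinstance(guardrail, str):
            # A bare string is shorthand for "apply the guardrail with this identifier".
            specs.append(ResponseGuardrailSpec(type=guardrail))
        else:
            specs.append(guardrail.normalized())
    return specs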


@runtime_checkable
class Agents(Protocol):
    """Agents APIs for creating and interacting with agentic systems."""

    # We situate the OpenAI Responses API in the Agents API just as we did for
    # Inference. In intent, the Responses API serves the same purpose as the
    # Agents API above -- it is essentially a lightweight "agentic loop" with
    # integrated tool calling.
    #
    # Both of these APIs are inherently stateful.

    @webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def get_openai_response(
        self,
        response_id: str,
    ) -> OpenAIResponseObject:
        """Get a model response.

        :param response_id: The ID of the OpenAI response to retrieve.
        :returns: An OpenAIResponseObject.
        """
        ...

    @webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
    async def create_openai_response(
        self,
        input: str | list[OpenAIResponseInput],
        model: str,
        prompt: OpenAIResponsePrompt | None = None,
        instructions: str | None = None,
        parallel_tool_calls: bool | None = True,
        previous_response_id: str | None = None,
        conversation: str | None = None,
        store: bool | None = True,
        stream: bool | None = False,
        temperature: float | None = None,
        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
        include: list[str] | None = None,
        max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
        guardrails: Annotated[
            list[ResponseGuardrail] | None,
            ExtraBodyField(
                "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
            ),
        ] = None,
        max_tool_calls: int | None = None,
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a model response.

        :param input: Input message(s) to create the response.
        :param model: The underlying LLM used for completions.
        :param prompt: (Optional) Prompt object with ID, version, and variables.
        :param instructions: (Optional) System instructions inserted into the model's context.
        :param parallel_tool_calls: (Optional) Whether the model may call tools in parallel.
        :param previous_response_id: (Optional) If specified, the new response will be a continuation of the previous
            response. This can be used to easily fork off new responses from existing responses.
        :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'.
            Input and output messages will be automatically added to the conversation.
        :param store: (Optional) Whether to store the response so it can be retrieved later.
        :param stream: (Optional) Whether to stream the response as it is generated instead of returning it whole.
        :param temperature: (Optional) Sampling temperature used for generation.
        :param text: (Optional) Text response configuration (e.g., output format).
        :param tools: (Optional) List of tools the model may call while generating the response.
        :param include: (Optional) Additional fields to include in the response.
        :param max_infer_iters: (Optional) Maximum number of inference iterations in the agentic loop (an extension to
            the OpenAI API).
        :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs
            (strings) or guardrail specifications.
        :param max_tool_calls: (Optional) Maximum number of total calls to built-in tools that can be processed in a
            response.
        :returns: An OpenAIResponseObject, or an AsyncIterator of OpenAIResponseObjectStream chunks when streaming.
        """
        ...
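
    # Illustrative usage (a sketch, not normative): an implementation of this protocol
    # can be driven as below. The model identifier and the 'llama-guard' guardrail id
    # are placeholder assumptions, not defaults of any provider.
    #
    #     response = await agents.create_openai_response(
    #         input="What changed in the last deploy?",
    #         model="llama-3.3-70b",
    #         guardrails=["llama-guard"],
    #         stream=False,
    #     )
    #
    # With stream=True the same call yields an AsyncIterator of
    # OpenAIResponseObjectStream events instead of a single OpenAIResponseObject.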

    @webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
    async def list_openai_responses(
        self,
        after: str | None = None,
        limit: int | None = 50,
        model: str | None = None,
        order: Order | None = Order.desc,
    ) -> ListOpenAIResponseObject:
        """List all responses.

        :param after: A response ID used as a pagination cursor; only responses after this one are returned.
        :param limit: The maximum number of responses to return.
        :param model: The model to filter responses by.
        :param order: The order to sort responses by when sorted by created_at ('asc' or 'desc').
        :returns: A ListOpenAIResponseObject.
        """
        ...

    @webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
    async def list_openai_response_input_items(
        self,
        response_id: str,
        after: str | None = None,
        before: str | None = None,
        include: list[str] | None = None,
        limit: int | None = 20,
        order: Order | None = Order.desc,
    ) -> ListOpenAIResponseInputItem:
        """List input items.

        :param response_id: The ID of the response to retrieve input items for.
        :param after: An item ID to list items after, used for pagination.
        :param before: An item ID to list items before, used for pagination.
        :param include: Additional fields to include in the response.
        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the
            default is 20.
        :param order: The order to return the input items in. Default is desc.
        :returns: A ListOpenAIResponseInputItem.
        """
        ...

    @webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
        """Delete a response.

        :param response_id: The ID of the OpenAI response to delete.
        :returns: An OpenAIDeleteResponseObject.
        """
        ...
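

# Illustrative sketch (not part of the published API): an end-to-end walk through the
# response lifecycle against any implementation of the `Agents` protocol. The model
# identifier, the 'llama-guard' guardrail id, and the page size are placeholder
# assumptions chosen for demonstration only.
async def _example_response_lifecycle(agents: Agents) -> None:
    # Create a non-streaming response with a guardrail applied. With stream left at its
    # default (False), the result is a single OpenAIResponseObject rather than an async
    # iterator of stream events.
    response = await agents.create_openai_response(
        input="Summarize the open review comments.",
        model="llama-3.3-70b",  # placeholder model identifier
        guardrails=["llama-guard"],  # bare string form; a full ResponseGuardrailSpec also works
    )
    assert isinstance(response, OpenAIResponseObject)

    # Fetch the first page of input items that fed this response (a ListOpenAIResponseInputItem).
    await agents.list_openai_response_input_items(response_id=response.id, limit=20)

    # Clean up the stored response once it is no longer needed.
    await agents.delete_openai_response(response_id=response.id)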