mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-03 19:57:35 +00:00
395 lines
10 KiB
Text
395 lines
10 KiB
Text
---
|
|
title: Safety Guardrails
|
|
description: Implement safety measures and content moderation in Llama Stack applications
|
|
sidebar_label: Safety
|
|
sidebar_position: 9
|
|
---
|
|
|
|
import Tabs from '@theme/Tabs';
|
|
import TabItem from '@theme/TabItem';
|
|
|
|
# Safety Guardrails
|
|
|
|
Safety is a critical component of any AI application. Llama Stack provides a comprehensive Shield system that can be applied at multiple touchpoints to ensure responsible AI behavior and content moderation.
|
|
|
|
## Shield System Overview
|
|
|
|
The Shield system in Llama Stack provides:
|
|
- **Content filtering** for both input and output messages
|
|
- **Multi-touchpoint protection** across your application flow
|
|
- **Configurable safety policies** tailored to your use case
|
|
- **Integration with agents** for automated safety enforcement
|
|
|
|
## Basic Shield Usage
|
|
|
|
### Registering a Safety Shield
|
|
|
|
<Tabs>
|
|
<TabItem value="registration" label="Shield Registration">
|
|
|
|
```python
|
|
# Register a safety shield
|
|
shield_id = "content_safety"
|
|
client.shields.register(
|
|
shield_id=shield_id,
|
|
provider_shield_id="llama-guard-basic"
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="manual-check" label="Manual Safety Check">
|
|
|
|
```python
|
|
# Run content through shield manually
|
|
response = client.safety.run_shield(
|
|
shield_id=shield_id,
|
|
messages=[{"role": "user", "content": "User message here"}]
|
|
)
|
|
|
|
if response.violation:
|
|
print(f"Safety violation detected: {response.violation.user_message}")
|
|
# Handle violation appropriately
|
|
else:
|
|
print("Content passed safety checks")
|
|
```
|
|
|
|
</TabItem>
|
|
</Tabs>
|
|
|
|
## Agent Integration
|
|
|
|
Shields can be automatically applied to agent interactions for seamless safety enforcement:
|
|
|
|
<Tabs>
|
|
<TabItem value="input-shields" label="Input Shields">
|
|
|
|
```python
|
|
from llama_stack_client import Agent
|
|
|
|
# Create agent with input safety shields
|
|
agent = Agent(
|
|
client,
|
|
model="meta-llama/Llama-3.2-3B-Instruct",
|
|
instructions="You are a helpful assistant",
|
|
input_shields=["content_safety"], # Shield user inputs
|
|
tools=["builtin::websearch"],
|
|
)
|
|
|
|
session_id = agent.create_session("safe_session")
|
|
|
|
# All user inputs will be automatically screened
|
|
response = agent.create_turn(
|
|
messages=[{"role": "user", "content": "Tell me about AI safety"}],
|
|
session_id=session_id,
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="output-shields" label="Output Shields">
|
|
|
|
```python
|
|
# Create agent with output safety shields
|
|
agent = Agent(
|
|
client,
|
|
model="meta-llama/Llama-3.2-3B-Instruct",
|
|
instructions="You are a helpful assistant",
|
|
output_shields=["content_safety"], # Shield agent outputs
|
|
tools=["builtin::websearch"],
|
|
)
|
|
|
|
session_id = agent.create_session("safe_session")
|
|
|
|
# All agent responses will be automatically screened
|
|
response = agent.create_turn(
|
|
messages=[{"role": "user", "content": "Help me with my research"}],
|
|
session_id=session_id,
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="both-shields" label="Input & Output Shields">
|
|
|
|
```python
|
|
# Create agent with comprehensive safety coverage
|
|
agent = Agent(
|
|
client,
|
|
model="meta-llama/Llama-3.2-3B-Instruct",
|
|
instructions="You are a helpful assistant",
|
|
input_shields=["content_safety"], # Screen user inputs
|
|
output_shields=["content_safety"], # Screen agent outputs
|
|
tools=["builtin::websearch"],
|
|
)
|
|
|
|
session_id = agent.create_session("fully_protected_session")
|
|
|
|
# Both input and output are automatically protected
|
|
response = agent.create_turn(
|
|
messages=[{"role": "user", "content": "Research question here"}],
|
|
session_id=session_id,
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
</Tabs>
|
|
|
|
## Available Shield Types
|
|
|
|
### Llama Guard Shields
|
|
|
|
Llama Guard provides state-of-the-art content safety classification:
|
|
|
|
<Tabs>
|
|
<TabItem value="basic" label="Basic Llama Guard">
|
|
|
|
```python
|
|
# Basic Llama Guard for general content safety
|
|
client.shields.register(
|
|
shield_id="llama_guard_basic",
|
|
provider_shield_id="llama-guard-basic"
|
|
)
|
|
```
|
|
|
|
**Use Cases:**
|
|
- General content moderation
|
|
- Harmful content detection
|
|
- Basic safety compliance
|
|
|
|
</TabItem>
|
|
<TabItem value="advanced" label="Advanced Llama Guard">
|
|
|
|
```python
|
|
# Advanced Llama Guard with custom categories
|
|
client.shields.register(
|
|
shield_id="llama_guard_advanced",
|
|
provider_shield_id="llama-guard-advanced",
|
|
config={
|
|
"categories": [
|
|
"violence", "hate_speech", "sexual_content",
|
|
"self_harm", "illegal_activity"
|
|
],
|
|
"threshold": 0.8
|
|
}
|
|
)
|
|
```
|
|
|
|
**Use Cases:**
|
|
- Fine-tuned safety policies
|
|
- Domain-specific content filtering
|
|
- Enterprise compliance requirements
|
|
|
|
</TabItem>
|
|
</Tabs>
|
|
|
|
### Custom Safety Shields
|
|
|
|
Create domain-specific safety shields for specialized use cases:
|
|
|
|
```python
|
|
# Register custom safety shield
|
|
client.shields.register(
|
|
shield_id="financial_compliance",
|
|
provider_shield_id="custom-financial-shield",
|
|
config={
|
|
"detect_pii": True,
|
|
"financial_advice_warning": True,
|
|
"regulatory_compliance": "FINRA"
|
|
}
|
|
)
|
|
```
|
|
|
|
## Safety Response Handling
|
|
|
|
When safety violations are detected, handle them appropriately:
|
|
|
|
<Tabs>
|
|
<TabItem value="basic-handling" label="Basic Handling">
|
|
|
|
```python
|
|
response = client.safety.run_shield(
|
|
shield_id="content_safety",
|
|
messages=[{"role": "user", "content": "Potentially harmful content"}]
|
|
)
|
|
|
|
if response.violation:
|
|
violation = response.violation
|
|
print(f"Violation Type: {violation.violation_type}")
|
|
print(f"User Message: {violation.user_message}")
|
|
print(f"Metadata: {violation.metadata}")
|
|
|
|
# Log the violation for audit purposes
|
|
logger.warning(f"Safety violation detected: {violation.violation_type}")
|
|
|
|
# Provide appropriate user feedback
|
|
return "I can't help with that request. Please try asking something else."
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="advanced-handling" label="Advanced Handling">
|
|
|
|
```python
|
|
def handle_safety_response(safety_response, user_message):
|
|
"""Advanced safety response handling with logging and user feedback"""
|
|
|
|
if not safety_response.violation:
|
|
return {"safe": True, "message": "Content passed safety checks"}
|
|
|
|
violation = safety_response.violation
|
|
|
|
# Log violation details
|
|
audit_log = {
|
|
"timestamp": datetime.now().isoformat(),
|
|
"violation_type": violation.violation_type,
|
|
"original_message": user_message,
|
|
"shield_response": violation.user_message,
|
|
"metadata": violation.metadata
|
|
}
|
|
logger.warning(f"Safety violation: {audit_log}")
|
|
|
|
# Determine appropriate response based on violation type
|
|
if violation.violation_type == "hate_speech":
|
|
user_feedback = "I can't engage with content that contains hate speech. Let's keep our conversation respectful."
|
|
elif violation.violation_type == "violence":
|
|
user_feedback = "I can't provide information that could promote violence. How else can I help you today?"
|
|
else:
|
|
user_feedback = "I can't help with that request. Please try asking something else."
|
|
|
|
return {
|
|
"safe": False,
|
|
"user_feedback": user_feedback,
|
|
"violation_details": audit_log
|
|
}
|
|
|
|
# Usage
|
|
safety_result = handle_safety_response(response, user_input)
|
|
if not safety_result["safe"]:
|
|
return safety_result["user_feedback"]
|
|
```
|
|
|
|
</TabItem>
|
|
</Tabs>
|
|
|
|
## Safety Configuration Best Practices
|
|
|
|
### 🛡️ **Multi-Layer Protection**
|
|
- Use both input and output shields for comprehensive coverage
|
|
- Combine multiple shield types for different threat categories
|
|
- Implement fallback mechanisms when shields fail
|
|
|
|
### 📊 **Monitoring & Auditing**
|
|
- Log all safety violations for compliance and analysis
|
|
- Monitor false positive rates to tune shield sensitivity
|
|
- Track safety metrics across different use cases
|
|
|
|
### ⚙️ **Configuration Management**
|
|
- Use environment-specific safety configurations
|
|
- Implement A/B testing for shield effectiveness
|
|
- Regularly update shield models and policies
|
|
|
|
### 🔧 **Integration Patterns**
|
|
- Integrate shields early in the development process
|
|
- Test safety measures with adversarial inputs
|
|
- Provide clear user feedback for violations
|
|
|
|
## Advanced Safety Scenarios
|
|
|
|
### Context-Aware Safety
|
|
|
|
```python
|
|
# Safety shields that consider conversation context
|
|
agent = Agent(
|
|
client,
|
|
model="meta-llama/Llama-3.2-3B-Instruct",
|
|
instructions="You are a healthcare assistant",
|
|
input_shields=["medical_safety"],
|
|
output_shields=["medical_safety"],
|
|
# Context helps shields make better decisions
|
|
safety_context={
|
|
"domain": "healthcare",
|
|
"user_type": "patient",
|
|
"compliance_level": "HIPAA"
|
|
}
|
|
)
|
|
```
|
|
|
|
### Dynamic Shield Selection
|
|
|
|
```python
|
|
def select_shield_for_user(user_profile):
|
|
"""Select appropriate safety shield based on user context"""
|
|
if user_profile.age < 18:
|
|
return "child_safety_shield"
|
|
elif user_profile.context == "enterprise":
|
|
return "enterprise_compliance_shield"
|
|
else:
|
|
return "general_safety_shield"
|
|
|
|
# Use dynamic shield selection
|
|
shield_id = select_shield_for_user(current_user)
|
|
response = client.safety.run_shield(
|
|
shield_id=shield_id,
|
|
messages=messages
|
|
)
|
|
```
|
|
|
|
## Compliance and Regulations
|
|
|
|
### Industry-Specific Safety
|
|
|
|
<Tabs>
|
|
<TabItem value="healthcare" label="Healthcare (HIPAA)">
|
|
|
|
```python
|
|
# Healthcare-specific safety configuration
|
|
client.shields.register(
|
|
shield_id="hipaa_compliance",
|
|
provider_shield_id="healthcare-safety-shield",
|
|
config={
|
|
"detect_phi": True, # Protected Health Information
|
|
"medical_advice_warning": True,
|
|
"regulatory_framework": "HIPAA"
|
|
}
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="financial" label="Financial (FINRA)">
|
|
|
|
```python
|
|
# Financial services safety configuration
|
|
client.shields.register(
|
|
shield_id="finra_compliance",
|
|
provider_shield_id="financial-safety-shield",
|
|
config={
|
|
"detect_financial_advice": True,
|
|
"investment_disclaimers": True,
|
|
"regulatory_framework": "FINRA"
|
|
}
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="education" label="Education (COPPA)">
|
|
|
|
```python
|
|
# Educational platform safety for minors
|
|
client.shields.register(
|
|
shield_id="coppa_compliance",
|
|
provider_shield_id="educational-safety-shield",
|
|
config={
|
|
"child_protection": True,
|
|
"educational_content_only": True,
|
|
"regulatory_framework": "COPPA"
|
|
}
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
</Tabs>
|
|
|
|
## Related Resources
|
|
|
|
- **[Agents](./agent)** - Integrating safety shields with intelligent agents
|
|
- **[Agent Execution Loop](./agent-execution-loop)** - Understanding safety in the execution flow
|
|
- **[Evaluations](./evals)** - Evaluating safety shield effectiveness
|
|
- **[Telemetry](./telemetry)** - Monitoring safety violations and metrics
|
|
- **[Llama Guard Documentation](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard3)** - Advanced safety model details
|