Guardrails & Safety
Duration: 50 min
Bedrock Guardrails help you control model behavior, filter harmful content, detect PII, and enforce safety policies. This module covers guardrail configuration, content filtering, and compliance patterns.
What are Guardrails?
Guardrails are safety mechanisms that:
- Filter harmful content (violence, hate speech, illegal activities)
- Detect and redact personally identifiable information (PII)
- Enforce topic restrictions
- Apply word filters
- Prevent jailbreak attempts
Creating a Guardrail
import boto3
import json
# Control-plane client: guardrails are created and managed through the
# 'bedrock' service endpoint.
client = boto3.client('bedrock', region_name='us-east-1')

# Create a guardrail
response = client.create_guardrail(
    name='customer-support-guardrail',
    description='Safety guardrail for customer support chatbot',
    # Denied topics: subjects the assistant must refuse to discuss.
    topicPolicyConfig={
        'topicsConfig': [
            {
                'name': 'financial-advice',
                'definition': 'Providing investment or financial advice',
                'examples': [
                    'Should I buy this stock?',
                    'What cryptocurrency should I invest in?'
                ],
                'type': 'DENY'
            },
            {
                'name': 'medical-advice',
                'definition': 'Providing medical or health advice',
                'examples': [
                    'What medicine should I take?',
                    'Is this symptom serious?'
                ],
                'type': 'DENY'
            }
        ]
    },
    # Content filters: strength is set separately for user input and for
    # model output.
    contentPolicyConfig={
        'filtersConfig': [
            {
                'type': 'VIOLENCE',
                'inputStrength': 'HIGH',
                'outputStrength': 'HIGH'
            },
            {
                'type': 'HATE',
                'inputStrength': 'HIGH',
                'outputStrength': 'HIGH'
            },
            {
                'type': 'SEXUAL',
                'inputStrength': 'MEDIUM',
                'outputStrength': 'MEDIUM'
            },
            {
                'type': 'INSULTS',
                'inputStrength': 'MEDIUM',
                'outputStrength': 'MEDIUM'
            }
        ]
    },
    # PII handling. 'type' must be one of the service-defined entity
    # identifiers; short forms such as 'SSN' or 'CREDIT_CARD' are rejected.
    sensitiveInformationPolicyConfig={
        'piiEntitiesConfig': [
            {
                'type': 'EMAIL',
                'action': 'ANONYMIZE'
            },
            {
                'type': 'PHONE',
                'action': 'ANONYMIZE'
            },
            {
                'type': 'US_SOCIAL_SECURITY_NUMBER',
                'action': 'BLOCK'
            },
            {
                'type': 'CREDIT_DEBIT_CARD_NUMBER',
                'action': 'BLOCK'
            }
        ]
    },
    # Word filters. Custom entries carry only the text to match (there is no
    # 'action' field), and the managed-list key is 'managedWordListsConfig'.
    wordPolicyConfig={
        'wordsConfig': [
            {
                'text': 'competitor-name'
            }
        ],
        'managedWordListsConfig': [
            {
                'type': 'PROFANITY'
            }
        ]
    },
    # Required by create_guardrail: the text returned to the caller when the
    # guardrail blocks a prompt or a model response.
    blockedInputMessaging="I can't help with that request.",
    blockedOutputsMessaging="I can't help with that request."
)
guardrail_id = response['guardrailId']
print(f"Guardrail ID: {guardrail_id}")

# Using Guardrails with Models
# Apply guardrail to model invocation
# invoke_model is a runtime operation, so it needs a 'bedrock-runtime'
# client; the 'bedrock' control-plane client used to create the guardrail
# does not provide it.
runtime_client = boto3.client('bedrock-runtime', region_name='us-east-1')

response = runtime_client.invoke_model(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    body=json.dumps({
        # Version string required for Anthropic models on Bedrock.
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [
            {"role": "user", "content": "What is AWS Bedrock?"}
        ]
    }),
    guardrailIdentifier=guardrail_id,
    # A newly created guardrail only has its working draft. Numbered
    # versions such as '1' exist only after create_guardrail_version.
    guardrailVersion='DRAFT'
)
# The response body is a stream holding the model's JSON payload.
result = json.loads(response['body'].read())
print(result['content'][0]['text'])

# Content Filtering
# Configure content filters
# Each row is (category, strength for user input, strength for model output).
content_filters = {
    category: {'inputStrength': on_input, 'outputStrength': on_output}
    for category, on_input, on_output in (
        ('VIOLENCE', 'HIGH', 'HIGH'),      # strict on both input and output
        ('HATE', 'HIGH', 'HIGH'),
        ('SEXUAL', 'MEDIUM', 'MEDIUM'),    # medium filtering
        ('INSULTS', 'LOW', 'MEDIUM'),      # light on input, medium on output
        ('MISCONDUCT', 'HIGH', 'HIGH'),
    )
}
# Strength levels:
# HIGH: Strict filtering, blocks most content
# MEDIUM: Moderate filtering
# LOW: Minimal filtering, only obvious violations
# NONE: No filtering

# PII Detection & Redaction
# Configure PII handling
# Keys are the Bedrock Guardrails PII entity identifiers, i.e. the values
# accepted as 'type' in sensitiveInformationPolicyConfig['piiEntitiesConfig'].
# Short forms such as 'SSN', 'CREDIT_CARD' or 'DRIVER_LICENSE' are not valid.
pii_config = {
    'EMAIL': {
        'action': 'ANONYMIZE'  # Replace with {EMAIL}
    },
    'PHONE': {
        'action': 'ANONYMIZE'  # Replace with {PHONE}
    },
    'NAME': {
        'action': 'ANONYMIZE'  # Replace with {NAME}
    },
    'US_SOCIAL_SECURITY_NUMBER': {
        'action': 'BLOCK'  # Block the entire request
    },
    'CREDIT_DEBIT_CARD_NUMBER': {
        'action': 'BLOCK'  # Block the entire request
    },
    'IP_ADDRESS': {
        'action': 'ANONYMIZE'
    },
    'DRIVER_ID': {
        'action': 'BLOCK'
    }
}
# Example: User input with PII
user_input = "My email is john@example.com and my phone is 555-1234"
# After guardrail processing (masked entities appear as identifier tags):
# "My email is {EMAIL} and my phone is {PHONE}"

# Topic Policies
# Define denied topics
# Each row is (topic name, definition, example prompts); every topic built
# here is a DENY topic.
denied_topics = [
    {
        'name': topic_name,
        'definition': topic_definition,
        'examples': sample_prompts,
        'type': 'DENY',
    }
    for topic_name, topic_definition, sample_prompts in (
        (
            'illegal-activities',
            'Instructions for illegal activities',
            [
                'How to make explosives',
                'How to hack into systems',
                'How to forge documents',
            ],
        ),
        (
            'self-harm',
            'Content promoting self-harm',
            [
                'How to hurt myself',
                'Methods of suicide',
            ],
        ),
    )
]
# Define allowed topics (optional)
# NOTE(review): Bedrock's topicPolicyConfig appears to accept only 'DENY' as
# a topic type, so these entries likely cannot be passed to create_guardrail
# as-is. Treat this list as application-level documentation of the intended
# scope unless the API reference confirms an 'ALLOW' type.
allowed_topics = [
    {
        'name': 'product-support',
        'definition': 'Questions about our products',
        'examples': [
            'How do I use feature X?',
            'What are the system requirements?'
        ],
        'type': 'ALLOW'
    }
]

# Word Filters
# Custom word list
# Shaped like the wordPolicyConfig argument of create_guardrail: custom
# entries carry only the text to match (there is no 'action' field), and the
# managed-list key is 'managedWordListsConfig'.
word_filters = {
    'wordsConfig': [
        {
            'text': 'competitor-name'
        },
        {
            'text': 'internal-project-name'
        }
    ],
    'managedWordListsConfig': [
        {
            'type': 'PROFANITY'  # AWS-managed profanity list
        }
    ]
}

# Guardrail Response Handling
from botocore.exceptions import ClientError
def invoke_with_guardrail(user_input, guardrail_id, guardrail_version='DRAFT',
                          runtime_client=None):
    """Send one user message to Claude through a guardrail and return the reply.

    Args:
        user_input: Text of the user's message.
        guardrail_id: Identifier of the guardrail to apply.
        guardrail_version: Guardrail version to apply. Defaults to 'DRAFT',
            the only version that exists until one is published.
        runtime_client: Optional 'bedrock-runtime' client to reuse. When
            omitted, a client for us-east-1 is created.

    Returns:
        The model's text reply, or a fixed refusal message when the
        guardrail intervened.

    Raises:
        ClientError: For any service error other than the guardrail
            interception code handled below.
    """
    if runtime_client is None:
        # invoke_model lives on the runtime service, not the 'bedrock'
        # control-plane client used to create guardrails.
        runtime_client = boto3.client('bedrock-runtime', region_name='us-east-1')

    try:
        response = runtime_client.invoke_model(
            modelId='anthropic.claude-3-sonnet-20240229-v1:0',
            body=json.dumps({
                # Version string required for Anthropic models on Bedrock.
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1024,
                "messages": [
                    {"role": "user", "content": user_input}
                ]
            }),
            guardrailIdentifier=guardrail_id,
            guardrailVersion=guardrail_version
        )
    except ClientError as e:
        # NOTE(review): this code is not among the documented InvokeModel
        # errors; confirm whether this branch is reachable.
        if e.response['Error']['Code'] == 'GuardrailInterceptedException':
            print("Guardrail blocked the request")
            return "Request violates safety policies"
        # Throttling, access or validation errors must not be swallowed.
        raise

    result = json.loads(response['body'].read())
    # Check if guardrail was triggered. The outcome is reported inside the
    # JSON body, not as a top-level key of the boto3 response.
    # NOTE(review): 'INTERVENED' presumably also covers PII masking; return
    # result['content'][0]['text'] here instead if masked replies should
    # reach the user.
    if result.get('amazon-bedrock-guardrailAction') == 'INTERVENED':
        print("Request blocked by guardrail")
        return "I can't help with that request."
    return result['content'][0]['text']


# Example: reply = invoke_with_guardrail(user_input, guardrail_id)

# Best Practices
# ✅ Good: Layered safety approach
# Three independent layers: content filter strengths, per-entity PII actions,
# and the names of denied topics.
guardrail_config = dict(
    contentFilters=dict(
        VIOLENCE='HIGH',
        HATE='HIGH',
        SEXUAL='MEDIUM',
    ),
    piiFilters=dict(
        EMAIL='ANONYMIZE',
        PHONE='ANONYMIZE',
        SSN='BLOCK',
    ),
    topicFilters=[
        'illegal-activities',
        'self-harm',
    ],
)
# ✅ Good: Regular guardrail updates
# Review and update guardrails quarterly
# Monitor blocked requests for patterns
# Adjust filter strengths based on false positives
# ✅ Good: Transparent user communication
# User-facing explanations keyed by the reason a request was blocked.
_BLOCKED_MESSAGES = {
    'VIOLENCE': "I can't discuss violent content.",
    'HATE': "I can't engage with hateful content.",
    'PII_BLOCKED': "Please don't share sensitive information.",
    'TOPIC_DENIED': "I'm not able to help with that topic.",
}


def handle_blocked_request(reason):
    """Return the user-facing message for a blocked request.

    Args:
        reason: Block reason code; one of 'VIOLENCE', 'HATE', 'PII_BLOCKED'
            or 'TOPIC_DENIED'.

    Returns:
        The message for ``reason``, or a generic refusal when the reason is
        not one of the known codes.
    """
    return _BLOCKED_MESSAGES.get(reason, "I can't help with that request.")


# ❓ What is the primary purpose of Bedrock Guardrails?
❓ What does the ANONYMIZE action do for PII?
❓ What does a DENY topic policy do?
❓ What is the difference between HIGH and LOW filter strength?