Module 7 of 11 · AWS Bedrock — Build with Foundation Models · Intermediate

Guardrails & Safety

Duration: 50 min

Bedrock Guardrails help you control model behavior, filter harmful content, detect PII, and enforce safety policies. This module covers guardrail configuration, content filtering, and compliance patterns.

What are Guardrails?

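Guardrails are safety mechanisms that filter harmful content, block denied topics, detect and redact PII, and block specific words or phrases in both user input and model output.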

Creating a Guardrail

import boto3
import json

# Control-plane client: guardrails are created and managed through the
# 'bedrock' service (model invocation goes through 'bedrock-runtime').
client = boto3.client('bedrock', region_name='us-east-1')

# Create a guardrail combining four policy types: denied topics, content
# filters, PII handling, and word filters.
response = client.create_guardrail(
    name='customer-support-guardrail',
    description='Safety guardrail for customer support chatbot',
    # Denied topics: each topic is described in natural language plus a few
    # example prompts so the service can recognise it.
    topicPolicyConfig={
        'topicsConfig': [
            {
                'name': 'financial-advice',
                'definition': 'Providing investment or financial advice',
                'examples': [
                    'Should I buy this stock?',
                    'What cryptocurrency should I invest in?'
                ],
                'type': 'DENY'
            },
            {
                'name': 'medical-advice',
                'definition': 'Providing medical or health advice',
                'examples': [
                    'What medicine should I take?',
                    'Is this symptom serious?'
                ],
                'type': 'DENY'
            }
        ]
    },
    # Content filters: strength is set separately for user input and
    # model output.
    contentPolicyConfig={
        'filtersConfig': [
            {
                'type': 'VIOLENCE',
                'inputStrength': 'HIGH',
                'outputStrength': 'HIGH'
            },
            {
                'type': 'HATE',
                'inputStrength': 'HIGH',
                'outputStrength': 'HIGH'
            },
            {
                'type': 'SEXUAL',
                'inputStrength': 'MEDIUM',
                'outputStrength': 'MEDIUM'
            },
            {
                'type': 'INSULTS',
                'inputStrength': 'MEDIUM',
                'outputStrength': 'MEDIUM'
            }
        ]
    },
    # PII handling. The entity type must be one of the service's enum values;
    # short forms such as 'SSN' or 'CREDIT_CARD' are rejected.
    sensitiveInformationPolicyConfig={
        'piiEntitiesConfig': [
            {
                'type': 'EMAIL',
                'action': 'ANONYMIZE'
            },
            {
                'type': 'PHONE',
                'action': 'ANONYMIZE'
            },
            {
                'type': 'US_SOCIAL_SECURITY_NUMBER',
                'action': 'BLOCK'
            },
            {
                'type': 'CREDIT_DEBIT_CARD_NUMBER',
                'action': 'BLOCK'
            }
        ]
    },
    # Word filters. Custom words are given by 'text' only (there is no
    # per-word 'action' key); the managed-list key is 'managedWordListsConfig'.
    wordPolicyConfig={
        'wordsConfig': [
            {
                'text': 'competitor-name'
            }
        ],
        'managedWordListsConfig': [
            {
                'type': 'PROFANITY'
            }
        ]
    },
    # Both messages are required by CreateGuardrail: they are what the caller
    # receives when the prompt or the model response is blocked.
    blockedInputMessaging="I can't help with that request.",
    blockedOutputsMessaging="I can't help with that request."
)

# A newly created guardrail only has its working draft ('DRAFT'); numbered
# versions exist only after publishing one with create_guardrail_version.
guardrail_id = response['guardrailId']
print(f"Guardrail ID: {guardrail_id}")

Using Guardrails with Models

# Apply guardrail to model invocation.
# InvokeModel belongs to the data-plane 'bedrock-runtime' service, so it needs
# its own client; the control-plane 'bedrock' client has no invoke_model.
runtime_client = boto3.client('bedrock-runtime', region_name='us-east-1')

response = runtime_client.invoke_model(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    body=json.dumps({
        # Version string required by the Anthropic Messages API on Bedrock.
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [
            {"role": "user", "content": "What is AWS Bedrock?"}
        ]
    }),
    guardrailIdentifier=guardrail_id,
    # Use the working draft: a numbered version such as '1' only exists after
    # it has been published with create_guardrail_version.
    guardrailVersion='DRAFT'
)

# The response body is a stream containing the model's JSON reply.
result = json.loads(response['body'].read())
print(result['content'][0]['text'])

Content Filtering

# Configure content filters
# Filter strength per harmful-content category, set independently for the
# user's input and the model's output.
content_filters = dict(
    # Strict on both sides.
    VIOLENCE=dict(inputStrength='HIGH', outputStrength='HIGH'),
    HATE=dict(inputStrength='HIGH', outputStrength='HIGH'),
    # Medium filtering on both sides.
    SEXUAL=dict(inputStrength='MEDIUM', outputStrength='MEDIUM'),
    # Light filtering on input, medium on output.
    INSULTS=dict(inputStrength='LOW', outputStrength='MEDIUM'),
    MISCONDUCT=dict(inputStrength='HIGH', outputStrength='HIGH'),
)

# Strength levels (a higher strength filters more content):
# HIGH: Strictest filtering; also blocks subtle or borderline violations
# MEDIUM: Moderate filtering
# LOW: Minimal filtering, only obvious violations
# NONE: No filtering

PII Detection & Redaction

# Configure PII handling
# PII handling per entity type. Keys are Bedrock PII entity type names;
# 'ANONYMIZE' masks the detected value, 'BLOCK' rejects the whole request.
pii_config = {
    'EMAIL': {
        'action': 'ANONYMIZE'  # Mask the address with a placeholder
    },
    'PHONE': {
        'action': 'ANONYMIZE'  # Mask the number with a placeholder
    },
    'NAME': {
        'action': 'ANONYMIZE'  # Mask the name with a placeholder
    },
    # Bedrock's enum name for social security numbers (not 'SSN').
    'US_SOCIAL_SECURITY_NUMBER': {
        'action': 'BLOCK'      # Block the entire request
    },
    # Bedrock's enum name for card numbers (not 'CREDIT_CARD').
    'CREDIT_DEBIT_CARD_NUMBER': {
        'action': 'BLOCK'      # Block the entire request
    },
    'IP_ADDRESS': {
        'action': 'ANONYMIZE'
    },
    # Bedrock's enum name for driver's licence numbers (not 'DRIVER_LICENSE').
    'DRIVER_ID': {
        'action': 'BLOCK'
    }
}

# Example: User input with PII
user_input = "My email is john@example.com and my phone is 555-1234"

# After guardrail processing (illustrative; the exact placeholder format is
# chosen by the service):
# "My email is [EMAIL] and my phone is [PHONE]"

Topic Policies

# Define denied topics
# Topics the assistant must refuse. Each entry names the topic, describes it
# in plain language, and lists sample prompts that belong to it.
denied_topics = [
    dict(
        name='illegal-activities',
        definition='Instructions for illegal activities',
        examples=[
            'How to make explosives',
            'How to hack into systems',
            'How to forge documents',
        ],
        type='DENY',
    ),
    dict(
        name='self-harm',
        definition='Content promoting self-harm',
        examples=[
            'How to hurt myself',
            'Methods of suicide',
        ],
        type='DENY',
    ),
]

# Define allowed topics (optional)
# Same entry shape as a denied topic: a name, a plain-language definition,
# and sample prompts that belong to the topic.
# NOTE(review): the CreateGuardrail API reference appears to list only 'DENY'
# as a valid topic type - confirm 'ALLOW' is accepted before passing this
# list in topicsConfig.
allowed_topics = [
    {
        'name': 'product-support',
        'definition': 'Questions about our products',
        'examples': [
            'How do I use feature X?',
            'What are the system requirements?'
        ],
        'type': 'ALLOW'
    }
]

Word Filters

# Custom word list
# Word policy in the shape expected by create_guardrail's wordPolicyConfig.
word_filters = {
    # Custom words/phrases to block. Entries are identified by 'text' only;
    # the API has no per-word 'action' key.
    'wordsConfig': [
        {
            'text': 'competitor-name'
        },
        {
            'text': 'internal-project-name'
        }
    ],
    # The key is plural: 'managedWordListsConfig'.
    'managedWordListsConfig': [
        {
            'type': 'PROFANITY'  # AWS-managed profanity list
        }
    ]
}

Guardrail Response Handling

from botocore.exceptions import ClientError

def invoke_with_guardrail(user_input, guardrail_id, guardrail_version='DRAFT',
                          runtime_client=None):
    """Send one user message to Claude with a guardrail and return the reply.

    Args:
        user_input: Text of the user's message.
        guardrail_id: Identifier of the guardrail to apply.
        guardrail_version: Guardrail version to apply. Defaults to 'DRAFT',
            the only version a newly created guardrail has.
        runtime_client: Optional 'bedrock-runtime' client. One is created for
            us-east-1 when omitted.

    Returns:
        The model's text reply, or a fixed refusal message when the guardrail
        intervened.

    Raises:
        botocore.exceptions.ClientError: For any service error other than the
            guardrail-interception code handled below.
    """
    if runtime_client is None:
        # invoke_model lives on the data-plane service, not on 'bedrock'.
        runtime_client = boto3.client('bedrock-runtime', region_name='us-east-1')

    try:
        response = runtime_client.invoke_model(
            modelId='anthropic.claude-3-sonnet-20240229-v1:0',
            body=json.dumps({
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1024,
                "messages": [
                    {"role": "user", "content": user_input}
                ]
            }),
            guardrailIdentifier=guardrail_id,
            guardrailVersion=guardrail_version
        )
    except ClientError as e:
        # NOTE(review): kept from the original snippet - confirm the service
        # actually raises this code; interventions normally come back as a
        # successful response (handled below).
        if e.response['Error']['Code'] == 'GuardrailInterceptedException':
            print("Guardrail blocked the request")
            return "Request violates safety policies"
        # Anything else (throttling, access denied, validation, ...) must not
        # be swallowed silently.
        raise

    result = json.loads(response['body'].read())

    # With InvokeModel the guardrail outcome is reported inside the JSON body
    # under 'amazon-bedrock-guardrailAction' ('INTERVENED' or 'NONE').
    if result.get('amazon-bedrock-guardrailAction') == 'INTERVENED':
        print("Request blocked by guardrail")
        return "I can't help with that request."

    return result['content'][0]['text']


# Example:
#   reply = invoke_with_guardrail(user_input, guardrail_id)

Best Practices

# ✅ Good: Layered safety approach
# Conceptual summary of a layered setup: content filters, PII rules, and
# denied topics working together.
guardrail_config = dict(
    contentFilters=dict(
        VIOLENCE='HIGH',
        HATE='HIGH',
        SEXUAL='MEDIUM',
    ),
    piiFilters=dict(
        EMAIL='ANONYMIZE',
        PHONE='ANONYMIZE',
        SSN='BLOCK',
    ),
    topicFilters=['illegal-activities', 'self-harm'],
)

# ✅ Good: Regular guardrail updates
# Review and update guardrails quarterly
# Monitor blocked requests for patterns
# Adjust filter strengths based on false positives

# ✅ Good: Transparent user communication
def handle_blocked_request(reason):
    """Return the user-facing message for a guardrail block reason.

    Known reasons map to a specific explanation; any other reason yields a
    generic refusal.
    """
    replies = {
        'VIOLENCE': "I can't discuss violent content.",
        'HATE': "I can't engage with hateful content.",
        'PII_BLOCKED': "Please don't share sensitive information.",
        'TOPIC_DENIED': "I'm not able to help with that topic.",
    }
    try:
        return replies[reason]
    except KeyError:
        # Unknown reason: fall back to the generic refusal.
        return "I can't help with that request."

❓ What is the primary purpose of Bedrock Guardrails?

❓ What does the ANONYMIZE action do for PII?

❓ What does a DENY topic policy do?

❓ What is the difference between HIGH and LOW filter strength?

← Previous Continue interactively → Next →

Related Courses