Bedrock API & boto3
Duration: 55 min
The Bedrock API provides two main ways to invoke models: InvokeModel for single requests and Converse for multi-turn conversations. This module covers both APIs, streaming responses, and error handling with practical boto3 examples.
InvokeModel API
The InvokeModel operation sends a request to a foundation model and returns the response. Each model has its own request/response format.
Basic InvokeModel Call
import boto3
import json
# Runtime client used for model inference calls.
client = boto3.client('bedrock-runtime', region_name='us-east-1')

# Invoke Claude 3 Sonnet. Anthropic models on Bedrock take a Messages API
# body, and "anthropic_version" must be the literal "bedrock-2023-05-31".
response = client.invoke_model(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [
            {
                "role": "user",
                "content": "What is machine learning?"
            }
        ]
    })
)

# Parse response: the body is a stream, so read it before decoding the JSON.
result = json.loads(response['body'].read())
print(result['content'][0]['text'])
Model-Specific Request Formats
Different models expect different request formats:
# Claude format (Anthropic Messages API).
# "anthropic_version" must be the literal "bedrock-2023-05-31".
claude_body = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 1024,
    "messages": [{"role": "user", "content": "Hello"}],
}
# Llama format: a raw prompt string plus sampling controls.
llama_body = {
    "prompt": "What is AI?",
    "max_gen_len": 512,   # cap on generated tokens
    "temperature": 0.7,
    "top_p": 0.9,
}

# Mistral format: a raw prompt string with its own length/sampling keys.
mistral_body = {
    "prompt": "Explain quantum computing",
    "max_tokens": 512,    # cap on generated tokens
    "temperature": 0.7,
}
# Titan format
titan_body = {
"inputText": "Summarize AWS",
"textGenerationConfig": {
"maxTokenCount": 512,
"temperature": 0.7,
"topP": 0.9
}
}
Converse API (Multi-turn Conversations)
The Converse API simplifies multi-turn conversations by providing one consistent message format that works across supported models, so you do not have to build a model-specific request body.
import boto3
client = boto3.client('bedrock-runtime', region_name='us-east-1')

# Start a conversation. In the Converse API, "content" is a list of content
# blocks (here a single text block), not a bare string.
messages = [
    {
        "role": "user",
        "content": [{"text": "What is AWS Bedrock?"}]
    }
]
response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=messages,
    # System prompts are likewise passed as a list of text blocks.
    system=[{"text": "You are a helpful AWS expert."}],
    inferenceConfig={
        "maxTokens": 1024,
        "temperature": 0.7
    }
)
assistant_message = response['output']['message']['content'][0]['text']
print(assistant_message)

# Continue conversation: append the assistant's reply and the next user turn
# so the model sees the full history on the second call.
messages.append({
    "role": "assistant",
    "content": [{"text": assistant_message}]
})
messages.append({
    "role": "user",
    "content": [{"text": "How does it compare to OpenAI?"}]
})
response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=messages,
    system=[{"text": "You are a helpful AWS expert."}],
    inferenceConfig={
        "maxTokens": 1024,
        "temperature": 0.7
    }
)
print(response['output']['message']['content'][0]['text'])
Streaming Responses
For long responses, stream tokens as they're generated instead of waiting for the full response.
# Streaming with InvokeModel
response = client.invoke_model_with_response_stream(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [
            {"role": "user", "content": "Write a 500-word essay on AI"}
        ]
    })
)

# Process stream. Each event wraps a JSON payload in event['chunk']['bytes'];
# for Claude, generated text arrives in payloads of type "content_block_delta".
for event in response['body']:
    chunk = event.get('chunk')
    if not chunk:
        # Non-chunk events carry no model output.
        continue
    payload = json.loads(chunk['bytes'])
    if payload.get('type') == 'content_block_delta':
        delta = payload['delta']
        if 'text' in delta:
            print(delta['text'], end='', flush=True)
print()  # Newline after streaming completes
Converse Stream API
# Streaming with Converse. As with converse(), each message's "content" is a
# list of content blocks rather than a bare string.
response = client.converse_stream(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=[
        {
            "role": "user",
            "content": [{"text": "Explain quantum computing in detail"}]
        }
    ],
    inferenceConfig={
        "maxTokens": 2048,
        "temperature": 0.7
    }
)
# Process stream
for event in response['stream']:
    if 'contentBlockDelta' in event:
        print(event['contentBlockDelta']['delta']['text'], end='', flush=True)
Error Handling
import boto3
from botocore.exceptions import ClientError
client = boto3.client('bedrock-runtime', region_name='us-east-1')
try:
response = client.invoke_model(
modelId='anthropic.claude-3-sonnet-20240229-v1:0',
body=json.dumps({
"anthropic_version": "bedrock-2023-06-01",
"max_tokens": 1024,
"messages": [{"role": "user", "content": "Hello"}]
})
)
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == 'AccessDeniedException':
print("Model access not enabled. Request access in AWS Console.")
elif error_code == 'ThrottlingException':
print("Rate limit exceeded. Implement exponential backoff.")
elif error_code == 'ValidationException':
print("Invalid request format. Check model ID and body.")
else:
print(f"Error: {error_code}")Token Counting
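Estimate costs by measuring token usage with a minimal invocation (max_tokens set to 1). Note that this is a real, billed call to the model rather than an offline count: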
# Count tokens for Claude. This performs a real invocation capped at one
# output token and reads the counts from the response's "usage" field.
response = client.invoke_model(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1,
        "messages": [
            {"role": "user", "content": "What is AI?"}
        ]
    })
)
result = json.loads(response['body'].read())
input_tokens = result['usage']['input_tokens']
output_tokens = result['usage']['output_tokens']
print(f"Input: {input_tokens}, Output: {output_tokens}")
API Response Structure
{
"claude_response": {
"content": [
{
"type": "text",
"text": "Response text here"
}
],
"usage": {
"input_tokens": 10,
"output_tokens": 50
},
"stop_reason": "end_turn"
},
"llama_response": {
"generation": "Response text here",
"prompt_token_count": 10,
"generation_token_count": 50,
"stop_reason": "length"
},
"converse_response": {
"output": {
"message": {
"role": "assistant",
"content": [
{
"text": "Response text here"
}
]
}
},
"usage": {
"inputTokens": 10,
"outputTokens": 50
},
"stopReason": "end_turn"
}
}
❓ What is the main advantage of the Converse API over InvokeModel?
❓ Which method should you use for long-running responses to improve user experience?
❓ What error indicates that a model hasn't been enabled in your AWS account?
❓ How can you estimate the cost of an API call before invoking?