Back to Blog
Best Practices

Prompt Engineering for Production

Scale prompt engineering from notebooks to production systems

Published July 1, 2026 · 11 min read

Challenge: Prompts that work in ChatGPT fail in production. This guide covers versioning, testing, monitoring, and scaling prompt strategies.

The Problem: From Ad-Hoc to Production

Typical notebook development:

# ❌ Bad: hardcoded, no versioning
# The model name and the prompt text are string literals inside the call,
# so changing either one means editing application code.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Write a poem about AI"}]
)

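Issues: the prompt text and model name are hardcoded in the call, nothing is versioned, and the prompt cannot be tested or monitored on its own.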

Production Prompt Structure

# prompts/poetry.yaml
# Prompt definition for poem generation: model, system prompt, user
# template, and sampling parameters kept together in one file.
name: "generate_poem"
version: "1.0"
model: "gpt-4-turbo"
# System message; the loader sends it ahead of the user message.
system: |
  You are a creative poet specializing in technology poetry.
  Your poems are witty, thought-provoking, and accessible.
  
  Guidelines:
  - Use vivid metaphors
  - Keep rhythm consistent
  - Target audience: software engineers

# Placeholders {topic}, {style} and {lines} are filled in with str.format().
user_template: |
  Write a poem about {topic} in the style of {style}.
  Poem should be {lines} lines long.

# Sampling settings read by the code that builds the completion request.
parameters:
  temperature: 0.7
  max_tokens: 500
  top_p: 0.9

Then load and use:

import yaml

def load_prompt(name: str, version: "str | None" = None):
    """Load a prompt definition from the ``prompts/`` directory.

    Args:
        name: Prompt name. Without ``version`` the file read is
            ``prompts/<name>.yaml``.
        version: Optional version label such as ``"v2"``. When given, the
            file read is ``prompts/<name>/<version>.yaml``.

    Returns:
        The parsed YAML document, as returned by ``yaml.safe_load``.

    Raises:
        FileNotFoundError: If the prompt file does not exist.
    """
    if version is None:
        path = f"prompts/{name}.yaml"
    else:
        path = f"prompts/{name}/{version}.yaml"

    # Prompt text routinely contains non-ASCII characters; do not depend on
    # the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)

def generate_poem(topic: str, style: str, lines: int = 8):
    """Generate a poem using the prompt stored in ``prompts/poetry.yaml``.

    Args:
        topic: Subject of the poem; substituted for ``{topic}``.
        style: Poetic style; substituted for ``{style}``.
        lines: Requested number of lines; substituted for ``{lines}``.

    Returns:
        The message content of the first completion choice.
    """
    config = load_prompt("poetry")

    user_message = config["user_template"].format(
        topic=topic,
        style=style,
        lines=lines,
    )

    # Forward the sampling parameters declared in the prompt file.
    # top_p is optional there, but when it is declared it must reach the
    # request so the call matches the configuration.
    params = config["parameters"]
    sampling = {
        "temperature": params["temperature"],
        "max_tokens": params["max_tokens"],
    }
    if "top_p" in params:
        sampling["top_p"] = params["top_p"]

    response = client.chat.completions.create(
        model=config["model"],
        messages=[
            {"role": "system", "content": config["system"]},
            {"role": "user", "content": user_message},
        ],
        **sampling,
    )

    return response.choices[0].message.content

Testing Prompts

Manual Test Cases

# tests/test_poetry.py
import pytest

@pytest.mark.parametrize("topic,style", [
    ("AI", "cyberpunk"),
    ("Quantum computing", "haiku"),
    ("Machine learning", "Shakespearean sonnet"),
])
def test_poem_generation(topic, style):
    """Smoke-test a generated poem for length, topic mention and error text."""
    poem = generate_poem(topic, style)
    lowered = poem.lower()

    # Longer than 100 characters.
    assert len(poem) > 100

    # The topic appears in the poem, ignoring case.
    assert topic.lower() in lowered

    # None of the words that suggest an error or refusal message appear.
    for marker in ("error", "unable"):
        assert marker not in lowered

Automated Quality Checks

def evaluate_response_quality(response: str) -> dict:
    """Compute quality metrics for one LLM response.

    Returns:
        A dict with the keys ``length`` (number of whitespace-separated
        words), ``has_errors`` (whether "error" occurs in the lowercased
        text), ``coherence``, ``toxicity`` and ``relevance``.

    Raises:
        ValueError: If the coherence score is below 0.6.
    """
    word_count = len(response.split())
    mentions_error = "error" in response.lower()
    coherence = evaluate_coherence(response)
    toxicity = evaluate_toxicity(response)
    relevance = evaluate_relevance(response)

    # All scores are computed before the coherence gate is applied.
    if coherence < 0.6:
        raise ValueError("Output quality too low")

    return {
        "length": word_count,
        "has_errors": mentions_error,
        "coherence": coherence,
        "toxicity": toxicity,
        "relevance": relevance,
    }

Versioning and A/B Testing

# Version prompts like code
# prompts/poetry/v1.yaml - Original version
# prompts/poetry/v2.yaml - Added creativity guidelines
# prompts/poetry/v3.yaml - Simplified instructions

def run_ab_test(topic: str, num_samples: int = 100):
    """Compare versions v2 and v3 of the "poetry" prompt on one topic.

    For each sample, a response is generated from each version and scored
    with ``score_response``. The mean score per version is printed, followed
    by a verdict line.

    Args:
        topic: Topic passed to ``generate_from_prompt`` for every sample.
        num_samples: Number of responses to generate per version.

    Raises:
        ValueError: If ``num_samples`` is less than 1.
    """
    if num_samples < 1:
        # Without samples the means below would divide by zero.
        raise ValueError("num_samples must be at least 1")

    # The prompt definitions do not change between samples; load each once
    # instead of on every iteration.
    prompts = {
        "v2": load_prompt_version("poetry", "v2"),
        "v3": load_prompt_version("poetry", "v3"),
    }
    results = {version: [] for version in prompts}

    for _ in range(num_samples):
        for version, prompt in prompts.items():
            response = generate_from_prompt(prompt, topic)
            results[version].append(score_response(response))

    # Plain comparison of sample means; no significance test is applied.
    v2_mean = sum(results["v2"]) / len(results["v2"])
    v3_mean = sum(results["v3"]) / len(results["v3"])

    print(f"v2 avg score: {v2_mean:.3f}")
    print(f"v3 avg score: {v3_mean:.3f}")

    # A tie keeps v2.
    if v3_mean > v2_mean:
        print("✅ v3 is better. Deploy to production.")
    else:
        print("❌ v2 is still better. Keep v2.")

Monitoring in Production

# Collect metrics on every call
class PromptLogger:
    """In-memory recorder of per-call prompt metrics with a drift check.

    Args:
        baseline: Expected average quality score. An alert is sent when the
            recent average falls below 90% of this value.
        window: Number of most recent calls averaged by ``check_drift``.

    Raises:
        ValueError: If ``window`` is less than 1.
    """

    def __init__(self, baseline: float = 0.85, window: int = 100):
        if window < 1:
            # A window of 0 would make the [-window:] slice select everything.
            raise ValueError("window must be at least 1")
        # NOTE(review): this list grows by one entry per logged call and is
        # never trimmed; confirm that is acceptable for long-running services.
        self.metrics = []
        self.baseline = baseline
        self.window = window

    def log_call(self, prompt_name: str, response: str, latency: float):
        """Append one metrics record for a completed call.

        Args:
            prompt_name: Stored under the ``"prompt"`` key.
            response: Response text; its length and quality score are stored.
            latency: Stored unchanged under the ``"latency_ms"`` key.
        """
        self.metrics.append({
            "timestamp": datetime.now(),
            "prompt": prompt_name,
            "response_length": len(response),
            "latency_ms": latency,
            "quality_score": score_response(response)
        })

    def check_drift(self):
        """Send an alert if the recent average quality has degraded.

        Does nothing when no calls have been logged yet.
        """
        recent = self.metrics[-self.window:]
        if not recent:
            # Nothing logged yet; there is no average to compare.
            return

        recent_quality = sum(m["quality_score"] for m in recent) / len(recent)

        if recent_quality < self.baseline * 0.9:  # 10% drop
            send_alert(f"Quality degraded: {recent_quality:.2f}")


logger = PromptLogger()

# In your API
@app.post("/generate-poem")
def api_generate_poem(topic: str, style: str = "free verse"):
    """Generate a poem, record metrics for the call, and return the poem.

    Args:
        topic: Subject of the poem.
        style: Poetic style passed to ``generate_poem``, which requires it.

    Returns:
        A dict with the generated text under the ``"poem"`` key.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    result = generate_poem(topic, style)
    latency = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    logger.log_call("poetry", result, latency)
    logger.check_drift()

    return {"poem": result}

Common Patterns

1. Few-Shot Prompting

System: You are a sentiment classifier.

Examples:
Text: "I love this product!" → Sentiment: Positive
Text: "Worst experience ever" → Sentiment: Negative

Now classify:
Text: {user_input} → Sentiment:

2. Chain-of-Thought

Think step by step:
1. What is the question asking?
2. What information do I need?
3. What is the answer?

Question: {user_question}

3. Role-Based Prompting

You are an expert {role} with {years} years of experience.
Your task: {task}
Constraints: {constraints}

Cost Optimization

Tip: Longer prompts = more tokens = higher cost

  • Use concise system prompts
  • Cache repeated context with prompt caching APIs
  • Use cheaper models (gpt-3.5) for simple tasks
  • Batch requests when possible

Scaling Across Teams

# Central prompt repository
# prompts/
#   ├── customer-service/
#   │   ├── v1.yaml
#   │   ├── v2.yaml
#   │   └── tests.py
#   ├── content-generation/
#   │   ├── v1.yaml
#   │   └── tests.py
#   └── translation/
#       └── v1.yaml

# Anyone can load any prompt
from prompts import load_prompt
prompt = load_prompt("customer-service", version="v2")

Learn Prompt Engineering at Scale

Master production prompt engineering with real projects:

Master Production Prompt Engineering

Build reliable LLM applications with versioning, testing, and monitoring.

Start Prompt Engineering Course →