Prompt engineering has evolved from an art to an engineering discipline with established best practices, patterns, and evaluation frameworks. This post covers proven techniques for building reliable LLM-powered applications through effective prompt design.
The Anatomy of an Effective Prompt
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class PromptTemplate:
"""Structured prompt with clear components"""
system_message: str # Set context and behavior
task_description: str # What to do
context: Optional[str] = None # Background information
examples: Optional[List[tuple[str, str]]] = None # Few-shot examples
constraints: Optional[List[str]] = None # Rules to follow
output_format: Optional[str] = None # Expected format
def render(self, input_text: str) -> str:
"""Render complete prompt"""
parts = [self.system_message, self.task_description]
if self.context:
parts.append(f"Context:\n{self.context}")
if self.examples:
parts.append("Examples:")
for input_ex, output_ex in self.examples:
parts.append(f"Input: {input_ex}\nOutput: {output_ex}")
if self.constraints:
parts.append("Constraints:\n" + "\n".join(f"- {c}" for c in self.constraints))
if self.output_format:
parts.append(f"Output Format:\n{self.output_format}")
parts.append(f"Input: {input_text}")
return "\n\n".join(parts)
# Example: Sentiment analysis
sentiment_prompt = PromptTemplate(
system_message="You are an expert at analyzing customer feedback sentiment.",
task_description="Analyze the sentiment of the given customer review.",
examples=[
("The product exceeded my expectations!", "Positive"),
("Terrible quality, waste of money.", "Negative"),
("It's okay, nothing special.", "Neutral"),
],
constraints=[
"Choose only from: Positive, Negative, Neutral",
"Provide only the sentiment, no explanation",
],
output_format="Single word: Positive, Negative, or Neutral"
)
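To sanity-check the template, render it against a sample review (the review text below is just illustrative):

rendered = sentiment_prompt.render("Shipping was slow, but support resolved my issue quickly.")
print(rendered)
# The rendered prompt stacks the system message, task description, examples,
# constraints, and output format, and ends with the line:
# Input: Shipping was slow, but support resolved my issue quickly.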
Chain-of-Thought Prompting
Encourage step-by-step reasoning:
def create_cot_prompt(problem: str) -> str:
"""Chain-of-thought prompt for complex reasoning"""
return f"""
Solve this problem step by step. Show your reasoning clearly.
Problem: {problem}
Let's approach this systematically:
Step 1: Understand what we're being asked
[Think through the problem]
Step 2: Identify the relevant information
[List the key facts]
Step 3: Determine the approach
[Explain the method]
Step 4: Execute the solution
[Show the work]
Step 5: Verify the answer
[Check if it makes sense]
Final Answer: [Provide the solution]
"""
# Example usage
problem = "A bakery sells 150 cookies in the morning and 80% more in the afternoon. How many cookies were sold in total?"
# The CoT prompting will guide the model to:
# 1. Identify morning sales: 150
# 2. Calculate 80% of 150: 120
# 3. Calculate afternoon sales: 150 + 120 = 270
# 4. Calculate total: 150 + 270 = 420
# 5. Verify: Does 270 equal 150 + (0.8 × 150)? Yes.
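A minimal sketch of sending this prompt, assuming an LLMEngine instance named my_llm as in the later examples:

cot_prompt = create_cot_prompt(problem)
responses = await my_llm.generate([cot_prompt], temperature=0.2)
# The "Final Answer:" line can then be parsed out of responses[0];
# see extract_final_answer() in the next section.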
Self-Consistency and Multiple Sampling
Generate multiple responses and select the most consistent:
from collections import Counter
from typing import List
async def self_consistency_prompting(
llm: LLMEngine,
prompt: str,
num_samples: int = 5,
temperature: float = 0.8
) -> tuple[str, float]:
"""
Generate multiple responses and return most common answer.
Returns (answer, confidence)
"""
# Generate multiple independent samples
responses = await llm.generate(
[prompt] * num_samples,
temperature=temperature
)
    # Extract the answer from each sample (looks for a "Final Answer:" line,
    # falling back to the last line; see extract_final_answer below)
answers = [extract_final_answer(r) for r in responses]
# Find most common answer
answer_counts = Counter(answers)
most_common_answer, count = answer_counts.most_common(1)[0]
# Confidence is frequency of most common answer
confidence = count / num_samples
return most_common_answer, confidence
def extract_final_answer(response: str) -> str:
"""Extract final answer from response"""
lines = response.strip().split('\n')
for line in reversed(lines):
if line.startswith("Final Answer:"):
return line.replace("Final Answer:", "").strip()
return lines[-1].strip()
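Combined with the chain-of-thought prompt above, a usage sketch (again assuming my_llm) looks like this:

answer, confidence = await self_consistency_prompting(
    my_llm,
    create_cot_prompt(problem),
    num_samples=5,
)
print(f"Answer: {answer} (agreement: {confidence:.0%})")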
Few-Shot Learning Patterns
from typing import List

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class FewShotPromptBuilder:
"""Build few-shot prompts with dynamic example selection"""
def __init__(self, example_pool: List[tuple[str, str]]):
self.example_pool = example_pool
self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Pre-compute example embeddings
example_texts = [ex[0] for ex in example_pool]
self.example_embeddings = self.embedding_model.encode(example_texts)
def build_prompt(
self,
input_text: str,
num_examples: int = 3,
selection_method: str = "semantic"
) -> str:
"""Build prompt with relevant examples"""
if selection_method == "semantic":
examples = self._select_semantic_examples(input_text, num_examples)
elif selection_method == "diverse":
examples = self._select_diverse_examples(num_examples)
else:
examples = self.example_pool[:num_examples]
# Build prompt
prompt_parts = ["Given input-output pairs, respond to the new input:\n"]
for inp, out in examples:
prompt_parts.append(f"Input: {inp}\nOutput: {out}\n")
prompt_parts.append(f"Input: {input_text}\nOutput:")
return "\n".join(prompt_parts)
def _select_semantic_examples(
self,
input_text: str,
num_examples: int
) -> List[tuple[str, str]]:
"""Select most semantically similar examples"""
input_embedding = self.embedding_model.encode([input_text])[0]
# Compute similarities
similarities = cosine_similarity(
[input_embedding],
self.example_embeddings
)[0]
# Get top k
top_indices = similarities.argsort()[-num_examples:][::-1]
return [self.example_pool[i] for i in top_indices]
def _select_diverse_examples(
self,
num_examples: int
) -> List[tuple[str, str]]:
"""Select diverse examples to cover different patterns"""
# Use clustering or maximum margin selection
# Simplified: just sample randomly
import random
return random.sample(self.example_pool, num_examples)
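Usage with a small, purely illustrative example pool for ticket classification:

builder = FewShotPromptBuilder(example_pool=[
    ("The checkout page keeps crashing.", "Bug Report"),
    ("Can you add a dark mode?", "Feature Request"),
    ("How do I reset my password?", "Question"),
    ("Thanks for the quick fix!", "Praise"),
])
prompt = builder.build_prompt(
    "The app freezes whenever I upload a photo.",
    num_examples=2,
    selection_method="semantic",
)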
Structured Output with JSON
import json
from pydantic import BaseModel, Field
from typing import List
class ExtractedEntity(BaseModel):
text: str = Field(description="The entity text")
type: str = Field(description="Entity type: PERSON, ORG, LOCATION, DATE")
confidence: float = Field(description="Confidence score 0-1")
class ExtractionResult(BaseModel):
entities: List[ExtractedEntity]
summary: str
def create_structured_extraction_prompt(text: str) -> str:
"""Prompt for structured entity extraction"""
schema = ExtractionResult.schema_json(indent=2)
return f"""
Extract named entities from the following text and return a JSON object.
Text: {text}
Return your response as valid JSON matching this schema:
{schema}
Requirements:
- Extract all entities of types: PERSON, ORG, LOCATION, DATE
- Provide confidence scores based on context clarity
- Include a brief summary of the text
JSON Output:
"""
# Usage with validation
async def extract_entities(llm: LLMEngine, text: str) -> ExtractionResult:
prompt = create_structured_extraction_prompt(text)
response = await llm.generate([prompt], temperature=0.1)
# Parse and validate
try:
result = ExtractionResult.parse_raw(response[0])
return result
except Exception as e:
# Retry with clarification
retry_prompt = f"{prompt}\n\nPrevious response was invalid: {e}\nPlease provide valid JSON."
response = await llm.generate([retry_prompt], temperature=0.1)
return ExtractionResult.parse_raw(response[0])
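A quick usage sketch (the sample text is made up, and my_llm is assumed as before):

text = "Ada Lovelace joined Acme Corp in London on March 12, 2024."
result = await extract_entities(my_llm, text)
for entity in result.entities:
    print(entity.type, entity.text, entity.confidence)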
ReAct Pattern (Reasoning + Acting)
from typing import Callable, Dict

class ReActAgent:
"""ReAct pattern for tool-using LLM agents"""
def __init__(self, llm: LLMEngine, tools: Dict[str, Callable]):
self.llm = llm
self.tools = tools
async def run(self, task: str, max_iterations: int = 5) -> str:
"""Execute task using ReAct pattern"""
conversation_history = []
conversation_history.append(self._create_system_prompt())
conversation_history.append(f"Task: {task}")
for iteration in range(max_iterations):
# Generate thought and action
prompt = self._build_react_prompt(conversation_history)
response = await self.llm.generate([prompt])
# Parse response
thought, action, action_input = self._parse_response(response[0])
conversation_history.append(
f"Thought: {thought}\nAction: {action}\nAction Input: {action_input}"
)
# Execute action
if action == "Final Answer":
return action_input
if action in self.tools:
observation = await self.tools[action](action_input)
conversation_history.append(f"Observation: {observation}")
else:
conversation_history.append(f"Observation: Unknown action '{action}'")
return "Max iterations reached without final answer"
def _create_system_prompt(self) -> str:
tool_descriptions = "\n".join(
f"- {name}: {func.__doc__}" for name, func in self.tools.items()
)
return f"""
You have access to the following tools:
{tool_descriptions}
Use this format:
Thought: [your reasoning about what to do next]
Action: [tool name or "Final Answer"]
Action Input: [input for the tool]
After each action, you'll receive an Observation. Continue until you can provide a Final Answer.
"""
def _parse_response(self, response: str) -> tuple[str, str, str]:
"""Parse Thought/Action/Action Input from response"""
lines = response.strip().split('\n')
thought = action = action_input = ""
for line in lines:
if line.startswith("Thought:"):
thought = line.replace("Thought:", "").strip()
elif line.startswith("Action:"):
action = line.replace("Action:", "").strip()
elif line.startswith("Action Input:"):
action_input = line.replace("Action Input:", "").strip()
return thought, action, action_input
# Example usage
async def search_tool(query: str) -> str:
"""Search the web for information"""
# Implement actual search
return f"Search results for: {query}"
async def calculator_tool(expression: str) -> str:
    """Evaluate mathematical expressions"""
    # Caution: eval() on untrusted input is unsafe; restrict it to arithmetic
    # (or use a proper expression parser) in production.
    try:
        result = eval(expression, {"__builtins__": {}}, {})
        return str(result)
    except Exception:
        return "Invalid expression"
agent = ReActAgent(
llm=my_llm,
tools={
"search": search_tool,
"calculator": calculator_tool,
}
)
result = await agent.run("What is the population of Tokyo times 2?")
Prompt Evaluation Framework
from typing import List, Dict, Callable
import asyncio
class PromptEvaluator:
"""Evaluate prompt performance systematically"""
def __init__(self, llm: LLMEngine):
self.llm = llm
async def evaluate(
self,
prompt_template: str,
test_cases: List[Dict],
metrics: List[Callable]
) -> Dict:
"""
Evaluate prompt across test cases and metrics.
test_cases: [{"input": ..., "expected": ...}, ...]
metrics: [accuracy_metric, relevance_metric, ...]
"""
results = []
for test_case in test_cases:
# Render prompt
prompt = prompt_template.format(**test_case["input"])
# Generate response
response = await self.llm.generate([prompt])
# Evaluate against metrics
scores = {}
for metric in metrics:
score = metric(response[0], test_case["expected"])
scores[metric.__name__] = score
results.append({
"input": test_case["input"],
"expected": test_case["expected"],
"actual": response[0],
"scores": scores,
})
# Aggregate results
summary = self._aggregate_results(results, metrics)
return {
"results": results,
"summary": summary,
}
def _aggregate_results(
self,
results: List[Dict],
metrics: List[Callable]
) -> Dict:
"""Aggregate metric scores across all test cases"""
summary = {}
for metric in metrics:
scores = [r["scores"][metric.__name__] for r in results]
summary[metric.__name__] = {
"mean": sum(scores) / len(scores),
"min": min(scores),
"max": max(scores),
}
return summary
# Example metrics
def exact_match_metric(actual: str, expected: str) -> float:
"""Binary exact match"""
return 1.0 if actual.strip() == expected.strip() else 0.0
def semantic_similarity_metric(actual: str, expected: str) -> float:
    """Semantic similarity using embeddings"""
    # For repeated evaluation, load the SentenceTransformer once and reuse it
    model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode([actual, expected])
similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
return float(similarity)
def contains_keywords_metric(keywords: List[str]):
"""Check if response contains required keywords"""
def metric(actual: str, expected: str) -> float:
actual_lower = actual.lower()
found = sum(1 for kw in keywords if kw.lower() in actual_lower)
return found / len(keywords)
return metric
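Unlike the other two metrics, this one is a factory: bind it to task-specific keywords first (the keywords below are illustrative):

keyword_metric = contains_keywords_metric(["refund", "shipped"])
print(keyword_metric("We issued a refund and re-shipped the order.", ""))  # 1.0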
# Usage
evaluator = PromptEvaluator(my_llm)
test_cases = [
{
"input": {"text": "The product is amazing!"},
"expected": "Positive"
},
{
"input": {"text": "Worst purchase ever."},
"expected": "Negative"
},
]
results = await evaluator.evaluate(
prompt_template=sentiment_prompt.render("{text}"),
test_cases=test_cases,
metrics=[exact_match_metric, semantic_similarity_metric]
)
print(f"Average exact match: {results['summary']['exact_match_metric']['mean']:.2%}")
Advanced: Automatic Prompt Optimization
class PromptOptimizer:
"""Automatically optimize prompts using feedback"""
def __init__(self, llm: LLMEngine, evaluator: PromptEvaluator):
self.llm = llm
self.evaluator = evaluator
async def optimize(
self,
initial_prompt: str,
test_cases: List[Dict],
metrics: List[Callable],
iterations: int = 5
) -> str:
"""Iteratively improve prompt based on test case performance"""
        best_prompt = initial_prompt
        best_score = 0.0
        current_prompt = initial_prompt
        for i in range(iterations):
            # Evaluate the current candidate prompt
            results = await self.evaluator.evaluate(
                current_prompt,
                test_cases,
                metrics
            )
            current_score = results["summary"][metrics[0].__name__]["mean"]
            print(f"Iteration {i+1}: score={current_score:.2%}")
            # Keep the best-scoring prompt seen so far
            if current_score > best_score:
                best_score = current_score
                best_prompt = current_prompt
            # Generate an improved candidate from the failure cases
            feedback = self._generate_feedback(results)
            current_prompt = await self._improve_prompt(current_prompt, feedback)
        return best_prompt
def _generate_feedback(self, results: Dict) -> str:
"""Generate feedback from failed test cases"""
        # A case counts as a failure if its first metric score is below 0.5
        failures = [
            r for r in results["results"]
            if next(iter(r["scores"].values())) < 0.5
        ]
feedback_parts = ["The following test cases failed:"]
for f in failures[:3]: # Limit to avoid context overflow
feedback_parts.append(
f"Input: {f['input']}\nExpected: {f['expected']}\nActual: {f['actual']}"
)
return "\n\n".join(feedback_parts)
async def _improve_prompt(self, current_prompt: str, feedback: str) -> str:
"""Use LLM to improve prompt based on feedback"""
meta_prompt = f"""
You are an expert at prompt engineering. Improve the following prompt to better handle the failure cases.
Current Prompt:
{current_prompt}
Failure Analysis:
{feedback}
Provide an improved version of the prompt that addresses these failures while maintaining the original intent.
Improved Prompt:
"""
improved = await self.llm.generate([meta_prompt], temperature=0.7)
return improved[0]
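Putting it together with the evaluator and test cases defined earlier, an optimization run might look like this sketch (reusing my_llm and the sentiment test cases):

optimizer = PromptOptimizer(llm=my_llm, evaluator=evaluator)
optimized_prompt = await optimizer.optimize(
    initial_prompt=sentiment_prompt.render("{text}"),
    test_cases=test_cases,
    metrics=[exact_match_metric],
    iterations=3,
)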
Conclusion
Effective prompt engineering requires:
- Clear Structure - System, task, context, examples, constraints, format
- Chain-of-Thought - Guide reasoning for complex tasks
- Few-Shot Learning - Select relevant examples dynamically
- Structured Output - Use JSON schemas for parsing
- Tool Integration - ReAct pattern for agentic behavior
- Systematic Evaluation - Test cases, metrics, optimization
Prompt engineering rewards the same discipline as any other engineering work. Treat prompts as code: version-control them, test them, and optimize them based on data, not intuition.