Prompt engineering has evolved from an art to an engineering discipline with established best practices, patterns, and evaluation frameworks. This post covers proven techniques for building reliable LLM-powered applications through effective prompt design.

The Anatomy of an Effective Prompt

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class PromptTemplate:
    """Structured prompt with clear components"""

    system_message: str  # Set context and behavior
    task_description: str  # What to do
    context: Optional[str] = None  # Background information
    examples: Optional[List[tuple[str, str]]] = None  # Few-shot examples
    constraints: Optional[List[str]] = None  # Rules to follow
    output_format: Optional[str] = None  # Expected format

    def render(self, input_text: str) -> str:
        """Render complete prompt"""
        parts = [self.system_message, self.task_description]

        if self.context:
            parts.append(f"Context:\n{self.context}")

        if self.examples:
            parts.append("Examples:")
            for input_ex, output_ex in self.examples:
                parts.append(f"Input: {input_ex}\nOutput: {output_ex}")

        if self.constraints:
            parts.append("Constraints:\n" + "\n".join(f"- {c}" for c in self.constraints))

        if self.output_format:
            parts.append(f"Output Format:\n{self.output_format}")

        parts.append(f"Input: {input_text}")

        return "\n\n".join(parts)

# Example: Sentiment analysis
sentiment_prompt = PromptTemplate(
    system_message="You are an expert at analyzing customer feedback sentiment.",
    task_description="Analyze the sentiment of the given customer review.",
    examples=[
        ("The product exceeded my expectations!", "Positive"),
        ("Terrible quality, waste of money.", "Negative"),
        ("It's okay, nothing special.", "Neutral"),
    ],
    constraints=[
        "Choose only from: Positive, Negative, Neutral",
        "Provide only the sentiment, no explanation",
    ],
    output_format="Single word: Positive, Negative, or Neutral"
)

Chain-of-Thought Prompting

Encourage step-by-step reasoning:

def create_cot_prompt(problem: str) -> str:
    """Chain-of-thought prompt for complex reasoning"""

    return f"""
Solve this problem step by step. Show your reasoning clearly.

Problem: {problem}

Let's approach this systematically:

Step 1: Understand what we're being asked
[Think through the problem]

Step 2: Identify the relevant information
[List the key facts]

Step 3: Determine the approach
[Explain the method]

Step 4: Execute the solution
[Show the work]

Step 5: Verify the answer
[Check if it makes sense]

Final Answer: [Provide the solution]
"""

# Example usage
problem = "A bakery sells 150 cookies in the morning and 80% more in the afternoon. How many cookies were sold in total?"

# The CoT prompting will guide the model to:
# 1. Identify morning sales: 150
# 2. Calculate 80% of 150: 120
# 3. Calculate afternoon sales: 150 + 120 = 270
# 4. Calculate total: 150 + 270 = 420
# 5. Verify: Does 270 equal 150 + (0.8 × 150)? Yes.
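To run the prompt, pass it to the same hypothetical generation interface used throughout this post (my_llm stands in for your client; a low temperature keeps the reasoning focused):

cot_prompt = create_cot_prompt(problem)
responses = await my_llm.generate([cot_prompt], temperature=0.2)
print(responses[0])  # should end with a line like "Final Answer: 420"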

Self-Consistency and Multiple Sampling

Generate multiple responses and select the most consistent:

from collections import Counter
from typing import List

async def self_consistency_prompting(
    llm: LLMEngine,
    prompt: str,
    num_samples: int = 5,
    temperature: float = 0.8
) -> tuple[str, float]:
    """
    Generate multiple responses and return most common answer.
    Returns (answer, confidence)
    """

    # Generate multiple independent samples
    responses = await llm.generate(
        [prompt] * num_samples,
        temperature=temperature
    )

    # Extract answers (looks for a "Final Answer:" line, falling back to the last line)
    answers = [extract_final_answer(r) for r in responses]

    # Find most common answer
    answer_counts = Counter(answers)
    most_common_answer, count = answer_counts.most_common(1)[0]

    # Confidence is frequency of most common answer
    confidence = count / num_samples

    return most_common_answer, confidence

def extract_final_answer(response: str) -> str:
    """Extract final answer from response"""
    lines = response.strip().split('\n')
    for line in reversed(lines):
        if line.startswith("Final Answer:"):
            return line.replace("Final Answer:", "").strip()
    return lines[-1].strip()
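An illustrative call that combines self-consistency with the chain-of-thought prompt above (my_llm is the same placeholder engine):

answer, confidence = await self_consistency_prompting(
    my_llm,
    create_cot_prompt(problem),
    num_samples=5,
)
print(f"Answer: {answer} (agreement: {confidence:.0%})")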

Few-Shot Learning Patterns

from typing import List

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class FewShotPromptBuilder:
    """Build few-shot prompts with dynamic example selection"""

    def __init__(self, example_pool: List[tuple[str, str]]):
        self.example_pool = example_pool
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Pre-compute example embeddings
        example_texts = [ex[0] for ex in example_pool]
        self.example_embeddings = self.embedding_model.encode(example_texts)

    def build_prompt(
        self,
        input_text: str,
        num_examples: int = 3,
        selection_method: str = "semantic"
    ) -> str:
        """Build prompt with relevant examples"""

        if selection_method == "semantic":
            examples = self._select_semantic_examples(input_text, num_examples)
        elif selection_method == "diverse":
            examples = self._select_diverse_examples(num_examples)
        else:
            examples = self.example_pool[:num_examples]

        # Build prompt
        prompt_parts = ["Given input-output pairs, respond to the new input:\n"]

        for inp, out in examples:
            prompt_parts.append(f"Input: {inp}\nOutput: {out}\n")

        prompt_parts.append(f"Input: {input_text}\nOutput:")

        return "\n".join(prompt_parts)

    def _select_semantic_examples(
        self,
        input_text: str,
        num_examples: int
    ) -> List[tuple[str, str]]:
        """Select most semantically similar examples"""
        input_embedding = self.embedding_model.encode([input_text])[0]

        # Compute similarities
        similarities = cosine_similarity(
            [input_embedding],
            self.example_embeddings
        )[0]

        # Get top k
        top_indices = similarities.argsort()[-num_examples:][::-1]

        return [self.example_pool[i] for i in top_indices]

    def _select_diverse_examples(
        self,
        num_examples: int
    ) -> List[tuple[str, str]]:
        """Select diverse examples to cover different patterns"""
        # Use clustering or maximum margin selection
        # Simplified: just sample randomly
        import random
        return random.sample(self.example_pool, num_examples)
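A quick sketch of the builder in use, with a small made-up example pool (semantic selection requires the sentence-transformers package):

builder = FewShotPromptBuilder(example_pool=[
    ("The product exceeded my expectations!", "Positive"),
    ("Terrible quality, waste of money.", "Negative"),
    ("It's okay, nothing special.", "Neutral"),
    ("Arrived quickly and works as described.", "Positive"),
])

prompt = builder.build_prompt(
    "Amazing value for the price!",
    num_examples=2,
    selection_method="semantic",
)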

Structured Output with JSON

import json
from pydantic import BaseModel, Field
from typing import List

class ExtractedEntity(BaseModel):
    text: str = Field(description="The entity text")
    type: str = Field(description="Entity type: PERSON, ORG, LOCATION, DATE")
    confidence: float = Field(description="Confidence score 0-1")

class ExtractionResult(BaseModel):
    entities: List[ExtractedEntity]
    summary: str

def create_structured_extraction_prompt(text: str) -> str:
    """Prompt for structured entity extraction"""

    # Pydantic v1 API; in Pydantic v2, use json.dumps(ExtractionResult.model_json_schema(), indent=2)
    schema = ExtractionResult.schema_json(indent=2)

    return f"""
Extract named entities from the following text and return a JSON object.

Text: {text}

Return your response as valid JSON matching this schema:
{schema}

Requirements:
- Extract all entities of types: PERSON, ORG, LOCATION, DATE
- Provide confidence scores based on context clarity
- Include a brief summary of the text

JSON Output:
"""

# Usage with validation
async def extract_entities(llm: LLMEngine, text: str) -> ExtractionResult:
    prompt = create_structured_extraction_prompt(text)
    response = await llm.generate([prompt], temperature=0.1)

    # Parse and validate
    try:
        result = ExtractionResult.parse_raw(response[0])
        return result
    except Exception as e:
        # Retry with clarification
        retry_prompt = f"{prompt}\n\nPrevious response was invalid: {e}\nPlease provide valid JSON."
        response = await llm.generate([retry_prompt], temperature=0.1)
        return ExtractionResult.parse_raw(response[0])
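Illustrative usage with a made-up sentence (my_llm is the placeholder engine used elsewhere in this post):

result = await extract_entities(
    my_llm,
    "Acme Corp opened a new office in Berlin on March 3, 2024.",
)
for entity in result.entities:
    print(entity.type, entity.text, entity.confidence)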

ReAct Pattern (Reasoning + Acting)

from typing import Callable, Dict, List

class ReActAgent:
    """ReAct pattern for tool-using LLM agents"""

    def __init__(self, llm: LLMEngine, tools: Dict[str, Callable]):
        self.llm = llm
        self.tools = tools

    async def run(self, task: str, max_iterations: int = 5) -> str:
        """Execute task using ReAct pattern"""

        conversation_history = []
        conversation_history.append(self._create_system_prompt())
        conversation_history.append(f"Task: {task}")

        for iteration in range(max_iterations):
            # Generate thought and action
            prompt = self._build_react_prompt(conversation_history)
            response = await self.llm.generate([prompt])

            # Parse response
            thought, action, action_input = self._parse_response(response[0])

            conversation_history.append(
                f"Thought: {thought}\nAction: {action}\nAction Input: {action_input}"
            )

            # Execute action
            if action == "Final Answer":
                return action_input

            if action in self.tools:
                observation = await self.tools[action](action_input)
                conversation_history.append(f"Observation: {observation}")
            else:
                conversation_history.append(f"Observation: Unknown action '{action}'")

        return "Max iterations reached without final answer"

    def _build_react_prompt(self, conversation_history: List[str]) -> str:
        """Assemble the prompt: system message, task, and prior thought/action/observation steps"""
        return "\n\n".join(conversation_history)

    def _create_system_prompt(self) -> str:
        tool_descriptions = "\n".join(
            f"- {name}: {func.__doc__}" for name, func in self.tools.items()
        )

        return f"""
You have access to the following tools:
{tool_descriptions}

Use this format:
Thought: [your reasoning about what to do next]
Action: [tool name or "Final Answer"]
Action Input: [input for the tool]

After each action, you'll receive an Observation. Continue until you can provide a Final Answer.
"""

    def _parse_response(self, response: str) -> tuple[str, str, str]:
        """Parse Thought/Action/Action Input from response"""
        lines = response.strip().split('\n')
        thought = action = action_input = ""

        for line in lines:
            if line.startswith("Thought:"):
                thought = line.replace("Thought:", "").strip()
            elif line.startswith("Action:"):
                action = line.replace("Action:", "").strip()
            elif line.startswith("Action Input:"):
                action_input = line.replace("Action Input:", "").strip()

        return thought, action, action_input

# Example usage
async def search_tool(query: str) -> str:
    """Search the web for information"""
    # Implement actual search
    return f"Search results for: {query}"

async def calculator_tool(expression: str) -> str:
    """Evaluate mathematical expressions"""
    # Note: eval() is unsafe on untrusted input; use a proper expression parser in production.
    try:
        result = eval(expression)
        return str(result)
    except Exception:
        return "Invalid expression"

agent = ReActAgent(
    llm=my_llm,
    tools={
        "search": search_tool,
        "calculator": calculator_tool,
    }
)

result = await agent.run("What is the population of Tokyo times 2?")

Prompt Evaluation Framework

from typing import List, Dict, Callable
import asyncio

class PromptEvaluator:
    """Evaluate prompt performance systematically"""

    def __init__(self, llm: LLMEngine):
        self.llm = llm

    async def evaluate(
        self,
        prompt_template: str,
        test_cases: List[Dict],
        metrics: List[Callable]
    ) -> Dict:
        """
        Evaluate prompt across test cases and metrics.

        test_cases: [{"input": ..., "expected": ...}, ...]
        metrics: [accuracy_metric, relevance_metric, ...]
        """

        results = []

        for test_case in test_cases:
            # Render prompt
            prompt = prompt_template.format(**test_case["input"])

            # Generate response
            response = await self.llm.generate([prompt])

            # Evaluate against metrics
            scores = {}
            for metric in metrics:
                score = metric(response[0], test_case["expected"])
                scores[metric.__name__] = score

            results.append({
                "input": test_case["input"],
                "expected": test_case["expected"],
                "actual": response[0],
                "scores": scores,
            })

        # Aggregate results
        summary = self._aggregate_results(results, metrics)

        return {
            "results": results,
            "summary": summary,
        }

    def _aggregate_results(
        self,
        results: List[Dict],
        metrics: List[Callable]
    ) -> Dict:
        """Aggregate metric scores across all test cases"""

        summary = {}

        for metric in metrics:
            scores = [r["scores"][metric.__name__] for r in results]
            summary[metric.__name__] = {
                "mean": sum(scores) / len(scores),
                "min": min(scores),
                "max": max(scores),
            }

        return summary

# Example metrics
def exact_match_metric(actual: str, expected: str) -> float:
    """Binary exact match"""
    return 1.0 if actual.strip() == expected.strip() else 0.0

def semantic_similarity_metric(actual: str, expected: str) -> float:
    """Semantic similarity using embeddings (SentenceTransformer / cosine_similarity imported earlier)"""
    # Loading the model on every call is slow; cache it at module level in real use.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([actual, expected])
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return float(similarity)

def contains_keywords_metric(keywords: List[str]):
    """Check if response contains required keywords"""
    def metric(actual: str, expected: str) -> float:
        actual_lower = actual.lower()
        found = sum(1 for kw in keywords if kw.lower() in actual_lower)
        return found / len(keywords)
    metric.__name__ = "contains_keywords_metric"  # stable key for result aggregation
    return metric
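Because contains_keywords_metric is a factory, instantiate it before passing it to the evaluator, for example:

required_terms_metric = contains_keywords_metric(["positive", "negative", "neutral"])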

# Usage
evaluator = PromptEvaluator(my_llm)

test_cases = [
    {
        "input": {"text": "The product is amazing!"},
        "expected": "Positive"
    },
    {
        "input": {"text": "Worst purchase ever."},
        "expected": "Negative"
    },
]

# Render the template once with a {text} placeholder; evaluate() fills it in per test case
results = await evaluator.evaluate(
    prompt_template=sentiment_prompt.render("{text}"),
    test_cases=test_cases,
    metrics=[exact_match_metric, semantic_similarity_metric]
)

print(f"Average exact match: {results['summary']['exact_match_metric']['mean']:.2%}")

Advanced: Automatic Prompt Optimization

class PromptOptimizer:
    """Automatically optimize prompts using feedback"""

    def __init__(self, llm: LLMEngine, evaluator: PromptEvaluator):
        self.llm = llm
        self.evaluator = evaluator

    async def optimize(
        self,
        initial_prompt: str,
        test_cases: List[Dict],
        metrics: List[Callable],
        iterations: int = 5
    ) -> str:
        """Iteratively improve prompt based on test case performance"""

        best_prompt = initial_prompt
        best_score = float("-inf")
        current_prompt = initial_prompt

        for i in range(iterations):
            # Evaluate the current candidate prompt
            results = await self.evaluator.evaluate(
                current_prompt,
                test_cases,
                metrics
            )

            current_score = results["summary"][metrics[0].__name__]["mean"]
            print(f"Iteration {i + 1}: score={current_score:.2%}")

            # Keep the best-scoring prompt seen so far
            if current_score > best_score:
                best_score = current_score
                best_prompt = current_prompt

            # Generate the next candidate from the best prompt and the failure feedback
            feedback = self._generate_feedback(results)
            current_prompt = await self._improve_prompt(best_prompt, feedback)

        return best_prompt

    def _generate_feedback(self, results: Dict) -> str:
        """Generate feedback from failed test cases"""
        failures = [
            r for r in results["results"]
            if min(r["scores"].values()) < 0.5
        ]
        if not failures:
            return "All test cases passed."

        feedback_parts = ["The following test cases failed:"]
        for f in failures[:3]:  # Limit to avoid context overflow
            feedback_parts.append(
                f"Input: {f['input']}\nExpected: {f['expected']}\nActual: {f['actual']}"
            )

        return "\n\n".join(feedback_parts)

    async def _improve_prompt(self, current_prompt: str, feedback: str) -> str:
        """Use LLM to improve prompt based on feedback"""

        meta_prompt = f"""
You are an expert at prompt engineering. Improve the following prompt to better handle the failure cases.

Current Prompt:
{current_prompt}

Failure Analysis:
{feedback}

Provide an improved version of the prompt that addresses these failures while maintaining the original intent.

Improved Prompt:
"""

        improved = await self.llm.generate([meta_prompt], temperature=0.7)
        return improved[0]
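An illustrative optimization run, reusing the evaluator and test cases from the previous section:

optimizer = PromptOptimizer(my_llm, evaluator)

optimized_prompt = await optimizer.optimize(
    initial_prompt="Classify the sentiment of this review as Positive, Negative, or Neutral: {text}",
    test_cases=test_cases,
    metrics=[exact_match_metric],
    iterations=3,
)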

Conclusion

Effective prompt engineering requires:

  1. Clear Structure - System, task, context, examples, constraints, format
  2. Chain-of-Thought - Guide reasoning for complex tasks
  3. Few-Shot Learning - Select relevant examples dynamically
  4. Structured Output - Use JSON schemas for parsing
  5. Tool Integration - ReAct pattern for agentic behavior
  6. Systematic Evaluation - Test cases, metrics, optimization

Prompt engineering is evolving from art to science. Treat prompts as code: version control them, test them, and optimize them based on data, not intuition.