April 2026

Classifying Request Complexity to Route to the Right LLM in Python

The most reliable way to cut LLM costs is to match model capability to task requirement. You do not need a smaller model for everything. You need the right model for each task. The challenge is that "right model" is a classification problem, and most teams solve it once (badly) at deploy time and never revisit it.

This post builds a complete complexity classification system in Python, from fast heuristics to a two-stage classifier, and shows how to replace the whole thing with outcome-based routing in Kalibr.

Stage 1: Heuristic Classification (No Extra API Call)

The fastest approach to classify request complexity is to inspect the request itself. No API calls, no latency overhead.

from __future__ import annotations

import re
from enum import Enum


class Complexity(str, Enum):
    """Coarse complexity buckets used to pick a model tier."""

    SIMPLE = "simple"
    MEDIUM = "medium"
    COMPLEX = "complex"


# Signals that indicate a simple, bounded task (matched case-insensitively
# against the lowercased prompt; \b keeps matches on whole words).
SIMPLE_INDICATORS = [
    r"\bclassify\b",
    r"\blabel\b",
    r"\bextract\b",
    r"\byes or no\b",
    r"\btrue or false\b",
    r"\bsentiment\b",
    r"\bspam or not\b",
    r"\bpositive or negative\b",
    r"\bwhat is the\b",
    r"\bidentify\b",
]

# Signals that indicate a complex, generative task
COMPLEX_INDICATORS = [
    r"\banalyze\b",
    r"\bsynthesize\b",
    r"\bcompare and contrast\b",
    r"\bwrite a\b",
    r"\bgenerate\b",
    r"\bexplain why\b",
    r"\bwhat are the tradeoffs\b",
    r"\bpros and cons\b",
    r"\bdesign\b",
    r"\bstrategy\b",
    r"\bimplications\b",
]


def classify_complexity_heuristic(
    prompt: str,
    task_type: str | None = None,
    max_simple_tokens: int = 150,
) -> Complexity:
    """Classify request complexity without any API call.

    Args:
        prompt: The user prompt to classify.
        task_type: Optional caller-supplied category. Recognized categories
            short-circuit the keyword scoring entirely.
        max_simple_tokens: Length ceiling for a SIMPLE verdict. NOTE: despite
            the name, this is compared against the whitespace-separated word
            count, not a tokenizer's token count.

    Returns:
        The estimated Complexity bucket.
    """
    # Hard overrides based on task type -- checked FIRST so that the regex
    # scans below are skipped when the caller already knows the category.
    if task_type in ("classification", "extraction", "label", "entity"):
        return Complexity.SIMPLE
    if task_type in ("synthesis", "analysis", "generation", "essay"):
        return Complexity.COMPLEX

    prompt_lower = prompt.lower()
    word_count = len(prompt.split())

    simple_hits = sum(1 for pattern in SIMPLE_INDICATORS if re.search(pattern, prompt_lower))
    complex_hits = sum(1 for pattern in COMPLEX_INDICATORS if re.search(pattern, prompt_lower))

    # Score-based: short prompts with only "simple" signals stay SIMPLE;
    # two or more complex signals, or a very long prompt, escalate.
    if word_count <= max_simple_tokens and simple_hits >= 1 and complex_hits == 0:
        return Complexity.SIMPLE
    if complex_hits >= 2 or word_count > 500:
        return Complexity.COMPLEX

    return Complexity.MEDIUM


# Static complexity -> model mapping. MEDIUM deliberately shares the cheap
# model with SIMPLE; promote it to gpt-4o only after observing outcomes.
MODEL_FOR_COMPLEXITY = {
    Complexity.SIMPLE: "gpt-4o-mini",
    Complexity.MEDIUM: "gpt-4o-mini",  # Default medium to mini; observe outcomes
    Complexity.COMPLEX: "gpt-4o",
}

Usage:

from openai import OpenAI

client = OpenAI()


def route_and_complete(prompt: str, task_type: str | None = None) -> str:
    """Classify the prompt's complexity, then complete it with the matching model.

    Returns the assistant message content from the routed completion.
    """
    bucket = classify_complexity_heuristic(prompt, task_type=task_type)
    chosen_model = MODEL_FOR_COMPLEXITY[bucket]

    completion = client.chat.completions.create(
        model=chosen_model,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content


# Simple, routes to gpt-4o-mini: the task_type override short-circuits scoring
result = route_and_complete(
    "Extract the invoice number from: 'Please find attached Invoice #INV-2024-0042'",
    task_type="extraction"
)

# Complex, routes to gpt-4o: "analyze" and "implications" both hit COMPLEX_INDICATORS
result2 = route_and_complete(
    "Analyze the long-term implications of widespread LLM adoption on software engineering careers, "
    "including effects on hiring, skill requirements, and team composition."
)

Zero latency overhead. Works as a first pass. The weakness: heuristics encode your priors, not your actual quality data.

Stage 2: Lightweight Classifier (Small Model Judges Before Large Model Executes)

For higher accuracy, use a cheap model to classify complexity before deciding whether to use the expensive model.

import json
from openai import OpenAI

client = OpenAI()

# System prompt for the complexity classifier. The three label definitions had
# been collapsed onto a single run-on bullet; restored as one bullet per label
# so the classifier sees a clean rubric.
CLASSIFIER_SYSTEM = """You are a task complexity classifier for LLM routing decisions.

Classify the user's prompt as one of:
- SIMPLE: factual lookup, extraction, classification, yes/no, short answer, verifiable output
- MEDIUM: moderate synthesis, summarization, structured generation with clear constraints
- COMPLEX: deep analysis, multi-step reasoning, creative generation, comparison of many concepts, long-form writing

Respond with JSON only. Format: {"complexity": "SIMPLE"|"MEDIUM"|"COMPLEX", "confidence": 0.0-1.0, "reason": "brief"}"""


def classify_with_llm(prompt: str) -> dict:
    """Ask gpt-4o-mini to judge the prompt's complexity.

    Returns the parsed JSON verdict ({"complexity", "confidence", "reason"}).
    Each call costs on the order of $0.0001 at gpt-4o-mini rates.
    """
    truncated = prompt[:1000]  # Cap classifier input
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": CLASSIFIER_SYSTEM},
            {"role": "user", "content": truncated},
        ],
        response_format={"type": "json_object"},
        max_tokens=80,
        temperature=0.0,
    )
    return json.loads(result.choices[0].message.content)


def two_stage_completion(prompt: str) -> dict:
    """Classify complexity with a cheap model, then route to the right one.

    Returns a dict with the completion text, the model that served it, and
    the raw classifier verdict.
    """
    verdict = classify_with_llm(prompt)
    label = verdict["complexity"]
    confidence = verdict.get("confidence", 0.5)

    # Only a confident COMPLEX verdict escalates to gpt-4o; everything else
    # (confident SIMPLE, MEDIUM, or low confidence) defaults to the cheaper
    # model and accepts occasional quality misses.
    model = "gpt-4o-mini"
    if label == "COMPLEX" and confidence >= 0.7:
        model = "gpt-4o"

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return {
        "content": completion.choices[0].message.content,
        "model_used": model,
        "classification": verdict,
    }

The Economics of the Two-Stage Classifier

The classifier call adds cost. Whether it saves money depends on your traffic mix:

# Classifier call: ~100 input tokens + ~30 output tokens at gpt-4o-mini rates
# ($0.15 / 1M input, $0.60 / 1M output).
classifier_cost_per_call = (100 * 0.15 + 30 * 0.60) / 1_000_000
print(f"Classifier cost: ${classifier_cost_per_call:.6f}")  # $0.000033

# Savings per correctly routed request (sending to mini instead of gpt-4o)
# Assume 500 token request, 100 token response
gpt4o_cost = (500 * 2.50 + 100 * 10.00) / 1_000_000
mini_cost = (500 * 0.15 + 100 * 0.60) / 1_000_000
# BUG FIX: this previously built a tuple (gpt4o_cost, mini_cost), which then
# crashed the :.6f format below. Savings is the *difference* of the two costs.
savings_per_routed_request = gpt4o_cost - mini_cost
print(f"Savings per routed request: ${savings_per_routed_request:.6f}")  # ~$0.002115

# Break-even: what misclassification rate makes the classifier not worth it?
# classifier_cost = savings * (1 - misclassification_rate)
break_even_rate = 1 - (classifier_cost_per_call / savings_per_routed_request)
print(f"Break-even: classifier pays off if misclassification rate < {break_even_rate * 100:.1f}%")
# Break-even: classifier pays off if misclassification rate < 98.4%

# More directly: classifier cost / savings per request = fraction that must be correctly routed
min_savings_fraction = classifier_cost_per_call / savings_per_routed_request
print(f"Classifier pays off if it routes >= {min_savings_fraction * 100:.1f}% of traffic correctly")
# 1.6% - if even 1.6% of requests are correctly sent to mini instead of gpt-4o, you break even

The math is generous. A classifier that is right just 1.6% of the time on routing decisions breaks even. In practice, a well-prompted classifier running on gpt-4o-mini gets complexity right 85-95% of the time. The savings multiply accordingly.

The remaining problem: the classifier does not know whether the routed response was actually good. It classifies based on the prompt, not the outcome.

Stage 3: Continuous Outcome Routing with Kalibr

Both previous approaches classify based on inputs. Neither learns from whether the response was good. Kalibr's goal-based routing closes this loop.

import kalibr  # Must be first import
from kalibr import Router, Outcome

# Two candidate paths with initial traffic weights; the router updates its
# routing from reported outcomes. NOTE(review): weight semantics (prior vs.
# fixed split) should be confirmed against the Kalibr documentation.
router = Router(
    paths=[
        {"model": "openai/gpt-4o-mini", "weight": 0.75},
        {"model": "openai/gpt-4o",      "weight": 0.25},
    ],
    goal_id="complexity_adaptive_routing"
)


def complete_with_outcome(
    prompt: str,
    validator: callable | None = None,
) -> dict:
    """Route a completion through the router and feed the outcome back.

    If a validator is supplied, it alone decides success; otherwise any
    response truncated before a natural stop counts as a failure.
    """
    response, request_id = router.completion(
        messages=[{"role": "user", "content": prompt}],
        return_request_id=True
    )

    content = response.choices[0].message.content

    # Decide success: an explicit validator wins, else flag truncation.
    if validator is not None:
        success = validator(content)
    else:
        success = response.choices[0].finish_reason == "stop"

    router.report_outcome(
        request_id=request_id,
        outcome=Outcome.SUCCESS if success else Outcome.FAILURE
    )

    return {
        "content": content,
        "request_id": request_id,
        "outcome": "success" if success else "failure",
    }


# Example with a structured output validator
def validate_json_schema(output: str, required_keys: list) -> bool:
    """Return True iff *output* parses as a JSON object with every required key.

    Args:
        output: Raw model output expected to be a JSON object.
        required_keys: Top-level keys that must all be present.

    Returns:
        True only for a JSON *object* containing all required keys.

    BUG FIX: the original applied `k in data` to whatever json.loads returned,
    so non-object JSON passed spuriously -- `'"name email"'` matched via
    substring search and `'["name"]'` via element membership. A dict check
    now rejects anything that is not a JSON object.
    """
    try:
        data = json.loads(output)
    except (json.JSONDecodeError, TypeError):
        return False
    return isinstance(data, dict) and all(k in data for k in required_keys)


# Extraction task: success means the response is a JSON object with all three keys
result = complete_with_outcome(
    'Extract from: "John Smith, john@example.com, +1-555-0123". Return JSON: {"name", "email", "phone"}',
    validator=lambda out: validate_json_schema(out, ["name", "email", "phone"])
)
print(result["content"])

The router does not need a separate classifier call. It routes probabilistically based on Thompson Sampling's learned beliefs about which model succeeds on your actual workload. The success condition is evaluated against real responses, not predicted from prompts.

Combining Approaches

For production systems, heuristic pre-filtering plus outcome-based routing is a strong combination:

from kalibr import Router, Outcome

# Use heuristics as a strong prior, then let outcomes update
router = Router(
    paths=[
        # Start with high mini weight since heuristics suggest ~70% of tasks are simple
        {"model": "openai/gpt-4o-mini", "weight": 0.70},
        {"model": "openai/gpt-4o",      "weight": 0.30},
    ],
    # Distinct goal_id so this router's learned state is separate from the
    # earlier "complexity_adaptive_routing" router.
    goal_id="hybrid_routing"
)


def smart_complete(prompt: str, task_type: str | None = None) -> str:
    """Heuristic pre-filter plus outcome-learning router.

    Clearly simple requests go straight to gpt-4o-mini with no router
    involvement; everything else is routed by the learning router, with a
    basic quality signal reported back.
    """
    # Cheap first pass: spot the obviously simple cases without any API call.
    bucket = classify_complexity_heuristic(prompt, task_type=task_type)
    if bucket == Complexity.SIMPLE:
        direct = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
        )
        return direct.choices[0].message.content

    # Ambiguous or complex: let the router pick and learn from the outcome.
    response, request_id = router.completion(
        messages=[{"role": "user", "content": prompt}],
        return_request_id=True
    )
    content = response.choices[0].message.content

    # Basic quality check: non-trivial length and not truncated.
    looks_good = response.choices[0].finish_reason == "stop" and len(content.strip()) > 10
    router.report_outcome(
        request_id=request_id,
        outcome=Outcome.SUCCESS if looks_good else Outcome.FAILURE
    )

    return content

The heuristics handle the obvious cases cheaply. The router learns from the ambiguous cases where capability actually varies.

Summary

Classifying request complexity then routing to the right model in Python is a three-layer problem:

  1. Heuristics: Fast, zero-cost, encode known rules. Good enough for well-defined task types.
  2. Classifier call: More accurate, costs ~$0.000033 per call, breaks even at roughly 1.6% correct routing. Good for heterogeneous traffic.
  3. Outcome-based routing (Kalibr): No separate classifier call, learns from real responses, self-corrects as traffic changes. The right default for production systems.

Start with heuristics. Graduate to outcome-based routing when your task mix is complex enough that hand-tuned rules become a liability.

Kalibr keeps complex AI agents running without human intervention.

Get started free