Skip to content

Write a custom detector

chatbot-auditor ships with seven detectors, but the seven-mode framework is not exhaustive. This tutorial walks through writing a new detector from scratch — in this case, one that flags conversations with abnormally long user wait times.

The scenario

Your customer service policy says the bot should acknowledge every user message within 5 seconds. In production, some sessions stall for 30+ seconds between "user asks" and "bot responds" — a failure mode not covered by the built-in detectors.

We'll build SlowResponseDetector.

Step 1: Subclass Detector

Every detector declares four class attributes plus a detect() method:

# slow_response.py
from __future__ import annotations

from typing import ClassVar

from chatbot_auditor import (
    Conversation,
    Detection,
    Detector,
    Evidence,
    FailureMode,
    Role,
    Severity,
)


class SlowResponseDetector(Detector):
    """Flag conversations where the bot was slow to answer a user message.

    A detection is emitted only when at least ``min_lag_count`` bot replies
    took longer than ``max_seconds``; otherwise ``detect`` returns an empty
    list.
    """

    name: ClassVar[str] = "slow_response"
    description: ClassVar[str] = (
        "Flags conversations where the bot took longer than a configurable "
        "threshold to respond to a user message."
    )
    # Reuse an existing failure mode, or add your own FailureMode value upstream.
    failure_mode: ClassVar[FailureMode] = FailureMode.SILENT_CHURN
    requires_llm: ClassVar[bool] = False

    def __init__(self, *, max_seconds: float = 5.0, min_lag_count: int = 1) -> None:
        """Configure the detector's thresholds.

        Args:
            max_seconds: Longest acceptable gap, in seconds, between a user
                message and the bot reply that follows it.
            min_lag_count: Minimum number of slow replies a conversation must
                contain before it is reported.

        Raises:
            ValueError: If ``max_seconds`` is not positive or
                ``min_lag_count`` is less than 1.
        """
        if max_seconds <= 0:
            raise ValueError("max_seconds must be positive")
        # A count below 1 would make detect() report conversations that have
        # no slow replies at all, so reject it up front.
        if min_lag_count < 1:
            raise ValueError("min_lag_count must be at least 1")
        self.max_seconds = max_seconds
        self.min_lag_count = min_lag_count

    def detect(self, conversation: Conversation) -> list[Detection]:
        """Return a single-item list describing the slow replies, or ``[]``."""
        laggy_indices = self._find_laggy_responses(conversation)
        if len(laggy_indices) < self.min_lag_count:
            return []
        return [self._build_detection(conversation, laggy_indices)]

Step 2: Implement the core logic

Keep the business logic in a private method so it's easy to unit test.

    def _find_laggy_responses(self, conversation: Conversation) -> list[int]:
        """Return the indices of bot messages that answered too slowly.

        A bot message qualifies when the message directly before it came from
        the user, both messages carry a timestamp, and the gap between the two
        is strictly greater than ``self.max_seconds``.
        """
        messages = conversation.messages
        slow_indices: list[int] = []
        # Walk consecutive (previous, current) pairs; `index` is the position
        # of the second element of each pair, so index 0 is never examined.
        for index, (question, reply) in enumerate(zip(messages, messages[1:]), start=1):
            if reply.role != Role.BOT or question.role != Role.USER:
                continue
            if reply.timestamp is None or question.timestamp is None:
                # Without both timestamps the delay cannot be measured.
                continue
            elapsed = reply.timestamp - question.timestamp
            if elapsed.total_seconds() > self.max_seconds:
                slow_indices.append(index)
        return slow_indices

Step 3: Build the detection

Detections carry structured information so reports and dashboards can render them consistently.

    def _build_detection(
        self, conversation: Conversation, laggy_indices: list[int]
    ) -> Detection:
        """Assemble the Detection that summarises every slow bot reply.

        Args:
            conversation: The conversation the slow replies belong to.
            laggy_indices: Positions in ``conversation.messages`` of the bot
                messages that exceeded the threshold.

        Returns:
            One Detection carrying an Evidence entry per slow reply, plus the
            reply count and threshold in ``metadata``.
        """
        count = len(laggy_indices)
        # Three or more slow replies in one conversation escalate the severity.
        severity = Severity.CRITICAL if count >= 3 else Severity.MEDIUM

        evidence = [
            Evidence(
                message_index=i,
                quote=_truncate(conversation.messages[i].content, 120),
                note="Bot took longer than threshold to respond",
            )
            for i in laggy_indices
        ]

        return Detection(
            conversation_id=conversation.id,
            detector=self.name,
            failure_mode=self.failure_mode,
            severity=severity,
            confidence=0.9,
            explanation=(
                # ":g" keeps fractional thresholds readable (0.5 -> "0.5s");
                # ".0f" rounded them to a whole number and misreported the
                # configured limit. Whole-number thresholds render unchanged.
                f"Bot took longer than {self.max_seconds:g}s to respond "
                f"{count} time(s)."
            ),
            evidence=evidence,
            recommended_action="alert",
            metadata={
                "laggy_response_count": count,
                "threshold_seconds": self.max_seconds,
            },
        )


def _truncate(text: str, n: int) -> str:
    return text if len(text) <= n else text[: n - 3] + "..."

Step 4: Write tests

Good detectors have good tests. Aim for high-confidence positive cases, confirmed non-detections, and edge cases.

# test_slow_response.py
from datetime import UTC, datetime, timedelta

from chatbot_auditor import Conversation, Message, Role
from slow_response import SlowResponseDetector


def _conv(timings: list[tuple[Role, float]]) -> Conversation:
    """Build a test conversation from ``(role, offset_seconds)`` pairs.

    Every message is stamped ``offset_seconds`` after a fixed UTC start time
    and given the content ``"msg<i>"``, where ``i`` is its position in
    *timings*.
    """
    start = datetime(2026, 4, 17, tzinfo=UTC)
    messages = []
    for position, (speaker, offset) in enumerate(timings):
        stamp = start + timedelta(seconds=offset)
        messages.append(
            Message(role=speaker, content=f"msg{position}", timestamp=stamp)
        )
    return Conversation(id="c1", messages=messages)


def test_slow_response_detected() -> None:
    """A 10-second reply against a 5-second threshold yields one detection."""
    conversation = _conv([(Role.USER, 0), (Role.BOT, 10)])
    results = SlowResponseDetector(max_seconds=5.0).detect(conversation)
    assert len(results) == 1
    first = results[0]
    assert first.metadata["laggy_response_count"] == 1


def test_fast_response_not_detected() -> None:
    """A 2-second reply stays under the 5-second threshold."""
    quick_exchange = _conv([(Role.USER, 0), (Role.BOT, 2)])
    results = SlowResponseDetector(max_seconds=5.0).detect(quick_exchange)
    assert results == []


def test_multiple_laggy_responses_raise_severity() -> None:
    """Three slow replies in one conversation are reported as critical."""
    # Reply delays are 15 s, 20 s and 25 s, all above the 5 s threshold.
    conv = _conv([
        (Role.USER, 0),
        (Role.BOT, 15),
        (Role.USER, 20),
        (Role.BOT, 40),
        (Role.USER, 45),
        (Role.BOT, 70),
    ])
    detections = SlowResponseDetector(max_seconds=5.0).detect(conv)
    # Check the count first so a missing detection fails as an assertion
    # rather than an IndexError on the next line.
    assert len(detections) == 1
    assert detections[0].severity.value == "critical"

Step 5: Register it

Use the new detector alongside the built-ins via a custom registry:

from chatbot_auditor import audit, default_registry
from slow_response import SlowResponseDetector

# Start from the built-in detectors and add the custom one.
registry = default_registry()
slow_detector = SlowResponseDetector(max_seconds=5.0)
registry.register(slow_detector)

# NOTE: `conversations` is not defined in this snippet; supply your own
# (presumably an iterable of Conversation objects — confirm against audit()).
for detection in audit(conversations, detectors=registry):
    print(detection.detector, detection.severity, detection.explanation)

Patterns worth following

  • Validate configuration in __init__. Fail loudly on bad thresholds.
  • Return an empty list when nothing is wrong. Most conversations aren't failures — don't over-flag.
  • Set confidence thoughtfully. Values below 0.7 mean "review recommended"; higher values mean "act on this."
  • Populate evidence. Dashboards, reports, and human reviewers depend on it.
  • Use metadata for machine-readable details. Downstream systems can pivot on raw numbers rather than parsing the explanation string.
  • Keep detectors independent. Don't have one detector rely on another's output — run them in parallel and let the registry collect the results.