# maestro/bench/tasks/reflection-smoke.yaml

# reflection-smoke.yaml
#
# Smoke test for the reflection / Hermes-mode system.
#
# DESIGN NOTE — why this is a single-step task
# ─────────────────────────────────────────────
# The ideal reflection bench is a two-run sequence:
#   Run 1: submit a task + negative feedback → reflection fires → memory
#           entry "feedback_user_prefers_terse_output" is written.
#   Run 2: submit a second task → the reflection-produced memory entry
#           appears in the system prompt → response is demonstrably terse.
#
# The current bench harness (src/bench/runner.ts) does not support multi-run
# sequences or DB assertions (reflection_metrics, memory tables).  The grader
# (src/bench/grader.ts) only evaluates:
#   A — tool calls from activity.log
#   B — checklist tool usage
#   C — file output constraints (file_first_line_equals, file_no_pattern, etc.)
#   D — LLM judge rubrics against output files
#
# Therefore this YAML exercises a single task whose prompt explicitly carries
# the lesson ("one-line terse reply") that reflection would have injected into
# the system prompt on a second run.  The programmatic constraints enforce the
# structural signature of a terse reply, and the LLM judge validates content
# quality.  This gives a useful regression gate even without multi-step support.
#
# FULL TWO-RUN FLOW (for manual / integration testing)
# ─────────────────────────────────────────────────────
# 1. Start orchestrator with reflection.enabled: true and a reflection worker.
# 2. Submit a chat task with body:
#      "Summarise the Pythagorean theorem."
#    The agent will produce a verbose multi-paragraph response.
# 3. Rate that task as bad (feedback_rating='bad') via the UI.
# 4. Wait ~60 s.  A job with task_kind='reflection' should appear in the jobs
#    table, and reflection_metrics should record outcome='applied'.
#    Verify: SELECT outcome FROM reflection_metrics ORDER BY created_at DESC LIMIT 1;
# 5. In data/users/<userId>/memory/, confirm a file like
#    feedback_user_prefers_terse_output.md exists.
# 6. Submit a second task: "Summarise the Pythagorean theorem."
#    The reflection memory should now be in the system prompt.
#    The response should be ≤ 3 sentences with no "Certainly!" preamble.
#
# HOW THIS FILE IS DISCOVERED
# ───────────────────────────
# The bench runner (scripts/bench-run.ts) does:
#   glob("bench/tasks/*.yaml")
# No registration step is needed.  Drop this file and it is automatically
# included in `npm run bench` and `npm run bench -- --task=reflection-smoke`.

# Task identity.  The id is the value passed to `--task=` when running a
# single bench task (see the discovery note in the header).
id: 'reflection-smoke'
title: 'Reflection smoke — terse reply under explicit lesson'
# NOTE(review): piece_hint presumably tells the harness which piece/agent
# flavour should handle the task — confirm against the runner.
piece_hint: 'chat'
# Time budget for the task, in minutes.
timeout_minutes: 5

# Prompt handed to the agent.  It carries the "terse, no preamble" preference
# inline, standing in for the memory entry that reflection would have injected
# into the system prompt on a second run.  The numbered instructions mirror
# the programmatic constraints under `grading` (single line, no preamble,
# at most 120 characters, written to output/answer.md).
prompt: |
  IMPORTANT USER PREFERENCE (simulating a reflection-injected memory entry):
  The user prefers terse, one-line replies with no preamble phrases such as
  "Certainly!", "Of course!", "Sure!", "Great question!", or "Happy to help!".

  Task: What is the Pythagorean theorem?

  Instructions:
  1. Write your answer to `output/answer.md`.
  2. The answer MUST be a single Markdown line (no headings, no bullet lists).
  3. The line MUST NOT start with a preamble phrase.
  4. The line MUST be 120 characters or fewer.

# Run-level expectations: tool usage, produced files, and final task status.
expected:
  # The answer has to be written with the Write tool ...
  must_use_tools:
    - Write
  # ... and must not be produced via a shell.
  forbidden_tools:
    - Bash
  # The file the prompt asks for; the grading constraints all target it.
  must_produce_files:
    - output/answer.md
  # Acceptable terminal statuses for the task.
  completion_status:
    - succeeded

grading:
  # Deterministic checks against output/answer.md.  Together with the
  # llm_judge weight (0.4) the weights sum to 1.0.
  #
  # NOTE(review): the `(?i)` / `(?m)` inline-flag prefixes used below are not
  # accepted by a plain JavaScript RegExp.  Presumably the grader
  # (src/bench/grader.ts) translates them into flags — confirm, otherwise
  # these patterns will fail to compile.
  programmatic:
    weight: 0.6
    constraints:
      # The output must be a single non-empty line.  Flags any two lines that
      # both contain non-whitespace, however many blank lines separate them
      # (the previous pattern allowed at most one blank line in between and so
      # missed "line1\n\n\nline2").  `[^\n]*` rather than `.*` keeps the check
      # working for CRLF files; no regex flags are required.
      - type: file_no_pattern
        file: output/answer.md
        pattern: '\S[^\n]*\n\s*\S'
      # Must not start with a heading marker.
      - type: file_no_pattern
        file: output/answer.md
        pattern: '^#'
      # Must not start with common preamble phrases.  Each phrase is followed
      # by exactly one punctuation mark from [!,.]; "sure" previously carried
      # its own [,!] as well, so "Sure!" needed two marks and slipped through.
      - type: file_no_pattern
        file: output/answer.md
        pattern: '(?i)^(certainly|of course|sure|great question|happy to help|absolutely)[!,.]'
      # Must not use bullet / numbered lists.  Only real list markers match:
      # "-", "*" or "+" followed by a space/tab, or digits followed by "." or
      # ")" and a space/tab.  Lines that merely begin with a digit, a minus
      # sign or Markdown emphasis (e.g. "3² + 4² = 5²", "*a*² + …") are fine.
      - type: file_no_pattern
        file: output/answer.md
        pattern: '(?m)^[ \t]*(?:[-*+]|\d+[.)])[ \t]'
      # Each line ≤ 120 chars (the single content line).
      - type: file_line_max_chars
        file: output/answer.md
        max: 120

  # LLM-judged portion of the grade (weight 0.4; the programmatic portion
  # carries the remaining 0.6).  Per the header, rubrics are evaluated against
  # the output files.
  llm_judge:
    weight: 0.4
    rubrics:
      # Style rubric: rewards a direct, short, preamble-free statement and
      # deducts proportionally for verbosity, preamble, or inaccuracy.
      - name: terseness
        prompt: |
          The output should be a single terse line (no preamble, no bullet list,
          no heading) that correctly states the Pythagorean theorem.
          Score 10 if the answer is ≤ 2 short sentences, factually correct, and
          starts directly with the mathematical content (e.g. "In a right triangle…"
          or "a² + b² = c²…").
          Deduct points proportionally for verbosity, preamble phrases, or inaccuracy.
        max_score: 10
      # Correctness rubric: all-or-nothing (10 or 0), independent of style.
      - name: factual_accuracy
        prompt: |
          Does the answer correctly state the Pythagorean theorem
          (a² + b² = c² for a right triangle)?  Score 10 for correct, 0 for wrong.
        max_score: 10