#!/usr/bin/env bash
#MISE description="Auto-discover and run every plugins/*/hooks/tests/test-*.sh regression test. Aggregates pass/fail counts per test file and emits a marketplace-wide summary. Exits non-zero if any single test file fails. Use before cutting a release, or after editing any hook to verify the iter-39/40/41/46/47-style regression guards still pass. Self-contained — no CI/GitHub Actions required (cc-skills local-first policy)."
#
# test-marketplace-hook-regression-suite
#
# Iter-50 test-discoverability fix: regression tests added across iter-39
# through iter-47 (5 tests so far) all live in plugins/*/hooks/tests/test-*.sh
# but were INVISIBLE to anyone not grep'ing for the verbose filenames. This
# task makes them discoverable via a single command:
#
#   mise run test-marketplace-hook-regression-suite
#
# Auto-discovers via the glob `plugins/*/hooks/tests/test-*.sh`. New tests
# added in future iters following this convention are picked up
# automatically — no task edit required.
#
# Per the user "self-explanatory scaffolding" directive: the task name
# encodes WHAT it tests (marketplace-wide), WHICH layer (hook regression),
# and WHAT TYPE of runs are aggregated (the test suite, not benchmarks).
# Benchmarks live alongside as bench-*.sh and are NOT run by this task —
# they're for manual perf investigation, not regression coverage.

# Strict mode, spelled with long option names: abort on an unhandled
# command failure, on expansion of an unset variable, and when any stage
# of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Iter-35 bash-5.2-patsub-replacement-defense (cross-plugin sweep): turn
# the option off. If the running shell rejects the option name, the error
# text is discarded and the failure is tolerated.
shopt -u patsub_replacement 2>/dev/null || true

# Iter-75 recursion guard. The parity regression test for the parallel
# runner (test-marketplace-hook-regression-suite-parallel-runner-iter75-
# parity-against-iter50-sequential-baseline-and-iter54-fail-output-ux.sh)
# lives in .mise/tasks/tests/, so THIS runner auto-discovers and runs it.
# That parity test in turn invokes THIS runner to verify integration
# parity, which would recurse forever without a guard. The exported
# variable below is the "already inside the parent invocation" sentinel:
# when it is set, the parity test can skip its live-tier integration
# assertion (its synthetic tier still runs to exercise the parallelism
# primitive). When the parity test is run standalone (operator invoking
# it directly via bash) the variable is unset and both tiers run.
MARKETPLACE_HOOK_REGRESSION_SUITE_PARENT_INVOCATION_RECURSION_GUARD=1
export MARKETPLACE_HOOK_REGRESSION_SUITE_PARENT_INVOCATION_RECURSION_GUARD

# Repo root is resolved as two directory levels above this script's own
# directory, as an absolute path.
REPO_ROOT="$(
    cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd
)"

# Opening banner (three header lines, the discovery notice, a blank line).
printf '%s\n' \
    "═══════════════════════════════════════════════════════════" \
    "  Marketplace Hook Regression Suite" \
    "═══════════════════════════════════════════════════════════" \
    "→ Auto-discovering tests at plugins/*/hooks/tests/test-*.sh" \
    ""

# Discover all test files at TWO well-known paths:
#
#   1. plugins/*/hooks/tests/test-*.sh
#      Per-plugin hook regression tests (iter-50 original convention).
#
#   2. .mise/tasks/tests/test-*.sh
#      Repo-level infrastructure tests for mise tasks themselves —
#      added iter-59 so the async-eligibility audit task (and any
#      future load-bearing mise task) can have discoverable regression
#      coverage. Plugin-scope tests don't belong here because the audit
#      task lives outside any single plugin's scope.
#
# Sort for stable output and reproducible test order across both
# discovery paths.
# Collect the sorted union of both discovery paths into TEST_FILES, one
# path per input line, keeping only entries that are regular files at the
# moment they are read.
TEST_FILES=()
while IFS= read -r discovered_candidate_path; do
    if [[ -f "$discovered_candidate_path" ]]; then
        TEST_FILES+=("$discovered_candidate_path")
    fi
done < <(
    {
        # Iter-127 perf-win: -mindepth 4 -maxdepth 4 confines plugin-scope
        # discovery to exactly plugins/<plugin>/hooks/tests/<file> depth,
        # skipping descent into every plugin's skills/, scripts/,
        # references/, node_modules/, etc. Empirically measured:
        # 340ms -> 20ms (~15x speedup for this find alone, ~320ms saved
        # per Check 4e invocation). Same iter-125 bounded-depth pattern.
        find "$REPO_ROOT/plugins" -mindepth 4 -maxdepth 4 -type f \
            -name 'test-*.sh' -path '*/hooks/tests/*' 2>/dev/null
        # Repo-level infrastructure tests: direct children only.
        find "$REPO_ROOT/.mise/tasks/tests" -maxdepth 1 -name 'test-*.sh' -type f 2>/dev/null
    } | sort
)

# Nothing discovered: explain both naming conventions and stop with a
# success status (an empty suite is not treated as a failure here).
if (( ${#TEST_FILES[@]} == 0 )); then
    printf '%s\n' \
        "⚠  No tests found at either discovery path:" \
        "   - plugins/*/hooks/tests/test-*.sh   (per-plugin hook tests)" \
        "   - .mise/tasks/tests/test-*.sh       (infrastructure mise-task tests)" \
        "   Add tests following these naming conventions to have them auto-discovered."
    exit 0
fi

# Report how many files will run, followed by a blank separator line.
printf '→ Discovered %s test file(s)\n\n' "${#TEST_FILES[@]}"

# Iter-75 parallel-execution rewrite. Iter-74 baseline measurement
# identified Check 4e (this suite) at 3870ms / 48.8% of preflight as the
# top-1 wall-time bottleneck after iter-74 dropped Check 4b from 2032ms
# to 73ms. Per-test forensic profile (warm, sequential):
#
#   ms     test (truncated)
#   ----   -------------------------------------------------------------
#   635    userpromptsubmit-1password-context-injection-prejq-fastpath
#   429    posttooluse-1password-pattern-reminder-leading-executable
#   406    audit-hooks-for-async-true-eligibility-eight-layers
#   371    audit-stop-hook-additionalContext-emission-detection
#   216    audit-non-pretooluse-hooks-wrong-field-detection
#   <180   (remaining 10 tests)
#   ----
#   3324   sum
#
# Distribution is NOT Pareto-dominant — top 4 tests = 55.4% of suite
# time. Parallelization is the correct lever. Theoretical lower bound
# is the longest single test (~635ms = 5.2× speedup) since no further
# bin-packing can shrink wall-time below the max-test runtime.
#
# Concurrency tool choice: GNU parallel would give the best output-
# grouping semantics (`--group` buffers per-job stdout and emits
# atomically), but is NOT installed by default on macOS / mise / brew
# and would add a hard install dep. POSIX `xargs -P` is ubiquitous but
# has no output-grouping — fatal for diagnosable test logs IF tests
# wrote to stdout. We sidestep that by giving each test its OWN
# per-test stdout-capture file (the iter-54 design), so xargs's lack
# of `--group` doesn't matter: tests can run truly concurrently, each
# writing to a distinct file, and the sequential aggregation phase
# reads them back in stable sort order.
#
# Concurrency cap: ADAPTIVE per-host (operator-tunable via
# MARKETPLACE_HOOK_REGRESSION_PARALLEL_LANES). When unset, default is
# clamp(sysctl_hw_ncpu - 4, 4, 12) — i.e., leave 4 cores for OS+IDE,
# but never go below 4 or above 12. The iter-128 perf-win below is
# motivated by an empirical bottleneck-analysis with three-rep medians:
#
#   Sweep across lane counts on 14-core M-series host (57 tests):
#     lanes=8  → 3.75 s wall-clock median  (pre-iter-128 baseline)
#     lanes=10 → 3.09 s wall-clock median  (-660 ms vs lanes=8)
#     lanes=12 → 3.23 s wall-clock median  (bun-spawn contention shows)
#     lanes=14 → 2.89 s wall-clock median  (no OS/IDE headroom — unsafe)
#
#   Sweet spot is lanes=10:
#     - 660 ms saved vs pre-iter-128 (~18% reduction in Check 4e)
#     - 4-core headroom protects parent mise/IDE responsiveness during
#       the parallel fan-out
#     - lanes=12 shows non-monotonic regression due to bun's ~12-17ms
#       cold-start floor (measured by iter-80) hitting process-scheduling
#       contention with longest-test critical path
#     - lanes=14 is only 200ms faster than lanes=10 — not worth losing
#       the OS+IDE headroom for
#
# Why floor=4: protect low-end laptops (Intel MacBook Air with 2-core
# hyperthreaded = 4 logical cores), where the raw `ncpu - 4` formula
# yields 0 lanes, from a degenerate zero-/single-lane (effectively
# sequential) configuration that would make the suite ~4× slower.
#
# Why ceiling=12: above 12 lanes the marginal speedup is offset by bun
# process scheduling contention (measured non-monotonic regression at
# lanes=12 vs lanes=10 above). Operators on >16-core hosts who want to
# experimentally push lanes higher can set the env var explicitly.
#
# References:
#   - https://www.gnu.org/software/parallel/parallel_alternatives.html
#     (GNU parallel maintainers' alternatives doc — confirms xargs -P
#     is acceptable when per-job output is captured separately)
#   - iter-54 docs (full-output-on-FAIL, compact-on-PASS UX)
#   - iter-74 docs/RELEASE.md iter-75+ candidate forensic notes
#   - iter-128 forensic notes (this comment) — adaptive lane sizing
# Print the default number of parallel lanes for this host.
# Outputs: one integer on stdout, equal to (host CPU count - 4) clamped
#          into the inclusive range [4, 12].
__iter128_compute_adaptive_marketplace_hook_regression_parallel_lanes_clamped_against_host_cpu_count_with_four_cpu_headroom_for_os_and_ide_avoiding_bun_cold_start_contention_plateau_at_twelve_lanes() {
    # CPU count probe order: `sysctl -n hw.ncpu` (macOS BSD), then `nproc`
    # (Linux GNU), then a literal 8 when neither command succeeds.
    local host_cpu_total
    host_cpu_total=$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)

    # Reserve four CPUs, then clamp. The bounds come from the three-rep
    # median sweep on a 14-core M-series host described in the header
    # comment.
    local lane_total=$((host_cpu_total - 4))
    if ((lane_total < 4)); then
        lane_total=4
    fi
    if ((lane_total > 12)); then
        lane_total=12
    fi
    printf '%s\n' "$lane_total"
}

# An operator-supplied, non-empty MARKETPLACE_HOOK_REGRESSION_PARALLEL_LANES
# wins; otherwise fall back to the adaptive per-host default.
if [[ -z "${MARKETPLACE_HOOK_REGRESSION_PARALLEL_LANES:-}" ]]; then
    MARKETPLACE_HOOK_REGRESSION_PARALLEL_LANES="$(__iter128_compute_adaptive_marketplace_hook_regression_parallel_lanes_clamped_against_host_cpu_count_with_four_cpu_headroom_for_os_and_ide_avoiding_bun_cold_start_contention_plateau_at_twelve_lanes)"
fi

# Single per-run results directory holds one .stdout file + one .exit
# file per test, keyed by test basename (test names are unique-by-design
# in this marketplace — no collision risk). Deterministic key avoids
# the random mktemp suffix that would prevent the post-pass aggregator
# from locating each test's output.
# Aggregate tallies filled in by the sequential aggregation phase.
PASSED_TEST_FILES_COUNT=0
FAILED_TEST_FILES_COUNT=0
FAILED_TEST_FILES_LIST=()

# Fresh per-run results directory, removed again on every exit path of
# this script via the EXIT trap.
PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY="$(mktemp -d -t marketplace-hook-regression-suite-results.XXXXXX)"
__remove_per_run_test_output_capture_directory_on_exit() {
    rm -rf "$PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY"
}
trap __remove_per_run_test_output_capture_directory_on_exit EXIT

# Phase 1 — PARALLEL FAN-OUT. Each test runs concurrently in its own
# bash subprocess, with stdout+stderr captured to a per-test file in
# the shared results directory. The inline `bash -c` worker is tiny
# (no shared bash functions needed) so we avoid the overhead of
# exporting functions across processes. xargs ensures only N tests
# run concurrently; the call blocks until ALL tests finish.
#
# xargs portability notes:
#
#   * `-S 16384` (iter-90 forensic finding): macOS BSD `xargs -I {}` has
#     a default `-S` (replacement-string size) limit of 255 bytes. When
#     the iter-90 test filename
#     (`test-pretooluse-edit-time-orchestrator-iter90-...verified.sh`,
#     233 chars) joined the corpus, the assembled per-invocation command
#     grew to ~280 chars and BSD xargs failed with
#     `xargs: command line cannot be assembled, too long`. 16 KB is far
#     above any realistic test path while staying safely below ARG_MAX
#     (~1 MB on macOS). `-S` is a BSD extension: GNU xargs (Linux) never
#     had the 255-byte cliff and rejects `-S` as an invalid option, so
#     the flag is added only when a probe invocation shows this host's
#     xargs accepts it.
#
#   * `-0` with NUL-delimited input: in its default line mode xargs
#     interprets quotes and backslashes in its input, so a repo checked
#     out under a path containing an apostrophe would abort the fan-out
#     with an "unterminated quote" error. NUL-delimited records are
#     passed through verbatim.
#
# Failure handling: a non-zero xargs status is reported on stderr but
# does NOT abort the script here. The sequential aggregation phase
# treats every test that has no exit-code file as FAILED, so a broken
# fan-out still produces a failing (and diagnosable) run rather than a
# silent early exit.
#
# Arguments:
#   $1    maximum number of concurrently running tests (xargs -P)
#   $2    results directory receiving <basename>.stdout, <basename>.exit
#         and <basename>.elapsed_ms for every test
#   $3... paths of the test files to run
# Returns: 0 when there was nothing to run; otherwise the exit status of
#          the printf | xargs pipeline.
__run_marketplace_hook_regression_tests_via_parallel_xargs_fan_out() {
    local parallel_lane_count="$1"
    local results_directory="$2"
    shift 2
    # Nothing to run; also avoids feeding xargs a single empty record.
    (( $# > 0 )) || return 0

    local -a xargs_invocation=(xargs -0)
    # Probe with empty input: BSD xargs accepts -S and exits 0 without
    # running anything; GNU xargs fails during option parsing.
    if xargs -S 16384 -I {} true {} </dev/null >/dev/null 2>&1; then
        xargs_invocation+=(-S 16384)
    fi
    xargs_invocation+=(-P "$parallel_lane_count" -I {})

    # shellcheck disable=SC2016
    # SC2016 false positive: the $1 and $2 inside the single-quoted
    # bash -c body refer to the positional args of that NESTED bash, not
    # to this scope. Single-quoting is REQUIRED to defer their expansion
    # until xargs invokes the per-test bash subprocess.
    printf '%s\0' "$@" | "${xargs_invocation[@]}" bash -c '
        single_test_file_path="$1"
        per_run_results_dir="$2"
        per_test_result_file_stem="$per_run_results_dir/$(basename "$single_test_file_path" .sh)"
        # Iter-131: per-test wall-clock goes to a sidecar `.elapsed_ms`
        # file using bash 5+ $EPOCHREALTIME (no fork, so it is always
        # written; the opt-in top-N-slowest summary reads it later).
        # Some locales render the radix as a comma; normalise it to a
        # dot so awk parses the fractional part.
        iter131_per_test_wall_clock_start_seconds="${EPOCHREALTIME/,/.}"
        if bash "$single_test_file_path" > "$per_test_result_file_stem.stdout" 2>&1; then
            echo 0 > "$per_test_result_file_stem.exit"
        else
            echo "$?" > "$per_test_result_file_stem.exit"
        fi
        iter131_per_test_wall_clock_end_seconds="${EPOCHREALTIME/,/.}"
        # Portable sub-second math via awk (same primitive as iter-73
        # preflight). The sidecar is informational only, so a failure to
        # write it must not make xargs report the whole fan-out as failed.
        awk -v s="$iter131_per_test_wall_clock_start_seconds" \
            -v e="$iter131_per_test_wall_clock_end_seconds" \
            "BEGIN { printf \"%.0f\", (e - s) * 1000 }" \
            > "$per_test_result_file_stem.elapsed_ms" || true
    ' _ {} "$results_directory"
}

parallel_fan_out_exit_status=0
__run_marketplace_hook_regression_tests_via_parallel_xargs_fan_out \
    "$MARKETPLACE_HOOK_REGRESSION_PARALLEL_LANES" \
    "$PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY" \
    "${TEST_FILES[@]}" || parallel_fan_out_exit_status=$?
if [[ "$parallel_fan_out_exit_status" -ne 0 ]]; then
    echo "⚠  Parallel fan-out (xargs) exited with status $parallel_fan_out_exit_status — any test without an exit-code file is reported as FILE-FAIL below." >&2
fi

# Phase 2 — SEQUENTIAL AGGREGATION. Iterate the original sorted test
# list (deterministic order) and render each test's result. The iter-54
# UX is preserved bit-for-bit: full output on FAIL, last-3-lines on
# PASS. Aggregation happens AFTER all parallel jobs complete, so the
# output stream is identical to the iter-50 sequential reference in
# terms of ordering — only the run-time has changed.
# Render one result section per discovered test, in discovery order, and
# update the pass/fail tallies. Each section is a header line, the
# captured output (compact or full), a verdict line, and a blank line.
for test_file in "${TEST_FILES[@]}"; do
    # Repo-relative path keeps the section header readable.
    relative_path="${test_file#"$REPO_ROOT"/}"
    test_name="$(basename "$test_file" .sh)"
    test_stdout_capture_file="$PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY/$test_name.stdout"
    test_exit_code_file="$PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY/$test_name.exit"

    printf '─── %s ───\n' "$relative_path"

    if [[ ! -f "$test_exit_code_file" ]]; then
        # Defensive: no exit-code file means the fan-out never recorded a
        # result for this test (e.g. xargs aborted, disk full, results
        # dir wiped). Count it as FAILED rather than silently PASSing, so
        # the release gate cannot go green on missing evidence.
        printf '  ✗ FILE-FAIL: %s (no exit-code file from parallel fan-out — likely xargs aborted before this test ran)\n' "$test_name"
        ((FAILED_TEST_FILES_COUNT += 1))
        FAILED_TEST_FILES_LIST+=("$relative_path")
    else
        captured_exit_code="$(<"$test_exit_code_file")"
        if [[ "$captured_exit_code" != "0" ]]; then
            # FAIL path: replay the FULL captured output so the operator
            # can diagnose without re-running the test manually.
            cat "$test_stdout_capture_file"
            printf '  ✗ FILE-FAIL: %s (exit=%s)\n' "$test_name" "$captured_exit_code"
            ((FAILED_TEST_FILES_COUNT += 1))
            FAILED_TEST_FILES_LIST+=("$relative_path")
        else
            # PASS path: compact summary, last three captured lines only.
            tail -n 3 "$test_stdout_capture_file"
            printf '  ✓ FILE-PASS: %s\n' "$test_name"
            ((PASSED_TEST_FILES_COUNT += 1))
        fi
    fi
    echo ""
done

# Marketplace-wide summary: discovered / passed / failed counts.
printf '%s\n' \
    "═══════════════════════════════════════════════════════════" \
    "  Marketplace Hook Regression Suite — Summary" \
    "═══════════════════════════════════════════════════════════" \
    "  Test files discovered: ${#TEST_FILES[@]}" \
    "  Test files PASSED:     $PASSED_TEST_FILES_COUNT" \
    "  Test files FAILED:     $FAILED_TEST_FILES_COUNT"

# Any failed file: list the failures, print the re-run hint, and end the
# script with status 1.
if ((FAILED_TEST_FILES_COUNT > 0)); then
    printf '\n%s\n' "  Failed test files:"
    printf '    - %s\n' "${FAILED_TEST_FILES_LIST[@]}"
    printf '\n%s\n%s\n' \
        "  Run a failing test directly to see its detailed output:" \
        "    bash <test-file-path>"
    exit 1
fi

# Iter-131: opt-in top-N-slowest-tests ranking summary. Mirrors iter-130's
# preflight bottleneck-ranking pattern: operators iterating on test perf
# see WHICH tests dominate the wall-clock without instrumenting each
# manually. Off by default to preserve bit-for-bit output for CI/release
# gate consumers; opt in with
#   MARKETPLACE_HOOK_REGRESSION_SUITE_TOP_N_SLOWEST_TESTS_TO_DISPLAY=N
# Workflow motivation: iter-128's adaptive-parallel-lanes work surfaced
# the question "which individual tests are slowest?" and the only way to
# answer was bespoke per-test profiling. Iter-131 makes that data routine.
#
# Globals read:
#   MARKETPLACE_HOOK_REGRESSION_SUITE_TOP_N_SLOWEST_TESTS_TO_DISPLAY
#   PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY (holds the *.elapsed_ms sidecars)
# Outputs: the ranking on stdout; a warning on stderr for an invalid N.
# Returns: always 0. This is a display-only helper that runs after every
#          test has passed, so a bad knob value or a display hiccup must
#          never turn a green suite into a non-zero exit.
__iter131_emit_top_n_slowest_marketplace_hook_regression_tests_ranked_by_wall_clock_milliseconds_descending_bottleneck_summary() {
    local top_n_threshold_for_slowest_test_ranking_display="${MARKETPLACE_HOOK_REGRESSION_SUITE_TOP_N_SLOWEST_TESTS_TO_DISPLAY:-0}"
    # Zero (in any spelling: 0, 00, ...) means the feature is off.
    if [[ "$top_n_threshold_for_slowest_test_ranking_display" =~ ^0+$ ]]; then
        return 0
    fi
    # Anything that is not a plain non-negative integer (abc, -3, 2.5)
    # is ignored with a warning instead of being handed to the pipeline,
    # where it would fail under `set -e` + pipefail.
    if [[ ! "$top_n_threshold_for_slowest_test_ranking_display" =~ ^[0-9]+$ ]]; then
        printf '  ⚠  Ignoring MARKETPLACE_HOOK_REGRESSION_SUITE_TOP_N_SLOWEST_TESTS_TO_DISPLAY=%s (expected a positive integer); skipping slowest-tests summary.\n' \
            "$top_n_threshold_for_slowest_test_ranking_display" >&2
        return 0
    fi
    if [[ ! -d "${PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY:-}" ]]; then
        return 0
    fi
    # Build TAB-separated "elapsed_ms<TAB>test_basename" records from the
    # .elapsed_ms sidecars written by the parallel worker.
    local elapsed_ms_sidecar_file_path test_basename elapsed_ms_value
    local -a per_test_timing_records_for_ranking=()
    for elapsed_ms_sidecar_file_path in "$PER_RUN_TEST_OUTPUT_CAPTURE_DIRECTORY"/*.elapsed_ms; do
        # Skips the literal pattern when the glob matched nothing.
        [[ -f "$elapsed_ms_sidecar_file_path" ]] || continue
        test_basename=$(basename "$elapsed_ms_sidecar_file_path" .elapsed_ms)
        elapsed_ms_value=$(cat "$elapsed_ms_sidecar_file_path")
        per_test_timing_records_for_ranking+=("${elapsed_ms_value}"$'\t'"${test_basename}")
    done
    if [[ "${#per_test_timing_records_for_ranking[@]}" -eq 0 ]]; then
        return 0
    fi
    echo ""
    echo "  ⧗ ─── Top ${top_n_threshold_for_slowest_test_ranking_display} slowest tests (iter-131 marketplace-suite bottleneck ranking) ───"
    # The top-N cut happens inside awk, which reads ALL of sort's output.
    # A `sort | head -n N` pipeline lets head close the pipe early, which
    # can kill sort with SIGPIPE and, under pipefail, fail the pipeline.
    printf '%s\n' "${per_test_timing_records_for_ranking[@]}" \
        | sort -rn -k1 \
        | awk -F'\t' -v n="${top_n_threshold_for_slowest_test_ranking_display}" \
            'NR <= n + 0 { printf "      %2d. %6d ms  %s\n", NR, $1, $2 }'
    echo "  ⧗ (override count via MARKETPLACE_HOOK_REGRESSION_SUITE_TOP_N_SLOWEST_TESTS_TO_DISPLAY=N)"
}
__iter131_emit_top_n_slowest_marketplace_hook_regression_tests_ranked_by_wall_clock_milliseconds_descending_bottleneck_summary

echo ""
echo "  ✓ All marketplace hook regression tests PASSED"
