#!/usr/bin/env bash
# 5dive agent management CLI — runs on user's runtime VM.
# State: /var/lib/5dive/agents.json (registry) + agents.d/<name>.env (per-agent systemd env).
# Each agent = Linux user `agent-<name>` in `claude` group (inherits shared
# /home/claude/.config|.claude|.codex|.aws) + systemd unit 5dive-agent@<name>.service
# running tmux session `agent-<name>` with the chosen CLI in a restart loop.
#
# Output contract:
#   - `--json` is accepted as a GLOBAL flag on any subcommand; stdout is then an
#     envelope `{ok:true,data:...}` on success or `{ok:false,error:{code,class,message}}`
#     on error. Text-mode stderr stays human-readable. Exit code always matches
#     error.code (see E_* below) so shell pipelines can branch without parsing.
#   - Progress `==>` lines always go to stderr so JSON stdout parses cleanly.
set -euo pipefail

# adduser, usermod and userdel live in /usr/sbin and /sbin. Interactive shells
# already have those on PATH, but a systemd unit that overrides PATH= (or any
# other restricted parent) can drop them, and the very first agent-create then
# fails with "adduser: command not found". When /usr/sbin is absent from PATH,
# prepend all three sbin dirs; a PATH that already lists /usr/sbin is left
# untouched.
if [[ ":${PATH}:" != *":/usr/sbin:"* ]]; then
  export PATH="/usr/local/sbin:/usr/sbin:/sbin:${PATH}"
fi

# Bumped on every public release. `build.sh` checks this line exists; CI fails
# the bundle-drift check if it's missing or empty.
readonly FIVE_VERSION="0.1.84"

STATE_DIR="/var/lib/5dive"
REGISTRY="${STATE_DIR}/agents.json"
ENV_DIR="${STATE_DIR}/agents.d"
SYSTEMD_UNIT="5dive-agent@"

# Bumped when the on-disk registry shape changes in a way that older CLIs
# can't read. ensure_state stamps this into agents.json on create + migrates
# v0 (no version field) registries in place. Keep migrations pure-jq so they
# run without extra deps.
readonly REGISTRY_SCHEMA_VERSION=1

# Exclusive lock for mutating commands. Two dashboard clicks on "create" with
# the same name used to race on adduser + registry_write; now every mutation
# goes through with_registry_lock so there's exactly one writer at a time.
REGISTRY_LOCK="${STATE_DIR}/registry.lock"

# Append-only audit trail. Every mutating CLI invocation emits one NDJSON
# line with {ts,user,cmd,args,result,code}. Sensitive flags (api keys, bot
# tokens, callback codes) are redacted before write. The HTTP/exec path can
# pass the Clerk user via FIVEDIVE_AUDIT_USER; otherwise we fall back to
# SUDO_USER / USER.
AUDIT_LOG="/var/log/5dive/agent-audit.log"

# Named auth profiles let two agents of the same type authenticate against
# different accounts/keys. Each profile is a directory of env files (one per
# type) + any captured CLI config (e.g. a per-profile ~/.claude). The default
# profile has no name and uses the shared /etc/5dive/connectors/*.env files
# so existing single-account setups keep working unchanged.
AUTH_PROFILES_DIR="${STATE_DIR}/auth-profiles"

# Device-code login sessions for the non-TTY auth flow. Each live session is
# a tmux window owned by the `claude` user, driving `claude setup-token` (or
# equivalent). State lives under sessions/<id>/ — the dashboard polls it via
# `5dive agent auth poll` so no PTY bridge is required.
AUTH_SESSIONS_DIR="${STATE_DIR}/auth-sessions"

# Default tmux cwd for a newly-created agent. Per-agent override goes in the
# registry as .agents[name].workdir and is written to AGENT_WORKDIR in the
# systemd env file — 5dive-agent-start.sh reads it and falls back to this
# path if the configured dir isn't accessible.
DEFAULT_WORKDIR="/home/claude/projects"

# Per-agent channel secrets live here (readable by the agent user via
# EnvironmentFile in 5dive-agent@.service). Files are written mode 0640
# root:claude by the inline _write_connector helper (which replaced the suid
# 5dive-write-connector binary), so perms stay consistent.
CONNECTORS_DIR="/etc/5dive/connectors"

# Known agent types -> absolute path of the type's CLI binary. Membership in
# this map is what is_known_type() treats as "a valid agent type"; channel
# support, auth sentinels and installer recipes live in the sibling TYPE_*
# maps (TYPE_CHANNELS, TYPE_AUTH, TYPE_INSTALL).
# Extend here to add a new agent type.
declare -A TYPE_BIN=(
  [claude]="/home/claude/.local/bin/claude"
  [codex]="/home/claude/.nvm/versions/node/v24/bin/codex"
  [hermes]="/home/claude/.local/bin/hermes"
  [openclaw]="/home/claude/.local/bin/openclaw"
  [opencode]="/home/claude/.local/bin/opencode"
  # antigravity is Google's native-Go successor to gemini-cli. The installer
  # lands it at ~/.local/bin/agy. State dir is ~/.gemini/antigravity-cli/
  # (the binary identifies as product=antigravity but reuses Google's
  # ~/.gemini parent — see launch log in the antigravity scaffold landed
  # in 5dive@<post-removal>).
  [antigravity]="/home/claude/.local/bin/agy"
  # grok is xAI's CLI. Installer drops the binary at ~/.grok/bin/grok and
  # symlinks ~/.local/bin/grok — we point TYPE_BIN at the symlink to match
  # the convention of the other types.
  [grok]="/home/claude/.local/bin/grok"
)
# Which types accept --channels=telegram|discord. Each type wires the channel
# differently (see install_channel_for_<type>_agent below):
#   claude   — installs claude-plugins-official's telegram/discord plugin into
#              the agent user's ~/.claude/plugins; the bun server writes
#              ~/.claude/channels/<plugin>/access.json on first launch and
#              cmd_pair pops a pairing code into it.
#   openclaw — `openclaw channels add --channel <ch> --token <token>` writes
#              the credential into the openclaw gateway config; the openclaw
#              `pairing` subcommand handles inbound user approvals separately.
#   hermes   — writes TELEGRAM_BOT_TOKEN / DISCORD_BOT_TOKEN to the agent
#              user's ~/.hermes/.env; hermes' gateway picks it up at startup.
#   codex    — writes the bot token + access.json into the agent user's
#              ~/.codex/channels/telegram/; 5dive-agent-start wires the
#              telegram-codex MCP server + lifecycle hooks into config.toml
#              and launches codex with --dangerously-bypass-hook-trust.
#              telegram only (no discord build for codex yet).
#   grok     — same shape as codex: writes ~/.grok/channels/telegram/{.env,
#              access.json}; 5dive-agent-start writes [mcp_servers.telegram]
#              + [[hooks.*]] into ~/.grok/config.toml. grok runs with
#              --always-approve (set in 5dive-agent-start), which also
#              auto-trusts plugin/MCP commands. telegram only.
# Only claude needs the pair-code roundtrip — see cmd_pair's dispatch.
declare -A TYPE_CHANNELS=(
  # Every entry is 1: a type is channel-capable when its key is listed here.
  # NOTE(review): the lookup sites are outside this view — confirm how they
  # test the map before ever adding an entry with a value other than 1.
  [claude]=1
  [openclaw]=1
  [hermes]=1
  [codex]=1
  [grok]=1
  # opencode ships a telegram bridge too, but as a STANDALONE RELAY (not an MCP
  # server): telegram-opencode/server.ts IS the agent's main process and spawns
  # `opencode serve` over loopback HTTP. 5dive-agent-start launches `bun run
  # --cwd <plugin> start` instead of the opencode TUI; install writes the token
  # + access.json into ~/.opencode/channels/telegram. telegram only.
  [opencode]=1
  # antigravity (agy) ships the same telegram MCP bridge as grok/codex —
  # ~/.gemini/channels/telegram/{.env,access.json} + a shared plugin checkout
  # whose MCP server + lifecycle hooks 5dive-agent-start writes into the
  # GLOBAL ~/.gemini/config/{mcp_config.json,hooks.json} at boot (agy doesn't
  # auto-load a plugin's mcp_config/hooks — only skills/agents). telegram only.
  [antigravity]=1
)
# Auth sentinel per type. Agent users run as agent-<name> (in group `claude`)
# and cannot read /home/claude/.claude/settings.json (mode 0600), so for
# claude-family types we check /etc/5dive/connectors/anthropic.env (0640
# root:claude) — that's the file systemd injects via EnvironmentFile.
# Format: "<path>"          -> file must exist and be non-empty
#         "<path>:<KEY>"    -> if path ends in .env, grep ^KEY=; else jq .env[KEY]
# Omit a type entirely to mark it auth-optional — auth_status_one returns "ok"
# without checking. opencode is the canonical example: it ships with free models
# and runs out of the box, so the dashboard shouldn't gate `agent create` on a
# sign-in the user doesn't need.
declare -A TYPE_AUTH=(
  # "<path>:<KEY>" form against a .env file: satisfied by a line starting with
  # CLAUDE_CODE_OAUTH_TOKEN= in anthropic.env.
  [claude]="/etc/5dive/connectors/anthropic.env:CLAUDE_CODE_OAUTH_TOKEN"
  # Plain "<path>" form: the file must exist and be non-empty.
  [codex]="/home/claude/.codex/auth.json"
  # Apr 2026 Anthropic policy change: third-party harnesses can no longer ride
  # the user's Claude Pro/Max subscription token (suspension risk). hermes and
  # openclaw both sign in via OpenAI's /codex/device flow now. hermes writes
  # ~/.hermes/auth.json; openclaw writes its agent-scoped auth-profiles.json
  # under the default agent id "main" (resolved by openclaw's resolveAgentDir).
  [hermes]="/home/claude/.hermes/auth.json"
  [openclaw]="/home/claude/.openclaw/agents/main/agent/auth-profiles.json"
  # antigravity tries the OS keyring first (via DBus secret-service) and
  # falls back to a file at ~/.gemini/antigravity-cli/antigravity-oauth-token
  # (mode 0600). Verified empirically against agy 1.0.1: after the device-
  # code flow completes (user pastes the Google OAuth callback code), the
  # binary writes the token-blob file with this exact name — no .json
  # extension, just the bare filename. Agent users run without a DBus
  # session, so the file path is always the live sentinel.
  [antigravity]="/home/claude/.gemini/antigravity-cli/antigravity-oauth-token"
  # grok writes ~/.grok/auth.json on successful `grok login --device-auth`.
  # Verified empirically — auth.json.lock pre-exists the actual auth.json
  # file (created on first device-auth attempt for the locking mechanism).
  [grok]="/home/claude/.grok/auth.json"
  # opencode is deliberately absent: it is the auth-optional type.
)
# Installer recipe per type. Run as `claude` user via `sudo -u claude -i bash -lc <recipe>`
# so $HOME/.nvm and PATH resolve correctly. Empty string => no automated installer
# (caller must hand-install). Idempotent: each recipe checks first.
declare -A TYPE_INSTALL=(
  # NOTE(review): claude, antigravity and grok gate on `command -v <bin>`
  # rather than the exact TYPE_BIN path. The codex note below explains how a
  # stray binary elsewhere on PATH can short-circuit such a check — confirm
  # that is acceptable for these three types.
  [claude]="command -v claude >/dev/null || curl -fsSL https://claude.ai/install.sh | bash"
  # Verify the EXACT TYPE_BIN path (not `command -v codex`): a stray
  # /usr/bin/codex from apt or a codex left over under a non-v24 nvm major
  # would short-circuit the install, leaving v24/bin/codex empty and
  # surfacing as "install reported success but bin missing". `nvm use 24`
  # forces npm install -g to land in v24's bin dir even when the default
  # alias has drifted.
  [codex]="[[ -x /home/claude/.nvm/versions/node/v24/bin/codex ]] || { . /home/claude/.nvm/nvm.sh && nvm use 24 >/dev/null && npm install -g @openai/codex; }"
  # opencode.ai's installer drops the binary at ~/.opencode/bin/opencode and
  # only adds it to PATH via .bashrc — but bash -lc skips .bashrc on
  # non-interactive shells, so neither the verify check below nor the agent
  # systemd unit (which uses TYPE_BIN's path directly) would find it.
  # Symlink into ~/.local/bin so TYPE_BIN[opencode] resolves on every box.
  [opencode]="[[ -x /home/claude/.local/bin/opencode ]] || { curl -fsSL https://opencode.ai/install | bash && mkdir -p /home/claude/.local/bin && ln -sf /home/claude/.opencode/bin/opencode /home/claude/.local/bin/opencode; }"
  # Both upstreams (hermes and openclaw) launch an interactive setup wizard
  # that opens /dev/tty after the binary lands. shelld runs us without a
  # controlling terminal, so the wizard's `exec </dev/tty` blows up with ENXIO
  # and the recipe exits non-zero even though install itself succeeded. Pass
  # the upstream opt-outs (--skip-setup / --no-onboard) to land at the binary
  # and stop.
  # openclaw also defaults to an npm install that drops the binary in
  # nvm's per-version bin dir, not ~/.local/bin — symlink it so TYPE_BIN
  # resolves on every box (same dance as opencode above).
  # hermes' upstream installer recreates /home/claude/.hermes at mode 0700,
  # overriding the 2770 from users.sh and blocking agent-* (claude-group)
  # users from traversing it to exec the venv binary — the unit then
  # crash-loops with `binary not installed`. chmod back to 0775 to match
  # the live perms of /home/claude/.opencode and .local/share/claude.
  [hermes]="[[ -x /home/claude/.local/bin/hermes ]] || { curl -fsSL https://hermes-agent.nousresearch.com/install.sh | bash -s -- --skip-setup && chmod 0775 /home/claude/.hermes; }"
  [openclaw]="[[ -x /home/claude/.local/bin/openclaw ]] || { curl -fsSL https://openclaw.ai/install.sh | bash -s -- --no-onboard && mkdir -p /home/claude/.local/bin && ln -sf \"\$(npm prefix -g)/bin/openclaw\" /home/claude/.local/bin/openclaw; }"
  # antigravity's installer drops the native-Go binary at ~/.local/bin/agy
  # and self-updates in the background on each run, so no daily-cron
  # equivalent of @google/gemini-cli's npm update is needed.
  [antigravity]="command -v agy >/dev/null || curl -fsSL https://antigravity.google/cli/install.sh | bash"
  # grok's installer drops the binary at ~/.grok/bin/grok but only creates the
  # ~/.local/bin/grok symlink *opportunistically* (its line 328 requires
  # ~/.local/bin already on PATH and ~/.grok/bin not on PATH). On a fresh VM
  # those conditions often don't hold, so it just appends ~/.grok/bin to
  # .bashrc and never makes the symlink TYPE_BIN expects — hence we create the
  # symlink ourselves here rather than trusting the installer. We also drop the
  # installer's ~/.local/bin/agent symlink so it can't shadow future tooling.
  # The binary self-updates on launch; no daily-cron entry needed.
  # NOTE(review): the steps after the installer are `;`-separated and the
  # recipe ends with `rm -f`, so its exit status looks like it is 0 even when
  # the installer itself failed — presumably the caller's post-install bin
  # check is what catches that; confirm.
  [grok]="command -v grok >/dev/null 2>&1 || curl -fsSL https://x.ai/cli/install.sh | bash; mkdir -p /home/claude/.local/bin; [ -e /home/claude/.grok/bin/grok ] && ln -sf /home/claude/.grok/bin/grok /home/claude/.local/bin/grok; rm -f /home/claude/.local/bin/agent"
)

# vercel-labs/skills CLI agent ID per 5dive type. `npx skills add --agent <id>`
# uses this to drop SKILL.md into the right per-type dir. openclaw isn't in
# the upstream registry — passing through its own name makes the CLI fall
# back to a generic project install at ./skills/<id>, which is what we want.
declare -A SKILLS_AGENT_ID=(
  [claude]=claude-code
  [codex]=codex
  [hermes]=hermes-agent
  # Passed through under its own name; not an upstream registry id.
  [openclaw]=openclaw
  [opencode]=opencode
  # `npx skills add --agent antigravity` is NOT in the upstream registry, but
  # the CLI silently falls back to a generic install path (.agents/skills/) —
  # which is exactly where agy itself reads from (see SKILLS_INSTALL_DIR below).
  # So passing it through works, even though it's an "unknown" agent id.
  # NOTE(review): the openclaw note in the header describes the unknown-id
  # fallback as ./skills/<id>, while this one says .agents/skills/ — the two
  # comments disagree on the fallback path; verify against the skills CLI.
  [antigravity]=antigravity
  [grok]=grok
)
# Where the skills CLI lands SKILL.md inside the agent user's $HOME, per type.
# Used for post-install verification, the cmd_skill_list dir-scan fallback,
# and cmd_skill_rm. Probed empirically against npx skills v0.x — if upstream
# changes a path, update here. Unknown types fall through to ".claude/skills"
# in the lookup sites below.
declare -A SKILLS_INSTALL_DIR=(
  # Values are paths relative to the agent user's $HOME. codex, opencode and
  # antigravity all share .agents/skills.
  [claude]=".claude/skills"
  [codex]=".agents/skills"
  [hermes]=".hermes/skills"
  [openclaw]="skills"
  [opencode]=".agents/skills"
  # agy reads skills from {workspace}/.agents/skills/{name}/SKILL.md — confirmed
  # by grepping the antigravity binary for the path constant. Earlier map said
  # .gemini/antigravity-cli/skills (matching its state dir), which was a guess
  # — wrong. Upstream npx skills fallback already lands at .agents/skills.
  [antigravity]=".agents/skills"
  [grok]=".grok/skills"
)

# api-key target per type: the env file (in /etc/5dive/connectors for the
# default profile) and the env var inside it. Claude-family is special-cased
# in cmd_auth_set — `sk-ant-oat01-*` tokens write CLAUDE_CODE_OAUTH_TOKEN,
# everything else is ANTHROPIC_API_KEY. Non-claude types use a single var
# that matches what their CLI reads natively.
declare -A TYPE_API_FILE=(
  [claude]="anthropic.env"
  # hermes and openclaw intentionally omitted: both now sign in via OpenAI's
  # /codex/device flow and store credentials in their own files (~/.hermes/
  # auth.json, ~/.openclaw/agents/main/agent/auth-profiles.json). The
  # anthropic.env path no longer feeds either CLI. cmd_auth_set already
  # fails gracefully when a type isn't in this map.
  # codex and opencode share openai.env (and the same OPENAI_API_KEY var).
  [codex]="openai.env"
  [opencode]="openai.env"
  [grok]="xai.env"
)
# Env var written inside the matching TYPE_API_FILE entry. Carries the same
# keys as TYPE_API_FILE — add or remove a type in both maps together.
declare -A TYPE_API_VAR=(
  [claude]="ANTHROPIC_API_KEY"
  [codex]="OPENAI_API_KEY"
  [opencode]="OPENAI_API_KEY"
  [grok]="XAI_API_KEY"
)

# BYO provider catalog for hermes/openclaw. The dashboard's new-agent
# wizard collects a canonical id (lowercase, vendor-style) from the user;
# this table maps it to the provider id each agent CLI's native registry
# recognizes plus a sensible default model so the agent's first launch
# doesn't sit at a "model not configured" prompt. Empty string in the
# native column means the type's registry doesn't have that vendor — the
# wizard hides that tile for that agent type.
#
# Native ids were verified empirically:
#   - hermes auth add <p> --type api-key --api-key <k>   (writes ~/.hermes/auth.json,
#       auto-resolves base_url from the in-tree provider catalog).
#   - openclaw writes auth-profiles.json with type:"api_key" entries; provider
#       ids must match openclaw's built-in provider registry (anthropic, openai,
#       google, deepseek, moonshot, openrouter all present).
#
# hermes-moonshot is a special case: its registry has a Kimi provider but no
# `hermes auth add moonshot` subcommand — the key is read from KIMI_API_KEY in
# ~/.hermes/.env at gateway startup (see .env.example upstream). _apply_byo_hermes
# branches on canonical=="moonshot" to take the env-var path instead of `auth add`,
# and cmd_create copies the value into agent-<name>'s own .env before the gateway
# is started. The HERMES_PROVIDER_ID value for moonshot ("kimi") is used as the
# argument to `hermes config set model.provider`, not as an `auth add` id.
declare -A HERMES_PROVIDER_ID=(
  # Canonical UI id -> hermes-native provider id. An empty value marks a
  # vendor hermes has no native provider for (openai here).
  [openai]=""
  [anthropic]="anthropic"
  [google]="gemini"
  [deepseek]="deepseek"
  # "kimi" feeds `hermes config set model.provider`, not `hermes auth add`.
  [moonshot]="kimi"
  [openrouter]="openrouter"
  [nous]="nous"
  [zai]="zai"
  [minimax]="minimax"
  [qwen]="alibaba"
  [huggingface]="huggingface"
)
declare -A OPENCLAW_PROVIDER_ID=(
  # Canonical UI id -> openclaw-native provider id. An empty value marks a
  # vendor openclaw has no native provider for (nous here).
  # NOTE(review): the verification note above this catalog only names
  # anthropic, openai, google, deepseek, moonshot and openrouter as checked
  # against openclaw's registry — zai/minimax/qwen/huggingface look unverified.
  [openai]="openai"
  [anthropic]="anthropic"
  [google]="google"
  [deepseek]="deepseek"
  [moonshot]="moonshot"
  [openrouter]="openrouter"
  [nous]=""
  [zai]="zai"
  [minimax]="minimax"
  [qwen]="qwen"
  [huggingface]="huggingface"
)
# Optional per-(type, canonical) default model. Missing entry => leave the
# agent's own default selection logic alone. Conservative defaults: pick
# the vendor's flagship general-purpose model that's likely to exist in
# the in-tree catalog. When an entry turns out to be wrong (model id
# renamed upstream), the user can override via `5dive agent <name> tui`
# and the agent CLI's own model picker.
declare -A HERMES_PROVIDER_MODEL=(
  # Keyed by canonical provider id. Providers without an entry (nous, zai,
  # minimax, qwen, huggingface) keep hermes' own default model selection.
  [anthropic]="claude-sonnet-4-5"
  [google]="gemini-2.0-flash"
  [deepseek]="deepseek-v4-pro"
  [moonshot]="kimi-k2-turbo-preview"
  [openrouter]="openrouter/auto"
)
declare -A OPENCLAW_PROVIDER_MODEL=(
  # Keyed by canonical provider id; every value here is written in
  # "<provider>/<model>" form. Providers without an entry keep openclaw's own
  # default model selection.
  [openai]="openai/gpt-4o"
  [anthropic]="anthropic/claude-sonnet-4-5"
  [google]="google/gemini-2.0-flash"
  [deepseek]="deepseek/deepseek-v4-pro"
  [moonshot]="moonshot/kimi-k2-instruct"
  [openrouter]="openrouter/auto"
)
declare -A BYO_PROVIDER_LABEL=(
  # Canonical provider id -> human-readable label. valid_byo_provider() treats
  # a non-empty label here as the definition of a known canonical id, so every
  # id used in the *_PROVIDER_ID maps needs an entry.
  [openai]="OpenAI"
  [anthropic]="Anthropic"
  [google]="Google AI"
  [deepseek]="DeepSeek"
  [moonshot]="Moonshot / Kimi"
  [openrouter]="OpenRouter"
  [nous]="Nous Portal"
  [zai]="Z.ai / GLM"
  [minimax]="MiniMax"
  [qwen]="Alibaba / Qwen"
  [huggingface]="Hugging Face"
)
# True when $1 is a canonical BYO provider id, i.e. it has a non-empty entry
# in BYO_PROVIDER_LABEL.
valid_byo_provider() {
  local label="${BYO_PROVIDER_LABEL[$1]:-}"
  [[ -n "$label" ]]
}

# --- Claude (Claude Code) harness BYO custom-provider catalog -----------------
# The claude harness can be pointed at any third-party provider that ships an
# Anthropic Messages-API-compatible endpoint by overriding ANTHROPIC_BASE_URL +
# ANTHROPIC_AUTH_TOKEN and the per-tier model ids (the modern Claude Code knobs
# ANTHROPIC_DEFAULT_{OPUS,SONNET,HAIKU}_MODEL — so whichever tier the agent
# selects, and the background haiku tasks, map to a model the provider actually
# serves instead of 404-ing on "claude-…"). Only providers with a documented
# anthropic-compat endpoint are listed; the rest of BYO_PROVIDER_LABEL is
# intentionally absent here (no compat path → would break the harness). Model
# ids drift upstream — operators can override per agent via the model picker, or
# we bump these. Values verified against vendor Claude-Code docs 2026-06-03.
# Anthropic-compatible endpoint per canonical provider id. This map is the
# one resolve_native_provider() consults to decide whether the claude harness
# supports a provider.
declare -A CLAUDE_PROVIDER_BASEURL=(
  [deepseek]="https://api.deepseek.com/anthropic"
  [moonshot]="https://api.moonshot.ai/anthropic"
  [zai]="https://api.z.ai/api/anthropic"
)
# Per-tier model ids (opus / sonnet / haiku). Each map carries the same
# provider keys as CLAUDE_PROVIDER_BASEURL — add a provider to all four.
declare -A CLAUDE_PROVIDER_OPUS_MODEL=(
  [deepseek]="deepseek-v4-pro"
  [moonshot]="kimi-k2.5"
  [zai]="glm-5.1"
)
declare -A CLAUDE_PROVIDER_SONNET_MODEL=(
  [deepseek]="deepseek-v4-pro"
  [moonshot]="kimi-k2.5"
  [zai]="glm-5-turbo"
)
declare -A CLAUDE_PROVIDER_HAIKU_MODEL=(
  [deepseek]="deepseek-v4-flash"
  [moonshot]="kimi-k2.5"
  [zai]="glm-4.5-air"
)

# Resolve a canonical UI provider id to the id the agent type's CLI uses
# natively.
# Arguments: $1 - agent type (hermes | openclaw | claude; anything else is
#                 treated as "no BYO support")
#            $2 - canonical provider id
# Outputs:   the native id on stdout, or an empty line when the type doesn't
#            support that vendor — the caller should then fail with a clear
#            error.
# Returns:   always 0. "Unsupported" is signalled by the empty result only, so
#            `native=$(resolve_native_provider ...)` never trips `set -e`
#            before the caller gets to report the problem.
resolve_native_provider() {
  local type="$1" canonical="$2" native=""
  case "$type" in
    hermes)   native="${HERMES_PROVIDER_ID[$canonical]:-}" ;;
    openclaw) native="${OPENCLAW_PROVIDER_ID[$canonical]:-}" ;;
    # claude maps a supported provider to itself (the env-var override path in
    # _apply_byo_claude keys off the canonical id, not a renamed native id).
    claude)
      if [[ -n "${CLAUDE_PROVIDER_BASEURL[$canonical]:-}" ]]; then
        native="$canonical"
      fi
      ;;
  esac
  echo "$native"
}

# Live auth probe: run "<cli> <args>" as user `claude` with a 5s wall-clock
# cap and see if exit==0. Empty string disables the probe for that type
# (fall back to sentinel-file presence). Args deliberately keep the prompt
# short — we care about "did the API accept our creds", not the response.
declare -A TYPE_PROBE=(
  # claude is the only type with a live probe configured; every other entry
  # is empty and therefore relies on the file-presence sentinel.
  [claude]='/home/claude/.local/bin/claude --print ping'
  # hermes/openclaw used to probe via `--print ping` against Anthropic; with the
  # OpenAI OAuth flow that argument shape no longer maps to a quick health check
  # we can rely on, so fall back to file-presence (auth_status_one returns "ok"
  # when no probe is configured and the credential file exists).
  [hermes]=''
  [openclaw]=''
  [codex]=''
  [opencode]=''
  # `agy --print ping` triggers a 30s OAuth wait when not authed and can't
  # tell stale-creds from rate-limit from a healthy box. File-presence is
  # the cheaper signal — fall through to TYPE_AUTH's sentinel.
  [antigravity]=''
  # `grok -p ping` would block on stdin via the inline UI; the `agent`
  # subcommand is meant for headless but takes longer to spin up than
  # we want for a 5s probe. Stick with file-presence.
  [grok]=''
)

# -------- exit codes & output helpers --------

# Distinct classes so the frontend can switch without grepping stderr. Keep
# err_class_for() in sync if you add a code: any code without its own arm
# there (including E_GENERIC) is reported as class "generic".
readonly E_OK=0
readonly E_GENERIC=1         # catch-all / internal
readonly E_USAGE=2           # unknown flag, missing arg, bad subcommand
readonly E_VALIDATION=3      # format check failed (name, workdir, token, lines)
readonly E_NOT_FOUND=4       # agent/type doesn't exist
readonly E_CONFLICT=5        # already exists
readonly E_AUTH_REQUIRED=6   # type not authenticated, bot token missing
readonly E_NOT_INSTALLED=7   # CLI binary missing, no installer recipe
readonly E_NOT_RUNNING=8     # tmux session / systemd unit not active
readonly E_PAIRING=9         # pair code not pending, invalid code
readonly E_PERMISSION=10     # must run as root
readonly E_TIMEOUT=11        # plugin didn't materialize within waitloop

# err_class_for <code>
# Print the error-class name for an E_* exit code. Anything that is not one of
# the known codes written in canonical decimal form (including 1, the
# catch-all) prints "generic".
err_class_for() {
  local code="$1"
  local canonical_re='^(0|[1-9][0-9]?)$'
  # Indexed by exit code; slot 1 is the catch-all class.
  local -a classes=(
    ok generic usage validation not_found conflict
    auth_required not_installed not_running pairing permission timeout
  )
  if [[ "$code" =~ $canonical_re ]] && (( code < ${#classes[@]} )); then
    echo "${classes[code]}"
  else
    echo generic
  fi
}

# Set to 1 by the global --json preparse in main(). When 1:
#   - fail() emits {ok:false,error:{...}} on stdout in addition to its plain
#     "error: ..." line on stderr
#   - ok()   emits {ok:true,data:{...}} on stdout instead of "OK — ..." prose
#   - step() still emits progress to stderr (stdout stays clean)
JSON_MODE=0

# fail <code> <message...>
# Terminates the process with exit status <code>; never returns.
#   - JSON mode: writes the {ok:false,error:{code,class,message}} envelope to
#     stdout, then the plain "error: ..." line to stderr (for logs).
#   - Text mode: writes only the "error: ..." line to stderr.
# All message arguments are joined with spaces into one message.
fail() {
  local code="$1" msg class
  shift
  msg="$*"
  if (( JSON_MODE )); then
    class=$(err_class_for "$code")
    jq -cn \
      --argjson c "$code" \
      --arg cl "$class" \
      --arg m "$msg" \
      '{ok:false, error:{code:$c, class:$cl, message:$m}}'
  fi
  printf 'error: %s\n' "$msg" >&2
  exit "$code"
}

# die <message...> — fail with the catch-all E_GENERIC code.
die() {
  fail "${E_GENERIC}" "$@"
}
# warn <message...> — non-fatal diagnostic on stderr.
warn() {
  printf 'warn: %s\n' "$*" >&2
}

# step <message...>
# Progress chatter, prefixed with "==> ". Written to stderr in every mode so
# JSON stdout stays parseable; in text mode the user still sees it interleaved
# at the terminal.
step() {
  printf '==> %s\n' "$*" >&2
}

# ok <prose-line> [jq-expr] [jq-args...]
# Success reporter; always returns 0.
#   Text mode: prints "OK — <prose-line>" on stdout, or nothing when
#              <prose-line> is empty.
#   JSON mode: prints {ok:true, data: <jq-expr>} on stdout. A missing or empty
#              <jq-expr> becomes `{}`. Remaining args are handed to jq as-is
#              (typically --arg NAME VALUE) so the expr can reference them.
#
# Example:
#   ok "agent '$name' started" '{name:$n, action:"start"}' --arg n "$name"
ok() {
  local prose="${1:-}" expr
  (( $# == 0 )) || shift

  if (( ! JSON_MODE )); then
    if [[ -n "$prose" ]]; then
      echo "OK — $prose"
    fi
    return 0
  fi

  expr="${1:-}"
  if (( $# > 0 )); then
    shift
  fi
  [[ -n "$expr" ]] || expr='{}'
  jq -cn "$@" "{ok:true, data: ($expr)}"
  return 0
}

# -------- helpers --------

# require_root [args...]
# Abort with E_PERMISSION unless running as uid 0. The arguments are only used
# to build the "try: sudo 5dive ..." hint in the error message.
require_root() {
  if (( EUID != 0 )); then
    fail "$E_PERMISSION" "must run as root (try: sudo 5dive $*)"
  fi
}

# True when $1 is a key of TYPE_BIN (even if its value were empty).
is_known_type() {
  local marker="${TYPE_BIN[$1]+x}"
  [[ "$marker" == x ]]
}

valid_name() {
  # 1-16 chars: a lowercase letter followed by up to 15 of [a-z0-9-]. The
  # Linux user is agent-<name>, so the full username stays inside the 32-char
  # limit.
  local name_re='^[a-z][a-z0-9-]{0,15}$'
  [[ "$1" =~ $name_re ]]
}

# True when $1 is exactly one of: none, telegram, discord.
valid_channel() {
  local channel_re='^(none|telegram|discord)$'
  [[ "$1" =~ $channel_re ]]
}

# True when $1 is exactly one of: admin, standard, sandboxed.
valid_isolation() {
  local isolation_re='^(admin|standard|sandboxed)$'
  [[ "$1" =~ $isolation_re ]]
}

# Accept only absolute paths made of [A-Za-z0-9._/-], with at least one
# character after the leading slash. The value lands in a bash-sourced env
# file (agents.d/<name>.env), so shell metacharacters, spaces and control
# chars are refused outright. Existence is deliberately not checked — the
# start script falls back to DEFAULT_WORKDIR with a warn if the path is
# missing at launch.
valid_workdir() {
  local workdir_re='^/[A-Za-z0-9._/-]+$'
  [[ "$1" =~ $workdir_re ]]
}

# Sender label embedded in inter-agent message envelopes: a lowercase letter
# followed by up to 31 of [a-z0-9-]. That covers agent names as well as the
# literals used for non-agent senders (human typing in a TTY, scheduled cron,
# dashboard).
valid_sender_label() {
  local label_re='^[a-z][a-z0-9-]{0,31}$'
  [[ "$1" =~ $label_re ]]
}

# 8-hex-char correlation id for inter-agent messages, built from 4 bytes of
# /dev/urandom. Stable enough to grep scrollback for the receiver's reply
# window; short enough to type into a follow-up `agent send`. Reading
# /dev/urandom keeps it process-id agnostic so two concurrent `agent send`
# calls can't collide.
gen_msg_id() {
  local hex
  hex=$(od -An -N4 -tx1 /dev/urandom 2>/dev/null) || return
  # od pads with spaces and ends with a newline; keep only the hex digits.
  hex="${hex//[[:space:]]/}"
  printf '%s' "${hex:0:8}"
}

# When --from is omitted, infer the sender from $SUDO_USER. Agent users follow
# the `agent-<label>` convention, so the label is whatever follows that
# prefix. Any other caller (a real human ssh-ing in as `claude`, a build bot,
# no sudo at all) yields an empty line — the caller then sends raw text with
# no envelope, preserving the pre-attribution shape.
auto_sender_from_sudo() {
  local sudo_user="${SUDO_USER:-}"
  case "$sudo_user" in
    agent-*) echo "${sudo_user#agent-}" ;;
    *)       echo "" ;;
  esac
}

# Telegram bot token shape: <bot-id>:<secret>, i.e. at least 5 digits, a
# colon, then at least 20 chars of [A-Za-z0-9_-]. Same regex the marketplace
# plugin validates against.
valid_telegram_token() {
  local token_re='^[0-9]{5,}:[A-Za-z0-9_-]{20,}$'
  [[ "$1" =~ $token_re ]]
}

# Telegram chat/user id: 1-20 digits with an optional leading minus (group and
# channel ids are negative). Bot API ids are 64-bit signed; the 20-char cap
# just fences absurd input.
valid_telegram_chat_id() {
  local chat_id_re='^-?[0-9]{1,20}$'
  [[ "$1" =~ $chat_id_re ]]
}

# Comma-separated list of telegram chat/user ids, e.g. "123,-1001234567890".
# No spaces — the API arg allowlist forbids them anyway, and we don't want to
# depend on shell IFS.
#
# Empty segments (a stray leading, trailing or doubled comma) are tolerated,
# but the list must contain at least one id, so "," on its own is rejected.
# The input is split on commas only: an embedded newline stays inside its
# segment and fails the per-id check instead of acting as a separator.
#
# Arguments: $1 - the list
# Returns:   0 when every non-empty segment is a valid chat id and at least
#            one such segment exists; 1 otherwise
valid_telegram_chat_id_list() {
  local list="$1" id found=0
  [[ -n "$list" ]] || return 1
  # The appended comma terminates the final segment so `read -d ,` yields it.
  while IFS= read -r -d ',' id; do
    [[ -n "$id" ]] || continue
    valid_telegram_chat_id "$id" || return 1
    found=1
  done < <(printf '%s,' "$list")
  (( found ))
}

# Auth profile name: a lowercase letter followed by up to 31 of [a-z0-9_-].
# The name becomes a file/dir name under /var/lib/5dive/auth-profiles and is
# also written as AGENT_AUTH_PROFILE into the systemd env file, so it has to
# stay filename-safe and short.
valid_profile_name() {
  local profile_re='^[a-z][a-z0-9_-]{0,31}$'
  [[ "$1" =~ $profile_re ]]
}

# API key sanity check: a run of at least 10 printable, non-space characters.
# Deliberately not pinned to one provider's format (Anthropic keys start with
# sk-ant-, OpenAI with sk-, others vary) — the live probe (if configured) is
# the real validation.
valid_api_key() {
  local key_re='^[[:graph:]]{10,}$'
  [[ "$1" =~ $key_re ]]
}

# Model identifier accepted by `agent config set model=`. We don't pin to a
# provider catalogue (codex/grok/gemini/claude all use different families that
# keep changing) — just a conservative charset that's safe to drop verbatim
# into a TOML "double-quoted" value or a JSON string without escaping: letters,
# digits, and ._:/-  (covers gpt-5.4, claude-opus-4-8, gemini-2.0-flash,
# provider/model forms). The CLI it feeds is the real validator.
valid_model() {
  [[ "$1" =~ ^[A-Za-z0-9._:/-]+$ ]]
}

# Short random id for non-TTY device-code sessions: 8 bytes of /dev/urandom
# rendered as 16 hex chars (64 bits) — plenty for a workflow that already
# requires root-on-host to poll.
gen_session_id() {
  od -An -N8 -tx1 /dev/urandom | tr -d ' \n'
}

# prompt_secret <label>
# Read a secret without echo when stdin is a terminal and print it on stdout
# (no trailing newline); the newline that the silent read swallowed is written
# to stderr. With no TTY on stdin (the HTTP/exec path) nothing is read and the
# function returns 1 so callers can error out with a useful message.
prompt_secret() {
  local label="$1" secret
  [[ -t 0 ]] || return 1
  read -r -s -p "$label: " secret
  echo >&2
  printf '%s' "$secret"
  return 0
}

# Inline connector writer — replaces the suid 5dive-write-connector helper.
# Reads the file body (var=value lines) from stdin and installs it as
# /etc/5dive/connectors/<fname> with mode 640 root:claude.
#
# The body is staged in a temp file next to the target (mktemp creates it
# mode 0600), given its final mode and owner, and only then renamed into
# place. The secret is therefore never readable under the default umask, a
# reader never sees a half-written file, and a failed chmod/chown leaves the
# previous file untouched instead of a wrongly-permissioned one.
#
# Arguments: $1 - connector file name; must match ^[a-zA-Z0-9_-]+\.env$
# Returns:   0 on success; 1 on an invalid name or any failed step
_write_connector() {
  local fname="$1"
  [[ "$fname" =~ ^[a-zA-Z0-9_-]+\.env$ ]] || { echo "invalid connector filename: $fname" >&2; return 1; }
  local path="${CONNECTORS_DIR}/${fname}" tmp
  # Same directory as the target so the final mv is a same-filesystem rename.
  # The temp name does not end in .env, so *.env consumers never pick it up.
  tmp=$(mktemp "${path}.XXXXXX") || return 1
  if cat > "$tmp" \
    && chmod 640 "$tmp" \
    && chown root:claude "$tmp" \
    && mv -f -- "$tmp" "$path"; then
    return 0
  fi
  rm -f -- "$tmp"
  return 1
}

# Write /etc/5dive/connectors/<kind>-<name>.env with correct perms.
# The file holds the single line <var>=<value>. Because it is consumed as a
# line-oriented env file, a line break inside <var> or <value> would smuggle
# extra assignments into it — such input is refused before anything is
# written.
# Arguments: $1 - channel kind, $2 - agent name, $3 - env var name,
#            $4 - secret value
# Returns:   0 on success; 1 on a line break in var/value; otherwise the
#            status of _write_connector
write_channel_secret() {
  local kind="$1" name="$2" var="$3" value="$4"
  local fname="${kind}-${name}.env"
  if [[ "$var" == *$'\n'* || "$var" == *$'\r'* \
     || "$value" == *$'\n'* || "$value" == *$'\r'* ]]; then
    echo "refusing to write ${fname}: variable name or value contains a line break" >&2
    return 1
  fi
  printf '%s=%s\n' "$var" "$value" | _write_connector "$fname"
}

# remove_channel_secret <kind> <name>
# Delete ${CONNECTORS_DIR}/<kind>-<name>.env; a file that is already absent is
# not an error (rm -f).
remove_channel_secret() {
  local kind="$1" name="$2" target
  target="${CONNECTORS_DIR}/${kind}-${name}.env"
  rm -f "$target"
}

# Standalone /usr/local/lib/5dive/ hook paths used to be preseeded into
# new-agent settings.json. As of plugin v0.4.4 every lifecycle hook
# (PreToolUse, PostToolUse, Stop, StopFailure) ships inside
# telegram@5dive-plugins/hooks/hooks.json, so new fork agents no longer
# wire any of them — preseeding would double-fire. The standalone files
# stay installed by scripts/install/agent-cli.sh + scripts/update.sh for
# existing-agent backward compatibility (settings.json on pre-fork
# agents still references them); update.sh's on_upstream_telegram() also
# strips them from fork agents that were provisioned before this change.
# Inter-agent group mirror is fully sender-side now: every `5dive agent
# send|ask` posts "@<receiver> <body>" to the SENDER's group via the sender's
# bot (see mirror_interagent_outbound in cmd_agent.sh). Both halves of an
# exchange — A's outbound question and B's outbound reply — therefore show up
# under the correct sender's identity. The previous receiver-side hooks
# (userprompt-mirror-inter-agent.sh for the inbound, stop-mirror-inter-agent.sh
# for the reply) are retired no-ops, kept on disk only so existing agents'
# settings.json don't error on a missing command. New-agent settings.json
# below no longer wires either of them.
# Directory under /usr/local/lib/5dive holding skill sources. The preseed and
# channel installers in this file copy notify-user/SKILL.md from here into
# agent homes when the file is present.
AGENT_SKILLS_DIR="/usr/local/lib/5dive/skills"
# CLAUDE.md fragment dropped into the per-agent $HOME/.claude/ when the
# agent is created with --channels=telegram. Carries the per-turn reply
# mandate + AskUserQuestion/ExitPlanMode warning — guidance that only
# applies to telegram-paired agents and used to live in the shared
# projects-level CLAUDE.md, polluting every non-telegram agent's prompt.
TELEGRAM_AGENT_CLAUDE_MD="/usr/local/lib/5dive/telegram-agent-CLAUDE.md"

# Preseed a claude-family agent's home dir so:
#   - 'claude --dangerously-skip-permissions' doesn't hit the first-run
#     theme picker / trust dialog / project-onboarding prompts
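#   - channels=telegram agents get the telegram@5dive-plugins plugin enabled;
#     the plugin's bundled hooks.json carries the StopFailure hook that pings
#     the paired chat on rate limits (no hook is preseeded into settings.json)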
# Written per-agent-user — the agent user cannot read the shared
# /home/claude/.claude/settings.json (mode 0600), so 5dive-agent-start.sh
# unsets CLAUDE_CONFIG_DIR before launching claude, making $HOME/.claude
# (i.e. the preseed below) the effective config dir.
preseed_claude_agent() {
  local name="$1" channels="$2"
  local user="agent-${name}" home="/home/agent-${name}"
  local claude_dir="$home/.claude"
  [[ -d "$home" ]] || fail "$E_GENERIC" "agent home missing: $home"

  sudo -u "$user" mkdir -p "$claude_dir"

  # ~/.claude.json pre-answers the first-run prompts: dark theme, onboarding
  # completed, and /home/claude/projects already trusted + project-onboarded.
  # The delimiter is quoted because the document contains nothing to expand.
  sudo -u "$user" tee "$home/.claude.json" >/dev/null <<'JSON'
{
  "theme": "dark",
  "hasCompletedOnboarding": true,
  "projects": {
    "/home/claude/projects": {
      "hasTrustDialogAccepted": true,
      "hasCompletedProjectOnboarding": true
    }
  }
}
JSON
  chmod 600 "$home/.claude.json"

  # ~/.claude/settings.json is assembled with jq in three layers:
  #
  #   1. Base object: bypassPermissions, the two plugin marketplaces, and an
  #      explicit allow for 5dive-transcribe. permissions.allow short-circuits
  #      BEFORE the channels/telegram plugin's claude/channel/permission relay
  #      (anthropics/claude-cli-internal#23061), so explicit allows skip the
  #      "🔐 Permission: Bash" Telegram prompt that bypassPermissions alone
  #      doesn't suppress under channel-relay mode. The entry covers the
  #      voice-pack flow and is harmless when voice isn't installed.
  #
  #   2. statusLine, only when the shared /usr/local/lib/5dive/statusline.sh is
  #      executable (mode 0755, installed by scripts/install/apps.sh and
  #      refreshed by update.sh). The main user's own statusline.sh is mode
  #      0600 and unreachable from agent-<name> UIDs. The shared script also
  #      tees its input JSON to $HOME/.claude/statusline-last.json so the
  #      telegram plugin's /status command can read live 5h/7d rate-limit
  #      usage. When the file is missing (host predates the shared-copy
  #      rollout) the key is omitted and claude uses its built-in statusline.
  #
  #   3. enabledPlugins for the chosen channel (telegram | discord); any other
  #      value of $channels enables no plugin.
  local status_line_json='{}'
  if [[ -x /usr/local/lib/5dive/statusline.sh ]]; then
    status_line_json=$(jq -cn '{statusLine: {type: "command", command: "bash /usr/local/lib/5dive/statusline.sh"}}')
  fi

  local settings
  settings=$(jq -n --argjson status_line "$status_line_json" '{
    model: "opus",
    permissions: {
      defaultMode: "bypassPermissions",
      allow: ["Bash(5dive-transcribe:*)"]
    },
    skipDangerousModePermissionPrompt: true,
    autoDreamEnabled: true,
    extraKnownMarketplaces: {
      "claude-plugins-official": {
        source: {source: "github", repo: "anthropics/claude-plugins-official"}
      },
      "5dive-plugins": {
        source: {source: "github", repo: "5dive-com/5dive-plugins"}
      }
    }
  } + $status_line')

  # 5dive-plugins/telegram (our fork) bundles every lifecycle hook —
  # PreToolUse, PostToolUse, Stop, and (as of plugin v0.4.4) StopFailure — in
  # its own hooks.json, so no standalone /usr/local/lib/5dive/ hook is wired
  # into settings.json here; doing so would double-fire on the same event.
  # Discord agents just get the upstream plugin enabled.
  local plugin_id=""
  case "$channels" in
    telegram) plugin_id="telegram@5dive-plugins" ;;
    discord)  plugin_id="discord@claude-plugins-official" ;;
  esac
  if [[ -n "$plugin_id" ]]; then
    settings=$(jq --arg plugin "$plugin_id" '. + {enabledPlugins: {($plugin): true}}' <<<"$settings")
  fi

  printf '%s\n' "$settings" | sudo -u "$user" tee "$claude_dir/settings.json" >/dev/null
  chmod 600 "$claude_dir/settings.json"

  if [[ "$channels" == "telegram" ]]; then
    # notify-user skill: teaches claude how to ping the paired chat with
    # progress/completion/option-prompt messages. Skipped when the source
    # SKILL.md isn't installed on this host.
    local notify_skill="$AGENT_SKILLS_DIR/notify-user/SKILL.md"
    if [[ -f "$notify_skill" ]]; then
      sudo -u "$user" mkdir -p "$claude_dir/skills/notify-user"
      sudo -u "$user" cp "$notify_skill" "$claude_dir/skills/notify-user/SKILL.md"
    fi

    # Per-agent CLAUDE.md fragment (reply mandate + AskUserQuestion/
    # ExitPlanMode warning), landing at $HOME/.claude/CLAUDE.md where claude
    # reads it on session start alongside the shared projects-level CLAUDE.md.
    # Best-effort: warn instead of failing when the installer hasn't placed
    # the source file, since the agent boots fine without it.
    if [[ -f "$TELEGRAM_AGENT_CLAUDE_MD" ]]; then
      sudo -u "$user" cp "$TELEGRAM_AGENT_CLAUDE_MD" "$claude_dir/CLAUDE.md"
      chmod 644 "$claude_dir/CLAUDE.md"
    else
      warn "$TELEGRAM_AGENT_CLAUDE_MD missing — per-agent telegram CLAUDE.md not wired (run: curl -fsSL https://raw.githubusercontent.com/5dive-com/5dive/main/install.sh | sudo bash)"
    fi
  fi

  # Default skills, best-effort: if npx isn't reachable yet (cold box, no
  # network) the agent still boots; users can re-trigger from the dashboard's
  # Skills block.
  #   find-skills — search skills.sh and self-install additional skills on demand
  #   5dive-cli   — spawn sub-agents on this VM via the local 5dive CLI
  install_default_skill_for_agent "$name" claude vercel-labs/skills find-skills || true
  install_default_skill_for_agent "$name" claude 5dive-com/skills 5dive-cli || true
}

# Preseed default skills for an antigravity agent. Unlike claude/codex/grok,
# antigravity has no plugin marketplace and no telegram channel installer
# (install_channel_for_agent doesn't route antigravity), so the seed step
# that lives inside channel installers for codex/grok has nowhere to land.
# This runs unconditionally from cmd_create so antigravity gets the same
# find-skills + 5dive-cli inheritance every other type gets. Skills land at
# $HOME/.agents/skills/ (per SKILLS_INSTALL_DIR + agy's own loader path).
preseed_antigravity_agent() {
  local name="$1"
  local home="/home/agent-${name}"
  local spec
  [[ -d "$home" ]] || fail "$E_GENERIC" "agent home missing: $home"
  # Each entry is "<source repo> <skill>". Installs are best-effort: a failed
  # install is ignored so agent creation still proceeds.
  for spec in "vercel-labs/skills find-skills" "5dive-com/skills 5dive-cli"; do
    install_default_skill_for_agent "$name" antigravity "${spec% *}" "${spec#* }" || true
  done
}

# Types that `npx skills add --agent <id>` doesn't recognize. The upstream
# vercel-labs/skills CLI gates --agent against a hardcoded registry; passing
# an unknown id fails with "Invalid agents: <id>" (verified for grok 0.x).
# We fall back to a manual git-clone + cp for these, landing the skill at
# $HOME/${SKILLS_INSTALL_DIR[$type]}/<skill> just like the upstream installer
# would. Add new types here when upstream rejects them.
_skill_needs_manual_install() {
  local agent_type="$1"
  # grok is the only type currently routed through the manual installer.
  if [[ "$agent_type" == "grok" ]]; then
    return 0
  fi
  return 1
}

# Install one skill into an agent user's per-type skills dir. Routes through
# `npx skills add --agent <id>` for upstream-supported types, and falls back
# to a direct git-clone + cp -r for types upstream rejects (see
# _skill_needs_manual_install). Idempotent: skips if the target dir already
# exists. Returns non-zero on any failure so callers can decide whether to
# fail loudly or warn.
install_default_skill_for_agent() {
  # Arguments: $1 agent name, $2 agent type (key into SKILLS_AGENT_ID /
  #            SKILLS_INSTALL_DIR), $3 GitHub "owner/repo" source, $4 skill name.
  # Returns:   0 when the skill dir already exists or the install succeeded;
  #            1 when the agent home/user is missing or the install failed
  #            (a warning is emitted for install failures).
  local name="$1" type="$2" source="$3" skill="$4"
  local user="agent-${name}" home="/home/agent-${name}"
  local agent_id="${SKILLS_AGENT_ID[$type]:-claude-code}"
  local install_dir="${SKILLS_INSTALL_DIR[$type]:-.claude/skills}"
  [[ -d "$home" ]] || return 1
  id -u "$user" &>/dev/null || return 1
  # Idempotency: an existing target directory counts as "already installed".
  if sudo -u "$user" test -d "$home/$install_dir/$skill"; then
    return 0
  fi
  if _skill_needs_manual_install "$type"; then
    # Manual path: shallow-clone the source repo and copy the skill directory.
    # The script runs without `set -e`, so every step that can fail is checked
    # explicitly — otherwise a failed mkdir/cp would still print
    # "manual-installed" and exit 0. A partial copy is removed so the
    # idempotency check above does not mistake it for a completed install.
    sudo -u "$user" -H env SOURCE="$source" SKILL="$skill" INSTALL_DIR="$install_dir" bash -s >&2 <<'MANUAL_SKILL' \
      || { warn "default skill '$skill' install failed for agent '$name' (continuing)"; return 1; }
set -uo pipefail
unset CLAUDE_CONFIG_DIR
cd "$HOME" || exit 1
# Named WORK_DIR rather than TMPDIR so the environment variable that mktemp
# and child processes consult for their own temp files is left alone.
WORK_DIR=$(mktemp -d -t skill-XXXXXX) || exit 1
trap 'rm -rf "$WORK_DIR"' EXIT
if ! timeout 60 git clone --depth=1 "https://github.com/$SOURCE.git" "$WORK_DIR/repo" >/dev/null 2>&1; then
  echo "ERROR: git clone of $SOURCE failed" >&2
  exit 1
fi
SRC_DIR=""
for d in "$WORK_DIR/repo/$SKILL" "$WORK_DIR/repo/skills/$SKILL"; do
  if [ -f "$d/SKILL.md" ]; then SRC_DIR="$d"; break; fi
done
[ -n "$SRC_DIR" ] || { echo "ERROR: skill '$SKILL' not found in $SOURCE (looked at top-level and skills/)" >&2; exit 1; }
DEST="$HOME/$INSTALL_DIR/$SKILL"
mkdir -p "$HOME/$INSTALL_DIR" || exit 1
if ! cp -r "$SRC_DIR" "$DEST"; then
  # Only clean up when SKILL is non-empty, so DEST can never be the skills root.
  [ -n "$SKILL" ] && rm -rf "$DEST"
  echo "ERROR: copying skill '$SKILL' into $HOME/$INSTALL_DIR failed" >&2
  exit 1
fi
echo "manual-installed $SKILL → $HOME/$INSTALL_DIR/$SKILL"
MANUAL_SKILL
    return 0
  fi
  # Upstream path: `npx skills add`. pipefail makes npx's exit status survive
  # the `| tail -15` that bounds the output.
  sudo -u "$user" -H env SOURCE="$source" SKILL="$skill" AGENT_ID="$agent_id" bash -s >&2 <<'DEFAULT_SKILL' \
    || { warn "default skill '$skill' install failed for agent '$name' (continuing)"; return 1; }
set -uo pipefail
unset CLAUDE_CONFIG_DIR
export NVM_DIR="/home/claude/.nvm"
# shellcheck disable=SC1091
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
export PATH="/home/claude/.local/bin:$PATH"
cd "$HOME"
timeout 180 npx -y skills add "https://github.com/$SOURCE" --skill "$SKILL" --agent "$AGENT_ID" --yes 2>&1 | tail -15
DEFAULT_SKILL
}

# install_channel_plugin_for_agent (defined below, after ensure_bun_for_agent):
# Install a channel plugin (telegram from 5dive-plugins, anything else from
# claude-plugins-official) into the agent user's $HOME/.claude/plugins so the
# bun server can start first-try. Mirrors the channel-user setup in
# scripts/install/services.sh — registers the marketplace, installs the
# plugin, then npm-installs its deps and patches the start script (bun install
# stalls on a fresh box). Each agent user has its own ~/.claude: the installer
# runs with CLAUDE_CONFIG_DIR unset (non-login shell), and at runtime
# 5dive-agent-start.sh also unsets it so the preseed is what claude reads.
# <plugin> is the plugin slug (telegram | discord).
#
# ensure_bun_for_agent — DIVE-265: self-heal the bun dependency. Managed boxes
# get bun from provisioning (root symlink at /usr/local/bin/bun); a fresh OSS
# host has nothing until `doctor --repair`, so `5dive init` used to die at its
# first telegram agent (caught by lodar testing init, 2026-06-11). Install
# system-wide via BUN_INSTALL=/usr/local — root-owned, world-exec, visible to
# every agent user with zero PATH/profile wiring and no /home/claude traversal
# concerns. Idempotent: returns immediately when the user already sees bun.
ensure_bun_for_agent() {
  local user="$1"
  # Probe through a login shell (sudo -i + bash -l) as the agent user.
  local -a probe=(sudo -u "$user" -i bash -lc 'command -v bun')

  if "${probe[@]}" >/dev/null 2>&1; then
    return 0
  fi

  step "bun missing for $user — installing system-wide (/usr/local/bin/bun)"
  # Installer failures are deliberately ignored here; the re-probe below is
  # what decides the return status.
  curl -fsSL https://bun.sh/install | BUN_INSTALL=/usr/local bash >&2 || true
  "${probe[@]}" >/dev/null 2>&1
}

install_channel_plugin_for_agent() {
  # Arguments: $1 plugin slug (telegram | discord), $2 agent name,
  #            $3 optional CSV of telegram user ids to pre-seed into access.json.
  # Fails (via `fail`) when the agent user is missing, bun cannot be made
  # available, or the per-user install script exits non-zero.
  local plugin="$1" name="$2" allowed_users="${3:-}"
  local user="agent-${name}"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"

  # Runtime precheck: the plugin is started with `bun server.ts`, so without
  # bun on the agent user's PATH the service would spin up, crash, and be
  # restarted by systemd — visible only in journalctl. Fail fast with a
  # message the frontend can show instead. Mirror 5dive-agent-start.sh by
  # sourcing nvm + shared profile the way the runtime shell does.
  if ! ensure_bun_for_agent "$user"; then
    fail "$E_NOT_INSTALLED" \
      "bun unavailable for $user (required by $plugin plugin) and automatic install failed. Check network access to bun.sh, or install bun to /usr/local/bin manually, then retry."
  fi

  # Pick the marketplace per plugin. Telegram lives on our 5dive-plugins
  # fork (bundled hooks + richer commands); discord stays upstream until
  # we fork it too. Other plugins default to upstream.
  local marketplace="claude-plugins-official"
  # Full HTTPS URL — `claude plugin marketplace add` resolves the
  # GitHub shorthand `owner/repo` to git@github.com (SSH) on at least
  # some claude versions, which fails on agent-<name> users with no
  # SSH key configured. Explicit https URL sidesteps the shorthand
  # resolver entirely.
  local mkt_repo="https://github.com/anthropics/claude-plugins-official.git"
  if [[ "$plugin" == "telegram" ]]; then
    marketplace="5dive-plugins"
    mkt_repo="https://github.com/5dive-com/5dive-plugins.git"
  fi

  step "Installing $plugin plugin for $user (from $marketplace)"
  # Deliberately NOT a login shell: /etc/profile.d/5dive-shared-configs.sh
  # exports CLAUDE_CONFIG_DIR=/home/claude/.claude, which would cause
  # `claude plugin install` to land the plugin in the wrong home. Mirror
  # 5dive-agent-start.sh and source nvm manually instead.
  #
  # set -e + pipefail: without pipefail, `npm install | tail -5` masked
  # npm's exit code with tail's zero-exit and the agent would ship with
  # half-installed deps; surface the failure so the frontend can show it.
  if ! sudo -u "$user" -H env PLUGIN="$plugin" MARKETPLACE="$marketplace" MKT_REPO="$mkt_repo" bash -s >&2 <<'AGENT_PLUGIN_INSTALL'
set -euo pipefail
unset CLAUDE_CONFIG_DIR
export NVM_DIR="/home/claude/.nvm"
# shellcheck disable=SC1091
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
export PATH="/home/claude/.local/bin:$PATH"
CLAUDE=/home/claude/.local/bin/claude

# `claude plugin marketplace add` crashes headless for a user that has never
# run a claude session (ERR_STREAM_PREMATURE_CLOSE before git even spawns —
# verified on CC 2.1.169 and 2.1.170, DIVE-248). `marketplace update` works
# headless but needs the marketplace registered. So pre-register it ourselves
# (clone + known_marketplaces.json entry) and let `update` take it from there;
# `add` stays only as the fallback for an unforeseen registration shape.
MKT_DIR="$HOME/.claude/plugins/marketplaces/$MARKETPLACE"
if [ ! -d "$MKT_DIR/.git" ]; then
  mkdir -p "$HOME/.claude/plugins/marketplaces"
  rm -rf "$MKT_DIR"
  git clone -q --depth 1 "$MKT_REPO" "$MKT_DIR"
fi
MKT_SLUG=$(printf '%s' "$MKT_REPO" | sed -e 's#^https://github.com/##' -e 's#\.git$##')
KM_FILE="$HOME/.claude/plugins/known_marketplaces.json" \
  MKT_NAME="$MARKETPLACE" MKT_SLUG="$MKT_SLUG" MKT_DIR="$MKT_DIR" python3 <<'PREREG'
import json, os, datetime
km = os.environ["KM_FILE"]
d = {}
if os.path.exists(km):
    try:
        with open(km) as f:
            d = json.load(f)
    except Exception:
        d = {}
d.setdefault(os.environ["MKT_NAME"], {
    "source": {"source": "github", "repo": os.environ["MKT_SLUG"]},
    "installLocation": os.environ["MKT_DIR"],
    # Timezone-aware "now": datetime.utcnow() is deprecated (Python 3.12+) and
    # would print a DeprecationWarning into the install output.
    "lastUpdated": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
})
with open(km, "w") as f:
    json.dump(d, f, indent=2)
PREREG

"$CLAUDE" plugin marketplace update "$MARKETPLACE" 2>/dev/null \
  || "$CLAUDE" plugin marketplace add "$MKT_REPO"

yes | "$CLAUDE" plugin install "${PLUGIN}@${MARKETPLACE}" >/dev/null || true

PLUGIN_DIR=$(PLUGIN="$PLUGIN" MARKETPLACE="$MARKETPLACE" python3 -c '
import json, os
try:
    plugin = os.environ["PLUGIN"]
    marketplace = os.environ["MARKETPLACE"]
    d = json.load(open(os.path.expanduser("~/.claude/plugins/installed_plugins.json")))
    p = d.get("plugins", {}).get(f"{plugin}@{marketplace}", [])
    print(p[0]["installPath"] if p else "")
except Exception:
    print("")
' 2>/dev/null) || PLUGIN_DIR=""
if [ -z "$PLUGIN_DIR" ]; then
  # Fallback: first package.json under the plugin cache. `-print -quit` stops
  # at the first hit (no `head`, so no SIGPIPE), and `|| true` keeps set -e +
  # pipefail from aborting when the cache dir does not exist — the explicit
  # "plugin dir not found" diagnostic below must stay reachable.
  PLUGIN_DIR=$(find "$HOME/.claude/plugins/cache" -name package.json -path "*/${PLUGIN}/*" -print -quit 2>/dev/null | xargs -r dirname) || true
fi

if [ -z "$PLUGIN_DIR" ] || [ ! -d "$PLUGIN_DIR" ]; then
  echo "    ERROR: ${PLUGIN} plugin dir not found after install" >&2
  exit 1
fi

echo "    Plugin dir: $PLUGIN_DIR"
cd "$PLUGIN_DIR"
# tail keeps output bounded; pipefail above carries npm's real exit code.
timeout 60 npm install --omit=dev --ignore-scripts --no-audit --no-fund 2>&1 | tail -5

python3 <<'PATCHPY'
import json
with open("package.json") as f:
    d = json.load(f)
start = d.get("scripts", {}).get("start", "")
if "bun install" in start:
    d["scripts"]["start"] = "bun server.ts"
    with open("package.json", "w") as f:
        json.dump(d, f, indent=2)
    print("    Patched start script: removed bun install")
else:
    print("    Start script already clean: " + start)
PATCHPY
AGENT_PLUGIN_INSTALL
  then
    fail "$E_GENERIC" \
      "$plugin plugin install failed for agent '$name' (see journalctl / stderr above). Run: sudo 5dive doctor"
  fi

  # Pre-seed access.json with the operator's user id so the agent is usable on
  # the first DM. The plugin only writes access.json lazily — on the first
  # inbound message — so without this the wizard's post-create `agent pair
  # --user-id` could race the plugin: legacy pair waited up to 90s for the
  # file to exist, but if no second DM arrives the file never materializes
  # and pair times out. Pre-seeding here (before systemctl enable --now in
  # cmd_create) means the plugin reads our allowFrom on first message and
  # the queued DM goes straight to claude. The welcome DM stays in cmd_pair
  # so the dashboard's post-create `agent pair --user-id` is the single
  # welcome-delivery point on both old and new CLIs.
  if [[ "$plugin" == "telegram" && -n "$allowed_users" ]]; then
    seed_telegram_access_allowlist "$name" "$allowed_users"
  fi
}

# Write ~/.claude/channels/telegram/access.json for agent-<name> with allowFrom
# seeded from a CSV of user ids. Idempotent — merges into an existing file
# rather than clobbering, so re-running on an already-paired agent only adds
# new ids. Doesn't drop approved/<id> markers — that file is the trigger for
# the plugin's checkApprovals "Paired! Say hi to Claude." DM, and a fresh
# create that's about to be followed by `agent pair --user-id` already gets
# that path through the pair call. Doubling up here would duplicate the
# confirmation message.
seed_telegram_access_allowlist() {
  local name="$1" allowed_users="$2"
  local user="agent-${name}"
  local state_dir="/home/${user}/.claude/channels/telegram"

  step "Pre-seeding telegram allowlist for $user (${allowed_users})"
  # The merge runs as the agent user. STATE and CSV travel through the
  # environment; the Python program itself arrives on stdin.
  sudo -u "$user" env CSV="$allowed_users" STATE="$state_dir" python3 - >&2 <<'PY' \
    || fail "$E_GENERIC" "telegram access.json pre-seed failed for agent '$name'"
import json, os, tempfile

state_dir = os.environ['STATE']
# Blank CSV entries and surrounding whitespace are dropped.
wanted = [part.strip() for part in os.environ['CSV'].split(',') if part.strip()]

os.makedirs(state_dir, mode=0o700, exist_ok=True)
access_path = os.path.join(state_dir, 'access.json')

try:
    with open(access_path) as fh:
        access = json.load(fh)
except FileNotFoundError:
    # No file yet: start from an empty pairing-policy document.
    access = {"dmPolicy": "pairing", "allowFrom": [], "groups": {}, "pending": {}}

# Append only ids that are not already listed; existing entries keep their order.
allow = list(access.get('allowFrom') or [])
for user_id in wanted:
    if user_id in allow:
        continue
    allow.append(user_id)
access['allowFrom'] = allow

# Write to a sibling temp file, then rename over access.json.
fd, tmp_path = tempfile.mkstemp(dir=state_dir, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as fh:
    json.dump(access, fh, indent=2)
os.replace(tmp_path, access_path)
print(f"Seeded allowFrom={allow} into {access_path}")
PY
}

# Back-compat shim so callers that still reference the telegram-specific name
# keep working. New code should call install_channel_plugin_for_agent directly.
install_telegram_plugin_for_agent() {
  local name="$1"
  # Only the agent name is forwarded; the plugin slug is fixed to telegram.
  install_channel_plugin_for_agent telegram "$name"
}

# Register a chat channel with openclaw's gateway for an agent user. openclaw
# stores the credential under $HOME/.openclaw/...; we just shell out to its
# native CLI (`openclaw channels add`) so future openclaw versions keep
# control of the on-disk schema. Token is passed via --token-file pointing
# at a stable secrets/<plugin>-bot-token file (mode 600), since openclaw
# re-reads the path every time the gateway restarts — a tmpfile would leave
# the gateway with a dangling reference. Pair-code roundtrips don't apply —
# openclaw does inbound DM approvals through its `pairing` subcommand
# instead, which the dashboard can wire up later. <plugin> is telegram |
# discord.
install_channel_for_openclaw_agent() {
  # Arguments: $1 plugin (telegram | discord), $2 agent name, $3 bot token,
  #            $4 home_channel (accepted but not referenced in this function),
  #            $5 allowed_users CSV (only consulted when plugin is telegram).
  # Fails (via `fail`) on a missing agent user, an empty token, an invalid
  # allowed_users list, or a non-zero exit from the per-user script below.
  local plugin="$1" name="$2" token="$3" home_channel="${4:-}" allowed_users="${5:-}"
  local user="agent-${name}"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"
  [[ -n "$token" ]] || fail "$E_VALIDATION" "openclaw $plugin channel requires a bot token"

  # Pre-seed the allowlist + command-owner list from the dashboard wizard's
  # --telegram-allowed-users so the agent is usable immediately on first DM.
  # Without this, openclaw's default `dmPolicy: "pairing"` makes the bot
  # demand `openclaw pairing approve telegram <code>` from the user before
  # any message reaches the agent — which is what `5dive agent pair` solves
  # for claude agents but doesn't apply to openclaw (its native `pairing`
  # subcommand operates on codes the user has already DM'd in, so it can't
  # pre-authorize). The dashboard already collects the user's numeric id
  # via @userinfobot, so we just write it into the right two slots:
  #   channels.<plugin>.allowFrom  → bypasses the pairing reply gate
  #   commands.ownerAllowFrom      → grants command-owner status
  # Same JSON state `openclaw pairing approve` ends up in, just without the
  # roundtrip. Only telegram is wired up — discord uses different shapes.
  #
  # Both variables stay empty unless a telegram allowlist was supplied; the
  # per-user script skips its `config patch` step when ALLOW_FROM_JSON is empty.
  local allow_from_json="" owner_allow_from_json=""
  if [[ "$plugin" == "telegram" && -n "$allowed_users" ]]; then
    valid_telegram_chat_id_list "$allowed_users" \
      || fail "$E_VALIDATION" "invalid allowed_users (comma-separated numeric ids)"
    # JSON array of the ids as strings, empty CSV fields dropped.
    allow_from_json=$(jq -cn --arg csv "$allowed_users" \
      '$csv | split(",") | map(select(length>0))')
    # Same ids, each prefixed with "telegram:".
    owner_allow_from_json=$(jq -cn --arg csv "$allowed_users" \
      '$csv | split(",") | map(select(length>0) | "telegram:" + .)')
  fi

  step "Registering $plugin channel with openclaw for $user"
  # The script below runs as the agent user with -H, so $HOME inside it is the
  # agent's home; its output is sent to stderr. Inputs arrive as env vars.
  if ! sudo -u "$user" -H env \
      PLUGIN="$plugin" \
      TOKEN="$token" \
      ALLOW_FROM_JSON="$allow_from_json" \
      OWNER_ALLOW_FROM_JSON="$owner_allow_from_json" \
      bash -s >&2 <<'OPENCLAW_CHANNEL'
set -euo pipefail
export NVM_DIR="/home/claude/.nvm"
# shellcheck disable=SC1091
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
export PATH="/home/claude/.local/bin:$PATH"
OPENCLAW=/home/claude/.local/bin/openclaw

# `openclaw channels add --token-file` records the path inside openclaw.json
# and re-reads it whenever the gateway boots — so the token file has to
# survive past this script. Use a stable, mode-700 dir under the agent's
# home; tmpfile-with-trap-rm would silently strand the gateway on first
# restart with "ENOENT".
SECRET_DIR="$HOME/.openclaw/secrets"
install -d -m 700 "$SECRET_DIR"
TOKEN_FILE="$SECRET_DIR/${PLUGIN}-bot-token"
umask 077
printf '%s' "$TOKEN" > "$TOKEN_FILE"
chmod 600 "$TOKEN_FILE"

# openclaw refuses to start the gateway when gateway.mode is unset (its
# "suspicious or clobbered config" guard), so set it explicitly. `local`
# matches the binding the agent uses (loopback on 127.0.0.1:18789).
"$OPENCLAW" config set gateway.mode local 2>&1 | tail -3

# `openclaw channels add` is idempotent — re-running with the same channel/
# account just updates the stored token, which is what we want when an agent
# rotates its bot.
"$OPENCLAW" channels add --channel "$PLUGIN" --token-file "$TOKEN_FILE" 2>&1 | tail -10

# Auto-pair: if the dashboard supplied a Telegram allowlist, patch the
# config in a single validated write. `config patch` is recursive-merge for
# objects + replace for arrays, so this both seeds new fields and overwrites
# stale lists (e.g. when the agent is recreated with a different operator).
if [[ -n "$ALLOW_FROM_JSON" ]]; then
  PATCH=$(ALLOW_FROM="$ALLOW_FROM_JSON" OWNER_ALLOW_FROM="$OWNER_ALLOW_FROM_JSON" \
    PLUGIN="$PLUGIN" python3 -c '
import json, os
patch = {
  "channels": { os.environ["PLUGIN"]: { "allowFrom": json.loads(os.environ["ALLOW_FROM"]) } },
  "commands": { "ownerAllowFrom": json.loads(os.environ["OWNER_ALLOW_FROM"]) },
}
print(json.dumps(patch))
')
  printf '%s' "$PATCH" | "$OPENCLAW" config patch --stdin 2>&1 | tail -5
fi
OPENCLAW_CHANNEL
  then
    fail "$E_GENERIC" \
      "openclaw $plugin channel registration failed for agent '$name'. Try: sudo -u $user openclaw channels add --channel $plugin --token-file <path>"
  fi
}

# Configure a chat channel for a hermes agent. Unlike openclaw, hermes has no
# non-interactive `channels add` — its messaging gateway reads credentials
# from $HOME/.hermes/.env (TELEGRAM_BOT_TOKEN / DISCORD_BOT_TOKEN), so we
# write the env var directly. We strip any prior assignment of the same key
# so re-creating an agent with a rotated token doesn't leave a stale line.
#
# For telegram, hermes also needs TELEGRAM_HOME_CHANNEL (the chat the gateway
# posts unsolicited messages to) and TELEGRAM_ALLOWED_USERS (the comma-
# separated allowlist of inbound senders) — without those the gateway starts
# but ignores every incoming message. Both are passed in as the same numeric
# user id from the dashboard wizard (the user pastes their own Telegram id,
# fetched from @userinfobot), but we keep them as separate args so a future
# multi-user flow can fan out the allowlist without touching the home
# channel.
install_channel_for_hermes_agent() {
  # Arguments: $1 plugin (telegram | discord), $2 agent name, $3 bot token,
  #            $4 optional home channel id, $5 optional allowed-users CSV
  #            ($4/$5 are only written for telegram).
  # Fails (via `fail`) on a missing agent user, an empty token, an unsupported
  # plugin, or a non-zero exit from the per-user env writer.
  local plugin="$1" name="$2" token="$3" home_channel="${4:-}" allowed_users="${5:-}"
  local user="agent-${name}"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"
  [[ -n "$token" ]] || fail "$E_VALIDATION" "hermes $plugin channel requires a bot token"

  local var
  case "$plugin" in
    telegram) var="TELEGRAM_BOT_TOKEN" ;;
    discord)  var="DISCORD_BOT_TOKEN"  ;;
    *) fail "$E_VALIDATION" "hermes channel plugin unsupported: $plugin" ;;
  esac

  # Build the (var,value) pairs to upsert. Always include the bot token; for
  # telegram, also include home_channel/allowed_users when supplied so a
  # missing dashboard arg doesn't silently wipe a previously-set value (we
  # strip-then-append per VAR, so omitted vars are left intact).
  local -a pairs=( "$var=$token" )
  if [[ "$plugin" == "telegram" ]]; then
    if [[ -n "$home_channel" ]]; then
      pairs+=( "TELEGRAM_HOME_CHANNEL=$home_channel" )
    fi
    if [[ -n "$allowed_users" ]]; then
      pairs+=( "TELEGRAM_ALLOWED_USERS=$allowed_users" )
    fi
  fi

  step "Writing $plugin credential into ~/.hermes/.env for $user"
  if ! sudo -u "$user" -H env PAIRS="$(printf '%s\n' "${pairs[@]}")" bash -s >&2 <<'HERMES_ENV'
set -euo pipefail
# Owner-only for everything created below. Without this, "$TMP.next" is
# created under the default umask and the `mv "$TMP.next" "$TMP"` swaps that
# inode in place of the 0600 mktemp file — the final .env (which holds the
# bot token) would end up e.g. 0644.
umask 077
ENV_FILE="$HOME/.hermes/.env"
mkdir -p "$HOME/.hermes"
chmod 700 "$HOME/.hermes"
touch "$ENV_FILE"
chmod 600 "$ENV_FILE"
# Build a tmpfile that drops every VAR we're about to set, then appends the
# fresh assignments. tmpfile + mv so a crash mid-write can't blank the env.
TMP=$(mktemp --tmpdir="$HOME/.hermes" .env.XXXXXX)
# Don't leave scratch copies of the secrets behind if a step fails.
trap 'rm -f "$TMP" "$TMP.next"' EXIT
cp "$ENV_FILE" "$TMP"
while IFS= read -r pair; do
  [[ -z "$pair" ]] && continue
  key="${pair%%=*}"
  # grep exits 1 when nothing survives the filter; that is not an error here.
  grep -v "^${key}=" "$TMP" > "$TMP.next" || true
  mv "$TMP.next" "$TMP"
  printf '%s\n' "$pair" >> "$TMP"
done <<< "$PAIRS"
# Belt and braces: pin the mode on the inode that becomes .env.
chmod 600 "$TMP"
mv "$TMP" "$ENV_FILE"
HERMES_ENV
  then
    fail "$E_GENERIC" \
      "hermes $plugin env write failed for agent '$name'."
  fi
}

# Idempotently install + start the hermes messaging gateway for agent-<name>.
# The gateway is a separate long-running process from the chat CLI and runs
# as `systemctl --user` so it stays owned by agent-<name> and can be
# restarted without touching the 5dive-agent@.service that hosts the tmux
# loop. linger keeps the user bus alive after we exit so the gateway
# survives logout. `gateway install` is idempotent — safe to re-run when
# rotating a token or attaching a channel post-create. Used by cmd_create
# (initial wiring) and cmd_config (post-create channel attach).
ensure_hermes_gateway() {
  local name="$1"
  local agent_user="agent-${name}"

  # Every step is best-effort: a failure only produces a warning.
  step "Enabling systemd linger for agent-${name}"
  if ! loginctl enable-linger "$agent_user" >&2; then
    warn "loginctl enable-linger failed for agent-${name} — gateway may not survive reboots"
  fi

  step "Installing hermes user gateway for agent-${name}"
  local hermes_bin="${TYPE_BIN[hermes]}"
  local action
  # `gateway install` first, then `gateway start`, both as the agent user.
  for action in install start; do
    if ! sudo -u "$agent_user" -H "$hermes_bin" gateway "$action" >&2; then
      warn "hermes gateway ${action} failed for agent '$name' (rerun: sudo -u ${agent_user} -H ${hermes_bin} gateway ${action})"
    fi
  done
}

# Upsert a KEY=VALUE pair into agent-${name}'s $HOME/.hermes/.env. Uses the
# same strip-then-append pattern as install_channel_for_hermes_agent so any
# previously-written channel tokens (TELEGRAM_BOT_TOKEN, etc.) are preserved.
# Called from cmd_create for hermes BYO providers whose key lives in .env
# rather than auth.json — Kimi/Moonshot (KIMI_API_KEY) today. The agent user
# must already exist; the gateway daemon (started later in cmd_create) reads
# .env at startup, so this must run before `hermes gateway start`.
seed_hermes_byo_env() {
  # Arguments: $1 agent name, $2 env var name, $3 value.
  # Fails (via `fail`) when the agent user is missing or the write fails.
  local name="$1" var="$2" value="$3"
  local user="agent-${name}"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"
  if ! sudo -u "$user" -H env PAIR="$var=$value" bash -s >&2 <<'HERMES_BYO_ENV'
set -euo pipefail
# Owner-only for everything created below. Without this, "$TMP.next" is
# created under the default umask and replaces the 0600 mktemp inode on the
# first mv, so the final .env (API keys, bot tokens) would end up e.g. 0644.
umask 077
ENV_FILE="$HOME/.hermes/.env"
mkdir -p "$HOME/.hermes"
chmod 700 "$HOME/.hermes"
touch "$ENV_FILE"
chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HOME/.hermes" .env.XXXXXX)
# Don't leave scratch copies of the secrets behind if a step fails.
trap 'rm -f "$TMP" "$TMP.next"' EXIT
cp "$ENV_FILE" "$TMP"
key="${PAIR%%=*}"
# Strip any previous assignment of this key (grep exits 1 when nothing is
# left, which is fine), then append the fresh one.
grep -v "^${key}=" "$TMP" > "$TMP.next" || true
mv "$TMP.next" "$TMP"
printf '%s\n' "$PAIR" >> "$TMP"
# Pin the mode on the inode that becomes .env, then swap it in atomically.
chmod 600 "$TMP"
mv "$TMP" "$ENV_FILE"
HERMES_BYO_ENV
  then
    fail "$E_GENERIC" "hermes BYO env write failed for agent '$name' ($var)"
  fi
}

# Resolve the telegram-codex plugin checkout. Ordered candidates so this works
# on the 5dive control-plane host (the 5dive-plugins repo checkout) today and
# on customer VMs once install.sh deploys the plugin into /usr/local/lib/5dive.
# Override with TELEGRAM_CODEX_PLUGIN_DIR for offline / test installs. Prints
# the dir on success; returns 1 if none has a server.ts. Kept in sync with the
# identical resolver in 5dive-agent-start (the boot script is standalone, not
# part of this bundle).
codex_plugin_dir() {
  local candidate
  # Search order: explicit override, system install, repo checkout.
  local -a candidates=(
    "${TELEGRAM_CODEX_PLUGIN_DIR:-}"
    /usr/local/lib/5dive/telegram-codex
    /home/claude/projects/5dive/5dive-plugins/plugins/telegram-codex
  )
  for candidate in "${candidates[@]}"; do
    # An empty override or a dir without server.ts is not a usable checkout.
    if [[ -z "$candidate" || ! -f "$candidate/server.ts" ]]; then
      continue
    fi
    printf '%s\n' "$candidate"
    return 0
  done
  return 1
}

# Configure the telegram channel for a codex agent. Codex has no plugin
# marketplace, so unlike claude there's nothing to install per-agent — a single
# shared plugin checkout serves every codex agent (server.ts resolves its state
# dir from the agent's own $HOME via homedir(), so per-agent isolation is
# automatic). Here we just (1) write the bot token into the agent's
# ~/.codex/channels/telegram/.env (the path the MCP server + pair.ts read) and
# (2) seed access.json so the bot answers the operator on the first DM. The MCP
# server + lifecycle hooks are wired into config.toml at boot by
# 5dive-agent-start, which also launches codex with --dangerously-bypass-hook-trust.
install_channel_for_codex_agent() {
  local plugin="$1" name="$2" token="$3" allowed_users="${4:-}"
  local user="agent-${name}"

  # --- Preconditions, checked in this order ---------------------------------
  if ! id -u "$user" &>/dev/null; then
    fail "$E_GENERIC" "agent user missing: $user"
  fi
  if [[ "$plugin" != "telegram" ]]; then
    fail "$E_VALIDATION" "codex channel plugin unsupported: $plugin (telegram only)"
  fi
  if [[ -z "$token" ]]; then
    fail "$E_VALIDATION" "codex telegram channel requires a bot token"
  fi
  if [[ -n "$allowed_users" ]] && ! valid_telegram_chat_id_list "$allowed_users"; then
    fail "$E_VALIDATION" "invalid allowed_users (comma-separated numeric ids)"
  fi

  # The shared plugin checkout must exist at create time — otherwise
  # 5dive-agent-start would boot the agent with no telegram bridge wired and
  # the failure would only surface in journalctl.
  if ! codex_plugin_dir >/dev/null; then
    fail "$E_NOT_INSTALLED" \
       "telegram-codex plugin not found (looked in /usr/local/lib/5dive/telegram-codex and the 5dive-plugins checkout). Set TELEGRAM_CODEX_PLUGIN_DIR or deploy the plugin."
  fi

  # bun runs server.ts + the hooks; fail fast here so the frontend gets a clear
  # message instead of a crash loop.
  if ! ensure_bun_for_agent "$user"; then
    fail "$E_NOT_INSTALLED" \
      "bun unavailable for $user (required by telegram-codex) and automatic install failed. Check network access to bun.sh, or install bun to /usr/local/bin manually, then retry."
  fi

  # --- Bot token -------------------------------------------------------------
  # Upsert TELEGRAM_BOT_TOKEN in ~/.codex/channels/telegram/.env: drop any old
  # line so a rotated token leaves nothing stale, append the new one, and swap
  # the file in with mv so a crash mid-write can't blank it.
  step "Writing telegram token into ~/.codex/channels/telegram/.env for $user"
  if ! sudo -u "$user" -H env TOKEN="$token" bash -s >&2 <<'CODEX_ENV'
set -euo pipefail
mkdir -p "$HOME/.codex/channels/telegram"
chmod 700 "$HOME/.codex" "$HOME/.codex/channels" "$HOME/.codex/channels/telegram" 2>/dev/null || true
ENV_FILE="$HOME/.codex/channels/telegram/.env"
touch "$ENV_FILE"
chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HOME/.codex/channels/telegram" .env.XXXXXX)
chmod 600 "$TMP"
grep -v '^TELEGRAM_BOT_TOKEN=' "$ENV_FILE" > "$TMP" || true
printf 'TELEGRAM_BOT_TOKEN=%s\n' "$TOKEN" >> "$TMP"
mv "$TMP" "$ENV_FILE"
CODEX_ENV
  then
    fail "$E_GENERIC" "codex telegram .env write failed for agent '$name'"
  fi

  # --- Allowlist ---------------------------------------------------------------
  # Seed access.json so the operator's DMs reach codex on the first message.
  if [[ -n "$allowed_users" ]]; then
    seed_codex_telegram_access "$name" "$allowed_users"
  fi

  # --- Skills ------------------------------------------------------------------
  # notify-user lets the agent self-start its comms loop on the first DM;
  # without it new codex+telegram agents go dark on first contact until the
  # operator copies SKILL.md by hand (or update.sh's 03:00 type-aware refresh
  # heals them). It is copied to /home/agent-<name>/.agents/skills/notify-user
  # when the source file is installed.
  local skill_src="$AGENT_SKILLS_DIR/notify-user/SKILL.md"
  local skill_dst_dir="/home/${user}/.agents/skills/notify-user"
  if [[ -f "$skill_src" ]]; then
    sudo -u "$user" mkdir -p "$skill_dst_dir"
    sudo -u "$user" cp "$skill_src" "$skill_dst_dir/SKILL.md"
  fi

  # Default skills, best-effort, matching preseed_claude_agent: find-skills +
  # 5dive-cli. Upstream `npx skills add --agent codex` IS supported (see
  # SKILLS_AGENT_ID), so these use the normal path, not the manual fallback.
  install_default_skill_for_agent "$name" codex vercel-labs/skills find-skills || true
  install_default_skill_for_agent "$name" codex 5dive-com/skills 5dive-cli || true
}

# Write ~/.codex/channels/telegram/access.json for agent-<name> with allowFrom
# seeded from a CSV of user ids. Idempotent — merges into an existing file so
# re-running only adds new ids. Matches the {allowFrom, groups} schema the codex
# server + pair.ts use (allowFrom is an array of string ids).
seed_codex_telegram_access() {
  # Arguments: $1 agent name, $2 CSV of telegram user ids.
  # Fails (via `fail`) when the Python merge exits non-zero.
  local name="$1" allowed_users="$2"
  local user="agent-${name}"
  step "Pre-seeding codex telegram allowlist for $user (${allowed_users})"
  # -H is required: the script resolves its state dir from "~", which Python
  # takes from $HOME. Without -H, a sudoers policy that preserves HOME would
  # point "~" at the invoking user's home instead of the agent's. Every other
  # $HOME-dependent sudo call in this file passes -H as well.
  if ! sudo -u "$user" -H env CSV="$allowed_users" python3 - <<'PY' >&2; then
import json, os, tempfile

state = os.path.join(os.path.expanduser('~'), '.codex', 'channels', 'telegram')
os.makedirs(state, mode=0o700, exist_ok=True)
access_path = os.path.join(state, 'access.json')
# Blank CSV entries and surrounding whitespace are dropped.
ids = [s.strip() for s in os.environ['CSV'].split(',') if s.strip()]

try:
    with open(access_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {"allowFrom": [], "groups": {}}

# Append only ids that are not already listed; existing entries keep their order.
allow = list(data.get('allowFrom') or [])
for s in ids:
    if s not in allow:
        allow.append(s)
data['allowFrom'] = allow
data.setdefault('groups', {})

# Write to a sibling temp file, then rename over access.json.
fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.chmod(tmp, 0o600)
os.replace(tmp, access_path)
print(f"Seeded allowFrom={allow} into {access_path}")
PY
    fail "$E_GENERIC" "codex telegram access.json pre-seed failed for agent '$name'"
  fi
}

# Resolve the telegram-grok plugin checkout. Same shape as codex_plugin_dir
# above — control-plane host uses the 5dive-plugins repo checkout, customer
# VMs use /usr/local/lib/5dive once install.sh deploys it. Override with
# TELEGRAM_GROK_PLUGIN_DIR for offline / test installs. The matching resolver
# in 5dive-agent-start is intentionally duplicated (the boot script is
# standalone).
grok_plugin_dir() {
  # Candidates in priority order: explicit override, deployed copy, repo checkout.
  local -a search_path=(
    "${TELEGRAM_GROK_PLUGIN_DIR:-}"
    /usr/local/lib/5dive/telegram-grok
    /home/claude/projects/5dive/5dive-plugins/plugins/telegram-grok
  )
  local candidate
  for candidate in "${search_path[@]}"; do
    # Skip an unset override and any directory that does not ship server.ts.
    if [[ -z "$candidate" || ! -f "$candidate/server.ts" ]]; then
      continue
    fi
    printf '%s\n' "$candidate"
    return 0
  done
  return 1
}

# Configure the telegram channel for a grok agent. Structurally identical to
# install_channel_for_codex_agent — grok has no plugin marketplace, so a single
# shared plugin checkout serves every grok agent (server.ts resolves its state
# dir from the agent's own $HOME via homedir()). We (1) write the bot token
# into ~/.grok/channels/telegram/.env and (2) seed access.json. The MCP server
# + lifecycle hooks are wired into ~/.grok/config.toml at boot by
# 5dive-agent-start. grok runs with --always-approve, which also auto-trusts
# plugin/MCP commands — no separate trust-bypass flag needed.
install_channel_for_grok_agent() {
  local plugin="$1" name="$2" token="$3" allowed_users="${4:-}"
  local user="agent-${name}"

  # Preconditions, checked in order; each failure aborts through fail.
  if ! id -u "$user" &>/dev/null; then
    fail "$E_GENERIC" "agent user missing: $user"
  fi
  if [[ "$plugin" != "telegram" ]]; then
    fail "$E_VALIDATION" "grok channel plugin unsupported: $plugin (telegram only)"
  fi
  if [[ -z "$token" ]]; then
    fail "$E_VALIDATION" "grok telegram channel requires a bot token"
  fi
  if [[ -n "$allowed_users" ]] && ! valid_telegram_chat_id_list "$allowed_users"; then
    fail "$E_VALIDATION" "invalid allowed_users (comma-separated numeric ids)"
  fi
  if ! grok_plugin_dir >/dev/null; then
    fail "$E_NOT_INSTALLED" \
       "telegram-grok plugin not found (looked in /usr/local/lib/5dive/telegram-grok and the 5dive-plugins checkout). Set TELEGRAM_GROK_PLUGIN_DIR or deploy the plugin."
  fi
  ensure_bun_for_agent "$user" \
    || fail "$E_NOT_INSTALLED" \
      "bun unavailable for $user (required by telegram-grok) and automatic install failed. Check network access to bun.sh, or install bun to /usr/local/bin manually, then retry."

  # Token write runs as the agent user. The script replaces any previous
  # TELEGRAM_BOT_TOKEN line in the .env via a temp file + mv.
  step "Writing telegram token into ~/.grok/channels/telegram/.env for $user"
  local write_rc=0
  sudo -u "$user" -H env TOKEN="$token" bash -s >&2 <<'GROK_ENV' || write_rc=$?
set -euo pipefail
mkdir -p "$HOME/.grok/channels/telegram"
chmod 700 "$HOME/.grok" "$HOME/.grok/channels" "$HOME/.grok/channels/telegram" 2>/dev/null || true
ENV_FILE="$HOME/.grok/channels/telegram/.env"
touch "$ENV_FILE"; chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HOME/.grok/channels/telegram" .env.XXXXXX); chmod 600 "$TMP"
grep -v '^TELEGRAM_BOT_TOKEN=' "$ENV_FILE" > "$TMP" || true
printf 'TELEGRAM_BOT_TOKEN=%s\n' "$TOKEN" >> "$TMP"
mv "$TMP" "$ENV_FILE"
GROK_ENV
  if (( write_rc != 0 )); then
    fail "$E_GENERIC" "grok telegram .env write failed for agent '$name'"
  fi

  # The allowlist is only seeded when ids were supplied.
  if [[ -n "$allowed_users" ]]; then
    seed_grok_telegram_access "$name" "$allowed_users"
  fi

  # Copy the notify-user skill into the agent's ~/.grok/skills when the
  # source SKILL.md is available on this host.
  local skill_src="$AGENT_SKILLS_DIR/notify-user/SKILL.md"
  local skill_dst_dir="/home/${user}/.grok/skills/notify-user"
  if [[ -f "$skill_src" ]]; then
    sudo -u "$user" mkdir -p "$skill_dst_dir"
    sudo -u "$user" cp "$skill_src" "$skill_dst_dir/SKILL.md"
  fi

  # Default skills are best-effort: a failed install never fails the channel
  # setup.
  install_default_skill_for_agent "$name" grok vercel-labs/skills find-skills || true
  install_default_skill_for_agent "$name" grok 5dive-com/skills 5dive-cli || true
}

# Write ~/.grok/channels/telegram/access.json for agent-<name> with allowFrom
# seeded from a CSV of user ids. Idempotent — mirrors seed_codex_telegram_access.
seed_grok_telegram_access() {
  # Arguments: $1 - agent name (without the "agent-" prefix)
  #            $2 - CSV of telegram user ids to allow
  # Aborts through fail (E_GENERIC) when the seeding script exits non-zero.
  local name="$1" allowed_users="$2"
  local user="agent-${name}"
  step "Pre-seeding grok telegram allowlist for $user (${allowed_users})"
  # -H pins HOME to the agent's home directory. The Python below locates the
  # state dir through expanduser('~'), which follows $HOME; with a sudoers
  # policy that preserves the caller's HOME the write would otherwise target
  # the wrong home. The .env token writers in this file already pass -H.
  # The CSV travels via the environment so the quoted heredoc stays literal,
  # and the script's progress line is sent to stderr (>&2).
  if ! sudo -u "$user" -H env CSV="$allowed_users" python3 - <<'PY' >&2; then
import json, os, tempfile

state = os.path.join(os.path.expanduser('~'), '.grok', 'channels', 'telegram')
os.makedirs(state, mode=0o700, exist_ok=True)
access_path = os.path.join(state, 'access.json')
ids = [s.strip() for s in os.environ['CSV'].split(',') if s.strip()]

# Start from the existing file when present so re-runs only add new ids.
try:
    with open(access_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {"allowFrom": [], "groups": {}}

allow = list(data.get('allowFrom') or [])
for s in ids:
    if s not in allow:
        allow.append(s)
data['allowFrom'] = allow
data.setdefault('groups', {})

# Write to a sibling temp file and rename over the target (atomic replace).
fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.chmod(tmp, 0o600)
os.replace(tmp, access_path)
print(f"Seeded allowFrom={allow} into {access_path}")
PY
    fail "$E_GENERIC" "grok telegram access.json pre-seed failed for agent '$name'"
  fi
}

# Resolve the telegram-agy plugin checkout. Same shape as grok_plugin_dir
# above — control-plane host uses the 5dive-plugins repo checkout, customer
# VMs use /usr/local/lib/5dive once install.sh deploys it. Override with
# TELEGRAM_AGY_PLUGIN_DIR for offline / test installs. The matching resolver
# in 5dive-agent-start is intentionally duplicated (the boot script is
# standalone).
antigravity_plugin_dir() {
  # Candidates in priority order: explicit override, deployed copy, repo checkout.
  local -a search_path=(
    "${TELEGRAM_AGY_PLUGIN_DIR:-}"
    /usr/local/lib/5dive/telegram-agy
    /home/claude/projects/5dive/5dive-plugins/plugins/telegram-agy
  )
  local candidate
  for candidate in "${search_path[@]}"; do
    # Skip an unset override and any directory that does not ship server.ts.
    if [[ -z "$candidate" || ! -f "$candidate/server.ts" ]]; then
      continue
    fi
    printf '%s\n' "$candidate"
    return 0
  done
  return 1
}

# Configure the telegram channel for an antigravity (agy) agent. Structurally
# identical to install_channel_for_grok_agent — agy has no plugin marketplace,
# so a single shared plugin checkout serves every agy agent (server.ts resolves
# its state dir from the agent's own $HOME via homedir() → ~/.gemini/channels/
# telegram). We (1) write the bot token into ~/.gemini/channels/telegram/.env
# and (2) seed access.json. The MCP server + lifecycle hooks are wired into the
# GLOBAL ~/.gemini/config/{mcp_config.json,hooks.json} at boot by
# 5dive-agent-start (agy doesn't auto-load a plugin's mcp_config/hooks — only
# skills/agents). agy runs with --dangerously-skip-permissions (set in
# 5dive-agent-start), so there are no tool prompts to bridge.
install_channel_for_antigravity_agent() {
  local plugin="$1" name="$2" token="$3" allowed_users="${4:-}"
  local user="agent-${name}"

  # Preconditions, checked in order; each failure aborts through fail.
  if ! id -u "$user" &>/dev/null; then
    fail "$E_GENERIC" "agent user missing: $user"
  fi
  if [[ "$plugin" != "telegram" ]]; then
    fail "$E_VALIDATION" "antigravity channel plugin unsupported: $plugin (telegram only)"
  fi
  if [[ -z "$token" ]]; then
    fail "$E_VALIDATION" "antigravity telegram channel requires a bot token"
  fi
  if [[ -n "$allowed_users" ]] && ! valid_telegram_chat_id_list "$allowed_users"; then
    fail "$E_VALIDATION" "invalid allowed_users (comma-separated numeric ids)"
  fi
  if ! antigravity_plugin_dir >/dev/null; then
    fail "$E_NOT_INSTALLED" \
       "telegram-agy plugin not found (looked in /usr/local/lib/5dive/telegram-agy and the 5dive-plugins checkout). Set TELEGRAM_AGY_PLUGIN_DIR or deploy the plugin."
  fi
  ensure_bun_for_agent "$user" \
    || fail "$E_NOT_INSTALLED" \
      "bun unavailable for $user (required by telegram-agy) and automatic install failed. Check network access to bun.sh, or install bun to /usr/local/bin manually, then retry."

  # Token write runs as the agent user. The script replaces any previous
  # TELEGRAM_BOT_TOKEN line in the .env via a temp file + mv.
  step "Writing telegram token into ~/.gemini/channels/telegram/.env for $user"
  local write_rc=0
  sudo -u "$user" -H env TOKEN="$token" bash -s >&2 <<'AGY_ENV' || write_rc=$?
set -euo pipefail
mkdir -p "$HOME/.gemini/channels/telegram"
chmod 700 "$HOME/.gemini" "$HOME/.gemini/channels" "$HOME/.gemini/channels/telegram" 2>/dev/null || true
ENV_FILE="$HOME/.gemini/channels/telegram/.env"
touch "$ENV_FILE"; chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HOME/.gemini/channels/telegram" .env.XXXXXX); chmod 600 "$TMP"
grep -v '^TELEGRAM_BOT_TOKEN=' "$ENV_FILE" > "$TMP" || true
printf 'TELEGRAM_BOT_TOKEN=%s\n' "$TOKEN" >> "$TMP"
mv "$TMP" "$ENV_FILE"
AGY_ENV
  if (( write_rc != 0 )); then
    fail "$E_GENERIC" "antigravity telegram .env write failed for agent '$name'"
  fi

  # The allowlist is only seeded when ids were supplied.
  if [[ -n "$allowed_users" ]]; then
    seed_antigravity_telegram_access "$name" "$allowed_users"
  fi

  # Copy the notify-user skill into the agent's ~/.agents/skills when the
  # source SKILL.md is available on this host.
  local skill_src="$AGENT_SKILLS_DIR/notify-user/SKILL.md"
  local skill_dst_dir="/home/${user}/.agents/skills/notify-user"
  if [[ -f "$skill_src" ]]; then
    sudo -u "$user" mkdir -p "$skill_dst_dir"
    sudo -u "$user" cp "$skill_src" "$skill_dst_dir/SKILL.md"
  fi

  # Default skills are best-effort: a failed install never fails the channel
  # setup.
  install_default_skill_for_agent "$name" antigravity vercel-labs/skills find-skills || true
  install_default_skill_for_agent "$name" antigravity 5dive-com/skills 5dive-cli || true
}

# Write ~/.gemini/channels/telegram/access.json for agent-<name> with allowFrom
# seeded from a CSV of user ids. Idempotent — mirrors seed_grok_telegram_access.
seed_antigravity_telegram_access() {
  # Arguments: $1 - agent name (without the "agent-" prefix)
  #            $2 - CSV of telegram user ids to allow
  # Aborts through fail (E_GENERIC) when the seeding script exits non-zero.
  local name="$1" allowed_users="$2"
  local user="agent-${name}"
  step "Pre-seeding antigravity telegram allowlist for $user (${allowed_users})"
  # -H pins HOME to the agent's home directory. The Python below locates the
  # state dir through expanduser('~'), which follows $HOME; with a sudoers
  # policy that preserves the caller's HOME the write would otherwise target
  # the wrong home. The .env token writers in this file already pass -H.
  # The CSV travels via the environment so the quoted heredoc stays literal,
  # and the script's progress line is sent to stderr (>&2).
  if ! sudo -u "$user" -H env CSV="$allowed_users" python3 - <<'PY' >&2; then
import json, os, tempfile

state = os.path.join(os.path.expanduser('~'), '.gemini', 'channels', 'telegram')
os.makedirs(state, mode=0o700, exist_ok=True)
access_path = os.path.join(state, 'access.json')
ids = [s.strip() for s in os.environ['CSV'].split(',') if s.strip()]

# Start from the existing file when present so re-runs only add new ids.
try:
    with open(access_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {"allowFrom": [], "groups": {}}

allow = list(data.get('allowFrom') or [])
for s in ids:
    if s not in allow:
        allow.append(s)
data['allowFrom'] = allow
data.setdefault('groups', {})

# Write to a sibling temp file and rename over the target (atomic replace).
fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.chmod(tmp, 0o600)
os.replace(tmp, access_path)
print(f"Seeded allowFrom={allow} into {access_path}")
PY
    fail "$E_GENERIC" "antigravity telegram access.json pre-seed failed for agent '$name'"
  fi
}

# Resolve the telegram-opencode plugin checkout. Same shape as grok_plugin_dir
# above — control-plane host uses the 5dive-plugins repo checkout, customer VMs
# use /usr/local/lib/5dive once install.sh deploys it. Override with
# TELEGRAM_OPENCODE_PLUGIN_DIR for offline / test installs. The matching
# resolver in 5dive-agent-start is intentionally duplicated (the boot script is
# standalone).
opencode_plugin_dir() {
  # Candidates in priority order: explicit override, deployed copy, repo checkout.
  local -a search_path=(
    "${TELEGRAM_OPENCODE_PLUGIN_DIR:-}"
    /usr/local/lib/5dive/telegram-opencode
    /home/claude/projects/5dive/5dive-plugins/plugins/telegram-opencode
  )
  local candidate
  for candidate in "${search_path[@]}"; do
    # Skip an unset override and any directory that does not ship server.ts.
    if [[ -z "$candidate" || ! -f "$candidate/server.ts" ]]; then
      continue
    fi
    printf '%s\n' "$candidate"
    return 0
  done
  return 1
}

# Configure the telegram channel for an opencode agent. Unlike codex/grok/agy
# (MCP servers wired into the runtime's config), opencode is a STANDALONE RELAY:
# the bridge (telegram-opencode/server.ts) IS the agent's main process and
# itself spawns `opencode serve` over loopback HTTP — 5dive-agent-start launches
# `bun server.ts` instead of the opencode TUI for channels=telegram. So there's
# no config.toml/MCP wiring here; the installer just (1) writes the bot token
# into ~/.opencode/channels/telegram/.env (the path the relay reads its token
# from) and (2) seeds access.json. A single shared plugin checkout serves every
# opencode agent (the relay resolves its state dir from the agent's own $HOME).
install_channel_for_opencode_agent() {
  local plugin="$1" name="$2" token="$3" allowed_users="${4:-}"
  local user="agent-${name}"

  # Preconditions, checked in order; each failure aborts through fail.
  if ! id -u "$user" &>/dev/null; then
    fail "$E_GENERIC" "agent user missing: $user"
  fi
  if [[ "$plugin" != "telegram" ]]; then
    fail "$E_VALIDATION" "opencode channel plugin unsupported: $plugin (telegram only)"
  fi
  if [[ -z "$token" ]]; then
    fail "$E_VALIDATION" "opencode telegram channel requires a bot token"
  fi
  if [[ -n "$allowed_users" ]] && ! valid_telegram_chat_id_list "$allowed_users"; then
    fail "$E_VALIDATION" "invalid allowed_users (comma-separated numeric ids)"
  fi
  if ! opencode_plugin_dir >/dev/null; then
    fail "$E_NOT_INSTALLED" \
       "telegram-opencode plugin not found (looked in /usr/local/lib/5dive/telegram-opencode and the 5dive-plugins checkout). Set TELEGRAM_OPENCODE_PLUGIN_DIR or deploy the plugin."
  fi

  # bun runs the relay (server.ts), which is the agent process itself; a
  # missing bun would crash-loop the unit, so fail at create time instead.
  ensure_bun_for_agent "$user" \
    || fail "$E_NOT_INSTALLED" \
      "bun unavailable for $user (required by telegram-opencode) and automatic install failed. Check network access to bun.sh, or install bun to /usr/local/bin manually, then retry."

  # Token write runs as the agent user. The script replaces any previous
  # TELEGRAM_BOT_TOKEN line in the .env via a temp file + mv.
  step "Writing telegram token into ~/.opencode/channels/telegram/.env for $user"
  local write_rc=0
  sudo -u "$user" -H env TOKEN="$token" bash -s >&2 <<'OPENCODE_ENV' || write_rc=$?
set -euo pipefail
mkdir -p "$HOME/.opencode/channels/telegram"
chmod 700 "$HOME/.opencode" "$HOME/.opencode/channels" "$HOME/.opencode/channels/telegram" 2>/dev/null || true
ENV_FILE="$HOME/.opencode/channels/telegram/.env"
touch "$ENV_FILE"; chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HOME/.opencode/channels/telegram" .env.XXXXXX); chmod 600 "$TMP"
grep -v '^TELEGRAM_BOT_TOKEN=' "$ENV_FILE" > "$TMP" || true
printf 'TELEGRAM_BOT_TOKEN=%s\n' "$TOKEN" >> "$TMP"
mv "$TMP" "$ENV_FILE"
OPENCODE_ENV
  if (( write_rc != 0 )); then
    fail "$E_GENERIC" "opencode telegram .env write failed for agent '$name'"
  fi

  # access.json is seeded unconditionally (DIVE-45): with no file the bridge
  # blocks every DM, so even an empty allowed_users must produce one. The seed
  # helper picks dmPolicy=pairing for that empty case.
  seed_opencode_telegram_access "$name" "$allowed_users"

  # Copy the notify-user skill into the agent's ~/.agents/skills when the
  # source SKILL.md is available on this host.
  local skill_src="$AGENT_SKILLS_DIR/notify-user/SKILL.md"
  local skill_dst_dir="/home/${user}/.agents/skills/notify-user"
  if [[ -f "$skill_src" ]]; then
    sudo -u "$user" mkdir -p "$skill_dst_dir"
    sudo -u "$user" cp "$skill_src" "$skill_dst_dir/SKILL.md"
  fi

  # Default skills are best-effort: a failed install never fails the channel
  # setup.
  install_default_skill_for_agent "$name" opencode vercel-labs/skills find-skills || true
  install_default_skill_for_agent "$name" opencode 5dive-com/skills 5dive-cli || true
}

# Write ~/.opencode/channels/telegram/access.json for agent-<name>. Idempotent —
# mirrors seed_grok_telegram_access, but the opencode bridge's access schema also
# carries dmPolicy + pending (see the telegram-opencode PROVISIONING-CONTRACT
# §3), so we default those too. allowed_users may be empty: we still write the
# file (the bridge block-everything default makes a missing file a silent-DM-drop
# — DIVE-45), defaulting dmPolicy to 'pairing' so the first DM pairs instead of
# being dropped. An existing dmPolicy is never overridden (re-runs / token
# rotation preserve the operator's choice).
seed_opencode_telegram_access() {
  # Arguments: $1 - agent name (without the "agent-" prefix)
  #            $2 - CSV of telegram user ids to allow; may be empty
  # Aborts through fail (E_GENERIC) when the seeding script exits non-zero.
  local name="$1" allowed_users="$2"
  local user="agent-${name}"
  if [[ -n "$allowed_users" ]]; then
    step "Seeding opencode telegram access.json for $user (allow ${allowed_users})"
  else
    step "Seeding opencode telegram access.json for $user (dmPolicy=pairing)"
  fi
  # -H pins HOME to the agent's home directory. The Python below locates the
  # state dir through expanduser('~'), which follows $HOME; with a sudoers
  # policy that preserves the caller's HOME the write would otherwise target
  # the wrong home. The .env token writers in this file already pass -H.
  # The CSV travels via the environment so the quoted heredoc stays literal,
  # and the script's progress line is sent to stderr (>&2).
  if ! sudo -u "$user" -H env CSV="$allowed_users" python3 - <<'PY' >&2; then
import json, os, tempfile

state = os.path.join(os.path.expanduser('~'), '.opencode', 'channels', 'telegram')
os.makedirs(state, mode=0o700, exist_ok=True)
access_path = os.path.join(state, 'access.json')
ids = [s.strip() for s in os.environ['CSV'].split(',') if s.strip()]

# Start from the existing file when present so re-runs only add new ids.
try:
    with open(access_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {}

allow = list(data.get('allowFrom') or [])
for s in ids:
    if s not in allow:
        allow.append(s)
data['allowFrom'] = allow
data.setdefault('groups', {})
data.setdefault('pending', {})
# Never override an explicit dmPolicy (re-run / rotation keeps the operator's
# choice). For a fresh file: allowlist when we have ids to allow, else pairing
# so the first DM yields a pairing code instead of a silent drop.
if 'dmPolicy' not in data:
    data['dmPolicy'] = 'allowlist' if allow else 'pairing'

# Write to a sibling temp file and rename over the target (atomic replace).
fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.chmod(tmp, 0o600)
os.replace(tmp, access_path)
print(f"Seeded access.json dmPolicy={data['dmPolicy']} allowFrom={allow} into {access_path}")
PY
    fail "$E_GENERIC" "opencode telegram access.json pre-seed failed for agent '$name'"
  fi
}

# Single dispatch point used by cmd_create. Routes a (type, plugin) pair to
# the right install helper above, so the create flow stays type-agnostic.
# home_channel is an openclaw/hermes extra (ignored by the other routes);
# allowed_users is forwarded to every route. Both stay positional so the call
# site stays uniform.
install_channel_for_agent() {
  local type="$1" plugin="$2" name="$3" token="$4"
  local home_channel="${5:-}" allowed_users="${6:-}"
  # Build the helper invocation first, run it once at the end. Three argument
  # shapes exist: claude takes no token, openclaw/hermes additionally take
  # home_channel, and the remaining types take (plugin, name, token, users).
  local -a route
  case "$type" in
    claude)
      route=(install_channel_plugin_for_agent "$plugin" "$name" "$allowed_users")
      ;;
    codex|grok|antigravity|opencode)
      route=("install_channel_for_${type}_agent" "$plugin" "$name" "$token" "$allowed_users")
      ;;
    openclaw|hermes)
      route=("install_channel_for_${type}_agent" "$plugin" "$name" "$token" "$home_channel" "$allowed_users")
      ;;
    *)
      fail "$E_VALIDATION" "type '$type' does not support channels"
      ;;
  esac
  "${route[@]}"
}

# Create (or repair) the on-disk state layout: state + env dirs, the agent
# registry, the registry lock file, the tasks dir and the audit log. Must run
# as root (require_root). Aborts through fail (E_GENERIC) when an existing
# registry cannot be migrated.
ensure_state() {
  require_root
  mkdir -p "$STATE_DIR" "$ENV_DIR"
  chown root:claude "$STATE_DIR" "$ENV_DIR"
  chmod 2750 "$STATE_DIR" "$ENV_DIR"
  # -s rather than -f: a zero-length registry carries no data and is treated
  # like a missing one. The redirect below creates the file before jq runs, so
  # a first run that dies at that point (e.g. jq not installed yet) leaves an
  # empty agents.json behind; the migration branch cannot repair that (jq
  # emits nothing for empty input), so re-create the skeleton instead.
  if [[ ! -s "$REGISTRY" ]]; then
    jq -cn --argjson v "$REGISTRY_SCHEMA_VERSION" \
      '{schemaVersion:$v, agents:{}}' > "$REGISTRY"
  else
    # v0 -> current migration. Pre-version registries had no top-level
    # schemaVersion; stamp it in place. Pure jq so no extra deps needed.
    local current
    current=$(jq -r '.schemaVersion // 0' "$REGISTRY" 2>/dev/null || echo 0)
    if (( current < REGISTRY_SCHEMA_VERSION )); then
      local tmp
      tmp=$(mktemp "${REGISTRY}.XXXXXX")
      # An unparseable registry makes jq fail here. Remove the temp file (one
      # would otherwise pile up on every invocation) and report the failure
      # through fail instead of letting set -e end the script mid-function.
      if ! jq --argjson v "$REGISTRY_SCHEMA_VERSION" '.schemaVersion = $v' "$REGISTRY" > "$tmp"; then
        rm -f -- "$tmp"
        fail "$E_GENERIC" "registry migration failed: could not stamp schemaVersion into $REGISTRY (invalid JSON?)"
      fi
      chown root:claude "$tmp"
      chmod 640 "$tmp"
      mv "$tmp" "$REGISTRY"
    fi
  fi
  chown root:claude "$REGISTRY"
  chmod 640 "$REGISTRY"
  # Touch the lock file so flock -x has a target even on first run.
  [[ -f "$REGISTRY_LOCK" ]] || : > "$REGISTRY_LOCK"
  chown root:claude "$REGISTRY_LOCK"
  chmod 640 "$REGISTRY_LOCK"
  # Group-writable tasks/org store (unlike the rest of STATE_DIR, which is
  # root-only): the shared task queue is meant to be used by every agent
  # without sudo. 2770 + setgid keeps the db and its -wal/-shm sidecars
  # owned by group claude and writable across agent users. tasks_db_init
  # (re)applies the schema lazily on first use.
  mkdir -p "$TASKS_DIR"
  chown root:claude "$TASKS_DIR"
  chmod 2770 "$TASKS_DIR"
  audit_init
}

# Initialise the append-only audit log. Readable by group `claude` so the
# dashboard process (which runs as `claude`) can `tail` it without sudo.
audit_init() {
  local log_dir
  log_dir=$(dirname "$AUDIT_LOG")

  # Directory first: create when absent, then (re)apply owner and mode.
  if [[ ! -d "$log_dir" ]]; then
    mkdir -p "$log_dir"
  fi
  chown root:claude "$log_dir"
  chmod 2750 "$log_dir"

  # Log file next: an existing log is never truncated, only re-owned and
  # re-moded.
  if [[ ! -f "$AUDIT_LOG" ]]; then
    : > "$AUDIT_LOG"
  fi
  chown root:claude "$AUDIT_LOG"
  chmod 640 "$AUDIT_LOG"
}

# audit_log <cmd> <result:ok|error> <code> -- <args...>
# Emits one NDJSON line. Sensitive =<value> args are redacted ("--api-key=..."
# becomes "--api-key=<redacted>"). Never fails the caller — writes are
# best-effort so a full disk can't block a rescue rm.
audit_log() {
  # Best-effort: skip silently if the audit dir isn't initialized yet.
  # Some code paths (cmd_auth_start, the read-only commands) don't go
  # through ensure_state and thus don't trigger audit_init, leaving
  # /var/log/5dive/ missing. Without this guard the `>> "$AUDIT_LOG"`
  # redirect bash-errors with "No such file or directory" BEFORE jq's
  # `2>/dev/null` can suppress anything, leaking a noisy line to stderr
  # on every invocation.
  [[ -d "${AUDIT_LOG%/*}" ]] || return 0
  local cmd="$1" result="$2" code="$3"
  shift 3
  # An optional "--" separates the fixed fields from the logged argv.
  if [[ "${1:-}" == "--" ]]; then
    shift
  fi

  # Redact the value of sensitive --flag=value arguments; everything else is
  # logged verbatim.
  local -a sanitized=()
  local a
  for a in "$@"; do
    case "$a" in
      --api-key=*|--telegram-token=*|--discord-token=*|--code=*|--token=*)
        sanitized+=("${a%%=*}=<redacted>") ;;
      *)
        sanitized+=("$a") ;;
    esac
  done

  local user="${FIVEDIVE_AUDIT_USER:-${SUDO_USER:-${USER:-unknown}}}"
  # A failing date must not abort the caller under set -e (this function is
  # best-effort), so fall back to an empty timestamp.
  local ts
  ts=$(date -Iseconds 2>/dev/null) || ts=""

  # Two details keep records from being silently dropped:
  #  * code is passed with --arg (a string), not --argjson: an empty or
  #    non-numeric value would make --argjson reject the whole command line,
  #    so the `tonumber? // 0` fallback in the filter could never run.
  #  * "--" ends jq's option parsing before the logged argv, so logged
  #    arguments that themselves start with "-" (e.g. --type=claude) are
  #    taken as positional strings rather than parsed as jq options.
  jq -cn \
    --arg ts "$ts" --arg u "$user" --arg c "$cmd" \
    --arg r "$result" --arg code "$code" \
    --args '{ts:$ts, user:$u, cmd:$c, result:$r, code:($code|tonumber? // 0), args:$ARGS.positional}' \
    -- "${sanitized[@]+"${sanitized[@]}"}" \
    >> "$AUDIT_LOG" 2>/dev/null || true
}

# Dispatcher-level audit state. main() populates these before calling the
# mutating handler; the EXIT trap below fires audit_log with the real exit
# code on the way out. Unset for read-only commands (list/logs/stats) so
# they don't clutter the log.
AUDIT_CMD=""
declare -a AUDIT_ARGS=()

# EXIT-trap body: writes one audit record for the command named in AUDIT_CMD,
# using the exit status that was current when the trap fired.
on_exit_audit() {
  # Must be the first statement: any other command would overwrite $?.
  local code=$?
  if [[ -z "$AUDIT_CMD" ]]; then
    # Nothing was registered for auditing.
    return 0
  fi
  local result
  if (( code == 0 )); then
    result="ok"
  else
    result="error"
  fi
  audit_log "$AUDIT_CMD" "$result" "$code" -- "${AUDIT_ARGS[@]+"${AUDIT_ARGS[@]}"}"
}

# Serialize mutating calls against a single flock. Lock is released when the
# subshell exits, so even a crash inside the handler frees it. Re-entrancy:
# IN_REGISTRY_LOCK=1 lets cmd_clone -> cmd_create run the inner command
# without trying to re-acquire the same lock (which flock would block on).
with_registry_lock() {
  local action="$1"
  shift
  if [[ "${IN_REGISTRY_LOCK:-0}" == "1" ]]; then
    # Nested call from a handler that already holds the lock: run directly.
    "$action" "$@"
  else
    ensure_state
    # fd 200 is opened on the lock file for the lifetime of the subshell;
    # flock holds it exclusively until the subshell exits.
    (
      flock -x 200
      IN_REGISTRY_LOCK=1
      "$action" "$@"
    ) 200>"$REGISTRY_LOCK"
  fi
}

# Print the registry JSON on stdout; when no registry file exists, print an
# empty-registry document instead. Returns cat's status for an existing file.
registry_read() {
  # Deliberately an if/else rather than `A && B || C`: with the short-circuit
  # form a failed cat would fall through to the fallback and report an empty
  # registry (possibly after partial output) instead of surfacing the read
  # error to the caller.
  if [[ -f "$REGISTRY" ]]; then
    cat -- "$REGISTRY"
  else
    echo '{"agents":{}}'
  fi
}

# Replace the registry with the JSON read from stdin, atomically (temp file in
# the same directory + mv). Aborts through fail (E_GENERIC), leaving the
# current registry untouched and no temp file behind, when the payload is
# empty or not valid JSON, or when installing the new file fails.
registry_write() {
  local tmp
  tmp=$(mktemp "${REGISTRY}.XXXXXX")
  # Validate before installing. This function sits at the end of pipelines,
  # whose stages run concurrently: a producer that dies early hands us empty
  # or truncated input, which would otherwise replace the registry before the
  # pipeline's failure status is ever seen.
  if ! cat > "$tmp" || [[ ! -s "$tmp" ]] || ! jq empty "$tmp" 2>/dev/null; then
    rm -f -- "$tmp"
    fail "$E_GENERIC" "refusing to overwrite $REGISTRY: new content is empty or not valid JSON"
  fi
  if ! { chown root:claude "$tmp" && chmod 640 "$tmp" && mv -- "$tmp" "$REGISTRY"; }; then
    rm -f -- "$tmp"
    fail "$E_GENERIC" "failed to install new registry at $REGISTRY"
  fi
}

# -------- tasks + org store (sqlite) --------
#
# A light, host-shared task queue + agent org-chart, kept SEPARATE from the
# root-only agent registry. It lives in a GROUP-WRITABLE subdir so any agent
# (every agent-<x> user is in group `claude`) can add/list/update tasks
# WITHOUT sudo — these are high-frequency, low-risk operations, unlike
# `agent create` which provisions Linux users and stays root-only.
#
# Storage: /var/lib/5dive/tasks/tasks.db (sqlite, WAL). The dir is 2770
# root:claude (setgid) and we run under umask 0002 so the .db plus its
# -wal/-shm sidecars stay group-writable for the next agent's connection.

# Tasks store directory (a subdir of STATE_DIR) and the sqlite file inside it.
TASKS_DIR="$STATE_DIR/tasks"
TASKS_DB="$TASKS_DIR/tasks.db"

# Quote an arbitrary string as a SQL literal: double embedded single quotes
# and wrap. The sqlite3 CLI has no ergonomic bind-parameter path from bash,
# so this is the safe way to inline a shell value — use it for EVERY
# user-supplied TEXT value to keep injection impossible.
sqlq() {
  # Double every embedded single quote, then wrap the result in single quotes.
  local quote="'"
  local escaped=${1//$quote/$quote$quote}
  printf "'%s'" "$escaped"
}

# SQL NULL for empty input, otherwise a quoted literal.
sqlq_or_null() {
  # A missing or empty argument becomes the bare keyword NULL (unquoted);
  # anything else is delegated to sqlq.
  if [[ -n "${1:-}" ]]; then
    sqlq "$1"
  else
    printf 'NULL'
  fi
}

# Agents can't apt-install, so route a missing binary to the repair path
# rather than a raw "sqlite3: command not found".
require_sqlite() {
  # command -v resolves sqlite3 the same way a later invocation would.
  if ! command -v sqlite3 >/dev/null 2>&1; then
    fail "$E_NOT_INSTALLED" \
      "sqlite3 not installed — run: sudo 5dive doctor --repair  (or: sudo apt-get install -y sqlite3)"
  fi
}

# Idempotent schema. CREATE IF NOT EXISTS throughout, so re-applying it on
# every command is cheap and self-heals a fresh box. DIVE-N idents come from
# a trigger off the autoincrement rowid.
_tasks_schema() {
  # Print the tasks-store DDL on stdout. The heredoc delimiter is quoted, so
  # the SQL below is emitted verbatim (no shell expansion). It defines the
  # tasks, task_deps and agents_org tables, three indexes on tasks, the
  # ident/updated_at triggers and the task_board view.
  cat <<'SQL'
PRAGMA journal_mode=WAL;
PRAGMA foreign_keys=ON;

CREATE TABLE IF NOT EXISTS tasks (
  id          INTEGER PRIMARY KEY AUTOINCREMENT,
  ident       TEXT UNIQUE,
  title       TEXT NOT NULL,
  body        TEXT,
  status      TEXT NOT NULL DEFAULT 'todo',
  priority    TEXT NOT NULL DEFAULT 'medium',
  assignee    TEXT,
  created_by  TEXT,
  parent_id   INTEGER REFERENCES tasks(id) ON DELETE CASCADE,
  created_at  TEXT NOT NULL DEFAULT (datetime('now')),
  started_at  TEXT,
  done_at     TEXT,
  updated_at  TEXT NOT NULL DEFAULT (datetime('now')),
  -- Result text captured at close time via `5dive task done <id> --result=…`.
  -- Lets dashboard + creators read what the assignee produced without
  -- scraping the tmux pane. NULL for open tasks + legacy rows closed before
  -- the column existed.
  result      TEXT,
  -- Human-gate fields (Human Task Inbox, DIVE-103; parent feature DIVE-102).
  -- A task an agent can't finish without a human (a decision, a secret, an
  -- approval, a manual step) is parked with `5dive task need`: status=blocked
  -- + need_type set. The inbox is the still-pending gates — see the canonical
  -- definition just below (need_type IS NOT NULL AND need_answered_at IS NULL).
  -- All NULL for ordinary tasks. need_options is pipe-delimited (decision
  -- choices). need_answered_at is the single "answered" signal — set by
  -- `task answer` for EVERY gate type, so the inbox (need_type IS NOT NULL AND
  -- need_answered_at IS NULL) is decoupled from the overloaded `status` column
  -- (a task can be both human-gated AND blocked-by another task). need_answer
  -- holds the value for decision/approval/manual; for `secret` it stays NULL —
  -- a raw key must NEVER land in this group-readable db (answer records only
  -- that it was provided, and the agent loads the key out-of-band).
  need_type        TEXT,
  ask              TEXT,
  need_options     TEXT,
  -- DIVE-148. recommend is the option text the filing agent advises (strongly
  -- encouraged for decision/approval). When set it leads the human alert as
  -- '✅ Recommended: <X>' and that option's tap button sorts first (⭐). For a
  -- decision it must match one of need_options; for approval it's free text
  -- (typically 'approved'/'denied'). NULL when the agent gave no recommendation.
  recommend        TEXT,
  need_answer      TEXT,
  need_answered_at TEXT,
  -- Recurring task templates (DIVE step 1). kind='recurring' marks a row as a
  -- TEMPLATE, not work: it's excluded from the work board, the heartbeat TODO
  -- count + wake, and the human inbox, so it's never picked up directly.
  -- `schedule` is a 5-field cron expression; the step-2 materializer clones the
  -- template into a fresh kind='standard' todo when due and stamps
  -- last_fired_at. Ordinary tasks are kind='standard' (the default) with both
  -- schedule + last_fired_at NULL.
  kind             TEXT NOT NULL DEFAULT 'standard',
  schedule         TEXT,
  last_fired_at    TEXT,
  -- DIVE-138 step 2. A materialized instance links back to the recurring
  -- template it was cloned from via from_template_id (NULL for templates and
  -- ordinary tasks); the materializer's skip-if-open dedup keys on it. NOT a FK
  -- with cascade — deleting a template must not nuke its already-materialized
  -- instances' history. `fresh` (1/0/NULL) is the per-template clean-session
  -- pref copied onto each instance: when 1 the heartbeat sends /clear before
  -- working it regardless of the agent-level fresh setting.
  from_template_id INTEGER,
  fresh            INTEGER
);

CREATE TABLE IF NOT EXISTS task_deps (
  task_id     INTEGER NOT NULL REFERENCES tasks(id) ON DELETE CASCADE,
  blocked_by  INTEGER NOT NULL REFERENCES tasks(id) ON DELETE CASCADE,
  PRIMARY KEY (task_id, blocked_by)
);

CREATE TABLE IF NOT EXISTS agents_org (
  name        TEXT PRIMARY KEY,
  reports_to  TEXT REFERENCES agents_org(name) ON DELETE SET NULL,
  role        TEXT,
  title       TEXT,
  updated_at  TEXT NOT NULL DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS tasks_status_idx   ON tasks(status);
CREATE INDEX IF NOT EXISTS tasks_assignee_idx ON tasks(assignee, status);
CREATE INDEX IF NOT EXISTS tasks_parent_idx   ON tasks(parent_id);

CREATE TRIGGER IF NOT EXISTS tasks_ident_ai AFTER INSERT ON tasks
WHEN NEW.ident IS NULL
BEGIN
  UPDATE tasks SET ident='DIVE-'||NEW.id WHERE id=NEW.id;
END;

-- Touch updated_at on change. The WHEN guard stops the trigger recursing on
-- its own write (it only fires when updated_at wasn't itself just changed).
CREATE TRIGGER IF NOT EXISTS tasks_touch_au AFTER UPDATE ON tasks
WHEN OLD.updated_at = NEW.updated_at
BEGIN
  UPDATE tasks SET updated_at=datetime('now') WHERE id=NEW.id;
END;

-- The "organized view" behind `task ls`: open work, priority then age.
CREATE VIEW IF NOT EXISTS task_board AS
  SELECT ident, status, priority, COALESCE(assignee,'-') AS assignee,
         title, COALESCE(created_by,'-') AS created_by, created_at, id
  FROM tasks
  WHERE status NOT IN ('done','cancelled') AND kind = 'standard'
  ORDER BY CASE priority
             WHEN 'urgent' THEN 0 WHEN 'high' THEN 1
             WHEN 'medium' THEN 2 ELSE 3 END,
           created_at;
SQL
}

# Create the group-writable tasks dir + db and apply the schema. Safe to call
# repeatedly; command functions call it first. If the dir is missing and we
# aren't root we can't create it (parent /var/lib/5dive is 2750), so emit a
# one-time bootstrap hint instead of a cryptic failure.
tasks_db_init() {
  require_sqlite
  # Everything created below (dir, db file) must come out group-writable.
  umask 0002
  if [[ ! -d "$TASKS_DIR" ]]; then
    if [[ $EUID -eq 0 ]]; then
      mkdir -p "$TASKS_DIR"
      chown root:claude "$TASKS_DIR"
      chmod 2770 "$TASKS_DIR"
    else
      fail "$E_PERMISSION" "tasks store not initialised — run once: sudo 5dive task init"
    fi
  fi
  # Apply the schema only when the db is uninitialised. Re-running it on every
  # command would take a write lock each time and, under concurrent agents,
  # collide ("database is locked"); a cheap read of sqlite_master takes only a
  # WAL read-lock, which never blocks writers. .timeout lets a genuine
  # first-run race serialise instead of erroring. stdout is discarded because
  # `PRAGMA journal_mode=WAL` echoes "wal".
  #
  # The probe itself must not be fatal: under `set -e` a failing assignment
  # from $(...) would abort the whole CLI silently (stderr is discarded), so a
  # failed probe is treated as "not initialised". The schema is all
  # IF NOT EXISTS, so re-applying it is harmless, and if the db really is
  # unusable the apply step below reports it through `fail`.
  local has
  has=$(sqlite3 -cmd ".timeout 5000" "$TASKS_DB" \
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='tasks' LIMIT 1;" 2>/dev/null) \
    || has=""
  if [[ "$has" != "1" ]]; then
    sqlite3 -cmd ".timeout 5000" "$TASKS_DB" < <(_tasks_schema) >/dev/null \
      || fail "$E_GENERIC" "failed to initialise tasks db at $TASKS_DB"
    chmod 0660 "$TASKS_DB" 2>/dev/null || true
  else
    _tasks_db_migrate
  fi
}

# Idempotent additive migrations for already-initialised stores. sqlite has
# no `ADD COLUMN IF NOT EXISTS`, so we check pragma_table_info first. Each
# migration is a one-shot check + ALTER; running it on every init is cheap
# (single PRAGMA read). Add new column migrations to the array below.
_tasks_db_migrate() {
  # Snapshot of the column names currently present on `tasks`, one per line.
  local existing
  existing=$(sqlite3 -cmd ".timeout 5000" "$TASKS_DB" \
             "SELECT name FROM pragma_table_info('tasks');" 2>/dev/null)

  # Each entry: "<column> <type>". Add new additive columns here; existing
  # rows backfill to NULL. Pure expand (no contract), so old queries/rows are
  # untouched and a downgrade still reads/writes the table fine.
  local -a additive=(
    'result TEXT'
    'need_type TEXT'
    'ask TEXT'
    'need_options TEXT'
    'recommend TEXT'
    'need_answer TEXT'
    'need_answered_at TEXT'
    "kind TEXT NOT NULL DEFAULT 'standard'"
    'schedule TEXT'
    'last_fired_at TEXT'
    'from_template_id INTEGER'
    'fresh INTEGER'
  )

  local spec name
  for spec in "${additive[@]}"; do
    name="${spec%% *}"          # first word of the spec is the column name
    if grep -qx "$name" <<<"$existing"; then
      continue                  # already there — nothing to do
    fi
    # Best-effort: a concurrent migrator may have added it a moment ago.
    sqlite3 -cmd ".timeout 5000" "$TASKS_DB" \
      "ALTER TABLE tasks ADD COLUMN ${spec};" >/dev/null 2>&1 || true
  done
}

# Per-connection setup, passed via -cmd / .timeout so it produces NO output
# rows (an inline `PRAGMA busy_timeout=N;` echoes the value, which would
# corrupt anything that captures a query result). .timeout makes concurrent
# agent writers retry instead of erroring with "database is locked";
# foreign_keys=ON enables the ON DELETE cascades.
db() {
  # $1 = SQL to run against the tasks db; output is sqlite3's default format.
  local -a conn_setup=(-cmd ".timeout 5000" -cmd "PRAGMA foreign_keys=ON")
  umask 0002
  sqlite3 "${conn_setup[@]}" "$TASKS_DB" "$1"
}

# Formatted read: dbfmt <sqlite-flag> "<sql>"  (e.g. -box, -json, -line).
dbfmt() {
  # $1 = sqlite3 output-mode flag, $2 = SQL. Same connection setup as `db`.
  local -a conn_setup=(-cmd ".timeout 5000" -cmd "PRAGMA foreign_keys=ON")
  umask 0002
  sqlite3 "${conn_setup[@]}" "$1" "$TASKS_DB" "$2"
}

# Resolve a task ref (numeric id or DIVE-N) into the global RESOLVED_TASK_ID,
# or fail. Sets a global rather than echoing so the `fail` error path runs in
# the caller's shell (not a $() subshell) — otherwise a --json error envelope
# would be captured into the caller's var instead of reaching stdout. Shape is
# validated before anything touches SQL.
RESOLVED_TASK_ID=""
resolve_task_id() {
  # $1 = task ref: a bare number or DIVE-<number> (prefix is case-insensitive).
  # On success sets the global RESOLVED_TASK_ID; otherwise calls `fail`.
  local ref="$1" id
  if [[ "$ref" =~ ^[0-9]+$ ]]; then
    id="$ref"
  elif [[ "$ref" =~ ^[Dd][Ii][Vv][Ee]-([0-9]+)$ ]]; then
    id="${BASH_REMATCH[1]}"
  else
    fail "$E_VALIDATION" "bad task ref '$ref' (expected <number> or DIVE-<number>)"
  fi
  # $id is digits-only at this point, so interpolating it into SQL is safe.
  local found
  found=$(db "SELECT id FROM tasks WHERE id=${id};")
  [[ -n "$found" ]] || fail "$E_NOT_FOUND" "no such task: $ref"
  # Publish the id as the database stores it, not as the user typed it: a
  # zero-padded ref such as "007" or "DIVE-08" matches row 7/8 in SQLite, but
  # handing "08" on would later read as invalid octal in shell arithmetic and
  # render as "DIVE-08" in output.
  RESOLVED_TASK_ID="$found"
}

# Who is acting: --from wins, else infer from SUDO_USER (sudo path) or $USER
# (agent running directly as agent-<x>), else the literal "cli".
task_actor() {
  # Prints the acting identity (no trailing newline). $1 = optional --from value.
  local explicit="${1:-}"
  if [[ -n "$explicit" ]]; then
    printf '%s' "$explicit"
    return
  fi

  # Next preference: whoever invoked us through sudo.
  local via_sudo
  via_sudo=$(auto_sender_from_sudo)
  if [[ -n "$via_sudo" ]]; then
    printf '%s' "$via_sudo"
    return
  fi

  # Then: an agent running the CLI directly as its own `agent-<x>` user.
  local login="${USER:-$(id -un 2>/dev/null)}"
  if [[ "$login" == agent-* ]]; then
    printf '%s' "${login#agent-}"
    return
  fi

  printf 'cli'
}

# Enum validators: return 0 when $1 is exactly one of the allowed words.
valid_task_status() {
  case "$1" in
    todo|in_progress|blocked|done|cancelled) return 0 ;;
  esac
  return 1
}

valid_task_priority() {
  case "$1" in
    low|medium|high|urgent) return 0 ;;
  esac
  return 1
}

valid_need_type() {
  case "$1" in
    decision|secret|approval|manual) return 0 ;;
  esac
  return 1
}

# Shape-check a 5-field cron expression (minute hour dom month dow). This is a
# lightweight gate at create time — exactly five whitespace-separated fields,
# each built only from cron field chars ([0-9*,/-]). It does NOT validate ranges
# (e.g. minute 0-59); the step-2 materializer / system cron is the authority on
# semantics. Rejects obvious garbage so a typo can't silently store a never-
# firing template.
valid_cron_expr() {
  # $1 = candidate cron expression. Returns 0 when it is exactly five
  # whitespace-separated fields made only of [0-9*,/-]; 1 otherwise.
  local expr="$1" f
  local -a fields
  # `read` consumes only the first line, so without this guard a value like
  # "* * * * *<newline>anything" would be judged on its first line alone and
  # accepted. A cron expression is a single line by definition.
  [[ "$expr" != *$'\n'* ]] || return 1
  # read -a splits on whitespace without pathname-expanding the '*' fields.
  read -r -a fields <<<"$expr"
  [[ ${#fields[@]} -eq 5 ]] || return 1
  for f in "${fields[@]}"; do
    [[ "$f" =~ ^[0-9*,/-]+$ ]] || return 1
  done
  return 0
}

# Does a single cron field match an integer value? Supports the cron grammar the
# DIVE-138 materializer needs: '*', int, list a,b,c, range a-b, step */n and
# a-b/n. <value> is a date component (already an int). Returns 0 on match. Uses
# `read -ra` (not `for x in $field`) to split on commas WITHOUT triggering
# pathname expansion on the '*' wildcard. All numbers forced base-10 (10#) so a
# zero-padded date component like "08"/"09" isn't read as bad octal.
_cron_field_match() {
  # $1 = one cron field, $2 = integer value to test. Returns 0 on match.
  local field="$1" val="$2" part lo hi step
  val=$((10#$val))
  local -a parts
  # Split the list on commas via read -a so a '*' element is never globbed.
  IFS=',' read -ra parts <<<"$field"
  for part in "${parts[@]}"; do
    step=1
    if [[ "$part" == */* ]]; then
      step="${part##*/}"; part="${part%%/*}"
      [[ "$step" =~ ^[0-9]+$ ]] || continue
      # Force base-10 like every other number here: a zero-padded step such as
      # "08"/"09" would otherwise be rejected by bash arithmetic as bad octal
      # and the element would silently never match.
      step=$((10#$step))
      (( step > 0 )) || continue
    fi
    if [[ "$part" == "*" ]]; then
      (( step == 1 )) && return 0          # bare '*' — everything matches
      (( val % step == 0 )) && return 0    # '*/n' — every nth from 0
      continue
    fi
    if [[ "$part" == *-* ]]; then
      [[ "${part%%-*}" =~ ^[0-9]+$ && "${part##*-}" =~ ^[0-9]+$ ]] || continue
      lo=$((10#${part%%-*})); hi=$((10#${part##*-}))
    elif [[ "$part" =~ ^[0-9]+$ ]]; then
      lo=$((10#$part)); hi=$lo
    else
      continue                             # malformed element — ignore it
    fi
    (( val < lo || val > hi )) && continue
    (( (val - lo) % step == 0 )) && return 0   # steps count from the range start
  done
  return 1
}

# Day-of-week match with Sunday=0=7 (cron allows both). %w gives 0-6 (0=Sun).
_cron_dow_match() {
  # $1 = day-of-week cron field, $2 = weekday number to test.
  local field="$1" v="$2"
  if _cron_field_match "$field" "$v"; then
    return 0
  fi
  # Sunday may be written as 7 in the field: retry a 0 value as 7.
  if (( v == 0 )) && _cron_field_match "$field" 7; then
    return 0
  fi
  return 1
}

# Does 5-field cron <expr> fire at <epoch> (unix seconds)? Implements standard
# cron semantics incl. the dom/dow OR-rule: when BOTH day-of-month and
# day-of-week are restricted (neither is '*'), the row fires if EITHER matches;
# otherwise every field ANDs. Backs the DIVE-138 heartbeat materializer.
# Returns 0 if due at that minute, 1 otherwise.
_cron_matches() {
  # $1 = 5-field cron expression, $2 = unix epoch seconds (evaluated in UTC).
  local expr="$1" epoch="$2"
  local -a spec
  read -r -a spec <<<"$expr"
  if (( ${#spec[@]} != 5 )); then
    return 1
  fi

  # Break the instant into the five components cron looks at.
  local now_min now_hour now_dom now_mon now_dow
  read -r now_min now_hour now_dom now_mon now_dow \
    < <(date -u -d "@${epoch}" +'%M %H %d %m %w' 2>/dev/null)
  if [[ -z "$now_dow" ]]; then
    return 1          # date could not parse the epoch
  fi

  local min_f="${spec[0]}" hour_f="${spec[1]}" dom_f="${spec[2]}"
  local mon_f="${spec[3]}" dow_f="${spec[4]}"

  # Minute, hour and month always have to match.
  if ! _cron_field_match "$min_f" "$now_min"; then return 1; fi
  if ! _cron_field_match "$hour_f" "$now_hour"; then return 1; fi
  if ! _cron_field_match "$mon_f" "$now_mon"; then return 1; fi

  if [[ "$dom_f" == "*" || "$dow_f" == "*" ]]; then
    # At most one of the day fields is restricted: both must match (AND).
    if ! _cron_field_match "$dom_f" "$now_dom"; then return 1; fi
    if ! _cron_dow_match "$dow_f" "$now_dow"; then return 1; fi
    return 0
  fi

  # Both day fields restricted: either one matching is enough (OR-rule).
  if _cron_field_match "$dom_f" "$now_dom" || _cron_dow_match "$dow_f" "$now_dow"; then
    return 0
  fi
  return 1
}

# Indent every line of stdin by two spaces. Used for the nested lists in
# `task show` / `org show`; a plain `printf '  %s\n' "$var"` only indents the
# first line, and unquoting splits values that contain spaces (task titles).
indent2() {
  # The `|| [[ -n ... ]]` clause keeps a final line that has no trailing
  # newline (e.g. `printf '%s' "$x" | indent2`); plain `read` returns non-zero
  # on it and the line would otherwise be dropped. The line buffer is local so
  # nothing leaks into the caller's scope.
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '  %s\n' "$line"
  done
}

# -------- auth status (per type, default profile) --------

# auth_creds_present <type> — non-empty stdout if the default-profile credential
# file for this type has a usable token/key. Handles both env-file format
# (anthropic.env with CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY) and the
# JSON sentinels codex/opencode write on login.
auth_creds_present() {
  # $1 = agent type. Returns 0 when a usable default-profile credential is
  # found, 1 otherwise. Produces no output.
  local type="$1"
  local sentinel="${TYPE_AUTH[$type]:-}"
  if [[ -z "$sentinel" ]]; then
    return 1
  fi

  # Sentinel is either "<path>" or "<path>:<key>".
  local path="${sentinel%%:*}" key="${sentinel##*:}"
  if [[ "$path" == "$key" ]]; then
    # No ":<key>" part — a non-empty file is enough.
    if [[ -s "$path" ]]; then
      return 0
    fi
  elif [[ -f "$path" ]]; then
    local val=""
    if [[ "$path" == *.env ]]; then
      # Any non-empty KEY=... in the env file counts — user may have written
      # ANTHROPIC_API_KEY instead of CLAUDE_CODE_OAUTH_TOKEN and both are valid.
      val=$(grep -Ev '^\s*#' "$path" 2>/dev/null | grep -E '^[A-Z_]+=.+' | head -n1 || true)
    else
      # JSON sentinel: the key is looked up under the top-level .env object.
      val=$(jq -r --arg k "$key" '.env[$k] // empty' "$path" 2>/dev/null || true)
    fi
    if [[ -n "$val" ]]; then
      return 0
    fi
  fi

  # Fallback: types whose TYPE_AUTH sentinel is the OAuth state file
  # (codex) won't see an api-key written by `agent auth set`, which
  # lands in /etc/5dive/connectors/<TYPE_API_FILE>. Recognise that file as
  # equally-valid auth.
  local api_file="${TYPE_API_FILE[$type]:-}"
  if [[ -z "$api_file" ]]; then
    return 1
  fi
  local api_path="${CONNECTORS_DIR}/${api_file}"
  if [[ ! -f "$api_path" ]]; then
    return 1
  fi
  local api_val
  api_val=$(grep -Ev '^\s*#' "$api_path" 2>/dev/null | grep -E '^[A-Z_]+=.+' | head -n1 || true)
  [[ -n "$api_val" ]]
}

# auth_probe_one <type> [profile] — run a short CLI invocation to verify the
# stored creds still work against the provider API. Returns 0 (ok) / 1 (stale)
# / 2 (no probe configured, caller should fall back to file-presence).
#
# The probe runs as user `claude` with a 5s cap. We source the same env files
# systemd loads for 5dive-agent@.service so the CLI sees CLAUDE_CODE_OAUTH_TOKEN
# / ANTHROPIC_API_KEY / OPENAI_API_KEY — otherwise `claude --print ping` exits
# non-zero with "Not logged in" even when the stored token is perfectly valid.
# When <profile> is set, the profile's combined.env is loaded LAST so it
# overrides the shared defaults (same precedence systemd uses).
#
# claude's `--print` exit code flips when stdout isn't a TTY, so we can't rely
# on `$?` alone — instead we capture stdout+stderr and grep for known "not
# authed" patterns. A rate-limit response is a valid-but-throttled token and
# counts as ok (we want to distinguish stale creds, not provider health).
auth_probe_one() {
  # $1 = agent type, $2 = optional auth profile name.
  # Returns 0 (ok) / 1 (stale) / 2 (no probe configured for this type).
  local type="$1" profile="${2:-}"
  local probe="${TYPE_PROBE[$type]:-}"
  [[ -n "$probe" ]] || return 2

  # Env files to source before the probe, lowest precedence first; the
  # profile's combined.env (when given) goes last so it overrides the shared
  # defaults.
  local -a env_files=(
    /etc/5dive/connectors/anthropic.env
    /etc/5dive/connectors/openai.env
  )
  if [[ -n "$profile" ]]; then
    env_files+=("${AUTH_PROFILES_DIR}/${profile}/combined.env")
  fi

  # Build the prelude for `bash -lc`. Paths are %q-quoted because they are
  # spliced into a command string: a profile name with spaces or shell
  # metacharacters must stay a single word instead of being re-parsed.
  # `f`/`quoted` are local so the loop doesn't clobber a caller's variables.
  local env_src='' f quoted
  for f in "${env_files[@]}"; do
    printf -v quoted '%q' "$f"
    env_src+="[ -r $quoted ] && set -a && . $quoted && set +a; "
  done

  local out
  out=$(sudo -u claude -i timeout 5s bash -lc "${env_src}${probe}" 2>&1 || true)
  # Known "stale creds" signals from claude's --print output. We match on
  # substrings rather than exit codes because --print flips its exit code
  # based on whether stdout is a TTY. Rate-limit / usage-limit responses
  # are NOT in here — those mean the token works, the account is throttled.
  if grep -qiE 'not logged in|please run /login|invalid api key|invalid bearer token|failed to authenticate|authentication.{0,10}failed|unauthorized|\b401\b' <<<"$out"; then
    return 1
  fi
  return 0
}

# auth_status_one <type> [--no-probe]
# States:
#   unknown       — unrecognized type
#   not_installed — CLI binary missing on disk
#   needs_login   — no credential file / empty credential file
#   stale         — creds exist but the live probe rejected them
#   ok            — creds exist AND (probe passed OR no probe configured)
#
# Pass --no-probe for callers that only want a cheap file check (e.g. the
# bulk cmd_auth_status loop, which runs before the dashboard renders and
# shouldn't block for N*5s). Set FIVEDIVE_AUTH_PROBE=1 to force a probe.
auth_status_one() {
  # $1 = agent type, $2 = optional "--no-probe". Prints exactly one state word.
  local type="$1" probe_flag="${2:-}"
  is_known_type "$type" || { echo "unknown"; return; }
  [[ -x "${TYPE_BIN[$type]}" ]] || { echo "not_installed"; return; }
  # No sentinel configured = type doesn't require external auth (opencode ships
  # with free models). Report "ok" so the connect flow skips straight to create.
  [[ -n "${TYPE_AUTH[$type]:-}" ]] || { echo "ok"; return; }
  if ! auth_creds_present "$type"; then
    echo "needs_login"; return
  fi
  if [[ "$probe_flag" == "--no-probe" ]]; then
    echo "ok"; return
  fi
  # Capture the probe's status with `|| rc=$?`. A bare call followed by
  # `case "$?"` is fatal whenever errexit is active in this shell (direct
  # call, or `shopt -s inherit_errexit`): the non-zero "stale" / "no probe"
  # return would kill the process before anything is printed.
  local rc=0
  auth_probe_one "$type" || rc=$?
  case "$rc" in
    1) echo "stale" ;;
    *) echo "ok" ;;     # 0 = probe passed, 2 = no probe configured
  esac
}

cmd_auth_status() {
  # Usage: auth status [--probe|--no-probe] [--type=<type> | <type>]
  # Prints one status per type: a JSON envelope in --json mode, otherwise
  # sorted "type: status" lines.
  #
  # Default: skip the live probe so the bulk status call stays fast (<100ms).
  # --probe runs a real `<cli> --print ping` for each installed type and
  # surfaces "stale" when the stored creds no longer work. FIVEDIVE_AUTH_PROBE
  # lets the API layer opt in once without parsing cli args.
  local probe_flag="--no-probe"
  local t=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --probe)       probe_flag="" ;;
      --no-probe)    probe_flag="--no-probe" ;;
      --type=*)      t="${1#--type=}" ;;
      -*)            fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -n "$t" ]]; then
          fail "$E_USAGE" "extra arg: $1"
        fi
        t="$1"
        ;;
    esac
    shift
  done
  # The env var, when set to 0 or 1, has the final say over the flags.
  case "${FIVEDIVE_AUTH_PROBE:-}" in
    0) probe_flag="--no-probe" ;;
    1) probe_flag="" ;;
  esac

  local -a types=()
  if [[ -n "$t" ]]; then
    is_known_type "$t" || fail "$E_NOT_FOUND" "unknown type: $t"
    types=("$t")
  else
    types=("${!TYPE_BIN[@]}")
  fi

  # `type` and `s` are declared local: as a bare loop variable `type` used to
  # leak into (and overwrite) the global scope of the whole CLI.
  local out="{" first=1 type s
  for type in "${types[@]}"; do
    s=$(auth_status_one "$type" "$probe_flag")
    if (( first )); then first=0; else out+=","; fi
    out+="\"$type\":\"$s\""
  done
  out+="}"
  if (( JSON_MODE )); then
    echo "$out" | jq -c '{ok:true, data: .}'
  else
    echo "$out" | jq -r 'to_entries[] | "\(.key): \(.value)"' | sort
  fi
}

cmd_install() {
  # Usage: 5dive agent install <type>
  # Installs the CLI binary for <type> by running its TYPE_INSTALL recipe as
  # user `claude`. No-op (reported as alreadyInstalled) when the binary is
  # already executable.
  local type="${1:-}"
  [[ -n "$type" ]] || fail "$E_USAGE" "usage: 5dive agent install <type>"
  is_known_type "$type" || fail "$E_NOT_FOUND" "unknown type: $type"
  local bin="${TYPE_BIN[$type]}"
  if [[ -x "$bin" ]]; then
    ok "$type already installed at $bin" \
       '{type:$t, bin:$b, installed:true, alreadyInstalled:true}' \
       --arg t "$type" --arg b "$bin"
    return 0
  fi
  local recipe="${TYPE_INSTALL[$type]:-}"
  [[ -n "$recipe" ]] || fail "$E_NOT_INSTALLED" "no installer configured for '$type' — please install $bin manually"
  step "Installing $type (as user 'claude')"
  # -i loads claude's login env (nvm, XDG redirects, etc.). Installer output
  # goes to stderr so a --json envelope on stdout stays parseable.
  # Capture the status explicitly: left bare, a failing recipe trips `set -e`
  # and the CLI exits with no error envelope and no message at all.
  local rc=0
  sudo -u claude -i bash -lc "$recipe" >&2 || rc=$?
  if (( rc != 0 )); then
    fail "$E_GENERIC" "$type installer failed (exit $rc) — check the installer output above"
  fi
  if [[ -x "$bin" ]]; then
    ok "$type installed at $bin" \
       '{type:$t, bin:$b, installed:true, alreadyInstalled:false}' \
       --arg t "$type" --arg b "$bin"
  else
    fail "$E_GENERIC" "$type install reported success but $bin still missing — investigate manually"
  fi
}

# ---- auth profile helpers ----
#
# A profile is a directory /var/lib/5dive/auth-profiles/<name>/ containing:
#   - combined.env      — key=value pairs merged into the agent's systemd env
#                         (CLAUDE_CODE_OAUTH_TOKEN, ANTHROPIC_API_KEY, OPENAI_API_KEY,
#                         etc.). Kept at mode 0640 root:claude (see
#                         ensure_profile_dir) so the live auth probe, which runs
#                         as user `claude`, can source it as well.
#   - <type>/           — optional per-type CLI config dir (e.g. claude/ used as
#                         CLAUDE_CONFIG_DIR) for profiles created via `auth login
#                         --profile=<name>` or the device-code flow.
#
# Per-agent binding: 5dive-agent@<name>.service reads
# /var/lib/5dive/agents.d/<name>-auth.env — a symlink to the profile's
# combined.env when the agent opted into a profile, missing otherwise.
ensure_profile_dir() {
  # $1 = profile name. Creates (or re-normalises) the profile directory and an
  # empty combined.env inside it, then prints the directory path. Root only.
  local name="$1"
  if ! valid_profile_name "$name"; then
    fail "$E_VALIDATION" "invalid profile name '$name' (lowercase letters/digits/_-, start letter, <=32 chars)"
  fi
  require_root

  local profile_dir="${AUTH_PROFILES_DIR}/${name}"
  mkdir -p "$profile_dir"
  # Ownership/mode are re-applied on every call, to the parent as well.
  chown root:claude "${AUTH_PROFILES_DIR}" "$profile_dir"
  chmod 2750 "${AUTH_PROFILES_DIR}" "$profile_dir"

  local combined="${profile_dir}/combined.env"
  if [[ ! -f "$combined" ]]; then
    : > "$combined"             # create empty; never truncate an existing file
  fi
  # 0640 root:claude so agent users (all in group `claude`) can read the
  # file directly when systemd loads it via EnvironmentFile, AND so the live
  # auth probe (running as user `claude`) can source it to validate creds.
  # Same exposure as /etc/5dive/connectors/anthropic.env — if one profile's
  # token leaks to another agent user, they already shared the box.
  chown root:claude "$combined"
  chmod 640 "$combined"
  printf '%s\n' "$profile_dir"
}

# profile_type_dir <profile> <type> — per-type state dir under a profile.
# Side effect: creates the dir mode 2750 owner=claude. Idempotent. Used as
# the redirect target for whichever env var the type honours (CODEX_HOME,
# HERMES_HOME, CLAUDE_CONFIG_DIR, or HOME for openclaw).
profile_type_dir() {
  # $1 = profile, $2 = agent type. Prints <profiles-dir>/<profile>/<type>.
  local profile="$1" type="$2"
  if [[ -z "$profile" ]]; then
    fail "$E_GENERIC" "profile_type_dir: empty profile"
  fi
  if ! is_known_type "$type"; then
    fail "$E_GENERIC" "profile_type_dir: unknown type '$type'"
  fi
  local target="${AUTH_PROFILES_DIR}/${profile}/${type}"
  # Creation is best-effort: the path is printed even if `install` fails.
  install -d -m 2750 -o claude -g claude "$target" 2>/dev/null || true
  printf '%s\n' "$target"
}

# profile_type_env <profile> <type> — emits the KEY=VALUE env fragment that
# scopes the type's credential storage to the profile dir. Empty when profile
# is empty (default profile keeps writing to the shared /home/claude/.<type>).
# Designed for `env $(profile_type_env ...) <login_cmd>` plumbing.
profile_type_env() {
  # $1 = profile (may be empty), $2 = agent type. Prints "VAR=<dir>" with no
  # trailing newline; prints nothing for the default (empty) profile.
  local profile="$1" type="$2"
  if [[ -z "$profile" ]]; then
    return 0
  fi
  # Resolve (and create) the per-type dir first, as before, regardless of
  # whether the type turns out to have a redirect variable.
  local dir
  dir=$(profile_type_dir "$profile" "$type")

  # Which environment variable relocates this type's state.
  local var
  case "$type" in
    claude) var=CLAUDE_CONFIG_DIR ;;
    codex)  var=CODEX_HOME ;;
    hermes) var=HERMES_HOME ;;
    # openclaw's resolveStateDir uses $HOME/.openclaw. HOME redirect is the
    # only handle. antigravity (writes ~/.gemini/antigravity-cli/) and grok
    # (writes ~/.grok/) are the same shape — no per-tool *_HOME var to use.
    openclaw|antigravity|grok) var=HOME ;;
    *) return 1 ;;
  esac
  printf '%s=%s' "$var" "$dir"
}

# profile_type_auth_path <profile> <type> — absolute path to the credential
# sentinel for (profile, type). Empty profile returns TYPE_AUTH's shared
# default; non-empty returns the per-profile path corresponding to the
# state-root redirect that profile_type_env installs.
profile_type_auth_path() {
  # $1 = profile (may be empty), $2 = agent type. Prints the sentinel path.
  local profile="$1" type="$2"
  if [[ -z "$profile" ]]; then
    # Default profile: whatever TYPE_AUTH declares (possibly empty).
    printf '%s\n' "${TYPE_AUTH[$type]:-}"
    return
  fi

  # Credential location relative to the per-profile, per-type state dir.
  local rel
  case "$type" in
    codex|hermes) rel="auth.json" ;;
    # openclaw/antigravity/grok use HOME redirect so the credential lives
    # at the same relative path each tool would write under a real $HOME.
    openclaw)     rel=".openclaw/agents/main/agent/auth-profiles.json" ;;
    antigravity)  rel=".gemini/antigravity-cli/antigravity-oauth-token" ;;
    grok)         rel=".grok/auth.json" ;;
    # claude detection in cmd_auth_poll is log-grep-based, not file-mtime —
    # this entry is here for completeness/symmetry.
    claude)       rel=".credentials.json" ;;
    *) return 1 ;;
  esac
  printf '%s\n' "${AUTH_PROFILES_DIR}/${profile}/${type}/${rel}"
}

# paperclip_seed_for_type <type> <profile> — wire the host-default credential
# location (the path the `claude` Linux user reads) to a profile's credential
# file, so paperclipai (which runs as user `claude`) and any other host-level
# CLI invocation picks up the same auth as the agent does.
#
# Symlinks for codex/hermes/openclaw (file-based auth); env-file write
# for claude (token-via-env). Each per-type case skips when the host-default
# already holds a real (non-symlink) credential — manual host-level logins
# always win over the auto-seed. opencode has no auth and is a no-op.
#
# Idempotent: re-running with the same (type, profile) is a no-op; with a
# different profile, replaces the symlink so paperclip follows the new agent.
paperclip_seed_for_type() {
  # $1 = agent type, $2 = auth profile name. Always returns 0; every "nothing
  # to do" condition (empty args, missing profile dir, missing credential
  # file, host already authed) is a silent early return.
  local type="$1" profile="$2"
  [[ -n "$type" && -n "$profile" ]] || return 0
  local pdir="${AUTH_PROFILES_DIR}/${profile}/${type}"
  [[ -d "$pdir" ]] || return 0
  # Service-level fixups (PATH, sandbox env vars) are independent of which
  # type triggered the seed — write them once per call and let
  # _paperclip_ensure_runtime_drop_in skip the restart when content matches.
  _paperclip_ensure_runtime_drop_in
  case "$type" in
    codex|hermes)
      # File-based auth: link the profile's auth.json into /home/claude/.<type>/.
      local src="${pdir}/auth.json"
      [[ -e "$src" ]] || return 0
      install -d -m 2770 -o claude -g claude "/home/claude/.${type}"
      _paperclip_link_file "$src" "/home/claude/.${type}/auth.json"
      # hermes also pins model.provider/base_url/default in config.yaml;
      # without it the host CLI lands at the "hermes setup" first-run prompt.
      if [[ "$type" == "hermes" && -e "${pdir}/config.yaml" ]]; then
        _paperclip_link_file "${pdir}/config.yaml" "/home/claude/.hermes/config.yaml"
      fi
      # codex needs an explicit sandbox/approval config to skip its first-run
      # trust prompt and the bubblewrap-fallback path that makes paperclip's
      # "respond with hello" probe time out. Mirrors the per-agent config
      # 5dive-agent-start.sh writes for codex agents, but keyed on /home/
      # claude (paperclipai.service's WorkingDirectory).
      # NOTE(review): unlike the symlinks above, config.toml is rewritten
      # unconditionally on every seed — any hand-edited host config.toml is
      # replaced. Confirm that is intended.
      if [[ "$type" == "codex" ]]; then
        local cfg=/home/claude/.codex/config.toml
        cat > "$cfg" <<'TOML'
approval_policy = "never"
sandbox_mode = "danger-full-access"
check_for_update_on_startup = false

[projects."/home/claude"]
trust_level = "trusted"
TOML
        chown claude:claude "$cfg"
        chmod 0600 "$cfg"
      fi
      ;;
    openclaw)
      # The profile dir doubles as openclaw's $HOME, so its state sits under
      # <pdir>/.openclaw; mirror the directory chain on the host and link the
      # credential file (plus openclaw.json when the profile has one).
      local osrc="${pdir}/.openclaw"
      local oauth="${osrc}/agents/main/agent/auth-profiles.json"
      [[ -e "$oauth" ]] || return 0
      install -d -m 2770 -o claude -g claude \
        /home/claude/.openclaw \
        /home/claude/.openclaw/agents \
        /home/claude/.openclaw/agents/main \
        /home/claude/.openclaw/agents/main/agent
      _paperclip_link_file "$oauth" "/home/claude/.openclaw/agents/main/agent/auth-profiles.json"
      [[ -e "${osrc}/openclaw.json" ]] \
        && _paperclip_link_file "${osrc}/openclaw.json" "/home/claude/.openclaw/openclaw.json"
      ;;
    claude)
      # claude reads CLAUDE_CODE_OAUTH_TOKEN from env, not a file — copy the
      # token from the profile's combined.env into the anthropic.env that
      # paperclipai.service already loads via EnvironmentFile=. Skip when
      # anthropic.env already has any auth var (preserves manual host login).
      local pcombined="${AUTH_PROFILES_DIR}/${profile}/combined.env"
      [[ -s "$pcombined" ]] || return 0
      local target="${CONNECTORS_DIR}/anthropic.env"
      if [[ -s "$target" ]] \
          && grep -qE '^(CLAUDE_CODE_OAUTH_TOKEN|ANTHROPIC_API_KEY)=' "$target"; then
        return 0
      fi
      # Only the first matching auth line of the profile is copied, and the
      # target file is replaced wholesale (`>`), not appended to.
      local line
      line=$(grep -E '^(CLAUDE_CODE_OAUTH_TOKEN|ANTHROPIC_API_KEY)=' "$pcombined" | head -1 || true)
      [[ -n "$line" ]] || return 0
      install -d -m 0750 -o root -g claude "$CONNECTORS_DIR"
      printf '%s\n' "$line" > "$target"
      chown root:claude "$target"
      chmod 0640 "$target"
      # Restart paperclipai so the new env var lands in its process. CLIs
      # invoked per-call (codex/hermes/openclaw) pick up new symlinks
      # without a restart, so we only restart for claude's env-var path.
      systemctl is-active --quiet paperclipai 2>/dev/null \
        && systemctl restart paperclipai >/dev/null 2>&1 || true
      ;;
    # Anything else (including opencode, which has no auth) is a no-op.
    opencode|*) return 0 ;;
  esac
  return 0
}

# Internal helper: replace <link> with a symlink to <src>, but only when the
# current target is missing or already a symlink (never clobber a real file).
_paperclip_link_file() {
  # $1 = symlink target (source file), $2 = path of the link to (re)create.
  local src="$1" link="$2"
  # A real file or directory at <link> is left untouched.
  if [[ -e "$link" ]] && [[ ! -L "$link" ]]; then
    return 0
  fi
  ln -sfn "$src" "$link"
  # -h: chown the link itself, not what it points at. Best-effort.
  chown -h claude:claude "$link" 2>/dev/null || true
}

# Internal helper: keep a 5dive-managed drop-in for paperclipai.service that
# patches PATH so paperclip's hello probes can find the agent binaries:
#   PATH — base unit's PATH omits /home/claude/.local/bin, so `claude`
#          (lives there) isn't found.
# Idempotent: skips the daemon-reload + restart when on-disk content matches
# what we'd write.
_paperclip_ensure_runtime_drop_in() {
  # Nothing to do if `systemctl list-unit-files` doesn't succeed for the unit.
  if ! systemctl list-unit-files paperclipai.service >/dev/null 2>&1; then
    return 0
  fi

  local dropin_dir=/etc/systemd/system/paperclipai.service.d
  local dropin="${dropin_dir}/5dive.conf"
  install -d -m 0755 "$dropin_dir"

  # Desired drop-in body (no trailing newline, to compare against $(cat ...)).
  local want
  printf -v want '%s\n%s' \
    '[Service]' \
    'Environment=PATH=/home/claude/.local/bin:/home/claude/.nvm/versions/node/v24/bin:/home/claude/.bun/bin:/usr/local/bin:/usr/bin'

  # Already up to date — skip the write, daemon-reload and restart.
  if [[ -f "$dropin" ]]; then
    if [[ "$(cat "$dropin")" == "$want" ]]; then
      return 0
    fi
  fi

  printf '%s\n' "$want" > "$dropin"
  chmod 0644 "$dropin"
  systemctl daemon-reload >/dev/null 2>&1 || true
  # Only bounce the service when it is currently running; never fail on it.
  if systemctl is-active --quiet paperclipai 2>/dev/null; then
    systemctl restart paperclipai >/dev/null 2>&1 || true
  fi
}

# paperclip_unseed_for_profile <profile> — drop any /home/claude/.* symlinks
# pointing into this profile (called from cmd_rm so a deleted agent doesn't
# leave paperclip wedged on a vanished credential file). Best-effort; never
# fails the parent command. Re-seeds from another agent of the same type
# when one exists, so paperclip stays connected as long as any agent is up.
paperclip_unseed_for_profile() {
  # $1 = auth profile being removed. Always returns 0.
  local profile="$1"
  [[ -n "$profile" ]] || return 0
  local pdir="${AUTH_PROFILES_DIR}/${profile}"
  local link target
  # Drop every host-side symlink whose target lives inside this profile.
  while IFS= read -r -d '' link; do
    target=$(readlink "$link" 2>/dev/null || true)
    if [[ "$target" == "$pdir/"* ]]; then
      rm -f -- "$link" || true
    fi
  done < <(find /home/claude/.codex /home/claude/.hermes /home/claude/.openclaw \
              -maxdepth 6 -type l -print0 2>/dev/null)
  # Re-seed each type from the first remaining agent of that type (registry is
  # the source of truth). This is exactly what paperclip_seed_all_from_registry
  # does, so delegate instead of keeping a second copy of the loop. `|| true`
  # keeps the whole step best-effort.
  paperclip_seed_all_from_registry || true
  # Explicit success: the previous trailing `[[ ... ]] && ...` made the
  # function return 1 whenever the last type had no fallback profile, which
  # under `set -e` aborted the calling command.
  return 0
}

# paperclip_seed_all_from_registry — backfill the host-default credential
# locations from whatever agents already exist. Safe to run anytime; called
# from update.sh so existing customer VMs auto-fix on the next install.sh
# bash run, and from cmd_create to wire each fresh agent without per-type
# branching at the call site.
paperclip_seed_all_from_registry() {
  # For each seedable type, find the first registry agent of that type with a
  # non-empty authProfile and seed the host defaults from it. Returns 0 even
  # when the registry can't be read or no agent qualifies.
  local reg
  reg=$(registry_read 2>/dev/null) || return 0
  local t profile
  for t in codex hermes openclaw claude; do
    profile=$(jq -r --arg t "$t" '
      .agents | to_entries | map(select(.value.type == $t and (.value.authProfile // "") != ""))
      | .[0].value.authProfile // empty' <<<"$reg")
    if [[ -n "$profile" ]]; then
      paperclip_seed_for_type "$t" "$profile"
    fi
  done
  # Explicit success: with `[[ -n ... ]] && seed` as the last statement the
  # function returned 1 whenever the final type (claude) had no profiled
  # agent, and under `set -e` that killed the caller.
  return 0
}

# profile_set_var <profile> <VAR> — idempotent KEY=VALUE upsert in the
# profile's combined.env. The value is read from stdin to keep it out of argv.
# <profile> must be non-empty: there is no default-profile path here, callers
# targeting the default profile use write_default_connector instead.
profile_set_var() {
  local profile="$1" var="$2" dir file value tmp
  if [[ -z "$profile" ]]; then
    fail "$E_GENERIC" "profile_set_var: no default-profile path (caller must use write_default_connector)"
  fi
  dir=$(ensure_profile_dir "$profile")
  file="${dir}/combined.env"
  value=$(cat)
  # One line per variable: an embedded newline would smuggle extra KEY=VALUE
  # entries into the env file and survive later upserts of <VAR>.
  if [[ "$value" == *$'\n'* ]]; then
    fail "$E_VALIDATION" "profile_set_var: value for ${var} must be a single line"
  fi
  # Build the new file next to the old one and rename it into place, so
  # readers never see a half-written env file.
  tmp=$(mktemp "${file}.XXXXXX")
  local rc=0
  {
    # Keep every line except previous definitions of <VAR>; grep exiting
    # non-zero just means nothing was kept.
    { grep -v "^${var}=" "$file" 2>/dev/null || true; } > "$tmp" &&
      printf '%s=%s\n' "$var" "$value" >> "$tmp" &&
      chown root:claude "$tmp" &&
      chmod 640 "$tmp" &&
      mv "$tmp" "$file"
  } || rc=$?
  if (( rc != 0 )); then
    # Don't leave a stray copy of the credentials behind on failure.
    rm -f -- "$tmp"
    fail "$E_GENERIC" "profile_set_var: failed to update ${file}"
  fi
}

# write_default_connector <filename.env> <VAR> <VALUE> — replaces any prior
# entries for <VAR> in /etc/5dive/connectors/<filename.env>, then rewrites
# with correct perms. Value via stdin.
write_default_connector() {
  # $1 = connector file name, $2 = variable name; the value arrives on stdin.
  local fname="$1" var="$2"
  local value path kept=""
  value=$(cat)
  path="/etc/5dive/connectors/${fname}"
  # Carry over every existing line that is not a definition of <VAR>.
  if [[ -f "$path" ]]; then
    kept=$(grep -v "^${var}=" "$path" 2>/dev/null || true)
  fi
  # Stream the merged content into the helper, new definition last.
  {
    if [[ -n "$kept" ]]; then
      printf '%s\n' "$kept"
    fi
    printf '%s=%s\n' "$var" "$value"
  } | _write_connector "$fname"
}

# cmd_auth_set — API-key path that bypasses the browser/OAuth flow. Some
# users prefer pasting a key; also the only option for ANTHROPIC_API_KEY /
# Vertex / Bedrock style auth where there's no device-code flow.
#
# Usage:
#   5dive agent auth set <type> --api-key=<key> [--auth-profile=<name>]
#   echo -n "<key>" | 5dive agent auth set <type> --api-key=- [--auth-profile=<name>]
#
# The "-" sentinel reads the key from stdin — use that from the API layer
# so the key never touches process argv (and thus never shows up in `ps`).
cmd_auth_set() {
  # Store an API key (or a claude OAuth token) for agent type <type>.
  #
  # Usage: auth set <type> --api-key=<key|-> [--auth-profile=<name>] [--provider=<id>]
  #   --api-key=-       read the key from stdin instead of argv
  #   --auth-profile=   write into the named auth profile instead of the
  #                     default connector file under /etc/5dive/connectors
  #   --provider=       BYO route (hermes/openclaw/claude): hand the key to
  #                     apply_byo_provider, then restart the agents using it
  # Errors are reported through `fail` (E_USAGE / E_NOT_FOUND / E_VALIDATION /
  # E_GENERIC); success is reported through `ok`.
  local type="" api_key="" profile="" byo_provider=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --api-key=*)       api_key="${1#--api-key=}" ;;
      --auth-profile=*)  profile="${1#--auth-profile=}" ;;
      --provider=*)      byo_provider="${1#--provider=}" ;;
      -*)                fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -z "$type" ]]; then
          type="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi ;;
    esac
    shift
  done
  [[ -n "$type" ]] || fail "$E_USAGE" "usage: 5dive agent auth set <type> --api-key=<key> [--auth-profile=<name>] [--provider=<id>]"
  is_known_type "$type" || fail "$E_NOT_FOUND" "unknown type: $type"
  [[ -n "$api_key" ]] || fail "$E_USAGE" "--api-key=<key> required (use --api-key=- to read from stdin)"

  if [[ "$api_key" == "-" ]]; then
    if [[ -t 0 ]]; then
      fail "$E_USAGE" "--api-key=- expects the key on stdin, stdin is a TTY"
    fi
    api_key=$(cat)
  fi
  valid_api_key "$api_key" \
    || fail "$E_VALIDATION" "api key looks wrong (expected >=10 printable non-space chars)"

  # BYO path for hermes/openclaw: --provider=<canonical> picks which vendor's
  # api-key this is. Routes through apply_byo_provider, which writes into the
  # agent CLI's native state dir (hermes auth.json / openclaw auth-profiles.json)
  # rather than the env-var-style anthropic.env path the claude family uses.
  if [[ -n "$byo_provider" ]]; then
    [[ "$type" == "hermes" || "$type" == "openclaw" || "$type" == "claude" ]] \
      || fail "$E_VALIDATION" "--provider only supported for hermes/openclaw/claude (got: $type — drop --provider for env-var-style types)"
    # claude BYO writes the endpoint+model overrides into the profile's
    # combined.env, so it needs a profile (see _apply_byo_claude).
    if [[ "$type" == "claude" && -z "$profile" ]]; then
      fail "$E_USAGE" "claude BYO (--provider) requires --auth-profile=<name> (custom-provider creds are profile-scoped)"
    fi
    require_root
    if [[ -n "$profile" ]]; then
      valid_profile_name "$profile" \
        || fail "$E_VALIDATION" "invalid --auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
      ensure_profile_dir "$profile" >/dev/null
      # hermes/openclaw seed a per-type credential dir; claude stores its
      # override env vars directly in combined.env, so skip that step for it.
      [[ "$type" == "claude" ]] || profile_type_dir "$profile" "$type" >/dev/null
    fi
    apply_byo_provider "$type" "$byo_provider" "$api_key" "$profile"

    # Restart any running agents that consume this credential so the new
    # provider takes effect immediately. Without this, hermes/openclaw
    # gateways stay on the stale model.provider cached in memory at
    # startup (see 5dive-agent-start.sh's gateway-restart leg) and the
    # operator has to know to manually `agent restart` after every BYO
    # key swap. Match cmd_account_rename's restart loop semantics.
    local _affected
    if [[ -n "$profile" ]]; then
      _affected=$(registry_read | jq -r --arg p "$profile" \
        '.agents | to_entries[] | select(.value.authProfile == $p) | .key')
    else
      _affected=$(registry_read | jq -r \
        '.agents | to_entries[] | select((.value.authProfile // "") == "") | .key')
    fi
    local _agent
    while IFS= read -r _agent; do
      [[ -n "$_agent" ]] || continue
      step "Restarting 5dive-agent@${_agent}.service"
      # Restart is best-effort: a failing unit is reported, not fatal.
      systemctl restart "5dive-agent@${_agent}.service" >&2 2>&1 \
        || warn "restart of agent '$_agent' failed — check journalctl -u 5dive-agent@${_agent}"
    done <<<"$_affected"

    ok "api key stored for $type/$byo_provider${profile:+ (profile=$profile)}" \
       '{type:$t, provider:$pr, profile:$p}' \
       --arg t "$type" --arg pr "$byo_provider" --arg p "${profile:-}"
    return
  fi

  # Env-var-style path (claude/codex/opencode). hermes/openclaw fall
  # off the bottom because they're not in TYPE_API_FILE — by design: their
  # credentials live in native state dirs, not env files. Pass --provider
  # to route those through apply_byo_provider above.
  #
  # The `:-` defaults matter: the script runs under `set -u`, where expanding
  # a missing associative-array key aborts with "unbound variable" before the
  # explanatory `fail` below could ever run.
  local var="${TYPE_API_VAR[$type]:-}" fname="${TYPE_API_FILE[$type]:-}"
  case "$type" in
    claude)
      # OAuth tokens (sk-ant-oat01-…) go into a different variable than
      # plain API keys.
      if [[ "$api_key" =~ ^sk-ant-oat01- ]]; then
        var="CLAUDE_CODE_OAUTH_TOKEN"
      fi ;;
  esac
  [[ -n "$var" && -n "$fname" ]] \
    || fail "$E_GENERIC" "no api-key target configured for type '$type' (hermes/openclaw require --provider=<id>)"

  # Same profile-name check the BYO path and `auth login` apply, so an
  # arbitrary string never reaches profile_set_var.
  if [[ -n "$profile" ]]; then
    valid_profile_name "$profile" \
      || fail "$E_VALIDATION" "invalid --auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
  fi

  require_root
  # The key is piped on stdin so it never appears in a child's argv.
  if [[ -z "$profile" ]]; then
    step "Writing ${var} to /etc/5dive/connectors/${fname}"
    printf '%s' "$api_key" | write_default_connector "$fname" "$var"
  else
    step "Writing ${var} to auth profile '${profile}'"
    printf '%s' "$api_key" | profile_set_var "$profile" "$var"
  fi

  ok "api key stored for $type${profile:+ (profile=$profile)}" \
     '{type:$t, var:$v, profile:$p}' \
     --arg t "$type" --arg v "$var" --arg p "${profile:-}"
}

# Interactive (TTY) login for agent type <type>.
#
# Usage: auth login <type> [--auth-profile=<name>]
# For claude the OAuth token printed by `setup-token` is captured and stored
# (profile or default connector) and the function returns via `ok`. Every
# other wired type ends in `exec`, so this process is replaced by the vendor
# CLI and its exit code becomes ours. A known type with no branch below
# fails explicitly instead of returning success without doing anything.
cmd_auth_login() {
  local type="" profile=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --auth-profile=*) profile="${1#--auth-profile=}" ;;
      -*)               fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -z "$type" ]]; then
          type="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi ;;
    esac
    shift
  done
  [[ -n "$type" ]] || fail "$E_USAGE" "usage: 5dive agent auth login <type> [--auth-profile=<name>]"
  is_known_type "$type" || fail "$E_NOT_FOUND" "unknown type: $type"
  local bin="${TYPE_BIN[$type]}"
  [[ -x "$bin" ]] || fail "$E_NOT_INSTALLED" "$type not installed at $bin"
  if [[ -n "$profile" ]]; then
    valid_profile_name "$profile" \
      || fail "$E_VALIDATION" "invalid --auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
    require_root
    ensure_profile_dir "$profile" >/dev/null
    profile_type_dir "$profile" "$type" >/dev/null
  fi
  # `auth login` is a TTY handoff (exec replaces this process), so JSON output
  # is not meaningful here. We proceed regardless of JSON_MODE — the caller
  # gets whatever the underlying login tool emits, plus the process exit code.
  # For non-TTY / dashboard flows, use `auth start|poll|submit|cancel` instead.
  echo "Launching '$type' interactive login as user 'claude'${profile:+ (profile=$profile)}..." >&2

  # When profile is set, redirect the type's state-root env var so the
  # interactive login lands in the per-profile dir — same plumbing the
  # device-code flow uses (profile_type_env). Empty when no profile, in
  # which case the legacy shared /home/claude/.<type> remains the target.
  # $extra_env is expanded unquoted below on purpose: it is passed to `env`
  # as VAR=value words.
  local extra_env=""
  if [[ -n "$profile" ]]; then
    extra_env=$(profile_type_env "$profile" "$type") \
      || fail "$E_GENERIC" "profile_type_env: no plumbing for type '$type'"
  fi

  case "$type" in
    claude)
      # claude setup-token only displays the token on stdout — it doesn't
      # write anywhere on disk. Without capture, the wrapper has no way to
      # promote the token into combined.env (profile case) or anthropic.env
      # (default case), so `5dive account show` reports types=- even after
      # a successful login. Use `script(1)` to tee stdout to a log file
      # while keeping the interactive TTY working, then post-process with
      # extract_claude_token (shared with the device-code flow).
      #
      # CLAUDE_CONFIG_DIR (passed via $extra_env when profile is set)
      # overrides claude's default ~/.claude location for config; the
      # token itself isn't persisted by claude anywhere — only printed.
      require_root
      local log rc=0
      log=$(sudo -u claude mktemp /tmp/5dive-claude-login.XXXXXX.log)
      sudo -u claude -i env $extra_env script -fq -c "$bin setup-token" "$log" || rc=$?
      local tok=""
      tok=$(extract_claude_token "$log" 2>/dev/null || true)
      # Token transited through this file — shred so it doesn't linger.
      sudo -u claude shred -u "$log" 2>/dev/null || sudo -u claude rm -f "$log"
      if [[ -z "$tok" ]]; then
        # Prefer the CLI's own exit code when it failed; otherwise the run
        # looked clean but produced no token.
        if (( rc != 0 )); then
          fail "$E_GENERIC" "claude setup-token exited with code $rc"
        fi
        fail "$E_GENERIC" "no OAuth token found in setup-token output — login may have been cancelled"
      fi
      if [[ -n "$profile" ]]; then
        step "Writing CLAUDE_CODE_OAUTH_TOKEN to auth profile '${profile}'"
        printf '%s' "$tok" | profile_set_var "$profile" "CLAUDE_CODE_OAUTH_TOKEN"
      else
        step "Writing CLAUDE_CODE_OAUTH_TOKEN to /etc/5dive/connectors/anthropic.env"
        printf '%s' "$tok" | write_default_connector "anthropic.env" "CLAUDE_CODE_OAUTH_TOKEN"
      fi
      ok "claude OAuth token stored${profile:+ (profile=$profile)}" \
         '{type:$t, var:$v, profile:$p}' \
         --arg t "claude" --arg v "CLAUDE_CODE_OAUTH_TOKEN" --arg p "${profile:-}"
      return ;;
    hermes)
      # hermes signs in to OpenAI via its own device-code flow. Run it
      # interactively so the user can see the URL/code and Ctrl+C cleanly.
      exec sudo -u claude -i env $extra_env "$bin" auth add openai-codex ;;
    openclaw)
      # openclaw runs the same OpenAI /codex/device flow as hermes, but it
      # routes through `models auth login`. Pass --provider + --method to
      # skip the @clack/prompts wizard's two pickers (auth.ts:185-188 short-
      # circuits when both resolve), and --set-default to apply the
      # provider's defaultModel (openai-codex/gpt-5.5) so no follow-on
      # `models set` is needed. DISPLAY=:0 forces isRemote=false in
      # infra/remote-env.ts so the user-code prints inline (else openclaw
      # redacts it as "[shown on the local device only]").
      # Profile-scoped: extra_env is HOME=<profile_dir>/openclaw, so
      # resolveStateDir lands at $HOME/.openclaw inside the profile dir.
      if [[ -n "$extra_env" ]]; then
        local oc_home
        oc_home=$(profile_type_dir "$profile" "$type")
        # Best-effort pre-create of the nested state dirs; errors are ignored.
        install -d -m 2750 -o claude -g claude \
          "${oc_home}/.openclaw" \
          "${oc_home}/.openclaw/agents" \
          "${oc_home}/.openclaw/agents/main" \
          "${oc_home}/.openclaw/agents/main/agent" 2>/dev/null || true
      fi
      exec sudo -u claude -i env DISPLAY=:0 $extra_env "$bin" \
        models auth login --provider openai-codex --method device-code --set-default ;;
    codex)
      # CODEX_HOME (when profiled) overrides /etc/profile.d's default.
      exec sudo -u claude -i env $extra_env bash -lc 'codex login' ;;
    antigravity)
      # agy has no `auth login` subcommand — OAuth fires automatically the
      # first time the binary needs a token. Use the interactive TUI
      # (`--prompt-interactive ping`) rather than `--print ping`: print mode
      # caps the OAuth wait at 30s, but the interactive TUI waits indefinitely
      # for the pasted code. Run directly at the terminal here, the user picks
      # "Google OAuth", opens the printed URL, and pastes the authorization
      # code — no 30s race.
      exec sudo -u claude -i env $extra_env bash -lc 'agy --prompt-interactive ping' ;;
    grok)
      # grok has both an interactive UI OAuth (localhost callback, no good
      # for headless VMs) and a dedicated --device-auth flag (URL + 4-dash-4
      # user code, CLI polls). Use --device-auth so the same flow works
      # whether we're invoked from a real TTY or via the tmux+script(1)
      # bridge that powers the dashboard's device-code flow.
      exec sudo -u claude -i env $extra_env bash -lc 'grok login --device-auth' ;;
    opencode) exec sudo -u claude -i bash -lc 'opencode auth login' ;;
    *)
      # Known type without an interactive branch: fail loudly rather than
      # print "Launching…" and exit 0 having done nothing.
      fail "$E_VALIDATION" "no interactive login flow for type '$type' — use 'auth set --api-key' instead" ;;
  esac
}

# -------- non-TTY device-code flow (dashboard-driven) --------
#
# Lifecycle:
#   start   -> spawn the CLI's device-code login (claude setup-token / codex
#              login --device-auth / ...) in a detached tmux session owned by
#              `claude`, teed through script(1) into login.log so we can grep
#              OAuth URL / one-time code / success markers. Returns a session id.
#   poll    -> report state (pending_url|awaiting_code|ok|expired|error). The
#              dashboard calls this on a timer and displays whatever fields
#              are populated (url + optional code while awaiting_code, error on
#              error).
#   submit  -> claude only: feed the user-pasted callback code into the
#              tmux session. codex/hermes/openclaw don't have a submit
#              step — the CLI polls OpenAI itself and writes its credential
#              file on success, which poll detects via file mtime.
#   cancel  -> kill the tmux session.
#
# Wired for: claude (Anthropic setup-token, url + pasted callback code),
# codex/hermes/openclaw (OpenAI /codex/device device-auth, url + displayed
# one-time code, no callback paste), grok (xAI device-auth, url + displayed
# one-time code) and antigravity (Google OAuth url scraped from the TUI
# pane). Every non-claude type uses mtime-based success detection on its
# CLI's credential file. opencode still falls back to TTY `auth login` or
# `auth set --api-key`.

# Gate for the device-code commands: insist on root, then make sure the
# sessions directory exists as root:claude with mode 2750.
require_auth_session_root() {
  require_root
  local sessions_root="$AUTH_SESSIONS_DIR"
  mkdir -p "$sessions_root"
  chown root:claude "$sessions_root"
  chmod 2750 "$sessions_root"
}

# Resolve a session id to its directory under AUTH_SESSIONS_DIR and print it.
# Ids must be exactly 16 lowercase hex characters, which also shuts out any
# path-traversal input; a malformed id fails with E_VALIDATION and an id with
# no directory fails with E_NOT_FOUND.
auth_session_dir() {
  local session_id="$1" session_path
  if ! [[ "$session_id" =~ ^[0-9a-f]{16}$ ]]; then
    fail "$E_VALIDATION" "invalid session id"
  fi
  session_path="${AUTH_SESSIONS_DIR}/${session_id}"
  if [[ ! -d "$session_path" ]]; then
    fail "$E_NOT_FOUND" "no such auth session: $session_id"
  fi
  printf '%s\n' "$session_path"
}

# Print the OAuth URL found in a claude setup-token PTY log ($1).
#
# claude v2.1.119 shows the URL twice: once as the target of an OSC 8
# hyperlink (`ESC]8;id=<id>;<URL>BEL<text>ESC]8;;BEL`, contiguous) and once
# as visible text, hard-wrapped over several lines with CSI cursor escapes
# between the pieces. The hyperlink target is tried first because it is
# unbroken; a plain-text scan over the de-escaped log is the fallback for
# versions that drop the OSC 8 wrapper.
#
# Python does the scanning: awk/grep struggled with the BEL delimiter and
# stopped mid-URL at CSI escapes injected by the animation frames.
# Returns 1 when the log is missing or empty; otherwise returns 0 whether or
# not a URL was found (python's stderr and exit status are discarded).
extract_claude_url() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  python3 - "$pty_log" <<'PY' 2>/dev/null || true
import pathlib
import re
import sys

# OSC 8 hyperlink: ESC ] 8 ; <params> ; <URL> BEL ... -- the URL runs up to
# the BEL (\x07) that precedes the visible link text.
OSC8_LINK = re.compile(rb'\x1b\]8;[^;]*;(https://claude\.com/[^\x07]+)\x07')
CSI_SEQ = re.compile(rb'\x1b\[[0-9;]*[A-Za-z]')
BEL_OR_CR = re.compile(rb'[\x07\r]')
PLAIN_URL = re.compile(rb'https://claude\.com/[A-Za-z0-9._~:/?#=&%+_-]{20,600}')


def find_url(raw):
    hit = OSC8_LINK.search(raw)
    if hit is not None:
        return hit.group(1)
    # Fallback: drop CSI sequences, then BEL/CR, and look for a bare URL.
    stripped = BEL_OR_CR.sub(b'', CSI_SEQ.sub(b'', raw))
    hit = PLAIN_URL.search(stripped)
    if hit is not None:
        return hit.group(0)
    return None


found = find_url(pathlib.Path(sys.argv[1]).read_bytes())
if found is not None:
    print(found.decode('utf-8', 'replace'))
PY
}

# Print the long-lived OAuth token (`sk-ant-oat01-…`) from a setup-token
# login log ($1). CSI escapes and carriage returns are stripped first; of all
# matches the last one wins, and a trailing "Store" that got glued onto the
# match is trimmed off. Returns 1 when the log is missing or empty.
extract_claude_token() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$pty_log" \
    | grep -oE 'sk-ant-oat01-[A-Za-z0-9_-]+' \
    | tail -n 1 \
    | sed -e 's/Store$//'
}

# Print the first xAI device-auth URL from a `grok login --device-auth` log
# ($1). grok prints `https://accounts.x.ai/oauth2/device?user_code=<4-dash-4>`
# and renders it with colour, so CSI escapes (and CRs) are removed before
# matching. The URL is the canonical anchor; the separately shown user code
# is handled by extract_grok_code. Returns 1 when the log is missing or empty.
extract_grok_url() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$pty_log" \
    | grep -oE 'https://accounts\.x\.ai/oauth2/device\?[A-Za-z0-9._~:/?#=&%+_-]+' \
    | head -n 1
}

# Print the first grok user code from a login log ($1): 4-dash-4 uppercase
# alphanumerics between word boundaries. Codex codes are 4-dash-5, so the
# two extractors are deliberately separate — each length rejects the other
# shape. Returns 1 when the log is missing or empty.
extract_grok_code() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$pty_log" \
    | grep -oE '\b[0-9A-Z]{4}-[0-9A-Z]{4}\b' \
    | head -n 1
}

# Print the first Google OAuth URL from antigravity output ($1).
# antigravity announces `Authentication required. Please visit the URL to log
# in:` and then shows an accounts.google.com URL (redirect_uri points at
# antigravity.google/oauth-callback) followed by a paste-the-code prompt.
# The match is anchored on the accounts.google.com prefix and accepts both
# /o/oauth2/auth and /o/oauth2/v2/auth, with a 40–2000 char query string.
# Returns 1 when the input file is missing or empty.
extract_antigravity_url() {
  local capture="$1"
  if [[ ! -s "$capture" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$capture" \
    | grep -oE 'https://accounts\.google\.com/o/oauth2/(v2/)?auth\?[A-Za-z0-9._~:/?#=&%+_-]{40,2000}' \
    | head -n 1
}

# Print the first OpenAI device URL from a `codex login --device-auth` style
# log ($1). The CLI prints a static device URL plus a one-time code such as
# `06LC-O1CRK`, both wrapped in CSI colour escapes, hence the strip pass. The
# URL is hard-coded today, but it is still parsed so a future release that
# personalises it keeps working without a change here.
# Returns 1 when the log is missing or empty.
extract_codex_url() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$pty_log" \
    | grep -oE 'https://auth\.openai\.com/codex/device[A-Za-z0-9._~:/?#=&%+-]*' \
    | head -n 1
}

# Print the first codex one-time code from a login log ($1): 4-dash-5
# uppercase alphanumerics. Word boundaries on both sides keep it from firing
# on hex snippets embedded in longer strings.
# Returns 1 when the log is missing or empty.
extract_codex_code() {
  local pty_log="$1"
  if [[ ! -s "$pty_log" ]]; then
    return 1
  fi
  sed -e 's/\x1b\[[0-9;]*[A-Za-z]//g' -e 's/\r//g' "$pty_log" \
    | grep -oE '\b[0-9A-Z]{4}-[0-9A-Z]{5}\b' \
    | head -n 1
}

# Start a non-TTY device-code login session for <type>.
#
# Usage: auth start <type> [--auth-profile=<name>]
# Creates ${AUTH_SESSIONS_DIR}/<sid>/ containing login.log, meta.json
# (state "pending_url") and a private tmux socket, then launches the type's
# login command under script(1) inside a detached tmux session run as
# `claude`. Reports the session id through `ok`; progress is driven by
# `auth poll`.
#
# Root is required. The check runs before any profile directory is touched,
# so a non-root caller gets require_root's error rather than whatever
# ensure_profile_dir would hit first.
cmd_auth_start() {
  local type="" profile=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --auth-profile=*) profile="${1#--auth-profile=}" ;;
      -*)               fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -z "$type" ]]; then
          type="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi ;;
    esac
    shift
  done
  [[ -n "$type" ]] || fail "$E_USAGE" "usage: 5dive agent auth start <type> [--auth-profile=<name>]"
  is_known_type "$type" || fail "$E_NOT_FOUND" "unknown type: $type"
  case "$type" in
    claude|hermes|openclaw|codex|antigravity|grok) ;;
    *) fail "$E_VALIDATION" "device-code flow supports claude/hermes/openclaw/codex/antigravity/grok. Use 'auth set --api-key' or 'auth login' for $type." ;;
  esac
  local bin="${TYPE_BIN[$type]}"
  [[ -x "$bin" ]] || fail "$E_NOT_INSTALLED" "$type not installed at $bin"
  if [[ -n "$profile" ]]; then
    valid_profile_name "$profile" \
      || fail "$E_VALIDATION" "invalid --auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
    # Same ordering as `auth login`: prove we are root before creating or
    # chowning anything under the profile tree.
    require_root
    ensure_profile_dir "$profile" >/dev/null
    # Pre-create the per-type state dir so the device-flow CLI has somewhere
    # to land its credential file. profile_type_env below points the right
    # env var at this dir per type.
    profile_type_dir "$profile" "$type" >/dev/null
  fi

  require_auth_session_root
  local sid dir
  sid=$(gen_session_id)
  dir="${AUTH_SESSIONS_DIR}/${sid}"
  mkdir -p "$dir"
  chown claude:claude "$dir"
  chmod 2750 "$dir"

  # Create the log up front with its final owner/mode.
  local log="${dir}/login.log"
  : > "$log"
  chown claude:claude "$log"
  chmod 640 "$log"

  # For the credential-file types, the success signal is a fresh credential
  # file (e.g. ~/.codex/auth.json) — record the file's current mtime so poll
  # can tell a pre-existing login apart from the one this session produced.
  # Missing file ⇒ baseline 0, which any write beats. When profile is set, the
  # sentinel lives under the per-profile state dir (profile_type_auth_path),
  # not the shared /home/claude/.<type>.
  local auth_baseline=0
  case "$type" in
    codex|hermes|openclaw|antigravity|grok)
      local sentinel
      sentinel=$(profile_type_auth_path "$profile" "$type")
      if [[ -n "$sentinel" && -f "$sentinel" ]]; then
        auth_baseline=$(stat -c %Y "$sentinel" 2>/dev/null || echo 0)
      fi
      ;;
  esac

  jq -n --arg t "$type" --arg p "$profile" --arg s "pending_url" \
        --arg ts "$(date -Iseconds)" --arg sid "$sid" \
        --argjson ab "$auth_baseline" '{
    sessionId: $sid, type: $t, profile: $p, state: $s,
    url: null, code: null, error: null,
    authBaselineMtime: $ab,
    createdAt: $ts, updatedAt: $ts
  }' > "${dir}/meta.json"
  chmod 640 "${dir}/meta.json"
  chown claude:claude "${dir}/meta.json"

  # tmux session owned by `claude` so it shares the existing nvm/PATH setup
  # and any captured tokens end up in a path we can read back. Name includes
  # the sid so parallel sessions don't collide.
  local session="auth-${sid}"
  # Each session gets its own tmux socket under the session dir. Lets us
  # send-keys later without guessing which tmux server hosts it.
  local sock="${dir}/tmux.sock"

  # If profile set, redirect the type's state-root env var (CLAUDE_CONFIG_DIR /
  # CODEX_HOME / HERMES_HOME / HOME for openclaw) so the device-flow CLI
  # writes its credential file into the profile dir instead of the shared
  # /home/claude/.<type>. Two agents on different profiles can then re-auth
  # independently without overwriting each other. For claude, the extracted
  # token is also promoted into combined.env for systemd; for the others, the
  # credential file under the profile dir is the durable artifact and
  # 5dive-agent-start.sh seeds the agent's $HOME/.<type> from there.
  local extra_env=""
  if [[ -n "$profile" ]]; then
    extra_env=$(profile_type_env "$profile" "$type") \
      || fail "$E_GENERIC" "profile_type_env: no plumbing for type '$type'"
  fi
  # The CLI invocation differs per family: claude-setup-token prints a URL
  # and waits for a pasted callback code; codex prints a URL + one-time code
  # and polls the OAuth endpoint itself, so there's no submit step.
  local login_cmd preseed=""
  case "$type" in
    codex)  login_cmd="$bin login --device-auth" ;;
    antigravity)
      # agy's print mode (`--print ping`) hard-caps the interactive OAuth wait
      # at 30s ("Print mode: auth timed out") — far too short for a human to
      # finish Google sign-in and paste the code back through the dashboard.
      # The session dies, no antigravity-oauth-token is written, and
      # `agent create` then correctly rejects the empty profile. So drive auth
      # through the INTERACTIVE TUI instead (`--prompt-interactive ping`),
      # which waits indefinitely for the pasted authorization code. The TUI
      # first shows a login-method menu (Google OAuth pre-selected) and renders
      # the OAuth URL inside the TUI rather than the script log — cmd_auth_poll
      # presses Enter to pick the method and scrapes the URL from the pane (see
      # the antigravity branch there + the wide pane in the spawn below).
      login_cmd="$bin --prompt-interactive ping" ;;
    grok)
      # `grok login --device-auth` prints accounts.x.ai/oauth2/device + a
      # 4-dash-4 user code, polls the device-auth endpoint itself, and
      # writes ~/.grok/auth.json on success. Same UX shape as codex's
      # --device-auth (URL + displayed code, no callback paste).
      login_cmd="$bin login --device-auth" ;;
    hermes)
      # hermes auth add openai-codex prints a URL + one-time code (codex-style
      # device-auth via OpenAI), polls itself, then writes ~/.hermes/auth.json
      # on success. Same UX shape as codex from the wizard's POV.
      login_cmd="$bin auth add openai-codex" ;;
    openclaw)
      # openclaw routes the same OpenAI /codex/device flow through
      # `models auth login`. --provider + --method short-circuit the
      # @clack/prompts pickers in auth.ts:185-188 so no interactive prompts
      # appear before the URL+code print. --set-default applies the
      # provider's defaultModel (openai-codex/gpt-5.5). DISPLAY=:0 forces
      # isRemoteEnvironment() to return false so the user-code is logged
      # inline (otherwise infra/remote-env.ts treats headless Linux as
      # remote and openclaw redacts the code as "[shown on the local
      # device only]"). The opener falls through to "Open manually:" since
      # there's no real X server, which is exactly what we want.
      preseed='export DISPLAY=:0; '
      # When profile-scoped, openclaw's HOME is empty — pre-create the
      # nested state dirs so resolveAgentDir + upsertAuthProfile can write
      # auth-profiles.json without first-run mkdir races.
      if [[ -n "$profile" ]]; then
        local oc_home
        oc_home=$(profile_type_dir "$profile" "$type")
        install -d -m 2750 -o claude -g claude \
          "${oc_home}/.openclaw" \
          "${oc_home}/.openclaw/agents" \
          "${oc_home}/.openclaw/agents/main" \
          "${oc_home}/.openclaw/agents/main/agent" 2>/dev/null || true
      fi
      login_cmd="$bin models auth login --provider openai-codex --method device-code --set-default" ;;
    *)      login_cmd="$bin setup-token" ;;
  esac

  step "Starting device-code session $sid for $type${profile:+ (profile=$profile)}"
  # script(1) gives us a PTY so the CLI renders normally; -f flushes after
  # every write so the poll loop sees the URL/code seconds after they print.
  # bash -lc (login shell) sources /etc/profile.d/5dive-shared-configs.sh, so
  # CODEX_HOME points at /home/claude/.codex and the auth.json lands in the
  # shared location every agent-<name> login shell already reads.
  #
  # antigravity renders its OAuth URL inside the interactive TUI (alt-screen),
  # so cmd_auth_poll scrapes it via `tmux capture-pane` rather than $log. The
  # URL is ~650 chars; a normal-width pane hard-wraps it and the scrape would
  # truncate. Give agy a very wide pane so the whole URL stays on one logical
  # line. Other types print the URL to $log, where pane width is irrelevant.
  local pane_width=200
  if [[ "$type" == "antigravity" ]]; then
    pane_width=700
  fi
  # The inner command is assembled as a string because it crosses two shells
  # (bash -lc, then tmux's own shell). Every interpolated value comes from
  # this function: validated profile env, hex sid paths, and TYPE_BIN.
  sudo -u claude -H bash -lc "
    ${preseed}
    tmux -S '$sock' new-session -d -s '$session' -x $pane_width -y 50 \
      'env $extra_env script -q -f -c \"$login_cmd\" $log'
  " >&2 || fail "$E_GENERIC" "failed to spawn tmux session"

  ok "device-code session started" \
     '{sessionId:$s, type:$t, profile:$p, state:"pending_url"}' \
     --arg s "$sid" --arg t "$type" --arg p "${profile:-}"
}

# Report — and, for non-terminal sessions, advance — the state of a
# device-code auth session.
#
# Arguments: $1 - session id (resolved and validated by auth_session_dir).
# Output:    with JSON_MODE set, `{ok:true,data:<meta.json>}` on stdout;
#            otherwise a human-readable key/value dump of meta.json.
#
# Transitions applied here, based on what has appeared since the last poll:
#   pending_url            -> awaiting_code  URL extracted (plus the one-time
#                                            code for codex/hermes/openclaw/grok)
#   awaiting_code|submitted -> ok            credential file newer than the
#                                            recorded baseline (non-claude) or
#                                            token found in login.log (claude)
#   awaiting_code|submitted -> error         tmux session gone without that signal
#   pending_url            -> error          tmux session gone before a URL appeared
# ok / expired / error are treated as terminal and returned without reprobing.
# Each meta.json update is written to meta.json.tmp and then moved into place.
cmd_auth_poll() {
  local sid="${1:-}"
  [[ -n "$sid" ]] || fail "$E_USAGE" "usage: 5dive agent auth poll <session_id>"
  local dir
  dir=$(auth_session_dir "$sid")
  local meta="${dir}/meta.json"
  local log="${dir}/login.log"
  local sock="${dir}/tmux.sock"
  local session="auth-${sid}"

  local state type profile
  state=$(jq -r '.state' "$meta")
  type=$(jq -r '.type' "$meta")
  profile=$(jq -r '.profile' "$meta")

  # Terminal states are returned as-is; don't reprobe.
  case "$state" in
    ok|expired|error) : ;;
    *)
      # Still running? If the tmux session is gone and we never reached ok,
      # the branches below record state "error" so the dashboard stops polling.
      local alive=1
      sudo -u claude tmux -S "$sock" has-session -t "$session" 2>/dev/null \
        || alive=0

      if [[ "$state" == "pending_url" ]]; then
        case "$type" in
          codex|hermes|openclaw)
            # codex, hermes, and openclaw all go through OpenAI's
            # /codex/device endpoint and print a URL + a one-time code.
            # Wait until both are visible before advancing so the dashboard
            # never renders one without the other. The extractors are
            # URL/code-shape based, not vendor-specific, so they work for
            # all three (openclaw renders the URL/code via @clack/prompts
            # note, but stripping CSI escapes leaves the URL + 4-5 alnum
            # code intact).
            local url code_display
            url=$(extract_codex_url "$log" || true)
            code_display=$(extract_codex_code "$log" || true)
            if [[ -n "$url" && -n "$code_display" ]]; then
              state="awaiting_code"
              jq --arg u "$url" --arg c "$code_display" --arg s "$state" \
                 --arg ts "$(date -Iseconds)" \
                 '.url = $u | .code = $c | .state = $s | .updatedAt = $ts' "$meta" \
                 > "${meta}.tmp" && mv "${meta}.tmp" "$meta"
            fi
            ;;
          antigravity)
            # agy runs as an interactive TUI (no 30s print-mode cap). It first
            # shows a login-method menu with "Google OAuth" pre-selected, then
            # — once that's chosen — renders the OAuth URL + a "paste the
            # authorization code" prompt INSIDE the TUI (not $log). So scrape
            # the live pane: press Enter once to dismiss the menu, then pull the
            # URL out. The pane is spawned wide (cmd_auth_start) so the
            # ~650-char URL isn't wrapped/truncated. capture-pane -J rejoins
            # soft-wrapped lines; -S -400 includes scrollback in case the menu
            # has scrolled the URL up.
            local pane="${dir}/pane.txt"
            sudo -u claude tmux -S "$sock" capture-pane -p -J -S -400 \
              -t "$session" > "$pane" 2>/dev/null || true
            # Select "Google OAuth" exactly once, only while the menu is shown.
            # The .oauth_selected marker file is what makes it "once" across polls.
            if [[ ! -e "${dir}/.oauth_selected" ]] \
               && grep -qiE 'select login method|google oauth' "$pane"; then
              sudo -u claude tmux -S "$sock" send-keys -t "$session" Enter 2>/dev/null || true
              : > "${dir}/.oauth_selected"
            fi
            local url
            url=$(extract_antigravity_url "$pane" || true)
            if [[ -n "$url" ]]; then
              state="awaiting_code"
              jq --arg u "$url" --arg s "$state" --arg ts "$(date -Iseconds)" \
                '.url = $u | .state = $s | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            fi
            ;;
          grok)
            # codex-style: URL + displayed code, CLI polls. Advance once
            # both have appeared so the dashboard never renders one without
            # the other.
            local url code_display
            url=$(extract_grok_url "$log" || true)
            code_display=$(extract_grok_code "$log" || true)
            if [[ -n "$url" && -n "$code_display" ]]; then
              state="awaiting_code"
              jq --arg u "$url" --arg c "$code_display" --arg s "$state" \
                 --arg ts "$(date -Iseconds)" \
                 '.url = $u | .code = $c | .state = $s | .updatedAt = $ts' "$meta" \
                 > "${meta}.tmp" && mv "${meta}.tmp" "$meta"
            fi
            ;;
          *)
            # Remaining type (claude): URL only; the callback code is pasted
            # back later rather than displayed.
            local url
            url=$(extract_claude_url "$log" || true)
            if [[ -n "$url" ]]; then
              state="awaiting_code"
              jq --arg u "$url" --arg s "$state" --arg ts "$(date -Iseconds)" \
                '.url = $u | .state = $s | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            fi
            ;;
        esac
      fi

      # Deliberately a second `if`, not `elif`: a session that just advanced
      # to awaiting_code above is checked for completion in the same poll.
      if [[ "$state" == "awaiting_code" || "$state" == "submitted" ]]; then
        case "$type" in
          codex|hermes|openclaw|antigravity|grok)
            # All five signal success by writing a credential file:
            #   codex       — ~/.codex/auth.json     (CLI polls OpenAI itself)
            #   hermes      — ~/.hermes/auth.json    (CLI polls OpenAI itself)
            #   openclaw    — ~/.openclaw/agents/main/agent/auth-profiles.json
            #                 (CLI polls OpenAI itself, then upsertAuthProfile
            #                 writes the file synchronously before exit)
            #   antigravity — ~/.gemini/antigravity-cli/antigravity-oauth-token
            #                 (Google OAuth callback or pasted code; mtime
            #                 bumps once token_storage's file fallback writes
            #                 the bare token blob, mode 0600)
            #   grok        — ~/.grok/auth.json
            #                 (CLI polls xAI's device-auth endpoint, writes
            #                 auth.json on token receipt)
            # When this session is profile-scoped, the credential lands under
            # the per-profile state dir (profile_type_auth_path) instead of
            # the shared ~/.<type>. We mtime the sentinel against the
            # baseline captured at session start so a pre-existing login
            # can't masquerade as success.
            local sentinel baseline current
            sentinel=$(profile_type_auth_path "$profile" "$type")
            baseline=$(jq -r '.authBaselineMtime // 0' "$meta")
            current=0
            if [[ -f "$sentinel" ]]; then
              current=$(stat -c %Y "$sentinel" 2>/dev/null || echo 0)
            fi
            if (( current > baseline )); then
              sudo -u claude tmux -S "$sock" kill-session -t "$session" 2>/dev/null || true
              state="ok"
              jq --arg s "$state" --arg ts "$(date -Iseconds)" \
                '.state = $s | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            elif (( ! alive )); then
              # CLI quit without producing a fresh credential file — the user
              # cancelled, the OAuth window expired, or the CLI errored out
              # before writing creds.
              state="error"
              jq --arg s "$state" --arg e "$type exited without writing $sentinel (cancelled, expired, bad code, or failed)" \
                 --arg ts "$(date -Iseconds)" \
                 '.state = $s | .error = $e | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            fi
            ;;
          *)
            local tok
            tok=$(extract_claude_token "$log" || true)
            if [[ -n "$tok" ]]; then
              # Promote the captured token into its destination + tear down the
              # tmux session. Safe to re-run if the user hits poll again.
              if [[ -n "$profile" ]]; then
                printf '%s' "$tok" | profile_set_var "$profile" "CLAUDE_CODE_OAUTH_TOKEN"
              else
                printf '%s' "$tok" | write_default_connector "anthropic.env" "CLAUDE_CODE_OAUTH_TOKEN"
              fi
              sudo -u claude tmux -S "$sock" kill-session -t "$session" 2>/dev/null || true
              state="ok"
              jq --arg s "$state" --arg ts "$(date -Iseconds)" \
                '.state = $s | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            elif (( ! alive )); then
              # tmux session died before we saw a token — usually means the user
              # pasted a bad code and claude exited non-zero.
              state="error"
              jq --arg s "$state" --arg e "login process exited without writing a token (bad callback code?)" \
                 --arg ts "$(date -Iseconds)" \
                 '.state = $s | .error = $e | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
                && mv "${meta}.tmp" "$meta"
            fi
            ;;
        esac
      fi

      # Still no URL and the login process is gone: nothing more will arrive.
      if [[ "$state" == "pending_url" && "$alive" == "0" ]]; then
        state="error"
        jq --arg s "$state" --arg e "login process exited before printing an OAuth URL" \
           --arg ts "$(date -Iseconds)" \
           '.state = $s | .error = $e | .updatedAt = $ts' "$meta" > "${meta}.tmp" \
          && mv "${meta}.tmp" "$meta"
      fi
      ;;
  esac

  # Always answer from meta.json on disk, so the output reflects what was
  # actually persisted above.
  if (( JSON_MODE )); then
    jq -c '{ok:true, data: .}' "$meta"
  else
    jq -r '"sessionId: \(.sessionId)\ntype:      \(.type)\nprofile:   \(.profile // "-")\nstate:     \(.state)\nurl:       \(.url // "-")\ncode:      \(.code // "-")\nerror:     \(.error // "-")\nupdatedAt: \(.updatedAt)"' "$meta"
  fi
}

cmd_auth_submit() {
  # Types a pasted OAuth callback code into a pending login's tmux pane and
  # flips the session's meta.json to state=submitted so the caller polls again.
  #   <session_id>    exactly one positional argument
  #   --code=<code>   the callback code to paste (required)
  local sid="" code="" arg
  for arg in "$@"; do
    case "$arg" in
      --code=*)
        code="${arg#--code=}"
        ;;
      -*)
        fail "$E_USAGE" "unknown flag: $arg"
        ;;
      *)
        if [[ -z "$sid" ]]; then
          sid="$arg"
        else
          fail "$E_USAGE" "extra arg: $arg"
        fi
        ;;
    esac
  done
  if [[ -z "$sid" || -z "$code" ]]; then
    fail "$E_USAGE" "usage: 5dive agent auth submit <session_id> --code=<callback-code>"
  fi

  # Expected shape: URL-safe base64 with an optional `#fragment`. `/` and `.`
  # are tolerated for other provider shapes; spaces, quotes, backticks and the
  # like are refused before anything reaches `tmux send-keys -l`.
  local -r code_shape='^[A-Za-z0-9._/-]+#?[A-Za-z0-9._/-]*$'
  if [[ ! "$code" =~ $code_shape ]]; then
    fail "$E_VALIDATION" "callback code contains unexpected characters"
  fi

  local dir meta sock session
  dir=$(auth_session_dir "$sid")
  meta="${dir}/meta.json"
  sock="${dir}/tmux.sock"
  session="auth-${sid}"
  # Every tmux call below goes through the session's private socket as the
  # `claude` user.
  local -a tmux_cmd=(sudo -u claude tmux -S "$sock")

  local state type
  state=$(jq -r '.state' "$meta")
  type=$(jq -r '.type' "$meta")

  # codex/hermes/grok poll the OAuth endpoint themselves and never show a
  # paste prompt, so keystrokes sent here would land nowhere useful.
  # antigravity does accept a pasted code, so it is allowed through.
  if [[ "$type" == "codex" || "$type" == "hermes" || "$type" == "grok" ]]; then
    fail "$E_VALIDATION" "$type device-auth has no submit step — keep polling until state=ok or state=error"
  fi

  case "$state" in
    awaiting_code|submitted)
      # `submitted` is accepted too: it is a retry after a rejected code.
      ;;
    pending_url)
      fail "$E_VALIDATION" "session not yet awaiting a code — poll until url is populated"
      ;;
    ok|expired|error)
      fail "$E_VALIDATION" "session already in terminal state: $state"
      ;;
    *)
      fail "$E_VALIDATION" "session in unexpected state: $state"
      ;;
  esac

  if ! "${tmux_cmd[@]}" has-session -t "$session" 2>/dev/null; then
    fail "$E_NOT_RUNNING" "login tmux session is gone — start a new auth session"
  fi

  step "Submitting code to session $sid"
  # C-u first: clears any half-typed retry prompt left over from a previously
  # rejected code so the new code is not appended to the old one.
  "${tmux_cmd[@]}" send-keys -t "$session" C-u 2>/dev/null || true
  "${tmux_cmd[@]}" send-keys -t "$session" -l -- "$code"
  "${tmux_cmd[@]}" send-keys -t "$session" Enter

  # Rewrite meta.json via a temp file so a failed jq never truncates it.
  if jq --arg s "submitted" --arg ts "$(date -Iseconds)" \
       '.state = $s | .updatedAt = $ts' "$meta" > "${meta}.tmp"; then
    mv "${meta}.tmp" "$meta"
  fi

  ok "code submitted — poll for final state" \
     '{sessionId:$s, state:"submitted"}' --arg s "$sid"
}

# cmd_auth_cancel <session_id> — tear down a login session.
# Kills the session's tmux pane (best-effort) and marks meta.json as
# `expired`, except that a session which already reached `ok` keeps that
# state. The success payload reports the state that was actually persisted,
# so cancelling an already-completed login is not reported as "expired".
cmd_auth_cancel() {
  local sid="${1:-}"
  [[ -n "$sid" ]] || fail "$E_USAGE" "usage: 5dive agent auth cancel <session_id>"
  local dir
  dir=$(auth_session_dir "$sid")
  local meta="${dir}/meta.json"
  local sock="${dir}/tmux.sock"
  local session="auth-${sid}"

  # The tmux session may already be gone; that is fine for a cancel.
  sudo -u claude tmux -S "$sock" kill-session -t "$session" 2>/dev/null || true
  jq --arg s "expired" --arg ts "$(date -Iseconds)" \
     '.state = (if (.state == "ok") then "ok" else $s end) | .updatedAt = $ts' "$meta" \
     > "${meta}.tmp" && mv "${meta}.tmp" "$meta"

  # Read back what was persisted instead of assuming "expired": the filter
  # above intentionally leaves a completed (`ok`) session untouched. If the
  # read-back fails, fall back to the historical "expired" answer.
  local final_state
  final_state=$(jq -r '.state // "expired"' "$meta" 2>/dev/null) || final_state="expired"
  [[ -n "$final_state" ]] || final_state="expired"

  ok "session cancelled" \
     '{sessionId:$s, state:$st}' --arg s "$sid" --arg st "$final_state"
}

# -------- accounts (top-level noun over auth profiles) --------
#
# An "account" is the user-facing name for an auth profile. Storage and
# helpers live above (AUTH_PROFILES_DIR, ensure_profile_dir,
# profile_type_auth_path, link_agent_profile). These commands give it a
# first-class CLI surface so users don't have to think in terms of the
# split `agent auth set/start/login --auth-profile=<name>` verbs. The
# legacy `agent auth ...` commands keep working unchanged for back-compat
# and for the dashboard's device-code flow.

# account_agents_bound <name> — print a compact JSON array with the name of
# every agent whose registry entry has .authProfile == <name> (`[]` when no
# agent is bound). Shared by list, show, rename (to find the agents that
# need re-pointing) and remove (to build the refusal payload).
account_agents_bound() {
  local profile="$1"
  # Keys of .agents whose value is bound to the requested profile.
  local -r bound_filter='[.agents | to_entries[] | select(.value.authProfile == $p) | .key]'
  ensure_state
  registry_read | jq -c --arg p "$profile" "$bound_filter"
}

# account_types_authed <name> — print a JSON array of the types this profile
# holds a credential for. A type counts when its credential sentinel under
# the profile dir is a non-empty file, or when the profile's combined.env
# carries the type's API-key variable (for claude, CLAUDE_CODE_OAUTH_TOKEN
# counts as well). Scoped to the one profile: no shared-config fallback.
account_types_authed() {
  local profile="$1" kind sentinel key
  local authed="[]"

  # Pass 1: per-type credential files.
  for kind in "${!TYPE_BIN[@]}"; do
    sentinel=$(profile_type_auth_path "$profile" "$kind" 2>/dev/null) || continue
    if [[ -n "$sentinel" && -s "$sentinel" ]]; then
      authed=$(jq -c --arg t "$kind" '. + [$t]' <<<"$authed")
    fi
  done

  # Pass 2: env-var-only credentials (API keys written by `auth set`), which
  # live in combined.env without any per-type credential file.
  local env_file="${AUTH_PROFILES_DIR}/${profile}/combined.env"
  if [[ -s "$env_file" ]]; then
    for kind in "${!TYPE_API_VAR[@]}"; do
      key="${TYPE_API_VAR[$kind]}"
      if ! grep -q "^${key}=" "$env_file" 2>/dev/null; then
        # No API key for this type; only claude has an alternative variable.
        if [[ "$kind" != "claude" ]]; then
          continue
        fi
        if ! grep -q "^CLAUDE_CODE_OAUTH_TOKEN=" "$env_file" 2>/dev/null; then
          continue
        fi
      fi
      # Already recorded by pass 1? Then do not list it twice.
      if jq -e --arg t "$kind" 'index($t) != null' <<<"$authed" >/dev/null; then
        continue
      fi
      authed=$(jq -c --arg t "$kind" '. + [$t]' <<<"$authed")
    done
  fi
  echo "$authed"
}

# account_signin_detail <name> <type> — per-(profile, type) sign-in detail
# for the new-agent wizard's "Which sign-in?" tile. Emits
# {provider, model, signedInAt, credentials} JSON; provider/model/signedInAt
# come back null when they can't be cheaply extracted, credentials defaults
# to []. Returns "{}" (not failure) when the type isn't signed into the
# profile, so callers can ignore missing detail without conditional plumbing.
#
# hermes: active provider is config.yaml model.provider (the value the
# gateway loads at startup). credential_pool keys is only a fallback for
# pre-config-write profiles — its insertion order is not a source of
# truth, so reading `keys | first` makes the dashboard badge lie when
# a user adds a second credential (codex stays first, badge stays
# codex, even after model.provider flips to openrouter). openclaw:
# first profile's provider from auth-profiles.json. Everything else
# just gets a signedInAt mtime so the tile can at least show *when*
# the user signed in.
#
# Lookups that can legitimately come up empty (no `default:` line, unknown
# mtime, a jq call that prints nothing) never abort the function, even when
# it runs with errexit + pipefail active; the affected field simply keeps
# its null / [] default.
account_signin_detail() {
  local name="$1" type="$2"
  local profile_dir="${AUTH_PROFILES_DIR}/${name}"
  local auth_path provider=null model=null signed_at=null credentials='[]'
  local mtime=""
  case "$type" in
    hermes)
      auth_path="${profile_dir}/hermes/auth.json"
      [[ -s "$auth_path" ]] || { echo "{}"; return; }
      signed_at=$(jq -c '.updated_at // null' "$auth_path" 2>/dev/null) || signed_at=null
      # Every key in credential_pool is a separately-usable sign-in within
      # this profile. The dashboard renders one row per credential in the
      # Switch account modal so users can flip between providers that
      # share an auth-profile without re-running the sign-in wizard.
      credentials=$(jq -c '(.credential_pool // {}) | keys' "$auth_path" 2>/dev/null) \
        || credentials='[]'
      local cfg="${profile_dir}/hermes/config.yaml"
      if [[ -s "$cfg" ]]; then
        # Parse the `provider:` line scoped to the `model:` block — a top-
        # level grep would also match `model.providers:` / nested provider
        # entries elsewhere in the file.
        local provider_line
        provider_line=$(awk '
          /^model:/ { in_model=1; next }
          in_model && /^[^[:space:]]/ { in_model=0 }
          in_model && /^[[:space:]]+provider:/ {
            sub(/^[[:space:]]+provider:[[:space:]]*/, "")
            sub(/^["'\'']/, ""); sub(/["'\'']$/, "")
            print; exit
          }
        ' "$cfg" 2>/dev/null) || provider_line=""
        if [[ -n "$provider_line" ]]; then
          provider=$(jq -cn --arg p "$provider_line" '$p')
        fi
        # NOTE(review): unlike provider, this takes the first `default:` line
        # anywhere in the file, not just inside the `model:` block — confirm
        # that is intended.
        # `|| true`: grep exits 1 when there is no `default:` line, which
        # pipefail would otherwise turn into a fatal assignment under errexit.
        local default_line
        default_line=$(grep -E '^[[:space:]]*default:' "$cfg" 2>/dev/null | head -1 \
          | sed -E 's/^[[:space:]]*default:[[:space:]]*//; s/^["'\'']//; s/["'\'']$//') || true
        if [[ -n "$default_line" ]]; then
          model=$(jq -cn --arg m "$default_line" '$m')
        fi
      fi
      # Fall back to the first credential-pool key only if config.yaml
      # didn't yield a provider (cold profile or pre-config-write state).
      if [[ "$provider" == "null" ]]; then
        provider=$(jq -c '(.credential_pool // {}) | keys | first // null' \
          "$auth_path" 2>/dev/null) || provider=null
      fi
      ;;
    openclaw)
      auth_path="${profile_dir}/openclaw/.openclaw/agents/main/agent/auth-profiles.json"
      [[ -s "$auth_path" ]] || { echo "{}"; return; }
      provider=$(jq -c '
        (.profiles // {}) | [.[]?.provider?] | map(select(.!=null)) | first // null
      ' "$auth_path" 2>/dev/null) || provider=null
      credentials=$(jq -c '
        (.profiles // {}) | [.[]?.provider?] | map(select(.!=null)) | unique
      ' "$auth_path" 2>/dev/null) || credentials='[]'
      mtime=$(date -u -r "$auth_path" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) || mtime=""
      if [[ -n "$mtime" ]]; then
        signed_at=$(jq -cn --arg s "$mtime" '$s')
      fi
      ;;
    *)
      auth_path=$(profile_type_auth_path "$name" "$type" 2>/dev/null) || true
      [[ -n "$auth_path" && -s "$auth_path" ]] || { echo "{}"; return; }
      mtime=$(date -u -r "$auth_path" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) || mtime=""
      if [[ -n "$mtime" ]]; then
        signed_at=$(jq -cn --arg s "$mtime" '$s')
      fi
      ;;
  esac
  # jq can exit 0 yet print nothing (e.g. a whitespace-only auth file). An
  # empty string is not valid for --argjson and would fail the final call —
  # and with it the caller — so restore the documented defaults first.
  [[ -n "$provider" ]] || provider=null
  [[ -n "$model" ]] || model=null
  [[ -n "$signed_at" ]] || signed_at=null
  [[ -n "$credentials" ]] || credentials='[]'
  jq -cn --argjson p "$provider" --argjson m "$model" --argjson s "$signed_at" \
        --argjson c "$credentials" \
    '{provider:$p, model:$m, signedInAt:$s, credentials:$c}'
}

# account_each — print the name of every profile directory under
# AUTH_PROFILES_DIR, one per line. Directories without a combined.env are
# left out (incomplete state from an interrupted setup). Prints nothing and
# succeeds when AUTH_PROFILES_DIR does not exist.
account_each() {
  local entry
  if [[ ! -d "$AUTH_PROFILES_DIR" ]]; then
    return 0
  fi
  for entry in "${AUTH_PROFILES_DIR}"/*/; do
    if [[ -d "$entry" && -f "${entry}combined.env" ]]; then
      # Drop the glob's trailing slash, then keep only the last path segment.
      entry="${entry%/}"
      printf '%s\n' "${entry##*/}"
    fi
  done
}

cmd_account_list() {
  # `account list` — one row per account reported by account_each, carrying
  # the types it is signed into, the agents bound to it and per-type sign-in
  # detail. JSON mode wraps the rows in the {ok,data} envelope; text mode
  # prints a NAME/TYPES/AGENTS table, or "no accounts".
  ensure_state
  if [[ $# -ne 0 ]]; then
    fail "$E_USAGE" "usage: 5dive account list"
  fi
  local listing="[]" acct authed bound per_type kind info
  while IFS= read -r acct; do
    if [[ -z "$acct" ]]; then
      continue
    fi
    authed=$(account_types_authed "$acct")
    bound=$(account_agents_bound "$acct")
    # Per-type sign-in details let the new-agent wizard's profile tiles show
    # "Anthropic · claude-sonnet-4-5 · signed in May 12" rather than every
    # unused profile reading identically as "Not used yet".
    per_type="{}"
    while IFS= read -r kind; do
      if [[ -z "$kind" ]]; then
        continue
      fi
      info=$(account_signin_detail "$acct" "$kind")
      # "{}" means "not signed in for this type": nothing to record.
      if [[ "$info" == "{}" ]]; then
        continue
      fi
      per_type=$(jq -c --arg k "$kind" --argjson v "$info" '. + {($k):$v}' <<<"$per_type")
    done < <(jq -r '.[]' <<<"$authed" 2>/dev/null)
    listing=$(jq -c --arg n "$acct" --argjson t "$authed" --argjson a "$bound" --argjson s "$per_type" \
      '. + [{name:$n, types:$t, agents:$a, signins:$s}]' <<<"$listing")
  done < <(account_each)

  if (( ! JSON_MODE )); then
    jq -r '
      def fmt(a): if (a | length) == 0 then "-" else (a | join(",")) end;
      if length == 0 then "no accounts" else
        (["NAME","TYPES","AGENTS"] | @tsv),
        (.[] | [.name, fmt(.types), (.agents | length | tostring)] | @tsv)
      end' <<<"$listing" | column -t -s $'\t'
  else
    jq -c '{ok:true, data: .}' <<<"$listing"
  fi
}

# usage_agent_home <agent> — resolve the agent user's home dir (where its
# statusline cache lives) from passwd, falling back to /home/agent-<name>
# when the lookup yields nothing. Always prints exactly one path.
usage_agent_home() {
  local agent="$1" home=""
  # getent exits non-zero for an unknown user. With `set -o pipefail` that
  # fails the whole assignment, and under errexit the script would abort
  # right here — before the fallback below could run. Treat any lookup
  # failure as "no home found" instead.
  home=$(getent passwd "agent-${agent}" 2>/dev/null | cut -d: -f6) || home=""
  if [[ -n "$home" ]]; then
    printf '%s\n' "$home"
    return
  fi
  printf '/home/agent-%s\n' "$agent"
}

# usage_read_ratelimits <agent> — print one compact JSON object
#   {asOf, fiveHourPct, fiveResetsAt, sevenDayPct, sevenResetsAt}
# built from the agent's statusline cache (the JSON Claude Code hands its
# statusline, mirrored to ~/.claude/statusline-last.json by statusline.sh).
# asOf is the cache file's mtime in epoch seconds: the cache has no timestamp
# of its own, and the mtime is when the live limits were last observed.
# Prints nothing when the cache is missing/empty, cannot be stat'ed, or has
# neither a 5h nor a 7d used_percentage — e.g. an agent that has not rendered
# its statusline since boot, or a non-claude type whose CLI does not surface
# Anthropic 5h/7d limits.
usage_read_ratelimits() {
  local agent="$1" cache_file observed_at
  cache_file="$(usage_agent_home "$agent")/.claude/statusline-last.json"
  if [[ ! -s "$cache_file" ]]; then
    return 0
  fi
  if ! observed_at=$(stat -c %Y "$cache_file" 2>/dev/null); then
    return 0
  fi
  jq -c --argjson at "$observed_at" '
    (.rate_limits // {}) as $r
    | ($r.five_hour // {}) as $f
    | ($r.seven_day // {}) as $s
    | if ($f.used_percentage == null and $s.used_percentage == null)
      then empty
      else {
        asOf: $at,
        fiveHourPct:   ($f.used_percentage // null),
        fiveResetsAt:  ($f.resets_at // null),
        sevenDayPct:   ($s.used_percentage // null),
        sevenResetsAt: ($s.resets_at // null)
      } end' "$cache_file" 2>/dev/null
}

# account_best_ratelimits <account> — print the freshest statusline
# rate-limit JSON among the account's bound agents (the same pick
# cmd_account_usage makes), in the usage_read_ratelimits shape
# ({asOf, fiveHourPct, ...}). Prints nothing when no bound agent has a
# readable cache. The caller must be root to read sibling agent homes.
account_best_ratelimits() {
  local account="$1" bound member sample stamp
  local newest="" newest_at=-1
  bound=$(account_agents_bound "$account")
  while IFS= read -r member; do
    if [[ -z "$member" ]]; then
      continue
    fi
    if ! sample=$(usage_read_ratelimits "$member"); then
      continue
    fi
    if [[ -z "$sample" ]]; then
      continue
    fi
    stamp=$(jq -r '.asOf // -1' <<<"$sample")
    # Only a plain non-negative integer can win the freshness comparison.
    [[ "$stamp" =~ ^[0-9]+$ ]] || continue
    if (( stamp > newest_at )); then
      newest_at=$stamp
      newest="$sample"
    fi
  done < <(jq -r '.[]' <<<"$bound")
  printf '%s' "$newest"
}

# account_has_live_headroom <account> — succeeds (exit 0) when the account's
# FRESHEST cache was rendered within HEADROOM_FRESH_SECS AND the observed
# windows are below HEADROOM_MAX_PCT. Rotation uses it to override a STALE
# cooldown: an agent on the account that rendered a low-usage statusline
# minutes ago proves there is capacity, whatever cooldown epoch is left over.
# A missing, old or high cache yields non-zero (no override — fail safe to
# the cooldown). Both thresholds can be overridden from the environment.
: "${HEADROOM_FRESH_SECS:=600}"   # 10 min
: "${HEADROOM_MAX_PCT:=90}"
account_has_live_headroom() {
  local account="$1" snapshot now
  snapshot=$(account_best_ratelimits "$account")
  if [[ -z "$snapshot" ]]; then
    return 1
  fi
  now=$(date +%s)
  jq -e --argjson now "$now" --argjson fresh "$HEADROOM_FRESH_SECS" \
        --argjson max "$HEADROOM_MAX_PCT" '
    # Fresh: cache mtime within the window. 5h must be present and under max;
    # 7d may be null (not always surfaced) but if present must also be under max.
    (.asOf != null and ($now - .asOf) <= $fresh)
    and (.fiveHourPct != null and .fiveHourPct < $max)
    and (.sevenDayPct == null or .sevenDayPct < $max)
  ' <<<"$snapshot" >/dev/null 2>&1
}

# `account usage` — per-account snapshot of Anthropic 5h / 7d limit usage,
# feeding the dashboard Switch-account modal dots and Telegram /account +
# /usage. An account is shared by several agents, so for each account the
# FRESHEST statusline cache among its bound agents is used (whichever agent
# rendered most recently carries the truest live numbers). `usage` is null
# when no bound agent has a readable cache. Root is required to read sibling
# agents' 0750 home dirs — dashboard and telegram both invoke it through
# `sudo -n 5dive account usage`.
cmd_account_usage() {
  ensure_state
  if [[ $# -ne 0 ]]; then
    fail "$E_USAGE" "usage: 5dive account usage"
  fi
  require_root
  local report="[]" acct bound member sample stamp newest newest_at origin snapshot
  while IFS= read -r acct; do
    if [[ -z "$acct" ]]; then
      continue
    fi
    bound=$(account_agents_bound "$acct")
    # Track the newest sample and which agent it came from.
    newest=""
    newest_at=-1
    origin=""
    while IFS= read -r member; do
      if [[ -z "$member" ]]; then
        continue
      fi
      if ! sample=$(usage_read_ratelimits "$member"); then
        continue
      fi
      if [[ -z "$sample" ]]; then
        continue
      fi
      stamp=$(jq -r '.asOf // -1' <<<"$sample")
      [[ "$stamp" =~ ^[0-9]+$ ]] || continue
      if (( stamp > newest_at )); then
        newest_at=$stamp
        newest="$sample"
        origin="$member"
      fi
    done < <(jq -r '.[]' <<<"$bound")
    if [[ -z "$newest" ]]; then
      snapshot="null"
    else
      snapshot=$(jq -c --arg src "$origin" '{
        fiveHour: (if .fiveHourPct == null then null
                   else {pct: .fiveHourPct, resetsAt: .fiveResetsAt} end),
        sevenDay: (if .sevenDayPct == null then null
                   else {pct: .sevenDayPct, resetsAt: .sevenResetsAt} end),
        asOf: .asOf, source: $src}' <<<"$newest")
    fi
    report=$(jq -c --arg n "$acct" --argjson a "$bound" --argjson u "$snapshot" \
      '. + [{name:$n, agents:$a, usage:$u}]' <<<"$report")
  done < <(account_each)

  if (( ! JSON_MODE )); then
    jq -r '
      def p(x): if x == null then "-" else ((x.pct | floor | tostring) + "%") end;
      if length == 0 then "no accounts" else
        (["ACCOUNT","5H","7D","SOURCE"] | @tsv),
        (.[] | [.name, p(.usage.fiveHour), p(.usage.sevenDay),
                (.usage.source // "-")] | @tsv)
      end' <<<"$report" | column -t -s $'\t'
  else
    jq -c '{ok:true, data: .}' <<<"$report"
  fi
}

# cmd_account_show <name> — print one account's signed-in types, bound
# agents and the variable names found in its combined.env (names only, never
# values). JSON mode emits the {ok,data} envelope; text mode prints four
# labelled lines, using "-" for empty lists.
# Fails with E_USAGE (no name), E_VALIDATION (malformed name) or E_NOT_FOUND
# (no such profile directory).
cmd_account_show() {
  local name="${1:-}"
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive account show <name>"
  valid_profile_name "$name" \
    || fail "$E_VALIDATION" "invalid account name (lowercase letters/digits/_-, start letter, <=32 chars)"
  [[ -d "${AUTH_PROFILES_DIR}/${name}" ]] \
    || fail "$E_NOT_FOUND" "no account named '$name'"
  local types agents env_keys env_file="${AUTH_PROFILES_DIR}/${name}/combined.env"
  types=$(account_types_authed "$name")
  agents=$(account_agents_bound "$name")
  if [[ -s "$env_file" ]]; then
    # grep exits 1 when no line starts with an uppercase identifier (e.g. a
    # file holding only comments). Under pipefail + errexit that would kill
    # the command silently, so a no-match is folded into "no keys" ([]).
    env_keys=$({ grep -oE '^[A-Z_][A-Z0-9_]*' "$env_file" 2>/dev/null || true; } \
      | sort -u | jq -R . | jq -cs '.')
  else
    env_keys="[]"
  fi
  if (( JSON_MODE )); then
    jq -cn --arg n "$name" --argjson t "$types" --argjson a "$agents" --argjson e "$env_keys" \
      '{ok:true, data:{name:$n, types:$t, agents:$a, envKeys:$e}}'
  else
    local fmt='if length == 0 then "-" else join(", ") end'
    echo "name:    $name"
    echo "types:   $(jq -r "$fmt" <<<"$types")"
    echo "agents:  $(jq -r "$fmt" <<<"$agents")"
    echo "envKeys: $(jq -r "$fmt" <<<"$env_keys")"
  fi
}

cmd_account_add() {
  # `account add <name>` — create an empty account (auth profile) directory.
  # Re-adding an existing account is not an error: it reports
  # created:false / alreadyExisted:true and changes nothing.
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive account add <name>"
  fi
  if ! valid_profile_name "$name"; then
    fail "$E_VALIDATION" "invalid account name (lowercase letters/digits/_-, start letter, <=32 chars)"
  fi
  # `agent config set auth-profile=default` uses "default" as the magic value
  # that clears an agent's binding, so it cannot double as an account name.
  if [[ "$name" == "default" ]]; then
    fail "$E_VALIDATION" "'default' is reserved (clears an agent's account binding)"
  fi
  require_root
  if [[ ! -d "${AUTH_PROFILES_DIR}/${name}" ]]; then
    ensure_profile_dir "$name" >/dev/null
    ok "account '$name' created. Sign in with: sudo 5dive account login $name --type=<type>" \
       '{name:$n, created:true, alreadyExisted:false}' --arg n "$name"
  else
    ok "account '$name' already exists" \
       '{name:$n, created:false, alreadyExisted:true}' --arg n "$name"
    return 0
  fi
}

cmd_account_rename() {
  # `account rename <old> <new>` — move the account directory, then rewrite
  # every registry binding that pointed at <old>, re-point each affected
  # agent's auth link and restart its service. A failed restart is only
  # warned about; the rename itself still succeeds.
  local old="${1:-}" new="${2:-}"
  if [[ -z "$old" || -z "$new" ]]; then
    fail "$E_USAGE" "usage: 5dive account rename <old> <new>"
  fi
  if ! valid_profile_name "$old"; then
    fail "$E_VALIDATION" "invalid old account name"
  fi
  if ! valid_profile_name "$new"; then
    fail "$E_VALIDATION" "invalid new account name (lowercase letters/digits/_-, start letter, <=32 chars)"
  fi
  if [[ "$new" == "default" ]]; then
    fail "$E_VALIDATION" "'default' is reserved (clears an agent's account binding)"
  fi
  if [[ "$old" == "$new" ]]; then
    fail "$E_VALIDATION" "old and new names are the same"
  fi
  require_root
  ensure_state

  local src_dir="${AUTH_PROFILES_DIR}/${old}"
  local dst_dir="${AUTH_PROFILES_DIR}/${new}"
  if [[ ! -d "$src_dir" ]]; then
    fail "$E_NOT_FOUND" "no account named '$old'"
  fi
  if [[ -e "$dst_dir" ]]; then
    fail "$E_CONFLICT" "account '$new' already exists"
  fi

  # Capture the bound agents BEFORE touching anything; they are both the
  # work list below and the payload reported at the end.
  local affected affected_count
  affected=$(account_agents_bound "$old")

  step "Renaming account dir '$old' -> '$new'"
  mv "$src_dir" "$dst_dir"

  # Registry .authProfile fields and per-agent links only need attention
  # when at least one agent was bound to the old name.
  affected_count=$(jq -r 'length' <<<"$affected") || true
  if [[ "$affected_count" -gt 0 ]]; then
    step "Updating registry bindings (${affected_count} agent(s))"
    local reg
    reg=$(registry_read)
    reg=$(jq --arg old "$old" --arg new "$new" \
      '.agents = (.agents | with_entries(if .value.authProfile == $old then .value.authProfile = $new else . end))' \
      <<<"$reg")
    echo "$reg" | registry_write

    local agent
    while IFS= read -r agent; do
      if [[ -z "$agent" ]]; then
        continue
      fi
      step "Re-pointing ${ENV_DIR}/${agent}-auth.env"
      link_agent_profile "$agent" "$new"
      step "Restarting 5dive-agent@${agent}.service"
      systemctl restart "5dive-agent@${agent}.service" >&2 2>&1 || \
        warn "restart of agent '$agent' failed — check journalctl -u 5dive-agent@${agent}"
    done < <(jq -r '.[]' <<<"$affected")
  fi

  ok "account renamed '$old' -> '$new'" \
     '{old:$o, new:$n, agents:$a}' \
     --arg o "$old" --arg n "$new" --argjson a "$affected"
}

cmd_account_remove() {
  # `account remove <name>` — delete an account directory, but only when no
  # agent is still bound to it. A refusal is reported as a conflict that
  # lists the agents in use; in JSON mode the envelope additionally carries
  # them under error.details.agents.
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive account remove <name>"
  fi
  if ! valid_profile_name "$name"; then
    fail "$E_VALIDATION" "invalid account name"
  fi
  require_root
  ensure_state
  if [[ ! -d "${AUTH_PROFILES_DIR}/${name}" ]]; then
    fail "$E_NOT_FOUND" "no account named '$name'"
  fi

  local agents in_use
  agents=$(account_agents_bound "$name")
  in_use=$(jq -r 'length' <<<"$agents") || true
  if [[ "$in_use" -gt 0 ]]; then
    local list
    list=$(jq -r 'join(", ")' <<<"$agents")
    if (( JSON_MODE )); then
      # Hand-built envelope: it needs the extra `details` field.
      jq -cn --arg n "$name" --argjson a "$agents" --argjson c "$E_CONFLICT" \
        '{ok:false, error:{code:$c, class:"conflict",
          message:("account \($n) is in use by: " + ($a | join(", "))),
          details:{agents:$a}}}'
      echo "error: account '$name' is in use by: $list" >&2
      exit "$E_CONFLICT"
    else
      fail "$E_CONFLICT" "account '$name' is in use by: $list — rebind or remove those agents first"
    fi
  fi

  step "Deleting account dir ${AUTH_PROFILES_DIR}/${name}"
  # `:?` aborts instead of expanding to "/<name>" if the base dir is empty.
  rm -rf "${AUTH_PROFILES_DIR:?}/${name}"
  ok "account '$name' removed" \
     '{name:$n, removed:true}' --arg n "$name"
}

# `account login <name> --type=<type>` — TTY shortcut. Validates the account
# name and type, reorders the arguments and hands off to cmd_auth_login,
# which exec's the underlying CLI's interactive device-code/OAuth flow.
# Non-TTY/dashboard callers keep using the device-code lifecycle under
# `agent auth start|poll|submit|cancel --auth-profile=<name>`.
cmd_account_login() {
  local name="" type="" arg
  for arg in "$@"; do
    case "$arg" in
      --type=*)
        type="${arg#--type=}"
        ;;
      -*)
        fail "$E_USAGE" "unknown flag: $arg"
        ;;
      *)
        # Exactly one positional (the account name) is accepted.
        if [[ -z "$name" ]]; then
          name="$arg"
        else
          fail "$E_USAGE" "extra arg: $arg"
        fi
        ;;
    esac
  done
  if [[ -z "$name" || -z "$type" ]]; then
    fail "$E_USAGE" "usage: 5dive account login <name> --type=<type>"
  fi
  if ! valid_profile_name "$name"; then
    fail "$E_VALIDATION" "invalid account name"
  fi
  if ! is_known_type "$type"; then
    fail "$E_NOT_FOUND" "unknown type: $type"
  fi
  cmd_auth_login --auth-profile="$name" "$type"
}

cmd_agent_set_account() {
  # `agent set-account <agent> <account|default>` — thin alias that forwards
  # to `cmd_config <agent> set auth-profile=<account>`.
  local agent="${1:-}" account="${2:-}"
  if [[ -z "$agent" || -z "$account" ]]; then
    fail "$E_USAGE" "usage: 5dive agent set-account <agent> <account|default>"
  fi
  cmd_config "$agent" set "auth-profile=${account}"
}

# ---------------------------------------------------------------------------
# `agent rotation` — opt-in multi-account auto-rotation.
#
# Lets an agent be bound to an ORDERED list of accounts and auto-swap to the
# next eligible one when it hits a usage limit (driven by the rate-limit
# recovery path — see Phase 2). Default OFF; never rotates into a cooling
# account; if none are eligible the caller falls back to wait-for-reset.
#
# State lives in the registry under .agents[<name>].rotation:
#   { enabled: bool, accounts: [ordered profile names], cooldowns: {profile: epoch} }
# The agent's ACTIVE account stays .agents[<name>].authProfile (single source
# of truth). `rotate` just selects the next eligible profile and re-points it
# through cmd_config, reusing the exact set-account plumbing (env rewrite,
# auth-symlink re-point, deferred restart).
#
# Mutating subcommands run under the registry lock; `get` is read-only. The
# get/set/rotate/cooldown/clear-cooldown routing lives in main.sh's `agent`
# dispatcher (mirrors how `auth` and `telegram-access` are routed inline).

# rotation_require_agent <name> — print the whole registry JSON on stdout
# (for the caller to capture) after checking that .agents[<name>] exists;
# an unknown agent is reported through fail with E_NOT_FOUND.
rotation_require_agent() {
  local name="$1" registry
  registry=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi
  printf '%s' "$registry"
}

cmd_agent_rotation_get() {
  # `agent rotation get <agent>` — read-only view of an agent's rotation
  # config: active account, enabled flag, account pool, cooldowns and the
  # last-writer stamp. An agent without a rotation object is shown with the
  # defaults (disabled, empty pool, no cooldowns).
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent rotation get <agent>"
  fi
  ensure_state
  local reg active rot
  reg=$(rotation_require_agent "$name")
  active=$(jq -r --arg n "$name" '.agents[$n].authProfile // ""' <<<"$reg")
  rot=$(jq -c --arg n "$name" \
    '.agents[$n].rotation // {enabled:false, accounts:[], cooldowns:{}}' <<<"$reg")
  if (( ! JSON_MODE )); then
    local now
    now=$(date +%s)
    printf 'agent:    %s\n' "$name"
    printf 'active:   %s\n' "${active:--}"
    printf 'rotation: %s\n' "$(jq -r '.enabled // false' <<<"$rot")"
    # allAccounts is the "use every eligible profile" sentinel; present it as
    # the pool instead of the (empty) explicit list.
    if [[ "$(jq -r '.allAccounts // false' <<<"$rot")" == "true" ]]; then
      printf '%s\n' "accounts: all (every eligible profile)"
    else
      printf 'accounts: %s\n' "$(jq -r '(.accounts // []) | if length==0 then "-" else join(", ") end' <<<"$rot")"
    fi
    # Only cooldowns that are still running are shown; expired ones are noise.
    jq -r --argjson now "$now" '(.cooldowns // {}) | to_entries[]
      | select(.value > $now) | "  cooling: \(.key) until \(.value)"' <<<"$rot" 2>/dev/null || true
    # Last toggle provenance (DIVE-126) — helps diagnose a concurrent-toggle war.
    jq -r '(.lastSet // empty)
      | "last set: \(.toEnabled) (was \(.fromEnabled)) by \(.by) at \(.at)"' <<<"$rot" 2>/dev/null || true
  else
    jq -cn --arg a "$active" --argjson r "$rot" '{ok:true, data:{
      active:$a,
      enabled:($r.enabled // false),
      allAccounts:($r.allAccounts // false),
      accounts:($r.accounts // []),
      cooldowns:($r.cooldowns // {}),
      lastSet:($r.lastSet // null)
    }}'
  fi
}

# `agent rotation set <agent> [--enabled=true|false] [--accounts=a,b,c]` —
# update the rotation object stored at .agents[<agent>].rotation.
#   --enabled=true|false  turn rotation on or off
#   --accounts=a,b,c      ordered account pool (duplicates dropped, order
#                         kept); `all` (any case) stores the allAccounts flag
#                         with an empty explicit list; an empty value clears
#                         the list
# At least one of the two flags is required. The whole registry is rewritten
# through registry_write, a lastSet stamp records who changed the enabled
# flag and from what, and the resulting enabled/allAccounts/accounts values
# are reported through ok.
cmd_agent_rotation_set() {
  local name="${1:-}"; shift || true
  [[ -n "$name" ]] \
    || fail "$E_USAGE" "usage: 5dive agent rotation set <agent> [--enabled=true|false] [--accounts=a,b,c]"
  require_root
  ensure_state
  # accounts_provided distinguishes "--accounts= given with an empty value"
  # (clear the list) from "--accounts not given at all" (leave it alone).
  local set_enabled="" set_accounts="" accounts_provided=0 a
  for a in "$@"; do
    case "$a" in
      --enabled=*)  set_enabled="${a#--enabled=}" ;;
      --accounts=*) set_accounts="${a#--accounts=}"; accounts_provided=1 ;;
      *) fail "$E_USAGE" "unknown flag: $a" ;;
    esac
  done
  [[ -n "$set_enabled" || $accounts_provided -eq 1 ]] \
    || fail "$E_USAGE" "nothing to set — pass --enabled= and/or --accounts="
  local reg type
  reg=$(rotation_require_agent "$name")
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  # DIVE-126: capture the prior enabled-state + the writer BEFORE applying, so a
  # concurrent-toggle war (two agents/dashboards flipping enabled, last-write-
  # wins under the registry lock) is self-diagnosing — `rotation get` then shows
  # who last set it and the transition. Writer precedence mirrors audit_log.
  local rot_prior_enabled rot_writer rot_set_ts
  rot_prior_enabled=$(jq -c --arg n "$name" '.agents[$n].rotation.enabled // false' <<<"$reg")
  rot_writer="${FIVEDIVE_AUDIT_USER:-${SUDO_USER:-${USER:-unknown}}}"
  rot_set_ts=$(date -Iseconds)
  if [[ -n "$set_enabled" ]]; then
    case "$set_enabled" in true|false) ;; *) fail "$E_VALIDATION" "--enabled must be true or false" ;; esac
  fi
  # Parse the account spec. `--accounts=all` is the sentinel for "use every
  # eligible (same-type) profile, auto-including ones added later" — we store an
  # allAccounts flag and an EMPTY explicit list, then resolve the live pool at
  # rotate-time (see cmd_agent_rotation_rotate). Otherwise it's an ordered list
  # (dedup, preserve order); each entry must be a configured profile that
  # carries a credential for THIS agent's type — rotating to an account the
  # agent can't authenticate as is a footgun.
  local accounts_json="[]" all_accounts="false"
  if (( accounts_provided )) && [[ "${set_accounts,,}" == "all" ]]; then
    all_accounts="true"
  elif (( accounts_provided )) && [[ -n "$set_accounts" ]]; then
    local -a arr=(); IFS=',' read -ra arr <<<"$set_accounts"
    # `seen` is a comma-joined list of the names accepted so far; it backs
    # the duplicate check below.
    local acct seen=""
    for acct in "${arr[@]}"; do
      # Strip all whitespace so "a, b" and "a,b" are treated the same.
      acct="${acct//[[:space:]]/}"
      [[ -n "$acct" ]] || continue
      valid_profile_name "$acct" \
        || fail "$E_VALIDATION" "invalid account name in --accounts: '$acct'"
      [[ -f "${AUTH_PROFILES_DIR}/${acct}/combined.env" ]] \
        || fail "$E_NOT_FOUND" "account '$acct' not configured"
      jq -e --arg t "$type" 'index($t) != null' \
        <<<"$(account_types_authed "$acct")" >/dev/null \
        || fail "$E_VALIDATION" "account '$acct' has no $type credential — rotation needs same-type accounts"
      case ",$seen," in *",$acct,"*) continue ;; esac
      seen="${seen:+$seen,}$acct"
      accounts_json=$(jq -c --arg a "$acct" '. + [$a]' <<<"$accounts_json")
    done
  fi
  # Ensure the rotation object exists, then apply the requested fields.
  reg=$(jq --arg n "$name" \
    '.agents[$n].rotation = (.agents[$n].rotation // {enabled:false, accounts:[], cooldowns:{}})' <<<"$reg")
  if [[ -n "$set_enabled" ]]; then
    reg=$(jq --arg n "$name" --argjson e "$set_enabled" '.agents[$n].rotation.enabled = $e' <<<"$reg")
  fi
  if (( accounts_provided )); then
    reg=$(jq --arg n "$name" --argjson a "$accounts_json" --argjson all "$all_accounts" \
      '.agents[$n].rotation.accounts = $a | .agents[$n].rotation.allAccounts = $all' <<<"$reg")
    # Cooldown pruning only applies to an explicit list — drop cooldowns for
    # accounts no longer in it. Under allAccounts the pool is dynamic (resolved
    # at rotate-time), so we keep every cooldown; expired ones are ignored then.
    if [[ "$all_accounts" != "true" ]]; then
      reg=$(jq --arg n "$name" '
        (.agents[$n].rotation.cooldowns // {}) as $c
        | (.agents[$n].rotation.accounts // []) as $acc
        | .agents[$n].rotation.cooldowns =
            reduce $acc[] as $k ({}; if $c[$k] != null then .[$k] = $c[$k] else . end)' <<<"$reg")
    fi
  fi
  # Stamp the last-writer + enabled transition onto the rotation object so the
  # next reader can see who toggled it and from what (DIVE-126). toEnabled is
  # read from the just-applied state above.
  reg=$(jq --arg n "$name" --arg by "$rot_writer" --arg at "$rot_set_ts" \
    --argjson from "$rot_prior_enabled" \
    '.agents[$n].rotation.lastSet = {
       by: $by, at: $at,
       fromEnabled: $from,
       toEnabled: (.agents[$n].rotation.enabled // false)
     }' <<<"$reg")
  echo "$reg" | registry_write
  # Report what is now stored, read back from the registry JSON just written.
  local out_enabled out_accounts out_all
  out_enabled=$(jq -c --arg n "$name" '.agents[$n].rotation.enabled' <<<"$reg")
  out_accounts=$(jq -c --arg n "$name" '.agents[$n].rotation.accounts' <<<"$reg")
  out_all=$(jq -c --arg n "$name" '.agents[$n].rotation.allAccounts // false' <<<"$reg")
  ok "rotation config updated for '$name'" \
     '{name:$n, enabled:$e, allAccounts:$all, accounts:$a}' \
     --arg n "$name" --argjson e "$out_enabled" --argjson all "$out_all" --argjson a "$out_accounts"
}

# rotation_eligible_accounts <type>
# Prints (without a trailing newline) a compact JSON array naming every profile
# emitted by account_each whose account_types_authed list contains <type>, in
# the order account_each yields them. An allAccounts rotation resolves its pool
# through this at rotate-time, so profiles added after the agent was configured
# join the rotation automatically.
rotation_eligible_accounts() {
  local type="$1"
  local eligible="[]" profile
  while IFS= read -r profile; do
    if [[ -z "$profile" ]]; then
      continue
    fi
    # jq -e exits non-zero when the type is absent (or the list is unreadable),
    # which simply leaves this profile out of the pool.
    if jq -e --arg t "$type" 'index($t) != null' \
      <<<"$(account_types_authed "$profile")" >/dev/null 2>&1; then
      eligible=$(jq -c --arg a "$profile" '. + [$a]' <<<"$eligible")
    fi
  done < <(account_each)
  printf '%s' "$eligible"
}

# `agent rotation rotate <agent> [--cooldown-current=<epoch>]` — move an agent
# to the next usable account in its rotation pool.
#
# Flow:
#   1. Refuse (E_VALIDATION) unless rotation.enabled is true for the agent.
#   2. If --cooldown-current is given and the agent has an authProfile, record
#      that epoch as the current account's cooldown and write the registry.
#   3. Resolve the pool (live same-type profiles under allAccounts, else the
#      stored list) and drop the current account from it.
#   4. Pick a target: tier 1 = first candidate not cooling; tier 2 = first
#      candidate account_has_live_headroom accepts (its cooldown is cleared);
#      tier 3 = the candidate with the smallest cooldown, only when that is
#      strictly earlier than the current account's cooldown.
#   5. No target: report rotated:false and return 0. Otherwise re-point the
#      agent via `cmd_config ... set auth-profile=<target>` and report
#      {rotated, from, to, tier, coolingTarget}.
cmd_agent_rotation_rotate() {
  # `shift || true`: shift fails when no argument was passed; the usage check
  # on the next line is what reports that case.
  local name="${1:-}"; shift || true
  [[ -n "$name" ]] \
    || fail "$E_USAGE" "usage: 5dive agent rotation rotate <agent> [--cooldown-current=<epoch>]"
  require_root
  ensure_state
  local cooldown_current="" a
  for a in "$@"; do
    case "$a" in
      --cooldown-current=*) cooldown_current="${a#--cooldown-current=}" ;;
      *) fail "$E_USAGE" "unknown flag: $a" ;;
    esac
  done
  local reg enabled current
  reg=$(rotation_require_agent "$name")
  enabled=$(jq -r --arg n "$name" '.agents[$n].rotation.enabled // false' <<<"$reg")
  current=$(jq -r --arg n "$name" '.agents[$n].authProfile // ""' <<<"$reg")
  [[ "$enabled" == "true" ]] \
    || fail "$E_VALIDATION" "rotation not enabled for '$name' (enable: 5dive agent rotation set $name --enabled=true)"
  # Cool down the account we're leaving (the one that just hit its limit), so
  # we don't bounce straight back to it. Persist immediately — even if no
  # eligible target exists, the cooldown should stick for the next attempt.
  # Note the epoch is only validated here, i.e. when there is a current account.
  if [[ -n "$cooldown_current" && -n "$current" ]]; then
    [[ "$cooldown_current" =~ ^[0-9]+$ ]] \
      || fail "$E_VALIDATION" "--cooldown-current must be an epoch (seconds)"
    reg=$(jq --arg n "$name" --arg c "$current" --argjson u "$cooldown_current" '
      .agents[$n].rotation = (.agents[$n].rotation // {enabled:false, accounts:[], cooldowns:{}})
      | .agents[$n].rotation.cooldowns[$c] = $u' <<<"$reg")
    echo "$reg" | registry_write
  fi
  # Resolve the pool: under allAccounts it's every eligible same-type profile
  # (computed live so newly-added accounts join automatically), otherwise the
  # stored ordered list. Then pick the first entry that isn't the current
  # account and isn't still cooling. Order == preference.
  # (`cd` below is a plain local holding the cooldown map, not the builtin.)
  local allflag type pool now cd candidates target tier cooling_target=0
  allflag=$(jq -r --arg n "$name" '.agents[$n].rotation.allAccounts // false' <<<"$reg")
  if [[ "$allflag" == "true" ]]; then
    type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
    pool=$(rotation_eligible_accounts "$type")
  else
    pool=$(jq -c --arg n "$name" '.agents[$n].rotation.accounts // []' <<<"$reg")
  fi
  now=$(date +%s)
  cd=$(jq -c --arg n "$name" '.agents[$n].rotation.cooldowns // {}' <<<"$reg")
  # Candidate accounts = the pool minus the one we're leaving, in preference order.
  # -n (null input) is REQUIRED: this filter reads only from --argjson, never stdin.
  # Without it jq blocks on / consumes stdin — and when the rotate runs from the
  # StopFailure hook (empty stdin) it processes zero inputs and emits an empty
  # string, which then crashes Tier-1's `--argjson c ""` with "invalid JSON text".
  # That abort left the agent parked on the just-cooled account = no rotation.
  candidates=$(jq -cn --arg cur "$current" --argjson acc "$pool" '[ $acc[] | select(. != $cur) ]')

  # Tier 1 — a candidate that isn't cooling. The common case; preference order wins.
  # A candidate with no cooldown entry counts as epoch 0, i.e. always free.
  target=$(jq -rn --argjson now "$now" --argjson c "$candidates" --argjson cd "$cd" \
    '([ $c[] | select(($cd[.] // 0) <= $now) ] | first) // ""')
  tier=1

  # Tier 2 — STALE-COOLDOWN RECOVERY (DIVE-55). Every candidate looks like it's
  # cooling, but a cooldown is just a scraped reset epoch and can be wrong (set
  # too late, or the account's window already reset). If an agent on a candidate
  # rendered a low-usage statusline in the last few minutes, that account really
  # has capacity — so override the stale cooldown and rotate there. We clear its
  # cooldown so the resume doesn't then sit waiting on the bogus epoch.
  if [[ -z "$target" ]]; then
    local cand
    while IFS= read -r cand; do
      [[ -n "$cand" ]] || continue
      if account_has_live_headroom "$cand"; then
        target="$cand"; tier=2
        reg=$(jq --arg n "$name" --arg a "$cand" 'del(.agents[$n].rotation.cooldowns[$a])' <<<"$reg")
        echo "$reg" | registry_write
        break
      fi
    done < <(jq -r '.[]' <<<"$candidates")
  fi

  # Tier 3 — SOONEST-FREEING (DIVE-55). All candidates are genuinely cooling.
  # Rather than park on the account we just left (which we cooled to the LATEST
  # reset), jump to whichever candidate frees first — but ONLY if it frees sooner
  # than staying put would. current_cd is what we just cooled the leaving account
  # to (or its stored cooldown); rotating to something that frees later would be
  # strictly worse and could thrash, so we decline and let the resume wait here.
  if [[ -z "$target" ]]; then
    local current_cd soonest soonest_cd
    if [[ "$cooldown_current" =~ ^[0-9]+$ ]]; then
      current_cd="$cooldown_current"
    else
      current_cd=$(jq -rn --arg cur "$current" --argjson cd "$cd" '($cd[$cur] // 0)')
    fi
    soonest=$(jq -rn --argjson c "$candidates" --argjson cd "$cd" \
      '([ $c[] | {a:., u:($cd[.] // 0)} ] | sort_by(.u) | first | .a) // ""')
    soonest_cd=$(jq -rn --arg a "$soonest" --argjson cd "$cd" '($cd[$a] // 0)')
    if [[ -n "$soonest" ]] && (( soonest_cd < current_cd )); then
      target="$soonest"; tier=3; cooling_target=1
    fi
  fi

  # Nothing to rotate to: this is reported as a successful no-op (exit 0), not
  # an error, in both JSON and text mode.
  if [[ -z "$target" ]]; then
    if (( JSON_MODE )); then
      ok "" '{rotated:false, from:$f, to:null, reason:"no eligible account (all cooling later than current, or none configured)"}' \
         --arg f "$current"
    else
      echo "OK — no eligible account to rotate to (all cooling later than current / none configured)"
    fi
    return 0
  fi
  # Re-point via the existing set-account plumbing. We already hold the
  # registry lock and cmd_config reads/writes the registry directly (safe
  # under the lock); it also fires the deferred restart. Silence its own
  # stdout so we emit a single clean envelope. `tier`/`coolingTarget` are
  # surfaced for observability (3 + coolingTarget=true means we rotated to a
  # still-cooling-but-sooner account; the resume waits on its reset).
  cmd_config "$name" set "auth-profile=${target}" >/dev/null
  ok "rotated '$name': ${current:--} -> ${target}" \
     '{rotated:true, from:$f, to:$t, tier:$tier, coolingTarget:$ct}' \
     --arg f "$current" --arg t "$target" --argjson tier "$tier" \
     --argjson ct "$( ((cooling_target)) && echo true || echo false )"
}

# `agent rotation cooldown <agent> <account> --until=<epoch>` — record that
# <account> is cooling until the given epoch (seconds) in the agent's rotation
# object, creating that object with its defaults first when it is absent, then
# persist the registry.
cmd_agent_rotation_cooldown() {
  local name="${1:-}" account="${2:-}"
  if [[ -z "$name" || -z "$account" ]]; then
    fail "$E_USAGE" "usage: 5dive agent rotation cooldown <agent> <account> --until=<epoch>"
  fi
  shift 2
  require_root
  ensure_state
  local until_epoch="" flag
  for flag in "$@"; do
    if [[ "$flag" == --until=* ]]; then
      until_epoch="${flag#--until=}"
    else
      fail "$E_USAGE" "unknown flag: $flag"
    fi
  done
  if [[ ! "$until_epoch" =~ ^[0-9]+$ ]]; then
    fail "$E_VALIDATION" "--until=<epoch seconds> required"
  fi
  if ! valid_profile_name "$account"; then
    fail "$E_VALIDATION" "invalid account name"
  fi
  local registry
  registry=$(rotation_require_agent "$name")
  registry=$(jq --arg n "$name" --arg a "$account" --argjson u "$until_epoch" '
    .agents[$n].rotation = (.agents[$n].rotation // {enabled:false, accounts:[], cooldowns:{}})
    | .agents[$n].rotation.cooldowns[$a] = $u' <<<"$registry")
  echo "$registry" | registry_write
  ok "cooldown set: '$account' until $until_epoch" \
     '{name:$n, account:$a, until:$u}' --arg n "$name" --arg a "$account" --argjson u "$until_epoch"
}

# `agent rotation clear-cooldown <agent> [<account>]` — drop one account's
# cooldown entry, or (no account given) reset the agent's whole cooldown map to
# {}. An agent with no rotation object / no cooldown map is left untouched; the
# registry is written back either way.
cmd_agent_rotation_clear_cooldown() {
  local name="${1:-}" account="${2:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent rotation clear-cooldown <agent> [<account>]"
  fi
  require_root
  ensure_state
  local registry filter
  registry=$(rotation_require_agent "$name")
  if [[ -z "$account" ]]; then
    # Clear-all: only touches agents that already carry a rotation object.
    filter='
      if .agents[$n].rotation then .agents[$n].rotation.cooldowns = {} else . end'
  else
    if ! valid_profile_name "$account"; then
      fail "$E_VALIDATION" "invalid account name"
    fi
    # Single account: only touches an existing cooldown map.
    filter='
      if .agents[$n].rotation.cooldowns then
        .agents[$n].rotation.cooldowns |= del(.[$a])
      else . end'
  fi
  registry=$(jq --arg n "$name" --arg a "$account" "$filter" <<<"$registry")
  echo "$registry" | registry_write
  ok "cooldown cleared for '$name'${account:+ ($account)}" \
     '{name:$n, account:$a}' --arg n "$name" --arg a "${account:-}"
}

# `account set-active-provider <profile> <type> <provider>` — switch which
# credential from a profile's credential_pool the gateway uses, with no new
# sign-in. The Switch account modal drives this when the user picks a dormant
# provider already present in the same profile's pool (e.g. going from
# openrouter back to openai-codex once both have been signed in).
# hermes-only for now: openclaw keeps its active provider in openclaw.json
# under agents.defaults.model and needs its own flip path.
cmd_account_set_active_provider() {
  local profile="${1:-}" type="${2:-}" provider="${3:-}"
  if [[ -z "$profile" || -z "$type" || -z "$provider" ]]; then
    fail "$E_USAGE" "usage: 5dive account set-active-provider <profile> <type> <provider>"
  fi
  if ! valid_profile_name "$profile"; then
    fail "$E_VALIDATION" "invalid profile name"
  fi
  if ! is_known_type "$type"; then
    fail "$E_NOT_FOUND" "unknown type: $type"
  fi
  if [[ "$type" != "hermes" ]]; then
    fail "$E_VALIDATION" "set-active-provider currently supports type=hermes only"
  fi
  require_root

  local hermes_home="${AUTH_PROFILES_DIR}/${profile}/hermes"
  local auth_file="${hermes_home}/auth.json"
  if [[ ! -s "$auth_file" ]]; then
    fail "$E_NOT_FOUND" "no hermes auth for profile '$profile'"
  fi
  # Check the pool BEFORE changing anything: if config.yaml ends up naming a
  # credential that isn't there, the gateway quietly falls back to its existing
  # model.provider and the user sees "switched but nothing changed".
  if ! jq -e --arg p "$provider" '(.credential_pool // {}) | has($p)' "$auth_file" >/dev/null 2>&1; then
    fail "$E_NOT_FOUND" "provider '$provider' not in profile '$profile' credential_pool — sign in with it first"
  fi

  local hermes_bin="${TYPE_BIN[hermes]}"
  if [[ ! -x "$hermes_bin" ]]; then
    fail "$E_NOT_INSTALLED" "hermes not installed at $hermes_bin"
  fi
  # Every hermes call below runs as `claude` against the profile's HERMES_HOME.
  local -a hermes=(sudo -u claude -H env "HERMES_HOME=${hermes_home}" "$hermes_bin")

  step "Pinning hermes model.provider=${provider} on profile '${profile}'"
  if ! "${hermes[@]}" config set model.provider "$provider" >&2; then
    fail "$E_GENERIC" "hermes config set model.provider=$provider failed"
  fi
  # Blank base_url so hermes re-resolves it from its provider catalog; a value
  # left over from the previous provider would pin the gateway at the wrong
  # endpoint. Best-effort: failure here is ignored.
  "${hermes[@]}" config set model.base_url "" >&2 2>/dev/null || true
  local default_model="${HERMES_PROVIDER_MODEL[$provider]:-}"
  if [[ -n "$default_model" ]]; then
    "${hermes[@]}" config set model.default "$default_model" >&2 \
      || warn "hermes config set model.default=$default_model failed"
  fi

  # Bounce every agent bound to this profile so its start-hook seeds the new
  # config.yaml and restarts the gateway (same loop shape as cmd_auth_set).
  local bound unit_agent
  bound=$(registry_read | jq -r --arg p "$profile" \
    '.agents | to_entries[] | select(.value.authProfile == $p) | .key')
  while IFS= read -r unit_agent; do
    if [[ -z "$unit_agent" ]]; then
      continue
    fi
    step "Restarting 5dive-agent@${unit_agent}.service"
    systemctl restart "5dive-agent@${unit_agent}.service" >&2 \
      || warn "restart of agent '$unit_agent' failed — check journalctl -u 5dive-agent@${unit_agent}"
  done <<<"$bound"

  ok "active provider set to '$provider' on profile '$profile'" \
     '{profile:$p, type:$t, provider:$pr}' \
     --arg p "$profile" --arg t "$type" --arg pr "$provider"
}

# -------- agent CRUD --------

# `agent list` — print every registered agent with its registry fields plus
# live per-agent state: systemd is-active / is-enabled, bot-to-bot flag, and the
# configured model / reasoning effort.
# Output: JSON_MODE -> `{ok:true,data:[...]}` on stdout; otherwise an aligned
# table (or "no agents"). Reads DEFAULT_WORKDIR as the workdir fallback.
cmd_list() {
  ensure_state
  local reg
  reg=$(registry_read)
  # Agent names, one per array element (no word-splitting of the key list).
  local -a names=()
  mapfile -t names < <(jq -r '.agents | keys[]' <<<"$reg" 2>/dev/null)
  # Enrich with live systemd state.
  local enriched="{}"
  local name svc active sub b2b ltype lchan lsd amodel aeffort
  # `${names[@]+...}`: expands to nothing for an empty array without tripping
  # `set -u` on older bash.
  for name in ${names[@]+"${names[@]}"}; do
    [[ -n "$name" ]] || continue
    svc="5dive-agent@${name}"
    active=$(systemctl is-active "$svc" 2>/dev/null || true)
    sub=$(systemctl is-enabled "$svc" 2>/dev/null || true)
    # Surface bot-to-bot status (DIVE-161) so the dashboard can flag which agents
    # can message bots outside the team — without N per-agent access fetches.
    # It lives in the agent's access.json, not the registry; read it here (root).
    b2b="false"
    ltype=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
    lchan=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
    if [[ "$lchan" == "telegram" ]]; then
      lsd=$(_tg_access_state_dir "agent-${name}" "$ltype" 2>/dev/null || echo "")
      if [[ -n "$lsd" && -f "$lsd/access.json" ]]; then
        b2b=$(jq -r '.botToBot.enabled // false' "$lsd/access.json" 2>/dev/null || echo "false")
      fi
    fi
    # Collapse anything that is not literally "true" to "false". jq exits 0 with
    # NO output on an empty access.json (and prints arbitrary text for a
    # non-boolean value); feeding that to `--argjson b2b` below fails with
    # "invalid JSON text" and, under `set -e`, aborts the whole list.
    [[ "$b2b" == "true" ]] || b2b="false"
    # Surface the configured model + reasoning effort (DIVE-211) so the dashboard
    # can render a per-row model badge + picker without an N×`agent info` fan-out.
    # Same best-effort reads `info` uses; empty -> null (model unset / non-claude
    # effort). Two extra per-agent file reads, in line with the systemctl + b2b
    # reads this loop already does.
    # `|| true`: belt-and-suspenders with the resolvers' own exit-0 contract so a
    # best-effort per-agent config read can never abort the whole list under
    # `set -e` (DIVE-230).
    amodel=$(resolve_agent_model "$ltype" "$name" || true)
    aeffort=$(resolve_agent_effort "$ltype" "$name" || true)
    enriched=$(jq -c --arg n "$name" --arg a "$active" --arg e "$sub" --argjson b2b "$b2b" \
      --arg model "$amodel" --arg effort "$aeffort" \
      '.[$n] = {active: $a, enabled: $e, botToBotEnabled: $b2b,
                model: (if $model == "" then null else $model end),
                effort: (if $effort == "" then null else $effort end)}' <<<"$enriched")
  done
  # Join registry fields with the live map collected above; agents missing from
  # the live map fall back to "unknown"/false/null.
  local merged
  merged=$(jq -c --arg default_wd "$DEFAULT_WORKDIR" --argjson live "$enriched" '.agents | to_entries | map({
    name: .key,
    type: .value.type,
    channels: .value.channels,
    workdir: (.value.workdir // $default_wd),
    authProfile: (.value.authProfile // null),
    botUsername: (.value.botUsername // null),
    isolation: (.value.isolation // "admin"),
    heartbeat: (.value.heartbeat // null),
    createdAt: .value.createdAt,
    active: ($live[.key].active // "unknown"),
    enabled: ($live[.key].enabled // "unknown"),
    botToBotEnabled: ($live[.key].botToBotEnabled // false),
    model: ($live[.key].model // null),
    effort: ($live[.key].effort // null)
  })' <<<"$reg")
  if (( JSON_MODE )); then
    jq -c '{ok:true, data: .}' <<<"$merged"
  else
    jq -r '
      if length == 0 then "no agents" else
        (["NAME","TYPE","CHANNELS","PROFILE","ACTIVE","ENABLED"] | @tsv),
        (.[] | [(.name + (if (.heartbeat.enabled // false) then " ∿" + ((.heartbeat.everyMin // 30)|tostring) + "m" else "" end)), .type, .channels, (.authProfile // "-"), .active, .enabled] | @tsv)
      end' <<<"$merged" | column -t -s $'\t'
  fi
}

# resolve_cli_version <type> — print the first line of `<TYPE_BIN[type]>
# --version`, or nothing when the type has no binary, the probe fails, or it
# does not answer in time. Always exits 0. The probe runs as `claude` (owner of
# the binaries and their caches) inside a login shell so node/nvm-based CLIs
# (codex) get their PATH, and is capped at 5s so a wedged CLI can't hang `info`.
resolve_cli_version() {
  local type="$1"
  local cli_path="${TYPE_BIN[$type]:-}"
  if [[ -z "$cli_path" ]]; then
    printf ''
    return
  fi
  # %q shell-quotes the path because it is re-parsed by the inner `bash -lc`.
  local probe
  probe="$(printf '%q' "$cli_path") --version 2>/dev/null | head -1"
  if ! timeout 5 sudo -u claude bash -lc "$probe" 2>/dev/null; then
    printf ''
  fi
}

# resolve_agent_model <type> <name> — print the model the agent is configured
# with, read from the per-type runtime config under /home/agent-<name>
# (codex/grok: config.toml; claude/antigravity: settings.json). Prints nothing
# when the runtime persists no model (grok/antigravity then use the CLI's
# built-in pick) or the type is unknown, so callers should render "—"/null
# rather than treat empty as an error.
resolve_agent_model() {
  local type="$1" name="$2"
  local home="/home/agent-${name}"
  # Pick the config file first; a non-empty json_query means "JSON file, read it
  # with jq", an empty one means "TOML file, scrape it with sed".
  local cfg json_query=""
  case "$type" in
    claude)
      cfg="$home/.claude/settings.json"
      json_query='.model // empty' ;;
    antigravity)
      cfg="$home/.gemini/antigravity-cli/settings.json"
      json_query='.model // .selectedModel // empty' ;;
    codex|grok)
      cfg="$home/.${type}/config.toml" ;;
    *)
      printf ''
      return 0 ;;
  esac
  # Exit-0 contract: callers capture this with `$(…)` under the bundle's
  # `set -e`, so a missing/unreadable config must yield "" and status 0, never a
  # failure. A `--defer-auth` antigravity agent has no settings.json before its
  # first boot, and that used to crash `agent list`/`info` mid-build (DIVE-230).
  # Hence `|| true` on every read — the sed|head pipeline too, because under
  # `pipefail` a missing config.toml propagates sed's non-zero status.
  if [[ -n "$json_query" ]]; then
    sudo jq -r "$json_query" "$cfg" 2>/dev/null || true
  else
    { sudo sed -nE 's/^[[:space:]]*model[[:space:]]*=[[:space:]]*"?([^"#]*[^"# ])"?.*/\1/p' \
      "$cfg" 2>/dev/null | head -1; } || true
  fi
}

# resolve_agent_effort <type> <name> — print the agent's configured reasoning
# effort. Only claude has one (`effortLevel` in its settings.json); every other
# type, and a claude agent with the key unset or the file unreadable, prints
# nothing (Claude Code then uses its built-in default). Always exits 0, so
# callers render "—"/null rather than treat empty as an error.
resolve_agent_effort() {
  local type="$1" name="$2"
  if [[ "$type" != "claude" ]]; then
    printf ''
    return 0
  fi
  sudo jq -r '.effortLevel // empty' "/home/agent-${name}/.claude/settings.json" 2>/dev/null || true
}

# write_runtime_model <type> <name> <model>
# Write the selected model into the per-type runtime config the CLI loads, so
# `config set model=` is the single uniform path the forks' /model shells out to
# (replacing each plugin's own per-runtime config write). TOML (codex/grok) and
# JSON (claude/antigravity) are handled distinctly:
#   - TOML: split at the first table header; replace an existing top-level
#     `model = ...` in the preamble, else prepend one above the first [table] —
#     so the key stays document-root-level, never binds to a [section] and never
#     duplicates (matches telegram-{codex,grok} writeConfigModel()). The value
#     is written as an escaped TOML basic string.
#   - JSON: merge-write the top-level `.model` key, preserving every other key.
# The runtime config must already exist (every provisioned+started agent has
# one) — we refuse to create it, both because a bare new file would drop the
# other required settings and because pre-seeding codex's config.toml would make
# 5dive-agent-start skip its approval_policy/sandbox baseline. Atomic (tmp +
# rename); the existing owner:group is carried over and the mode is set to 600.
# Fails with E_VALIDATION for a type with no model config, E_NOT_FOUND when the
# config file is missing, E_GENERIC when the rewrite itself fails.
write_runtime_model() {
  local type="$1" name="$2" model="$3"
  local home="/home/agent-${name}" file fmt
  case "$type" in
    claude)      file="$home/.claude/settings.json"; fmt=json ;;
    codex)       file="$home/.codex/config.toml";     fmt=toml ;;
    grok)        file="$home/.grok/config.toml";       fmt=toml ;;
    antigravity) file="$home/.gemini/antigravity-cli/settings.json"; fmt=json ;;
    *) fail "$E_VALIDATION" "type '$type' has no model config (can't set model=)" ;;
  esac
  [[ -f "$file" ]] \
    || fail "$E_NOT_FOUND" "no $type runtime config at $file yet — start agent '$name' once before setting model"
  local dir own
  dir=$(dirname "$file")
  own=$(stat -c '%U:%G' "$file")
  # The temp file lives next to the target so the final mv is a same-filesystem
  # rename.
  local tmp
  tmp=$(mktemp -p "$dir" .model.XXXXXX) || fail "$E_GENERIC" "mktemp failed in $dir"
  # Values travel via the environment (not interpolated into the program text),
  # and the heredoc delimiter is quoted so the shell expands nothing inside it.
  if ! MODEL_FMT="$fmt" MODEL_VAL="$model" MODEL_SRC="$file" python3 - "$tmp" <<'PY'
import os, sys, json, re
fmt, val, src, tmp = os.environ["MODEL_FMT"], os.environ["MODEL_VAL"], os.environ["MODEL_SRC"], sys.argv[1]
with open(src) as f: orig = f.read()
if fmt == "json":
    try:
        data = json.loads(orig) if orig.strip() else {}
    except ValueError:
        sys.stderr.write("existing %s is not valid JSON\n" % src); sys.exit(3)
    if not isinstance(data, dict):
        sys.stderr.write("existing %s is not a JSON object\n" % src); sys.exit(3)
    data["model"] = val
    out = json.dumps(data, indent=2) + "\n"
else:  # toml — only ever touch the preamble before the first [table] header
    m = re.search(r'^\s*\[', orig, re.M)
    head = orig if m is None else orig[:m.start()]
    tail = "" if m is None else orig[m.start():]
    # Encode the value as a TOML basic string. JSON string escapes (\" \\ \n \t
    # \uXXXX ...) are all valid TOML escapes; DEL is the one control character
    # json.dumps leaves raw, so escape it by hand. Plain model names come out
    # exactly as before: model = "name".
    quoted = json.dumps(val, ensure_ascii=False).replace("\x7f", "\\u007F")
    line = "model = " + quoted
    pattern = r'^[ \t]*model[ \t]*=.*$'
    if re.search(pattern, head, re.M):
        # Callable replacement: a plain string would be parsed as a template and
        # its backslashes treated as escapes / group references.
        head = re.sub(pattern, lambda _m: line, head, count=1, flags=re.M)
    else:
        head = line + "\n" + head
    out = head + tail
with open(tmp, "w") as f: f.write(out)
PY
  then
    rm -f "$tmp"; fail "$E_GENERIC" "failed to write model into $file"
  fi
  # chown is best-effort; the mode is forced to 600 before the rename so the
  # file is never visible with looser permissions.
  chown "$own" "$tmp" 2>/dev/null || true
  chmod 600 "$tmp"
  mv -f "$tmp" "$file"
}

# write_runtime_effort <name> <effort>
# Store the reasoning effort as `effortLevel` in the agent's claude
# settings.json — the key Claude Code reads and the telegram plugin's /effort
# writes. Claude-only; other types have no effort knob. Follows the same
# merge-write contract as write_runtime_model: every other key is kept, a
# missing file is refused rather than created (E_NOT_FOUND), and the result is
# swapped in via tmp + rename with the original owner:group and mode 600.
write_runtime_effort() {
  local name="$1" effort="$2"
  local file="/home/agent-${name}/.claude/settings.json"
  if [[ ! -f "$file" ]]; then
    fail "$E_NOT_FOUND" "no claude runtime config at $file yet — start agent '$name' once before setting effort"
  fi
  local dir own tmp
  dir=$(dirname "$file")
  own=$(stat -c '%U:%G' "$file")
  if ! tmp=$(mktemp -p "$dir" .effort.XXXXXX); then
    fail "$E_GENERIC" "mktemp failed in $dir"
  fi
  # Quoted heredoc delimiter: nothing inside is shell-expanded; the value and
  # source path reach Python through the environment.
  if ! EFFORT_VAL="$effort" EFFORT_SRC="$file" python3 - "$tmp" <<'PY'
import json
import os
import sys

level = os.environ["EFFORT_VAL"]
src = os.environ["EFFORT_SRC"]
dest = sys.argv[1]


def bail(why):
    sys.stderr.write("existing %s is %s\n" % (src, why))
    sys.exit(3)


with open(src) as fh:
    raw = fh.read()

settings = {}
if raw.strip():
    try:
        settings = json.loads(raw)
    except ValueError:
        bail("not valid JSON")
if not isinstance(settings, dict):
    bail("not a JSON object")

settings["effortLevel"] = level
with open(dest, "w") as fh:
    fh.write(json.dumps(settings, indent=2) + "\n")
PY
  then
    rm -f "$tmp"
    fail "$E_GENERIC" "failed to write effortLevel into $file"
  fi
  chown "$own" "$tmp" 2>/dev/null || true
  chmod 600 "$tmp"
  mv -f "$tmp" "$file"
}

# `agent info <name> [--json]` — one agent's registry identity/config and live
# systemd state, plus the resolved coding-CLI version, model and effort. Exists
# so each fork's /status reads a single uniform source (cliName / cliVersion /
# model) rather than poking at each runtime's config: version and model live in
# different files per type and the binaries aren't on the agent user's PATH.
cmd_info() {
  ensure_state
  # Exactly one positional (the agent name); any dash-prefixed argument is
  # rejected as an unknown flag.
  local name="" arg
  for arg in "$@"; do
    if [[ "$arg" == -* ]]; then
      fail "$E_USAGE" "unknown flag: $arg"
    elif [[ -z "$name" ]]; then
      name="$arg"
    else
      fail "$E_USAGE" "extra arg: $arg"
    fi
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent info <name> [--json]"
  fi
  require_agent "$name"

  local reg unit="5dive-agent@${name}"
  reg=$(registry_read)

  local active enabled type
  active=$(systemctl is-active "$unit" 2>/dev/null || true)
  enabled=$(systemctl is-enabled "$unit" 2>/dev/null || true)
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")

  local cli_version model effort
  cli_version=$(resolve_cli_version "$type")
  # `|| true`: these are best-effort config reads; an absent file (e.g. a
  # --defer-auth agy agent before its first boot — DIVE-230) must not abort
  # `info` under `set -e`.
  model=$(resolve_agent_model "$type" "$name" || true)
  effort=$(resolve_agent_effort "$type" "$name" || true)

  # Empty strings for version/model/effort become JSON null in the detail object.
  local -a jq_vars=(
    --arg n "$name"
    --arg default_wd "$DEFAULT_WORKDIR"
    --arg active "${active:-unknown}"
    --arg enabled "${enabled:-unknown}"
    --arg cliName "$type"
    --arg cliVersion "$cli_version"
    --arg model "$model"
    --arg effort "$effort"
  )
  local detail
  detail=$(jq -c "${jq_vars[@]}" \
    '.agents[$n] as $a | {
      name: $n,
      type: $a.type,
      channels: ($a.channels // "none"),
      workdir: ($a.workdir // $default_wd),
      authProfile: ($a.authProfile // null),
      botUsername: ($a.botUsername // null),
      isolation: ($a.isolation // "admin"),
      heartbeat: ($a.heartbeat // null),
      createdAt: $a.createdAt,
      active: $active,
      enabled: $enabled,
      cliName: $cliName,
      cliVersion: (if $cliVersion == "" then null else $cliVersion end),
      model: (if $model == "" then null else $model end),
      effort: (if $effort == "" then null else $effort end)
    }' <<<"$reg")

  if (( JSON_MODE )); then
    jq -cn --argjson d "$detail" '{ok:true, data:$d}'
    return
  fi
  jq -r '
      "name:        \(.name)",
      "type:        \(.type)",
      "cli:         \(.cliName) \(.cliVersion // "unknown")",
      "model:       \(.model // "—")\(if .effort then " · effort \(.effort)" else "" end)",
      "channels:    \(.channels)\(if .botUsername then " (@\(.botUsername))" else "" end)",
      "profile:     \(.authProfile // "-")",
      "workdir:     \(.workdir)",
      "isolation:   \(.isolation)",
      "state:       \(.active) / \(.enabled)",
      "created:     \(.createdAt // "unknown")"
    ' <<<"$detail"
}

# create_agent_user <name> [isolation=admin] — ensure the Linux user
# `agent-<name>` exists and carries the groups (and, for admin, the sudoers
# entry) its isolation level calls for.
create_agent_user() {
  local name="$1" isolation="${2:-admin}"
  local user="agent-${name}"
  # Create the account only when it is not there yet.
  id -u "$user" &>/dev/null \
    || adduser --disabled-password --gecos "" "$user" >/dev/null
  # sandboxed stays out of the claude group (no shared workspace access);
  # admin/standard join it. Everyone gets systemd-journal.
  local groups
  case "$isolation" in
    sandboxed) groups="systemd-journal" ;;
    *)         groups="claude,systemd-journal" ;;
  esac
  usermod -aG "$groups" "$user"
  # Full passwordless sudo is reserved for admin.
  if [[ "$isolation" != "admin" ]]; then
    return 0
  fi
  local sudoers="/etc/sudoers.d/${user}"
  printf '%s ALL=(ALL) NOPASSWD: ALL\n' "$user" > "$sudoers"
  chmod 440 "$sudoers"
}

# delete_agent_user <name> — remove the Linux user `agent-<name>` (if it still
# exists) and its sudoers drop-in. Best-effort: a deluser failure is ignored.
delete_agent_user() {
  local name="$1"
  local user="agent-${name}"
  if id -u "$user" &>/dev/null; then
    # No --remove-home: the home directory is left in place so any per-agent
    # state the user may have in their $HOME survives. Home is minimal anyway
    # since configs live under /home/claude.
    deluser --quiet "$user" 2>/dev/null || true
  fi
  # Always drop the sudoers entry, even when the user is already gone —
  # otherwise a stale NOPASSWD grant would be inherited by a later agent that
  # reuses this name.
  rm -f "/etc/sudoers.d/${user}"
}

# write_agent_env <name> <type> <channels> [workdir] [profile] [isolation=admin]
# (Re)write ${ENV_DIR}/<name>.env, one KEY=value per line, then set it to
# root:claude, mode 640. AGENT_WORKDIR and AGENT_AUTH_PROFILE are only emitted
# when non-empty.
write_agent_env() {
  local name="$1" type="$2" channels="$3" workdir="${4:-}" profile="${5:-}" isolation="${6:-admin}"
  local env_file="${ENV_DIR}/${name}.env"
  local -a entries=(
    "AGENT_NAME=${name}"
    "AGENT_TYPE=${type}"
    "AGENT_CHANNELS=${channels}"
  )
  if [[ -n "$workdir" ]]; then
    entries+=("AGENT_WORKDIR=${workdir}")
  fi
  if [[ -n "$profile" ]]; then
    entries+=("AGENT_AUTH_PROFILE=${profile}")
  fi
  entries+=("AGENT_ISOLATION=${isolation}")
  # New telegram agents go through our 5dive-plugins fork (bundled hooks,
  # richer slash commands). 5dive-agent-start builds the runtime --channels arg
  # from this var and falls back to claude-plugins-official when it is unset,
  # so agents created before this change keep using the upstream plugin until
  # they are migrated by hand.
  if [[ "$channels" == "telegram" ]]; then
    entries+=("AGENT_CHANNEL_MARKETPLACE=5dive-plugins")
  fi
  printf '%s\n' "${entries[@]}" > "$env_file"
  chown root:claude "$env_file"
  chmod 640 "$env_file"
}

# link_agent_profile <name> [profile]
# Point /var/lib/5dive/agents.d/<name>-auth.env at the profile's combined.env
# (systemd picks it up via EnvironmentFile=-/var/lib/5dive/agents.d/%i-auth.env).
# Empty <profile> removes the link — agent falls back to the shared
# /etc/5dive/connectors/*.env files, same as before profiles existed.
# Fails with E_NOT_FOUND when the profile has no combined.env; in that case the
# agent's existing link is left exactly as it was.
link_agent_profile() {
  local name="$1" profile="${2:-}"
  local link="${ENV_DIR}/${name}-auth.env"
  if [[ -z "$profile" ]]; then
    rm -f "$link"
    return 0
  fi
  # Validate BEFORE touching the current link, so a typo'd or unconfigured
  # profile can't strip a working agent of its credentials on the way to failing.
  local target="${AUTH_PROFILES_DIR}/${profile}/combined.env"
  [[ -f "$target" ]] \
    || fail "$E_NOT_FOUND" "auth profile '$profile' not configured — run: sudo 5dive agent auth set <type> --api-key=... --auth-profile=$profile"
  rm -f "$link"
  ln -s "$target" "$link"
}

# apply_byo_provider <type> <canonical-provider> <api-key> [profile]
# Store a BYO (bring-your-own) API-key credential for an agent type by
# validating the inputs and handing off to the per-type writer. Called from
# cmd_create (--provider=<canonical> --api-key=<key>) and from cmd_auth_set
# (same flags, for agents that already exist).
#
# For hermes/openclaw the credential lands in the canonical state dir that
# 5dive-agent-start.sh seeds from at launch; the work runs as `claude` so the
# files are claude:claude, and the agent's start hook re-copies them into
# agent-<name>'s home with mode 0600.
#   - hermes: `hermes auth add <provider> --type api-key --api-key` writes
#     ~/.hermes/auth.json, with base_url auto-resolved from hermes' built-in
#     provider catalog.
#   - openclaw: no scriptable auth-add exists (paste-token needs a TTY), so
#     auth-profiles.json is written directly in the
#     {type:"api_key", provider, key} shape.
# Both binaries read these files at startup; cmd_auth_set restarts every agent
# bound to the profile so the seed loop in 5dive-agent-start.sh picks them up
# and bounces the hermes/openclaw gateway daemon.
#
# Fails with E_VALIDATION for an unknown provider, an implausible key, a
# provider the type has no native mapping for, or an unsupported type.
apply_byo_provider() {
  local type="$1" canonical="$2" api_key="$3" profile="${4:-}"
  if ! valid_byo_provider "$canonical"; then
    fail "$E_VALIDATION" "unknown provider '$canonical' (known: ${!BYO_PROVIDER_LABEL[*]})"
  fi
  if ! valid_api_key "$api_key"; then
    fail "$E_VALIDATION" "api key looks wrong (>=10 printable non-space chars)"
  fi
  # Translate the canonical provider id into the name this agent type uses.
  local native
  native=$(resolve_native_provider "$type" "$canonical")
  if [[ -z "$native" ]]; then
    fail "$E_VALIDATION" "$type does not support provider '$canonical' (${BYO_PROVIDER_LABEL[$canonical]})"
  fi

  if [[ "$type" == "hermes" ]]; then
    _apply_byo_hermes "$native" "$canonical" "$api_key" "$profile"
  elif [[ "$type" == "openclaw" ]]; then
    _apply_byo_openclaw "$native" "$canonical" "$api_key" "$profile"
  elif [[ "$type" == "claude" ]]; then
    # The claude writer takes no native provider name.
    _apply_byo_claude "$canonical" "$api_key" "$profile"
  else
    fail "$E_VALIDATION" "BYO provider not supported for type '$type' (only: hermes, openclaw, claude)"
  fi
}

# Claude (Claude Code) BYO custom-provider path. Unlike hermes/openclaw — which
# write native auth.json/auth-profiles.json — the claude harness reads its
# credentials and endpoint from the environment, so we upsert the override env
# vars into the auth-profile's combined.env. systemd loads that as
# EnvironmentFile=%i-auth.env *after* the shared anthropic.env (last-wins), so
# these override any default-account OAuth token that template otherwise leaks
# in. profile_set_var takes the value on stdin (keeps secrets out of argv).
_apply_byo_claude() {
  # Args: <canonical provider id> <api key> <auth-profile name (required)>.
  # Upserts the endpoint/token/model override vars into the profile through
  # profile_set_var (value on stdin). Fails with E_USAGE when no profile is
  # given and E_VALIDATION when the provider has no endpoint or an incomplete
  # model table.
  local canonical="$1" api_key="$2" profile="${3:-}"
  [[ -n "$profile" ]] \
    || fail "$E_USAGE" "claude BYO provider requires --auth-profile (custom-provider creds are profile-scoped)"
  local base_url="${CLAUDE_PROVIDER_BASEURL[$canonical]:-}"
  [[ -n "$base_url" ]] \
    || fail "$E_VALIDATION" "claude does not support provider '$canonical' (${BYO_PROVIDER_LABEL[$canonical]:-unknown}: no Anthropic-compatible endpoint)"

  # Check all three model mappings BEFORE the first write. Under `set -u` a
  # bare ${MAP[$canonical]} on a missing entry aborts mid-function, after the
  # base URL and token are already in the profile — a half-written profile
  # and a raw bash error instead of the fail envelope. `+set` only tests that
  # the entry exists, so an entry that is present but empty is still written
  # as before.
  [[ -n "${CLAUDE_PROVIDER_OPUS_MODEL[$canonical]+set}" \
     && -n "${CLAUDE_PROVIDER_SONNET_MODEL[$canonical]+set}" \
     && -n "${CLAUDE_PROVIDER_HAIKU_MODEL[$canonical]+set}" ]] \
    || fail "$E_VALIDATION" "claude provider '$canonical' has an incomplete opus/sonnet/haiku model mapping"
  local opus_model="${CLAUDE_PROVIDER_OPUS_MODEL[$canonical]:-}"
  local sonnet_model="${CLAUDE_PROVIDER_SONNET_MODEL[$canonical]:-}"
  local haiku_model="${CLAUDE_PROVIDER_HAIKU_MODEL[$canonical]:-}"

  step "Configuring claude BYO provider '$canonical' → ${base_url} (profile=$profile)"
  printf '%s' "$base_url"     | profile_set_var "$profile" ANTHROPIC_BASE_URL
  printf '%s' "$api_key"      | profile_set_var "$profile" ANTHROPIC_AUTH_TOKEN
  printf '%s' "$opus_model"   | profile_set_var "$profile" ANTHROPIC_DEFAULT_OPUS_MODEL
  printf '%s' "$sonnet_model" | profile_set_var "$profile" ANTHROPIC_DEFAULT_SONNET_MODEL
  printf '%s' "$haiku_model"  | profile_set_var "$profile" ANTHROPIC_DEFAULT_HAIKU_MODEL
  # Custom endpoints (esp. z.ai during peak hours) can be slow; raise the
  # client-side request timeout so long tool turns don't get cut off.
  printf '%s' "3000000" | profile_set_var "$profile" API_TIMEOUT_MS
  # Neutralize any shared-account creds the template's unconditional
  # anthropic.env EnvironmentFile= would inject ahead of our override —
  # combined.env loads last so empty values win, forcing the harness onto
  # ANTHROPIC_AUTH_TOKEN + ANTHROPIC_BASE_URL instead of OAuth-to-Anthropic.
  printf '%s' "" | profile_set_var "$profile" CLAUDE_CODE_OAUTH_TOKEN
  printf '%s' "" | profile_set_var "$profile" ANTHROPIC_API_KEY
}

# Write a hermes BYO API-key credential into HERMES_HOME (the shared
# /home/claude/.hermes, or the profile's hermes dir when <profile> is given),
# then point hermes' config at the chosen provider and default model.
#   $1 native     provider id as hermes spells it
#   $2 canonical  5dive's provider id (selects the moonshot path + model table)
#   $3 api_key    the secret
#   $4 profile    optional auth-profile name
# Fails with E_NOT_INSTALLED when the hermes binary is missing and E_GENERIC
# when the credential write fails. The `config set` follow-ups are best-effort
# and only warn.
_apply_byo_hermes() {
  local native="$1" canonical="$2" api_key="$3" profile="${4:-}"
  local bin="${TYPE_BIN[hermes]}"
  [[ -x "$bin" ]] || fail "$E_NOT_INSTALLED" "hermes not installed at $bin"

  # HERMES_HOME is the dir that contains auth.json/config.yaml directly —
  # `profile_type_dir` already returns that for profiled installs, matching
  # the path 5dive-agent-start.sh syncs from. Appending /.hermes here put
  # the credential one dir too deep and the per-agent seed silently no-op'd
  # (left every BYO-key hermes agent stuck on whatever auth was there at
  # create time). Default profile keeps writing to the shared dir.
  local hermes_home="/home/claude/.hermes"
  if [[ -n "$profile" ]]; then
    hermes_home="$(profile_type_dir "$profile" hermes)"
  fi

  # Hint appended to the model.provider warning; differs per credential path.
  local provider_hint
  if [[ "$canonical" == "moonshot" ]]; then
    # Kimi/Moonshot env-var path: hermes' Kimi provider reads KIMI_API_KEY from
    # ~/.hermes/.env at gateway startup; there is no `hermes auth add moonshot`
    # to populate auth.json. Write the env var into the shared dir (cmd_create
    # mirrors it into the agent-user's .env via seed_hermes_byo_env before the
    # gateway starts) and stamp a minimal auth.json so the cmd_create auth gate
    # (auth_creds_present → `-s ${TYPE_AUTH[hermes]}`) doesn't reject the agent
    # for "no credentials." `{}` is hermes' own pre-login shape.
    step "Writing hermes BYO credential for '$canonical' (KIMI_API_KEY → ${hermes_home}/.env)"
    install -d -m 0775 -o claude -g claude "$hermes_home"
    # The script goes in as an argument (it holds no secret) and the key goes
    # in on stdin, so the key never sits on the sudo/env command line where
    # `ps` could show it.
    local kimi_script
    kimi_script=$(cat <<'KIMI_ENV'
set -euo pipefail
KEY=$(cat)
[[ -n "$KEY" ]] || { echo "no key received on stdin" >&2; exit 1; }
ENV_FILE="$HERMES_HOME/.env"
touch "$ENV_FILE"
chmod 600 "$ENV_FILE"
TMP=$(mktemp --tmpdir="$HERMES_HOME" .env.XXXXXX)
# Never leave a key-bearing temp file behind if a later step fails.
trap 'rm -f -- "$TMP"' EXIT
chmod 600 "$TMP"
grep -v '^KIMI_API_KEY=' "$ENV_FILE" > "$TMP" || true
printf 'KIMI_API_KEY=%s\n' "$KEY" >> "$TMP"
mv "$TMP" "$ENV_FILE"
AUTH_FILE="$HERMES_HOME/auth.json"
if [[ ! -s "$AUTH_FILE" ]]; then
  printf '{}\n' > "$AUTH_FILE"
  chmod 600 "$AUTH_FILE"
fi
KIMI_ENV
    )
    if ! printf '%s' "$api_key" \
        | sudo -u claude -H env HERMES_HOME="$hermes_home" bash -c "$kimi_script" >&2; then
      fail "$E_GENERIC" "hermes BYO env write failed for moonshot"
    fi
    # A provider-pin failure is non-fatal on this path: KIMI_API_KEY is already
    # in .env and the user can pick the model via `5dive agent <name> tui`.
    # `kimi` is an alias on the upstream kimi-coding provider — see
    # plugins/model-providers/kimi-coding/__init__.py.
    provider_hint="user can pick the model in TUI"
  else
    step "Writing hermes BYO credential for '$canonical' (native id: $native)"
    # NOTE(review): --api-key puts the key in argv for the lifetime of this
    # call. Kept as-is because the documented hermes form needs the flag;
    # confirm whether hermes can take the key from stdin alone and drop it.
    printf '%s' "$api_key" | sudo -u claude -H env HERMES_HOME="$hermes_home" \
      "$bin" auth add "$native" --type api-key --api-key "$api_key" --label "${canonical}-byo" >&2 \
      || fail "$E_GENERIC" "hermes auth add $native failed"
    provider_hint="rerun: sudo -u claude -H $bin config set model.provider $native"
  fi

  # Point hermes at the chosen provider so first launch doesn't hit the
  # "Hermes isn't configured yet" prompt.
  sudo -u claude -H env HERMES_HOME="$hermes_home" \
    "$bin" config set model.provider "$native" >&2 \
    || warn "hermes config set model.provider=$native failed ($provider_hint)"
  # hermes auto-resolves model.base_url from its provider catalog when
  # model.base_url is unset — explicitly unset it so a stale openai-codex
  # value from a prior oauth login doesn't pin the agent to chatgpt.com.
  # This now runs on the moonshot path too. Best-effort: the CLI's stdout
  # goes to our stderr, its stderr is discarded, and failure is ignored.
  sudo -u claude -H env HERMES_HOME="$hermes_home" \
    "$bin" config set model.base_url "" >&2 2>/dev/null || true
  local model="${HERMES_PROVIDER_MODEL[$canonical]:-}"
  if [[ -n "$model" ]]; then
    sudo -u claude -H env HERMES_HOME="$hermes_home" \
      "$bin" config set model.default "$model" >&2 \
      || warn "hermes config set model.default=$model failed"
  fi
}

# Write openclaw's BYO credential as <base>/.openclaw/agents/main/agent/
# auth-profiles.json, where <base> is /home/claude or the profile's openclaw
# dir when <profile> is given, then best-effort set the default model.
#   $1 native     provider id as openclaw spells it
#   $2 canonical  5dive's provider id (selects the default model)
#   $3 api_key    the secret
#   $4 profile    optional auth-profile name
# The file is replaced wholesale with a single "<native>:manual" api_key
# profile. Fails with E_GENERIC when the file cannot be written.
_apply_byo_openclaw() {
  local native="$1" canonical="$2" api_key="$3" profile="${4:-}"
  local base="/home/claude"
  if [[ -n "$profile" ]]; then
    base="$(profile_type_dir "$profile" openclaw)"
    install -d -m 2750 -o claude -g claude "$base"
  fi
  local oc_dir="${base}/.openclaw/agents/main/agent"
  install -d -m 0750 -o claude -g claude \
    "${base}/.openclaw" \
    "${base}/.openclaw/agents" \
    "${base}/.openclaw/agents/main" \
    "$oc_dir"

  local profile_id="${native}:manual"
  local auth_file="${oc_dir}/auth-profiles.json"
  step "Writing openclaw BYO auth-profiles.json for '$canonical' (native id: $native)"
  # Stage in the target directory so the final mv is a same-filesystem rename.
  local tmp
  tmp=$(mktemp -p "$oc_dir" .auth-profiles.XXXXXX) \
    || fail "$E_GENERIC" "mktemp failed in $oc_dir"
  # The key reaches jq on stdin (-R -s turns the whole input into the string
  # `.`) instead of via --arg, so it never shows up in jq's argv / `ps`.
  printf '%s' "$api_key" \
    | jq -c -R -s --arg pid "$profile_id" --arg p "$native" \
        '{version:1, profiles:{($pid):{type:"api_key", provider:$p, key:.}}}' \
        > "$tmp" \
    || { rm -f "$tmp"; fail "$E_GENERIC" "failed to write $auth_file"; }
  # If ownership/mode/rename fails, remove the key-bearing temp file rather
  # than leaving it in the state dir.
  if ! { chown claude:claude "$tmp" && chmod 0600 "$tmp" && mv "$tmp" "$auth_file"; }; then
    rm -f "$tmp"
    fail "$E_GENERIC" "failed to write $auth_file"
  fi

  # Default model lands in openclaw.json's agents.defaults.model.primary;
  # 5dive-agent-start.sh syncs it from the shared/profile copy into the
  # per-agent openclaw.json on every launch.
  local model="${OPENCLAW_PROVIDER_MODEL[$canonical]:-}"
  if [[ -n "$model" ]]; then
    local openclaw_bin="${TYPE_BIN[openclaw]}"
    sudo -u claude -H env HOME="$base" "$openclaw_bin" \
      config set agents.defaults.model.primary "$model" >&2 \
      || warn "openclaw config set agents.defaults.model.primary=$model failed"
  fi
}

cmd_create() {
  local name="" type="" channels="none" telegram_token="" discord_token="" workdir="" profile=""
  local telegram_home_channel="" telegram_allowed_users=""
  local byo_provider="" byo_api_key=""
  local skills_arg="" skills_set=0 no_skills=0 defer_auth=0
  local isolation="admin" no_team_bot=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --type=*)                    type="${1#--type=}" ;;
      --channels=*)                channels="${1#--channels=}" ;;
      --telegram-token=*)          telegram_token="${1#--telegram-token=}" ;;
      --telegram-home-channel=*)   telegram_home_channel="${1#--telegram-home-channel=}" ;;
      --telegram-allowed-users=*)  telegram_allowed_users="${1#--telegram-allowed-users=}" ;;
      --discord-token=*)           discord_token="${1#--discord-token=}" ;;
      --workdir=*)                 workdir="${1#--workdir=}" ;;
      --auth-profile=*)            profile="${1#--auth-profile=}" ;;
      --provider=*)                byo_provider="${1#--provider=}" ;;
      --api-key=*)                 byo_api_key="${1#--api-key=}" ;;
      --with-skills=*)             skills_arg="${1#--with-skills=}"; skills_set=1 ;;
      --no-skills)                 no_skills=1 ;;
      --no-team-bot)               no_team_bot=1 ;;
      --defer-auth)                defer_auth=1 ;;
      --isolation=*)               isolation="${1#--isolation=}" ;;
      -*)                          fail "$E_USAGE" "unknown flag: $1" ;;
      *)                           [[ -z "$name" ]] && name="$1" || fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent create <name> --type=<type> [--channels=none|telegram|discord] [--telegram-token=<token>] [--telegram-home-channel=<id>] [--telegram-allowed-users=<csv>] [--workdir=<path>] [--auth-profile=<name>] [--provider=<id> --api-key=<key|->] [--with-skills=<spec>[,...]] [--no-skills] [--no-team-bot] [--defer-auth] [--isolation=admin|standard|sandboxed]"
  [[ -n "$type" ]] || fail "$E_USAGE" "--type is required"
  valid_name "$name" || fail "$E_VALIDATION" "invalid name (lowercase letters/digits/hyphens, start letter, <=16 chars)"
  is_known_type "$type" || fail "$E_NOT_FOUND" "unknown type: $type (known: ${!TYPE_BIN[*]})"
  valid_channel "$channels" || fail "$E_VALIDATION" "invalid channels: $channels (none|telegram|discord)"
  valid_isolation "$isolation" || fail "$E_VALIDATION" "invalid --isolation (admin|standard|sandboxed)"
  if [[ -n "$workdir" ]]; then
    valid_workdir "$workdir" \
      || fail "$E_VALIDATION" "invalid --workdir (absolute path, allowed chars: letters/digits/._-/)"
  fi
  if [[ -n "$profile" ]]; then
    valid_profile_name "$profile" \
      || fail "$E_VALIDATION" "invalid --auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
    if (( defer_auth )) || [[ -n "$byo_provider" ]]; then
      # "Set up later" path: the dashboard binds an auto-derived profile
      # (the slug) at create time before any auth has happened, so the
      # profile dir legitimately doesn't exist yet. Pre-create it now with
      # an empty combined.env so link_agent_profile's symlink target is
      # present (systemd's EnvironmentFile= loads the empty file as a
      # no-op) and the per-type *_HOME redirect (driven by
      # AGENT_AUTH_PROFILE in the unit env file) has a target dir for
      # first-run onboarding to write creds into.
      # Same treatment for the BYO API-key path: the dashboard's "fresh
      # account + paste key" flow names a new profile that doesn't exist
      # yet — apply_byo_provider populates the per-type dir below, and the
      # post-create auth gate accepts that as proof of auth.
      ensure_profile_dir "$profile" >/dev/null
    else
      # Non-defer path keeps the fail-fast check so a typo'd profile
      # name doesn't survive into agent state.
      [[ -f "${AUTH_PROFILES_DIR}/${profile}/combined.env" ]] \
        || fail "$E_NOT_FOUND" "auth profile '$profile' not configured — run: sudo 5dive agent auth set $type --api-key=... --auth-profile=$profile"
    fi
  fi

  if [[ "$channels" != "none" ]] && [[ "${TYPE_CHANNELS[$type]}" != "1" ]]; then
    fail "$E_VALIDATION" "type '$type' does not support channels (only: claude, codex, grok, antigravity, opencode, openclaw, hermes)"
  fi
  # codex + grok + antigravity + opencode ship a telegram bridge only — no discord build yet.
  if [[ ( "$type" == "codex" || "$type" == "grok" || "$type" == "antigravity" || "$type" == "opencode" ) && "$channels" == "discord" ]]; then
    fail "$E_VALIDATION" "type '$type' supports --channels=telegram only (no discord build)"
  fi

  # BYO API-key path for hermes/openclaw (--provider=<canonical> + --api-key=<key|->).
  # Mutually exclusive with --defer-auth: BYO is the alternative to "I'll sign in
  # later", not an add-on. The key sentinel "-" reads from stdin so the value
  # never appears in argv (and thus never in `ps`).
  if [[ -n "$byo_provider" || -n "$byo_api_key" ]]; then
    [[ "$type" == "hermes" || "$type" == "openclaw" || "$type" == "claude" ]] \
      || fail "$E_VALIDATION" "--provider/--api-key only supported for hermes/openclaw/claude (got: $type)"
    # claude BYO points the harness at an Anthropic-compatible third-party
    # endpoint and stores the override env vars in the auth-profile's
    # combined.env — so it requires a profile to scope the creds to this agent
    # (otherwise the override would have to live in the shared default
    # connector and bleed into every other claude agent).
    [[ "$type" == "claude" && -z "$profile" ]] \
      && fail "$E_USAGE" "claude BYO (--provider) requires --auth-profile=<name> (custom-provider creds are profile-scoped)"
    [[ -n "$byo_provider" && -n "$byo_api_key" ]] \
      || fail "$E_USAGE" "--provider and --api-key must be passed together"
    (( defer_auth )) \
      && fail "$E_USAGE" "--defer-auth and --provider/--api-key are mutually exclusive"
    valid_byo_provider "$byo_provider" \
      || fail "$E_VALIDATION" "unknown provider '$byo_provider' (known: ${!BYO_PROVIDER_LABEL[*]})"
    local _native
    _native=$(resolve_native_provider "$type" "$byo_provider")
    [[ -n "$_native" ]] \
      || fail "$E_VALIDATION" "$type does not support provider '$byo_provider'"
    if [[ "$byo_api_key" == "-" ]]; then
      [[ -t 0 ]] && fail "$E_USAGE" "--api-key=- expects the key on stdin, stdin is a TTY"
      byo_api_key=$(cat)
    fi
    valid_api_key "$byo_api_key" \
      || fail "$E_VALIDATION" "api key looks wrong (expected >=10 printable non-space chars)"
  fi

  # Resolve --with-skills. Default policy: when this create call is being made
  # by another agent (SUDO_USER=agent-*), preinstall the 5dive-cli skill so
  # the new agent inherits inter-agent comms knowledge — applies to every
  # supported type, since the skills CLI handles per-type install paths via
  # --agent (see SKILLS_AGENT_ID above). Humans creating from the dashboard
  # get no skills by default — they typically don't need the recursion story
  # and the skill is just context noise. --no-skills opts out of the default;
  # --with-skills="" also opts out.
  local -a skills_specs=()
  if (( no_skills )); then
    :
  elif (( skills_set )); then
    if [[ -n "$skills_arg" ]]; then
      IFS=',' read -r -a skills_specs <<<"$skills_arg"
    fi
  else
    if [[ "${SUDO_USER:-}" == agent-* ]]; then
      skills_specs=("5dive-cli")
    fi
  fi
  # Validate every spec up front so we fail before adduser/registry mutation
  # on bad input. Empty entries (trailing comma) are skipped.
  local -a skills_resolved=()
  local s pair src sk
  for s in "${skills_specs[@]+"${skills_specs[@]}"}"; do
    [[ -z "$s" ]] && continue
    pair=$(parse_skill_spec "$s")
    src="${pair% *}"
    sk="${pair#* }"
    valid_skill_source "$src" \
      || fail "$E_VALIDATION" "invalid --with-skills source in '$s' (expected owner/repo, got '$src')"
    valid_skill_id "$sk" \
      || fail "$E_VALIDATION" "invalid --with-skills id in '$s' (got '$sk')"
    skills_resolved+=("${src}:${sk}")
  done

  # Telegram/Discord need their own bot/app token per agent — two agents can't
  # share a bot (both would call getUpdates and race each other). Require the
  # token at create time so the plugin doesn't spin up with empty creds.
  if [[ "$channels" == "telegram" ]]; then
    if [[ -z "$telegram_token" ]]; then
      telegram_token=$(prompt_secret "Telegram bot token for agent '$name'") \
        || fail "$E_USAGE" "--channels=telegram requires --telegram-token=<token> (or run interactively to be prompted)"
    fi
    valid_telegram_token "$telegram_token" \
      || fail "$E_VALIDATION" "telegram token format looks wrong (expected <digits>:<20+ chars>)"
    if [[ -n "$telegram_home_channel" ]]; then
      valid_telegram_chat_id "$telegram_home_channel" \
        || fail "$E_VALIDATION" "invalid --telegram-home-channel (numeric chat id, optionally negative)"
    fi
    if [[ -n "$telegram_allowed_users" ]]; then
      valid_telegram_chat_id_list "$telegram_allowed_users" \
        || fail "$E_VALIDATION" "invalid --telegram-allowed-users (comma-separated numeric ids)"
    fi
  fi
  if [[ "$channels" == "discord" ]]; then
    [[ -n "$discord_token" ]] \
      || fail "$E_USAGE" "--channels=discord requires --discord-token=<token>"
  fi

  ensure_state
  local reg
  reg=$(registry_read)
  if jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null; then
    fail "$E_CONFLICT" "agent '$name' already exists"
  fi

  # Install-on-demand: if the requested CLI isn't on disk, try the recipe.
  if [[ ! -x "${TYPE_BIN[$type]}" ]]; then
    if [[ -n "${TYPE_INSTALL[$type]:-}" ]]; then
      step "$type not installed — installing now"
      # cmd_install emits its own ok/fail; we want install output on stderr
      # (progress) so flip JSON_MODE off for the nested call and restore.
      local prev_json="$JSON_MODE"
      JSON_MODE=0
      cmd_install "$type" >&2
      JSON_MODE="$prev_json"
    else
      fail "$E_NOT_INSTALLED" "$type is not installed and no installer is configured (expected at ${TYPE_BIN[$type]})"
    fi
  fi

  # BYO API-key path: write the credential into the shared (or profile-scoped)
  # state dir before the auth gate runs — auth_status_one then sees the
  # sentinel and lets create proceed without falling back to "needs login".
  # Must come after the install-on-demand block so the agent CLI exists
  # when apply_byo_provider shells out to `hermes auth add`.
  if [[ -n "$byo_provider" ]]; then
    apply_byo_provider "$type" "$byo_provider" "$byo_api_key" "$profile"
  fi

  # Don't create an agent that can't log in. When an auth-profile is named,
  # accept either the profile's combined.env (api-key path / claude OAuth, which
  # promote tokens via profile_set_var) or the per-type credential file written
  # by the device-code flow (codex/hermes/openclaw write only auth.json /
  # auth-profiles.json — combined.env stays empty). Skip the
  # live probe here: a slow API blip shouldn't block `agent create`.
  # --defer-auth bypasses the gate: the caller (typically the dashboard's "Set
  # up later" wizard option) is opting to finish authentication inside the
  # agent's first-run UI on tmux attach.
  if (( defer_auth )); then
    :
  elif [[ -n "$profile" ]]; then
    local _profile_authed=0
    if [[ -s "${AUTH_PROFILES_DIR}/${profile}/combined.env" ]]; then
      _profile_authed=1
    else
      local _profile_auth_path
      _profile_auth_path=$(profile_type_auth_path "$profile" "$type" 2>/dev/null) || true
      [[ -n "$_profile_auth_path" && -s "$_profile_auth_path" ]] && _profile_authed=1
    fi
    (( _profile_authed )) \
      || fail "$E_AUTH_REQUIRED" "auth profile '$profile' is empty — run: sudo 5dive agent auth login $type --auth-profile=$profile (or: sudo 5dive agent auth set $type --api-key=... --auth-profile=$profile)"
  else
    local auth
    auth=$(auth_status_one "$type" --no-probe)
    if [[ "$auth" != "ok" ]]; then
      fail "$E_AUTH_REQUIRED" "$type is not authenticated ($auth) — run: sudo 5dive agent auth login $type (or: sudo 5dive agent auth set $type --api-key=<key>)"
    fi
  fi

  step "Creating user agent-${name}"
  create_agent_user "$name" "$isolation"

  if [[ "$isolation" == "sandboxed" ]]; then
    step "Applying sandbox resource limits for agent-${name}"
    local dropin_dir="/etc/systemd/system/5dive-agent@${name}.service.d"
    mkdir -p "$dropin_dir"
    printf '[Service]\nMemoryMax=512M\nCPUQuota=50%%\n' > "${dropin_dir}/isolation.conf"
    chmod 644 "${dropin_dir}/isolation.conf"
  fi

  # claude needs the onboarding preseed + settings the channel user got at
  # provision time — otherwise first run hits the theme picker / trust dialog
  # inside tmux. hermes/openclaw don't read ~/.claude (they have their own
  # state dirs), so preseeding it for them is just dead weight; their first-
  # run prompts are handled by their own CLIs.
  if [[ "$type" == "claude" ]]; then
    step "Preseeding claude config for agent-${name}"
    preseed_claude_agent "$name" "$channels"
  elif [[ "$type" == "antigravity" ]]; then
    # antigravity needs no claude-style ~/.claude preseed (agy reads its own
    # ~/.gemini state). The default-skill seed every other type gets isn't
    # folded into a channel installer for the no-channel case, so run it
    # here unconditionally. When channels=telegram, install_channel_for_agent
    # (routed below) wires the bot token + access.json + notify-user skill on
    # top — the find-skills/5dive-cli seed below is idempotent so the overlap
    # is harmless.
    step "Preseeding antigravity default skills for agent-${name}"
    preseed_antigravity_agent "$name"

    # Seed the agy OAuth token into the agent user's runtime $HOME as root.
    # 5dive-agent-start also seeds at boot, but that path uses the agent's own
    # `sudo -n` to read the 0700 auth-profile dir — which only admin agents have
    # (standard/sandboxed get no NOPASSWD sudoers). Without seeding here a
    # non-admin agy agent boots unauthenticated and sits at the "select login
    # method" screen → the telegram bridge runs but the bot is silent (hit on a
    # customer standard-isolation create 2026-06-02). `agent create` runs as
    # root, so copy the token directly; agy is the only type whose credential is
    # a plain file (codex/grok land env/auth.json the agent can already read).
    local _agy_src _agy_home
    _agy_src=$(profile_type_auth_path "$profile" "$type" 2>/dev/null) || true
    if [[ -n "$_agy_src" ]] && [[ -s "$_agy_src" ]]; then
      _agy_home="/home/agent-${name}/.gemini/antigravity-cli"
      install -d -m 700 -o "agent-${name}" -g "agent-${name}" \
        "/home/agent-${name}/.gemini" "$_agy_home"
      if install -m 600 -o "agent-${name}" -g "agent-${name}" \
           "$_agy_src" "${_agy_home}/antigravity-oauth-token"; then
        step "Seeded agy OAuth token into agent-${name} runtime"
      fi
    fi
  fi

  # Channel registration is type-aware (see install_channel_for_agent's
  # comment above): claude installs claude-plugins-official's bun server,
  # openclaw shells out to `openclaw channels add`, hermes writes
  # ~/.hermes/.env. Each runs as agent-${name} so credentials land in that
  # user's home with correct ownership.
  case "$channels" in
    telegram)
      install_channel_for_agent "$type" telegram "$name" "$telegram_token" \
        "$telegram_home_channel" "$telegram_allowed_users" ;;
    discord)
      install_channel_for_agent "$type" discord  "$name" "$discord_token" ;;
  esac

  # Hermes BYO Kimi/Moonshot: KIMI_API_KEY lives in the agent user's
  # ~/.hermes/.env (hermes' Kimi provider reads it directly; there is no
  # `hermes auth add moonshot`). apply_byo_provider stamped it into the
  # shared dir for profile reuse; mirror it into the agent-user's .env here
  # so the gateway (started a few steps below) picks it up at first boot.
  # Runs after install_channel_for_agent so channel-token upserts can't
  # overwrite the KIMI_API_KEY line (they only touch their own var).
  if [[ "$type" == "hermes" && "$byo_provider" == "moonshot" ]]; then
    step "Seeding KIMI_API_KEY into ~/.hermes/.env for agent-${name}"
    seed_hermes_byo_env "$name" KIMI_API_KEY "$byo_api_key"
  fi

  if [[ -n "$telegram_token" ]]; then
    step "Writing telegram bot token (${CONNECTORS_DIR}/telegram-${name}.env)"
    write_channel_secret telegram "$name" TELEGRAM_BOT_TOKEN "$telegram_token"
  fi
  if [[ -n "$discord_token" ]]; then
    step "Writing discord token (${CONNECTORS_DIR}/discord-${name}.env)"
    write_channel_secret discord "$name" DISCORD_BOT_TOKEN "$discord_token"
  fi

  # Sandboxed agents can't access /home/claude/projects (not in claude group).
  # Default their workdir to their own home so the TUI starts somewhere useful.
  if [[ "$isolation" == "sandboxed" && -z "$workdir" ]]; then
    workdir="/home/agent-${name}"
  fi

  step "Writing agent env"
  write_agent_env "$name" "$type" "$channels" "$workdir" "$profile" "$isolation"
  link_agent_profile "$name" "$profile"

  # Resolve bot @username via Telegram getMe so the dashboard's agent list
  # can render a t.me/<bot> deep link without an extra round-trip per row.
  # Best-effort: a network blip here shouldn't fail agent creation — the
  # `agent telegram-info <name>` command can backfill on demand later.
  local bot_username=""
  if [[ "$channels" == "telegram" && -n "$telegram_token" ]]; then
    bot_username=$(fetch_bot_username "$telegram_token" 2>/dev/null) || bot_username=""
  fi

  step "Registering in $REGISTRY"
  jq --arg n "$name" --arg t "$type" --arg c "$channels" --arg w "$workdir" --arg p "$profile" --arg bu "$bot_username" --arg ts "$(date -Iseconds)" --arg iso "$isolation" \
    '.agents[$n] = (
      {type: $t, channels: $c, createdAt: $ts, isolation: $iso}
      + (if $w == "" then {} else {workdir: $w} end)
      + (if $p == "" then {} else {authProfile: $p} end)
      + (if $bu == "" then {} else {botUsername: $bu} end)
    )' <<<"$reg" | registry_write

  # users.sh creates /home/claude/.hermes at 2770, but `hermes auth add
  # openai-codex` (kicked off by `agent auth start hermes` before create)
  # tightens it back to 0700 when writing auth.json. The chmod 0775 in the
  # install recipe only fires on the install path — short-circuited when the
  # binary already exists, and bypassed when auth runs after install. Without
  # group-traverse the systemd unit (which runs as agent-<name> in the claude
  # group) can't reach /home/claude/.hermes/hermes-agent/venv/bin/hermes and
  # crash-loops with `binary not installed`. Repair perms unconditionally
  # right before `systemctl enable --now`, regardless of what tightened them.
  if [[ "$type" == "hermes" ]] && [[ -d /home/claude/.hermes ]]; then
    chmod 0775 /home/claude/.hermes
  fi

  # Hermes onboarding finalization. The chat CLI's first-run check
  # (_has_any_provider_configured) inspects ~/.hermes/config.yaml for an
  # explicit model.provider/base_url. Without those, every fresh hermes
  # invocation hits "It looks like Hermes isn't configured yet -- run:
  # hermes setup" and the tmux loop sits at the prompt forever. Pin the
  # values to what the device-code OAuth flow already wrote into
  # auth.json's credential_pool, so the first launch lands straight in
  # chat. Skipped when --defer-auth is set: the user opted to finish
  # setup interactively on tmux attach, and we don't know which provider
  # they'll pick. Also skipped on the BYO path — apply_byo_provider
  # already wrote model.provider/model.default for the user's chosen
  # vendor; overwriting with openai-codex here would clobber the BYO
  # choice and route the agent at chatgpt.com instead of e.g. Anthropic.
  # The pin only matters when the profile *doesn't* already carry a
  # config.yaml. If it does (BYO write, or a prior device-code login that
  # left one behind), agent-start.sh will content-sync it into the agent's
  # per-user dir — and pinning openai-codex here would land a fresher
  # config.yaml at the per-user path, beating the seed's content-diff and
  # silently routing the agent back to chatgpt.com regardless of what the
  # profile says. Matches the same skip we apply when --provider is on argv.
  local _profile_has_hermes_cfg=0
  if [[ -n "$profile" ]] \
     && [[ -s "${AUTH_PROFILES_DIR}/${profile}/hermes/config.yaml" ]]; then
    _profile_has_hermes_cfg=1
  fi
  if [[ "$type" == "hermes" ]] && (( ! defer_auth )) && [[ -z "$byo_provider" ]] \
     && (( ! _profile_has_hermes_cfg )); then
    step "Pinning hermes model.provider for agent-${name}"
    local hermes_bin="${TYPE_BIN[hermes]}"
    sudo -u "agent-${name}" -H "$hermes_bin" config set model.provider openai-codex >&2 \
      || warn "hermes config set model.provider failed — first launch may show setup prompt (rerun: sudo -u agent-${name} -H $hermes_bin config set model.provider openai-codex)"
    sudo -u "agent-${name}" -H "$hermes_bin" config set model.base_url https://chatgpt.com/backend-api/codex >&2 \
      || warn "hermes config set model.base_url failed for agent '$name'"
    sudo -u "agent-${name}" -H "$hermes_bin" config set model.default gpt-5.5 >&2 \
      || warn "hermes config set model.default failed for agent '$name'"
  fi

  # For hermes telegram/discord channels, install + start the per-user
  # hermes messaging gateway. Skipped when --defer-auth: no auth means
  # the gateway can't talk to the model. See ensure_hermes_gateway for
  # the underlying systemd-user plumbing.
  if [[ "$type" == "hermes" ]] \
      && [[ "$channels" == "telegram" || "$channels" == "discord" ]] \
      && (( ! defer_auth )); then
    ensure_hermes_gateway "$name"
  fi

  # DIVE-248 — auto-attach to the shared team bot. When this box has a team
  # bot configured (token + group persisted by `team-bot shared`), every new
  # relay-eligible agent (no personal bot, plugin-capable type) joins the team
  # group by default: own forum topic, send-only on the shared token, group
  # allowlisted. Opt out with --no-team-bot. Best-effort — a Telegram hiccup
  # must not roll back an otherwise healthy create.
  # Only channel-less creates qualify: the shared-attach path flips the agent
  # to channels=telegram (send-only), which would clobber a personal telegram
  # or discord setup requested in this very create.
  # MUST run before the service's first boot: the booting session races the
  # marketplace git clone inside the plugin install (ERR_STREAM_PREMATURE_CLOSE),
  # and the session should come up with the plugin already staged anyway.
  # _team_bot_do_shared's own restart at the end doubles as the first start.
  local team_bot_status="off"
  if (( ! no_team_bot )) && [[ "$channels" == "none" ]] \
      && [[ -r /etc/5dive/team-bot.token && -r /etc/5dive/team-bot.json ]]; then
    local tb_token tb_group tb_owner
    tb_token=$(cat /etc/5dive/team-bot.token 2>/dev/null)
    tb_group=$(jq -r '.group // empty' /etc/5dive/team-bot.json 2>/dev/null)
    tb_owner=$(jq -r '.owner // empty' /etc/5dive/team-bot.json 2>/dev/null)
    if [[ -z "$tb_token" || -z "$tb_group" ]]; then
      team_bot_status="off"
    elif _team_bot_relay_agent_list | grep -qxF "$name"; then
      step "Attaching $name to the shared team bot (group $tb_group)"
      local tb_prev_json="$JSON_MODE"
      JSON_MODE=0
      # Two attempts: on a never-booted agent, claude's first `plugin
      # marketplace add` reports a spurious ERR_STREAM_PREMATURE_CLOSE while
      # still completing the clone in the background (~5s) — the retry's
      # `marketplace update` then finds it and the install goes through.
      local tb_attached=0 tb_try
      for tb_try in 1 2; do
        if ( _team_bot_do_shared "$tb_group" "$tb_owner" "$name" "$tb_token" ) >/dev/null; then
          tb_attached=1; break
        fi
        (( tb_try == 1 )) && sleep 10
      done
      if (( tb_attached )); then
        team_bot_status="attached"
      else
        team_bot_status="failed"
        warn "team-bot auto-attach failed — agent is up; retry: sudo 5dive agent team-bot shared --group=$tb_group --agents=$name --token=<shared bot token>"
      fi
      JSON_MODE="$tb_prev_json"
    else
      # Team bot configured but this agent isn't a relay candidate (it has its
      # own personal bot, or its type ships no telegram plugin).
      team_bot_status="skipped"
    fi
  fi

  step "Enabling 5dive-agent@${name}.service"
  systemctl daemon-reload
  systemctl enable --now "5dive-agent@${name}.service" >&2

  # Install any preseeded skills. A failed install does NOT roll back the
  # agent — networks flake, the agent itself is fine, and the user can rerun
  # `5dive agent skill <name> add ...` to retry. We toggle JSON_MODE off
  # around cmd_skill_add and redirect its stdout so its own ok envelope
  # doesn't collide with this command's envelope; the failure path runs
  # under set -e because cmd_skill_add calls `fail` which exits — wrap in
  # a subshell so only the subshell exits, then catch the status.
  local installed_skills_json='[]' failed_skills_json='[]'
  if (( ${#skills_resolved[@]} > 0 )); then
    local prev_json="$JSON_MODE"
    JSON_MODE=0
    local entry pair src sk status
    for entry in "${skills_resolved[@]}"; do
      src="${entry%%:*}"
      sk="${entry##*:}"
      status=0
      ( cmd_skill_add "$name" --source="$src" --skill="$sk" ) >/dev/null || status=$?
      if (( status == 0 )); then
        installed_skills_json=$(jq -c --arg s "$src" --arg k "$sk" \
          '. + [{source:$s, skill:$k}]' <<<"$installed_skills_json")
      else
        warn "skill install failed for '$sk' from '$src' (exit $status) — agent is up; rerun: sudo 5dive agent skill $name add --source=$src --skill=$sk"
        failed_skills_json=$(jq -c --arg s "$src" --arg k "$sk" \
          '. + [{source:$s, skill:$k}]' <<<"$failed_skills_json")
      fi
    done
    JSON_MODE="$prev_json"
  fi

  # Wire paperclipai (running as user `claude`) to the new agent's auth so
  # its inner CLI-connection check stops reporting "not logged in" for this
  # type. No-op when the host-default credential location already holds a
  # real file — manual host logins win. Best-effort; never fails the create.
  paperclip_seed_for_type "$type" "$profile" 2>/dev/null || true

  local effective_workdir="${workdir:-$DEFAULT_WORKDIR}"
  ok "agent '$name' (type=$type, channels=$channels${profile:+, profile=$profile}) is running." \
     '{name:$n, type:$t, channels:$c, workdir:$w, authProfile:$p, created:true, skills:{installed:$inst, failed:$fail}, teamBot:$tb}' \
     --arg n "$name" --arg t "$type" --arg c "$channels" --arg w "$effective_workdir" --arg p "${profile:-}" \
     --argjson inst "$installed_skills_json" --argjson fail "$failed_skills_json" --arg tb "$team_bot_status"
}

# Restart one agent's systemd service.
# Arguments: $1 - agent name (required; usage error when empty).
# Outputs:   systemctl chatter goes to stderr; the ok payload
#            {name, action:"restart"} is handed to `ok`.
cmd_restart() {
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent restart <name>"
  fi
  require_agent "$name"

  local unit="5dive-agent@${name}.service"
  systemctl restart "$unit" >&2

  ok "agent '$name' restarted." \
     '{name:$n, action:"restart"}' \
     --arg n "$name"
}

# Remove an agent: stop + disable its service, delete its env files and
# channel secrets, delete its Linux user, and drop it from the registry.
# Arguments: $1 - agent name (required).
# Fails with E_USAGE when no name is given and E_NOT_FOUND when the registry
# has no such agent. The paperclip cleanup at the end is best-effort.
cmd_rm() {
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent rm <name>"
  fi
  ensure_state

  local registry
  registry=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi

  # Remember the auth profile before the registry entry disappears; the
  # paperclip cleanup below needs it.
  local profile
  profile=$(jq -r --arg n "$name" '.agents[$n].authProfile // empty' <<<"$registry")

  local unit="5dive-agent@${name}.service"
  step "Stopping ${unit}"
  # The unit may already be stopped/disabled — that is not an error here.
  systemctl disable --now "$unit" 2>/dev/null || true

  step "Removing systemd env + channel secrets"
  rm -f "${ENV_DIR}/${name}.env" "${ENV_DIR}/${name}-auth.env"
  local plugin
  for plugin in telegram discord; do
    remove_channel_secret "$plugin" "$name"
  done

  step "Deleting user agent-${name}"
  delete_agent_user "$name"

  step "Updating registry"
  jq --arg n "$name" 'del(.agents[$n])' <<<"$registry" | registry_write

  # Drop paperclip-shared symlinks that point into this agent's profile (and
  # let the helper re-seed from a sibling agent). Never fails the remove.
  if [[ -n "$profile" ]]; then
    paperclip_unseed_for_profile "$profile" 2>/dev/null || true
  fi

  ok "agent '$name' removed." \
     '{name:$n, removed:true}' \
     --arg n "$name"
}

cmd_config() {
  # Usage: 5dive agent config <name> set <key>=<value> [<key>=<value>...]
  #   keys:
  #     channels                  (none|telegram|discord)
  #     model                     (model id for the agent's CLI — claude/codex/
  #                                grok/antigravity; written into the type's
  #                                runtime config, applied on the deferred restart)
  #     effort | effortLevel      (claude only: low|medium|high|xhigh|max; written
  #                                into the runtime config, applied on restart)
  #     workdir                   (absolute path; tmux cwd on next launch;
  #                                value "default" or "" clears the override)
  #     auth-profile | auth.profile
  #                               (name of a configured auth profile; value
  #                                "default" or "" clears the override)
  #     telegram.token            (bot token for this agent's telegram plugin)
  #     telegram.home-channel     (hermes only — chat id the gateway posts to;
  #                                ignored by claude/openclaw)
  #     telegram.allowed-users    (csv of numeric ids allowed to DM the bot;
  #                                seeds access.json/openclaw.allowFrom/hermes env)
  #     discord.token             (bot/app token for this agent's discord plugin)
  #
  # Every argument after `set` must have the form <key>=<value>; a bare word is
  # a usage error (otherwise `set model` would parse as model="model").
  #
  # When channels=<plugin> is being set (or a <plugin>.token is being rotated),
  # the matching install_channel_for_agent dispatch is also re-run so each
  # agent type's native state (claude access.json + plugin install, openclaw
  # channels add + allowFrom, hermes ~/.hermes/.env) lands in step with the
  # registry — same plumbing cmd_create uses, kept on a single code path.
  #
  # All per-key and cross-key validation happens before the registry or any
  # env file is written, so a rejected call leaves on-disk state untouched.
  local name="${1:-}" verb="${2:-}"
  [[ -n "$name" && -n "$verb" ]] \
    || fail "$E_USAGE" "usage: 5dive agent config <name> set <key>=<value> [...]"
  shift 2
  [[ "$verb" == "set" ]] || fail "$E_USAGE" "only 'set' is supported"
  ensure_state
  local reg
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || fail "$E_NOT_FOUND" "no agent named '$name'"
  local type
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  # env_dirty marks that we need to rewrite agents.d/<name>.env from the
  # post-update registry at the end — channels/workdir/auth-profile all live there.
  local env_dirty=0
  # profile_dirty marks that the auth symlink needs to be re-pointed.
  local profile_dirty=0
  # Channel-attach state collected from this set call. We defer the actual
  # install_channel_for_agent dispatch until after the loop so all related
  # keys (channels= and <plugin>.{token,home-channel,allowed-users}) can be
  # applied together — order in argv shouldn't matter.
  local channels_changed_to=""    # value of channels= in this call (if any)
  local new_telegram_token=""
  local new_discord_token=""
  local new_home_channel=""
  local new_allowed_users=""
  local new_model=""
  local new_effort=""
  # applied_keys: names of keys that were actually changed, for the JSON payload.
  local -a applied_keys=()
  local kv k v
  for kv in "$@"; do
    # Without an '=' both expansions below would yield the whole word, i.e.
    # key == value — reject instead of silently applying e.g. model="model".
    [[ "$kv" == *=* ]] \
      || fail "$E_USAGE" "expected <key>=<value>, got: $kv"
    k="${kv%%=*}"
    v="${kv#*=}"
    case "$k" in
      channels)
        valid_channel "$v" || fail "$E_VALIDATION" "invalid channels: $v"
        if [[ "$v" != "none" ]] && [[ "${TYPE_CHANNELS[$type]}" != "1" ]]; then
          fail "$E_VALIDATION" "type '$type' does not support channels"
        fi
        reg=$(jq --arg n "$name" --arg v "$v" '.agents[$n].channels = $v' <<<"$reg")
        channels_changed_to="$v"
        env_dirty=1
        applied_keys+=("channels")
        ;;
      workdir)
        if [[ -z "$v" || "$v" == "default" ]]; then
          reg=$(jq --arg n "$name" 'del(.agents[$n].workdir)' <<<"$reg")
        else
          valid_workdir "$v" \
            || fail "$E_VALIDATION" "invalid workdir (absolute path, allowed chars: letters/digits/._-/)"
          reg=$(jq --arg n "$name" --arg v "$v" '.agents[$n].workdir = $v' <<<"$reg")
        fi
        env_dirty=1
        applied_keys+=("workdir")
        ;;
      telegram.token)
        [[ "${TYPE_CHANNELS[$type]}" == "1" ]] \
          || fail "$E_VALIDATION" "type '$type' does not support telegram channels"
        valid_telegram_token "$v" \
          || fail "$E_VALIDATION" "telegram token format looks wrong (expected <digits>:<20+ chars>)"
        new_telegram_token="$v"
        applied_keys+=("telegram.token")
        ;;
      discord.token)
        [[ "${TYPE_CHANNELS[$type]}" == "1" ]] \
          || fail "$E_VALIDATION" "type '$type' does not support discord channels"
        [[ -n "$v" ]] || fail "$E_VALIDATION" "discord.token cannot be empty"
        new_discord_token="$v"
        applied_keys+=("discord.token")
        ;;
      telegram.home-channel)
        [[ "${TYPE_CHANNELS[$type]}" == "1" ]] \
          || fail "$E_VALIDATION" "type '$type' does not support telegram channels"
        valid_telegram_chat_id "$v" \
          || fail "$E_VALIDATION" "telegram.home-channel must be a numeric chat id"
        new_home_channel="$v"
        applied_keys+=("telegram.home-channel")
        ;;
      telegram.allowed-users)
        [[ "${TYPE_CHANNELS[$type]}" == "1" ]] \
          || fail "$E_VALIDATION" "type '$type' does not support telegram channels"
        valid_telegram_chat_id_list "$v" \
          || fail "$E_VALIDATION" "telegram.allowed-users must be a comma-separated list of numeric ids"
        new_allowed_users="$v"
        applied_keys+=("telegram.allowed-users")
        ;;
      model)
        # Uniform model switch — writes the selected model into the type's
        # runtime config (see write_runtime_model). Applied below, picked up by
        # the deferred restart at the end of this function. Not stored in the
        # registry: `agent info` reads the live file so a model changed via the
        # native CLI directly stays the source of truth.
        [[ -n "$v" ]] || fail "$E_VALIDATION" "model cannot be empty"
        valid_model "$v" \
          || fail "$E_VALIDATION" "invalid model '$v' (allowed chars: letters/digits/._:/-)"
        case "$type" in
          claude|codex|grok|antigravity) ;;
          *) fail "$E_VALIDATION" "type '$type' does not support 'model' config" ;;
        esac
        new_model="$v"
        applied_keys+=("model")
        ;;
      effort|effortLevel)
        # Reasoning-effort switch — claude-only (Claude Code's settings.json
        # effortLevel). Mirrors the telegram plugin's /effort: writes effortLevel
        # then restarts (deferred below). Not registry-stored — `agent info`
        # reads the live settings.json so an effort changed in-TUI stays truth.
        # Levels match the plugin's EFFORT_LEVELS; xhigh/max are Opus-only at the
        # model level but we don't gate by model here (same as the plugin picker).
        [[ "$type" == "claude" ]] \
          || fail "$E_VALIDATION" "type '$type' does not support 'effort' config (claude only)"
        case "$v" in
          low|medium|high|xhigh|max) ;;
          *) fail "$E_VALIDATION" "invalid effort '$v' (allowed: low, medium, high, xhigh, max)" ;;
        esac
        new_effort="$v"
        applied_keys+=("effort")
        ;;
      auth-profile|auth.profile)
        if [[ -z "$v" || "$v" == "default" ]]; then
          reg=$(jq --arg n "$name" 'del(.agents[$n].authProfile)' <<<"$reg")
        else
          valid_profile_name "$v" \
            || fail "$E_VALIDATION" "invalid auth-profile (lowercase letters/digits/_-, start letter, <=32 chars)"
          [[ -f "${AUTH_PROFILES_DIR}/${v}/combined.env" ]] \
            || fail "$E_NOT_FOUND" "auth profile '$v' not configured — run: sudo 5dive agent auth set $type --api-key=... --auth-profile=$v"
          reg=$(jq --arg n "$name" --arg v "$v" '.agents[$n].authProfile = $v' <<<"$reg")
        fi
        env_dirty=1
        profile_dirty=1
        applied_keys+=("auth-profile")
        ;;
      *) fail "$E_USAGE" "unknown config key: $k" ;;
    esac
  done
  # Pre-flight: setting channels=<plugin> without a token in the same call
  # only works if the connector secret is already on disk (e.g. rotating
  # the allowlist without touching the token). Otherwise the gateway boots
  # without credentials and silently goes deaf — better to fail loudly here.
  if [[ -n "$channels_changed_to" && "$channels_changed_to" != "none" ]]; then
    case "$channels_changed_to" in
      telegram)
        if [[ -z "$new_telegram_token" && ! -s "${CONNECTORS_DIR}/telegram-${name}.env" ]]; then
          fail "$E_VALIDATION" \
            "channels=telegram needs telegram.token=<token> in the same set call"
        fi
        ;;
      discord)
        if [[ -z "$new_discord_token" && ! -s "${CONNECTORS_DIR}/discord-${name}.env" ]]; then
          fail "$E_VALIDATION" \
            "channels=discord needs discord.token=<token> in the same set call"
        fi
        ;;
    esac
  fi
  # Pre-flight, part two: a token key only makes sense when the agent's
  # channels value AFTER this call matches that plugin. Checked here — before
  # the registry write and env rewrite below — so a rejected call does not
  # leave a half-applied config behind (e.g. workdir committed, no restart).
  local effective_channels
  effective_channels=$(jq -r --arg n "$name" '.agents[$n].channels // "none"' <<<"$reg")
  if [[ -n "$new_telegram_token" || "$channels_changed_to" == "telegram" ]]; then
    [[ "$effective_channels" == "telegram" ]] \
      || fail "$E_VALIDATION" "telegram.* keys require channels=telegram (current: $effective_channels)"
  fi
  if [[ -n "$new_discord_token" || "$channels_changed_to" == "discord" ]]; then
    [[ "$effective_channels" == "discord" ]] \
      || fail "$E_VALIDATION" "discord.token requires channels=discord (current: $effective_channels)"
  fi
  printf '%s\n' "$reg" | registry_write
  if (( env_dirty )); then
    step "Rewriting ${ENV_DIR}/${name}.env"
    local new_channels new_workdir new_profile
    new_channels=$(jq -r --arg n "$name" '.agents[$n].channels // "none"' <<<"$reg")
    new_workdir=$(jq -r --arg n "$name" '.agents[$n].workdir // empty' <<<"$reg")
    new_profile=$(jq -r --arg n "$name" '.agents[$n].authProfile // empty' <<<"$reg")
    write_agent_env "$name" "$type" "$new_channels" "$new_workdir" "$new_profile"
    if (( profile_dirty )); then
      step "Re-pointing ${ENV_DIR}/${name}-auth.env"
      link_agent_profile "$name" "$new_profile"
    fi
  fi
  # Channel attach / rotate: when this call touched telegram.* or discord.*
  # we need to push the new values into each type's native state dir, the
  # same way cmd_create does. install_channel_for_agent routes to the right
  # helper (install_channel_plugin_for_agent for claude — installs the
  # plugin if missing + seeds access.json with allowed_users; openclaw
  # channels add for openclaw; ~/.hermes/.env write for hermes).
  #
  # A bare channels=telegram (token already on disk from a prior call) must
  # ALSO dispatch: the deferred restart below boots the session with
  # `--channels plugin:telegram@…`, and if the plugin was never staged for
  # this user the session comes up with no telegram tool and the agent
  # improvises (raw Bot-API curl — seen live on the demo box, DIVE-250).
  # The install helpers are idempotent, so re-running on an already-staged
  # agent is a cheap no-op.
  if [[ -n "$new_telegram_token" || "$channels_changed_to" == "telegram" ]]; then
    local token_for_install="$new_telegram_token"
    if [[ -z "$token_for_install" ]]; then
      # Token wasn't part of this call — pull the one already on disk so
      # the install helper still has something to register/seed. Falls
      # through to the connector-secret file written on the prior call.
      token_for_install=$(grep -E '^TELEGRAM_BOT_TOKEN=' "${CONNECTORS_DIR}/telegram-${name}.env" 2>/dev/null \
        | head -1 | cut -d= -f2-)
      [[ -n "$token_for_install" ]] \
        || fail "$E_NOT_FOUND" "no stored telegram token for agent '$name' — include telegram.token=<token>"
    fi
    if [[ -n "$new_telegram_token" ]]; then
      step "Writing ${CONNECTORS_DIR}/telegram-${name}.env"
      write_channel_secret telegram "$name" TELEGRAM_BOT_TOKEN "$new_telegram_token"
    fi
    step "Installing telegram channel for agent '$name' (type=$type)"
    install_channel_for_agent "$type" telegram "$name" \
      "$token_for_install" "$new_home_channel" "$new_allowed_users"
    # Hermes' messaging gateway is a separate user systemd unit from the
    # tmux loop. cmd_create wires it up only when channels=telegram|discord
    # at create time; attaching a channel post-create (channels was "none")
    # leaves the unit uninstalled, so the agent-start.sh `gateway restart`
    # at the end of this function would warn-and-skip. Install + start it
    # here (idempotent — safe if cmd_create already did it for a token
    # rotation). openclaw handles its own gateway state inside
    # install_channel_for_openclaw_agent, so no parallel hook there.
    if [[ "$type" == "hermes" ]]; then
      ensure_hermes_gateway "$name"
    fi
    # Cache the bot @handle in the registry so the dashboard's agents list
    # can render the t.me/<bot> deep link without an extra getMe roundtrip
    # (mirrors cmd_create's post-install backfill — best-effort, a network
    # blip shouldn't fail config). cmd_config already runs under the
    # registry lock so a direct in-place update is safe.
    local bu
    if bu=$(fetch_bot_username "$token_for_install" 2>/dev/null) && [[ -n "$bu" ]]; then
      reg=$(registry_read)
      jq --arg n "$name" --arg u "$bu" \
        '.agents[$n].botUsername = $u' <<<"$reg" | registry_write
    fi
  fi
  if [[ -n "$new_discord_token" || "$channels_changed_to" == "discord" ]]; then
    # Same bare-attach rule as telegram above (DIVE-250): channels=discord
    # without a token in this call falls back to the stored connector secret
    # (the pre-flight above guarantees one exists).
    local discord_token_for_install="$new_discord_token"
    if [[ -z "$discord_token_for_install" ]]; then
      discord_token_for_install=$(grep -E '^DISCORD_BOT_TOKEN=' "${CONNECTORS_DIR}/discord-${name}.env" 2>/dev/null \
        | head -1 | cut -d= -f2-)
      [[ -n "$discord_token_for_install" ]] \
        || fail "$E_NOT_FOUND" "no stored discord token for agent '$name' — include discord.token=<token>"
    fi
    if [[ -n "$new_discord_token" ]]; then
      step "Writing ${CONNECTORS_DIR}/discord-${name}.env"
      write_channel_secret discord "$name" DISCORD_BOT_TOKEN "$new_discord_token"
    fi
    step "Installing discord channel for agent '$name' (type=$type)"
    install_channel_for_agent "$type" discord "$name" \
      "$discord_token_for_install" "$new_home_channel" "$new_allowed_users"
    if [[ "$type" == "hermes" ]]; then
      ensure_hermes_gateway "$name"
    fi
  fi
  if [[ -n "$new_model" ]]; then
    step "Writing model=$new_model into $type runtime config"
    write_runtime_model "$type" "$name" "$new_model"
  fi
  if [[ -n "$new_effort" ]]; then
    step "Writing effortLevel=$new_effort into claude runtime config"
    write_runtime_effort "$name" "$new_effort"
  fi
  # Fail-closed gate (DIVE-250): when this call attached a channel to a
  # claude agent, the restarted session boots with `--channels
  # plugin:<ch>@<marketplace>` (see 5dive-agent-start) and comes up with NO
  # channel tool if the plugin cache isn't staged yet. The dispatch above is
  # synchronous so the cache dir should already exist; poll briefly to absorb
  # any in-flight stager, then refuse to restart into a known-deaf session
  # rather than let the agent improvise. Scoped to channels_changed_to (not
  # every config call) so e.g. `model=` on a legacy claude-plugins-official
  # telegram agent can't trip a spurious marketplace mismatch.
  if [[ "$type" == "claude" ]] \
     && [[ "$channels_changed_to" == "telegram" || "$channels_changed_to" == "discord" ]]; then
    local gate_marketplace="claude-plugins-official"
    [[ "$channels_changed_to" == "telegram" ]] && gate_marketplace="5dive-plugins"
    local gate_dir="/home/agent-${name}/.claude/plugins/cache/${gate_marketplace}/${channels_changed_to}"
    local gate_waited=0
    while [[ ! -d "$gate_dir" ]] && (( gate_waited < 15 )); do
      sleep 1; gate_waited=$((gate_waited + 1))
    done
    [[ -d "$gate_dir" ]] || fail "$E_GENERIC" \
      "$channels_changed_to plugin not staged for agent '$name' ($gate_dir missing) — refusing to restart into a session with no $channels_changed_to tool. Re-run: sudo 5dive agent config $name set channels=$channels_changed_to"
  fi
  # Defer the restart so the calling process (often `sudo -n 5dive agent
  # set-account` invoked from inside the agent's own bot) gets to return
  # before its service is torn down. An immediate `systemctl restart` here
  # SIGTERMs our own sudo subprocess → caller sees a spurious failure even
  # though the config write committed. systemd-run --on-active=1 --collect
  # fires the restart ~1s later as a transient unit that survives our exit.
  step "Restarting agent to apply (deferred ~1s)"
  systemd-run --on-active=1 --collect \
    /bin/systemctl restart "5dive-agent@${name}.service" >&2
  # One key per line -> JSON strings -> array; the select() drops the single
  # empty line printf emits when no key was applied.
  local applied_json
  applied_json=$(printf '%s\n' "${applied_keys[@]+"${applied_keys[@]}"}" | jq -R . | jq -cs '. | map(select(length > 0))')
  ok "config applied." \
     '{name:$n, applied:$a}' \
     --arg n "$name" --argjson a "$applied_json"
}

# Hand the caller's terminal over to an agent's tmux session. The session is
# owned by user `agent-<name>`, so the attach runs through sudo as that user
# to reach the right tmux server socket. The final exec replaces this process
# for the whole attach, which makes --json a no-op here.
# Arguments: $1 - agent name (required; must exist in the registry).
cmd_tui() {
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent <name> tui"
  fi
  ensure_state

  local registry
  registry=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi

  # The Linux user and the tmux session share one identifier.
  local session="agent-${name}"
  exec sudo -u "$session" tmux attach -t "$session"
}

# List every agent type known to TYPE_BIN with its binary path, whether that
# binary is executable on this host, and whether the type supports channels.
# Globals:  TYPE_BIN, TYPE_CHANNELS (read), JSON_MODE (read)
# Outputs:  JSON mode  -> {ok:true, data:[{name,bin,installed,channels},...]}
#           text mode  -> one "<name> bin=... installed=ok|missing
#                         channels=yes|no" line per type, sorted.
cmd_types() {
  local arr="[]"
  # All loop state is local: `type` in particular is a name other commands in
  # this file use for their own local, and bash's dynamic scoping would let an
  # undeclared loop variable overwrite the caller's value.
  local type bin installed channels
  for type in "${!TYPE_BIN[@]}"; do
    bin="${TYPE_BIN[$type]}"
    installed=false
    [[ -x "$bin" ]] && installed=true
    channels=false
    # :- keeps `set -u` quiet for a type that has no TYPE_CHANNELS entry.
    [[ "${TYPE_CHANNELS[$type]:-}" == "1" ]] && channels=true
    arr=$(jq -c \
      --arg n "$type" --arg b "$bin" \
      --argjson i "$installed" --argjson c "$channels" \
      '. + [{name:$n, bin:$b, installed:$i, channels:$c}]' <<<"$arr")
  done
  if (( JSON_MODE )); then
    jq -c '{ok:true, data: .}' <<<"$arr"
  else
    jq -r '.[] | "\(.name) bin=\(.bin) installed=\(if .installed then "ok" else "missing" end) channels=\(if .channels then "yes" else "no" end)"' <<<"$arr" | sort
  fi
}

# Single getUpdates long-poll round against a Telegram bot token. Returns
# JSON `{found:bool, userId, chatId, username, firstName}` on stdout — the
# dashboard wraps this in a re-call loop so it can show a "send /start to
# your bot" UI and react the moment the user does. Each call clears any
# webhook first (getUpdates is incompatible with a registered webhook), then
# blocks for up to <poll_secs> waiting for an update. <poll_secs> is capped
# below the upstream exec timeout so the HTTP layer doesn't kill the call
# mid-poll. Pure curl + jq — no extra deps.
#
# Flags:
#   --token=<bot-token>  token to poll with (mutually exclusive with --agent)
#   --agent=<name>       read the token from the agent's telegram connector env
#   --poll-secs=N        long-poll duration, decimal 1..90 (default 50)
# Fails with E_USAGE on bad flags, E_NOT_FOUND when --agent has no stored
# token, E_VALIDATION on a malformed token / poll-secs, E_GENERIC when
# Telegram answers ok:false or the update lacks from.id / chat.id.
cmd_telegram_discover() {
  local token="" agent="" poll_secs=50
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --token=*)      token="${1#--token=}" ;;
      --agent=*)      agent="${1#--agent=}" ;;
      --poll-secs=*)  poll_secs="${1#--poll-secs=}" ;;
      -*)             fail "$E_USAGE" "unknown flag: $1" ;;
      *)              fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  # --agent=<name>: lookup the bot token from the agent's telegram connector
  # env file. Lets the dashboard discover-for-this-agent without having to
  # round-trip the token through the browser.
  if [[ -n "$agent" ]]; then
    [[ -z "$token" ]] \
      || fail "$E_USAGE" "--agent and --token are mutually exclusive"
    local env_file="${CONNECTORS_DIR}/telegram-${agent}.env"
    [[ -r "$env_file" ]] \
      || fail "$E_NOT_FOUND" "no telegram connector for agent '$agent' (looked at $env_file)"
    token=$(grep -E '^TELEGRAM_BOT_TOKEN=' "$env_file" 2>/dev/null \
            | head -1 | cut -d= -f2-)
    [[ -n "$token" ]] \
      || fail "$E_NOT_FOUND" "no TELEGRAM_BOT_TOKEN in $env_file"
  fi
  [[ -n "$token" ]] || fail "$E_USAGE" "usage: 5dive agent telegram-discover {--token=<bot-token>|--agent=<name>} [--poll-secs=N]"
  valid_telegram_token "$token" \
    || fail "$E_VALIDATION" "telegram token format looks wrong (expected <digits>:<20+ chars>)"
  # Normalise to a plain base-10 integer before the range check. Bash
  # arithmetic reads a leading zero as octal, so "0100" used to pass the cap
  # as 64 while Telegram was still sent timeout=0100, and "08" raised an
  # arithmetic error. Leading zeros are stripped; at most two significant
  # digits can be in range, which also rules out integer overflow.
  [[ "$poll_secs" =~ ^0*([0-9]{1,2})$ ]] \
    || fail "$E_VALIDATION" "--poll-secs must be 1..90"
  poll_secs=$((10#${BASH_REMATCH[1]}))
  (( poll_secs >= 1 && poll_secs <= 90 )) \
    || fail "$E_VALIDATION" "--poll-secs must be 1..90"

  # deleteWebhook + drop_pending_updates clears any existing webhook AND
  # discards stale updates so the first message we surface is one the user
  # actually just sent (not one queued from a prior session). Best-effort —
  # if Telegram returns non-200 we still try getUpdates; the caller will
  # just see found:false and re-poll.
  curl -sS -m 10 -o /dev/null \
    --data-urlencode "drop_pending_updates=true" \
    "https://api.telegram.org/bot${token}/deleteWebhook" || true

  # Long-poll. timeout=N tells Telegram to hold the connection open for up
  # to N seconds waiting for an update, returning earlier if one arrives.
  # curl's max-time is set just above so the socket survives the wait.
  local resp
  resp=$(curl -sS -m "$((poll_secs + 5))" \
    --data-urlencode "timeout=${poll_secs}" \
    --data-urlencode "limit=1" \
    --data-urlencode "allowed_updates=[\"message\"]" \
    "https://api.telegram.org/bot${token}/getUpdates" 2>/dev/null || true)

  # Empty / non-JSON response → treat as no message yet (dashboard re-polls).
  if ! jq -e . >/dev/null 2>&1 <<<"$resp"; then
    ok "" '{found:false}'
    return
  fi
  if [[ "$(jq -r '.ok' <<<"$resp" 2>/dev/null)" != "true" ]]; then
    local desc
    desc=$(jq -r '.description // "telegram api error"' <<<"$resp" 2>/dev/null)
    fail "$E_GENERIC" "telegram: $desc"
  fi
  local count
  count=$(jq -r '.result | length' <<<"$resp")
  if [[ "$count" == "0" ]]; then
    ok "" '{found:false}'
    return
  fi

  # Pull the message's `from` (user) + `chat`. For private DMs they're the
  # same numeric id, but allowing them to differ keeps groups working too.
  local user_id chat_id username first_name
  user_id=$(jq -r '.result[0].message.from.id // empty' <<<"$resp")
  chat_id=$(jq -r '.result[0].message.chat.id // empty' <<<"$resp")
  username=$(jq -r '.result[0].message.from.username // empty' <<<"$resp")
  first_name=$(jq -r '.result[0].message.from.first_name // empty' <<<"$resp")
  [[ -n "$user_id" && -n "$chat_id" ]] \
    || fail "$E_GENERIC" "telegram update missing from.id or chat.id"

  ok "discovered chat $chat_id (user $user_id)" \
     '{found:true, userId:$u, chatId:$c, username:$un, firstName:$fn}' \
     --arg u "$user_id" --arg c "$chat_id" --arg un "$username" --arg fn "$first_name"
}

# Resolve a bot token to its @username via Telegram getMe.
# Arguments: $1 - bot token.
# Outputs:   the username on stdout (exit 0).
# Returns:   1 with no output on any failure — curl error, non-JSON body,
#            ok != true, or a response without result.username.
# Callers decide how fatal that is: cmd_config treats it as a best-effort
# backfill and ignores the failure, cmd_telegram_info reports it as an error.
fetch_bot_username() {
  local token="$1"
  local body handle

  if ! body=$(curl -sS -m 10 \
      "https://api.telegram.org/bot${token}/getMe" 2>/dev/null); then
    return 1
  fi

  # Must be parseable JSON before any field is trusted.
  if ! jq -e . >/dev/null 2>&1 <<<"$body"; then
    return 1
  fi

  if [[ "$(jq -r '.ok // false' <<<"$body" 2>/dev/null)" != "true" ]]; then
    return 1
  fi

  handle=$(jq -r '.result.username // empty' <<<"$body")
  if [[ -z "$handle" ]]; then
    return 1
  fi
  echo "$handle"
}

# Fast bot-identity lookup. The dashboard fires this once when the user
# reaches the "discovering chat" step so the "open Telegram and send /start"
# instruction can render a t.me/<botusername> deep link rather than a plain
# text mention. Token never leaves the server.
cmd_telegram_getme() {
  local token=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --token=*) token="${1#--token=}" ;;
      -*)        fail "$E_USAGE" "unknown flag: $1" ;;
      *)         fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  [[ -n "$token" ]] || fail "$E_USAGE" "usage: 5dive agent telegram-getme --token=<bot-token>"
  valid_telegram_token "$token" \
    || fail "$E_VALIDATION" "telegram token format looks wrong (expected <digits>:<20+ chars>)"

  local resp
  resp=$(curl -sS -m 10 \
    "https://api.telegram.org/bot${token}/getMe" 2>/dev/null || true)

  if ! jq -e . >/dev/null 2>&1 <<<"$resp"; then
    fail "$E_GENERIC" "telegram api unreachable"
  fi
  if [[ "$(jq -r '.ok' <<<"$resp" 2>/dev/null)" != "true" ]]; then
    local desc
    desc=$(jq -r '.description // "telegram api error"' <<<"$resp" 2>/dev/null)
    fail "$E_GENERIC" "telegram: $desc"
  fi

  local bot_id username first_name
  bot_id=$(jq -r '.result.id // empty' <<<"$resp")
  username=$(jq -r '.result.username // empty' <<<"$resp")
  first_name=$(jq -r '.result.first_name // empty' <<<"$resp")
  [[ -n "$username" ]] \
    || fail "$E_GENERIC" "telegram getMe missing username"

  ok "bot @$username" \
     '{botId:$id, username:$un, firstName:$fn}' \
     --arg id "$bot_id" --arg un "$username" --arg fn "$first_name"
}

# Bot identity lookup by agent name. The agent's stored telegram token is read
# server-side (the dashboard never handles raw bot tokens), resolved through
# getMe, and the resulting handle is cached as .agents.<name>.botUsername.
# Later calls answer from that cache without contacting Telegram; --refresh
# skips the cache and fetches again. The dashboard's agents page uses this to
# backfill @handles for agents created before botUsername-on-create existed.
# Usage:   5dive agent telegram-info <name> [--refresh]
# Payload: {username, cached:true|false}
cmd_telegram_info() {
  local name="" refresh=0 arg
  for arg in "$@"; do
    case "$arg" in
      --refresh) refresh=1 ;;
      -*)        fail "$E_USAGE" "unknown flag: $arg" ;;
      *)
        # Only one positional (the agent name) is accepted.
        if [[ -n "$name" ]]; then
          fail "$E_USAGE" "extra arg: $arg"
        fi
        name="$arg"
        ;;
    esac
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent telegram-info <name> [--refresh]"
  fi

  ensure_state
  local registry
  registry=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi

  local channels
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$registry")
  if [[ "$channels" != "telegram" ]]; then
    fail "$E_VALIDATION" "agent '$name' has channels=$channels — telegram-info only applies to telegram"
  fi

  # Cache hit: answer straight from the registry.
  if (( refresh == 0 )); then
    local cached
    cached=$(jq -r --arg n "$name" '.agents[$n].botUsername // empty' <<<"$registry")
    if [[ -n "$cached" ]]; then
      ok "bot @$cached" \
         '{username:$un, cached:true}' \
         --arg un "$cached"
      return 0
    fi
  fi

  # Cache miss (or --refresh): read the stored token and ask Telegram.
  local token_env="${CONNECTORS_DIR}/telegram-${name}.env"
  local token
  token=$(sed -n 's/^TELEGRAM_BOT_TOKEN=//p' "$token_env" 2>/dev/null | head -1 || true)
  if [[ -z "$token" ]]; then
    fail "$E_AUTH_REQUIRED" "no telegram bot token for agent '$name' (expected ${token_env})"
  fi

  local username
  if ! username=$(fetch_bot_username "$token" 2>/dev/null); then
    fail "$E_GENERIC" "telegram getMe failed (network or invalid token)"
  fi

  # Persist under the registry lock so the next call is served from cache.
  with_registry_lock _persist_bot_username "$name" "$username"

  ok "bot @$username" \
     '{username:$un, cached:false}' \
     --arg un "$username"
}

# Store a bot @handle as .agents.<name>.botUsername in the registry.
# Arguments: $1 - agent name, $2 - bot username.
# Meant to be invoked through with_registry_lock (see cmd_telegram_info).
# The agent's existence is re-checked here because the caller verified it
# before taking the lock: if the agent was removed in between, a plain jq
# assignment would resurrect a stub entry holding only botUsername. In that
# case the write is skipped and the function still returns 0.
_persist_bot_username() {
  local name="$1" username="$2"
  local reg
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || return 0
  jq --arg n "$name" --arg u "$username" \
    '.agents[$n].botUsername = $u' <<<"$reg" | registry_write
}

# DIVE-159 team-bot (decision A): the agent's forum topic in the shared team
# supergroup. teamTopic.threadId is the Telegram message_thread_id; teamTopic.chatId
# is the team supergroup id. Single source of truth — the provision createForumTopic
# hook writes it; the single listener reads it to route inbound thread->agent.
# Usage: 5dive agent topic set <name> --thread-id=N --chat-id=N
#   <name>         agent whose .agents[<name>].teamTopic is replaced
#   --thread-id=N  forum topic id; must be >= 1 with no leading zeros
#   --chat-id=N    supergroup id; non-zero integer, usually negative
# Fails with E_USAGE on bad arguments, E_VALIDATION on malformed ids and
# E_NOT_FOUND when the agent is not in the registry.
# NOTE(review): the registry is read-modify-written here without
# with_registry_lock — presumably the dispatcher already holds the lock for
# mutating commands; confirm.
cmd_agent_topic_set() {
  local name="" thread_id="" chat_id=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --thread-id=*) thread_id="${1#--thread-id=}" ;;
      --chat-id=*)   chat_id="${1#--chat-id=}" ;;
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -z "$name" ]]; then
          name="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi
        ;;
    esac
    shift
  done
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent topic set <name> --thread-id=N --chat-id=N"
  # Reject 0 and leading zeros: the message promises a *positive* integer, a
  # threadId of 0 is treated as "no topic" by team-bot provision, and values
  # such as 007 are not valid JSON numbers for the --argjson calls below.
  [[ "$thread_id" =~ ^[1-9][0-9]*$ ]]  || fail "$E_VALIDATION" "--thread-id must be a positive integer"
  [[ "$chat_id"   =~ ^-?[1-9][0-9]*$ ]] || fail "$E_VALIDATION" "--chat-id must be an integer (supergroup ids are negative)"
  ensure_state
  local reg
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || fail "$E_NOT_FOUND" "no agent named '$name'"
  jq --arg n "$name" --argjson th "$thread_id" --argjson ch "$chat_id" \
    '.agents[$n].teamTopic = {threadId: $th, chatId: $ch}' <<<"$reg" | registry_write
  ok "team topic set for $name (thread $thread_id in chat $chat_id)" \
     '{agent:$n, threadId:$th, chatId:$ch}' \
     --arg n "$name" --argjson th "$thread_id" --argjson ch "$chat_id"
}

# Usage: 5dive agent topic get <name>
# Prints the agent's teamTopic object from the registry. In JSON mode the
# output is an {ok:true,data:...} envelope (data is null when unset); in text
# mode it is the compact JSON object or a "no team topic" line.
# Fails with E_USAGE on bad arguments and E_NOT_FOUND for an unknown agent.
cmd_agent_topic_get() {
  local name="" arg
  for arg in "$@"; do
    case "$arg" in
      -*) fail "$E_USAGE" "unknown flag: $arg" ;;
      *)
        if [[ -z "$name" ]]; then
          name="$arg"
        else
          fail "$E_USAGE" "extra arg: $arg"
        fi
        ;;
    esac
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent topic get <name>"
  fi
  ensure_state
  local reg tt
  reg=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi
  tt=$(jq -c --arg n "$name" '.agents[$n].teamTopic // null' <<<"$reg")
  if (( JSON_MODE )); then
    jq -cn --argjson d "$tt" '{ok:true, data:$d}'
  elif [[ "$tt" == "null" ]]; then
    echo "no team topic for $name"
  else
    echo "$tt"
  fi
}

# ─────────────────────────────────────────────────────────────────────────────
# DIVE-159 — Team group setup (personal-bot model). Each telegram agent keeps
# its OWN bot. The customer makes one Telegram group (Topics on) and adds each
# agent's bot as admin. This command creates a forum topic per agent and binds
# each agent's access.json so it replies ONLY in its own topic (no @mention),
# while its private DM bot keeps working unchanged. No central listener.
#
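#   5dive agent team-bot status    --group=<chat_id> [--agents=a,b]   (read-only probe)
#   5dive agent team-bot provision --group=<chat_id> [--owner=<user_id>] [--agents=a,b]
#   5dive agent team-bot shared    --group=<chat_id> --token=<bot_token> [--agents=a,b] [--owner=<user_id>]
#   5dive agent team-bot intercom  --group=<chat_id> [--off]
#   5dive agent team-bot discover  --token=<bot_token>
#
# Per-agent status:
#   ready       — bot is admin in the group and bound to its topic
#   needs_topic — (status only) bot is admin but no topic is recorded for this group yet
#   needs_add   — bot is not in the group yet (customer must add it)
#   needs_admin — bot is in the group but not an admin (can't read messages)
#   no_token    — no telegram bot token on file for the agent
#   error       — a Telegram call failed (detail in `error`)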
# provision is idempotent: re-running re-uses an agent's existing topic.
# Entry point for `5dive agent team-bot <sub> ...`.
#   $1        subcommand: status | provision | shared | intercom | discover
#   --group=  Telegram chat id (required for everything except discover)
#   --owner=  numeric Telegram user id written to the topic's allowFrom
#   --agents= comma-separated agent-name filter
#   --token=  bot token (forwarded to the shared / discover handlers)
#   --off     forwarded to the intercom handler
# discover, shared and intercom are delegated to their own handlers; status and
# provision are handled here. On success emits via `ok` an object
# {group, agents, relay, intercom}. Fails with E_USAGE / E_VALIDATION on bad
# arguments, E_NOT_FOUND when there is nothing to show, and E_GENERIC when the
# embedded Python worker fails.
cmd_agent_team_bot() {
  local sub="" group="" owner="" agents_filter="" token="" off=""
  [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive agent team-bot status|provision|shared|intercom|discover --group=<chat_id> [--owner=<user_id>]"
  sub="$1"; shift
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --group=*) group="${1#--group=}" ;;
      --owner=*) owner="${1#--owner=}" ;;
      --agents=*) agents_filter="${1#--agents=}" ;;
      --token=*) token="${1#--token=}" ;;
      --off) off=1 ;;
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)  fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  case "$sub" in status|provision|shared|intercom|discover) ;; *) fail "$E_USAGE" "unknown team-bot command: $sub (status|provision|shared|intercom|discover)" ;; esac

  # `discover` (DIVE-247) = find the team group with the bot itself — no
  # --group needed (that id is exactly what it returns). Handled before the
  # --group validation below.
  if [[ "$sub" == "discover" ]]; then
    ensure_state
    _team_bot_do_discover "$token"
    return
  fi

  [[ "$group" =~ ^-?[0-9]+$ ]] || fail "$E_VALIDATION" "--group must be a Telegram chat id (negative for supergroups)"
  [[ -z "$owner" || "$owner" =~ ^[0-9]+$ ]] || fail "$E_VALIDATION" "--owner must be a numeric Telegram user id"
  ensure_state

  # `shared` = the shared-team-bot path for agents WITHOUT their own bot. The
  # customer pastes ONE bot token; each selected no-bot agent gets a forum topic
  # and runs the telegram plugin in send-only mode against that token, while a
  # single listener routes inbound topic->agent. Self-contained handler.
  if [[ "$sub" == "shared" ]]; then
    _team_bot_do_shared "$group" "$owner" "$agents_filter" "$token"
    return
  fi

  # `intercom` (DIVE-195) = one dedicated topic in the team group where all
  # inter-agent chatter is mirrored (consolidated, not scattered per-agent).
  # Creates the topic + records it; the mirror (mirror_interagent_outbound)
  # then routes there. --off removes it.
  if [[ "$sub" == "intercom" ]]; then
    _team_bot_do_intercom "$group" "$off"
    return
  fi

  # Build {agent: {token, type, stateDir, teamTopic}} for every eligible telegram
  # agent. Token + state-dir resolution stays server-side; the dashboard only
  # ever sees the resulting status, never a bot token.
  local reg; reg=$(registry_read)
  local agents_json="{}"
  # `token` (already local above) is reused as the per-agent token from here on.
  local name type state_dir tt
  while IFS= read -r name; do
    [[ -n "$name" ]] || continue
    if [[ -n "$agents_filter" ]] && [[ ",${agents_filter}," != *",${name},"* ]]; then continue; fi
    # Skip agents wired send-only on the shared team bot — they're relayed, not
    # personal-bot members, and surface in the `relay` list below instead.
    if grep -q '^TELEGRAM_SEND_ONLY=1' "${CONNECTORS_DIR}/telegram-${name}.env" 2>/dev/null; then continue; fi
    type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
    # `|| true`: a missing connector env must yield an empty token (reported as
    # `no_token` by the worker), not abort the whole command under errexit.
    token=$(_team_bot_token "$name" || true)
    state_dir=$(_tg_access_state_dir "agent-${name}" "$type" 2>/dev/null || echo "")
    tt=$(jq -c --arg n "$name" '.agents[$n].teamTopic // null' <<<"$reg")
    agents_json=$(jq -c --arg n "$name" --arg tok "$token" --arg ty "$type" --arg sd "$state_dir" --argjson tt "$tt" \
      '.[$n] = {token:$tok, type:$ty, stateDir:$sd, teamTopic:$tt}' <<<"$agents_json")
  done < <(_team_bot_agent_list)

  # Relay list = agents WITHOUT a personal bot (or already wired send-only on the
  # shared team bot). Computed from local state only (no Telegram calls) so the
  # card can render the shared-bot section. status/provision both report it.
  local relay; relay=$(_team_bot_relay_status "$group")

  # Only error when there's nothing at all to show — no personal-bot agents AND
  # no relay candidates. (A customer may have only no-bot agents.)
  if [[ "$agents_json" == "{}" && "$relay" == "[]" ]]; then
    fail "$E_NOT_FOUND" "no telegram agents to add to a team group"
  fi

  # Heavy lifting in Python: Telegram getMe/getChatMember/createForumTopic +
  # atomic access.json merge (root writes then chowns to agent-<name>). Emits a
  # results JSON array on stdout. teamTopic registry updates come back as a side
  # channel (REG_UPDATES_FILE) so we can persist them under the registry lock.
  local team_token=""; [[ -r /etc/5dive/team-bot.token ]] && team_token=$(cat /etc/5dive/team-bot.token 2>/dev/null)
  local reg_updates_file; reg_updates_file=$(mktemp)
  local results
  # Capture the worker's status with `|| rc=$?`: under `set -e` a bare
  # `results=$(...)` would abort right here on failure, skipping both the
  # temp-file cleanup and the `fail` envelope below.
  local rc=0
  results=$(MODE="$sub" GROUP="$group" OWNER="$owner" AGENTS="$agents_json" TEAM_BOT_TOKEN="$team_token" REG_UPDATES_FILE="$reg_updates_file" python3 - <<'PY'
import json, os, tempfile, pwd, urllib.error, urllib.parse, urllib.request

MODE  = os.environ['MODE']
GROUP = os.environ['GROUP']
OWNER = os.environ.get('OWNER') or ''
TEAM  = os.environ.get('TEAM_BOT_TOKEN') or ''
AGENTS = json.loads(os.environ['AGENTS'])

def tg(token, method, **params):
    """POST a Bot API method; always returns a dict with an "ok" key."""
    url = f"https://api.telegram.org/bot{token}/{method}"
    data = urllib.parse.urlencode(params).encode()
    try:
        with urllib.request.urlopen(urllib.request.Request(url, data=data), timeout=15) as r:
            return json.load(r)
    except urllib.error.HTTPError as e:
        try:
            return json.load(e)
        except Exception:
            return {"ok": False, "description": f"HTTP {e.code}"}
    except Exception as e:
        return {"ok": False, "description": str(e)}

def bind_access(state_dir, agent, thread):
    """Merge this group's entry into <state_dir>/access.json via temp + rename.

    Raises on any filesystem error; the temp file is removed before re-raising.
    """
    path = os.path.join(state_dir, "access.json")
    acc = {}
    if os.path.exists(path):
        try:
            with open(path) as f:
                acc = json.load(f)
        except Exception:
            acc = {}
    if not isinstance(acc, dict):
        acc = {}
    acc.setdefault("dmPolicy", "pairing")
    acc.setdefault("allowFrom", [])
    acc.setdefault("groups", {})
    acc.setdefault("pending", {})
    acc["groups"][GROUP] = {"requireMention": False,
                            "allowFrom": [OWNER] if OWNER else [],
                            "message_thread_id": int(thread)}
    os.makedirs(state_dir, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=state_dir)
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(acc, f, indent=2)
        try:
            u = pwd.getpwnam("agent-" + agent)
        except KeyError:
            u = None  # no such user: leave the file owned by the writer
        if u is not None:
            os.chown(tmp, u.pw_uid, u.pw_gid); os.chmod(tmp, 0o600)
        os.replace(tmp, path)
        if u is not None:
            os.chown(path, u.pw_uid, u.pw_gid)
    except BaseException:
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

# First pass: identify each bot + its membership; find a topic-manager bot.
info = {}
manager_token = None
for name, a in AGENTS.items():
    rec = {"agent": name, "status": "error", "botUsername": None, "threadId": None}
    info[name] = rec
    token = a.get("token") or ""
    if not token:
        rec["status"] = "no_token"; continue
    me = tg(token, "getMe")
    if not me.get("ok"):
        rec["error"] = me.get("description", "getMe failed"); continue
    bot_id = me["result"]["id"]
    rec["botUsername"] = me["result"].get("username")
    rec["_token"] = token
    rec["_botId"] = bot_id
    m = tg(token, "getChatMember", chat_id=GROUP, user_id=bot_id)
    if not m.get("ok"):
        # Not a member → Telegram returns ok:false ("user not found" / "chat not found").
        rec["status"] = "needs_add"; continue
    st = m["result"].get("status")
    if st in ("administrator", "creator"):
        rec["status"] = "admin_ok"  # provisional; topic binding decided below
        # Only a bot that can actually manage topics qualifies as the topic
        # creator (creators implicitly can, even without the flag set).
        if (m["result"].get("can_manage_topics") or st == "creator") and manager_token is None:
            manager_token = token
    elif st in ("member", "restricted"):
        rec["status"] = "needs_admin"
    else:  # left / kicked
        rec["status"] = "needs_add"

# Fleet fallback: the shared team bot is the group's topic janitor (customers
# have no such file — they rely on an agent bot with Manage Topics instead).
if manager_token is None and TEAM:
    me = tg(TEAM, "getMe")
    if me.get("ok"):
        mm = tg(TEAM, "getChatMember", chat_id=GROUP, user_id=me["result"]["id"])
        if mm.get("ok") and mm["result"].get("status") in ("administrator", "creator") and mm["result"].get("can_manage_topics"):
            manager_token = TEAM

results = []
reg_updates = {}  # name -> {threadId, chatId}
for name, rec in info.items():
    out = {"agent": rec["agent"], "botUsername": rec.get("botUsername"),
           "status": rec["status"], "threadId": rec.get("threadId")}
    if rec.get("error"): out["error"] = rec["error"]

    if rec["status"] != "admin_ok":
        # not bound; surface as-is (needs_add / needs_admin / no_token / error)
        results.append(out); continue

    a = AGENTS[name]
    existing = a.get("teamTopic") or {}
    thread = existing.get("threadId") if existing.get("chatId") == int(GROUP) else None

    if MODE == "status":
        out["status"] = "ready" if thread else "needs_topic"
        out["threadId"] = thread
        results.append(out); continue

    # provision: create the topic if missing, then wire access.json.
    if not thread:
        if not manager_token:
            out["status"] = "error"; out["error"] = "no admin bot can manage topics"
            results.append(out); continue
        cf = tg(manager_token, "createForumTopic", chat_id=GROUP, name=name)
        if not cf.get("ok"):
            out["status"] = "error"; out["error"] = cf.get("description", "createForumTopic failed")
            results.append(out); continue
        thread = cf["result"]["message_thread_id"]

    # Merge the group entry into the agent's access.json (root writes + chowns).
    # A failure here is reported for this agent only; it must not abort the run
    # and throw away the registry updates of agents that were already bound.
    sd = a.get("stateDir") or ""
    if sd:
        try:
            bind_access(sd, name, thread)
        except Exception as e:
            out["status"] = "error"
            out["error"] = f"access.json update failed (topic {int(thread)}): {e}"
            results.append(out); continue

    reg_updates[name] = {"threadId": int(thread), "chatId": int(GROUP)}
    out["status"] = "ready"; out["threadId"] = int(thread)
    if rec.get("botUsername"):
        out["topicLink"] = f"https://t.me/{rec['botUsername']}"  # opens the bot; topic deep-links are client-built
    results.append(out)

with open(os.environ.get("REG_UPDATES_FILE", "/dev/null"), "w") as f:
    json.dump(reg_updates, f)

print(json.dumps(results))
PY
) || rc=$?
  if [[ $rc -ne 0 || -z "$results" ]]; then
    rm -f "$reg_updates_file"
    fail "$E_GENERIC" "team-bot $sub failed"
  fi

  # Persist teamTopic registry updates under the lock (provision only).
  if [[ "$sub" == "provision" && -s "$reg_updates_file" ]]; then
    with_registry_lock _team_bot_persist_topics "$reg_updates_file"
  fi
  rm -f "$reg_updates_file"

  local ready total
  ready=$(jq '[.[] | select(.status=="ready")] | length' <<<"$results")
  total=$(jq 'length' <<<"$results")
  # Intercom topic (DIVE-195) for this group, if configured — so the card can
  # show "intercom on" + a link.
  local intercom
  intercom=$(jq -c --arg g "$group" 'if (.intercomTopic.chatId|tostring) == $g then .intercomTopic else null end' <<<"$(registry_read)")
  ok "team-bot $sub: $ready/$total agents ready in group $group" \
     '{group:$g, agents:$a, relay:$r, intercom:$ic}' \
     --arg g "$group" --argjson a "$results" --argjson r "$relay" --argjson ic "$intercom"
}

# Helpers for cmd_agent_team_bot (kept module-local).
# Echo, one per line, the names of registry agents whose channels is
# "telegram" and whose type is claude, codex, grok or antigravity.
_team_bot_agent_list() {
  local filter='.agents | to_entries[]
    | select(.value.channels=="telegram")
    | select(.value.type=="claude" or .value.type=="codex" or .value.type=="grok" or .value.type=="antigravity")
    | .key'
  jq -r "$filter" <<<"$(registry_read)"
}
# Print the first TELEGRAM_BOT_TOKEN value from the agent's connector env
# (${CONNECTORS_DIR}/telegram-<name>.env), or nothing if there is none.
#   $1 - agent name
# Always returns 0: with `pipefail` a missing/unreadable env file would
# otherwise give the pipeline a non-zero status, and a caller doing
# `tok=$(_team_bot_token "$n")` under `set -e` would abort instead of seeing
# an empty token.
_team_bot_token() {
  local env_file="${CONNECTORS_DIR}/telegram-${1}.env"
  sed -n 's/^TELEGRAM_BOT_TOKEN=//p' "$env_file" 2>/dev/null | head -1 || true
}
# Apply per-agent teamTopic updates to the registry.
#   $1 - path to a JSON file holding {<agent>: {threadId, chatId}, ...}
# Entries naming an agent that is not in the registry are ignored. The visible
# caller runs this under with_registry_lock.
_team_bot_persist_topics() {
  local updates_file="$1"
  local current merged
  local merge_filter='
    .agents as $a
    | reduce ($u[0] | to_entries[]) as $e (.;
        if .agents[$e.key] != null
        then .agents[$e.key].teamTopic = $e.value
        else . end)
  '
  current=$(registry_read)
  merged=$(jq --slurpfile u "$updates_file" "$merge_filter" <<<"$current")
  registry_write <<<"$merged"
}

# ─────────────────────────────────────────────────────────────────────────────
# DIVE-159 — shared-team-bot path (agents WITHOUT their own bot).
#
# An agent with no personal Telegram bot can still live in the team group by
# routing through ONE shared bot: the agent runs the telegram plugin in
# send-only mode against the shared token (never polls — Telegram allows a
# single getUpdates consumer per token), and a single listener daemon maps each
# inbound topic message back to the right agent's relay-in/. This is the hybrid
# completion of the personal-bot card: personal-bot agents join with their own
# bot; no-bot agents route through the shared bot. The two never overlap.
# ─────────────────────────────────────────────────────────────────────────────

# Relay candidates = plugin-capable agents that either have NO personal bot
# (channels != telegram) or are already wired send-only on the shared bot.
# Echoes one agent name per line.
# Walks every registry agent of type claude/codex/grok/antigravity and prints
# its name when its connector env is marked TELEGRAM_SEND_ONLY=1 or its
# channels value is anything other than "telegram".
# Returns 0 regardless of which agent happens to be examined last: the loop
# body ends in an `if`, not a bare `[[ ... ]] && echo`, which would leak a
# status of 1 to callers whenever the final agent is a personal-bot agent.
_team_bot_relay_agent_list() {
  local reg name ch
  reg=$(registry_read)
  while IFS= read -r name; do
    [[ -n "$name" ]] || continue
    if grep -q '^TELEGRAM_SEND_ONLY=1' "${CONNECTORS_DIR}/telegram-${name}.env" 2>/dev/null; then
      printf '%s\n' "$name"
      continue
    fi
    ch=$(jq -r --arg n "$name" '.agents[$n].channels // "none"' <<<"$reg")
    if [[ "$ch" != "telegram" ]]; then
      printf '%s\n' "$name"
    fi
  done < <(jq -r '.agents | to_entries[]
    | select(.value.type=="claude" or .value.type=="codex" or .value.type=="grok" or .value.type=="antigravity")
    | .key' <<<"$reg")
}

# Relay status array for the dashboard — local state only, no Telegram calls.
# Per agent: "relayed" (wired send-only + topic bound to this group) or
# "no_bot" (eligible, not yet relayed).
# Print (no trailing newline) a JSON array with one {agent, status, threadId}
# object per name emitted by _team_bot_relay_agent_list.
#   $1 - team group chat id to match against each agent's teamTopic.chatId
# status is "relayed" only when the connector env is send-only AND the recorded
# topic belongs to this group; otherwise "no_bot" with threadId null.
_team_bot_relay_status() {
  local group="$1"
  local reg name topic topic_chat topic_thread
  local send_only state thread_json
  local acc="[]"
  reg=$(registry_read)
  while IFS= read -r name; do
    if [[ -z "$name" ]]; then
      continue
    fi
    topic=$(jq -c --arg n "$name" '.agents[$n].teamTopic // null' <<<"$reg")
    send_only=0
    if grep -q '^TELEGRAM_SEND_ONLY=1' "${CONNECTORS_DIR}/telegram-${name}.env" 2>/dev/null; then
      send_only=1
    fi
    state="no_bot"
    thread_json="null"
    if [[ "$topic" != "null" ]]; then
      topic_chat=$(jq -r '.chatId' <<<"$topic")
      topic_thread=$(jq -r '.threadId' <<<"$topic")
      if [[ "$topic_chat" == "$group" && "$send_only" == "1" ]]; then
        state="relayed"
        thread_json="$topic_thread"
      fi
    fi
    acc=$(jq -c --arg n "$name" --arg s "$state" --argjson th "$thread_json" \
      '. + [{agent:$n, status:$s, threadId:$th}]' <<<"$acc")
  done < <(_team_bot_relay_agent_list)
  printf '%s' "$acc"
}

# Force a relay agent's connector env to the shared token + send-only, preserving
# any unrelated keys. root:claude 640 (same as install_channel writes).
_team_bot_write_sendonly_env() {
  local name="$1" token="$2" ef="${CONNECTORS_DIR}/telegram-${name}.env" tmp
  mkdir -p "$CONNECTORS_DIR"
  tmp=$(mktemp)
  { printf 'TELEGRAM_BOT_TOKEN=%s\n' "$token"
    printf 'TELEGRAM_SEND_ONLY=1\n'
    [[ -f "$ef" ]] && grep -vE '^(TELEGRAM_BOT_TOKEN|TELEGRAM_SEND_ONLY)=' "$ef" 2>/dev/null
  } > "$tmp"
  chown root:claude "$tmp" 2>/dev/null || true
  chmod 640 "$tmp"
  mv "$tmp" "$ef"
}

# Persist channels=telegram + teamTopic for each relayed agent (under reg lock).
# Apply shared-bot updates to the registry: for every agent named in the
# updates file that exists in the registry, set its teamTopic to the given
# value and its channels to "telegram". Unknown agents are ignored.
#   $1 - path to a JSON file holding {<agent>: {threadId, chatId}, ...}
_team_bot_persist_shared() {
  local updates_file="$1"
  local current merged
  local merge_filter='
    .agents as $a
    | reduce ($u[0] | to_entries[]) as $e (.;
        if .agents[$e.key] != null
        then .agents[$e.key].teamTopic = $e.value
           | .agents[$e.key].channels = "telegram"
        else . end)
  '
  current=$(registry_read)
  merged=$(jq --slurpfile u "$updates_file" "$merge_filter" <<<"$current")
  registry_write <<<"$merged"
}

# `5dive agent team-bot intercom --group=<id> [--off]` (DIVE-195)
# Create (idempotent) one dedicated "intercom" topic in the team group + record
# it fleet-wide in the registry as .intercomTopic, so mirror_interagent_outbound
# consolidates all inter-agent chatter there. --off removes it.
# Handler for `team-bot intercom`.
#   $1 - team group chat id
#   $2 - "1" to remove the intercom topic, anything else to create/reuse it
# Create path: reuses an intercom topic already recorded for this group,
# otherwise picks a bot that can manage topics (an agent bot, then the
# team-bot token file), creates an "intercom" forum topic and records it as
# .intercomTopic under the registry lock. Fails with E_GENERIC when the worker
# fails or no bot can create the topic.
_team_bot_do_intercom() {
  local group="$1" off="$2"
  local reg; reg=$(registry_read)

  if [[ "$off" == "1" ]]; then
    local tt; tt=$(jq -c '.intercomTopic // null' <<<"$reg")
    with_registry_lock _team_bot_clear_intercom
    if [[ "$tt" != "null" ]]; then
      local th tc team_token=""
      th=$(jq -r '.threadId' <<<"$tt"); tc=$(jq -r '.chatId' <<<"$tt")
      [[ -r /etc/5dive/team-bot.token ]] && team_token=$(cat /etc/5dive/team-bot.token 2>/dev/null)
      # Best-effort Telegram cleanup; skipped when the record has no usable
      # thread id (jq prints the string "null" for a missing field).
      if [[ -n "$team_token" && -n "$th" && "$th" != "null" ]]; then
        curl -s "https://api.telegram.org/bot${team_token}/deleteForumTopic" \
          -d chat_id="$tc" -d message_thread_id="$th" >/dev/null 2>&1 || true
      fi
    fi
    ok "intercom topic removed for group $group" '{group:$g, intercom:null}' --arg g "$group"
    return
  fi

  # Idempotent: reuse an existing intercom topic already bound to THIS group.
  local existing_chat existing_thread
  existing_chat=$(jq -r '.intercomTopic.chatId // empty' <<<"$reg")
  existing_thread=$(jq -r '.intercomTopic.threadId // empty' <<<"$reg")
  if [[ -n "$existing_thread" && "$existing_chat" == "$group" ]]; then
    ok "intercom topic already set (thread $existing_thread)" \
       '{group:$g, intercom:{threadId:$th, chatId:($g|tonumber)}}' \
       --arg g "$group" --argjson th "$existing_thread"
    return
  fi

  # Find a topic-manager bot (an agent bot that is admin w/ can_manage_topics,
  # or the team-bot fallback) and create the topic. Token resolution stays
  # server-side; the dashboard only sees the resulting thread id.
  local agents_json="{}" name tok
  while IFS= read -r name; do
    [[ -n "$name" ]] || continue
    # `|| true`: an agent without a connector env simply has no token; it must
    # not abort the command under errexit/pipefail.
    tok=$(_team_bot_token "$name" || true)
    if [[ -n "$tok" ]]; then
      agents_json=$(jq -c --arg n "$name" --arg t "$tok" '.[$n]=$t' <<<"$agents_json")
    fi
  done < <(_team_bot_agent_list)
  local team_token=""; [[ -r /etc/5dive/team-bot.token ]] && team_token=$(cat /etc/5dive/team-bot.token 2>/dev/null)

  local reg_updates_file; reg_updates_file=$(mktemp)
  local result
  # `|| rc=$?` keeps a worker failure from aborting here under `set -e`, so the
  # temp file is removed and the caller gets the E_GENERIC envelope below.
  local rc=0
  result=$(GROUP="$group" AGENTS="$agents_json" TEAM_BOT_TOKEN="$team_token" REG_UPDATES_FILE="$reg_updates_file" python3 - <<'PY'
import json, os, urllib.parse, urllib.request, urllib.error

GROUP = os.environ['GROUP']
TEAM  = os.environ.get('TEAM_BOT_TOKEN') or ''
AGENTS = json.loads(os.environ['AGENTS'])

def tg(token, method, **p):
    url = f"https://api.telegram.org/bot{token}/{method}"
    data = urllib.parse.urlencode(p).encode()
    try:
        with urllib.request.urlopen(urllib.request.Request(url, data=data), timeout=15) as r:
            return json.load(r)
    except urllib.error.HTTPError as e:
        try:
            return json.load(e)
        except Exception:
            return {"ok": False, "description": f"HTTP {e.code}"}
    except Exception as e:
        return {"ok": False, "description": str(e)}

def can_manage(tok):
    me = tg(tok, "getMe")
    if not me.get("ok"):
        return False
    m = tg(tok, "getChatMember", chat_id=GROUP, user_id=me["result"]["id"])
    return (m.get("ok") and m["result"].get("status") in ("administrator", "creator")
            and (m["result"].get("can_manage_topics") or m["result"].get("status") == "creator"))

manager = None
for name, tok in AGENTS.items():
    if can_manage(tok):
        manager = tok; break
if manager is None and TEAM and can_manage(TEAM):
    manager = TEAM
if not manager:
    print(json.dumps({"ok": False, "error": "no admin bot can manage topics in this group"})); raise SystemExit(0)

cf = tg(manager, "createForumTopic", chat_id=GROUP, name="intercom")
if not cf.get("ok"):
    print(json.dumps({"ok": False, "error": cf.get("description", "createForumTopic failed")})); raise SystemExit(0)
thread = cf["result"]["message_thread_id"]
with open(os.environ.get("REG_UPDATES_FILE", "/dev/null"), "w") as f:
    json.dump({"threadId": int(thread), "chatId": int(GROUP)}, f)
print(json.dumps({"ok": True, "threadId": int(thread)}))
PY
) || rc=$?
  if [[ $rc -ne 0 || -z "$result" ]]; then
    rm -f "$reg_updates_file"
    fail "$E_GENERIC" "team-bot intercom failed"
  fi
  if [[ "$(jq -r '.ok // false' <<<"$result")" != "true" ]]; then
    local err; err=$(jq -r '.error // "intercom failed"' <<<"$result")
    rm -f "$reg_updates_file"
    fail "$E_GENERIC" "$err"
  fi
  with_registry_lock _team_bot_persist_intercom "$reg_updates_file"
  rm -f "$reg_updates_file"
  local thread; thread=$(jq -r '.threadId' <<<"$result")
  ok "intercom topic created (thread $thread) in group $group" \
     '{group:$g, intercom:{threadId:$th, chatId:($g|tonumber)}}' \
     --arg g "$group" --argjson th "$thread"
}

# Set the registry's top-level .intercomTopic to the JSON value stored in the
# given file and write the registry back.
#   $1 - path to a file containing one JSON value (the visible caller passes
#        {threadId, chatId}) and runs this under with_registry_lock
_team_bot_persist_intercom() {
  local updates_file="$1"
  local current topic_json next
  current=$(registry_read)
  topic_json=$(<"$updates_file")
  next=$(jq --argjson u "$topic_json" '.intercomTopic = $u' <<<"$current")
  registry_write <<<"$next"
}
# Remove the top-level .intercomTopic key from the registry and write it back.
# Takes no arguments; the visible caller runs it under with_registry_lock.
_team_bot_clear_intercom() {
  local before after
  before=$(registry_read)
  after=$(jq 'del(.intercomTopic)' <<<"$before")
  registry_write <<<"$after"
}

# Resolve a bun binary usable by a root-run systemd unit. The listener has no
# deps (raw fetch) so any bun works; prefer the system one, fall back to the
# claude user's nvm/bun install.
# Print (no trailing newline) the path of a bun binary and return 0.
# Lookup order: /usr/local/bin/bun, /home/claude/.bun/bin/bun, whatever
# `command -v bun` reports in the claude user's login shell, and finally the
# literal /usr/local/bin/bun as a last-resort default (which may not exist).
_team_bot_resolve_bun() {
  local c
  for c in /usr/local/bin/bun /home/claude/.bun/bin/bun; do
    if [[ -x "$c" ]]; then
      printf '%s' "$c"
      return 0
    fi
  done
  # `|| c=""`: with errexit + pipefail a failing sudo (no claude user, bun not
  # on its PATH) would otherwise abort before the default below is printed
  # whenever this function is called outside a command substitution.
  c=$(sudo -u claude -i bash -lc 'command -v bun' 2>/dev/null | tail -1) || c=""
  if [[ -n "$c" && -x "$c" ]]; then
    printf '%s' "$c"
    return 0
  fi
  printf '/usr/local/bin/bun'
}

# Install (idempotently) the single getUpdates consumer of the shared team bot.
# The listener source is embedded so the CLI is self-contained on any box.
# Writes the embedded listener source to /opt/5dive/team-bot-listener.ts and
# its systemd unit to /etc/systemd/system/5dive-team-bot-listener.service
# (ExecStart uses the bun path from _team_bot_resolve_bun), then reloads,
# enables and restarts the unit. The systemctl calls are best-effort: their
# failures are ignored. Takes no arguments.
_team_bot_install_listener() {
  mkdir -p /opt/5dive
  cat > /opt/5dive/team-bot-listener.ts <<'LISTENER_TS'
#!/usr/bin/env bun
/**
 * 5dive team-bot listener (DIVE-159).
 *
 * The SINGLE getUpdates consumer of the shared team-bot token. Telegram allows
 * exactly one getUpdates consumer per token — a second poller = 409 = dead
 * channel for the whole fleet — so this runs as ONE systemd unit (Restart=on-
 * failure, single instance) which structurally enforces the singleton.
 *
 * Flow: long-poll getUpdates -> for each message in a forum topic, map
 * (chat id, message_thread_id) -> agent via the registry's teamTopic ->
 * atomically drop a JSON inbound file into that agent's relay-in/. The agent's
 * telegram plugin (TELEGRAM_SEND_ONLY=1) watches relay-in/, emits the normal
 * channel notification, and replies into the topic via the shared token.
 *
 * No external deps (raw fetch) to keep this a tiny, self-contained unit.
 */
import { readFileSync, writeFileSync, mkdirSync, renameSync, chownSync } from 'fs'
import { join } from 'path'
import { execFileSync } from 'child_process'

const TOKEN_FILE = process.env.TEAM_BOT_TOKEN_FILE ?? '/etc/5dive/team-bot.token'
const REGISTRY = process.env.FIVE_REGISTRY ?? '/var/lib/5dive/agents.json'
const OFFSET_FILE = process.env.TEAM_BOT_OFFSET_FILE ?? '/var/lib/5dive/team-bot.offset'
const HOME_ROOT = process.env.AGENT_HOME_ROOT ?? '/home'
const POLL_TIMEOUT = 30 // getUpdates long-poll seconds
// Loop/abuse guard: cap inbound drops per agent within a sliding window.
const RATE_MAX = 30
const RATE_WINDOW_MS = 10_000

const token = readFileSync(TOKEN_FILE, 'utf8').trim()
if (!token) {
  process.stderr.write('team-bot-listener: empty token at ' + TOKEN_FILE + '\n')
  process.exit(1)
}
const API = `https://api.telegram.org/bot${token}`

// Persisted update offset — a restart must never re-deliver. offset = last+1.
function loadOffset(): number {
  try {
    return parseInt(readFileSync(OFFSET_FILE, 'utf8').trim(), 10) || 0
  } catch {
    return 0
  }
}
function saveOffset(o: number): void {
  const tmp = `${OFFSET_FILE}.tmp`
  writeFileSync(tmp, String(o))
  renameSync(tmp, OFFSET_FILE) // atomic
}

// Routing key. message_thread_id is only unique within one chat, so the chat id
// is part of the key: a message from some other group the bot was added to
// must never land in an agent's inbox just because the thread ids collide.
function topicKey(chatId: unknown, threadId: unknown): string {
  return `${chatId}:${threadId}`
}

// "<chat_id>:<thread_id>" -> agent name, rebuilt each poll from the registry
// (tiny file, cheap). Entries without a numeric chatId are keyed "*:<thread_id>"
// and keep the previous thread-only match.
function threadMap(): Map<string, string> {
  const m = new Map<string, string>()
  try {
    const reg = JSON.parse(readFileSync(REGISTRY, 'utf8'))
    for (const [name, a] of Object.entries<any>(reg.agents ?? {})) {
      const t = a?.teamTopic?.threadId
      const c = a?.teamTopic?.chatId
      if (typeof t !== 'number') continue
      m.set(topicKey(typeof c === 'number' ? c : '*', t), name)
    }
  } catch (e) {
    process.stderr.write(`team-bot-listener: registry read failed: ${e}\n`)
  }
  return m
}

// Cache agent-<name> uid/gid so dropped files are owned by the agent (which runs
// the plugin as agent-<name> and must read + delete them). Listener runs as root.
const idCache = new Map<string, { uid: number; gid: number } | null>()
function agentIds(agent: string): { uid: number; gid: number } | null {
  if (idCache.has(agent)) return idCache.get(agent)!
  let ids: { uid: number; gid: number } | null = null
  try {
    const user = `agent-${agent}`
    const uid = parseInt(execFileSync('id', ['-u', user]).toString().trim(), 10)
    const gid = parseInt(execFileSync('id', ['-g', user]).toString().trim(), 10)
    if (uid > 0 && gid > 0) ids = { uid, gid }
  } catch {}
  idCache.set(agent, ids)
  return ids
}

function relayInDir(agent: string): string {
  return join(HOME_ROOT, `agent-${agent}`, '.claude', 'channels', 'telegram', 'relay-in')
}

// Sliding-window rate limit per agent.
const hits = new Map<string, number[]>()
function rateOk(agent: string, now: number): boolean {
  const arr = (hits.get(agent) ?? []).filter(t => now - t < RATE_WINDOW_MS)
  if (arr.length >= RATE_MAX) {
    hits.set(agent, arr)
    return false
  }
  arr.push(now)
  hits.set(agent, arr)
  return true
}

let dropSeq = 0
function drop(agent: string, payload: Record<string, unknown>, now: number): void {
  const dir = relayInDir(agent)
  const ids = agentIds(agent)
  try {
    mkdirSync(dir, { recursive: true })
    if (ids) chownSync(dir, ids.uid, ids.gid)
  } catch {}
  const id = `${now}-${process.pid}-${dropSeq++}`
  const tmp = join(dir, `.${id}.tmp`)
  const fin = join(dir, `${id}.json`)
  // temp -> rename so the plugin watcher (reads only *.json) never sees a partial.
  writeFileSync(tmp, JSON.stringify({ id, ...payload }))
  try {
    if (ids) chownSync(tmp, ids.uid, ids.gid)
  } catch {}
  renameSync(tmp, fin)
}

async function tg(method: string, params: Record<string, unknown>): Promise<any> {
  const res = await fetch(`${API}/${method}`, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify(params),
  })
  return res.json()
}

let offset = loadOffset()
process.stderr.write(`team-bot-listener: starting (offset=${offset})\n`)

let shuttingDown = false
for (const sig of ['SIGTERM', 'SIGINT'] as const) {
  process.on(sig, () => {
    shuttingDown = true
    process.stderr.write('team-bot-listener: shutting down\n')
    process.exit(0)
  })
}

while (!shuttingDown) {
  let data: any
  try {
    data = await tg('getUpdates', {
      offset,
      timeout: POLL_TIMEOUT,
      allowed_updates: ['message'],
    })
  } catch (e) {
    process.stderr.write(`team-bot-listener: getUpdates failed: ${e}\n`)
    await new Promise(r => setTimeout(r, 3000))
    continue
  }
  if (!data?.ok) {
    process.stderr.write(`team-bot-listener: getUpdates not ok: ${JSON.stringify(data)}\n`)
    await new Promise(r => setTimeout(r, 3000))
    continue
  }

  const updates: any[] = data.result ?? []
  if (updates.length === 0) continue

  const map = threadMap()
  const now = Date.now()
  for (const u of updates) {
    offset = Math.max(offset, u.update_id + 1)
    const msg = u.message
    if (!msg) continue
    if (msg.from?.is_bot) continue
    const threadId = msg.message_thread_id
    if (threadId == null) continue
    const agent =
      map.get(topicKey(msg.chat?.id, threadId)) ?? map.get(topicKey('*', threadId))
    if (!agent) continue
    if (!rateOk(agent, now)) {
      process.stderr.write(`team-bot-listener: rate-limited ${agent} (thread ${threadId})\n`)
      continue
    }
    const text = msg.text ?? msg.caption ?? ''
    // A failed drop (unwritable relay-in, full disk, ...) is logged and skipped.
    // Letting it throw would kill the process before saveOffset() runs, so the
    // same update would be re-fetched after every restart and block delivery
    // to every other agent.
    try {
      drop(
        agent,
        {
          chat_id: String(msg.chat.id),
          message_thread_id: String(threadId),
          message_id: String(msg.message_id),
          content: text,
          user: msg.from?.username ?? String(msg.from?.id ?? 'team'),
          user_id: String(msg.from?.id ?? ''),
          ts: new Date((msg.date ?? 0) * 1000).toISOString(),
        },
        now,
      )
    } catch (e) {
      process.stderr.write(`team-bot-listener: drop failed for ${agent} (thread ${threadId}): ${e}\n`)
    }
  }
  saveOffset(offset)
}
LISTENER_TS
  chown root:root /opt/5dive/team-bot-listener.ts
  chmod 644 /opt/5dive/team-bot-listener.ts

  local bun_bin; bun_bin=$(_team_bot_resolve_bun)
  cat > /etc/systemd/system/5dive-team-bot-listener.service <<UNIT
[Unit]
Description=5dive team-bot listener (DIVE-159 — single getUpdates consumer of the shared team bot)
After=network-online.target
Wants=network-online.target
StartLimitIntervalSec=0

[Service]
Type=simple
User=root
Group=root
ExecStart=${bun_bin} /opt/5dive/team-bot-listener.ts
Restart=on-failure
RestartSec=2

[Install]
WantedBy=multi-user.target
UNIT
  # `>&2 2>/dev/null`: systemctl's stdout is sent to our stderr (keeps JSON
  # stdout clean) and its own stderr is discarded.
  systemctl daemon-reload >&2 2>/dev/null || true
  systemctl enable 5dive-team-bot-listener.service >&2 2>/dev/null || true
  # Restart picks up the (possibly updated) token + source. Singleton by unit.
  systemctl restart 5dive-team-bot-listener.service >&2 2>/dev/null || true
}

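# `5dive agent team-bot shared --group --token --agents [--owner]`
# Relay the listed no-bot agents through one shared bot. Idempotent: re-running
# reuses each agent's existing topic. (This paragraph documents the `shared`
# subcommand, dispatched to _team_bot_do_shared; the function directly below is
# the separate `discover` handler described next.)
#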
# DIVE-247 — find the team group without the manual id hunt. Telegram's UI
# never shows a group's chat id, and the Bot API can't create groups or add
# bots to them — a human does that once. But the moment a human adds the bot,
# its pending updates (my_chat_member / messages) carry the chat id. Read them
# WITHOUT acking an offset so nothing is consumed (the listener still sees
# them later). getUpdates 409s only against another poller on the SAME token,
# so skip the poll when our listener is live on this token and fall back to
# groups already recorded in the registry (teamTopic/intercom chat ids).
# Enriches each candidate with getChat + getChatMember so the dashboard can
# show exactly what's missing (forum off / not admin / no Manage Topics).
_team_bot_do_discover() {
  # Arguments: $1 - shared Telegram bot token (required, <digits>:<secret>).
  # Outputs:   via `ok`: {botUsername, groups:[{id,title,isForum,isAdmin,
  #            canManageTopics,ready}]}; all failures go through `fail`.
  local token="$1"
  [[ -n "$token" ]] || fail "$E_USAGE" "team-bot discover requires --token=<shared bot token>"
  [[ "$token" =~ ^[0-9]+:[A-Za-z0-9_-]+$ ]] || fail "$E_VALIDATION" "--token does not look like a Telegram bot token"

  # The listener counts as "live on this token" only when the saved token is
  # identical AND the unit is active; the Python below then skips getUpdates.
  local saved_token="" listener_live=0
  if [[ -r /etc/5dive/team-bot.token ]]; then
    # `|| true`: a failed read simply means "no saved token" — it must not
    # abort the whole command under `set -e`.
    saved_token=$(cat /etc/5dive/team-bot.token 2>/dev/null || true)
  fi
  if [[ -n "$saved_token" && "$saved_token" == "$token" ]] \
     && systemctl is-active --quiet 5dive-team-bot-listener.service 2>/dev/null; then
    listener_live=1
  fi

  # Chat ids already recorded in the registry (per-agent teamTopic + the
  # intercom topic) are always probed, whether or not we poll getUpdates.
  local reg known
  reg=$(registry_read)
  known=$(jq -c '[(.agents | to_entries[] | .value.teamTopic.chatId // empty), (.intercomTopic.chatId // empty)] | unique' <<<"$reg")

  # The token travels via the environment, not argv. `|| true` keeps a crashed
  # interpreter (non-zero exit, empty stdout) from tripping `set -e` here, so
  # the empty-result check below can report the failure through `fail`.
  local result
  result=$(TOKEN="$token" LISTENER_LIVE="$listener_live" KNOWN="$known" python3 - <<'PY'
import json, os, urllib.parse, urllib.request, urllib.error

TOKEN = os.environ['TOKEN']
LIVE  = os.environ.get('LISTENER_LIVE') == '1'
KNOWN = json.loads(os.environ.get('KNOWN') or '[]')

def tg(method, **p):
    url = f"https://api.telegram.org/bot{TOKEN}/{method}"
    data = urllib.parse.urlencode(p).encode()
    try:
        with urllib.request.urlopen(urllib.request.Request(url, data=data), timeout=15) as r:
            return json.load(r)
    except urllib.error.HTTPError as e:
        try:
            return json.load(e)
        except Exception:
            return {"ok": False, "description": f"HTTP {e.code}"}
    except Exception as e:
        return {"ok": False, "description": str(e)}

me = tg("getMe")
if not me.get("ok"):
    print(json.dumps({"ok": False, "error": me.get("description", "bot token invalid")}))
    raise SystemExit(0)
bot_id = me["result"]["id"]
bot_username = me["result"].get("username")

chat_ids = []
def add(cid):
    if cid not in chat_ids:
        chat_ids.append(cid)

if not LIVE:
    # No offset param = peek, not consume: Telegram only discards updates once
    # a HIGHER offset is acked, so the listener still gets everything.
    up = tg("getUpdates", timeout=0,
            allowed_updates='["my_chat_member","message","chat_member"]')
    if up.get("ok"):
        for u in up.get("result", []):
            for k in ("my_chat_member", "message", "chat_member"):
                c = (u.get(k) or {}).get("chat") or {}
                if c.get("type") in ("group", "supergroup"):
                    add(int(c["id"]))
for cid in KNOWN:
    add(int(cid))

groups = []
for cid in chat_ids:
    g = tg("getChat", chat_id=cid)
    if not g.get("ok"):
        continue  # kicked since, or the id migrated — not usable, skip
    info = g["result"]
    mm = tg("getChatMember", chat_id=cid, user_id=bot_id)
    st = (mm.get("result") or {}).get("status", "left") if mm.get("ok") else "left"
    if st in ("left", "kicked"):
        continue
    can_topics = bool((mm.get("result") or {}).get("can_manage_topics")) or st == "creator"
    is_admin = st in ("administrator", "creator")
    groups.append({
        "id": str(info["id"]),
        "title": info.get("title") or str(info["id"]),
        "isForum": bool(info.get("is_forum")),
        "isAdmin": is_admin,
        "canManageTopics": can_topics,
        "ready": bool(info.get("is_forum")) and is_admin and can_topics,
    })

print(json.dumps({"ok": True, "botUsername": bot_username, "groups": groups}))
PY
) || true
  [[ -n "$result" ]] || fail "$E_GENERIC" "team-bot discover failed"
  # The script reports API-level failures as {ok:false,error:...} on stdout.
  if [[ "$(jq -r '.ok // false' <<<"$result")" != "true" ]]; then
    fail "$E_GENERIC" "$(jq -r '.error // "discover failed"' <<<"$result")"
  fi
  local n bot_username groups
  n=$(jq '.groups | length' <<<"$result")
  bot_username=$(jq -r '.botUsername // ""' <<<"$result")
  groups=$(jq -c '.groups' <<<"$result")
  ok "team-bot discover: $n group(s) found" '{botUsername:$b, groups:$g}' \
     --arg b "$bot_username" --argjson g "$groups"
}

_team_bot_do_shared() {
  # Arguments: $1 - team group chat id
  #            $2 - owner user id (may be empty)
  #            $3 - comma-separated agent names to relay
  #            $4 - shared bot token
  # Outputs:   via `ok`: {group, relay:[{agent,status,threadId,...}]}.
  local group="$1" owner="$2" agents_filter="$3" token="$4"
  [[ -n "$token" ]] || fail "$E_USAGE" "team-bot shared requires --token=<shared bot token>"
  [[ "$token" =~ ^[0-9]+:[A-Za-z0-9_-]+$ ]] || fail "$E_VALIDATION" "--token does not look like a Telegram bot token"
  [[ -n "$agents_filter" ]] || fail "$E_USAGE" "team-bot shared requires --agents=<name[,name...]>"

  local reg; reg=$(registry_read)
  local candidates; candidates=$(_team_bot_relay_agent_list)

  # Resolve requested names against relay candidates — never touch a personal-bot
  # agent or an unknown/unsupported one.
  local targets_json="{}" name type sd tt
  local -a req
  IFS=',' read -ra req <<<"$agents_filter"
  for name in "${req[@]}"; do
    name="${name// /}"
    [[ -n "$name" ]] || continue
    grep -qxF "$name" <<<"$candidates" \
      || fail "$E_VALIDATION" "agent '$name' is not a no-bot relay candidate (already has its own bot, unknown, or its type has no telegram plugin)"
    type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
    sd=$(_tg_access_state_dir "agent-${name}" "$type" 2>/dev/null || echo "")
    tt=$(jq -c --arg n "$name" '.agents[$n].teamTopic // null' <<<"$reg")
    targets_json=$(jq -c --arg n "$name" --arg ty "$type" --arg sd "$sd" --argjson tt "$tt" \
      '.[$n]={type:$ty, stateDir:$sd, teamTopic:$tt}' <<<"$targets_json")
  done
  [[ "$targets_json" != "{}" ]] || fail "$E_NOT_FOUND" "no relay-eligible agents in --agents"

  # 1) Persist the shared token (root-only) — the listener reads it from here.
  mkdir -p /etc/5dive
  ( umask 077; printf '%s\n' "$token" > /etc/5dive/team-bot.token )
  chown root:root /etc/5dive/team-bot.token
  chmod 600 /etc/5dive/team-bot.token
  # Also persist the team group (+owner) so `agent create` can auto-attach
  # future no-bot agents to this group without re-asking (DIVE-248).
  ( umask 077; jq -n --arg g "$group" --arg o "$owner" \
      '{group:$g, owner:(if $o=="" then null else $o end)}' > /etc/5dive/team-bot.json )
  chown root:root /etc/5dive/team-bot.json
  chmod 600 /etc/5dive/team-bot.json

  # 2) Telegram: verify the shared bot can manage topics, create a topic per
  #    agent (reusing an existing one), and wire each access.json.
  #    The exit status is captured with `|| rc=$?`: a bare `rc=$?` on the next
  #    line is never reached under `set -e` when python exits non-zero, which
  #    would leak the temp file and skip the `fail` envelope.
  local reg_updates_file; reg_updates_file=$(mktemp)
  local results rc=0
  results=$(GROUP="$group" OWNER="$owner" TOKEN="$token" AGENTS="$targets_json" REG_UPDATES_FILE="$reg_updates_file" python3 - <<'PY'
import json, os, tempfile, pwd, urllib.parse, urllib.request, urllib.error

GROUP  = os.environ['GROUP']
OWNER  = os.environ.get('OWNER') or ''
TOKEN  = os.environ['TOKEN']
AGENTS = json.loads(os.environ['AGENTS'])

def tg(method, **params):
    url = f"https://api.telegram.org/bot{TOKEN}/{method}"
    data = urllib.parse.urlencode(params).encode()
    try:
        with urllib.request.urlopen(urllib.request.Request(url, data=data), timeout=15) as r:
            return json.load(r)
    except urllib.error.HTTPError as e:
        try:
            return json.load(e)
        except Exception:
            return {"ok": False, "description": f"HTTP {e.code}"}
    except Exception as e:
        return {"ok": False, "description": str(e)}

def bail(status, **extra):
    out = [dict(agent=n, status=status, threadId=None, **extra) for n in AGENTS]
    with open(os.environ.get("REG_UPDATES_FILE", "/dev/null"), "w") as f:
        f.write("{}")
    print(json.dumps(out))
    raise SystemExit(0)

me = tg("getMe")
if not me.get("ok"):
    bail("error", error=me.get("description", "shared bot token invalid"))
bot_id = me["result"]["id"]

mm = tg("getChatMember", chat_id=GROUP, user_id=bot_id)
ok_admin = (mm.get("ok") and mm["result"].get("status") in ("administrator", "creator")
            and (mm["result"].get("can_manage_topics") or mm["result"].get("status") == "creator"))
if not ok_admin:
    bail("needs_shared_admin")

results = []
reg_updates = {}
for name, a in AGENTS.items():
    out = {"agent": name, "status": "error", "threadId": None}
    existing = a.get("teamTopic") or {}
    thread = existing.get("threadId") if existing.get("chatId") == int(GROUP) else None
    if not thread:
        cf = tg("createForumTopic", chat_id=GROUP, name=name)
        if not cf.get("ok"):
            out["error"] = cf.get("description", "createForumTopic failed")
            results.append(out); continue
        thread = cf["result"]["message_thread_id"]

    sd = a.get("stateDir") or ""
    if sd:
        path = os.path.join(sd, "access.json")
        try:
            acc = json.load(open(path)) if os.path.exists(path) else {}
        except Exception:
            acc = {}
        acc.setdefault("dmPolicy", "pairing")
        acc.setdefault("allowFrom", [])
        acc.setdefault("groups", {})
        acc.setdefault("pending", {})
        acc["groups"][GROUP] = {"requireMention": False,
                                "allowFrom": [OWNER] if OWNER else [],
                                "message_thread_id": int(thread)}
        os.makedirs(sd, exist_ok=True)
        fd, tmp = tempfile.mkstemp(dir=sd)
        with os.fdopen(fd, "w") as f:
            json.dump(acc, f, indent=2)
        os.chmod(tmp, 0o600)
        os.replace(tmp, path)
        # The plugin runs as the agent and must OWN its telegram state dir +
        # create relay-in/ for the SEND_ONLY inbound watcher. os.makedirs above
        # created any missing parents as root (no-bot agents have no prior
        # telegram dir), so chown the tree + pre-make relay-in — otherwise the
        # watcher trips on a root-owned dir at mkdirSync and never registers.
        try:
            u = pwd.getpwnam("agent-" + name)
            relay_in = os.path.join(sd, "relay-in")
            os.makedirs(relay_in, exist_ok=True)
            for d in (os.path.dirname(sd), sd, relay_in):
                try:
                    os.chown(d, u.pw_uid, u.pw_gid)
                except OSError:
                    pass
            os.chmod(relay_in, 0o700)
            os.chown(path, u.pw_uid, u.pw_gid)
        except KeyError:
            pass

    reg_updates[name] = {"threadId": int(thread), "chatId": int(GROUP)}
    out["status"] = "relayed"; out["threadId"] = int(thread)
    results.append(out)

with open(os.environ.get("REG_UPDATES_FILE", "/dev/null"), "w") as f:
    json.dump(reg_updates, f)
print(json.dumps(results))
PY
) || rc=$?
  if [[ $rc -ne 0 || -z "$results" ]]; then rm -f "$reg_updates_file"; fail "$E_GENERIC" "team-bot shared failed"; fi

  # 3) For each agent that got a topic: enable the telegram plugin (send-only)
  #    + regenerate its systemd env with channels=telegram.
  #    `wired` starts as an explicit empty array so the expansions below are
  #    safe under `set -u` even when no agent was relayed.
  local -a wired=()
  while IFS= read -r name; do [[ -n "$name" ]] && wired+=("$name"); done \
    < <(jq -r '.[] | select(.status=="relayed") | .agent' <<<"$results")

  local ef wd pf iso
  for name in "${wired[@]}"; do
    type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
    step "Enabling telegram (send-only) for $name"
    install_channel_for_agent "$type" telegram "$name" "$token" "" "" || true
    _team_bot_write_sendonly_env "$name" "$token"
    # Carry the agent's current workdir / auth profile / isolation over from
    # its env file; isolation falls back to "admin" when the key is absent.
    ef="${ENV_DIR}/${name}.env"
    wd=$(sed -n 's/^AGENT_WORKDIR=//p' "$ef" 2>/dev/null | head -1)
    pf=$(sed -n 's/^AGENT_AUTH_PROFILE=//p' "$ef" 2>/dev/null | head -1)
    iso=$(sed -n 's/^AGENT_ISOLATION=//p' "$ef" 2>/dev/null | head -1); iso="${iso:-admin}"
    write_agent_env "$name" "$type" telegram "$wd" "$pf" "$iso"
  done

  # 4) Persist channels=telegram + teamTopic (under the registry lock).
  if [[ -s "$reg_updates_file" ]]; then
    with_registry_lock _team_bot_persist_shared "$reg_updates_file"
  fi
  rm -f "$reg_updates_file"

  # 5) Install + (re)start the single listener.
  if [[ ${#wired[@]} -gt 0 ]]; then
    _team_bot_install_listener
  fi

  # 6) Restart each wired agent so it loads the send-only plugin.
  for name in "${wired[@]}"; do
    step "Restarting 5dive-agent@${name}"
    systemctl restart "5dive-agent@${name}.service" >&2 2>/dev/null || true
  done

  local ready total
  ready=$(jq '[.[] | select(.status=="relayed")] | length' <<<"$results")
  total=$(jq 'length' <<<"$results")
  ok "team-bot shared: $ready/$total agents relayed in group $group" \
     '{group:$g, relay:$a}' --arg g "$group" --argjson a "$results"
}

# cmd_telegram_access_get (defined below, after the state-dir helper): read the
# telegram access.json of a claude/codex/grok/antigravity agent. Used by the
# dashboard's access-control modal to render the current allowlist / groups /
# dmPolicy. Returns the parsed JSON in `data`. If the file doesn't exist yet
# (plugin hasn't persisted state), returns the same defaults the plugin would
# write on first run.
#
# Resolve the telegram-plugin state dir (where access.json lives) for an agent
# by type. claude, codex and grok each store it under ~/.<type>/channels/telegram/
# — the home subdir name matches the agent type, and all use the same
# access.json schema {dmPolicy, allowFrom, groups}. Echoes the dir on success;
# returns nonzero for types that have no telegram access.json (openclaw/hermes
# manage approvals through their own tooling, not this file). antigravity is the
# odd one out: its home subdir is ~/.gemini (not ~/.antigravity) because agy
# reuses Google's ~/.gemini parent, so it gets an explicit branch.
_tg_access_state_dir() {
  # Arguments: $1 - Linux user (agent-<name>), $2 - agent type.
  # Outputs:   the telegram state dir path on stdout (no trailing newline).
  # Returns:   1, printing nothing, for types without a telegram access.json.
  local user="$1" type="$2"
  local dotdir
  case "$type" in
    antigravity)
      # Special case: the state lives under ~/.gemini, not ~/.antigravity.
      dotdir=".gemini"
      ;;
    claude|codex|grok)
      dotdir=".${type}"
      ;;
    *)
      return 1
      ;;
  esac
  printf '/home/%s/%s/channels/telegram' "$user" "$dotdir"
}

cmd_telegram_access_get() {
  # Usage: 5dive agent telegram-access get <name>
  # Emits {access, botUsername} through `ok`. Takes exactly one positional
  # argument and no flags.
  local name="" arg
  for arg in "$@"; do
    if [[ "$arg" == -* ]]; then
      fail "$E_USAGE" "unknown flag: $arg"
    elif [[ -n "$name" ]]; then
      fail "$E_USAGE" "extra arg: $arg"
    else
      name="$arg"
    fi
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive agent telegram-access get <name>"
  fi

  ensure_state
  local reg type channels
  reg=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
  if [[ "$channels" != "telegram" ]]; then
    fail "$E_VALIDATION" "agent '$name' has channels=$channels — telegram-access only applies to telegram"
  fi

  # Locate the agent's access.json; unsupported types are rejected here.
  local user="agent-${name}"
  local state_dir
  if ! state_dir=$(_tg_access_state_dir "$user" "$type"); then
    fail "$E_VALIDATION" "telegram-access supports claude, codex, grok and antigravity agents (got type=$type)"
  fi
  local access="${state_dir}/access.json"

  # Read the file as the agent user. A missing, empty or unparseable file is
  # replaced by the default document.
  local raw
  raw=$(sudo -u "$user" cat "$access" 2>/dev/null || true)
  if [[ -z "$raw" ]] || ! jq -e . >/dev/null 2>&1 <<<"$raw"; then
    raw='{"dmPolicy":"pairing","allowFrom":[],"groups":{}}'
  fi

  ok "" '{access: $a, botUsername: $u}' \
     --argjson a "$raw" \
     --arg u "$(jq -r --arg n "$name" '.agents[$n].botUsername // ""' <<<"$reg")"
}

# Write the telegram access.json for a claude/codex/grok/antigravity agent
# (path resolved by type via _tg_access_state_dir — all share the same schema).
# The new JSON body comes in on stdin (the dashboard sends it via the
# `stdin` field on /server/agents/exec so it never lands in argv).
#
# Schema validated server-side: dmPolicy in {pairing,allowlist,disabled},
# allowFrom = array of numeric-string ids, groups = object keyed by chat id
# whose values are {requireMention: bool, allowFrom: string[]}. Any keys we
# don't expose (pending, mentionPatterns, replyToMode, textChunkLimit,
# chunkMode, ackReaction) are merged from the existing file rather than
# clobbered, so opaque settings the dashboard hasn't surfaced survive a
# save. Plugin re-reads on every inbound message — no agent restart needed.
cmd_telegram_access_set() {
  # Usage: 5dive agent telegram-access set <name>   (JSON body on stdin)
  # Validates the body and merges it into the agent's access.json, running as
  # the agent user. Emits {name, updated:true} through `ok`.
  local name=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)  [[ -z "$name" ]] && name="$1" || fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent telegram-access set <name>  (JSON body on stdin)"
  ensure_state
  local reg type channels
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || fail "$E_NOT_FOUND" "no agent named '$name'"
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
  [[ "$channels" == "telegram" ]] \
    || fail "$E_VALIDATION" "agent '$name' has channels=$channels — telegram-access only applies to telegram"
  local user="agent-${name}"
  local state_dir
  state_dir=$(_tg_access_state_dir "$user" "$type") \
    || fail "$E_VALIDATION" "telegram-access supports claude, codex, grok and antigravity agents (got type=$type)"

  local body
  body=$(cat)
  [[ -n "$body" ]] || fail "$E_USAGE" "telegram-access set expects JSON on stdin"
  jq -e . >/dev/null 2>&1 <<<"$body" \
    || fail "$E_VALIDATION" "stdin is not valid JSON"

  step "Updating telegram access for agent '$name'"
  # Validation + atomic write live in the same python step so a bad shape
  # exits non-zero before we touch the file. STATE is the agent's plugin
  # state dir; PATCH is the dashboard's proposed {dmPolicy, allowFrom,
  # groups, botToBot?} blob. Unknown keys in the existing file (pending,
  # mentionPatterns, replyToMode, textChunkLimit, chunkMode, ackReaction)
  # survive the merge — only the dashboard-owned keys are replaced, and
  # botToBot (DIVE-161) only when the patch includes it.
  local script
  script=$(cat <<'PY'
import json, os, re, sys, tempfile

# Ids are checked with fullmatch(): match() with a "$" anchor would also accept
# a value carrying a trailing newline ("123\n").
ID_RE = re.compile(r"^-?[0-9]+$")
state = os.environ['STATE']
try:
    patch = json.loads(os.environ['PATCH'])
except json.JSONDecodeError as e:
    print(f"invalid JSON: {e}", file=sys.stderr); sys.exit(2)

def bad(msg):
    print(msg, file=sys.stderr); sys.exit(2)

if not isinstance(patch, dict):
    bad("top-level must be an object")
if patch.get('dmPolicy') not in ('pairing', 'allowlist', 'disabled'):
    bad("dmPolicy must be one of pairing|allowlist|disabled")
allow = patch.get('allowFrom')
if not isinstance(allow, list) or not all(isinstance(s, str) and ID_RE.fullmatch(s) for s in allow):
    bad("allowFrom must be an array of numeric-string ids")
groups = patch.get('groups')
if not isinstance(groups, dict):
    bad("groups must be an object")
for gid, gcfg in groups.items():
    if not ID_RE.fullmatch(gid):
        bad(f"group key '{gid}' is not numeric")
    if not isinstance(gcfg, dict):
        bad(f"group '{gid}' value must be an object")
    if 'requireMention' in gcfg and not isinstance(gcfg['requireMention'], bool):
        bad(f"group '{gid}'.requireMention must be a boolean")
    if 'allowFrom' in gcfg:
        gallow = gcfg['allowFrom']
        if not isinstance(gallow, list) or not all(isinstance(s, str) and ID_RE.fullmatch(s) for s in gallow):
            bad(f"group '{gid}'.allowFrom must be an array of numeric-string ids")

# botToBot (DIVE-161) is OPTIONAL — present only when the dashboard's bot-to-bot
# section is in play. Shape mirrors the plugin's BotToBotConfig (botguard.ts):
# {enabled:bool, allowFrom?:[bot username/id strings], maxPerMin?:int>0,
# dedupeWindowMs?:int>=0}. Omitting the key leaves any existing config untouched.
b2b = patch.get('botToBot')
if b2b is not None:
    if not isinstance(b2b, dict):
        bad("botToBot must be an object")
    if not isinstance(b2b.get('enabled'), bool):
        bad("botToBot.enabled must be a boolean")
    if 'allowFrom' in b2b:
        ba = b2b['allowFrom']
        if not isinstance(ba, list) or not all(isinstance(s, str) and s for s in ba):
            bad("botToBot.allowFrom must be an array of non-empty strings (bot @usernames or ids)")
    if 'maxPerMin' in b2b and not (isinstance(b2b['maxPerMin'], int) and not isinstance(b2b['maxPerMin'], bool) and b2b['maxPerMin'] > 0):
        bad("botToBot.maxPerMin must be a positive integer")
    if 'dedupeWindowMs' in b2b and not (isinstance(b2b['dedupeWindowMs'], int) and not isinstance(b2b['dedupeWindowMs'], bool) and b2b['dedupeWindowMs'] >= 0):
        bad("botToBot.dedupeWindowMs must be a non-negative integer")

os.makedirs(state, mode=0o700, exist_ok=True)
path = os.path.join(state, 'access.json')

# A missing, unparseable or non-object access.json is treated as empty: the
# read side already presents defaults for such a file, so a save has to be able
# to replace it instead of dying with a traceback. ValueError covers both
# json.JSONDecodeError and UnicodeDecodeError.
try:
    with open(path) as f:
        existing = json.load(f)
except (FileNotFoundError, ValueError):
    existing = {}
if not isinstance(existing, dict):
    existing = {}

merged = dict(existing)
for k in ('dmPolicy', 'allowFrom', 'groups'):
    merged[k] = patch[k]
if b2b is not None:
    merged['botToBot'] = b2b

# Write to a temp file in the same directory, then rename over the target.
fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(merged, f, indent=2)
os.replace(tmp, path)
PY
)
  # The script runs as the agent user. Its stderr is captured for the error
  # message and its stdout is discarded.
  local err
  if ! err=$(sudo -u "$user" env STATE="$state_dir" PATCH="$body" python3 -c "$script" 2>&1 >/dev/null); then
    fail "$E_VALIDATION" "${err:-telegram access.json write failed for agent '$name'}"
  fi

  ok "telegram access updated for '$name'" \
     '{name:$n, updated:true}' \
     --arg n "$name"
}

# Drop a pending pairing entry without approving it. The dashboard's inbox
# UI calls this when the operator clicks "Ignore" on a stranger's DM —
# removes the code from access.json's pending map so it stops showing in
# the modal, but does NOT add the senderId to allowFrom. The plugin will
# re-prompt with a fresh code if the same sender messages again.
cmd_telegram_pending_ignore() {
  # Usage: 5dive agent telegram-pending-ignore <name> <code>
  # Removes <code> from the `pending` map of the agent's access.json; the
  # allowFrom list is not touched. Emits {name, code, ignored:true} via `ok`.
  local name="" code="" arg
  for arg in "$@"; do
    if [[ "$arg" == -* ]]; then
      fail "$E_USAGE" "unknown flag: $arg"
    elif [[ -z "$name" ]]; then
      name="$arg"
    elif [[ -z "$code" ]]; then
      code="$arg"
    else
      fail "$E_USAGE" "extra arg: $arg"
    fi
  done
  if [[ -z "$name" || -z "$code" ]]; then
    fail "$E_USAGE" "usage: 5dive agent telegram-pending-ignore <name> <code>"
  fi
  if ! [[ "$code" =~ ^[A-Za-z0-9]{4,16}$ ]]; then
    fail "$E_VALIDATION" "invalid code format"
  fi

  ensure_state
  local reg type channels
  reg=$(registry_read)
  if ! jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$name'"
  fi
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
  if [[ "$channels" != "telegram" ]]; then
    fail "$E_VALIDATION" "agent '$name' has channels=$channels — telegram-pending-ignore only applies to telegram"
  fi

  local user="agent-${name}"
  local state_dir
  if ! state_dir=$(_tg_access_state_dir "$user" "$type"); then
    fail "$E_VALIDATION" "telegram-pending-ignore supports claude, codex, grok and antigravity agents (got type=$type)"
  fi
  local access="${state_dir}/access.json"

  # The edit runs as the agent user. Only the script's stderr is captured (for
  # the error message); its stdout is discarded.
  local err
  if ! err=$(sudo -u "$user" env ACCESS="$access" CODE="$code" python3 - <<'PY' 2>&1 >/dev/null
import json, os, sys, tempfile

access_path = os.environ['ACCESS']
pair_code = os.environ['CODE']

try:
    with open(access_path) as fh:
        doc = json.load(fh)
except FileNotFoundError:
    print("access.json not found — nothing pending", file=sys.stderr); sys.exit(2)

waiting = doc.get('pending') or {}
if pair_code not in waiting:
    print(f"code '{pair_code}' is not pending", file=sys.stderr); sys.exit(2)
waiting.pop(pair_code, None)
doc['pending'] = waiting

# Write to a sibling temp file, then rename it over access.json.
fd, scratch = tempfile.mkstemp(dir=os.path.dirname(access_path), prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as fh:
    json.dump(doc, fh, indent=2)
os.replace(scratch, access_path)
PY
  ); then
    fail "$E_PAIRING" "${err:-pending-ignore failed}"
  fi

  ok "ignored pending pairing '$code' for '$name'" \
     '{name:$n, code:$c, ignored:true}' \
     --arg n "$name" --arg c "$code"
}

# Resolve a Telegram chat reference — either a public @handle or a numeric
# chat id — to its full identity (id, displayName, type, isBot) via the
# agent's own bot token calling getChat. Used by the dashboard's
# add-allowlist UX (paste @handle instead of digging up an id) AND by the
# load-time name enrichment (turn cryptic ids in allowFrom into "Mark ·
# @lodar"). Token stays server-side. Returned id can then be written into
# allowFrom by the regular telegram-access set path — no schema change.
cmd_telegram_resolve_handle() {
  # Usage: 5dive agent telegram-resolve-handle <name> <@handle|chat_id>
  # Calls Telegram getChat with the agent's own bot token and emits
  # {id, isBot, type, username, displayName} through `ok`.
  local name="" handle=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -[0-9]*) # leading-minus numeric — a group/channel id, not a flag
        if [[ -z "$name" ]]; then name="$1"
        elif [[ -z "$handle" ]]; then handle="$1"
        else fail "$E_USAGE" "extra arg: $1"; fi ;;
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)  if [[ -z "$name" ]]; then name="$1"
          elif [[ -z "$handle" ]]; then handle="$1"
          else fail "$E_USAGE" "extra arg: $1"; fi ;;
    esac
    shift
  done
  [[ -n "$name" && -n "$handle" ]] \
    || fail "$E_USAGE" "usage: 5dive agent telegram-resolve-handle <name> <@handle|chat_id>"
  # Note on arg parsing: a leading '-' on the handle arg (e.g. group id
  # -100123…) would normally match the -* flag glob and error out. The
  # case branch above handles this by accepting -[0-9]* positionally.
  # Normalise: accept "@foo", "foo", or a numeric chat id. Anything else is
  # rejected. Numeric ids may be negative for groups/channels.
  local lookup
  if [[ "$handle" =~ ^-?[0-9]{1,20}$ ]]; then
    # Numeric id — pass through verbatim. getChat accepts these for chats
    # the bot can see (users who've messaged it, groups it's in, public
    # channels). Failures map to NOT_FOUND below.
    lookup="$handle"
  else
    handle="${handle#@}"
    [[ "$handle" =~ ^[A-Za-z][A-Za-z0-9_]{3,31}$ ]] \
      || fail "$E_VALIDATION" "invalid handle (expected 5-32 chars, letters/digits/underscore, or a numeric id)"
    lookup="@${handle}"
  fi
  ensure_state
  local reg type channels
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || fail "$E_NOT_FOUND" "no agent named '$name'"
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
  case "$type" in
    claude|codex|grok|antigravity) ;;
    *) fail "$E_VALIDATION" "telegram-resolve-handle supports claude, codex, grok and antigravity agents (got type=$type)" ;;
  esac
  [[ "$channels" == "telegram" ]] \
    || fail "$E_VALIDATION" "agent '$name' has channels=$channels — telegram-resolve-handle only applies to telegram"

  local token_env="${CONNECTORS_DIR}/telegram-${name}.env"
  local token
  token=$(sed -n 's/^TELEGRAM_BOT_TOKEN=//p' "$token_env" 2>/dev/null | head -1 || true)
  [[ -n "$token" ]] \
    || fail "$E_AUTH_REQUIRED" "no telegram bot token for agent '$name' (expected ${token_env})"

  # The request URL embeds the bot token, so it is handed to curl as a config
  # line on stdin (`--config -`) rather than as an argument: argv is visible
  # to other local users in the process list. Backslashes and double quotes
  # are escaped because curl interprets them inside a quoted config value.
  local api_url="https://api.telegram.org/bot${token}/getChat"
  api_url="${api_url//\\/\\\\}"
  api_url="${api_url//\"/\\\"}"
  local resp
  resp=$(curl -sS -m 10 --get \
    --data-urlencode "chat_id=${lookup}" \
    --config - <<<"url = \"${api_url}\"" 2>/dev/null || true)
  if ! jq -e . >/dev/null 2>&1 <<<"$resp"; then
    fail "$E_GENERIC" "telegram api unreachable"
  fi
  if [[ "$(jq -r '.ok' <<<"$resp" 2>/dev/null)" != "true" ]]; then
    local desc
    desc=$(jq -r '.description // "telegram api error"' <<<"$resp" 2>/dev/null)
    # getChat returns "Bad Request: chat not found" for unknown handles; map
    # to NOT_FOUND so the dashboard can show a friendly "no such bot" message.
    case "$desc" in
      *"chat not found"*|*"chat_id is empty"*) fail "$E_NOT_FOUND" "telegram: $desc" ;;
      *) fail "$E_GENERIC" "telegram: $desc" ;;
    esac
  fi

  # Per Bot API: getChat returns a Chat object (not User), which has no
  # is_bot field — that lives on User and is only delivered with inbound
  # messages. We derive isBot from the handle convention: Telegram requires
  # all bot usernames to end in "bot" at registration (case-insensitive),
  # so the suffix is a reliable signal for type=private chats.
  #
  # Chat objects: users/bots have first_name + optional last_name + optional
  # username. Groups/supergroups/channels have title + optional username.
  # We read both branches and prefer whichever populates.
  local chat_id chat_type username first_name last_name title
  chat_id=$(jq -r '.result.id // empty'         <<<"$resp")
  chat_type=$(jq -r '.result.type // empty'     <<<"$resp")
  username=$(jq -r '.result.username // empty'  <<<"$resp")
  first_name=$(jq -r '.result.first_name // empty' <<<"$resp")
  last_name=$(jq -r '.result.last_name // empty'   <<<"$resp")
  title=$(jq -r '.result.title // empty'        <<<"$resp")
  [[ -n "$chat_id" ]] \
    || fail "$E_GENERIC" "telegram getChat returned no id"

  local is_bot=false
  if [[ "$chat_type" == "private" ]] && [[ "${username,,}" == *bot ]]; then
    is_bot=true
  fi

  # Compose displayName by chat type. Falls back through: title (group/channel)
  # → first+last (user/bot) → @username → @handle from input → numeric id.
  local display="$title"
  if [[ -z "$display" ]]; then
    display="$first_name"
    [[ -n "$last_name" ]] && display="${display:+$display }${last_name}"
  fi
  if [[ -z "$display" ]]; then
    if [[ -n "$username" ]]; then display="@${username}"
    # Only a real @handle lookup may be echoed back as "@…". A numeric input
    # must fall through to the id, otherwise it would render as "@-100123…"
    # and the numeric-id fallback could never be reached.
    elif [[ "$lookup" == @* ]]; then display="$lookup"
    else display="$chat_id"
    fi
  fi

  local label="$display"
  [[ -n "$username" && "$display" != "@${username}" ]] && label="$display · @${username}"

  ok "resolved $label → $chat_id" \
     '{id:$id, isBot:($b == "true"), type:$t, username:$u, displayName:$d}' \
     --arg id "$chat_id" \
     --arg b  "$is_bot" \
     --arg t  "$chat_type" \
     --arg u  "$username" \
     --arg d  "$display"
}

# Interactive pairing for a telegram- or discord-enabled claude-family agent.
# Two paths:
#   --code=<code>     classic: user DMs bot, bot replies with "pair <code>",
#                     dashboard pastes that here. We pop <code> from access.json's
#                     pending map, add the senderId to allowFrom, drop
#                     approved/<senderId>.
#   --user-id=<id>    auto: caller already discovered the chat (via
#                     cmd_telegram_discover or out-of-band) and wants to seed
#                     access.json directly. Skips the code roundtrip — writes
#                     allowFrom/approved with the supplied id immediately.
#                     For private DMs chat_id == user_id, so --chat-id is
#                     optional.
#
# Telegram and Discord plugins use the same access.json schema + approved/
# dir layout, so the JSON patch is identical — only the paths, token env
# var, and welcome-delivery mechanism differ.
cmd_pair() {
  local name="" precode="" preuser="" prechat=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --code=*)     precode="${1#--code=}" ;;
      --user-id=*)  preuser="${1#--user-id=}" ;;
      --chat-id=*)  prechat="${1#--chat-id=}" ;;
      -*)           fail "$E_USAGE" "unknown flag: $1" ;;
      *)            [[ -z "$name" ]] && name="$1" || fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent pair <name> [--code=<code> | --user-id=<id> [--chat-id=<id>]]"
  if [[ -n "$precode" && -n "$preuser" ]]; then
    fail "$E_USAGE" "--code and --user-id are mutually exclusive"
  fi
  if [[ -n "$preuser" ]]; then
    valid_telegram_chat_id "$preuser" \
      || fail "$E_VALIDATION" "invalid --user-id (numeric, optionally negative)"
    if [[ -n "$prechat" ]]; then
      valid_telegram_chat_id "$prechat" \
        || fail "$E_VALIDATION" "invalid --chat-id (numeric, optionally negative)"
    else
      # Private DM convention: chat_id matches user_id. Groups need explicit --chat-id.
      prechat="$preuser"
    fi
  fi
  ensure_state
  local reg
  reg=$(registry_read)
  jq -e --arg n "$name" '.agents[$n] != null' <<<"$reg" >/dev/null \
    || fail "$E_NOT_FOUND" "no agent named '$name'"
  local type channels
  type=$(jq -r --arg n "$name" '.agents[$n].type' <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels' <<<"$reg")
  case "$channels" in
    telegram|discord) ;;
    *) fail "$E_VALIDATION" "agent '$name' has channels=$channels — pairing only applies to telegram or discord" ;;
  esac
  # cmd_pair applies to claude, codex, grok and antigravity — their
  # telegram/discord plugins use a code-roundtrip (user DMs bot, bot replies with a code,
  # dashboard pastes the code back to seed access.json) and share the same
  # access.json schema + path layout (~/.<type>/channels/<channel>/). openclaw
  # and hermes are token-only: the bot token alone is enough to authorise the
  # agent, and inbound user approvals flow through openclaw's own `pairing`
  # subcommand rather than this code path.
  case "$type" in
    claude|codex|grok|antigravity) ;;
    openclaw|hermes)
      fail "$E_VALIDATION" "type=$type doesn't use pair codes — the bot token configured at create time is sufficient. To approve specific Telegram/Discord users for an openclaw agent, run: sudo -u agent-${name} openclaw pairing list" ;;
    *)
      fail "$E_VALIDATION" "pairing only applies to claude, codex, grok and antigravity agents (got type=$type)" ;;
  esac

  local user="agent-${name}"
  local access="/home/${user}/.${type}/channels/${channels}/access.json"
  local token_env token_var
  case "$channels" in
    telegram) token_env="${CONNECTORS_DIR}/telegram-${name}.env"; token_var="TELEGRAM_BOT_TOKEN" ;;
    discord)  token_env="${CONNECTORS_DIR}/discord-${name}.env";  token_var="DISCORD_BOT_TOKEN"  ;;
  esac

  local bot_token
  bot_token=$(sed -n "s/^${token_var}=//p" "$token_env" 2>/dev/null | head -1 || true)
  [[ -n "$bot_token" ]] \
    || fail "$E_AUTH_REQUIRED" "no bot token for agent '$name' — run: sudo 5dive agent config $name set ${channels}.token=<token>"

  # Auto-pair path: caller already knows the (user_id, chat_id) — typically
  # because cmd_telegram_discover surfaced them from getUpdates. Seed
  # access.json directly without waiting for the plugin: the plugin only
  # writes access.json when it has state to persist (a pending pairing
  # entry, etc.), so on a freshly-created agent that's never received a
  # message the file may never appear. Writing it ourselves means the
  # plugin reads our allowFrom on first message — same end state as the
  # code-roundtrip path, no race with a pending entry.
  if [[ -n "$preuser" ]]; then
    local chat_id="$prechat"
    local state_dir="/home/${user}/.${type}/channels/${channels}"
    sudo -u "$user" env SENDER="$preuser" CHAT="$chat_id" STATE="$state_dir" python3 - <<'PY' >&2 \
      || fail "$E_PAIRING" "auto-pair seed failed"
import json, os, tempfile

state = os.environ['STATE']
sender = os.environ['SENDER']
chat = os.environ['CHAT']

os.makedirs(state, mode=0o700, exist_ok=True)
path = os.path.join(state, 'access.json')

try:
    with open(path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {"dmPolicy": "pairing", "allowFrom": [], "groups": {}, "pending": {}}

allow = list(data.get('allowFrom') or [])
if sender not in allow:
    allow.append(sender)
data['allowFrom'] = allow

approved = os.path.join(state, 'approved')
os.makedirs(approved, mode=0o700, exist_ok=True)
with open(os.path.join(approved, sender), 'w') as f:
    f.write(chat)

fd, tmp = tempfile.mkstemp(dir=state, prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.replace(tmp, path)
print(f"Auto-paired user {sender} (chat {chat})")
PY

    if [[ "$channels" == "telegram" ]]; then
      send_welcome_message "$chat_id" "$bot_token" "$name" "$type"
    fi
    ok "agent '$name' paired with chat $chat_id." \
       '{name:$n, channels:$ch, chatId:$c, paired:true}' \
       --arg n "$name" --arg ch "$channels" --arg c "$chat_id"
    return
  fi

  # Pair-code path: the bot writes access.json with a pending entry when the
  # user DMs it, so wait for that before trying to consume the code. Cold
  # start can take ~45s on a fresh box (skill preinstalls + plugin install
  # run during agent startup), so wait 90s.
  step "Waiting for $channels plugin on agent '$name'..."
  local waited=0
  for _ in $(seq 1 90); do
    if sudo -u "$user" test -f "$access" 2>/dev/null; then
      break
    fi
    sleep 1
    waited=$((waited+1))
  done
  sudo -u "$user" test -f "$access" 2>/dev/null \
    || fail "$E_TIMEOUT" "$access not found after 90s. Is the agent running? (systemctl status 5dive-agent@${name})"

  # Interactive INTRO is only shown to a human at a TTY. JSON callers must
  # pass --code=<code>; the non-precode path is unreachable over the API.
  if [[ -z "$precode" && "$JSON_MODE" == "0" ]]; then
    local app_label example_code
    case "$channels" in
      telegram) app_label="Telegram"; example_code="d13dc3" ;;
      discord)  app_label="Discord";  example_code="a4f2b1" ;;
    esac
    cat >&2 <<INTRO
Open $app_label and send any message to your bot. The bot will reply with
something like:

    Pairing required — run in Claude Code:
    /${channels}:access pair ${example_code}

Paste the reply (or just the code) below.

INTRO
  fi

  # Either prompt interactively (TTY) or consume --code once (exec path).
  local msg code chat_id tries_left=5
  [[ -n "$precode" ]] && tries_left=1
  while (( tries_left-- > 0 )); do
    if [[ -n "$precode" ]]; then
      msg="$precode"
    else
      read -r -p "Paste: " msg
    fi

    # grep with no match is expected when the user pastes just the bare code.
    code=$(printf '%s' "$msg" \
      | grep -oE 'pair[[:space:]]+[A-Za-z0-9]+' \
      | head -1 | awk '{print $2}' || true)
    if [[ -z "$code" ]]; then
      code=$(printf '%s' "$msg" | tr -d '[:space:]')
    fi
    if [[ ! "$code" =~ ^[A-Za-z0-9]{4,16}$ ]]; then
      warn "Could not extract a pair code from that. Paste the full bot reply or just the code."
      [[ -n "$precode" ]] && fail "$E_VALIDATION" "invalid --code=<code>"
      continue
    fi

    if chat_id=$(sudo -u "$user" env CODE="$code" ACCESS="$access" python3 - <<'PY'
import json, os, sys, tempfile

path = os.environ['ACCESS']
code = os.environ['CODE']

with open(path) as f:
    data = json.load(f)

pending = data.get('pending') or {}
entry = pending.pop(code, None)
if entry is None:
    print(f"Pair code '{code}' is not pending. Message the bot within the "
          "last hour, then retry.", file=sys.stderr)
    sys.exit(2)

sender = str(entry.get('senderId', '')).strip()
chat = str(entry.get('chatId', '')).strip()
if not sender:
    print("Pending entry missing senderId", file=sys.stderr)
    sys.exit(3)

allow = list(data.get('allowFrom') or [])
if sender not in allow:
    allow.append(sender)

data['allowFrom'] = allow
data['pending'] = pending

approved = os.path.join(os.path.dirname(path), 'approved')
os.makedirs(approved, mode=0o700, exist_ok=True)
with open(os.path.join(approved, sender), 'w') as f:
    f.write(chat)

fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path), prefix='.access.', suffix='.tmp')
with os.fdopen(fd, 'w') as f:
    json.dump(data, f, indent=2)
os.replace(tmp, path)
print(f"Paired user {sender}", file=sys.stderr)
print(chat)
PY
    ); then
      if [[ -n "$chat_id" ]]; then
        break
      fi
      warn "Pairing returned no chat id. Try again."
      [[ -n "$precode" ]] && fail "$E_PAIRING" "pairing failed"
    else
      warn "That code isn't pending. Message the bot first, then paste the reply."
      [[ -n "$precode" ]] && fail "$E_PAIRING" "pairing code not pending"
    fi
  done

  [[ -n "${chat_id:-}" ]] || fail "$E_PAIRING" "exhausted retries without a successful pairing"

  # Telegram: CLI sends a welcome DM via Telegram's HTTP API.
  # Discord: the plugin's channel server polls approved/<senderId> and sends
  # its own "you're in" DM through the gateway — we don't need (and don't
  # have) a simple HTTP send path here.
  if [[ "$channels" == "telegram" ]]; then
    send_welcome_message "$chat_id" "$bot_token" "$name"
  fi
  ok "agent '$name' paired with chat $chat_id." \
     '{name:$n, channels:$ch, chatId:$c, paired:true}' \
     --arg n "$name" --arg ch "$channels" --arg c "$chat_id"
}

# One-shot "it works" DM after a successful pairing — labelled with the agent
# name + type so users running many bots can tell them apart. Token goes via
# URL-encoded POST body (not argv) so it doesn't show up in `ps`. Copy is
# per-type: claude surfaces its model/effort + voice (Claude-plugin features);
# codex/grok/antigravity drop those lines since they're Claude-specific (and
# reading claude's settings files would render wrong values for them).
send_welcome_message() {
  # Args: $1 chat id, $2 bot token, $3 agent name (optional), $4 agent type
  # (optional, defaults to claude). Best-effort: a failed send only warns.
  local chat_id="$1" bot_token="$2" agent_name="${3:-}" agent_type="${4:-claude}" text

  # FIVE_DOMAIN is the host's public subdomain (e.g. agent.example.com),
  # set during provisioning. Folded into the message only when present so
  # self-hosted boxes / dev VMs don't surface a half-rendered URL.
  local domain=""
  if [[ -r /etc/5dive/provisioning.env ]]; then
    # Guarded: a read hiccup must not abort the caller under `set -e`.
    domain=$(sed -n 's/^FIVE_DOMAIN=//p' /etc/5dive/provisioning.env 2>/dev/null | head -1) || domain=""
  fi
  local live_line=""
  if [[ -n "$domain" ]]; then
    live_line=" Anything you build goes live at https://${domain} ready to share, or ask me to add your own domain."
  fi

  # "'<name>', " when we know the agent name, else "" → "I'm your X agent."
  local name_q=""
  if [[ -n "$agent_name" ]]; then
    name_q="'${agent_name}', "
  fi

  case "$agent_type" in
    codex|grok|antigravity)
      local kind
      if [[ "$agent_type" == "codex" ]]; then kind="Codex agent (OpenAI Codex)"
      elif [[ "$agent_type" == "antigravity" ]]; then kind="Antigravity agent (Google Gemini)"
      else kind="Grok agent (xAI Grok)"; fi
      text=$(cat <<EOF
👋 We're connected! I'm ${name_q}your ${kind}.

Here 24/7, ready to pick up where we left off. Send text, photos, or files.

Tell me what to build: app, site, bot, report, campaign. Consider it shipped.${live_line} Need more hands? Siblings on demand, working in parallel.
EOF
)
      ;;
    *)
      # Read the agent's model/effort from its own settings.json first, then
      # fall back to the base claude user and the legacy shared projects file.
      # Never surface a raw "default" placeholder: if we can't read a concrete
      # model, show a generic line.
      local model="" effort="" f
      local candidates=()
      if [[ -n "$agent_name" ]]; then
        candidates+=("/home/agent-${agent_name}/.claude/settings.json")
      fi
      candidates+=("/home/claude/.claude/settings.json" "/home/claude/projects/.claude/settings.local.json")
      for f in "${candidates[@]}"; do
        [[ -r "$f" ]] || continue
        # A malformed settings file makes jq exit non-zero; treat that as
        # "no value" instead of letting `set -e` kill the pairing command.
        model=$(jq -r '.model // empty' "$f" 2>/dev/null) || model=""
        effort=$(jq -r '.effortLevel // empty' "$f" 2>/dev/null) || effort=""
        if [[ -n "$model" ]]; then
          break
        fi
      done
      local model_line="Model and effort are switchable anytime, just ask."
      if [[ -n "$model" && "$model" != "default" ]]; then
        if [[ -n "$effort" && "$effort" != "default" ]]; then
          model_line="Running ${model} at ${effort} effort. Switchable anytime, just ask."
        else
          model_line="Running ${model}. Switchable anytime, just ask."
        fi
      fi
      text=$(cat <<EOF
👋 We're connected! I'm ${name_q}your Claude agent.

${model_line}

Here 24/7 with memory. Send text, photos, or files, or ask for voice if you'd rather talk.

Tell me what to build: app, site, bot, report, campaign. Consider it shipped.${live_line} Need more hands? Siblings on demand, working in parallel.
EOF
)
      ;;
  esac

  # The request URL embeds the bot token, so it is handed to curl through a
  # config read from stdin (--config -) rather than argv, keeping the secret
  # out of `ps`. --fail turns an HTTP error from Telegram into a non-zero exit
  # (otherwise a rejected send would be reported as sent); the timeouts keep a
  # hung API from wedging the foreground pairing command.
  if curl -sS -o /dev/null --fail --connect-timeout 5 --max-time 10 \
       --data-urlencode "chat_id=${chat_id}" \
       --data-urlencode "text=${text}" \
       --config - <<<"url = \"https://api.telegram.org/bot${bot_token}/sendMessage\""; then
    echo "Sent welcome message to chat ${chat_id}" >&2
  else
    warn "Failed to send welcome message"
  fi
}

# -------- lifecycle / inspection (start, stop, logs, send, clone, stats) --------

# Shared: resolve a registry entry or die. Echo nothing on success; used for
# presence checks in the lifecycle commands below.
require_agent() {
  # $1: agent name. Silent when the registry has a non-null entry for it;
  # otherwise reports E_NOT_FOUND through fail.
  local wanted="$1" snapshot
  ensure_state
  snapshot=$(registry_read)
  if ! jq -e --arg n "$wanted" '.agents[$n] != null' <<<"$snapshot" >/dev/null; then
    fail "$E_NOT_FOUND" "no agent named '$wanted'"
  fi
}

# Resolve an agent's type from the registry. Used by the skill subcommands so
# the per-type SKILLS_AGENT_ID / SKILLS_INSTALL_DIR maps drive --agent and
# the post-install verification path. Caller should `require_agent` first;
# returns empty string if the agent isn't registered.
agent_type() {
  # $1: agent name. Prints the registry's .type for that agent, or nothing
  # when the agent (or its type field) is absent.
  local agent_name="$1"
  local type_filter='.agents[$key].type // empty'
  registry_read \
    | jq -r --arg key "$agent_name" "$type_filter"
}

cmd_start() {
  # `5dive agent start <name>`: start the agent's systemd unit after checking
  # that the agent is registered.
  local agent="${1:-}"
  if [[ -z "$agent" ]]; then
    fail "$E_USAGE" "usage: 5dive agent start <name>"
  fi
  require_agent "$agent"

  local unit="5dive-agent@${agent}.service"
  # systemctl chatter goes to stderr so JSON stdout stays clean.
  systemctl start "$unit" >&2

  ok "agent '$agent' started." '{name:$n, action:"start"}' --arg n "$agent"
}

cmd_stop() {
  # `5dive agent stop <name>`: stop the agent's systemd unit after checking
  # that the agent is registered.
  local agent="${1:-}"
  if [[ -z "$agent" ]]; then
    fail "$E_USAGE" "usage: 5dive agent stop <name>"
  fi
  require_agent "$agent"

  local unit="5dive-agent@${agent}.service"
  # systemctl chatter goes to stderr so JSON stdout stays clean.
  systemctl stop "$unit" >&2

  ok "agent '$agent' stopped." '{name:$n, action:"stop"}' --arg n "$agent"
}

# journalctl for the agent's unit, or a tmux scrollback capture with --tmux.
# --follow streams until the caller hangs up; in the /agents/exec path the
# shelld timeout caps this, so the dashboard should prefer the WS session for
# true follow.
#
# JSON output:
#   --tmux     -> {ok:true, data:{name, source:"tmux",    lines:[...]}}
#   default    -> {ok:true, data:{name, source:"journal", lines:[...]}}
#   --follow   -> NDJSON, one {line:"..."} per event on stdout. (Not wrapped
#                 in an envelope because it is an unbounded stream; consumers
#                 watch exit code for the envelope-less failure signal.)
cmd_logs() {
  # Args: <name> [--follow|-f] [--lines=N] [--tmux]. Reads the global
  # JSON_MODE to pick between plain text and the JSON shapes.
  local name="" follow=0 lines=200 tmux_mode=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --follow|-f) follow=1 ;;
      --lines=*)   lines="${1#--lines=}" ;;
      --tmux)      tmux_mode=1 ;;
      -*)          fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        # First bare word is the agent name; any further one is an error.
        if [[ -z "$name" ]]; then
          name="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi
        ;;
    esac
    shift
  done
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent logs <name> [--follow] [--lines=N] [--tmux]"
  [[ "$lines" =~ ^[0-9]+$ ]] || fail "$E_VALIDATION" "invalid --lines (must be a positive integer)"
  require_agent "$name"

  if (( tmux_mode )); then
    # Scrollback capture of the agent's tmux pane, last $lines lines.
    local capture
    capture=$(sudo -u "agent-${name}" tmux capture-pane -t "agent-${name}" -p -S "-${lines}" 2>/dev/null) \
      || fail "$E_NOT_RUNNING" "tmux session 'agent-${name}' not found (is the agent running?)"
    if (( JSON_MODE )); then
      jq -Rn --arg n "$name" \
        '{ok:true, data:{name:$n, source:"tmux", lines:[inputs]}}' <<<"$capture"
    else
      printf '%s\n' "$capture"
    fi
    return 0
  fi

  local args=(-u "5dive-agent@${name}.service" --no-pager -n "$lines")
  if (( follow )); then
    args+=(-f)
  fi

  if (( JSON_MODE )); then
    if (( follow )); then
      # NDJSON stream; no envelope. Each line becomes one JSON object.
      # --unbuffered flushes after every object: without it jq block-buffers
      # when stdout is a pipe, so a live follow would deliver lines only in
      # delayed bursts.
      journalctl "${args[@]}" | jq --unbuffered -Rc '{line: .}'
    else
      journalctl "${args[@]}" \
        | jq -Rn --arg n "$name" '{ok:true, data:{name:$n, source:"journal", lines:[inputs]}}'
    fi
  else
    journalctl "${args[@]}"
  fi
}

# Sender-side group mirror for inter-agent traffic. Posts "@<receiver>\n<body>"
# into the SENDER's Telegram group via the SENDER's own bot, so the operator
# sees agent-to-agent messages under the correct sender identity — canonical
# group "call" style (each bot addresses the @recipient).
#
# This lives in the CLI rather than a hook on purpose: here we have the fully
# expanded message body. The old sender-side PreToolUse Bash mirror only saw
# the pre-expansion command string and choked on heredoc bodies
# (`"$(cat <<EOF…EOF)"`), which is why it was moved receiver-side. Doing it in
# the command itself sidesteps that entirely.
#
# Best-effort and self-gating: returns 0 (never blocks/fails the send) when not
# invoked by an agent, when the sender has no bot token, or when no group is
# configured. The receiver's reply rides the same path — when the receiver
# answers via `5dive agent send <original-sender>`, that call posts the reply
# payload under the receiver's bot, completing the two-sided "call" view.
mirror_interagent_outbound() {
  # Args: $1 receiver agent name, $2 message body. Always returns 0: every
  # lookup below is guarded so that, under `set -e`, a failing registry read
  # or an unparsable JSON file skips the mirror instead of aborting the send
  # that has already been delivered.
  local receiver="$1" body="$2"

  # Only a real agent (SUDO_USER=agent-<x>) has a bot identity to post under.
  local invoker="${SUDO_USER:-}"
  [[ -n "$invoker" && "$invoker" == agent-* ]] || return 0
  local invoker_name="${invoker#agent-}"

  local token_file="${CONNECTORS_DIR}/telegram-${invoker_name}.env"
  [[ -r "$token_file" ]] || return 0
  local token
  token=$(sed -n 's/^TELEGRAM_BOT_TOKEN=//p' "$token_file" 2>/dev/null | head -1) || return 0
  [[ -n "$token" ]] || return 0

  # access.json lives under ~/.<type>/channels/telegram/ — resolve the
  # invoker's type so codex/grok agents mirror too, not just claude. Bail
  # quietly for token-only types (openclaw/hermes) with no access.json.
  local reg
  reg=$(registry_read) || return 0
  local invoker_type
  invoker_type=$(jq -r --arg n "$invoker_name" '.agents[$n].type // empty' <<<"$reg" 2>/dev/null) \
    || invoker_type=""
  local access_dir
  access_dir=$(_tg_access_state_dir "$invoker" "$invoker_type") || return 0
  local access_file="${access_dir}/access.json"
  [[ -r "$access_file" ]] || return 0
  local group_chat_id
  group_chat_id=$(jq -r '(.groups // {}) | keys | .[0] // empty' "$access_file" 2>/dev/null) \
    || return 0
  [[ -n "$group_chat_id" ]] || return 0

  # Optional forum-topic routing: if the group entry carries a
  # message_thread_id, post into that topic (e.g. a dedicated "#5dive" thread)
  # instead of the supergroup's General channel.
  local thread_id
  thread_id=$(jq -r --arg g "$group_chat_id" '.groups[$g].message_thread_id // empty' "$access_file" 2>/dev/null) \
    || thread_id=""

  # DIVE-195 intercom: if a fleet intercom topic is set and this agent belongs to
  # that group, consolidate inter-agent chatter into the intercom topic —
  # overriding the first-sorted group picked above (an agent can be in several
  # groups, including stale ones, so don't rely on keys[0] matching). Single
  # source of truth = registry .intercomTopic {threadId, chatId}.
  local intercom_chat intercom_thread
  intercom_chat=$(jq -r '.intercomTopic.chatId // empty' <<<"$reg" 2>/dev/null) || intercom_chat=""
  intercom_thread=$(jq -r '.intercomTopic.threadId // empty' <<<"$reg" 2>/dev/null) || intercom_thread=""
  if [[ -n "$intercom_thread" ]] && \
     jq -e --arg g "$intercom_chat" '(.groups // {}) | has($g)' "$access_file" >/dev/null 2>&1; then
    group_chat_id="$intercom_chat"
    thread_id="$intercom_thread"
  fi

  # Strip leading/trailing whitespace (per line); nothing left → nothing to post.
  local trimmed
  trimmed=$(printf '%s' "$body" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') || return 0
  [[ -n "$trimmed" ]] || return 0

  # Resolve the receiver's @botUsername for a tappable mention; fall back to
  # the bare agent name if the registry has no cached username.
  local bot to_label
  bot=$(jq -r --arg n "$receiver" '.agents[$n].botUsername // empty' <<<"$reg" 2>/dev/null) || bot=""
  if [[ -n "$bot" ]]; then to_label="@${bot}"; else to_label="@${receiver}"; fi

  # DIVE-195: shared-bot (send-only) agents all post under ONE bot identity, so
  # the intercom can't tell who sent it. Prepend the sender's name for those.
  # Personal-bot agents post under their own bot (name + avatar), so leave clean.
  if grep -q '^TELEGRAM_SEND_ONLY=1' "$token_file" 2>/dev/null; then
    to_label="${invoker_name} to ${to_label}"
  fi

  # Cap the mirrored body; MIRROR_MAX_BODY_CHARS overrides the 800 default.
  local max_chars="${MIRROR_MAX_BODY_CHARS:-800}"
  local body_disp overflow=""
  if (( ${#trimmed} > max_chars )); then
    body_disp="${trimmed:0:$((max_chars - 1))}…"
    overflow=" (+$(( ${#trimmed} - max_chars )) chars)"
  else
    body_disp="$trimmed"
  fi

  local mirror_text
  mirror_text=$(printf '%s\n%s%s' "$to_label" "$body_disp" "$overflow")
  _mirror_post "$token" "$group_chat_id" "$thread_id" "$mirror_text" "$access_file" || true
  return 0
}

# POST a mirror message, threading into message_thread_id when set. Auto-follows
# a group→supergroup migration: once a group is upgraded (which is also how it
# gains forum topics), Telegram rejects sends to the old basic-group id with
# parameters.migrate_to_chat_id. On that error we rewrite the stored group id
# and retry once against the new supergroup id, so the mirror self-heals instead
# of silently dying. Best-effort throughout — a mirror post is never load-bearing.
_mirror_post() {
  # Args: $1 bot token, $2 chat id, $3 thread id (may be empty), $4 text,
  # $5 access.json path, $6 optional reply_markup JSON. Always returns 0.
  local token="$1" chat="$2" thread="$3" text="$4" access_file="$5" reply_markup="${6:-}"
  local resp ok_flag new_chat

  # `|| true`: a transport failure (timeout, DNS, ...) makes _mirror_send exit
  # non-zero; unguarded, that assignment would abort the whole CLI under
  # `set -e` before the empty-response check below could run.
  resp=$(_mirror_send "$token" "$chat" "$thread" "$text" "$reply_markup") || true
  [[ -n "$resp" ]] || return 0

  ok_flag=$(jq -r '.ok // false' <<<"$resp" 2>/dev/null) || ok_flag=""
  if [[ "$ok_flag" == "true" ]]; then
    return 0
  fi

  # Group was upgraded to a supergroup: remember the new id, retry once there.
  # A non-JSON response makes jq fail; treat that as "no migration hint".
  new_chat=$(jq -r '.parameters.migrate_to_chat_id // empty' <<<"$resp" 2>/dev/null) || new_chat=""
  if [[ -n "$new_chat" && "$new_chat" != "$chat" ]]; then
    _mirror_follow_migration "$access_file" "$chat" "$new_chat" || true
    _mirror_send "$token" "$new_chat" "$thread" "$text" "$reply_markup" >/dev/null 2>&1 || true
    return 0
  fi

  # DIVE-117: the send failed for a non-migration reason. A button-bearing send
  # can be rejected for the keyboard alone (a reply_markup Telegram dislikes)
  # while the plain text would deliver. The text alert is load-bearing
  # (DIVE-105), so retry once WITHOUT the keyboard — the ping must never be lost
  # to a button problem.
  if [[ -n "$reply_markup" ]]; then
    _mirror_send "$token" "$chat" "$thread" "$text" "" >/dev/null 2>&1 || true
  fi
  return 0
}

# Optional 5th arg reply_markup: a Telegram inline_keyboard JSON string. When
# present it's attached so the message carries tap buttons (DIVE-117). Empty =
# a plain text send (unchanged). Built as an arg array so thread + reply_markup
# compose without duplicating the curl call.
_mirror_send() {
  # Args: $1 bot token, $2 chat id, $3 thread id (may be empty), $4 text,
  # $5 optional reply_markup JSON. Prints Telegram's response body on stdout
  # and returns curl's exit status.
  local token="$1" chat="$2" thread="$3" text="$4" reply_markup="${5:-}"
  local args=(--data-urlencode "chat_id=${chat}" --data-urlencode "text=${text}")
  if [[ -n "$thread" ]]; then
    args+=(--data-urlencode "message_thread_id=${thread}")
  fi
  if [[ -n "$reply_markup" ]]; then
    args+=(--data-urlencode "reply_markup=${reply_markup}")
  fi
  # Bounded so a hung/slow Telegram API can't wedge the FOREGROUND callers
  # (task_need_notify runs this after the gate UPDATE has already committed;
  # mirror_interagent_outbound likewise). --connect-timeout caps the TCP/TLS
  # handshake, --max-time the whole request (DIVE-115).
  #
  # The URL embeds the bot token, so it is fed to curl as a config line on
  # stdin (--config -) instead of argv, where any local user could read it
  # from `ps` while the request is in flight.
  curl -s --connect-timeout 5 --max-time 10 -X POST "${args[@]}" --config - \
    <<<"url = \"https://api.telegram.org/bot${token}/sendMessage\"" 2>/dev/null
}

# Rename a migrated group's key (old→new) in access.json, preserving the policy
# value (incl. message_thread_id) and the file's owner; the rewritten file's
# mode is set to 0600. Runs as root (the mirror only fires under sudo), so
# chowning back to the agent owner is required — otherwise the plugin, running
# as the agent user, could no longer write it.
_mirror_follow_migration() {
  # Args: $1 access.json path, $2 old group id, $3 new group id.
  # Best-effort: always returns 0 and leaves access.json untouched whenever a
  # step fails, so a mirror hiccup can never abort the caller under `set -e`.
  local access_file="$1" old="$2" new="$3"
  local tmp owner

  # Numeric uid:gid avoids depending on name lookups when handing the file back.
  owner=$(stat -c '%u:%g' "$access_file" 2>/dev/null) || owner=""

  # mktemp instead of a predictable "<file>.migrate.$$": this runs as root in
  # a directory the agent user can write to, so a guessable name could be
  # pre-created (e.g. as a symlink) and written through. Same directory keeps
  # the final mv a same-filesystem rename.
  tmp=$(mktemp "${access_file}.migrate.XXXXXX" 2>/dev/null) || return 0

  if ! jq --arg o "$old" --arg n "$new" '
    if (.groups // {}) | has($o)
    then (.groups[$n] = .groups[$o]) | del(.groups[$o])
    else . end
  ' "$access_file" > "$tmp" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null || true
    return 0
  fi

  # Give the file back to its original owner before swapping it in. If that
  # fails, abandon the rewrite rather than install a file the owner can't write.
  if [[ -n "$owner" ]] && ! chown "$owner" "$tmp" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null || true
    return 0
  fi
  chmod 600 "$tmp" 2>/dev/null || true
  mv -f -- "$tmp" "$access_file" 2>/dev/null || rm -f -- "$tmp" 2>/dev/null || true
  return 0
}

# Wait until the agent's TUI has rendered its input prompt and can actually
# receive keystrokes. A freshly (re)started agent takes ~15-30s to boot Claude
# + its plugins/MCP servers; a send-keys before the input box exists is
# silently dropped and the message is LOST (the recurring "my ping never
# arrived after a restart" bug). We poll the pane for the prompt marker "❯",
# which Claude renders once the input box is up (present whether idle OR
# mid-generation — Claude queues typed input — so we don't needlessly block on
# a busy agent). Best-effort + bounded: returns 0 as soon as it's ready, 1 on
# timeout (caller still sends — better to try than to hang forever, and TUIs
# that never draw "❯" shouldn't wedge inter-agent sends).
wait_agent_input_ready() {
  # Args: $1 agent name, $2 max seconds to wait (default 45).
  # Returns 0 as soon as the pane shows the "❯" prompt marker, 1 on timeout.
  local name="$1" timeout="${2:-45}"
  local session="agent-${name}" elapsed screen
  for (( elapsed = 0; elapsed < timeout; elapsed++ )); do
    # A missing session just yields an empty capture; keep polling.
    screen=$(sudo -u "$session" tmux capture-pane -p -t "$session" 2>/dev/null || true)
    if grep -q '❯' <<<"$screen"; then
      return 0
    fi
    sleep 1
  done
  return 1
}

# Inject a payload into the agent's tmux pane and SUBMIT it, robust against the
# TUI's bracketed-paste handling (DIVE-147). A large/multiline `send-keys -l` is
# absorbed by the TUI as a bracketed PASTE — the prompt shows
# "❯ [Pasted text #N +M lines]" — and a single trailing Enter races into / is
# swallowed by the paste, so the turn never starts and the message is SILENTLY
# DROPPED (small single-paragraph nudges usually submit, which is why the bug is
# size/linecount-correlated and intermittent). Strategy: type the body, pause so
# the paste commits, send Enter, then CONFIRM the pane left the pasted-but-unsent
# state — retrying the Enter a few times before giving up. Best-effort + bounded:
# returns 0 once submission is confirmed (or there was no paste buffer to begin
# with — small msgs / non-Claude TUIs that don't show the placeholder), 1 if it
# still looks unsubmitted after retries. The "[Pasted text #" marker is Claude's
# input-buffer rendering; other runtimes just fall through the fast path.
inject_and_submit() {
  # Args: $1 agent name, $2 payload text.
  # Returns 0 once the pane no longer shows an unsent "[Pasted text #N" buffer,
  # 1 if it is still showing after five Enter presses.
  local name="$1" payload="$2"
  local target="agent-${name}" attempt screen
  local -a tmux_as=(sudo -u "$target" tmux)

  # Type the payload literally (-l), with no key-name interpretation.
  "${tmux_as[@]}" send-keys -t "$target" -l -- "$payload"
  # Give the TUI time to finish ingesting the (possibly bracketed-paste)
  # payload so the Enter isn't bundled into the paste sequence.
  sleep 0.3

  for (( attempt = 0; attempt < 5; attempt++ )); do
    "${tmux_as[@]}" send-keys -t "$target" Enter
    sleep 0.4
    screen=$("${tmux_as[@]}" capture-pane -p -t "$target" 2>/dev/null || true)
    # Placeholder gone → submitted. Enter is only re-sent while it is still
    # visible, so an already-submitted message never gets stray extra Enters.
    if ! grep -q '\[Pasted text #[0-9]' <<<"$screen"; then
      return 0
    fi
  done
  return 1
}

# Inject a message into the agent's tmux session. Uses inject_and_submit so the
# text is delivered literally AND actually submitted (bracketed-paste safe).
# Not exposed via /agents/exec: arbitrary text won't pass the API arg regex, so
# this is CLI + direct-shelld only.
cmd_send() {
  # Usage: send <name> <text...> | --message=<text> [--from=<sender>] [--raw]
  #        [--reply-to-chat=<id> [--reply-to-msg=<id>]]
  # The first bare word is the agent name; remaining bare words are joined
  # with spaces into the message unless --message= supplied one.
  local name="" message="" from="" from_set=0 raw=0
  local reply_to_chat="" reply_to_msg=""
  local -a positional=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --message=*)        message="${1#--message=}" ;;
      --from=*)           from="${1#--from=}"; from_set=1 ;;
      --raw)              raw=1 ;;
      --reply-to-chat=*)  reply_to_chat="${1#--reply-to-chat=}" ;;
      --reply-to-msg=*)   reply_to_msg="${1#--reply-to-msg=}" ;;
      --)                 shift; positional+=("$@"); break ;;
      -*)                 fail "$E_USAGE" "unknown flag: $1" ;;
      *)                  positional+=("$1") ;;
    esac
    shift
  done
  if [[ ${#positional[@]} -gt 0 ]]; then
    name="${positional[0]}"
    positional=("${positional[@]:1}")
  fi
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent send <name> <text...> | --message=<text> [--from=<sender>] [--raw] [--reply-to-chat=<id> [--reply-to-msg=<id>]]"
  if [[ -z "$message" && ${#positional[@]} -gt 0 ]]; then
    message="${positional[*]}"
  fi
  [[ -n "$message" ]] || fail "$E_USAGE" "message is empty"
  require_agent "$name"
  sudo -u "agent-${name}" tmux has-session -t "agent-${name}" 2>/dev/null \
    || fail "$E_NOT_RUNNING" "tmux session 'agent-${name}' not found (is the agent running?)"

  # Optional reply-target hint. If present, it tells the receiver: "the user is
  # reachable in this chat — reply there directly via your own bot rather than
  # back through me." --raw skips wrapping entirely, so combining the two has
  # nowhere to put the hint.
  if [[ -n "$reply_to_chat" || -n "$reply_to_msg" ]]; then
    (( ! raw )) || fail "$E_USAGE" "--raw cannot be combined with --reply-to-chat/--reply-to-msg"
  fi
  # --raw + --from is contradictory: --raw means "no envelope, no metadata"
  # (for piping pre-formatted prompts), so claiming a sender identity has
  # nowhere to land. The sender-side outbound mirror also gates on (!raw) —
  # if --raw silently strips the [5dive-msg from=X] envelope while --from
  # suggests "this is from me", the mirror would skip with no warning and
  # the operator would see neither side of the conversation. Force the
  # caller to pick one: identify yourself (and accept the envelope) or send
  # raw (and accept anonymity).
  if (( raw && from_set )); then
    fail "$E_USAGE" "--raw cannot be combined with --from (raw mode strips the envelope that carries sender identity)"
  fi
  if [[ -n "$reply_to_chat" ]]; then
    valid_telegram_chat_id "$reply_to_chat" \
      || fail "$E_VALIDATION" "invalid --reply-to-chat (expected numeric chat id, optionally negative)"
  fi
  if [[ -n "$reply_to_msg" ]]; then
    [[ -n "$reply_to_chat" ]] \
      || fail "$E_USAGE" "--reply-to-msg requires --reply-to-chat"
    [[ "$reply_to_msg" =~ ^[0-9]{1,20}$ ]] \
      || fail "$E_VALIDATION" "invalid --reply-to-msg (expected positive integer)"
  fi

  # Wrap with [5dive-msg from=<sender> id=<id> ...] when this is an inter-agent
  # send, so the receiver can see who's pinging it and reply by name. --raw
  # opts out (useful when piping prompts that already format themselves).
  # --from explicitly empty (`--from=`) also opts out — unless --reply-to-chat
  # is set, in which case we force-wrap (synthetic sender "human") so the hint
  # actually reaches the receiver.
  local payload="$message" sender="" msg_id=""
  if (( ! raw )); then
    if (( from_set )); then
      sender="$from"
    else
      sender="$(auto_sender_from_sudo)"
    fi
    if [[ -z "$sender" && -n "$reply_to_chat" ]]; then
      sender="human"
    fi
    if [[ -n "$sender" ]]; then
      valid_sender_label "$sender" \
        || fail "$E_VALIDATION" "invalid --from label '$sender' (lowercase letter start, [a-z0-9-], <=32 chars)"
      msg_id="$(gen_msg_id)"
      local header="[5dive-msg from=${sender} id=${msg_id}"
      if [[ -n "$reply_to_chat" ]]; then header+=" reply-to-chat=${reply_to_chat}"; fi
      if [[ -n "$reply_to_msg" ]]; then header+=" reply-to-msg=${reply_to_msg}"; fi
      header+="]"
      payload="${header} ${message}"
    fi
  fi

  # Don't fire keystrokes into a still-booting TUI — they'd be dropped and the
  # message lost. Wait for the input prompt to render (fast no-op when already
  # up). On timeout we still send best-effort and warn, rather than hang.
  if ! wait_agent_input_ready "$name"; then
    step "agent '$name' input prompt not detected after 45s — sending best-effort (may be lost if still booting)"
  fi

  if ! inject_and_submit "$name" "$payload"; then
    step "agent '$name': payload may not have submitted — pane still shows an unsent paste buffer after retries (large-paste submit race, DIVE-147)"
  fi

  # Mirror the outbound into the sender's group chat (best-effort). Gated on a
  # real envelope: a raw/anonymous send has no sender identity to mirror under.
  if (( ! raw )); then
    mirror_interagent_outbound "$name" "$message"
  fi

  # Optional fields are dropped when empty. This must be done by filtering the
  # finished object: writing `key:($v|select(length>0))` inside the object
  # constructor makes jq emit NO object at all as soon as any one value is
  # empty, which blanked the JSON data for every send lacking a reply target.
  # Parenthesised so the filter stays a single term wherever it is embedded.
  ok "sent to agent '$name'." \
     '({name:$n, sent:true, bytes:($p|length), from:$s, msg_id:$i, reply_to_chat:$rc, reply_to_msg:$rm} | with_entries(select(.value != "")))' \
     --arg n "$name" --arg p "$payload" --arg s "$sender" --arg i "$msg_id" --arg rc "$reply_to_chat" --arg rm "$reply_to_msg"
}

# Synchronous send + wait — the inter-agent counterpart to cmd_send. Drops the
# wrapped envelope into the receiver's tmux, then polls capture-pane until the
# scrollback after our marker line stops growing for --idle-secs (or
# --timeout fires). Returns just the reply body, not the receiver's prompt
# echo. Idle-by-stability is intentionally dumb: receiver CLIs don't all emit
# a clean "I'm done" sentinel, and trying to detect per-CLI idle prompts is
# brittle. A noisy receiver (e.g. one printing progress every second forever)
# will keep us awake until --timeout — that's correct behaviour.
cmd_ask() {
  local name="" message="" from="" from_set=0
  local reply_to_chat="" reply_to_msg=""
  local timeout=120 idle=5 poll=2 buf_lines=2000
  local -a positional=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --message=*)        message="${1#--message=}" ;;
      --from=*)           from="${1#--from=}"; from_set=1 ;;
      --reply-to-chat=*)  reply_to_chat="${1#--reply-to-chat=}" ;;
      --reply-to-msg=*)   reply_to_msg="${1#--reply-to-msg=}" ;;
      --timeout=*)        timeout="${1#--timeout=}" ;;
      --idle-secs=*)      idle="${1#--idle-secs=}" ;;
      --poll-secs=*)      poll="${1#--poll-secs=}" ;;
      --buffer-lines=*)   buf_lines="${1#--buffer-lines=}" ;;
      --)                 shift; positional+=("$@"); break ;;
      -*)                 fail "$E_USAGE" "unknown flag: $1" ;;
      *)                  positional+=("$1") ;;
    esac
    shift
  done
  if [[ ${#positional[@]} -gt 0 ]]; then
    name="${positional[0]}"
    positional=("${positional[@]:1}")
  fi
  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent ask <name> <text...> [--from=<sender>] [--reply-to-chat=<id> [--reply-to-msg=<id>]] [--timeout=120] [--idle-secs=5] [--poll-secs=2]"
  if [[ -z "$message" && ${#positional[@]} -gt 0 ]]; then
    message="${positional[*]}"
  fi
  [[ -n "$message" ]] || fail "$E_USAGE" "message is empty"
  for n in "$timeout" "$idle" "$poll" "$buf_lines"; do
    [[ "$n" =~ ^[0-9]+$ ]] || fail "$E_VALIDATION" "timeout/idle/poll/buffer-lines must be positive integers"
  done
  (( poll >= 1 )) || fail "$E_VALIDATION" "--poll-secs must be >= 1"

  if [[ -n "$reply_to_chat" ]]; then
    valid_telegram_chat_id "$reply_to_chat" \
      || fail "$E_VALIDATION" "invalid --reply-to-chat (expected numeric chat id, optionally negative)"
  fi
  if [[ -n "$reply_to_msg" ]]; then
    [[ -n "$reply_to_chat" ]] \
      || fail "$E_USAGE" "--reply-to-msg requires --reply-to-chat"
    [[ "$reply_to_msg" =~ ^[0-9]{1,20}$ ]] \
      || fail "$E_VALIDATION" "invalid --reply-to-msg (expected positive integer)"
  fi

  require_agent "$name"
  sudo -u "agent-${name}" tmux has-session -t "agent-${name}" 2>/dev/null \
    || fail "$E_NOT_RUNNING" "tmux session 'agent-${name}' not found (is the agent running?)"

  # Resolve sender — ask always wraps because we need a marker to slice the
  # reply window. Fall back to a literal "ask" if neither --from nor SUDO_USER
  # gives us anything; at worst the receiver sees from=ask, which is
  # informative ("a script asked me, not a peer agent").
  local sender msg_id
  if (( from_set )); then
    sender="$from"
  else
    sender="$(auto_sender_from_sudo)"
  fi
  [[ -n "$sender" ]] || sender="ask"
  valid_sender_label "$sender" \
    || fail "$E_VALIDATION" "invalid --from label '$sender' (lowercase letter start, [a-z0-9-], <=32 chars)"
  msg_id="$(gen_msg_id)"
  local header="[5dive-msg from=${sender} id=${msg_id}"
  [[ -n "$reply_to_chat" ]] && header+=" reply-to-chat=${reply_to_chat}"
  [[ -n "$reply_to_msg" ]] && header+=" reply-to-msg=${reply_to_msg}"
  header+="]"
  local payload="${header} ${message}"

  # Same boot-race guard as cmd_send: wait for the input prompt before sending
  # so a freshly-(re)started target doesn't silently drop the question.
  if ! wait_agent_input_ready "$name"; then
    step "agent '$name' input prompt not detected after 45s — sending best-effort (may be lost if still booting)"
  fi

  if ! inject_and_submit "$name" "$payload"; then
    step "agent '$name': question may not have submitted — pane still shows an unsent paste buffer after retries (large-paste submit race, DIVE-147)"
  fi

  # Mirror the outbound into the sender's group chat (best-effort). ask always
  # wraps, so there's always a sender identity to mirror under.
  mirror_interagent_outbound "$name" "$message"

  local start now last_change reply="" prev_slice="" capture slice
  start=$(date +%s)
  last_change=$start
  while :; do
    sleep "$poll"
    now=$(date +%s)
    capture=$(sudo -u "agent-${name}" tmux capture-pane -t "agent-${name}" -p -S "-${buf_lines}" 2>/dev/null) || true
    # Everything after the first line containing our marker. The receiver's
    # CLI typically echoes the user input once, so the slice begins right
    # after that echo and grows as the receiver responds.
    slice=$(awk -v id="id=${msg_id}" 'found {print} index($0, id) {found=1}' <<<"$capture")

    if [[ "$slice" != "$prev_slice" ]]; then
      last_change=$now
      prev_slice="$slice"
    fi

    if (( now - start >= timeout )); then
      fail "$E_TIMEOUT" "no idle reply from '$name' within ${timeout}s (msg_id=${msg_id})"
    fi
    if [[ -n "$slice" ]] && (( now - last_change >= idle )); then
      reply="$slice"
      break
    fi
  done

  if (( JSON_MODE )); then
    jq -Rn --arg n "$name" --arg s "$sender" --arg i "$msg_id" --arg r "$reply" \
      --arg rc "$reply_to_chat" --arg rm "$reply_to_msg" \
      '{ok:true, data:{name:$n, from:$s, msg_id:$i, reply:$r, reply_to_chat:($rc|select(length>0)), reply_to_msg:($rm|select(length>0))}}'
  else
    printf '%s\n' "$reply"
  fi
}

# cmd_clone <src> <dst> [--channels=...] [--telegram-token=...]
#                       [--discord-token=...] [--workdir=...]
#
# Creates agent <dst> modelled on registry entry <src>: same type, same auth
# profile (when recorded) and the same workdir unless --workdir overrides it.
# Channels are not inherited blindly — two agents can't share a
# telegram/discord bot — so when the source has a channel and the caller gave
# neither --channels nor a fresh token, the clone is created with
# channels=none and a warning is emitted. Creation itself is delegated to
# cmd_create, whose envelope is the only one printed.
cmd_clone() {
  local src="" dst=""
  local channels_override="" channels_given=0 workdir_override=""
  local telegram_token="" discord_token=""
  local arg
  for arg in "$@"; do
    case "$arg" in
      --channels=*)
        channels_override="${arg#--channels=}"
        channels_given=1
        ;;
      --telegram-token=*) telegram_token="${arg#--telegram-token=}" ;;
      --discord-token=*)  discord_token="${arg#--discord-token=}" ;;
      --workdir=*)        workdir_override="${arg#--workdir=}" ;;
      -*)                 fail "$E_USAGE" "unknown flag: $arg" ;;
      *)
        # First bare word is the source, second the destination.
        if [[ -n "$src" && -n "$dst" ]]; then
          fail "$E_USAGE" "extra arg: $arg"
        elif [[ -z "$src" ]]; then
          src="$arg"
        else
          dst="$arg"
        fi
        ;;
    esac
  done
  if [[ -z "$src" || -z "$dst" ]]; then
    fail "$E_USAGE" "usage: 5dive agent clone <src> <dst> [--channels=...] [--telegram-token=...] [--discord-token=...] [--workdir=...]"
  fi

  ensure_state
  local registry
  registry=$(registry_read)
  if ! jq -e --arg n "$src" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_NOT_FOUND" "source agent '$src' does not exist"
  fi
  if jq -e --arg n "$dst" '.agents[$n] != null' <<<"$registry" >/dev/null; then
    fail "$E_CONFLICT" "destination agent '$dst' already exists"
  fi

  # Pull the fields we copy over from the source's registry entry.
  local src_type src_channels src_workdir src_profile
  src_type=$(jq -r --arg n "$src" '.agents[$n].type' <<<"$registry")
  src_channels=$(jq -r --arg n "$src" '.agents[$n].channels // "none"' <<<"$registry")
  src_workdir=$(jq -r --arg n "$src" '.agents[$n].workdir // empty' <<<"$registry")
  src_profile=$(jq -r --arg n "$src" '.agents[$n].authProfile // empty' <<<"$registry")

  # Channel resolution: explicit --channels wins; otherwise inherit the
  # source's channel only when at least one fresh token was supplied.
  local new_channels="$src_channels"
  if (( channels_given )); then
    new_channels="$channels_override"
  elif [[ "$src_channels" != "none" ]] && [[ -z "${telegram_token}${discord_token}" ]]; then
    warn "source has channels=$src_channels but no --${src_channels}-token provided — clone defaults to channels=none"
    new_channels="none"
  fi

  local new_workdir="${override_workdir:-}"
  new_workdir="${workdir_override:-$src_workdir}"

  # Assemble the cmd_create invocation; optional flags only when non-empty.
  local -a create_argv=("$dst" "--type=${src_type}" "--channels=${new_channels}")
  if [[ -n "$new_workdir" ]]; then
    create_argv+=("--workdir=${new_workdir}")
  fi
  if [[ -n "$src_profile" ]]; then
    create_argv+=("--auth-profile=${src_profile}")
  fi
  if [[ -n "$telegram_token" ]]; then
    create_argv+=("--telegram-token=${telegram_token}")
  fi
  if [[ -n "$discord_token" ]]; then
    create_argv+=("--discord-token=${discord_token}")
  fi

  step "Cloning '$src' -> '$dst' (type=$src_type, channels=$new_channels)"
  # cmd_create emits its own ok/fail envelope, which becomes the clone's
  # output too — dashboards parse exactly one envelope.
  cmd_create "${create_argv[@]}"
}

# cmd_stats <name> | --all
#
# Reports one agent's registry fields (type, channels, workdir, createdAt),
# its systemd unit state (5dive-agent@<name>.service) and a best-effort
# "health" hint scraped from the agent's live tmux pane.
#
#   <name>  Single agent. JSON envelope when JSON_MODE is set, otherwise a
#           human-readable key/value listing on stdout.
#   --all   Every agent in the registry, emitted as ONE JSON envelope whose
#           `data` is an array of the per-agent objects. This form always
#           prints JSON regardless of JSON_MODE and takes no name.
#
# Fails with E_USAGE on unknown flags, extra positionals, a missing name, or
# a name combined with --all.
cmd_stats() {
  local name="" all=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --all) all=1 ;;
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        if [[ -z "$name" ]]; then
          name="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi
        ;;
    esac
    shift
  done

  # Batched form: `stats --all` emits every agent's stats object in ONE
  # invocation so the dashboard/mobile collapse N per-agent box execs into a
  # single one (the box shell rate-limit is shared across all of a user's exec
  # traffic, so N calls every few seconds trip it). Reuses the single-agent path
  # per agent (no duplicated gather), JSON-only since it's a machine endpoint.
  # (DIVE-206)
  if (( all )); then
    [[ -z "$name" ]] || fail "$E_USAGE" "stats --all takes no name"
    local _reg _names _arr="[]" _n _d
    _reg=$(registry_read)
    _names=$(jq -r '.agents | keys[]' <<<"$_reg" 2>/dev/null || true)
    for _n in $_names; do
      # Subshell isolates the forced JSON_MODE; unwrap the single-agent `.data`.
      # A failing agent is skipped rather than aborting the whole batch.
      _d=$(JSON_MODE=1; cmd_stats "$_n" 2>/dev/null | jq -c '.data' 2>/dev/null) || continue
      [[ -n "$_d" && "$_d" != "null" ]] || continue
      _arr=$(jq -c --argjson d "$_d" '. + [$d]' <<<"$_arr")
    done
    printf '{"ok":true,"data":%s}\n' "$_arr"
    return 0
  fi

  [[ -n "$name" ]] || fail "$E_USAGE" "usage: 5dive agent stats <name> [--json] | stats --all --json"
  require_agent "$name"

  local reg
  reg=$(registry_read)

  local svc="5dive-agent@${name}.service"
  # One shell-out for all systemd fields we care about. A failing systemctl
  # leaves every field empty instead of aborting.
  local props
  props=$(systemctl show "$svc" \
    --property=ActiveState,SubState,Result,NRestarts,ActiveEnterTimestamp,ExecMainStartTimestamp,ExecMainStatus,ExecMainExitTimestamp \
    --no-page 2>/dev/null || true)

  # Parse the `Key=Value` lines in a single pass. Splitting only on the first
  # `=` keeps any later `=` inside the value intact.
  local active="" sub="" result="" restarts="" active_ts="" main_ts="" exit_status="" exit_ts=""
  local prop_key prop_val
  while IFS='=' read -r prop_key prop_val; do
    case "$prop_key" in
      ActiveState)            active="$prop_val" ;;
      SubState)               sub="$prop_val" ;;
      Result)                 result="$prop_val" ;;
      NRestarts)              restarts="$prop_val" ;;
      ActiveEnterTimestamp)   active_ts="$prop_val" ;;
      ExecMainStartTimestamp) main_ts="$prop_val" ;;
      ExecMainStatus)         exit_status="$prop_val" ;;
      ExecMainExitTimestamp)  exit_ts="$prop_val" ;;
    esac
  done <<<"$props"

  local type channels created workdir
  type=$(jq     -r --arg n "$name" '.agents[$n].type'                      <<<"$reg")
  channels=$(jq -r --arg n "$name" '.agents[$n].channels // "none"'        <<<"$reg")
  created=$(jq  -r --arg n "$name" '.agents[$n].createdAt // empty'        <<<"$reg")
  workdir=$(jq  -r --arg n "$name" --arg d "$DEFAULT_WORKDIR" '.agents[$n].workdir // $d' <<<"$reg")

  # Best-effort health: the bare systemd state says "active" even when the
  # agent is wedged at a rate-limit menu or a login screen. Scrape the live
  # pane for those banners (mirrors the telegram plugin's detectStallCause) so
  # the dashboard can surface a stall the operator would otherwise only learn
  # via Telegram. Stays `null` when running clean or when we can't read the
  # pane (e.g. not root). Only meaningful while active.
  local health="null"
  if [[ "$active" == "active" ]]; then
    local pane
    pane=$(sudo -u "agent-${name}" tmux capture-pane -t "agent-${name}" -p -S -40 2>/dev/null | tail -c 4000 || true)
    if [[ -n "$pane" ]]; then
      if grep -qiE "session limit|usage limit|hit your (usage|session) limit|rate limit|/rate-limit-options" <<<"$pane"; then
        # `|| true` is load-bearing: when the banner carries no "reset…" text
        # grep exits 1, pipefail propagates it, and under `set -e` the bare
        # assignment would kill the whole command before the fallback detail
        # below could ever be used.
        local reset
        reset=$(grep -oiE "resets?[^|]*" <<<"$pane" | head -1 | tr -s ' ' | sed 's/[[:space:]]*$//' || true)
        health=$(jq -cn --arg d "${reset:-no reset time shown}" '{cause:"rate_limited", detail:$d}')
      elif grep -qiE "(sign ?in|log ?in|authenticate|re-?authenticate|enter your api key)" <<<"$pane"; then
        health=$(jq -cn '{cause:"auth", detail:"sitting at a login screen — re-auth needed"}')
      fi
    fi
  fi

  if (( JSON_MODE )); then
    # restarts / execMainStatus are coerced to numbers; anything non-numeric
    # degrades to 0 / null respectively.
    jq -cn \
      --arg name "$name" --arg type "$type" --arg channels "$channels" \
      --arg created "$created" --arg workdir "$workdir" \
      --arg active "$active" --arg sub "$sub" --arg result "$result" \
      --arg restarts "${restarts:-0}" --arg active_ts "$active_ts" \
      --arg main_ts "$main_ts" --arg exit_status "${exit_status:-}" --arg exit_ts "$exit_ts" \
      --argjson health "$health" '{
        ok:true, data:{
          name: $name, type: $type, channels: $channels,
          createdAt: $created, workdir: $workdir,
          active: $active, sub: $sub, result: $result,
          restarts: ($restarts | tonumber? // 0),
          activeEnter: $active_ts,
          execMainStart: $main_ts,
          execMainStatus: ($exit_status | tonumber? // null),
          execMainExit: $exit_ts,
          health: $health
        }
      }'
  else
    echo "name:         $name"
    echo "type:         $type"
    echo "channels:     $channels"
    echo "workdir:      $workdir"
    echo "created:      ${created:-unknown}"
    echo "state:        ${active:-unknown} (${sub:-unknown})"
    echo "result:       ${result:-unknown}"
    echo "restarts:     ${restarts:-0}"
    echo "active since: ${active_ts:-never}"
    echo "last start:   ${main_ts:-never}"
    echo "last exit:    ${exit_ts:-never} (status=${exit_status:-?})"
    if [[ "$health" != "null" ]]; then
      echo "health:       $(jq -r '"\(.cause) — \(.detail)"' <<<"$health")"
    fi
  fi
}

# -------- skills (per-agent, via npx skills, type-aware) --------
#
# Each agent user has its own per-type skills dir (.claude/skills for claude,
# .hermes/skills for hermes, .agents/skills for codex/opencode, plain
# ./skills for openclaw). `npx skills add` with `--agent <id>` lands the skill
# in the right place — see SKILLS_AGENT_ID / SKILLS_INSTALL_DIR at the top of
# this file. The dashboard's Skills block calls these subcommands through
# /agents/exec so install/list/remove all flow through the same auditable
# path as the rest of agent management.

# valid_skill_source <source>
# Succeeds only for a skill source shaped like `<owner>/<repo>`: exactly one
# slash, each side a non-empty run of letters, digits, `.`, `_` or `-`. The
# github URL handed to `npx skills add` is built from this value, so the
# narrow character class keeps shell metacharacters off the command line
# even before bash quoting comes into play.
valid_skill_source() {
  local -r owner_repo_re='^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$'
  [[ "$1" =~ $owner_repo_re ]]
}

# Default GitHub `<owner>/<repo>` that parse_skill_spec substitutes when a
# skill spec is a bare id with no `<owner/repo>:` prefix (e.g. in
# --with-skills: `5dive-cli` → `5dive-com/skills:5dive-cli`). Keeps the common
# path short while leaving the door open for third-party skill repos.
DEFAULT_SKILL_SOURCE="5dive-com/skills"

# parse_skill_spec <spec> -> prints "<source> <skill>" on one line.
#   `<owner/repo>:<id>`  split at the FIRST colon (the id keeps any later ones)
#   `<id>`               bare id, paired with DEFAULT_SKILL_SOURCE
# No validation happens here; the caller splits the result on the space and
# is expected to validate both halves.
parse_skill_spec() {
  local spec="$1" repo skill_id
  case "$spec" in
    *:*)
      repo="${spec%%:*}"
      skill_id="${spec#*:}"
      ;;
    *)
      repo="$DEFAULT_SKILL_SOURCE"
      skill_id="$spec"
      ;;
  esac
  printf '%s %s\n' "$repo" "$skill_id"
}

# valid_skill_id <id>
# Validate a skill id (the directory name that will end up under the per-type
# skills dir, e.g. .claude/skills/<id>). Same character class skills.sh uses:
# letters, digits, `.`, `_`, `-`.
#
# `.` and `..` pass that character class but are directory references, not
# names: the id is joined onto the skills dir and handed to rm -rf / cp -r, so
# they are rejected explicitly. A missing or empty argument is invalid.
# Returns 0 when the id is acceptable, 1 otherwise.
valid_skill_id() {
  local id="${1:-}"
  if [[ "$id" == "." || "$id" == ".." ]]; then
    return 1
  fi
  [[ "$id" =~ ^[A-Za-z0-9._-]+$ ]]
}

# cmd_skill <agent-name>|--all <action> [args...]
# Entry point for the `skill` subcommand family; routes to the per-action
# handlers so main()'s case stays flat (same shape as the auth subcommand).
#
#   <name> add|list|rm|remove [...]  per-agent action; the agent must exist
#   --all [list]                     bulk read: installed skills for EVERY
#                                    registry agent in one serial pass
#
# The dashboard uses `--all list` instead of firing one exec per agent — the
# per-agent fan-out spawned N concurrent sudo+npx processes and saturated
# swap-bound boxes.
cmd_skill() {
  local target="${1:-}" verb
  if [[ -z "$target" ]]; then
    fail "$E_USAGE" "usage: 5dive agent skill <name>|--all add|list|rm [...]"
  fi
  shift

  # Bulk form is read-only by design: add/rm stay per-agent so the blast
  # radius of a mutation is always a single named agent.
  if [[ "$target" == "--all" ]]; then
    verb="${1:-list}"
    if [[ "$verb" != "list" ]]; then
      fail "$E_USAGE" "--all only supports 'list' (got '$verb')"
    fi
    cmd_skill_list_all
    return
  fi

  require_agent "$target"
  verb="${1:-}"
  if [[ -z "$verb" ]]; then
    fail "$E_USAGE" "usage: 5dive agent skill $target add|list|rm [...]"
  fi
  shift

  case "$verb" in
    list)
      cmd_skill_list "$target" "$@"
      ;;
    add)
      cmd_skill_add "$target" "$@"
      ;;
    rm|remove)
      cmd_skill_rm "$target" "$@"
      ;;
    *)
      fail "$E_USAGE" "unknown skill action: $verb (use add | list | rm)"
      ;;
  esac
}

# cmd_skill_add <name> --source=<owner/repo> --skill=<id>
#
# Installs one skill from a GitHub repo into the agent's per-type skills dir
# (SKILLS_INSTALL_DIR, relative to the agent's home) and reports via ok/fail.
#
# Fails with:
#   E_USAGE       unknown flag, or --source / --skill missing
#   E_VALIDATION  source or skill id rejected by the validators
#   E_NOT_FOUND   agent has no type recorded in the registry
#   E_GENERIC     agent home/user missing, or the install script failed
#
# All installer output is mirrored to stderr only.
cmd_skill_add() {
  local name="$1"; shift
  local source="" skill=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --source=*) source="${1#--source=}" ;;
      --skill=*)  skill="${1#--skill=}" ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  [[ -n "$source" && -n "$skill" ]] \
    || fail "$E_USAGE" "usage: 5dive agent skill $name add --source=<owner/repo> --skill=<id>"
  valid_skill_source "$source" \
    || fail "$E_VALIDATION" "invalid source: '$source' (expected owner/repo)"
  valid_skill_id "$skill" \
    || fail "$E_VALIDATION" "invalid skill id: '$skill'"

  local user="agent-${name}" home="/home/agent-${name}"
  [[ -d "$home" ]] || fail "$E_GENERIC" "agent home missing: $home"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"

  local type agent_id install_dir
  type=$(agent_type "$name")
  [[ -n "$type" ]] || fail "$E_NOT_FOUND" "agent '$name' has no type recorded in registry"
  agent_id="${SKILLS_AGENT_ID[$type]:-claude-code}"
  install_dir="${SKILLS_INSTALL_DIR[$type]:-.claude/skills}"

  # Determine isolation so we can choose the right install strategy. A
  # missing env file or missing key is treated as "admin".
  local isolation
  isolation=$(grep -oP '(?<=AGENT_ISOLATION=)\S+' "${ENV_DIR}/${name}.env" 2>/dev/null || echo "admin")

  # Command prefix used to launch the install script, held as an ARRAY so it
  # is never re-split by the shell (the prefix always ends in `env`, so
  # VAR=value words can be appended directly).
  #
  # Sandboxed agents are not in the claude group, so /home/claude/ is
  # inaccessible to them and the claude binary can't be found on PATH.
  # For those agents we run the install as the current user (root, which has
  # full access) with HOME overridden to the agent's own home, then re-own
  # the result. Everyone else runs the script as the agent user itself.
  local sandboxed=0
  local -a runner=(sudo -u "$user" -H env)
  if [[ "$isolation" == "sandboxed" ]]; then
    sandboxed=1
    runner=(env HOME="$home")
  fi

  step "Installing skill '$skill' from '$source' for agent '$name' (--agent $agent_id)"
  # Same pattern as install_channel_plugin_for_agent: non-login shell,
  # CLAUDE_CONFIG_DIR unset so $HOME is the install target root, nvm
  # sourced manually so npx is on PATH. Output mirrored to stderr only.
  #
  # Manual-install types (grok today, see _skill_needs_manual_install in
  # lib/agent_setup.sh): upstream `npx skills add` rejects --agent grok with
  # "Invalid agents: grok", so we git-clone + cp -r the skill dir directly
  # into $HOME/$INSTALL_DIR — git is available everywhere npm/npx is.
  if _skill_needs_manual_install "$type"; then
    if ! "${runner[@]}" HOME="$home" SOURCE="$source" SKILL="$skill" INSTALL_DIR="$install_dir" bash -s >&2 <<'SKILL_ADD_MANUAL'
set -euo pipefail
unset CLAUDE_CONFIG_DIR
cd "$HOME"
# Private scratch dir, removed on every exit path. Not named TMPDIR so the
# conventional temp-dir env var is left alone for the tools we spawn.
scratch=$(mktemp -d -t skill-XXXXXX)
trap 'rm -rf "$scratch"' EXIT
# git's own output is noise on success, but a failure must say why we stopped.
if ! timeout 60 git clone --depth=1 "https://github.com/$SOURCE.git" "$scratch/repo" >/dev/null 2>&1; then
  echo "ERROR: git clone of https://github.com/$SOURCE.git failed or timed out" >&2
  exit 1
fi
# A skill dir is recognised by its SKILL.md, at top-level or under skills/.
SRC_DIR=""
for d in "$scratch/repo/$SKILL" "$scratch/repo/skills/$SKILL"; do
  if [ -f "$d/SKILL.md" ]; then SRC_DIR="$d"; break; fi
done
[ -n "$SRC_DIR" ] || { echo "ERROR: skill '$SKILL' not found in $SOURCE (looked at top-level and skills/)" >&2; exit 1; }
mkdir -p "$HOME/$INSTALL_DIR"
# Replace any previous copy so a re-install is a clean overwrite.
rm -rf "$HOME/$INSTALL_DIR/$SKILL"
cp -r "$SRC_DIR" "$HOME/$INSTALL_DIR/$SKILL"
[ -d "$HOME/$INSTALL_DIR/$SKILL" ] || { echo "ERROR: $INSTALL_DIR/$SKILL missing after install" >&2; exit 1; }
echo "manual-installed $SKILL → $HOME/$INSTALL_DIR/$SKILL"
SKILL_ADD_MANUAL
    then
      fail "$E_GENERIC" "skill install failed for '$skill' on agent '$name'"
    fi
    # Sandboxed installs ran as the current user; hand the tree to the agent.
    if (( sandboxed )); then
      chown -R "${user}:${user}" "$home/$install_dir/$skill" 2>/dev/null || true
    fi
    ok "skill '$skill' installed for agent '$name'." \
       '{name:$n, source:$s, skill:$k, agent:$a, action:"add", strategy:"manual"}' \
       --arg n "$name" --arg s "$source" --arg k "$skill" --arg a "$agent_id"
    return 0
  fi

  # npx path: one script for both isolation modes — only the launcher differs.
  if ! "${runner[@]}" SOURCE="$source" SKILL="$skill" AGENT_ID="$agent_id" INSTALL_DIR="$install_dir" bash -s >&2 <<'SKILL_ADD'
set -euo pipefail
unset CLAUDE_CONFIG_DIR
export NVM_DIR="/home/claude/.nvm"
# shellcheck disable=SC1091
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
export PATH="/home/claude/.local/bin:$PATH"
cd "$HOME"
timeout 180 npx -y skills add "https://github.com/$SOURCE" --skill "$SKILL" --agent "$AGENT_ID" --yes 2>&1 | tail -25
[ -d "$INSTALL_DIR/$SKILL" ] || { echo "ERROR: $INSTALL_DIR/$SKILL missing after install" >&2; exit 1; }
SKILL_ADD
  then
    fail "$E_GENERIC" "skill install failed for '$skill' on agent '$name'"
  fi
  # Sandboxed installs ran as the current user; hand the tree to the agent.
  if (( sandboxed )); then
    chown -R "${user}:${user}" "$home/$install_dir/$skill" 2>/dev/null || true
  fi

  ok "skill '$skill' installed for agent '$name'." \
     '{name:$n, source:$s, skill:$k, agent:$a, action:"add"}' \
     --arg n "$name" --arg s "$source" --arg k "$skill" --arg a "$agent_id"
}

# _skill_list_json <name> -> prints the installed-skills JSON for one agent
# on stdout, with no trailing newline. Never fails the caller: "[]" is
# printed when the agent's home or user is missing, and whenever neither
# strategy below yields a result. Shared by the single-agent `list` and the
# bulk `--all list` so both paths derive the list identically.
#
# Strategy:
#   1. `npx skills list --json` run as the agent user; accepted only when its
#      stdout is exactly ONE JSON document.
#   2. Otherwise a directory scan of the per-type skills dir in the agent's
#      home, producing [{name, path, scope:"project", agents:[<agent_id>]}].
_skill_list_json() {
  local name="$1"
  local user="agent-${name}" home="/home/agent-${name}"
  [[ -d "$home" ]] && id -u "$user" &>/dev/null || { echo "[]"; return; }

  local type agent_id install_dir
  type=$(agent_type "$name")
  agent_id="${SKILLS_AGENT_ID[$type]:-claude-code}"
  install_dir="${SKILLS_INSTALL_DIR[$type]:-.claude/skills}"

  # `npx skills list --json` prints clean JSON when available. If the
  # skills CLI isn't reachable (no network, npx cache cold) we fall back
  # to a directory scan so the dashboard always has a list to render.
  local out
  out=$(sudo -u "$user" -H bash -s 2>/dev/null <<'SKILL_LIST' || true
set -uo pipefail
unset CLAUDE_CONFIG_DIR
export NVM_DIR="/home/claude/.nvm"
# shellcheck disable=SC1091
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"
export PATH="/home/claude/.local/bin:$PATH"
cd "$HOME"
timeout 30 npx -y skills list --json 2>/dev/null
SKILL_LIST
)
  # Slurp so that zero documents, several documents, or any parse error all
  # collapse to "no usable answer" — callers feed the result to
  # `jq --argjson`, which needs exactly one JSON value.
  local list
  list=$(jq -cs 'if length == 1 then .[0] else empty end' <<<"$out" 2>/dev/null || true)
  if [[ -z "$list" ]]; then
    # Fallback: enumerate the per-type skills dir in the agent home. Marks
    # each entry with the agent_id we'd pass to `skills add` so callers can
    # tell which CLI a skill is bound to without re-deriving the type.
    # -H (as in the primary call above) pins $HOME to the agent's home instead
    # of leaving it to the local sudoers policy.
    list=$(sudo -u "$user" -H env INSTALL_DIR="$install_dir" AGENT_ID="$agent_id" bash -c '
      shopt -s nullglob
      out="[]"
      for d in "$HOME"/"$INSTALL_DIR"/*/; do
        n=$(basename "$d")
        out=$(jq -c --arg n "$n" --arg p "$d" --arg a "$AGENT_ID" \
          ". + [{name:\$n, path:\$p, scope:\"project\", agents:[\$a]}]" <<<"$out")
      done
      echo "$out"
    ' 2>/dev/null || echo "[]")
  fi
  printf '%s' "${list:-[]}"
}

# cmd_skill_list <name>
# Shows the skills installed for one agent, as derived by _skill_list_json.
#   JSON mode: {ok:true, data:{name, skills:[...]}}
#   text mode: one "<name>  <path>" row per skill (path shown as "-" when
#              absent), or a "no skills installed" line for an empty list.
# Fails with E_GENERIC when the agent's home directory or Linux user is gone.
cmd_skill_list() {
  local agent="$1"; shift
  local account="agent-${agent}"
  local home_dir="/home/${account}"

  if [[ ! -d "$home_dir" ]]; then
    fail "$E_GENERIC" "agent home missing: $home_dir"
  fi
  if ! id -u "$account" &>/dev/null; then
    fail "$E_GENERIC" "agent user missing: $account"
  fi

  local skills_json
  skills_json=$(_skill_list_json "$agent")

  if (( JSON_MODE )); then
    jq -cn --arg n "$agent" --argjson list "$skills_json" \
      '{ok:true, data:{name:$n, skills:$list}}'
    return
  fi

  # Text mode: tab-separated rows aligned by column(1).
  case "$skills_json" in
    ""|"[]")
      echo "no skills installed for '$agent'"
      ;;
    *)
      jq -r '.[] | [.name, (.path // "-")] | @tsv' <<<"$skills_json" \
        | column -t -s $'\t'
      ;;
  esac
}

# cmd_skill_list_all — installed skills for every agent in the registry,
# gathered one agent at a time. Exists so the dashboard can replace its
# per-agent exec fan-out (one concurrent sudo+npx per agent saturated
# swap-bound boxes → shelld timeout → 502) with a single call.
#   JSON mode: {ok:true, data:{agents:{<name>:[skills...]}}}
#   text mode: one "<name>  <count> skill(s)" row per agent
# Best-effort per agent: a failure yields an empty list (or leaves that agent
# out of the map) but never aborts the loop.
cmd_skill_list_all() {
  local registry agent_names
  registry=$(registry_read 2>/dev/null || echo '{}')
  agent_names=$(jq -r '.agents | keys[]' <<<"$registry" 2>/dev/null || true)

  # Grow the {name: [skills]} map one agent at a time; a merge that fails
  # simply keeps the map collected so far.
  local by_agent="{}" agent skills merged
  # Names arrive on fd 3 so the commands run per agent keep the caller's stdin.
  while IFS= read -r agent <&3; do
    [[ -n "$agent" ]] || continue
    skills=$(_skill_list_json "$agent")
    if merged=$(jq -c --arg n "$agent" --argjson l "${skills:-[]}" \
        '.[$n] = $l' <<<"$by_agent" 2>/dev/null); then
      by_agent="$merged"
    fi
  done 3<<<"$agent_names"

  if (( JSON_MODE )); then
    jq -cn --argjson agents "$by_agent" '{ok:true, data:{agents:$agents}}'
    return
  fi

  # Text mode: a single jq pass renders every row, column(1) aligns them.
  jq -r 'keys[] as $k | "\($k)\t\(.[$k] | length) skill(s)"' <<<"$by_agent" \
    | column -t -s $'\t'
}

# cmd_skill_rm <name> <skill_id> | --skill=<skill_id>
#
# Deletes one installed skill directory from the agent's per-type skills dir
# (SKILLS_INSTALL_DIR, relative to the agent's home), running as the agent
# user.
#
# Fails with:
#   E_USAGE       no skill id, or more than one positional
#   E_VALIDATION  skill id rejected by valid_skill_id
#   E_NOT_FOUND   the skill directory does not exist
#   E_GENERIC     agent home/user missing, or the removal itself failed
#                 (sudo error, permission problem, ...)
cmd_skill_rm() {
  local name="$1"; shift
  local skill=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skill=*) skill="${1#--skill=}" ;;
      *)
        if [[ -z "$skill" ]]; then
          skill="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi
        ;;
    esac
    shift
  done
  [[ -n "$skill" ]] || fail "$E_USAGE" "usage: 5dive agent skill $name rm <skill_id>"
  valid_skill_id "$skill" || fail "$E_VALIDATION" "invalid skill id: '$skill'"

  local user="agent-${name}" home="/home/agent-${name}"
  [[ -d "$home" ]] || fail "$E_GENERIC" "agent home missing: $home"
  id -u "$user" &>/dev/null || fail "$E_GENERIC" "agent user missing: $user"

  local type install_dir
  type=$(agent_type "$name")
  install_dir="${SKILLS_INSTALL_DIR[$type]:-.claude/skills}"

  step "Removing skill '$skill' from agent '$name'"
  # `npx skills remove` is interactive without a flag; fall straight to
  # rm -rf since the skill is just a directory under the per-type skills dir.
  #
  # The script exits 4 specifically for "nothing to remove"; every other
  # non-zero status is a genuine failure and must not be reported as
  # "not installed".
  local rc=0
  sudo -u "$user" -H env SKILL="$skill" INSTALL_DIR="$install_dir" bash -s >&2 <<'SKILL_REMOVE' || rc=$?
set -euo pipefail
unset CLAUDE_CONFIG_DIR
cd "$HOME"
target="$INSTALL_DIR/$SKILL"
if [ -e "$target" ] || [ -L "$target" ]; then
  rm -rf -- "$target"
  echo "Removed $target"
else
  echo "Skill not found: $target" >&2
  exit 4
fi
SKILL_REMOVE
  case "$rc" in
    0) ;;
    4) fail "$E_NOT_FOUND" "skill '$skill' not installed on agent '$name'" ;;
    *) fail "$E_GENERIC" "failed to remove skill '$skill' from agent '$name' (exit $rc)" ;;
  esac

  ok "skill '$skill' removed for agent '$name'." \
     '{name:$n, skill:$k, action:"rm"}' \
     --arg n "$name" --arg k "$skill"
}

# -------- 5dive init: interactive first-run wizard --------
# Orchestrates `agent install` + `agent auth` + `agent create` so a brand-new
# user can go from `curl | sudo bash` → working agent in one prompt-driven
# command. Everything it does is also reachable via the individual commands.

cmd_init() {
  # Fail fast before any prompts: every step the wizard drives (agent install /
  # create, channel wiring) is root-only — without this guard an unprivileged
  # user answers the whole questionnaire and dies mid-create instead.
  [[ $EUID -eq 0 ]] || fail "$E_PERMISSION" "init must run as root (sudo 5dive init)"
  if [[ ! -t 0 ]]; then
    fail "$E_USAGE" "5dive init is interactive — run it in a real terminal (not a pipe)"
  fi

  cat >&2 <<'WELCOME'

  ░█▀▀░█▀▄░▀█▀░█░█░█▀▀
  ░▀▀▄░█░█░░█░░█░█░█▀▀
  ░▀▀▀░█▀▀░▀▀▀░░▀░░▀▀▀     interactive setup

WELCOME

  # --- Step 1: pick a type ---
  local -a types=(claude codex antigravity grok hermes openclaw opencode)
  local -A type_desc=(
    [claude]="Anthropic's Claude — recommended"
    [codex]="OpenAI Codex"
    [antigravity]="Google Antigravity CLI"
    [grok]="xAI Grok CLI"
    [hermes]="Open-source agent — bring your own provider"
    [openclaw]="Open-source agent — bring your own provider"
    [opencode]="Open-source agent — bring your own provider"
  )
  echo "Pick an agent type:" >&2
  local i=1
  for t in "${types[@]}"; do
    printf "  %d) %-11s — %s\n" "$i" "$t" "${type_desc[$t]}" >&2
    i=$((i+1))
  done
  echo >&2
  local choice type
  while true; do
    read -r -p "  choice [1-${#types[@]}, default 1]: " choice
    choice="${choice:-1}"
    if [[ "$choice" =~ ^[1-7]$ ]] && (( choice <= ${#types[@]} )); then
      type="${types[$((choice-1))]}"
      break
    fi
    echo "  invalid choice" >&2
  done
  echo "  → $type" >&2
  echo >&2

  # --- Step 2: install binary if missing ---
  echo "Checking $type CLI…" >&2
  if ! cmd_install "$type" >&2; then
    fail "$E_NOT_INSTALLED" "failed to install $type — try '5dive agent install $type' manually"
  fi
  echo >&2

  # --- Step 3: auth ---
  local auth_ok=0
  # Probe current auth state — cmd_auth_status returns 0 if any creds exist.
  if 5dive agent auth status --probe --type="$type" --json 2>/dev/null | jq -e '.ok and (.data | any(.status == "ok"))' >/dev/null 2>&1; then
    echo "✓ $type already authenticated" >&2
    auth_ok=1
  fi

  if (( auth_ok == 0 )); then
    echo "Auth for $type:" >&2
    case "$type" in
      claude)
        echo "  1) OAuth (recommended) — opens an interactive login session" >&2
        echo "  2) API key paste" >&2
        local auth_choice
        read -r -p "  choice [1-2, default 1]: " auth_choice
        auth_choice="${auth_choice:-1}"
        if [[ "$auth_choice" == "1" ]]; then
          echo "  launching OAuth flow…" >&2
          5dive agent auth login claude || fail "$E_AUTH_REQUIRED" "auth failed"
        else
          local key
          read -r -s -p "  paste API key: " key; echo >&2
          [[ -n "$key" ]] || fail "$E_VALIDATION" "empty API key"
          printf '%s' "$key" | 5dive agent auth set claude --api-key=- || fail "$E_AUTH_REQUIRED" "auth failed"
        fi
        ;;
      codex|openclaw|antigravity|grok)
        echo "  launching interactive login for $type…" >&2
        5dive agent auth login "$type" || fail "$E_AUTH_REQUIRED" "auth failed"
        ;;
      hermes)
        local providers="openrouter anthropic openai google deepseek qwen nous minimax moonshot huggingface zai"
        echo "  hermes needs a provider + API key. Providers: $providers" >&2
        local provider
        read -r -p "  provider [default openrouter]: " provider
        provider="${provider:-openrouter}"
        local key
        read -r -s -p "  paste $provider API key: " key; echo >&2
        [[ -n "$key" ]] || fail "$E_VALIDATION" "empty API key"
        printf '%s' "$key" | 5dive agent auth set hermes --api-key=- --provider="$provider" \
          || fail "$E_AUTH_REQUIRED" "auth failed"
        ;;
      opencode)
        local key
        read -r -s -p "  paste OpenAI API key: " key; echo >&2
        [[ -n "$key" ]] || fail "$E_VALIDATION" "empty API key"
        printf '%s' "$key" | 5dive agent auth set opencode --api-key=- \
          || fail "$E_AUTH_REQUIRED" "auth failed"
        ;;
    esac
    echo >&2
  fi

  # --- Step 4: name ---
  local name
  read -r -p "Name your first agent [my-agent]: " name
  name="${name:-my-agent}"
  echo >&2

  # --- Step 5: pick a channel (mirrors the dashboard connect-flow) ---
  # Only claude/hermes/openclaw expose telegram via `agent create --channels=`.
  # Other types fall straight through to create with channels=none.
  local channels="none"
  local telegram_token=""
  local telegram_user_id=""
  local supports_telegram=0
  case "$type" in
    claude|hermes|openclaw) supports_telegram=1 ;;
  esac

  if (( supports_telegram == 1 )); then
    echo "Add a chat channel? (lets you message your agent from your phone)" >&2
    echo "  1) Skip — talk to your agent via 5dive CLI / TUI" >&2
    echo "  2) Telegram" >&2
    local ch_choice
    read -r -p "  choice [1-2, default 1]: " ch_choice
    ch_choice="${ch_choice:-1}"
    case "$ch_choice" in
      2) channels="telegram" ;;
      1) channels="none" ;;
      *) echo "  invalid choice — skipping" >&2; channels="none" ;;
    esac
    echo >&2
  fi

  local username=""
  if [[ "$channels" == "telegram" ]]; then
    echo "Get a bot token from BotFather: https://t.me/BotFather (send /newbot)" >&2
    read -r -s -p "  paste bot token: " telegram_token; echo >&2
    [[ -n "$telegram_token" ]] || fail "$E_VALIDATION" "empty bot token"
    echo >&2

    # Resolve bot @username up front (degrade silently on any failure —
    # discover still works, we just lose the tap-to-open hint).
    local getme_json
    getme_json=$(5dive agent telegram-getme --token="$telegram_token" --json 2>/dev/null || true)
    if jq -e '.ok and .data.username' <<<"$getme_json" >/dev/null 2>&1; then
      username=$(jq -r '.data.username' <<<"$getme_json")
      echo "Open Telegram → @$username → send /start. Waiting up to ~2 min…" >&2
    else
      echo "Open Telegram → your bot → send /start. Waiting up to ~2 min…" >&2
    fi

    # Long-poll for the first inbound DM so we can auto-allowlist the user
    # without a manual pair-code paste — same ~2-min budget as the
    # dashboard's discover loop. On miss, fall back to manual pair-code.
    local attempt discover_json
    for attempt in 1 2; do
      discover_json=$(5dive agent telegram-discover --token="$telegram_token" --poll-secs=60 --json 2>/dev/null || true)
      if jq -e '.ok and .data.found' <<<"$discover_json" >/dev/null 2>&1; then
        telegram_user_id=$(jq -r '.data.userId' <<<"$discover_json")
        local who
        who=$(jq -r '.data.username // .data.firstName // empty' <<<"$discover_json")
        echo "  ✓ detected${who:+ ($who)} → id $telegram_user_id" >&2
        break
      fi
      (( attempt == 1 )) && echo "  still waiting…" >&2
    done

    if [[ -z "$telegram_user_id" ]]; then
      echo "  → no DM yet. We'll create the agent now; pair after with:" >&2
      echo "       5dive agent pair $name --code=<code-from-bot>" >&2
    fi
    echo >&2
  fi

  # --- Step 6: create ---
  local -a create_args=("$name" "--type=$type")
  if [[ "$channels" != "none" ]]; then
    create_args+=("--channels=$channels")
    [[ -n "$telegram_token" ]] && create_args+=("--telegram-token=$telegram_token")
    [[ -n "$telegram_user_id" ]] && create_args+=("--telegram-allowed-users=$telegram_user_id")
  fi
  echo "Creating agent '$name'…" >&2
  if ! 5dive agent create "${create_args[@]}" >&2; then
    fail "$E_GENERIC" "failed to create agent — see logs above"
  fi
  echo >&2

  # --- Step 7: auto-pair welcome DM (claude+telegram with auto-detected id) ---
  # openclaw/hermes wire the allowlist inside `agent create` itself, so the
  # extra pair call only applies to claude — same gate the dashboard uses.
  if [[ "$type" == "claude" && "$channels" == "telegram" && -n "$telegram_user_id" ]]; then
    5dive agent pair "$name" --user-id="$telegram_user_id" >&2 || true
    echo >&2
  fi

  # --- Step 8: next steps ---
  cat >&2 <<NEXT
✓ agent '$name' is ready.

Try it out:
  5dive agent send '$name' 'hello, who are you?'
  5dive agent ask  '$name' 'what model are you?' --timeout=60
  5dive agent $name tui                # attach a terminal

NEXT

  if [[ "$channels" == "telegram" && -n "$telegram_user_id" ]]; then
    cat >&2 <<TG
From your phone:
  open Telegram → ${username:+@$username → }DM your bot directly

TG
  elif [[ "$channels" == "telegram" ]]; then
    cat >&2 <<TG
Finish Telegram pairing:
  5dive agent pair $name --code=<code-from-bot>
  (open Telegram, DM your bot — it replies with a pair code)

TG
  fi

  # Heads-up for Teams-org accounts: remote managed-settings can silently
  # override the local channel allowlist (Console-controlled). The check
  # runs as part of `5dive doctor --category=channels` after the agent boots.
  if [[ "$channels" == "telegram" ]]; then
    cat >&2 <<TEAMS
Anthropic Teams accounts:
  if your bot stays silent on incoming DMs, your org admin may need to
  allowlist this plugin in the Anthropic Console. Diagnose with:
    sudo 5dive doctor --category=channels
  Setup snippet: https://github.com/5dive-com/5dive-plugins#anthropic-teams-accounts

TEAMS
  fi

  cat >&2 <<MANAGE
Manage:
  5dive agent list
  5dive agent stats $name
  5dive doctor

MANAGE
}

# -------- doctor (health check + optional auto-repair) --------
#
# Mental model: the dashboard invokes `5dive doctor --json` periodically, and
# users hit `5dive doctor --repair` from a "fix problems" button. Each check
# reports:
#   - severity: ok | warn | error
#   - fixable:  does this check know how to repair itself?
#   - repaired: did --repair actually fix it this run?
# The envelope is always {ok:true, data:{summary,checks}} (exit 0) so the
# dashboard can render partial results even when individual checks fail.
# Use data.summary.errors to branch in CI.

# Script-scope doctor state, reset at the top of every cmd_doctor run so the
# check helpers below can read/write it without threading it through args.
#   DOCTOR_REPAIR — 1 while --repair is in effect, 0 otherwise.
#   DOCTOR_CHECKS — JSON array of check results appended to by doctor_add.
DOCTOR_REPAIR=0
DOCTOR_CHECKS="[]"

# doctor_add <category> <name> <severity> <message> [fixable:true|false] [repaired:true|false]
# Appends one check result to the DOCTOR_CHECKS JSON array. fixable and
# repaired default to false and are handed to jq as raw JSON (--argjson), so
# they must be valid JSON literals. Any severity other than "ok" is also
# echoed through `step` as a progress line. Always returns 0.
doctor_add() {
  local cat="$1" check="$2" sev="$3" msg="$4"
  local can_fix="${5:-false}" was_fixed="${6:-false}"

  DOCTOR_CHECKS=$(
    jq -c \
      --arg c "$cat" --arg n "$check" --arg s "$sev" --arg m "$msg" \
      --argjson f "$can_fix" --argjson r "$was_fixed" \
      '. + [{category:$c, name:$n, severity:$s, message:$m, fixable:$f, repaired:$r}]' \
      <<<"$DOCTOR_CHECKS"
  )

  if [[ "$sev" != "ok" ]]; then
    step "[$sev] $cat/$check: $msg"
  fi
  return 0
}

# doctor_check_cmd <name> <executable> [apt-repair-package]
# Records a deps/<name> check for an executable looked up on the caller's own
# PATH (root's). Per-user PATH questions such as "is bun on user claude's
# PATH" need a sudo hop and are handled inline in cmd_doctor instead.
# When the executable is missing, an apt package was supplied and
# DOCTOR_REPAIR is on, the package is installed via apt-get and the lookup is
# retried. Returns 0 if the executable is (now) available, 1 otherwise.
doctor_check_cmd() {
  local label="$1" binary="$2" apt_pkg="${3:-}"

  if command -v "$binary" >/dev/null 2>&1; then
    doctor_add deps "$label" ok "$binary found at $(command -v "$binary")"
    return 0
  fi

  # A missing command is only "fixable" when we know which package ships it.
  local can_fix=false
  if [[ -n "$apt_pkg" ]]; then
    can_fix=true
  fi

  # Report-only path: repair not requested, or nothing we could install.
  if ! (( DOCTOR_REPAIR )) || [[ -z "$apt_pkg" ]]; then
    doctor_add deps "$label" error "$binary not found on PATH" "$can_fix" false
    return 1
  fi

  step "Installing $apt_pkg (apt-get)"
  if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "$apt_pkg" >&2 \
     && command -v "$binary" >/dev/null 2>&1; then
    doctor_add deps "$label" ok "$binary installed via apt ($apt_pkg)" true true
    return 0
  fi

  doctor_add deps "$label" error "$binary missing; apt install $apt_pkg failed" "$can_fix" false
  return 1
}

# cmd_doctor [--repair] [--category=deps|types|auth|registry|shelld|channels]
# Runs every health check (or only the chosen category), accumulating results
# through doctor_add, then prints summary + checks: a {ok:true,data:...}
# envelope when JSON_MODE is set, grouped text otherwise. With --repair,
# checks that know how to fix themselves attempt the fix in place. Calls
# require_root first. Always returns 0 — callers branch on summary.errors.
#
# Probes whose "nothing found" outcome is a non-zero exit (grep without a
# match, pgrep without a process, jq on a corrupt file) are guarded with
# `|| true`: this script runs under `set -euo pipefail`, so an unguarded
# probe would abort the whole report instead of recording a finding.
cmd_doctor() {
  require_root
  local filter=""
  DOCTOR_REPAIR=0
  DOCTOR_CHECKS='[]'
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --repair)     DOCTOR_REPAIR=1 ;;
      --category=*) filter="${1#--category=}" ;;
      -*)           fail "$E_USAGE" "unknown flag: $1" ;;
      *)            fail "$E_USAGE" "extra arg: $1" ;;
    esac
    shift
  done
  case "$filter" in
    ""|deps|types|auth|registry|shelld|channels) ;;
    *) fail "$E_USAGE" "unknown --category (deps|types|auth|registry|shelld|channels)" ;;
  esac

  # An empty filter enables every category; otherwise only the named one.
  local run_deps=0 run_types=0 run_auth=0 run_registry=0 run_shelld=0 run_channels=0
  [[ -z "$filter" || "$filter" == "deps"     ]] && run_deps=1
  [[ -z "$filter" || "$filter" == "types"    ]] && run_types=1
  [[ -z "$filter" || "$filter" == "auth"     ]] && run_auth=1
  [[ -z "$filter" || "$filter" == "registry" ]] && run_registry=1
  [[ -z "$filter" || "$filter" == "shelld"   ]] && run_shelld=1
  [[ -z "$filter" || "$filter" == "channels" ]] && run_channels=1

  # --- deps ---
  if (( run_deps )); then
    # /dev/null must be the character device. An agent with sudo (admin
    # isolation) can clobber it — e.g. `tmux -S /dev/null` unlinks it — which
    # crash-loops EVERY agent on the box (teal-fox 2026-06-03). Checked first
    # so --repair fixes it before other checks that redirect to /dev/null run.
    # 5dive-agent-start also self-heals this on each start.
    if [[ -c /dev/null ]]; then
      doctor_add deps devnull ok "/dev/null is a character device"
    elif (( DOCTOR_REPAIR )); then
      step "Recreating /dev/null device node"
      if sudo sh -c 'rm -f /dev/null && mknod /dev/null c 1 3 && chmod 666 /dev/null && chown root:root /dev/null' \
         && [[ -c /dev/null ]]; then
        doctor_add deps devnull ok "/dev/null recreated as character device" true true
      else
        doctor_add deps devnull error "/dev/null not a char device and repair failed (run: sudo mknod /dev/null c 1 3 && sudo chmod 666 /dev/null)" true false
      fi
    else
      doctor_add deps devnull error "/dev/null is not a character device — every agent crash-loops (fix: sudo 5dive doctor --repair)" true false
    fi

    doctor_check_cmd tmux      tmux      tmux
    doctor_check_cmd jq        jq        jq
    doctor_check_cmd python3   python3   python3
    doctor_check_cmd curl      curl      curl
    doctor_check_cmd sqlite3   sqlite3   sqlite3
    doctor_check_cmd sudo      sudo
    doctor_check_cmd systemctl systemctl
    doctor_check_cmd journalctl journalctl

    # bun is needed by the telegram plugin runtime. Checked via the agent
    # user's login shell (which sources /etc/profile.d/5dive-shared-configs.sh
    # + nvm), i.e. the same environment systemd ends up with. Falls back to
    # checking user `claude` if no agents exist yet.
    local bun_user="claude"
    if [[ -f "$REGISTRY" ]]; then
      local first_agent
      # `|| true`: a corrupt registry must not abort here — the registry
      # category below is what reports it.
      first_agent=$(jq -r '.agents | keys[0] // empty' "$REGISTRY" 2>/dev/null || true)
      [[ -n "$first_agent" ]] && id -u "agent-${first_agent}" &>/dev/null \
        && bun_user="agent-${first_agent}"
    fi
    local bun_path
    bun_path=$(sudo -u "$bun_user" -i bash -lc 'command -v bun' 2>/dev/null || true)
    if [[ -n "$bun_path" ]]; then
      doctor_add deps bun ok "bun at $bun_path (checked as $bun_user)"
    elif (( DOCTOR_REPAIR )); then
      step "Installing bun for user claude"
      if sudo -u claude -i bash -lc 'curl -fsSL https://bun.sh/install | bash' >&2 \
         && sudo -u "$bun_user" -i bash -lc 'command -v bun' >/dev/null 2>&1; then
        doctor_add deps bun ok "bun installed for user claude" true true
      else
        doctor_add deps bun error "bun install failed (telegram plugin won't start)" true false
      fi
    else
      doctor_add deps bun error "bun not on PATH for $bun_user (telegram plugin requires it)" true false
    fi

    # nvm + node + npm (node-based CLIs like codex depend on these)
    if [[ -s /home/claude/.nvm/nvm.sh ]]; then
      doctor_add deps nvm ok "/home/claude/.nvm/nvm.sh present"
    else
      doctor_add deps nvm error "/home/claude/.nvm/nvm.sh missing (codex won't run)" false false
    fi
    local node_ver npm_ver
    node_ver=$(sudo -u claude -i bash -lc 'node --version' 2>/dev/null || true)
    npm_ver=$(sudo -u claude -i bash -lc 'npm --version' 2>/dev/null || true)
    # `a && b || c` is safe here only because doctor_add always returns 0.
    [[ -n "$node_ver" ]] \
      && doctor_add deps node ok "node $node_ver (via nvm)" \
      || doctor_add deps node error "node not available for user claude" false false
    [[ -n "$npm_ver" ]] \
      && doctor_add deps npm  ok "npm $npm_ver (via nvm)" \
      || doctor_add deps npm  error "npm not available for user claude" false false

    # 5dive shared helpers that every agent create/start depends on.
    for f in /usr/local/bin/5dive-agent-start; do
      if [[ -x "$f" ]]; then
        doctor_add deps "$(basename "$f")" ok "$f present"
      else
        doctor_add deps "$(basename "$f")" error "$f missing or not executable (rerun install.sh)" false false
      fi
    done
    # The StopFailure (rate-limit DM) and PreToolUse (AskUserQuestion/ExitPlanMode)
    # hooks used to be standalone scripts under /usr/local/lib/5dive checked here
    # via $STOP_FAILURE_HOOK / $PRETOOL_TELEGRAM_HOOK. They now ship bundled inside
    # the telegram plugin (per-agent, no fixed path), so those vars were removed —
    # the stale checks were left referencing them and crashed `doctor --json` with
    # an unbound-variable error under `set -u`. Dropped; nothing standalone to probe.
    local resume_helper="/usr/local/lib/5dive/resume-after-reset.sh"
    if [[ -x "$resume_helper" ]]; then
      doctor_add deps resume-after-reset ok "$resume_helper present"
    else
      doctor_add deps resume-after-reset warn "$resume_helper missing — agents won't auto-resume when usage limit resets" false false
    fi
  fi

  # --- type binaries ---
  if (( run_types )); then
    local type
    for type in "${!TYPE_BIN[@]}"; do
      local bin="${TYPE_BIN[$type]}"
      local recipe="${TYPE_INSTALL[$type]:-}"
      if [[ -x "$bin" ]]; then
        doctor_add types "$type" ok "$bin installed"
        continue
      fi
      if (( DOCTOR_REPAIR )) && [[ -n "$recipe" ]]; then
        step "Installing $type CLI"
        if sudo -u claude -i bash -lc "$recipe" >&2 && [[ -x "$bin" ]]; then
          doctor_add types "$type" ok "$type installed at $bin" true true
        else
          doctor_add types "$type" error "$type install recipe failed" true false
        fi
      elif [[ -n "$recipe" ]]; then
        doctor_add types "$type" warn "$bin missing (run with --repair to auto-install)" true false
      else
        doctor_add types "$type" warn "$bin missing (no automated installer for $type)" false false
      fi
    done
  fi

  # --- auth (live probe for installed types) ---
  if (( run_auth )); then
    local type status
    for type in "${!TYPE_BIN[@]}"; do
      [[ -x "${TYPE_BIN[$type]}" ]] || continue
      status=$(auth_status_one "$type")
      case "$status" in
        ok)
          doctor_add auth "$type" ok "live probe succeeded" ;;
        needs_login)
          doctor_add auth "$type" error "no credentials on file — run: sudo 5dive agent auth login $type" false false ;;
        stale)
          doctor_add auth "$type" error "credentials rejected by provider — re-auth required" false false ;;
        not_installed)
          : ;;  # already flagged by types/
        *)
          doctor_add auth "$type" warn "status=$status" false false ;;
      esac
    done
  fi

  # --- registry + per-agent state ---
  if (( run_registry )); then
    if [[ ! -f "$REGISTRY" ]]; then
      if (( DOCTOR_REPAIR )); then
        ensure_state
        doctor_add registry file ok "initialized empty $REGISTRY" true true
      else
        doctor_add registry file error "$REGISTRY missing (run with --repair to init)" true false
      fi
    elif ! jq -e '.agents | type == "object"' "$REGISTRY" >/dev/null 2>&1; then
      doctor_add registry file error "$REGISTRY unparseable or missing .agents object (manual fix required)" false false
    else
      doctor_add registry file ok "$REGISTRY intact"
      local schema_v
      schema_v=$(jq -r '.schemaVersion // 0' "$REGISTRY" 2>/dev/null || echo 0)
      if (( schema_v == REGISTRY_SCHEMA_VERSION )); then
        doctor_add registry schema ok "schemaVersion=$schema_v (current)"
      elif (( schema_v < REGISTRY_SCHEMA_VERSION )); then
        if (( DOCTOR_REPAIR )); then
          ensure_state   # stamps the current version in place
          doctor_add registry schema ok "migrated schemaVersion $schema_v -> $REGISTRY_SCHEMA_VERSION" true true
        else
          doctor_add registry schema warn "schemaVersion=$schema_v (expected $REGISTRY_SCHEMA_VERSION) — run with --repair" true false
        fi
      else
        doctor_add registry schema error "schemaVersion=$schema_v is newer than this CLI ($REGISTRY_SCHEMA_VERSION) — upgrade 5dive" false false
      fi
      local reg
      reg=$(registry_read)
      local name
      for name in $(jq -r '.agents | keys[]' <<<"$reg" 2>/dev/null); do
        local type env_file user
        type=$(jq -r --arg n "$name" '.agents[$n].type // empty' <<<"$reg")
        env_file="${ENV_DIR}/${name}.env"
        user="agent-${name}"
        if ! is_known_type "$type"; then
          doctor_add registry "agent:$name" error "unknown type '$type' in registry" false false
          continue
        fi
        if ! id -u "$user" &>/dev/null; then
          doctor_add registry "agent:$name" error "user $user missing (orphan registry entry — rm manually)" false false
          continue
        fi
        if [[ ! -f "$env_file" ]]; then
          if (( DOCTOR_REPAIR )); then
            # Rebuild the env file from what the registry still knows.
            local channels workdir profile
            channels=$(jq -r --arg n "$name" '.agents[$n].channels // "none"'    <<<"$reg")
            workdir=$(jq  -r --arg n "$name" '.agents[$n].workdir // empty'      <<<"$reg")
            profile=$(jq  -r --arg n "$name" '.agents[$n].authProfile // empty'  <<<"$reg")
            write_agent_env "$name" "$type" "$channels" "$workdir" "$profile"
            link_agent_profile "$name" "$profile"
            doctor_add registry "agent:$name" ok "recreated $env_file" true true
          else
            doctor_add registry "agent:$name" error "$env_file missing (run with --repair)" true false
          fi
        else
          doctor_add registry "agent:$name" ok "entry + user + env file all present"
        fi
      done
    fi
  fi

  # --- channels: managed-settings allowlist + per-agent registration health ---
  #
  # Two failure modes we surface here:
  #   1. /etc/claude-code/managed-settings.json missing or missing the
  #      telegram@5dive-plugins entry — the local self-hosted case. Install.sh
  #      writes this on first install; flag if it's been hand-edited away.
  #   2. Claude logs "Channel notifications skipped: plugin telegram@5dive-plugins
  #      is not on the approved channels allowlist" — strong signal the agent
  #      is on a Teams org whose admin hasn't allowlisted us via remote
  #      managed-settings (remote overrides local). Linked from README.
  if (( run_channels )); then
    local ms=/etc/claude-code/managed-settings.json
    if [[ ! -f "$ms" ]]; then
      doctor_add channels managed-settings warn \
        "$ms missing — rerun install.sh, or expect channel-skipped errors" false false
    elif ! jq -e '.channelsEnabled == true' "$ms" >/dev/null 2>&1; then
      doctor_add channels managed-settings error \
        "$ms missing channelsEnabled:true (Claude Code 2.1.150+ requires this; allowlist is otherwise inert)" false false
    elif ! jq -e '.allowedChannelPlugins | any(.plugin == "telegram" and .marketplace == "5dive-plugins")' "$ms" >/dev/null 2>&1; then
      doctor_add channels managed-settings warn \
        "$ms doesn't list telegram@5dive-plugins — local channel allowlist won't permit the fork" false false
    else
      doctor_add channels managed-settings ok "$ms has channelsEnabled + telegram@5dive-plugins allowlisted"
    fi

    # Per-agent: read the MOST RECENT MCP log for the telegram plugin and
    # check whether the last channel-registration event was "registered" or
    # "skipped". The log path is per-user, per-cwd (slashes → dashes):
    #   ~/.cache/claude-cli-nodejs/<cwd-dashed>/mcp-logs-plugin-telegram-*/*.jsonl
    # We glob the plugin dir to stay tolerant of marketplace name changes.
    # "Skipped" almost always means a Teams-org remote managed-settings is
    # overriding the local allowlist — admin action required; we link docs.
    if [[ -f "$REGISTRY" ]]; then
      local reg name channels
      reg=$(registry_read 2>/dev/null || echo '{"agents":{}}')
      for name in $(jq -r '.agents | keys[]' <<<"$reg" 2>/dev/null); do
        channels=$(jq -r --arg n "$name" '.agents[$n].channels // ""' <<<"$reg")
        [[ "$channels" == *telegram* ]] || continue
        local user="agent-${name}"
        id -u "$user" &>/dev/null || continue
        # Latest jsonl across any telegram-plugin mcp-logs dir for this user.
        # `|| true`: a failing sudo hop is reported as "no logs", not a crash.
        local latest
        latest=$(sudo -u "$user" bash -lc \
          'ls -1t "$HOME"/.cache/claude-cli-nodejs/*/mcp-logs-plugin-telegram-*/*.jsonl 2>/dev/null | head -1' \
          2>/dev/null || true)
        if [[ -z "$latest" ]]; then
          doctor_add channels "agent:$name" warn \
            "no telegram MCP logs found for $user (agent never started? channel not actually attached?)" false false
          continue
        fi
        # Look at the LAST occurrence of either event — agents may have
        # registered earlier then been told to skip, or vice versa.
        # `|| true`: grep exits 1 when nothing matches, which under pipefail
        # + errexit would kill doctor before the "no event found" branch.
        local last_event
        last_event=$(sudo -u "$user" grep -E 'Channel notifications (registered|skipped|.*not on the approved channels allowlist)' "$latest" 2>/dev/null | tail -1 || true)
        if [[ "$last_event" == *"not on the approved channels allowlist"* ]]; then
          doctor_add channels "agent:$name" error \
            "claude logged 'Channel notifications skipped' — likely on an Anthropic Teams org. Org admin must allowlist telegram@5dive-plugins via console. See: https://github.com/5dive-com/5dive-plugins#anthropic-teams-accounts" \
            false false
        elif [[ "$last_event" == *"registered"* ]]; then
          doctor_add channels "agent:$name" ok "channel registered (latest MCP log: $(basename "$latest"))"
        else
          doctor_add channels "agent:$name" warn \
            "no channel-registration event found in latest MCP log $(basename "$latest") — restart the agent to refresh" false false
        fi

        # Plugin-version drift: Claude loads plugins once at launch, so an
        # agent that's been running since before the last `plugin update` is
        # still executing the OLD telegram plugin in memory (and its old hooks)
        # even though the on-disk cache is newer. This is the recurring
        # "/account mis-gated, /status missing the 5dive line, stale stop-hook"
        # class of bug. We detect it WITHOUT introspecting process memory: if
        # installed_plugins.json was modified AFTER the agent's claude process
        # started, the running code predates the update. --repair restarts the
        # agent (deferred) to load the fresh version.
        local manifest_f="/home/${user}/.claude/plugins/installed_plugins.json"
        if [[ -f "$manifest_f" ]]; then
          local ondisk_ver plug_mtime cpid
          # `|| true`: an unparseable manifest just skips the drift check.
          ondisk_ver=$(jq -r '.plugins["telegram@5dive-plugins"][0].version // empty' "$manifest_f" 2>/dev/null || true)
          plug_mtime=$(stat -c %Y "$manifest_f" 2>/dev/null || echo 0)
          # Oldest (longest-running) claude process for this user = the
          # persistent session, not a transient hook subprocess. Pick max
          # elapsed-time among matches.
          # `|| true`: pgrep exits 1 when the agent has no claude process
          # (e.g. it is stopped); that leaves cpid empty and the check skipped.
          cpid=$(pgrep -u "$user" -f 'claude' 2>/dev/null \
                 | while read -r p; do echo "$(ps -o etimes= -p "$p" 2>/dev/null | tr -d ' ') $p"; done \
                 | sort -rn | awk 'NR==1{print $2}' || true)
          if [[ -n "$cpid" && -n "$ondisk_ver" ]]; then
            local etimes start_epoch now_epoch
            # `|| true`: the process may exit between pgrep and this ps; the
            # numeric guard below then skips the comparison.
            etimes=$(ps -o etimes= -p "$cpid" 2>/dev/null | tr -d ' ' || true)
            now_epoch=$(date +%s)
            if [[ "$etimes" =~ ^[0-9]+$ ]]; then
              start_epoch=$((now_epoch - etimes))
              if [[ "$plug_mtime" -gt "$start_epoch" ]]; then
                if (( DOCTOR_REPAIR )); then
                  if systemd-run --on-active=1 --collect \
                       /bin/systemctl restart "5dive-agent@${name}.service" >/dev/null 2>&1; then
                    doctor_add channels "agent:$name plugin-version" warn \
                      "was running a stale telegram plugin (on-disk $ondisk_ver, loaded before last update) — restart scheduled to load it" true true
                  else
                    doctor_add channels "agent:$name plugin-version" warn \
                      "running a stale telegram plugin (on-disk $ondisk_ver) — auto-restart failed; run: systemctl restart 5dive-agent@${name}.service" true false
                  fi
                else
                  doctor_add channels "agent:$name plugin-version" warn \
                    "running a stale telegram plugin — on-disk is $ondisk_ver but the agent loaded an older build at launch. Restart to apply: systemctl restart 5dive-agent@${name}.service (or 5dive doctor --repair)" true false
                fi
              else
                doctor_add channels "agent:$name plugin-version" ok "telegram plugin $ondisk_ver loaded (running matches on-disk)"
              fi
            fi
          fi
        fi
      done
    fi
  fi

  # --- shelld reachability (managed platform only) ---
  if (( run_shelld )); then
    if [[ ! -f /etc/5dive/provisioning.env ]]; then
      doctor_add shelld service ok "self-hosted install — shelld only runs on the managed platform"
    else
      local shelld_active
      shelld_active=$(systemctl is-active shelld 2>/dev/null || true)
      if [[ "$shelld_active" == "active" ]]; then
        doctor_add shelld service ok "shelld.service active"
      elif (( DOCTOR_REPAIR )); then
        step "Restarting shelld"
        if systemctl restart shelld >&2 \
           && [[ "$(systemctl is-active shelld 2>/dev/null)" == "active" ]]; then
          doctor_add shelld service ok "shelld restarted" true true
        else
          doctor_add shelld service error "shelld restart failed (check: journalctl -u shelld)" true false
        fi
      else
        doctor_add shelld service error "shelld.service not active (state=$shelld_active)" true false
      fi

      # curl's -w already prints a code ("000" when no HTTP response arrived,
      # the real status under -f) even when it exits non-zero, so an
      # `|| echo 000` fallback would glue a second code onto the first
      # ("000000", "500000"). Default only when curl printed nothing at all.
      local health_code
      health_code=$(curl -fsS -o /dev/null -w '%{http_code}' --max-time 3 \
        http://127.0.0.1:3101/shell/health 2>/dev/null || true)
      [[ -n "$health_code" ]] || health_code="000"
      if [[ "$health_code" == "200" ]]; then
        doctor_add shelld health ok "http://127.0.0.1:3101/shell/health -> 200"
      else
        doctor_add shelld health error "shelld health endpoint returned $health_code (expected 200)" false false
      fi
    fi
  fi

  # --- summary + output ---
  local summary
  summary=$(jq -c '{
    total:    length,
    passed:   [.[] | select(.severity == "ok")]    | length,
    warnings: [.[] | select(.severity == "warn")]  | length,
    errors:   [.[] | select(.severity == "error")] | length,
    repaired: [.[] | select(.repaired == true)]    | length
  }' <<<"$DOCTOR_CHECKS")

  local payload
  payload=$(jq -cn --argjson checks "$DOCTOR_CHECKS" --argjson summary "$summary" \
    '{summary: $summary, checks: $checks}')

  if (( JSON_MODE )); then
    jq -c '{ok:true, data: .}' <<<"$payload"
  else
    # Text mode: one "── category ──" section per category, then a summary line.
    jq -r '
      .checks | group_by(.category) | .[] as $g |
      "── \($g[0].category) ──",
      ($g[] | "  [\(.severity)] \(.name): \(.message)\(if .repaired then " (repaired)" else "" end)"),
      ""
    ' <<<"$payload"
    jq -r '.summary |
      "summary: \(.total) checks, \(.passed) ok, \(.warnings) warn, \(.errors) error" +
      (if .repaired > 0 then ", \(.repaired) repaired" else "" end)
    ' <<<"$payload"
  fi
  # Always exit 0 — the envelope carries the real state via summary.errors.
  # Matches `auth status` (also informational). CI branches on the payload.
  return 0
}

# -------- watch (live multi-agent dashboard) --------
#
# htop-style live view of every registered agent. Refreshes every <interval>
# seconds (default 2s) inside the alt-screen so the user's scrollback is
# preserved on quit. Pure bash + ANSI — no curses, no extra deps beyond
# what the rest of the CLI already requires (bash, jq, systemctl, tput).
#
# Keys:
#   q / Ctrl-C    quit
#   r             refresh now
#   ↑ ↓ / k j     move selection
#   ↵             attach to selected agent (sudo -u agent-<name> tmux attach).
#                 Control returns to watch when the user detaches (Ctrl-b d).

# ANSI escape sequences used by the watch renderer.
# Screen / cursor control:
WATCH_ALT_ON=$'\e[?1049h'    # switch to the alternate screen
WATCH_ALT_OFF=$'\e[?1049l'   # switch back to the normal screen
WATCH_HIDE=$'\e[?25l'        # hide cursor
WATCH_SHOW=$'\e[?25h'        # show cursor
WATCH_HOME=$'\e[H'           # cursor to top-left
WATCH_CLR_DOWN=$'\e[J'       # erase from cursor to end of screen
WATCH_CLR_EOL=$'\e[K'        # erase from cursor to end of line
# Text attributes:
WATCH_RESET=$'\e[0m'
WATCH_BOLD=$'\e[1m'
WATCH_DIM=$'\e[2m'
WATCH_REV=$'\e[7m'
# Foreground colors:
WATCH_RED=$'\e[31m'
WATCH_GREEN=$'\e[32m'
WATCH_YELLOW=$'\e[33m'
WATCH_CYAN=$'\e[36m'
WATCH_GREY=$'\e[90m'

# _watch_dot <active-state>
# Prints the colored status glyph for the leading column: a filled dot in
# green (active), yellow (activating/deactivating/reloading) or red (failed),
# and a hollow grey dot for every other state. Mirrors cmd_list's category set.
_watch_dot() {
  local tint glyph='●'
  case "$1" in
    active)                            tint="$WATCH_GREEN" ;;
    activating|deactivating|reloading) tint="$WATCH_YELLOW" ;;
    failed)                            tint="$WATCH_RED" ;;
    *)                                 tint="$WATCH_GREY"; glyph='○' ;;
  esac
  printf '%s%s%s' "$tint" "$glyph" "$WATCH_RESET"
}

# _watch_uptime <seconds>
# Formats a duration using its two most significant units:
# "23s" / "12m 04s" / "5h 23m" / "1d 17h". Prints "-" when the argument is
# missing, not a plain non-negative integer, or zero.
_watch_uptime() {
  local secs="${1:-0}"
  if ! [[ "$secs" =~ ^[0-9]+$ ]] || ! (( secs > 0 )); then
    printf -- '-'
    return
  fi

  if (( secs >= 86400 )); then
    printf '%dd %02dh' $(( secs / 86400 )) $(( (secs % 86400) / 3600 ))
  elif (( secs >= 3600 )); then
    printf '%dh %02dm' $(( secs / 3600 )) $(( (secs % 3600) / 60 ))
  elif (( secs >= 60 )); then
    printf '%dm %02ds' $(( secs / 60 )) $(( secs % 60 ))
  else
    printf '%ds' "$secs"
  fi
}

# _watch_mem <bytes>
# Formats a byte count as "N B" / "N KiB" / "N MiB" (integer division) or
# "N.N GiB" (one decimal, via awk). Prints "-" for anything that is not a
# plain integer (e.g. "[not set]") and for the uint64 max sentinel
# 18446744073709551615.
_watch_mem() {
  local bytes="${1:-}"
  if ! [[ "$bytes" =~ ^[0-9]+$ ]]; then
    printf -- '-'
    return
  fi
  if [[ "$bytes" == "18446744073709551615" ]]; then
    printf -- '-'
    return
  fi

  if (( bytes < 1024 )); then
    printf '%d B' "$bytes"
  elif (( bytes < 1048576 )); then
    printf '%d KiB' $(( bytes / 1024 ))
  elif (( bytes < 1073741824 )); then
    printf '%d MiB' $(( bytes / 1048576 ))
  else
    # Fractional output needs floating point, which bash arithmetic lacks.
    awk -v n="$bytes" 'BEGIN{printf "%.1f GiB", n/1073741824}'
  fi
}

# _watch_snapshot
# Prints a compact JSON array with one object per registered agent:
#   {name, type, channels, botUsername, active, sub, restarts, uptime, mem}
# name/type/channels/botUsername come from the registry; the rest from a
# single `systemctl show` per agent. uptime is seconds since
# ActiveEnterTimestamp and stays 0 unless the unit is active and the
# timestamp parses. One systemctl call per agent is fine for the typical
# 1-10 agents; if it ever bottlenecks, batch the properties in one helper.
_watch_snapshot() {
  local registry_json epoch_now agent unit shown
  registry_json=$(registry_read)
  epoch_now=$(date +%s)

  # Newline-delimited JSON objects, slurped into one array at the end.
  local ndjson=""
  for agent in $(jq -r '.agents | keys[]' <<<"$registry_json" 2>/dev/null); do
    unit="5dive-agent@${agent}.service"
    shown=$(systemctl show "$unit" \
      --property=ActiveState,SubState,NRestarts,ActiveEnterTimestamp,MemoryCurrent \
      --no-page 2>/dev/null || true)

    # Pull each Key=Value line out of the property dump.
    local st_active st_sub n_restarts entered mem_bytes
    st_active=$(awk  -F= '/^ActiveState=/{print $2}'          <<<"$shown")
    st_sub=$(awk     -F= '/^SubState=/{print $2}'             <<<"$shown")
    n_restarts=$(awk -F= '/^NRestarts=/{print $2}'            <<<"$shown")
    entered=$(awk    -F= '/^ActiveEnterTimestamp=/{print $2}' <<<"$shown")
    mem_bytes=$(awk  -F= '/^MemoryCurrent=/{print $2}'        <<<"$shown")

    local up=0
    if [[ "$st_active" == "active" && -n "$entered" && "$entered" != "n/a" ]]; then
      local started
      started=$(date -d "$entered" +%s 2>/dev/null || echo "")
      if [[ -n "$started" ]]; then
        up=$(( epoch_now - started ))
      fi
    fi

    local kind chans bot_name
    kind=$(jq     -r --arg n "$agent" '.agents[$n].type'                 <<<"$registry_json")
    chans=$(jq    -r --arg n "$agent" '.agents[$n].channels // "none"'   <<<"$registry_json")
    bot_name=$(jq -r --arg n "$agent" '.agents[$n].botUsername // empty' <<<"$registry_json")

    # restarts/uptime travel as strings and are coerced to numbers in jq,
    # falling back to 0 when they do not parse.
    ndjson+=$(jq -cn \
      --arg name "$agent" --arg type "$kind" --arg channels "$chans" --arg bot "$bot_name" \
      --arg active "${st_active:-unknown}" --arg sub "${st_sub:-}" \
      --arg restarts "${n_restarts:-0}" --arg uptime "$up" --arg mem "${mem_bytes:-}" \
      '{name:$name, type:$type, channels:$channels, botUsername:$bot,
        active:$active, sub:$sub,
        restarts:($restarts|tonumber? // 0),
        uptime:($uptime|tonumber? // 0),
        mem:$mem}')
    ndjson+=$'\n'
  done

  printf '%s' "$ndjson" | jq -s -c '.'
}

# _watch_visible_len <string>
# Prints the length of the string once ANSI CSI sequences (ESC [ params
# letter) are removed, so padding math reflects what lands on screen.
_watch_visible_len() {
  local plain
  plain=$(sed -E $'s/\x1b\\[[0-9;?]*[a-zA-Z]//g' <<<"$1")
  printf '%d' "${#plain}"
}
# _watch_pad_right <string> <width>
# Prints the string followed by enough spaces to reach <width> visible
# columns (as measured by _watch_visible_len). Strings already at or over
# the width are printed unchanged — never truncated.
_watch_pad_right() {
  local text="$1" width="$2"
  local shown gap
  shown=$(_watch_visible_len "$text")
  gap=$(( width - shown ))
  if (( gap > 0 )); then
    printf '%s%*s' "$text" "$gap" ""
  else
    printf '%s' "$text"
  fi
}
# _watch_truncate <string> <width>
# Truncate to at most N visible chars, replacing the cut tail with "…"
# (assumes input has no embedded ANSI — color is added around the cell
# after truncation). Strings that already fit are printed unchanged.
# A width of 0 or less prints nothing: without that guard `${s:0:w-1}` gets
# a negative length, which bash reads as "drop that many chars from the
# end", so the result would be wider than the requested column.
_watch_truncate() {
  local s="$1" w="$2"
  if (( w <= 0 )); then return 0; fi
  if (( ${#s} <= w )); then printf '%s' "$s"; return; fi
  printf '%s…' "${s:0:w-1}"
}

# _watch_render <snapshot-json> <selected-index> <interval>
# Renders one dashboard frame to stdout: title bar (counts on the left,
# clock on the right), column header, one row per agent with the selected
# row in reverse video, and a key-hint footer. Flicker is avoided by homing
# the cursor, clearing to end-of-line after every line, and clearing
# everything below the frame at the end instead of wiping the screen first.
_watch_render() {
  local data="$1" selected="$2" interval="$3"

  local term_w
  term_w=$(tput cols 2>/dev/null || echo 100)

  local n_total n_active n_failed
  n_total=$(jq  'length' <<<"$data")
  n_active=$(jq '[.[] | select(.active == "active")] | length' <<<"$data")
  n_failed=$(jq '[.[] | select(.active == "failed")] | length' <<<"$data")

  local clock
  clock=$(date '+%Y-%m-%d %H:%M:%S')

  # The whole frame is assembled in memory and emitted by one printf at the
  # end, so a partially drawn frame is never visible.
  local frame="$WATCH_HOME"
  local eol="${WATCH_CLR_EOL}"$'\n'

  # Title bar: summary flush left, timestamp flush right.
  local banner banner_w gap spacer
  printf -v banner '%s5dive watch%s · %d agents · %s%d active%s · %s%d failed%s' \
    "$WATCH_BOLD$WATCH_CYAN" "$WATCH_RESET" "$n_total" \
    "$WATCH_GREEN" "$n_active" "$WATCH_RESET" \
    "$WATCH_RED" "$n_failed" "$WATCH_RESET"
  banner_w=$(_watch_visible_len "$banner")
  gap=$(( term_w - banner_w - ${#clock} ))
  if (( gap < 1 )); then gap=1; fi
  printf -v spacer '%*s' "$gap" ''
  frame+="${banner}${spacer}${WATCH_DIM}${clock}${WATCH_RESET}${eol}"
  frame+="$eol"

  frame+="${WATCH_DIM}    NAME              TYPE     CHANNEL                  UPTIME    RESTART    MEMORY${WATCH_RESET}${eol}"

  if (( n_total == 0 )); then
    frame+="$eol"
    frame+="    ${WATCH_DIM}no agents — try: ${WATCH_RESET}${WATCH_CYAN}5dive agent create my-agent --type=claude${WATCH_RESET}${eol}"
  else
    local idx
    for (( idx = 0; idx < n_total; idx++ )); do
      local entry name type channels bot up_secs restarts mem_b state
      entry=$(jq -c --argjson i "$idx" '.[$i]' <<<"$data")
      name=$(jq     -r '.name'                 <<<"$entry")
      type=$(jq     -r '.type'                 <<<"$entry")
      channels=$(jq -r '.channels'             <<<"$entry")
      bot=$(jq      -r '.botUsername // empty' <<<"$entry")
      up_secs=$(jq  -r '.uptime'               <<<"$entry")
      restarts=$(jq -r '.restarts'             <<<"$entry")
      mem_b=$(jq    -r '.mem'                  <<<"$entry")
      state=$(jq    -r '.active'               <<<"$entry")

      # Channel column: "-" for none, bot handle appended for telegram.
      local chan_label
      if [[ "$channels" == "none" ]]; then
        chan_label="-"
      elif [[ "$channels" == "telegram" && -n "$bot" ]]; then
        chan_label="telegram (@${bot})"
      else
        chan_label="$channels"
      fi

      local up_text mem_text bullet
      up_text=$(_watch_uptime "$up_secs")
      mem_text=$(_watch_mem   "$mem_b")
      bullet=$(_watch_dot     "$state")

      # Fixed-width cells: truncate first, then pad to the column width.
      local c_name c_type c_chan c_up c_restart line
      c_name=$(_watch_pad_right "$(_watch_truncate "$name"       16)" 16)
      c_type=$(_watch_pad_right "$(_watch_truncate "$type"        8)"  8)
      c_chan=$(_watch_pad_right "$(_watch_truncate "$chan_label" 24)" 24)
      c_up=$(_watch_pad_right   "$up_text"  9)
      printf -v c_restart '%7d  ' "$restarts"
      line="${c_name}  ${c_type} ${c_chan} ${c_up} ${c_restart}${mem_text}"

      if (( idx == selected )); then
        frame+=" ${bullet} ${WATCH_REV}${line}${WATCH_RESET}${eol}"
      else
        frame+=" ${bullet} ${line}${eol}"
      fi
    done
  fi

  # Footer: key hints flush left, refresh interval flush right.
  frame+="$eol"
  local hint_keys hint_rate keys_w rate_w foot_gap foot_spacer
  hint_keys="${WATCH_DIM}↑↓ select · ↵ attach · r refresh · q quit${WATCH_RESET}"
  hint_rate="${WATCH_DIM}refresh: ${interval}s${WATCH_RESET}"
  keys_w=$(_watch_visible_len "$hint_keys")
  rate_w=$(_watch_visible_len "$hint_rate")
  foot_gap=$(( term_w - keys_w - rate_w ))
  if (( foot_gap < 1 )); then foot_gap=1; fi
  printf -v foot_spacer '%*s' "$foot_gap" ''
  frame+="${hint_keys}${foot_spacer}${hint_rate}${WATCH_CLR_EOL}"

  # Wipe everything beneath in case the previous frame was taller.
  frame+="${WATCH_CLR_DOWN}"

  printf '%s' "$frame"
}

# 5dive watch — interactive live dashboard over the whole fleet.
#   --interval=N   seconds between automatic refreshes (1-60, default 2)
# Keys: ↑/k and ↓/j move the selection, ↵ attaches to the selected agent's
# tmux session (the dashboard resumes on detach), r refreshes now, q quits.
# Both stdin and stdout must be TTYs; otherwise the command fails with E_USAGE.
cmd_watch() {
  local interval=2
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --interval=*) interval="${1#--interval=}" ;;
      -h|--help)
        cat >&2 <<HELP
usage: 5dive watch [--interval=N]
  Live multi-agent dashboard. ↑↓ select, ↵ attach, r refresh, q quit.
HELP
        return 0 ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  [[ "$interval" =~ ^[0-9]+$ ]] && (( interval >= 1 && interval <= 60 )) \
    || fail "$E_VALIDATION" "--interval must be 1-60 seconds"
  ensure_state

  # Need a TTY for the alt-screen + arrow-key reads to make sense. CI / pipes
  # get a clear error instead of garbled output.
  [[ -t 1 && -t 0 ]] || fail "$E_USAGE" "5dive watch requires a TTY (try running it directly, not piped)"

  local selected=0
  local quit=0

  # Restore cursor + attributes and leave the alternate screen. Installed as
  # a trap so the terminal is restored on every exit path.
  _watch_teardown() {
    printf '%s%s%s' "$WATCH_SHOW" "$WATCH_RESET" "$WATCH_ALT_OFF"
  }
  trap '_watch_teardown; exit 130' INT TERM
  trap '_watch_teardown' EXIT
  printf '%s%s' "$WATCH_ALT_ON" "$WATCH_HIDE"

  while (( ! quit )); do
    local snap count
    snap=$(_watch_snapshot)
    count=$(jq 'length' <<<"$snap")
    # Clamp the selection: the fleet may have shrunk since the last tick.
    (( count == 0 )) && selected=0
    (( selected >= count && count > 0 )) && selected=$((count - 1))
    (( selected < 0 )) && selected=0

    _watch_render "$snap" "$selected" "$interval"

    # Drain input within the refresh window. read returns non-zero on
    # timeout, which we use as the "tick" trigger.
    local tick_end
    tick_end=$(( $(date +%s) + interval ))
    while (( $(date +%s) < tick_end )); do
      local remaining=$(( tick_end - $(date +%s) ))
      (( remaining <= 0 )) && break
      local key=""
      if IFS= read -rsn1 -t "$remaining" key; then
        # Selection moves use plain assignments rather than ((selected++)):
        # a post-increment from 0 evaluates to 0, i.e. exit status 1, and as
        # the last command of an && list that would trip `set -e` and end the
        # dashboard the first time the user moves down from the top row.
        case "$key" in
          q|Q) quit=1; break ;;
          r|R) break ;;
          j|J)
            if (( count > 0 && selected < count - 1 )); then
              selected=$(( selected + 1 ))
            fi ;;
          k|K)
            if (( selected > 0 )); then
              selected=$(( selected - 1 ))
            fi ;;
          $'\x1b')
            # Arrow keys: ESC [ A (up) / B (down). Short timeouts so a bare
            # ESC press doesn't hang. Both bytes start out empty so a timed-out
            # read can neither reuse the previous sequence nor leave the
            # variable unset under `set -u`.
            local rest1="" rest2=""
            read -rsn1 -t 0.05 rest1 || true
            read -rsn1 -t 0.05 rest2 || true
            if [[ "$rest1" == "[" ]]; then
              case "$rest2" in
                A)
                  if (( selected > 0 )); then
                    selected=$(( selected - 1 ))
                  fi ;;
                B)
                  if (( count > 0 && selected < count - 1 )); then
                    selected=$(( selected + 1 ))
                  fi ;;
              esac
            fi ;;
          ""|$'\n'|$'\r')
            (( count > 0 )) || continue
            local target
            target=$(jq -r --argjson i "$selected" '.[$i].name' <<<"$snap")
            [[ -n "$target" && "$target" != "null" ]] || continue
            # Leave alt screen so tmux attach sees the user's real terminal.
            # Re-enter when the user detaches.
            _watch_teardown
            sudo -u "agent-${target}" tmux attach -t "agent-${target}" || true
            printf '%s%s' "$WATCH_ALT_ON" "$WATCH_HIDE"
            break ;;
        esac
        # Re-render after a key for snappy navigation feedback.
        _watch_render "$snap" "$selected" "$interval"
      fi
    done
  done
}

# -------- compose (declarative agents via 5dive.yaml) --------
#
# Docker-Compose-style declarative manager. Define your AI team in a
# 5dive.yaml file and bring it up/down with one command. Re-running `5dive
# up` is idempotent — existing agents are left alone, missing ones are
# created. Drift between spec and live state is logged but not auto-applied;
# tear down + bring up to recreate.
#
# Schema (v1):
#   version: "1"
#   agents:
#     <name>:
#       type:           claude|codex|hermes|openclaw|opencode  (required)
#       channels:       none|telegram|discord                          (default none)
#       telegram_token: "<bot-token>"      # required if channels=telegram
#       discord_token:  "<bot-token>"      # required if channels=discord
#       workdir:        ./relative/or/absolute/path
#       skills:         [skill1, skill2]   # bare ids or owner/repo:id
#       no_skills:      true               # opt out of inherited skills
#       defer_auth:     true               # create without auth gate
#       isolation:      admin|standard|sandboxed
#       auth_profile:   <named-account>
#       provider:       <byo-id>           # hermes/openclaw only
#       api_key:        "<key>"            # hermes/openclaw only
#
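# Env vars: any "${VAR}" in a string value is expanded from the process env.
# VAR must be an upper-case name matching [A-Z_][A-Z0-9_]*; other "${...}"
# forms are passed through untouched. Missing/empty vars fail loudly so a
# misconfigured shell can't silently create agents with literal "${...}"
# strings as bot tokens.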

# Print the name of the default spec file found in the current directory,
# preferring 5dive.yaml over 5dive.yml. Prints nothing and returns 1 when
# neither file exists.
_compose_default_file() {
  local candidate
  for candidate in 5dive.yaml 5dive.yml; do
    if [[ -f "$candidate" ]]; then
      printf '%s' "$candidate"
      return
    fi
  done
  return 1
}

# YAML → JSON via python3 + PyYAML, with strict ${VAR} env expansion.
#   $1      path to the spec file
#   stdout  the normalised spec as one JSON document
#   stderr  "error: ..." / "warning: ..." diagnostics
#   exit    3 = invalid spec, 4 = spec unreadable or PyYAML missing
#
# v1 just normalised the agents map. v2 additionally:
#   - merges a top-level defaults{} into every agent (agent-level keys win),
#   - validates reports_to (targets must resolve to agent names, no self-edge,
#     no cycles) and the instructions / instructions_file XOR,
#   - warns (does not fail) on unknown per-agent keys for forward-compat.
# Output JSON keeps the same {..., agents:{<name>:{merged spec}}} shape so the
# v1 create path is untouched; team{}/defaults{}/version pass through for export.
_compose_parse() {
  local file="$1"
  python3 - "$file" <<'PY'
import json, os, re, sys

def die(msg, code=3):
    print(f"error: {msg}", file=sys.stderr)
    sys.exit(code)

# A missing PyYAML should read as a one-line error, not a Python traceback.
try:
    import yaml
except ImportError:
    die("python3 module 'yaml' (PyYAML) is required to read the spec", 4)

try:
    # Binary mode: PyYAML decodes the stream itself (UTF-8 unless a BOM says
    # otherwise), so parsing does not depend on the process locale, and
    # undecodable bytes surface as a YAMLError.
    with open(sys.argv[1], "rb") as f:
        data = yaml.safe_load(f)
except yaml.YAMLError as e:
    die(f"yaml parse failed: {e}")
except OSError as e:
    die(f"cannot open {sys.argv[1]}: {e}", 4)
if not isinstance(data, dict) or "agents" not in data or not isinstance(data["agents"], dict):
    die("spec must have a top-level 'agents:' map")

defaults = data.get("defaults") or {}
if not isinstance(defaults, dict):
    die("'defaults:' must be a map")

# Known per-agent keys (v1 + v2). Unknown → warn, not fail (forward-compat).
KNOWN = {
    "type","channels","telegram_token","discord_token","workdir","skills",
    "no_skills","defer_auth","isolation","auth_profile","provider","api_key",
    "role","instructions","instructions_file","model","effort","reports_to","goals",
}

# Merge defaults under each agent (agent keys win). None spec → empty map.
merged = {}
for name, spec in data["agents"].items():
    spec = spec or {}
    if not isinstance(spec, dict):
        die(f"agent '{name}' must be a map")
    m = dict(defaults); m.update(spec)
    merged[name] = m
    if m.get("instructions") and m.get("instructions_file"):
        die(f"agent '{name}': instructions and instructions_file are mutually exclusive")
    for k in spec:
        if k not in KNOWN:
            print(f"warning: agent '{name}': unknown key '{k}' (ignored)", file=sys.stderr)
data["agents"] = merged

# reports_to: normalise to a list, validate targets resolve + no self-edge.
def rt_list(v):
    if v is None or v == "": return []
    return v if isinstance(v, list) else [v]

def declared(v):
    # A mapping or list used as a manager is unhashable; treat it as an
    # unknown agent instead of letting the membership test raise TypeError.
    try:
        return v in merged
    except TypeError:
        return False

edges = {}
for name, m in merged.items():
    mgrs = rt_list(m.get("reports_to"))
    for mgr in mgrs:
        if not declared(mgr):
            die(f"agent '{name}': reports_to '{mgr}' is not a declared agent")
        if mgr == name:
            die(f"agent '{name}': cannot report to itself")
    edges[name] = mgrs

# Reject cycles in the reporting graph (DFS, colour-marking). Roots are taken
# in spec order so the reported cycle is the same on every run.
WHITE, GREY, BLACK = 0, 1, 2
colour = {n: WHITE for n in merged}
def visit(n, stack):
    colour[n] = GREY
    for mgr in edges.get(n, []):
        if colour[mgr] == GREY:
            # str(): YAML can yield non-string names (e.g. a bare `1:`).
            cyc = " -> ".join(str(x) for x in stack + [n, mgr])
            die(f"reporting cycle detected: {cyc}")
        if colour[mgr] == WHITE:
            visit(mgr, stack + [n])
    colour[n] = BLACK
for n in merged:
    if colour[n] == WHITE:
        visit(n, [])

env_re = re.compile(r"\$\{([A-Z_][A-Z0-9_]*)\}")
def expand(v):
    if isinstance(v, str):
        def sub(m):
            k = m.group(1)
            if k not in os.environ or os.environ[k] == "":
                die(f"env var '{k}' referenced in spec is unset")
            return os.environ[k]
        return env_re.sub(sub, v)
    if isinstance(v, dict): return {k: expand(x) for k, x in v.items()}
    if isinstance(v, list): return [expand(x) for x in v]
    return v
# default=str: unquoted YAML dates/timestamps load as date objects, which
# json cannot encode natively; emit their text form rather than crashing.
print(json.dumps(expand(data), default=str))
PY
}

# Turn a path taken from the spec into an absolute one.
#   $1  path as written in the spec
#   $2  directory containing the spec file
# Absolute paths are echoed back unchanged. Relative ones are anchored at the
# spec's directory (Docker-Compose convention) and normalised with
# `realpath -m`, which does not require the target to exist yet.
_compose_resolve_path() {
  local p="$1" spec_dir="$2"
  case "$p" in
    /*) printf '%s' "$p" ;;
    *)  realpath -m "${spec_dir}/${p}" ;;
  esac
}

# Build argv for `5dive agent create <name> ...` from a parsed agent spec.
#   $1  the agent's merged spec as a JSON object
#   $2  agent name (emitted first, as the positional argument)
#   $3  directory of the spec file, used to resolve a relative workdir
# Echoed one arg per line so the caller can mapfile-slurp into an array
# (spaces and quotes in values survive; a value containing a newline would
# not). Always returns 0 once the arguments have been printed.
# NOTE(review): tokens and api_key end up on the child's command line —
# confirm that exposure is acceptable on this host.
_compose_create_args() {
  local spec="$1" name="$2" spec_dir="$3"
  printf '%s\n' "$name"
  local type channels tg_token dc_token workdir profile isolation provider api_key
  type=$(jq      -r '.type             // empty' <<<"$spec")
  channels=$(jq  -r '.channels         // empty' <<<"$spec")
  tg_token=$(jq  -r '.telegram_token   // empty' <<<"$spec")
  dc_token=$(jq  -r '.discord_token    // empty' <<<"$spec")
  workdir=$(jq   -r '.workdir          // empty' <<<"$spec")
  profile=$(jq   -r '.auth_profile     // empty' <<<"$spec")
  isolation=$(jq -r '.isolation        // empty' <<<"$spec")
  provider=$(jq  -r '.provider         // empty' <<<"$spec")
  api_key=$(jq   -r '.api_key          // empty' <<<"$spec")
  local no_skills defer_auth
  no_skills=$(jq  -r '.no_skills  // false' <<<"$spec")
  defer_auth=$(jq -r '.defer_auth // false' <<<"$spec")

  # --type is always emitted; every other flag only when the spec sets it.
  printf '%s\n' "--type=${type}"
  if [[ -n "$channels" ]]; then printf '%s\n' "--channels=${channels}"; fi
  if [[ -n "$tg_token" ]]; then printf '%s\n' "--telegram-token=${tg_token}"; fi
  if [[ -n "$dc_token" ]]; then printf '%s\n' "--discord-token=${dc_token}"; fi
  if [[ -n "$workdir" ]]; then
    local wd_abs
    wd_abs=$(_compose_resolve_path "$workdir" "$spec_dir")
    printf '%s\n' "--workdir=${wd_abs}"
  fi
  if [[ -n "$profile"   ]]; then printf '%s\n' "--auth-profile=${profile}"; fi
  if [[ -n "$isolation" ]]; then printf '%s\n' "--isolation=${isolation}"; fi
  if [[ -n "$provider"  ]]; then printf '%s\n' "--provider=${provider}"; fi
  if [[ -n "$api_key"   ]]; then printf '%s\n' "--api-key=${api_key}"; fi

  # Skills: comma-join the array; a bare string (`skills: foo`) is accepted as
  # a one-element list instead of making jq's join fail. --no-skills wins
  # (cmd_create's parser treats them as mutually exclusive at the call site).
  local skills_csv
  skills_csv=$(jq -r '(.skills // []) | if type == "array" then join(",") elif type == "string" then . else "" end' <<<"$spec")
  if [[ "$no_skills" == "true" ]]; then
    printf '%s\n' "--no-skills"
  elif [[ -n "$skills_csv" ]]; then
    printf '%s\n' "--with-skills=${skills_csv}"
  fi
  if [[ "$defer_auth" == "true" ]]; then
    printf '%s\n' "--defer-auth"
  fi
  # Explicit success: a trailing `[[ ... ]] && printf` would make the function
  # return 1 on the normal path whenever defer_auth is not set.
  return 0
}

# Append a "## Role" + "## Reporting" markdown block to one agent's
# /home/agent-<name>/.claude/CLAUDE.md (tee -a as the agent user, so an
# existing file is extended and a missing one is created).
#   $1  the whole parsed spec (JSON)   $2  agent name   $3  spec directory
#
# The role heading carries `role`; the body is `instructions`, or the contents
# of `instructions_file` resolved against the spec directory. The Reporting
# section is derived from reports_to in both directions — this agent's
# managers and every agent naming it as a manager — and each line includes the
# matching `5dive agent send` command. Agents with no role, instructions,
# managers or reports are left untouched. A failed write only warns.
# The function appends unconditionally, so callers must invoke it once per
# agent.
_compose_write_role_md() {
  local spec="$1" name="$2" spec_dir="$3"

  local node title body body_file
  node=$(jq -c --arg n "$name" '.agents[$n]' <<<"$spec")
  title=$(jq     -r '.role              // empty' <<<"$node")
  body=$(jq      -r '.instructions      // empty' <<<"$node")
  body_file=$(jq -r '.instructions_file // empty' <<<"$node")

  # Inline instructions take precedence; the file is only read without them.
  if [[ -z "$body" && -n "$body_file" ]]; then
    local resolved
    resolved=$(_compose_resolve_path "$body_file" "$spec_dir")
    if [[ ! -f "$resolved" ]]; then
      warn "[$name] instructions_file not found: $resolved"
    else
      body=$(cat "$resolved")
    fi
  fi

  # Upward edges (who $name reports to) and downward edges (who reports to it).
  local -a managers subordinates
  mapfile -t managers     < <(jq -r --arg n "$name" '.agents[$n].reports_to // empty | if type=="array" then .[] else . end' <<<"$spec")
  mapfile -t subordinates < <(jq -r --arg n "$name" '.agents | to_entries[] | select((.value.reports_to // empty) | if type=="array" then any(. == $n) else . == $n end) | .key' <<<"$spec")

  # Nothing role-related declared: leave the file exactly as it is.
  if [[ -z "$title" && -z "$body" ]] \
      && (( ${#managers[@]} == 0 && ${#subordinates[@]} == 0 )); then
    return 0
  fi

  local md_block=$'\n\n'
  if [[ -z "$title" ]]; then
    md_block+="## Role"$'\n\n'
  else
    md_block+="## Role: ${title}"$'\n\n'
  fi
  if [[ -n "$body" ]]; then
    md_block+="${body}"$'\n\n'
  fi

  md_block+="## Reporting"$'\n'
  local peer
  if (( ${#managers[@]} == 0 )); then
    md_block+="- You sit at the top of this org; you answer to the human owner."$'\n'
  else
    for peer in "${managers[@]}"; do
      md_block+="- You report to **${peer}**. Escalate or sync: \`5dive agent send ${peer} '<message>'\`."$'\n'
    done
  fi
  if (( ${#subordinates[@]} > 0 )); then
    for peer in "${subordinates[@]}"; do
      md_block+="- Direct report **${peer}**. Delegate: \`5dive agent send ${peer} '<task>'\`."$'\n'
    done
  fi

  local owner="agent-${name}"
  local agent_home="/home/agent-${name}"
  local target="${agent_home}/.claude/CLAUDE.md"
  sudo -u "$owner" mkdir -p "${agent_home}/.claude" 2>/dev/null || true
  if ! printf '%s' "$md_block" | sudo -u "$owner" tee -a "$target" >/dev/null 2>&1; then
    warn "[$name] could not write role instructions to $target"
  else
    sudo chmod 644 "$target" 2>/dev/null || true
  fi
}

# Apply the v2 role wiring for one freshly-created agent: model, effort, org
# edge (reports_to), role instructions + reporting block, and seed goals into
# the task queue.
#   $1  whole parsed spec (JSON)   $2  agent name
#   $3  spec directory             $4  path of this script (re-exec'd via bash)
# The CLI steps are shelled out to `bash "$self" ...` with output discarded; a
# failing step only warns, so one failure can't abort the whole bring-up.
# Always returns 0.
_compose_wire_role() {
  local spec="$1" name="$2" spec_dir="$3" self="$4"
  local agent model effort role primary_mgr
  agent=$(jq -c --arg n "$name" '.agents[$n]' <<<"$spec")
  model=$(jq  -r '.model  // empty'    <<<"$agent")
  effort=$(jq -r '.effort // empty'    <<<"$agent")
  role=$(jq   -r '.role   // empty'    <<<"$agent")
  # With several managers only the first becomes the org edge.
  primary_mgr=$(jq -r '.reports_to // empty | if type=="array" then (.[0] // "") else . end' <<<"$agent")

  # model / effort via the public config path (process-isolated; on failure
  # the warning tells the user how to apply the setting later).
  if [[ -n "$model" ]]; then
    bash "$self" agent config "$name" set "model=$model" >/dev/null 2>&1 \
      || warn "[$name] set model=$model failed (apply later: 5dive agent config $name set model=$model)"
  fi
  if [[ -n "$effort" ]]; then
    bash "$self" agent config "$name" set "effort=$effort" >/dev/null 2>&1 \
      || warn "[$name] set effort=$effort failed (apply later: 5dive agent config $name set effort=$effort)"
  fi

  # Org edge + title. org set carries one manager; the Reporting block lists all.
  if [[ -n "$role" || -n "$primary_mgr" ]]; then
    local -a oargs=(org set "$name")
    [[ -n "$role"        ]] && oargs+=("--role=$role")
    [[ -n "$primary_mgr" ]] && oargs+=("--manager=$primary_mgr")
    bash "$self" "${oargs[@]}" >/dev/null 2>&1 || warn "[$name] org set failed"
  fi

  # Role instructions + reporting block → agent CLAUDE.md.
  _compose_write_role_md "$spec" "$name" "$spec_dir"

  # Seed goals into the shared task queue, assigned to the role, from its
  # manager. Goals are fetched by index rather than split on newlines, so a
  # multi-line goal (e.g. a YAML block scalar) stays ONE task instead of
  # becoming one task per line.
  local goals_json goal_count gi g
  goals_json=$(jq -c '[(.goals // [])[]?]' <<<"$agent")
  goal_count=$(jq 'length' <<<"$goals_json")
  for (( gi = 0; gi < goal_count; gi++ )); do
    g=$(jq -r --argjson i "$gi" '.[$i]' <<<"$goals_json")
    [[ -n "$g" ]] || continue
    local -a targs=(task add "$g" "--assignee=$name")
    [[ -n "$primary_mgr" ]] && targs+=("--from=$primary_mgr")
    bash "$self" "${targs[@]}" >/dev/null 2>&1 || warn "[$name] seed goal failed: $g"
  done
  return 0
}

# Print the resolved absolute path of this script. Callers run it as
# `bash "$self" ...`, which works for an installed (+x) copy as well as for a
# source checkout without the executable bit.
_compose_self() {
  local script="${BASH_SOURCE[0]}"
  realpath "$script"
}

# 5dive up [-f file] — create every agent declared in the spec that is not in
# the registry yet, and (re)start the ones that already are.
#   -f FILE | --file FILE | --file=FILE   spec to apply (default: 5dive.yaml,
#                                         then 5dive.yml, in the cwd)
# Each agent is created by re-executing this script (`agent create`), followed
# by best-effort v2 role wiring. Invalid names and failed creates are counted
# as errors; the summary reports created/started/skipped/errors and the
# function returns E_GENERIC when any error occurred.
cmd_compose_up() {
  local file=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -f|--file)
        # Without this guard a trailing `-f` dies on an unbound $2 under
        # `set -u` instead of producing a usage error.
        [[ $# -ge 2 ]] || fail "$E_USAGE" "$1 requires a file argument"
        file="$2"; shift ;;
      --file=*)     file="${1#--file=}" ;;
      -h|--help)
        cat >&2 <<HELP
usage: 5dive up [-f file]
  Bring up agents declared in 5dive.yaml. Idempotent — existing agents are
  left alone, missing ones are created and started.
  Default file: 5dive.yaml or 5dive.yml in the current directory.
HELP
        return 0 ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  if [[ -z "$file" ]]; then
    file=$(_compose_default_file) \
      || fail "$E_NOT_FOUND" "no 5dive.yaml or 5dive.yml in $(pwd) — pass -f <file>"
  fi
  [[ -f "$file" ]] || fail "$E_NOT_FOUND" "spec file not found: $file"
  ensure_state

  local spec spec_dir self
  spec=$(_compose_parse "$file") || fail "$E_VALIDATION" "spec parse failed"
  spec_dir=$(realpath "$(dirname "$file")")
  self=$(_compose_self)

  # Registry snapshot taken once, before any agent is created.
  local reg
  reg=$(registry_read)

  local names created=0 started=0 skipped=0 errors=0
  mapfile -t names < <(jq -r '.agents | keys[]' <<<"$spec")
  if (( ${#names[@]} == 0 )); then
    warn "spec has no agents declared"
    ok "no agents to apply" '{file:$f, created:0, started:0, skipped:0, errors:0}' --arg f "$file"
    return 0
  fi

  local name
  for name in "${names[@]}"; do
    if ! valid_name "$name"; then
      warn "[$name] invalid agent name — skipping"
      ((errors++)) || true
      continue
    fi
    local exists
    exists=$(jq --arg n "$name" '.agents[$n] != null' <<<"$reg")
    if [[ "$exists" == "true" ]]; then
      step "[$name] already exists — ensuring started"
      if bash "$self" agent start "$name" >/dev/null 2>&1; then
        ((started++)) || true
      else
        ((skipped++)) || true
      fi
      continue
    fi
    step "[$name] creating"
    local agent_spec
    agent_spec=$(jq -c --arg n "$name" '.agents[$n]' <<<"$spec")
    local -a args=()
    mapfile -t args < <(_compose_create_args "$agent_spec" "$name" "$spec_dir")
    # In --json mode stdout must carry nothing but the final envelope, so the
    # child's own stdout is diverted to stderr there (it stays visible to a
    # human, but cannot corrupt the JSON). Text mode is unchanged.
    local create_rc=0
    if (( JSON_MODE )); then
      bash "$self" agent create "${args[@]}" >&2 || create_rc=$?
    else
      bash "$self" agent create "${args[@]}" || create_rc=$?
    fi
    if (( create_rc == 0 )); then
      ((created++)) || true
      # v2 role wiring (model/effort/org/instructions/goals). Best-effort: a
      # wiring hiccup must not fail the create that already succeeded.
      _compose_wire_role "$spec" "$name" "$spec_dir" "$self" || true
    else
      warn "[$name] create failed"
      ((errors++)) || true
    fi
  done

  if (( JSON_MODE )); then
    ok "" '{file:$f, created:($c|tonumber), started:($s|tonumber), skipped:($k|tonumber), errors:($e|tonumber)}' \
      --arg f "$file" --arg c "$created" --arg s "$started" --arg k "$skipped" --arg e "$errors"
  else
    echo "OK — applied $file: created=$created started=$started skipped=$skipped errors=$errors"
  fi
  (( errors == 0 )) || return "$E_GENERIC"
}

# 5dive down [-f file] — remove every agent declared in the spec that is
# present in the registry, by re-executing this script with `agent rm`.
#   -f FILE | --file FILE | --file=FILE   spec to tear down (default:
#                                         5dive.yaml, then 5dive.yml, in cwd)
# Agents not in the registry are counted as "missing", not as errors. The
# summary reports removed/missing/errors; returns E_GENERIC when a removal
# failed.
cmd_compose_down() {
  local file=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -f|--file)
        # Without this guard a trailing `-f` dies on an unbound $2 under
        # `set -u` instead of producing a usage error.
        [[ $# -ge 2 ]] || fail "$E_USAGE" "$1 requires a file argument"
        file="$2"; shift ;;
      --file=*)     file="${1#--file=}" ;;
      -h|--help)
        cat >&2 <<HELP
usage: 5dive down [-f file]
  Tear down agents declared in 5dive.yaml — stops and removes each one.
HELP
        return 0 ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  if [[ -z "$file" ]]; then
    file=$(_compose_default_file) \
      || fail "$E_NOT_FOUND" "no 5dive.yaml or 5dive.yml in $(pwd) — pass -f <file>"
  fi
  [[ -f "$file" ]] || fail "$E_NOT_FOUND" "spec file not found: $file"
  ensure_state

  local spec self
  spec=$(_compose_parse "$file") || fail "$E_VALIDATION" "spec parse failed"
  self=$(_compose_self)

  # Registry snapshot taken once, before anything is removed.
  local reg
  reg=$(registry_read)

  local names removed=0 missing=0 errors=0
  mapfile -t names < <(jq -r '.agents | keys[]' <<<"$spec")
  local name
  for name in "${names[@]}"; do
    local exists
    exists=$(jq --arg n "$name" '.agents[$n] != null' <<<"$reg")
    if [[ "$exists" != "true" ]]; then
      step "[$name] not present — skipping"
      ((missing++)) || true
      continue
    fi
    step "[$name] removing"
    if bash "$self" agent rm "$name" >/dev/null 2>&1; then
      ((removed++)) || true
    else
      warn "[$name] remove failed"
      ((errors++)) || true
    fi
  done

  if (( JSON_MODE )); then
    ok "" '{file:$f, removed:($r|tonumber), missing:($m|tonumber), errors:($e|tonumber)}' \
      --arg f "$file" --arg r "$removed" --arg m "$missing" --arg e "$errors"
  else
    echo "OK — torn down $file: removed=$removed missing=$missing errors=$errors"
  fi
  (( errors == 0 )) || return "$E_GENERIC"
}

# 5dive export — round-trip the live fleet back to a v2 5dive.yaml, so a running
# org can be saved, versioned, and forked into a template. Dumps the structural
# spec (type/channels/workdir/auth_profile + model/effort + role/reports_to from
# the org graph). Role INSTRUCTIONS are not round-tripped: an agent's CLAUDE.md
# interleaves the shared telegram fragment with the role block, so re-deriving
# clean source is unsafe — a header comment in the export says so instead.
#   -o FILE | --output FILE | --output=FILE   write there instead of stdout
cmd_compose_export() {
  local out=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o|--output)
        # Without this guard a trailing `-o` dies on an unbound $2 under
        # `set -u` instead of producing a usage error.
        [[ $# -ge 2 ]] || fail "$E_USAGE" "$1 requires a file argument"
        out="$2"; shift ;;
      --output=*)  out="${1#--output=}" ;;
      -h|--help)
        cat >&2 <<HELP
usage: 5dive export [-o team.yaml]
  Dump the running fleet to a v2 5dive.yaml (stdout if no -o). Captures
  type/channels/workdir/auth_profile, model/effort, and role/reports_to.
HELP
        return 0 ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  ensure_state
  local reg
  reg=$(registry_read)

  local names agents="{}"
  mapfile -t names < <(jq -r '.agents | keys[]' <<<"$reg")
  local name
  for name in "${names[@]}"; do
    local type channels workdir profile model effort role mgr
    type=$(jq    -r --arg n "$name" '.agents[$n].type        // "claude"' <<<"$reg")
    channels=$(jq -r --arg n "$name" '.agents[$n].channels    // empty'    <<<"$reg")
    workdir=$(jq -r --arg n "$name" '.agents[$n].workdir      // empty'    <<<"$reg")
    profile=$(jq -r --arg n "$name" '.agents[$n].authProfile  // empty'    <<<"$reg")
    model=$(resolve_agent_model  "$type" "$name")
    effort=$(resolve_agent_effort "$type" "$name")
    # Org data is optional (stderr is already discarded). Under errexit +
    # pipefail a failing query would otherwise abort the export with no
    # message, so a failed read degrades to "no role / no manager".
    role=$(db "SELECT COALESCE(role,'')       FROM agents_org WHERE name=$(sqlq "$name");" 2>/dev/null | head -1) || role=""
    mgr=$(db  "SELECT COALESCE(reports_to,'') FROM agents_org WHERE name=$(sqlq "$name");" 2>/dev/null | head -1) || mgr=""
    # Assemble one agent object, dropping empty fields.
    local obj
    obj=$(jq -n \
      --arg type "$type" --arg channels "$channels" --arg workdir "$workdir" \
      --arg profile "$profile" --arg model "$model" --arg effort "$effort" \
      --arg role "$role" --arg mgr "$mgr" '
      {type:$type}
      | (if $channels != "" then .channels = $channels else . end)
      | (if $workdir  != "" then .workdir  = $workdir  else . end)
      | (if $profile  != "" then .auth_profile = $profile else . end)
      | (if $model    != "" then .model    = $model    else . end)
      | (if $effort   != "" then .effort   = $effort   else . end)
      | (if $role     != "" then .role     = $role     else . end)
      | (if $mgr      != "" then .reports_to = $mgr    else . end)')
    agents=$(jq -c --arg n "$name" --argjson o "$obj" '. + {($n): $o}' <<<"$agents")
  done

  local doc
  doc=$(jq -n --argjson agents "$agents" '{version:"2", agents:$agents}')
  local yaml
  yaml=$(printf '%s' "$doc" | python3 -c 'import sys,yaml,json; print("# 5dive.yaml v2 — exported fleet\n# note: role instructions are not round-tripped; re-add per role as needed.\n" + yaml.safe_dump(json.load(sys.stdin), sort_keys=False, default_flow_style=False))') \
    || fail "$E_GENERIC" "yaml serialisation failed"
  # $(...) stripped the trailing newline; put exactly one back so the file
  # (or the terminal output) ends on a complete line.
  if [[ -n "$out" ]]; then
    printf '%s\n' "$yaml" > "$out" || fail "$E_GENERIC" "cannot write $out"
    ok "exported ${#names[@]} agents to $out" '{file:$f, agents:($n|tonumber)}' --arg f "$out" --arg n "${#names[@]}"
  else
    printf '%s\n' "$yaml"
  fi
}

# Print the directory holding the curated team templates: the installed
# location /usr/local/lib/5dive/team-templates when it exists, otherwise a
# team-templates/ directory one level above this script's directory (source
# checkout), printed in resolved form. Returns 1 without output when neither
# exists.
_team_templates_dir() {
  local installed=/usr/local/lib/5dive/team-templates
  if [[ -d "$installed" ]]; then
    printf '%s' "$installed"
    return
  fi

  local checkout
  checkout="$(dirname "$(_compose_self)")/../team-templates"
  if [[ ! -d "$checkout" ]]; then
    return 1
  fi
  realpath "$checkout"
}

# 5dive team import <slug|path> [--auth-profile=<name>] — resolve a bundled
# template (or take a path as-is) and bring the whole org up through
# cmd_compose_up; creation and v2 wiring are entirely the compose engine's.
# 5dive team ls — list the bundled templates (slug + file path).
#
# <slug> maps to <templates-dir>/<slug>.5dive.yaml, then .5dive.yml. An
# existing file path always wins over a slug of the same name.
# --auth-profile is exported as TEAM_AUTH_PROFILE so a template can reference
# it as ${TEAM_AUTH_PROFILE}.
cmd_team() {
  local sub="${1:-}"; shift || true
  case "$sub" in
    import) : ;;
    ls|list)
      local dir; dir=$(_team_templates_dir) || fail "$E_NOT_FOUND" "no team-templates dir found"
      echo "Available templates ($dir):"
      local f
      for f in "$dir"/*.5dive.yaml "$dir"/*.5dive.yml; do
        # An unmatched glob stays literal; skip it.
        [[ -f "$f" ]] || continue
        local slug; slug=$(basename "$f"); slug="${slug%%.5dive.*}"
        printf '  %-16s %s\n' "$slug" "$f"
      done
      return 0 ;;
    -h|--help|"" )
      cat >&2 <<HELP
usage: 5dive team import <slug|path> [--auth-profile=<name>]
       5dive team ls
  Provision a whole company-structure template in one call (wraps 5dive up).
  <slug> resolves to a bundled template; a path is used as-is.
HELP
      return 0 ;;
    *) fail "$E_USAGE" "unknown team subcommand: $sub (try: import, ls)" ;;
  esac

  local ref="" profile=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --auth-profile=*) profile="${1#--auth-profile=}" ;;
      --auth-profile)
        # Without this guard a trailing `--auth-profile` dies on an unbound $2
        # under `set -u` instead of producing a usage error.
        [[ $# -ge 2 ]] || fail "$E_USAGE" "--auth-profile requires a value"
        profile="$2"; shift ;;
      -*) fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        # Exactly one positional (the template reference) is accepted.
        if [[ -z "$ref" ]]; then
          ref="$1"
        else
          fail "$E_USAGE" "extra arg: $1"
        fi ;;
    esac
    shift
  done
  [[ -n "$ref" ]] || fail "$E_USAGE" "usage: 5dive team import <slug|path>"

  local file=""
  if [[ -f "$ref" ]]; then
    file="$ref"
  else
    local dir; dir=$(_team_templates_dir) || fail "$E_NOT_FOUND" "no team-templates dir — pass a path"
    if   [[ -f "$dir/${ref}.5dive.yaml" ]]; then file="$dir/${ref}.5dive.yaml"
    elif [[ -f "$dir/${ref}.5dive.yml"  ]]; then file="$dir/${ref}.5dive.yml"
    else fail "$E_NOT_FOUND" "no template '$ref' in $dir (try: 5dive team ls)"
    fi
  fi

  # --auth-profile overrides the template's ${TEAM_AUTH_PROFILE} default.
  if [[ -n "$profile" ]]; then
    export TEAM_AUTH_PROFILE="$profile"
  fi
  step "importing team from $file"
  cmd_compose_up -f "$file"
}

# 5dive ps [-f file] — show the state of every agent declared in the spec.
#   -f FILE | --file FILE | --file=FILE   spec to inspect (default: 5dive.yaml,
#                                         then 5dive.yml, in the cwd)
# Per agent: the declared type plus a state — "missing" when the agent is not
# in the registry, otherwise whatever `systemctl is-active` reports for its
# 5dive-agent@<name>.service unit ("unknown" if it reports nothing).
# Output: a NAME/TYPE/STATE table, or {file, agents:[{name,type,state}]} in
# --json mode.
cmd_compose_ps() {
  local file=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -f|--file)
        # Without this guard a trailing `-f` dies on an unbound $2 under
        # `set -u` instead of producing a usage error.
        [[ $# -ge 2 ]] || fail "$E_USAGE" "$1 requires a file argument"
        file="$2"; shift ;;
      --file=*)     file="${1#--file=}" ;;
      -h|--help)
        cat >&2 <<HELP
usage: 5dive ps [-f file]
  Show status of agents declared in 5dive.yaml.
HELP
        return 0 ;;
      *) fail "$E_USAGE" "unknown flag: $1" ;;
    esac
    shift
  done
  if [[ -z "$file" ]]; then
    file=$(_compose_default_file) \
      || fail "$E_NOT_FOUND" "no 5dive.yaml or 5dive.yml in $(pwd) — pass -f <file>"
  fi
  [[ -f "$file" ]] || fail "$E_NOT_FOUND" "spec file not found: $file"
  ensure_state

  local spec
  spec=$(_compose_parse "$file") || fail "$E_VALIDATION" "spec parse failed"
  local reg
  reg=$(registry_read)

  local names rows="[]"
  mapfile -t names < <(jq -r '.agents | keys[]' <<<"$spec")
  local name
  for name in "${names[@]}"; do
    local declared_type exists active
    declared_type=$(jq -r --arg n "$name" '.agents[$n].type // "?"' <<<"$spec")
    exists=$(jq           --arg n "$name" '.agents[$n] != null'     <<<"$reg")
    if [[ "$exists" == "true" ]]; then
      # `systemctl is-active` prints the state AND exits non-zero for anything
      # other than "active". A `|| echo unknown` fallback would therefore
      # append a second line ("inactive\nunknown"); keep the printed state and
      # fall back only when nothing was printed at all.
      active=$(systemctl is-active "5dive-agent@${name}.service" 2>/dev/null) || true
      [[ -n "$active" ]] || active="unknown"
    else
      active="missing"
    fi
    rows=$(jq -c --arg n "$name" --arg t "$declared_type" --arg a "$active" \
      '. + [{name:$n, type:$t, state:$a}]' <<<"$rows")
  done

  if (( JSON_MODE )); then
    ok "" '{file:$f, agents: $rows}' --arg f "$file" --argjson rows "$rows"
  else
    echo "$rows" | jq -r '
      (["NAME","TYPE","STATE"] | @tsv),
      (.[] | [.name, .type, .state] | @tsv)
    ' | column -t -s $'\t'
  fi
}


# -------- 5dive task — host-shared task queue --------

# Print the `5dive task` help text to stdout. The heredoc delimiter is
# unquoted, so ${STATE_DIR} is expanded to the live state directory when the
# text is printed.
_task_usage() {
  cat <<USAGE
5dive task — shared task queue (sqlite at ${STATE_DIR}/tasks/tasks.db)

  5dive task init                                    # one-time root bootstrap of the store
  5dive task add <title...> [--body=<text>] [--priority=low|medium|high|urgent]
                            [--assignee=<agent>] [--parent=<id|DIVE-N>] [--from=<who>]
                            [--recurring="<cron>"]  # recurring=template (5-field cron, e.g. "0 2 * * *")
  5dive task ls [--status=<s>] [--assignee=<agent>] [--mine] [--all] [--recurring]
                                                     # default: open tasks, priority-ordered; --recurring: templates
  5dive task show <id|DIVE-N>                        # full detail + subtasks + blockers
  5dive task assign <id|DIVE-N> <agent>
  5dive task start  <id|DIVE-N>                      # -> in_progress
  5dive task done   <id|DIVE-N> [--result=<text>]    # -> done; --result captures the agent's response
  5dive task cancel <id|DIVE-N> [--result=<text>]    # -> cancelled; --result captures why
  5dive task block   <id|DIVE-N> --by=<id|DIVE-N>    # add a blocks edge, mark blocked
  5dive task unblock <id|DIVE-N> [--by=<id|DIVE-N>]  # drop edge(s); back to todo if clear
  5dive task rm <id|DIVE-N>                          # delete (cascades subtasks + edges)

  # Human Task Inbox — park a task on a human and clear it
  5dive task need <id|DIVE-N> --type=decision|secret|approval|manual --ask="..." [--options=A|B] [--recommend="A"]
    --ask: ONE crisp question + ~1 line essential context, recommendation up front. Heavy detail goes in the task BODY, not the ask.
    --recommend: your advised choice (strongly encouraged for decision/approval). Leads the alert as '✅ Recommended: <X>' and ⭐-marks its button. For a decision it must match one of --options.
                                                     # -> blocked, awaiting a human (decision/secret/approval/manual)
  5dive task inbox                                   # list ONLY human-gated tasks, priority-ordered
  5dive task answer <id|DIVE-N> --value="..."        # record the human's answer, unblock, ping the owning agent

  status: todo | in_progress | blocked | done | cancelled
  Any agent (group claude) can run these without sudo. Add --json for machine output.
USAGE
}

# Entry point for `5dive task <subcommand> [args...]`.
# Routes the first argument to the matching cmd_task_* handler and forwards
# the remaining arguments untouched. With no subcommand the usage text is
# printed and the script exits with E_USAGE; an unrecognised subcommand is
# reported through fail.
cmd_task() {
  if (( $# == 0 )); then
    _task_usage
    exit "$E_USAGE"
  fi

  local verb=$1
  shift

  case "$verb" in
    help | -h | --help)
      _task_usage
      ;;
    init)
      cmd_task_init "$@"
      ;;
    add | new)
      cmd_task_add "$@"
      ;;
    ls | list)
      cmd_task_ls "$@"
      ;;
    show | view)
      cmd_task_show "$@"
      ;;
    assign)
      cmd_task_assign "$@"
      ;;
    start)
      cmd_task_start "$@"
      ;;
    done | close)
      cmd_task_done "$@"
      ;;
    cancel)
      cmd_task_cancel "$@"
      ;;
    block)
      cmd_task_block "$@"
      ;;
    unblock)
      cmd_task_unblock "$@"
      ;;
    need)
      cmd_task_need "$@"
      ;;
    inbox)
      cmd_task_inbox "$@"
      ;;
    answer)
      cmd_task_answer "$@"
      ;;
    rm | delete)
      cmd_task_rm "$@"
      ;;
    *)
      fail "$E_USAGE" "unknown task command: $verb (try: 5dive task --help)"
      ;;
  esac
}

# `5dive task init` — root-only bootstrap of the task store: runs
# tasks_db_init and then reports the store path ($TASKS_DB) via ok.
cmd_task_init() {
  require_root "task init"
  tasks_db_init

  local -a payload=(--arg p "$TASKS_DB")
  ok "tasks store ready at $TASKS_DB" '{path:$p}' "${payload[@]}"
}

# `5dive task add <title...> [flags]` — insert one row into `tasks` and report
# the new id through ok.
#   Bare words (and everything after `--`) are joined into the title.
#   --recurring=/--schedule=<cron> stores a TEMPLATE (kind='recurring') rather
#   than a worked task; it cannot be combined with --parent.
#   --fresh/--no-fresh set the per-task clean-session preference.
cmd_task_add() {
  tasks_db_init
  local body="" priority="medium" assignee="" parent="" from="" recurring="" fresh=""
  local -a title_words=()
  local arg
  while (( $# > 0 )); do
    arg=$1
    shift
    case "$arg" in
      --body=*)                     body=${arg#*=} ;;
      --priority=*)                 priority=${arg#*=} ;;
      --assignee=*)                 assignee=${arg#*=} ;;
      --parent=*)                   parent=${arg#*=} ;;
      --from=*)                     from=${arg#*=} ;;
      --recurring=* | --schedule=*) recurring=${arg#*=} ;;
      --fresh)                      fresh="1" ;;
      --no-fresh)                   fresh="0" ;;
      --)                           title_words+=("$@"); break ;;
      -*)                           fail "$E_USAGE" "unknown flag: $arg" ;;
      *)                            title_words+=("$arg") ;;
    esac
  done

  local title="${title_words[*]:-}"
  if [[ -z "$title" ]]; then
    fail "$E_USAGE" "usage: 5dive task add <title...> [--body=] [--priority=] [--assignee=] [--parent=] [--recurring=\"<cron>\"]"
  fi
  if ! valid_task_priority "$priority"; then
    fail "$E_VALIDATION" "bad priority '$priority' (low|medium|high|urgent)"
  fi

  # A template is cloned into standard todos on schedule and its instances are
  # top-level, so a template with an explicit parent is rejected up front.
  local kind="standard" schedule_sql="NULL"
  if [[ -n "$recurring" ]]; then
    if ! valid_cron_expr "$recurring"; then
      fail "$E_VALIDATION" "bad --recurring '$recurring' (need a 5-field cron expr, e.g. \"0 2 * * *\")"
    fi
    if [[ -n "$parent" ]]; then
      fail "$E_VALIDATION" "--recurring can't be combined with --parent (a template has no parent)"
    fi
    kind="recurring"
    schedule_sql=$(sqlq "$recurring")
  fi

  local parent_sql="NULL"
  if [[ -n "$parent" ]]; then
    resolve_task_id "$parent"
    parent_sql="$RESOLVED_TASK_ID"
  fi

  # fresh column (DIVE-138): an explicit flag always wins; otherwise templates
  # default to 1 and standard tasks stay NULL (agent-level setting applies).
  local fresh_sql
  case "${fresh}:${kind}" in
    :recurring) fresh_sql="1" ;;
    :*)         fresh_sql="NULL" ;;
    *)          fresh_sql="$fresh" ;;
  esac

  local creator
  creator=$(task_actor "$from")
  local id
  id=$(db "INSERT INTO tasks (title, body, priority, assignee, created_by, parent_id, kind, schedule, fresh)
           VALUES ($(sqlq "$title"), $(sqlq_or_null "$body"), $(sqlq "$priority"),
                   $(sqlq_or_null "$assignee"), $(sqlq "$creator"), ${parent_sql},
                   $(sqlq "$kind"), ${schedule_sql}, ${fresh_sql});
           SELECT last_insert_rowid();")

  # jq bindings shared by both envelopes.
  local -a jargs=(--arg i "$id" --arg t "$title" --arg p "$priority" --arg a "${assignee:-}" --arg c "$creator")
  if [[ "$kind" == "recurring" ]]; then
    local fresh_label="off"
    if [[ "$fresh_sql" == "1" ]]; then
      fresh_label="on"
    fi
    ok "created recurring DIVE-$id (${recurring}, fresh=${fresh_label}) — $title" \
       '{id:($i|tonumber), ident:("DIVE-"+$i), title:$t, priority:$p, assignee:$a, created_by:$c, kind:"recurring", schedule:$s, fresh:($f=="1")}' \
       "${jargs[@]}" --arg s "$recurring" --arg f "$fresh_sql"
  else
    ok "created DIVE-$id — $title" \
       '{id:($i|tonumber), ident:("DIVE-"+$i), title:$t, priority:$p, assignee:$a, created_by:$c, kind:"standard"}' \
       "${jargs[@]}"
  fi
}

# `5dive task ls [flags]` — list tasks.
#   default      open standard tasks, priority-ordered
#   --status=    only that status (validated); --all lifts the open-only filter
#   --assignee=  only that assignee; --mine resolves it through task_actor
#   --recurring  list the TEMPLATES (kind='recurring') instead of real work
# JSON mode emits {ok:true, data:{tasks:[...]}}; text mode renders a box table.
cmd_task_ls() {
  tasks_db_init
  local status="" assignee="" from="" arg
  local mine=0 all=0 recurring=0
  for arg in "$@"; do
    case "$arg" in
      --status=*)   status=${arg#*=} ;;
      --assignee=*) assignee=${arg#*=} ;;
      --from=*)     from=${arg#*=} ;;
      --mine)       mine=1 ;;
      --all)        all=1 ;;
      --recurring)  recurring=1 ;;
      -*)           fail "$E_USAGE" "unknown flag: $arg" ;;
      *)            fail "$E_USAGE" "unexpected arg: $arg" ;;
    esac
  done
  if (( mine == 1 )); then
    assignee=$(task_actor "$from")
  fi

  # Collect the WHERE terms in order, then join them with " AND ". Templates
  # are never worked directly, so the normal board always filters them out.
  local -a conds=("1=1")
  local order
  if (( recurring )); then
    conds+=("kind='recurring'")
    order="ORDER BY id"
  else
    conds+=("kind='standard'")
    if [[ -n "$status" ]]; then
      if ! valid_task_status "$status"; then
        fail "$E_VALIDATION" "bad status '$status' (todo|in_progress|blocked|done|cancelled)"
      fi
      conds+=("status=$(sqlq "$status")")
    elif (( all != 1 )); then
      conds+=("status NOT IN ('done','cancelled')")
    fi
    order="ORDER BY CASE priority WHEN 'urgent' THEN 0 WHEN 'high' THEN 1 WHEN 'medium' THEN 2 ELSE 3 END, created_at"
  fi
  if [[ -n "$assignee" ]]; then
    conds+=("assignee=$(sqlq "$assignee")")
  fi
  local where
  printf -v where '%s AND ' "${conds[@]}"
  where=${where%' AND '}

  if (( JSON_MODE )); then
    local rows
    rows=$(dbfmt -json "SELECT id, ident, title, status, priority, assignee, created_by, parent_id, created_at, done_at, body, result, need_type, ask, need_options, need_answer, need_answered_at, kind, schedule, last_fired_at FROM tasks WHERE ${where} ${order};")
    if [[ -z "$rows" ]]; then
      rows="[]"
    fi
    # Rows travel over stdin rather than --argjson: a large board exceeds the
    # per-argument size limit and execve fails with E2BIG. (DIVE-222)
    printf '%s' "$rows" | jq -c '{ok:true, data:{tasks:.}}'
    return
  fi

  if (( recurring )); then
    dbfmt -box "SELECT ident, schedule, COALESCE(assignee,'-') AS assignee, COALESCE(last_fired_at,'never') AS last_fired, title FROM tasks WHERE ${where} ${order};"
  else
    dbfmt -box "SELECT ident, status, priority, COALESCE(assignee,'-') AS assignee, title FROM tasks WHERE ${where} ${order};"
  fi
}

# `5dive task show <id|DIVE-N>` — print one task in full.
#   JSON mode: {ok:true, data:{task, subtasks, blocked_by}} on stdout.
#   Text mode: the task's fields (dbfmt -line), then optional "human gate:",
#              "subtasks:" and "blocked by:" sections, each printed only when
#              it has content.
# Returns 0 on success in both modes.
cmd_task_show() {
  tasks_db_init
  [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive task show <id|DIVE-N>"
  resolve_task_id "$1"
  local id="$RESOLVED_TASK_ID"

  if (( JSON_MODE )); then
    local task subs deps
    task=$(dbfmt -json "SELECT * FROM tasks WHERE id=${id};")
    subs=$(dbfmt -json "SELECT id,ident,title,status FROM tasks WHERE parent_id=${id} ORDER BY id;")
    deps=$(dbfmt -json "SELECT t.id,t.ident,t.title,t.status FROM task_deps d JOIN tasks t ON t.id=d.blocked_by WHERE d.task_id=${id} ORDER BY t.id;")
    # Empty result sets come back as "", not "[]"; normalise all three so the
    # slurp below always sees exactly three JSON arrays.
    [[ -n "$task" ]] || task="[]"
    [[ -n "$subs" ]] || subs="[]"
    [[ -n "$deps" ]] || deps="[]"
    # Feed the documents over stdin (slurped into [task, subs, deps]) instead
    # of --argjson: a task with a very large body/result would otherwise blow
    # the per-argument size limit (E2BIG) — same guard as `task ls` (DIVE-222).
    printf '%s\n%s\n%s\n' "$task" "$subs" "$deps" \
      | jq -cs '{ok:true, data:{task:(.[0][0]), subtasks:.[1], blocked_by:.[2]}}'
    return
  fi

  dbfmt -line "SELECT ident, title, status, priority, assignee, created_by, parent_id, created_at, started_at, done_at, body, result FROM tasks WHERE id=${id};"

  # Human gate (only when set) — mirrors the conditional subtasks/blockers
  # sections below so an ordinary task's `show` stays clean. A secret gate's
  # answer is never echoed, only the fact that one was provided.
  local gate
  gate=$(db "SELECT 'type: '||need_type||
                    CASE WHEN need_options IS NOT NULL THEN '  options: '||need_options ELSE '' END||
                    CASE WHEN recommend IS NOT NULL THEN x'0a'||'recommend: '||recommend ELSE '' END||x'0a'||
                    'ask:  '||COALESCE(ask,'')||
                    CASE WHEN need_answered_at IS NOT NULL
                         THEN x'0a'||'answer: '||CASE WHEN need_type='secret' THEN '(provided — loaded out-of-band)' ELSE COALESCE(need_answer,'') END||'  ('||need_answered_at||')'
                         ELSE x'0a'||'answer: — pending' END
             FROM tasks WHERE id=${id} AND need_type IS NOT NULL;")
  if [[ -n "$gate" ]]; then
    echo
    echo "human gate:"
    printf '%s\n' "$gate" | indent2
  fi

  local subs
  subs=$(db "SELECT ident||'  ['||status||']  '||title FROM tasks WHERE parent_id=${id} ORDER BY id;")
  if [[ -n "$subs" ]]; then
    echo
    echo "subtasks:"
    printf '%s\n' "$subs" | indent2
  fi

  local deps
  deps=$(db "SELECT t.ident||'  ['||t.status||']  '||t.title FROM task_deps d JOIN tasks t ON t.id=d.blocked_by WHERE d.task_id=${id} ORDER BY t.id;")
  if [[ -n "$deps" ]]; then
    echo
    echo "blocked by:"
    printf '%s\n' "$deps" | indent2
  fi

  # Explicit success: a trailing `[[ ... ]] && {...}` would make the function
  # return 1 whenever the task has no blockers, and under `set -e` that
  # non-zero return aborts the script after a perfectly good `show`.
  return 0
}

# `5dive task assign <id|DIVE-N> <agent>` — set a task's assignee.
cmd_task_assign() {
  tasks_db_init
  if (( $# < 2 )); then
    fail "$E_USAGE" "usage: 5dive task assign <id|DIVE-N> <agent>"
  fi
  resolve_task_id "$1"
  local id="$RESOLVED_TASK_ID"
  local who="$2"
  local who_sql
  who_sql=$(sqlq "$who")

  # A hand-off to a DIFFERENT owner restarts the in_progress clock. SQLite
  # evaluates column references in SET against the pre-update row, so the
  # `assignee IS NOT <who>` test sees the previous owner. Leaving the old
  # started_at in place would let the heartbeat stale-reaper (_hb_reap_stale)
  # cancel the task on the new owner's first tick, before they touch it.
  db "UPDATE tasks SET assignee=${who_sql},
        started_at=CASE WHEN status='in_progress' AND assignee IS NOT ${who_sql}
                        THEN datetime('now') ELSE started_at END
      WHERE id=${id};"

  local -a payload=(--arg i "$id" --arg a "$who")
  ok "DIVE-$id assigned to $who" '{id:($i|tonumber), assignee:$a}' "${payload[@]}"
}

# Shared implementation behind `task start|done|cancel`.
#   $1  new status value written to tasks.status
#   $2  extra SET fragment appended verbatim (leading comma included)
#   $3  verb, used in messages and to decide whether --notify applies
#   $@  remaining CLI args: <id|DIVE-N> [--result=<text>] [--notify]
_task_status_cmd() {
  local newstatus="$1" extra="$2" verb="$3"
  shift 3
  tasks_db_init

  local result="" want_result=0 notify=0 arg
  local -a positional=()
  while (( $# > 0 )); do
    arg=$1
    shift
    case "$arg" in
      --result=*) result=${arg#*=}; want_result=1 ;;
      --notify)   notify=1 ;;
      --)         positional+=("$@"); break ;;
      -*)         fail "$E_USAGE" "unknown flag: $arg" ;;
      *)          positional+=("$arg") ;;
    esac
  done
  if (( ${#positional[@]} == 0 )); then
    fail "$E_USAGE" "usage: 5dive task $verb <id|DIVE-N> [--result=<text>] [--notify]"
  fi
  resolve_task_id "${positional[0]}"
  local id="$RESOLVED_TASK_ID"

  # Only touch the result column when --result was actually passed.
  local sql="UPDATE tasks SET status=$(sqlq "$newstatus")${extra}"
  if (( want_result )); then
    sql+=", result=$(sqlq_or_null "$result")"
  fi
  db "${sql} WHERE id=${id};"

  # --notify applies to done/cancel only: DM the paired human a one-line
  # summary. Best-effort — it must never fail the status write above.
  #
  # Tasks materialized from a recurring template (from_template_id set) are
  # agent housekeeping, so the live ping is dropped for them; their result
  # still lands on the record. The lookup fails open (empty => notify) so a DB
  # hiccup can't silently swallow a real finish line.
  if (( notify )) && [[ "$verb" == "done" || "$verb" == "cancel" ]]; then
    local template_id
    template_id=$(db "SELECT COALESCE(from_template_id,'') FROM tasks WHERE id=${id};" 2>/dev/null || echo "")
    if [[ -z "$template_id" ]]; then
      _task_close_notify "DIVE-$id" "$verb" "$result" || true
    fi
  fi

  local -a payload=(--arg i "$id" --arg s "$newstatus")
  ok "DIVE-$id $verb" '{id:($i|tonumber), status:$s}' "${payload[@]}"
}

# Thin verb wrappers over _task_status_cmd, which takes
# <new status> <extra SET fragment> <verb> followed by the CLI args.

# start: the COALESCE keeps an existing started_at instead of overwriting it.
cmd_task_start() {
  _task_status_cmd in_progress \
    ", started_at=COALESCE(started_at, datetime('now'))" \
    start "$@"
}

# done: stamps done_at with the current time.
cmd_task_done() {
  _task_status_cmd done \
    ", done_at=datetime('now')" \
    done "$@"
}

# cancel: also stamps done_at, with status 'cancelled'.
cmd_task_cancel() {
  _task_status_cmd cancelled \
    ", done_at=datetime('now')" \
    cancel "$@"
}

# `5dive task block <id|DIVE-N> --by=<id|DIVE-N>` — record that <task> is
# blocked by <by> (an edge in task_deps) and mark <task> blocked unless it is
# already done/cancelled.
# Fails with:
#   E_USAGE       missing/extra arguments or an unknown flag
#   E_VALIDATION  a task blocking itself
#   E_CONFLICT    the edge would close a dependency cycle
cmd_task_block() {
  tasks_db_init
  local task="" by=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --by=*) by="${1#*=}" ;;
      -*)     fail "$E_USAGE" "unknown flag: $1" ;;
      *)
        # Exactly one positional (the task) is allowed.
        if [[ -z "$task" ]]; then
          task="$1"
        else
          fail "$E_USAGE" "unexpected arg: $1"
        fi
        ;;
    esac
    shift
  done
  [[ -n "$task" && -n "$by" ]] || fail "$E_USAGE" "usage: 5dive task block <id|DIVE-N> --by=<id|DIVE-N>"
  resolve_task_id "$task"; local tid="$RESOLVED_TASK_ID"
  resolve_task_id "$by";   local bid="$RESOLVED_TASK_ID"
  [[ "$tid" != "$bid" ]] || fail "$E_VALIDATION" "a task can't block itself"

  # Self-blocking is only the shortest cycle. Walk everything that already
  # blocks <by> (directly or transitively); if <task> is among it, adding
  # "<task> blocked by <by>" would leave both tasks waiting on each other.
  # UNION (not UNION ALL) de-duplicates rows, so the walk terminates even if
  # the table already contains a cycle.
  local cycle
  cycle=$(db "WITH RECURSIVE upstream(id) AS (
                SELECT blocked_by FROM task_deps WHERE task_id=${bid}
                UNION
                SELECT d.blocked_by FROM task_deps d JOIN upstream u ON d.task_id=u.id
              )
              SELECT 1 FROM upstream WHERE id=${tid} LIMIT 1;")
  [[ -z "$cycle" ]] || fail "$E_CONFLICT" "DIVE-$bid is already blocked (directly or transitively) by DIVE-$tid — that edge would create a dependency cycle"

  db "INSERT OR IGNORE INTO task_deps (task_id, blocked_by) VALUES (${tid}, ${bid});
      UPDATE tasks SET status='blocked' WHERE id=${tid} AND status NOT IN ('done','cancelled');"
  ok "DIVE-$tid blocked by DIVE-$bid" '{task:($t|tonumber), blocked_by:($b|tonumber)}' --arg t "$tid" --arg b "$bid"
}

# `5dive task unblock <id|DIVE-N> [--by=<id|DIVE-N>]` — drop one blocking edge
# (with --by) or all of them, then flip the task back to todo if nothing else
# is holding it.
cmd_task_unblock() {
  tasks_db_init
  local task="" by="" arg
  for arg in "$@"; do
    case "$arg" in
      --by=*)
        by=${arg#*=}
        ;;
      -*)
        fail "$E_USAGE" "unknown flag: $arg"
        ;;
      *)
        # Only one positional (the task) is accepted.
        if [[ -n "$task" ]]; then
          fail "$E_USAGE" "unexpected arg: $arg"
        fi
        task=$arg
        ;;
    esac
  done
  if [[ -z "$task" ]]; then
    fail "$E_USAGE" "usage: 5dive task unblock <id|DIVE-N> [--by=<id|DIVE-N>]"
  fi

  resolve_task_id "$task"
  local tid="$RESOLVED_TASK_ID"

  # Start from "all edges of this task" and narrow to one blocker on request.
  local drop_sql="DELETE FROM task_deps WHERE task_id=${tid}"
  if [[ -n "$by" ]]; then
    resolve_task_id "$by"
    drop_sql+=" AND blocked_by=${RESOLVED_TASK_ID}"
  fi
  db "${drop_sql};"

  # A still-pending human gate (need_type set, need_answered_at NULL) must not
  # be flipped back to todo here (DIVE-109); only edge-blocks clear, and only
  # once no edges remain.
  db "UPDATE tasks SET status='todo'
      WHERE id=${tid} AND status='blocked'
        AND (need_type IS NULL OR need_answered_at IS NOT NULL)
        AND NOT EXISTS (SELECT 1 FROM task_deps WHERE task_id=${tid});"

  ok "DIVE-$tid unblocked" '{task:($t|tonumber)}' --arg t "$tid"
}

# --- Human Task Inbox (DIVE-103; parent feature DIVE-102) ----------------
# `need` parks a task on a human; `inbox` lists what's waiting; `answer`
# records the human's reply, unblocks, and pings the agent that hit the gate.

# `5dive task need <id|DIVE-N> --type=... --ask="..." [--options=A|B]
# [--recommend=X] [--from=<who>]` — park a task on a human: validates the gate
# shape, writes it to the task row (status -> blocked), then fires a
# best-effort alert via task_need_notify.
cmd_task_need() {
  tasks_db_init
  local type="" ask="" options="" recommend="" from="" arg
  local -a positional=()
  while (( $# > 0 )); do
    arg=$1
    shift
    case "$arg" in
      --type=*)      type=${arg#*=} ;;
      --ask=*)       ask=${arg#*=} ;;
      --options=*)   options=${arg#*=} ;;
      --recommend=*) recommend=${arg#*=} ;;
      --from=*)      from=${arg#*=} ;;
      --)            positional+=("$@"); break ;;
      -*)            fail "$E_USAGE" "unknown flag: $arg" ;;
      *)             positional+=("$arg") ;;
    esac
  done
  if (( ${#positional[@]} == 0 )); then
    fail "$E_USAGE" "usage: 5dive task need <id|DIVE-N> --type=decision|secret|approval|manual --ask=\"...\" [--options=A|B] [--recommend=\"A\"]"
  fi
  resolve_task_id "${positional[0]}"
  local id="$RESOLVED_TASK_ID"

  if ! valid_need_type "$type"; then
    fail "$E_VALIDATION" "bad --type '$type' (decision|secret|approval|manual)"
  fi
  if [[ -z "$ask" ]]; then
    fail "$E_USAGE" "--ask is required (what does the human need to provide?)"
  fi
  # --options is the choice list of a decision; any other type carrying it
  # would misrepresent the gate shape to the dashboard.
  if [[ -n "$options" && "$type" != "decision" ]]; then
    fail "$E_VALIDATION" "--options only applies to --type=decision"
  fi

  # DIVE-148: --recommend is the agent's advised choice. It is accepted only
  # on the two finite-choice gate types. On a decision it has to equal one of
  # the options after the same split rule the buttons use (split on '|', trim,
  # drop empties); on an approval it is free text.
  if [[ -n "$recommend" ]]; then
    if [[ "$type" == "decision" ]]; then
      if [[ -z "$options" ]]; then
        fail "$E_VALIDATION" "--recommend on a decision needs --options to match against"
      fi
      local hit
      hit=$(printf '%s' "$options" | jq -Rr --arg r "$recommend" '
          [ split("|")[] | gsub("^\\s+|\\s+$"; "") | select(length > 0) ]
          | (($r | gsub("^\\s+|\\s+$"; "")) as $rr | any(.[]; . == $rr)) | tostring' 2>/dev/null) || hit="false"
      if [[ "$hit" != "true" ]]; then
        fail "$E_VALIDATION" "--recommend \"$recommend\" must match one of --options ($options)"
      fi
    elif [[ "$type" != "approval" ]]; then
      fail "$E_VALIDATION" "--recommend only applies to --type=decision or --type=approval"
    fi
  fi

  # A closed task waits on no one; refuse to gate it.
  local cur
  cur=$(db "SELECT status FROM tasks WHERE id=${id};")
  case "$cur" in
    done | cancelled)
      fail "$E_CONFLICT" "DIVE-$id is $cur — reopen it before gating on a human"
      ;;
  esac

  # The actor hitting the gate becomes the assignee so `task answer` knows who
  # to ping on resume. The inbox keys off need_type, not assignee, so the
  # human still sees it.
  local actor
  actor=$(task_actor "$from")
  db "UPDATE tasks
        SET status='blocked', assignee=$(sqlq "$actor"),
            need_type=$(sqlq "$type"), ask=$(sqlq "$ask"),
            need_options=$(sqlq_or_null "$options"),
            recommend=$(sqlq_or_null "$recommend"),
            need_answer=NULL, need_answered_at=NULL
      WHERE id=${id};"

  # DIVE-105: alert the paired human immediately. Strictly best-effort — the
  # gate write above has already been issued and a failed DM must not fail it.
  task_need_notify "DIVE-$id" "$type" "$ask" "$options" "$recommend" || true

  local -a payload=(--arg i "$id" --arg ty "$type" --arg ak "$ask" --arg op "$options" --arg rc "$recommend" --arg ac "$actor")
  ok "DIVE-$id needs a human ($type) — $ask" \
     '{id:($i|tonumber), ident:("DIVE-"+$i), status:"blocked", need_type:$ty, ask:$ak, need_options:(($op|select(length>0)) // null), recommend:(($rc|select(length>0)) // null), assignee:$ac}' \
     "${payload[@]}"
}

# _task_owner_channel — resolve the filing agent's bot token + the per-type
# access.json that holds the paired human's DM/group targets. Sets globals
# TASK_CH_TOKEN / TASK_CH_ACCESS / TASK_CH_TYPE and returns 0 on success, 1 if
# anything is missing (so callers `_task_owner_channel || return 0` to stay
# best-effort — a missing channel must never fail a committed DB write). Works
# whether run directly as agent-<name> (common — task verbs need no sudo) or via
# sudo (resolved like task_actor; token from the group-claude-readable connector
# file or an inherited env var). Shared by task_need_notify + _task_close_notify.
TASK_CH_TOKEN="" TASK_CH_ACCESS="" TASK_CH_TYPE=""
# Populates TASK_CH_TOKEN / TASK_CH_ACCESS / TASK_CH_TYPE for the calling
# agent. Returns 0 once a readable access.json is found, 1 if the agent name,
# the bot token, or the access file cannot be determined; on failure the three
# globals are left empty.
_task_owner_channel() {
  TASK_CH_TOKEN=""
  TASK_CH_ACCESS=""
  TASK_CH_TYPE=""

  # Agent name: the sudo-derived sender wins; otherwise strip the `agent-`
  # prefix off the current login, if it has one.
  local agent
  agent=$(auto_sender_from_sudo)
  if [[ -z "$agent" ]]; then
    local login="${USER:-$(id -un 2>/dev/null)}"
    case "$login" in
      agent-*) agent=${login#agent-} ;;
    esac
  fi
  if [[ -z "$agent" ]]; then
    return 1
  fi

  # Bot token: first TELEGRAM_BOT_TOKEN= line of the connector env file, with
  # the inherited environment variable as fallback.
  local env_file="${CONNECTORS_DIR}/telegram-${agent}.env"
  local token=""
  if [[ -r "$env_file" ]]; then
    token=$(sed -n 's/^TELEGRAM_BOT_TOKEN=//p' "$env_file" | head -1)
  fi
  if [[ -z "$token" ]]; then
    token="${TELEGRAM_BOT_TOKEN:-}"
  fi
  if [[ -z "$token" ]]; then
    return 1
  fi

  # Probe each channel type in order; the first with a readable access.json
  # wins and also determines TASK_CH_TYPE.
  local kind dir
  for kind in claude codex grok antigravity; do
    dir=$(_tg_access_state_dir "agent-${agent}" "$kind") || continue
    [[ -r "${dir}/access.json" ]] || continue
    TASK_CH_TOKEN="$token"
    TASK_CH_ACCESS="${dir}/access.json"
    TASK_CH_TYPE="$kind"
    return 0
  done
  return 1
}

# _task_send_owner — send ONE message ($1, optional reply_markup $2) to the
# paired human, using the channel resolved by _task_owner_channel. Routing
# (DIVE-259, Mark): follow the conversation — if the telegram plugin recorded
# where the human last talked to this agent (last-human-chat.json beside
# access.json), the alert and its tap buttons go THERE, but only when that
# chat is still allowlisted in access.json (a stale or hand-edited pointer
# must never widen the audience). No pointer (plugin predates the feature) =
# legacy flow: human DMs first (allowFrom — exactly the users who /started
# the bot), then the agent's bound forum topic(s) so nothing is silently
# lost. Always returns 0 (best-effort).
# Sends one message ($1, with optional reply_markup $2) through _mirror_post,
# using the token/access file left in TASK_CH_TOKEN / TASK_CH_ACCESS.
# Target selection, in order:
#   1. the chat named in last-human-chat.json (next to access.json), but only
#      while access.json still lists it under allowFrom or groups;
#   2. every allowFrom entry;
#   3. if nothing was sent in step 2, every groups entry (with its thread id).
# Always returns 0.
_task_send_owner() {
  local text="$1" reply_markup="${2:-}"
  local bot_token="$TASK_CH_TOKEN" acl_file="$TASK_CH_ACCESS"

  # --- 1. follow the conversation --------------------------------------------
  local pointer="${acl_file%/*}/last-human-chat.json"
  if [[ -r "$pointer" ]]; then
    local last_chat last_thread
    last_chat=$(jq -r '.chatId // empty' "$pointer" 2>/dev/null) || last_chat=""
    last_thread=$(jq -r '.messageThreadId // empty' "$pointer" 2>/dev/null) || last_thread=""
    if [[ -n "$last_chat" ]]; then
      if jq -e --arg c "$last_chat" '(.allowFrom // []) | index($c) != null' "$acl_file" >/dev/null 2>&1; then
        # Allowlisted DM: no thread id.
        _mirror_post "$bot_token" "$last_chat" "" "$text" "$acl_file" "$reply_markup"
        return 0
      elif jq -e --arg c "$last_chat" '(.groups // {}) | has($c)' "$acl_file" >/dev/null 2>&1; then
        # Allowlisted group: keep the recorded thread.
        _mirror_post "$bot_token" "$last_chat" "$last_thread" "$text" "$acl_file" "$reply_markup"
        return 0
      fi
      # The pointer names a chat that is no longer allowed — fall through.
    fi
  fi

  # --- 2. legacy flow: every allowlisted DM ------------------------------------
  local dm_targets delivered=0 target
  dm_targets=$(jq -r '(.allowFrom // [])[]' "$acl_file" 2>/dev/null) || dm_targets=""
  while IFS= read -r target; do
    [[ -n "$target" ]] || continue
    _mirror_post "$bot_token" "$target" "" "$text" "$acl_file" "$reply_markup"
    delivered=1
  done <<<"$dm_targets"
  if (( delivered )); then
    return 0
  fi

  # --- 3. no DM went out: fall back to the bound groups ------------------------
  local entries count idx g_chat g_thread
  entries=$(jq -c '(.groups // {}) | to_entries' "$acl_file" 2>/dev/null) || entries="[]"
  count=$(jq 'length' <<<"$entries" 2>/dev/null) || count=0
  count=${count:-0}
  for (( idx = 0; idx < count; idx++ )); do
    g_chat=$(jq -r ".[$idx].key" <<<"$entries" 2>/dev/null) || continue
    g_thread=$(jq -r ".[$idx].value.message_thread_id // \"\"" <<<"$entries" 2>/dev/null) || g_thread=""
    if [[ -z "$g_chat" ]]; then
      continue
    fi
    _mirror_post "$bot_token" "$g_chat" "$g_thread" "$text" "$acl_file" "$reply_markup"
  done
  return 0
}

# _task_close_notify — DM the paired human a one-line ✅/⚠️ summary when a task
# is closed with --notify (used by the heartbeat nudge so autonomous queue work
# surfaces a finish line without full progress streaming). Best-effort: every
# miss returns 0 so it can't fail the status write the caller just committed.
# Builds the one-line close summary for a task and hands it to
# _task_send_owner.
#   $1 ident (e.g. DIVE-12)   $2 verb (cancel => ⚠️, anything else => ✅)
#   $3 result text (optional; only its first line is included)
# Returns 0 on every path, including when no owner channel can be resolved.
_task_close_notify() {
  local ident="$1" verb="$2" result="$3"
  if ! _task_owner_channel; then
    return 0
  fi

  local headline
  case "$verb" in
    cancel) headline="⚠️ [${ident}] cancelled" ;;
    *)      headline="✅ [${ident}] done" ;;
  esac

  # Keep the ping short: results lead with a one-line summary, and the full
  # text remains on the task record for `task show`.
  if [[ -n "$result" ]]; then
    local first_line="${result%%$'\n'*}"
    headline+=": ${first_line}"
  fi

  _task_send_owner "$headline" ""
  return 0
}

# task_need_notify — DIVE-105: the instant a human gate is filed, DM the paired
# human ONE alert so it doesn't sit unseen until someone opens the dashboard.
# Best-effort + self-gating in the shape of mirror_interagent_outbound, and
# reusing its _mirror_post send path (migration self-heal included). EVERY exit
# path returns 0: a missing token / access.json / dead Telegram call must NEVER
# block or fail the gate write (the DB UPDATE already committed before we run).
# The caller also invokes us as `... || true`, so set -e can't trip on anything
# inside either.
#
# Works whether `task need` is run directly as agent-<name> (the common path —
# task verbs need no sudo) OR via sudo: the agent is resolved the same way
# task_actor does; the token comes from the group-claude-readable connector
# file (or an inherited env var); and access.json is found by probing the
# per-type channel dirs (own file when direct, root-readable when sudo).
task_need_notify() {
  # Arguments:
  #   $1 ident      task identifier in DIVE-<n> form (the numeric part becomes
  #                 the callback payload id)
  #   $2 need_type  gate type; only "decision" and "approval" get buttons
  #   $3 ask        the question shown to the human
  #   $4 options    '|'-separated choice list (used for decisions)
  #   $5 recommend  optional advised choice; empty = no recommendation line
  # Returns 0 on every path.
  local ident="$1" need_type="$2" ask="$3" options="$4" recommend="${5:-}"

  # Resolve bot token + the human's DM/group targets (TASK_CH_* globals). The
  # matched access type (TASK_CH_TYPE) gates the tap-to-answer buttons below.
  # No channel = nothing to send; still a success for the caller.
  _task_owner_channel || return 0

  # One message. Blank lines separate the header / ask / options so a long ask
  # doesn't render as an unreadable wall on mobile. No footer: tap buttons cover
  # decision/approval, and button-less gates (secret/manual) still surface on
  # the dashboard "Needs you" card — a redirect line is just noise in chat.
  # Options are listed one per line (numbered to match the tap buttons) so long
  # labels stay readable even when Telegram crops the button text.
  # DIVE-148: lead with the agent's recommendation (✅ Recommended: <X>) before
  # the ask, so the human sees the advised choice first instead of hunting for
  # it. Applies to decision + approval gates; NULL/empty recommend = no line.
  local text="🙋 [${ident}] needs you"
  [[ -n "$recommend" ]] && text+=$'\n\n'"✅ Recommended: ${recommend}"
  text+=$'\n\n'"${ask}"
  if [[ "$need_type" == "decision" && -n "$options" ]]; then
    local opts_list
    # ⭐-mark the recommended option in the numbered list (numbering stays the
    # original option order so it still maps to need_options on the dashboard).
    # A jq failure degrades to "no options list" rather than aborting the alert.
    opts_list=$(printf '%s' "$options" | jq -Rr --arg r "$recommend" '
      ($r | gsub("^\\s+|\\s+$"; "")) as $rr
      | [ split("|")[] | gsub("^\\s+|\\s+$"; "") | select(length > 0) ]
      | to_entries | map("  \(.key + 1). \(.value)\(if .value == $rr and ($rr|length)>0 then " ⭐" else "" end)") | join("\n")' 2>/dev/null) || opts_list=""
    [[ -n "$opts_list" ]] && text+=$'\n\n'"Options:"$'\n'"${opts_list}"
  fi

  # DIVE-117/118 tap-to-answer buttons. GATED to the plugin types whose `tna:`
  # callback_query handler exists AND splits options byte-identically to this
  # emit: claude, codex, grok, antigravity (DIVE-118 — parity verified against
  # the actual handlers). opencode has no `tna:` handler yet, so it stays
  # excluded to avoid dead taps; add it here when its handler lands. Explicit
  # allowlist (not != "") so a future new plugin type never auto-emits dead
  # taps.
  # Only finite-option gates get buttons: decision-with-options (index into
  # need_options) and approval (approved/denied). callback_data is
  # `tna:<numericId>:<idx|approved|denied>` — numeric id + index keeps it under
  # Telegram's 64-byte cap; the value is re-resolved from the DB on tap, never
  # trusted from the payload.
  # The option-split rule here MUST be byte-identical to the plugin's `tna:`
  # handler (split '|', trim, drop empties) or a tapped index resolves the wrong
  # option. Filtering empties also avoids an empty-text button (Telegram rejects
  # it, which would 400 the whole message — see the text-fallback in
  # _mirror_post). If nothing survives the filter, emit no keyboard (plain text).
  local reply_markup="" numid="${ident#DIVE-}"
  if [[ "$TASK_CH_TYPE" =~ ^(claude|codex|grok|antigravity)$ ]]; then
    if [[ "$need_type" == "decision" && -n "$options" ]]; then
      # Adaptive layout: greedily pack buttons onto a row up to a ~24-char width
      # budget (max 3 per row), so SHORT options share a row while a LONG label
      # breaks onto its own full-width row instead of being cropped. A single
      # over-budget label still lands alone (we always seat the first button of
      # an empty row). Index (to_entries .key) is the tna: payload, unchanged.
      # DIVE-148: ⭐-prefix the recommended option's button and sort it first so
      # the human's eye lands on it. callback_data keeps the ORIGINAL option
      # index (.key) — the tna: handler resolves the value by that index into
      # need_options, so reordering the display must not renumber the payload.
      # A jq failure falls back to an empty reply_markup (plain-text alert).
      reply_markup=$(printf '%s' "$options" | jq -Rc --arg id "$numid" --arg r "$recommend" '
        ($r | gsub("^\\s+|\\s+$"; "")) as $rr
        | [ split("|")[] | gsub("^\\s+|\\s+$"; "") | select(length > 0) ] as $o
        | ($o | to_entries
           | sort_by(.value == $rr and ($rr|length)>0 | not)
           | reduce .[] as $e ({rows: [], cur: [], w: 0};
               (($e.value | length) + (if $e.value == $rr and ($rr|length)>0 then 2 else 0 end)) as $len
               | {text: (if $e.value == $rr and ($rr|length)>0 then "⭐ " + $e.value else $e.value end), callback_data: ("tna:" + $id + ":" + ($e.key | tostring))} as $btn
               | if (.cur | length) > 0 and ((.cur | length) >= 3 or (.w + $len + 2) > 24)
                 then {rows: (.rows + [.cur]), cur: [$btn], w: $len}
                 else {rows: .rows, cur: (.cur + [$btn]), w: (.w + $len + 2)}
                 end)
           | .rows + (if (.cur | length) > 0 then [.cur] else [] end)) as $kb
        | if ($kb | length) > 0 then {inline_keyboard: $kb} else empty end' 2>/dev/null) || reply_markup=""
    elif [[ "$need_type" == "approval" ]]; then
      # DIVE-148: ⭐-mark whichever button the agent recommended (approved/denied)
      # and seat it first. Default order (Approve, Deny) when no recommendation.
      # The recommendation is compared lowercased with all whitespace removed.
      local _rl; _rl=$(printf '%s' "$recommend" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
      # The keyboard JSON is assembled by string concatenation; the only
      # interpolated value is numid (ident with its DIVE- prefix stripped).
      local _appr='{"text":"✅ Approve","callback_data":"tna:'"${numid}"':approved"}'
      local _deny='{"text":"🚫 Deny","callback_data":"tna:'"${numid}"':denied"}'
      case "$_rl" in
        approve|approved) _appr='{"text":"⭐ ✅ Approve","callback_data":"tna:'"${numid}"':approved"}'
                          reply_markup='{"inline_keyboard":[['"$_appr"','"$_deny"']]}' ;;
        deny|denied)      _deny='{"text":"⭐ 🚫 Deny","callback_data":"tna:'"${numid}"':denied"}'
                          reply_markup='{"inline_keyboard":[['"$_deny"','"$_appr"']]}' ;;
        *)                reply_markup='{"inline_keyboard":[['"$_appr"','"$_deny"']]}' ;;
      esac
    fi
  fi

  # Delivery is best-effort; the explicit `return 0` keeps this function's
  # status independent of whatever the send helper reports.
  _task_send_owner "$text" "$reply_markup"
  return 0
}

# `5dive task inbox` — list only the tasks currently waiting on a human.
# Takes no arguments: any flag or positional is rejected with E_USAGE.
cmd_task_inbox() {
  tasks_db_init
  if (( $# > 0 )); then
    if [[ "$1" == -* ]]; then
      fail "$E_USAGE" "unknown flag: $1"
    fi
    fail "$E_USAGE" "unexpected arg: $1 (inbox takes no positional args)"
  fi

  # "Pending gate" is defined by need_type set + not yet answered rather than
  # by `status`, because a task can be human-gated and edge-blocked at once.
  # Terminal statuses are excluded: a closed task waits on no one, so a stale
  # unanswered gate must not surface here.
  local where="need_type IS NOT NULL AND need_answered_at IS NULL AND status NOT IN ('done','cancelled')"
  local order="ORDER BY CASE priority WHEN 'urgent' THEN 0 WHEN 'high' THEN 1 WHEN 'medium' THEN 2 ELSE 3 END, created_at"

  if (( JSON_MODE )); then
    local rows
    rows=$(dbfmt -json "SELECT id, ident, title, status, priority, assignee, created_by, parent_id, created_at, need_type, ask, need_options, recommend, need_answer, need_answered_at FROM tasks WHERE ${where} ${order};")
    if [[ -z "$rows" ]]; then
      rows="[]"
    fi
    # Rows go over stdin, not --argjson — same argument-size guard as
    # `task ls`. (DIVE-222)
    printf '%s' "$rows" | jq -c '{ok:true, data:{inbox:.}}'
    return
  fi

  local pending
  pending=$(db "SELECT COUNT(*) FROM tasks WHERE ${where};")
  if [[ "$pending" != "0" ]]; then
    dbfmt -box "SELECT ident, priority, need_type, COALESCE(assignee,'-') AS owner, COALESCE(recommend,'-') AS recommend, ask FROM tasks WHERE ${where} ${order};"
  else
    echo "inbox empty — nothing waiting on a human."
  fi
}

# `5dive task answer <id|DIVE-N> [--value=...] [--from=...]` — record the
# human's answer to a pending gate, recompute the task status, and ping the
# owning agent so it resumes. Secret gates take no --value (see below).
cmd_task_answer() {
  tasks_db_init
  local value="" value_set=0 from=""
  local -a positional=()
  local arg
  while (( $# > 0 )); do
    arg="$1"
    shift
    case "$arg" in
      --value=*) value="${arg#*=}"; value_set=1 ;;
      --from=*)  from="${arg#*=}" ;;
      --)        positional+=("$@"); break ;;   # everything after -- is positional
      -*)        fail "$E_USAGE" "unknown flag: $arg" ;;
      *)         positional+=("$arg") ;;
    esac
  done
  if (( ${#positional[@]} == 0 )); then
    fail "$E_USAGE" "usage: 5dive task answer <id|DIVE-N> --value=\"...\"  (omit --value for a secret gate)"
  fi
  resolve_task_id "${positional[0]}"
  local id="$RESOLVED_TASK_ID"

  # Only a task with a raised-but-unanswered gate can be answered; the query
  # yields the gate type, or '' when there is nothing pending.
  local nt
  nt=$(db "SELECT CASE WHEN need_type IS NOT NULL AND need_answered_at IS NULL THEN need_type ELSE '' END FROM tasks WHERE id=${id};")
  if [[ -z "$nt" ]]; then
    fail "$E_CONFLICT" "DIVE-$id has no pending human gate (nothing to answer)"
  fi

  # The agent to resume: the assignee that hit the gate, else the creator.
  local owner
  owner=$(db "SELECT COALESCE(NULLIF(assignee,''), NULLIF(created_by,''), '') FROM tasks WHERE id=${id};")

  # Persist the answer. A `secret` gate NEVER stores its value — this db is
  # readable by group claude, so a raw key here would be a plaintext secret at
  # rest. For secrets only the "provided" timestamp is stamped and the agent
  # loads the key out-of-band; every other gate type keeps the value in
  # need_answer.
  case "$nt" in
    secret)
      if (( value_set )); then
        fail "$E_USAGE" "DIVE-$id is a secret gate — do not pass --value; the key must not be stored in the shared db. Run: 5dive task answer DIVE-$id  (records it as provided + pings the agent to load it from where you placed it)"
      fi
      db "UPDATE tasks SET need_answered_at=datetime('now') WHERE id=${id};"
      ;;
    *)
      if (( ! value_set )); then
        fail "$E_USAGE" "--value is required (the human's answer)"
      fi
      db "UPDATE tasks SET need_answer=$(sqlq "$value"), need_answered_at=datetime('now') WHERE id=${id};"
      ;;
  esac

  # Answering a gate is not the same as unblocking. `status='blocked'` covers
  # both human gates and task->task block edges, so the status is recomputed:
  # it becomes todo only when no block edges remain (the same edge check
  # `unblock` uses); otherwise the task stays blocked on another task.
  # Answered-ness lives in need_answered_at, so the task has already left the
  # inbox either way.
  db "UPDATE tasks SET status='todo'
      WHERE id=${id} AND status='blocked'
        AND NOT EXISTS (SELECT 1 FROM task_deps WHERE task_id=${id});"
  local newstatus
  newstatus=$(db "SELECT status FROM tasks WHERE id=${id};")

  # Best-effort resume ping through the normal agent-send path. The answer
  # itself is deliberately NOT included: cmd_send mirrors outbound messages
  # into the group chat, which would leak a secret. The agent reads
  # need_answer via `task show`. A stopped or non-agent owner just leaves
  # pinged=0 — it never fails the answer.
  local pinged=0
  if [[ -n "$owner" ]]; then
    local pingmsg
    if [[ "$nt" == "secret" ]]; then
      pingmsg="DIVE-${id} secret gate marked provided — resume the task and load the key from where it was placed (its .env / your own channel), NOT from the task."
    else
      pingmsg="DIVE-${id} gate cleared — your '${nt}' ask was answered. Resume the task; run \`5dive task show DIVE-${id}\` for the value."
    fi
    local actor
    actor=$(task_actor "$from")
    # Attach --from only when the resolved actor is a usable sender label.
    local -a send_args=("$owner")
    if valid_sender_label "$actor"; then
      send_args+=(--from="$actor")
    fi
    send_args+=(--message="$pingmsg")
    # Subshell: a `fail`/exit inside cmd_send must not take this command down.
    if ( cmd_send "${send_args[@]}" ) >/dev/null 2>&1; then
      pinged=1
    fi
  fi

  local note=""
  if [[ $pinged -eq 1 ]]; then
    note=" + pinged $owner"
  fi
  ok "DIVE-$id answered ($nt) — now ${newstatus}${note}" \
     '{id:($i|tonumber), status:$st, need_type:$nt, provided:true, need_answer:(if $nt=="secret" then null else $v end), owner:(($o|select(length>0)) // null), pinged:($p=="1")}' \
     --arg i "$id" --arg st "$newstatus" --arg nt "$nt" --arg v "$value" --arg o "$owner" --arg p "$pinged"
}

# `5dive task rm <id|DIVE-N>` — delete one task row. The id is resolved by
# resolve_task_id before anything is deleted.
cmd_task_rm() {
  tasks_db_init
  if (( $# == 0 )); then
    fail "$E_USAGE" "usage: 5dive task rm <id|DIVE-N>"
  fi
  resolve_task_id "$1"
  local id="$RESOLVED_TASK_ID"
  db "DELETE FROM tasks WHERE id=${id};"
  ok "DIVE-$id deleted" '{id:($i|tonumber), deleted:true}' --arg i "$id"
}

# -------- 5dive org — agent org chart --------
#
# Subordination is a single self-referential column (agents_org.reports_to),
# the same shape Paperclip uses. The org chart only earns its keep once a
# fleet grows past a handful of agents, so this stays deliberately small:
# who reports to whom, plus an optional role/title label.

# Print the `5dive org` help text on stdout. The text is fully static, so the
# heredoc delimiter is quoted (no expansion is performed on it).
_org_usage() {
  cat <<'ORG_HELP'
5dive org — agent org chart (who reports to whom)

  5dive org set <agent> [--manager=<agent>|default] [--role=<text>] [--title=<text>]
                                                     # upsert; --manager=default clears
  5dive org tree                                     # the whole hierarchy, indented
  5dive org show <agent>                             # manager + direct reports
  5dive org ls                                       # flat list of everyone placed
  5dive org rm <agent>                               # remove (reports re-parent to null)

  Any agent (group claude) can run these without sudo. Add --json for machine output.
ORG_HELP
}

# `5dive org <subcommand> ...` dispatcher. With no subcommand it prints the
# usage text and exits with E_USAGE; help aliases print usage and succeed.
cmd_org() {
  if (( $# == 0 )); then
    _org_usage
    exit "$E_USAGE"
  fi
  local sub="$1"
  shift
  case "$sub" in
    -h|--help|help)
      _org_usage
      ;;
    set)
      cmd_org_set "$@"
      ;;
    tree)
      cmd_org_tree "$@"
      ;;
    show)
      cmd_org_show "$@"
      ;;
    ls|list)
      cmd_org_ls "$@"
      ;;
    rm|delete)
      cmd_org_rm "$@"
      ;;
    *)
      fail "$E_USAGE" "unknown org command: $sub (try: 5dive org --help)"
      ;;
  esac
}

# `5dive org set <agent> [--manager=<agent>|default] [--role=] [--title=]`
# Upsert one agent's placement in the org chart. At least one of the three
# flags is required; --manager with an empty value, `default` or `none`
# clears the reporting edge. Rejects self-reporting and reporting cycles.
cmd_org_set() {
  tasks_db_init
  local name="" manager="" role="" title=""
  local mgr_set=0 role_set=0 title_set=0
  local arg
  for arg in "$@"; do
    case "$arg" in
      --manager=*) manager="${arg#*=}"; mgr_set=1 ;;
      --role=*)    role="${arg#*=}";    role_set=1 ;;
      --title=*)   title="${arg#*=}";   title_set=1 ;;
      -*)          fail "$E_USAGE" "unknown flag: $arg" ;;
      *)
        # Exactly one positional (the agent name) is accepted.
        if [[ -z "$name" ]]; then
          name="$arg"
        else
          fail "$E_USAGE" "unexpected arg: $arg"
        fi
        ;;
    esac
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive org set <agent> [--manager=] [--role=] [--title=]"
  fi
  valid_sender_label "$name" || fail "$E_VALIDATION" "bad agent name '$name'"
  if (( ! (mgr_set || role_set || title_set) )); then
    fail "$E_USAGE" "nothing to set — pass --manager, --role and/or --title"
  fi

  # Work out what --manager means: clear the edge, or point at a real name.
  local mgr_clear=0 mgr_name=""
  if (( mgr_set )); then
    if [[ -z "$manager" || "$manager" == "default" || "$manager" == "none" ]]; then
      mgr_clear=1
    else
      valid_sender_label "$manager" || fail "$E_VALIDATION" "bad manager name '$manager'"
      if [[ "$manager" == "$name" ]]; then
        fail "$E_VALIDATION" "an agent can't report to itself"
      fi
      mgr_name="$manager"
      # Cycle guard: walk upward from the proposed manager; if $name shows up
      # anywhere on that chain, the new edge would close a loop.
      local cyc
      cyc=$(db "WITH RECURSIVE up(n) AS (
                    SELECT $(sqlq "$manager")
                    UNION ALL
                    SELECT a.reports_to FROM agents_org a JOIN up ON a.name=up.n
                    WHERE a.reports_to IS NOT NULL)
                  SELECT 1 FROM up WHERE n=$(sqlq "$name") LIMIT 1;")
      if [[ -n "$cyc" ]]; then
        fail "$E_CONFLICT" "that would create a reporting cycle ($manager already reports up to $name)"
      fi
    fi
  fi

  # Make sure the agent has a row, then apply each requested field.
  db "INSERT OR IGNORE INTO agents_org (name) VALUES ($(sqlq "$name"));"
  if (( mgr_set && mgr_clear )); then
    db "UPDATE agents_org SET reports_to=NULL, updated_at=datetime('now') WHERE name=$(sqlq "$name");"
  elif (( mgr_set )); then
    # The manager gets a row too, so the edge always points at a placed agent.
    db "INSERT OR IGNORE INTO agents_org (name) VALUES ($(sqlq "$mgr_name"));
          UPDATE agents_org SET reports_to=$(sqlq "$mgr_name"), updated_at=datetime('now') WHERE name=$(sqlq "$name");"
  fi
  if (( role_set )); then
    db "UPDATE agents_org SET role=$(sqlq_or_null "$role"),   updated_at=datetime('now') WHERE name=$(sqlq "$name");"
  fi
  if (( title_set )); then
    db "UPDATE agents_org SET title=$(sqlq_or_null "$title"), updated_at=datetime('now') WHERE name=$(sqlq "$name");"
  fi

  if (( JSON_MODE )); then
    local row
    row=$(dbfmt -json "SELECT name, reports_to, role, title FROM agents_org WHERE name=$(sqlq "$name");")
    jq -cn --argjson r "$row" '{ok:true, data:($r[0])}'
  else
    local mgr
    mgr=$(db "SELECT COALESCE(reports_to,'(top)') FROM agents_org WHERE name=$(sqlq "$name");")
    ok "$name -> reports to $mgr"
  fi
}

# `5dive org tree` — print the whole reporting hierarchy, depth-first.
# Takes no arguments. Text mode prints one line per agent, indented two
# spaces per level, with the title (else the role) appended when present.
# JSON mode prints {ok:true,data:{tree:[{name,reports_to,role,title,depth}]}}
# in the same depth-first order.
cmd_org_tree() {
  tasks_db_init
  [[ $# -eq 0 ]] || fail "$E_USAGE" "usage: 5dive org tree"
  local count; count=$(db "SELECT COUNT(*) FROM agents_org;")
  if [[ "$count" == "0" ]]; then
    if (( JSON_MODE )); then jq -cn '{ok:true, data:{tree:[]}}'; else echo "(org chart empty — place agents with: 5dive org set <agent> --manager=<agent>)"; fi
    return 0
  fi
  # Roots: no manager, or a manager that isn't itself placed (orphans surface
  # rather than vanish). Walk down from there; `path` exists only to drive the
  # display order and is never printed.
  #
  # The path separator must sort BEFORE every character an agent name can
  # contain, otherwise siblings interleave with other agents' subtrees: with
  # '/' (0x2F) as the separator, a root named "a-x" sorts between "a" and
  # "a/b" ('-' is 0x2D), so b was drawn underneath a-x. char(1) sorts below any
  # printable character, which keeps every subtree contiguous under its parent.
  local cte="WITH RECURSIVE tree(name, reports_to, role, title, depth, path) AS (
      SELECT name, reports_to, role, title, 0, name
      FROM agents_org
      WHERE reports_to IS NULL OR reports_to NOT IN (SELECT name FROM agents_org)
      UNION ALL
      SELECT a.name, a.reports_to, a.role, a.title, t.depth+1, t.path||char(1)||a.name
      FROM agents_org a JOIN tree t ON a.reports_to = t.name)"
  if (( JSON_MODE )); then
    local rows; rows=$(dbfmt -json "${cte} SELECT name, reports_to, role, title, depth FROM tree ORDER BY path;")
    [[ -n "$rows" ]] || rows="[]"
    printf '%s' "$rows" | jq -c '{ok:true, data:{tree:.}}'  # stdin, not --argjson (DIVE-222)
  else
    # Default list mode: no header, one column -> one indented line per row.
    # The indent is cut from a fixed 40-space string, so it stops growing past
    # depth 20.
    db "${cte}
      SELECT substr('                                        ', 1, depth*2)
             || name
             || CASE WHEN title IS NOT NULL THEN '  — '||title
                     WHEN role  IS NOT NULL THEN '  — '||role ELSE '' END
      FROM tree ORDER BY path;"
  fi
}

# `5dive org show <agent>` — one agent's manager, role/title and direct
# reports. Fails with E_NOT_FOUND when the agent has no row in the chart.
cmd_org_show() {
  tasks_db_init
  if (( $# == 0 )); then
    fail "$E_USAGE" "usage: 5dive org show <agent>"
  fi
  local name="$1"
  valid_sender_label "$name" || fail "$E_VALIDATION" "bad agent name '$name'"

  local exists
  exists=$(db "SELECT 1 FROM agents_org WHERE name=$(sqlq "$name");")
  if [[ -z "$exists" ]]; then
    fail "$E_NOT_FOUND" "agent '$name' is not placed in the org chart"
  fi

  if (( ! JSON_MODE )); then
    # Text: the agent's own record, a blank line, then the report list.
    dbfmt -line "SELECT name, COALESCE(reports_to,'(top)') AS reports_to, role, title FROM agents_org WHERE name=$(sqlq "$name");"
    local reps
    reps=$(db "SELECT name FROM agents_org WHERE reports_to=$(sqlq "$name") ORDER BY name;")
    echo
    if [[ -n "$reps" ]]; then
      echo "direct reports:"
      printf '%s\n' "$reps" | indent2
    else
      echo "direct reports: (none)"
    fi
    return
  fi

  # JSON: the agent's row with a direct_reports array merged in.
  local self reports
  self=$(dbfmt -json "SELECT name, reports_to, role, title FROM agents_org WHERE name=$(sqlq "$name");")
  reports=$(dbfmt -json "SELECT name, role, title FROM agents_org WHERE reports_to=$(sqlq "$name") ORDER BY name;")
  [[ -n "$reports" ]] || reports="[]"
  jq -cn --argjson s "$self" --argjson r "$reports" '{ok:true, data:($s[0] + {direct_reports:$r})}'
}

# `5dive org ls` — flat, name-sorted list of every agent placed in the chart.
# Takes no arguments.
cmd_org_ls() {
  tasks_db_init
  if (( $# != 0 )); then
    fail "$E_USAGE" "usage: 5dive org ls"
  fi
  if (( ! JSON_MODE )); then
    # Text: boxed table, with '-' standing in for unset columns.
    dbfmt -box "SELECT name, COALESCE(reports_to,'-') AS reports_to, COALESCE(role,'-') AS role, COALESCE(title,'-') AS title FROM agents_org ORDER BY name;"
    return
  fi
  local listing
  listing=$(dbfmt -json "SELECT name, reports_to, role, title FROM agents_org ORDER BY name;")
  # No rows -> empty JSON array.
  [[ -n "$listing" ]] || listing="[]"
  printf '%s' "$listing" | jq -c '{ok:true, data:{agents:.}}'  # stdin, not --argjson (DIVE-222)
}

# `5dive org rm <agent>` — delete one agent's row from the org chart. Fails
# with E_NOT_FOUND when the agent is not placed.
cmd_org_rm() {
  tasks_db_init
  if (( $# == 0 )); then
    fail "$E_USAGE" "usage: 5dive org rm <agent>"
  fi
  local name="$1"
  if ! valid_sender_label "$name"; then
    fail "$E_VALIDATION" "bad agent name '$name'"
  fi
  local exists
  exists=$(db "SELECT 1 FROM agents_org WHERE name=$(sqlq "$name");")
  if [[ -z "$exists" ]]; then
    fail "$E_NOT_FOUND" "agent '$name' is not placed in the org chart"
  fi
  db "DELETE FROM agents_org WHERE name=$(sqlq "$name");"
  ok "$name removed from org chart" '{name:$n, removed:true}' --arg n "$name"
}

# -------- 5dive heartbeat — wake agents that have queued work --------
#
# A per-agent "heartbeat": a single host cron runs `5dive heartbeat tick`
# every few minutes. For each enrolled agent the tick asks one question —
# "does this agent have a todo task on the shared board?" — and acts:
#
#   * no todo            -> do nothing. The agent never wakes, so it burns
#                           zero tokens and never starts its 5h usage window.
#   * already in_progress -> skip. The agent is still chewing on its last
#                           task; piling on a second nudge would interleave work.
#   * has todo + due      -> ensure the agent is running, optionally /clear it
#                           for a fresh context, then inject ONE nudge telling
#                           it to do a single task and then idle.
#
# "One task per tick" is the whole point: 1 nudge = 1 task. The next tick (no
# sooner than the agent's `everyMin`) picks up the next one. The agent process
# stays running between ticks (cheap tmux session) — `fresh` sends `/clear`
# before the nudge so each task starts from a clean conversation without the
# cold-start cost of a full restart.
#
# Config lives per-agent in the registry under .agents[<name>].heartbeat:
#   { enabled: bool, everyMin: int, fresh: bool, lastRunAt: <epoch> }
# lastRunAt throttles *wakes* (not checks): a no-work agent is re-checked every
# tick (a cheap sqlite count) but only counts against everyMin when it actually
# wakes. So everyMin is "minimum minutes between real wakes", honoured even
# though the cron fires more often.

# Default everyMin (minutes between real wakes) used when `heartbeat on` is
# given no --every, and as the fallback wherever a stored everyMin is missing.
_HB_DEFAULT_EVERY=30
# Deterministic hard cap for the /goal loop. A task left in_progress longer than
# everyMin * _HB_STALE_MULT minutes is force-closed by the tick (see the reaper
# in cmd_heartbeat_tick): /goal clear to stop any runaway loop, then auto-cancel.
# This is the real backstop — /goal's own "stop after N turns" is model-judged
# and was observed to overrun (see _hb_wake). Min floor keeps short everyMin sane.
# _HB_STALE_MULT is a unitless multiplier; _HB_STALE_MIN_MINUTES is the floor,
# in minutes, applied to the computed budget.
_HB_STALE_MULT=3
_HB_STALE_MIN_MINUTES=45
# Starvation signal: a todo task that gets nudged this many times but never
# leaves 'todo' (started_at stays empty) is almost certainly being starved —
# e.g. the codex/grok listen-loop watchdog yanking the agent off the task before
# it runs `task start`. The reaper only catches runaway *in_progress* tasks; this
# catches the opposite silent failure (nudged but never started) and surfaces it
# instead of re-nudging forever. Per-task nudge counts live in the registry under
# .agents[<name>].heartbeat.nudges and are pruned once a task leaves todo.
# Unit: number of nudges.
_HB_STARVE_AFTER=3
# Orphan reclaim. An in_progress task whose claiming claude session is GONE — the
# agent's claude process started AFTER the task did (rotation, service restart,
# crash, a context reset that exited the process) — is reclaimed to 'todo'
# immediately rather than waiting out the _HB_STALE_MULT hard cap: nobody is
# working it, and the work still needs doing. _HB_PROC_SKEW_SEC absorbs the small
# gap between a process starting and the `task start` it then runs.
# Unit: seconds (it is added to an epoch timestamp in _hb_reclaim).
_HB_PROC_SKEW_SEC=20
# Backstop for the same-process abandon case (agent claimed a task, then went
# idle without closing it — its claiming process is unchanged, so the restart
# rule above can't see it). Reclaim to 'todo' once the task has sat in_progress
# past this grace AND the agent is idle right now.
# Unit: minutes (compared against the task's in_progress age in _hb_reclaim).
_HB_STALL_MIN_MINUTES=20
# Idle probe window. An agent whose pane is byte-identical across this gap (and
# still shows its input prompt) is at rest; a working agent streams output or
# animates a spinner, so its pane changes between two samples. Deliberately dumb
# and CLI-agnostic — see _hb_agent_idle. Used to (a) never /clear+nudge an agent
# mid-turn/conversation and (b) gate idle-stall reclaim.
# Unit: seconds (the default `sleep` between the two pane samples).
_HB_IDLE_SAMPLE_SEC=3

# Write one heartbeat log line to stderr: "<UTC ISO-8601 timestamp> [heartbeat] <message>".
# All arguments are joined into a single message.
_hb_log() {
  local stamp
  stamp=$(date -u +%FT%TZ) || true
  printf '%s [heartbeat] %s\n' "$stamp" "$*" >&2
}

# Print the `5dive heartbeat` help text on stdout. The only dynamic part is
# the default interval, taken from _HB_DEFAULT_EVERY.
_hb_usage() {
  local every_default="${_HB_DEFAULT_EVERY}"
  cat <<HB_HELP
5dive heartbeat — wake agents only when they have queued tasks

  5dive heartbeat on  <name> [--every=<dur>] [--no-fresh]
                                          # enrol agent; default every=${every_default}m, fresh on
  5dive heartbeat off <name>              # stop waking the agent (keeps its settings)
  5dive heartbeat ls                      # show enrolled agents + next-wake + queued count
  5dive heartbeat tick                    # cron driver: wake every due agent that has work

  <dur>: minutes (e.g. 30), or 45m / 2h / 1h30m.
  fresh (default on): send /clear before each task so context starts clean;
        --no-fresh keeps the running conversation across tasks.

Wire the driver into cron (root), e.g. every 5 minutes:
  */5 * * * * /usr/local/bin/5dive heartbeat tick >> /var/log/5dive-heartbeat.log 2>&1

Add --json to any subcommand for machine output.
HB_HELP
}

# `5dive heartbeat <subcommand> ...` dispatcher. on/off run under
# with_registry_lock; ls and tick are called directly. With no subcommand it
# prints usage and exits with E_USAGE.
cmd_heartbeat() {
  if (( $# == 0 )); then
    _hb_usage
    exit "$E_USAGE"
  fi
  local sub="$1"
  shift
  case "$sub" in
    -h|--help|help)
      _hb_usage
      ;;
    on|enable)
      with_registry_lock cmd_heartbeat_on "$@"
      ;;
    off|disable)
      with_registry_lock cmd_heartbeat_off "$@"
      ;;
    ls|list|status)
      cmd_heartbeat_ls "$@"
      ;;
    tick)
      cmd_heartbeat_tick "$@"
      ;;
    *)
      fail "$E_USAGE" "unknown heartbeat command: $sub (try: 5dive heartbeat --help)"
      ;;
  esac
}

# Parse a duration into whole minutes.
#
# Arguments: $1 - a bare integer (minutes), or an h/m combo such as 2h, 45m
#                 or 1h30m.
# Outputs:   the total number of minutes, as a canonical base-10 integer with
#            no leading zeros and no trailing newline.
# Returns:   0 on success; 1 on an empty, malformed or zero-length value.
#
# Digits are always read as base 10. Without the explicit `10#` prefix bash
# treats a leading zero as octal, so "08", "09" or "1h08m" would abort with
# "value too great for base", and "010" would be checked as 8 but echoed as
# "010".
_hb_parse_every() {
  local s="$1"
  [[ -n "$s" ]] || return 1
  local total
  if [[ "$s" =~ ^[0-9]+$ ]]; then
    total=$(( 10#$s ))
  elif [[ "$s" =~ ^([0-9]+h)?([0-9]+m)?$ ]]; then
    # Either group may be absent; a missing one counts as 0.
    local h="${BASH_REMATCH[1]%h}" m="${BASH_REMATCH[2]%m}"
    total=$(( 10#${h:-0} * 60 + 10#${m:-0} ))
  else
    return 1
  fi
  (( total > 0 )) || return 1
  printf '%s' "$total"
}

# `5dive heartbeat on <name> [--every=<dur>] [--fresh|--no-fresh]` — enrol an
# agent in the heartbeat. Root only. Writes a fresh heartbeat object into the
# registry, carrying over only the previous lastRunAt.
cmd_heartbeat_on() {
  require_root "heartbeat on"
  local name="" every="" fresh="true"
  local arg
  for arg in "$@"; do
    case "$arg" in
      --every=*)  every="${arg#*=}" ;;
      --fresh)    fresh="true" ;;
      --no-fresh) fresh="false" ;;
      -*)         fail "$E_USAGE" "unknown flag: $arg" ;;
      *)
        # Exactly one positional (the agent name) is accepted.
        if [[ -z "$name" ]]; then
          name="$arg"
        else
          fail "$E_USAGE" "unexpected arg: $arg"
        fi
        ;;
    esac
  done
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive heartbeat on <name> [--every=<dur>] [--no-fresh]"
  fi
  require_agent "$name"

  # Interval in minutes: the parsed --every, or the default when omitted.
  local everyMin="$_HB_DEFAULT_EVERY"
  if [[ -n "$every" ]]; then
    everyMin=$(_hb_parse_every "$every") || fail "$E_VALIDATION" "bad --every '$every' (use minutes, or 45m / 2h / 1h30m)"
  fi

  local reg
  reg=$(registry_read)
  # Keep any existing lastRunAt so toggling on/off doesn't reset the throttle.
  jq --arg n "$name" --argjson e "$everyMin" --argjson f "$fresh" '
      .agents[$n].heartbeat = {
        enabled: true,
        everyMin: $e,
        fresh: $f,
        lastRunAt: (.agents[$n].heartbeat.lastRunAt // 0)
      }' <<<"$reg" | registry_write

  ok "heartbeat on for '$name' (every ${everyMin}m, fresh=${fresh})" \
     '{name:$n, enabled:true, everyMin:($e|tonumber), fresh:($f=="true")}' \
     --arg n "$name" --arg e "$everyMin" --arg f "$fresh"
}

# `5dive heartbeat off <name>` — stop waking an agent. Root only. The agent's
# existing heartbeat settings are kept and only `enabled` is flipped to false;
# an agent that was never enrolled gets a default object with enabled:false.
cmd_heartbeat_off() {
  require_root "heartbeat off"
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    fail "$E_USAGE" "usage: 5dive heartbeat off <name>"
  fi
  require_agent "$name"
  local reg
  reg=$(registry_read)
  jq --arg n "$name" --argjson d "$_HB_DEFAULT_EVERY" '
      .agents[$n].heartbeat =
        ((.agents[$n].heartbeat // {everyMin: $d, fresh: true, lastRunAt: 0})
         + {enabled: false})' <<<"$reg" \
    | registry_write
  ok "heartbeat off for '$name'" '{name:$n, enabled:false}' --arg n "$name"
}

# `5dive heartbeat ls` — list every agent that has a heartbeat object, with
# its settings, live unit state, queued todo count and seconds to next wake.
cmd_heartbeat_ls() {
  # Read-only: the registry is 640 root:claude, so any group-claude agent can
  # inspect its own heartbeat without sudo. No ensure_state (that requires root).
  local reg now
  reg=$(registry_read)
  now=$(date +%s)

  # Build one JSON row per enrolled agent, enriched with live state.
  local rows="[]" name
  local enabled everyMin fresh lastRun running todo nextIn
  for name in $(jq -r '.agents | to_entries[] | select(.value.heartbeat != null) | .key' <<<"$reg"); do
    enabled=$(jq -r --arg n "$name"  '.agents[$n].heartbeat.enabled  // false' <<<"$reg")
    everyMin=$(jq -r --arg n "$name" '.agents[$n].heartbeat.everyMin // '"$_HB_DEFAULT_EVERY" <<<"$reg")
    fresh=$(jq -r --arg n "$name"    '(.agents[$n].heartbeat | if has("fresh") then .fresh else true end)' <<<"$reg")
    lastRun=$(jq -r --arg n "$name"  '.agents[$n].heartbeat.lastRunAt // 0' <<<"$reg")

    # is-active prints the state word AND exits nonzero for non-active units,
    # so its stdout is taken as-is — a `|| echo` would append a second word.
    running=$(systemctl is-active "5dive-agent@${name}.service" 2>/dev/null || true)
    if [[ -z "$running" ]]; then
      running="unknown"
    fi

    todo=$(db "SELECT COUNT(*) FROM tasks WHERE assignee=$(sqlq "$name") AND status='todo' AND kind='standard';" 2>/dev/null || echo 0)

    # Seconds until the next eligible wake, clamped at 0 (= due now).
    nextIn=$(( lastRun + everyMin * 60 - now ))
    if (( nextIn < 0 )); then
      nextIn=0
    fi

    rows=$(jq -c \
      --arg n "$name" --argjson en "$enabled" --argjson ev "$everyMin" \
      --argjson fr "$fresh" --arg run "$running" --argjson td "${todo:-0}" --argjson ni "$nextIn" \
      '. + [{name:$n, enabled:$en, everyMin:$ev, fresh:$fr, running:$run, todo:$td, nextInSec:$ni}]' <<<"$rows")
  done

  if (( JSON_MODE )); then
    printf '%s' "$rows" | jq -c '{ok:true, data:{agents:.}}'  # stdin, not --argjson (DIVE-222)
    return
  fi

  # Text: a tab-separated table, aligned by column(1).
  jq -r '
      if length == 0 then "no agents enrolled in heartbeat (5dive heartbeat on <name>)" else
        (["NAME","HEARTBEAT","EVERY","FRESH","RUNNING","TODO","NEXT-WAKE"] | @tsv),
        (.[] | [
          .name,
          (if .enabled then "on" else "off" end),
          ((.everyMin|tostring)+"m"),
          (if .fresh then "yes" else "no" end),
          .running,
          (.todo|tostring),
          (if (.enabled|not) then "-"
           elif .nextInSec == 0 then "now (if work)"
           else (((.nextInSec/60)|floor|tostring)+"m") end)
        ] | @tsv)
      end' <<<"$rows" | column -t -s $'\t'
}

# Persist a wake timestamp AND bump the per-task nudge counter. Runs under
# with_registry_lock from the tick loop. $3 is the DIVE id just nudged. Prunes
# nudge entries for tasks that have left 'todo' (started/done/cancelled/gone) so
# the map stays bounded and a counter resets cleanly if a task is re-queued.
# Echoes the post-increment nudge count for $task_id so the caller can decide
# whether the task is being starved.
_hb_mark_run() {
  local name="$1" now="$2" task_id="$3"
  local reg
  reg=$(registry_read)

  # The agent's current todo ids as a JSON number array; used to prune the
  # nudge map. Any failure in the pipeline degrades to an empty list.
  local todo_ids
  if ! todo_ids=$(db "SELECT id FROM tasks WHERE assignee=$(sqlq "$name") AND status='todo' AND kind='standard';" 2>/dev/null \
                  | jq -R 'select(length>0)|tonumber' | jq -cs '.' 2>/dev/null); then
    todo_ids=""
  fi
  if [[ -z "$todo_ids" ]]; then
    todo_ids="[]"
  fi

  # Stamp lastRunAt, drop counters for tasks no longer in todo, then bump the
  # counter for the task that was just nudged.
  reg=$(jq --arg n "$name" --argjson t "$now" --arg tid "$task_id" --argjson todo "$todo_ids" '
    .agents[$n].heartbeat.lastRunAt = $t
    | .agents[$n].heartbeat.nudges = (
        ((.agents[$n].heartbeat.nudges // {})
          | with_entries(select((.key|tonumber) as $k | $todo | index($k) != null)))
        | .[$tid] = ((.[$tid] // 0) + 1)
      )' <<<"$reg")
  registry_write <<<"$reg"

  # Report the post-increment count so the caller can detect starvation.
  jq -r --arg n "$name" --arg tid "$task_id" '.agents[$n].heartbeat.nudges[$tid] // 0' <<<"$reg"
}

# Inject one literal line + Enter into an agent's tmux pane. Returns nonzero
# (never exits) so a single dead pane can't abort the whole tick.
_hb_send_line() {
  local name="$1" text="$2"
  # The tmux session and the Linux user share the same "agent-<name>" label.
  local target="agent-${name}"
  # First the text, typed literally (-l) so tmux does not read it as key names…
  if ! sudo -u "$target" tmux send-keys -t "$target" -l -- "$text" 2>/dev/null; then
    return 1
  fi
  # …then a separate Enter keypress to submit it.
  if ! sudo -u "$target" tmux send-keys -t "$target" Enter 2>/dev/null; then
    return 1
  fi
}

# PID of this agent's live inner `claude` process, or empty if not found. This is
# the `claude` the `while true; do claude; ...` wrapper respawns. The bash wrapper
# and tmux lines also contain the claude argv, so exclude them (they carry
# 'while true' / 'tmux'). Non-claude agents (codex/grok/agy/opencode) won't match
# → empty, so both the restart-reclaim rule and the native idle probe simply
# don't apply to them (callers fall back).
_hb_claude_pid() {
  local name="$1"
  # Scan the agent user's processes. Wrapper-shell and tmux lines also carry
  # the claude argv, so they are skipped first; the first remaining line that
  # looks like the real claude invocation wins.
  ps -u "agent-${name}" -o pid=,args= 2>/dev/null \
    | awk '
        /while +true/ || /tmux/ { next }
        /\/claude .*--dangerously-skip-permissions/ { print $1; exit }'
}

# Epoch when this agent's live claude process started, or empty if not found. Its
# start time is the agent's "session identity": a rotation, restart, crash, or
# context reset that exits the process gives the replacement a newer start time
# than any task its predecessor had already claimed.
_hb_claude_started() {
  local name="$1" pid lstart
  pid=$(_hb_claude_pid "$name")
  if [[ -z "$pid" ]]; then
    return 1
  fi
  # `ps -o lstart=` prints the process start time as a full date string with
  # no header line.
  if ! lstart=$(ps -o lstart= -p "$pid" 2>/dev/null); then
    return 1
  fi
  if [[ -z "$lstart" ]]; then
    return 1
  fi
  # Convert that date string to epoch seconds.
  date -d "$lstart" +%s 2>/dev/null || return 1
}

# Native run-state for a claude agent via `claude agents --json` (CC ≥2.1.162).
# Far more reliable than scraping the tmux pane, and it distinguishes a genuine
# block (a permission prompt / dialog / input-needed) from working vs idle — a
# distinction the pane-scrape can't make. We match the JSON entry by the agent's
# inner-claude PID so dispatched background sub-agents in the same list are
# ignored, and read that one session's status:
#   idle    -> "idle"            (at rest, no turn in flight)
#   busy    -> "busy"            (a turn is actively running)
#   waiting -> "blocked:<reason>" (waiting on a permission prompt / worker
#              request / sandbox request / dialog / input — surface, don't reclaim)
# Echoes that word and returns 0 on a definite reading; returns 1 (echoes
# nothing) when the signal is unavailable — non-claude CLI, claude not running,
# the binary is missing, or no matching session — so callers fall back to the
# pane-scrape probe. Runs as the agent's own user (its sessions live under that
# user's ~/.claude); the heartbeat tick runs as root, so the sudo is non-interactive.
_hb_agent_native_state() {
  local name="$1" pid bin out st wf
  # No live inner claude process -> no native signal.
  pid=$(_hb_claude_pid "$name")
  if [[ -z "$pid" ]]; then
    return 1
  fi
  bin="/home/agent-${name}/.local/bin/claude"
  if [[ ! -x "$bin" ]]; then
    return 1
  fi
  # Ask claude itself, as the agent user, without prompting for a password.
  if ! out=$(sudo -n -u "agent-${name}" "$bin" agents --json 2>/dev/null); then
    return 1
  fi
  if [[ -z "$out" ]]; then
    return 1
  fi
  # Pick the one session whose pid is the agent's inner claude process.
  if ! st=$(jq -r --argjson p "$pid" '.[] | select(.pid==$p) | .status // empty' <<<"$out" 2>/dev/null); then
    return 1
  fi
  if [[ -z "$st" ]]; then
    return 1
  fi
  case "$st" in
    waiting)
      # Waiting maps to "blocked:<reason>", with a generic reason as fallback.
      wf=$(jq -r --argjson p "$pid" '.[] | select(.pid==$p) | .waitingFor // "input needed"' <<<"$out" 2>/dev/null)
      printf 'blocked:%s' "${wf:-input needed}"
      ;;
    *)
      printf '%s' "$st"
      ;;
  esac
  return 0
}

# Is the agent at rest right now? Prefer the native `claude agents --json` signal
# (reliable, and it can tell a *blocked* agent apart from a working one); fall
# back to the dumb pane-scrape for non-claude CLIs or when the native signal is
# unavailable. Pane-scrape: sample the pane twice across a short gap — an idle
# agent's pane is byte-identical and shows its input prompt; a working one streams
# output / animates, so the two samples differ.
#
# Exit: 0 = idle, 1 = working/active, 2 = unknown (no signal), 3 = blocked
# (waiting on a permission prompt / dialog / input — native-only). When it
# returns 3 it also sets _HB_IDLE_REASON to the block reason for the caller to
# surface. Callers that must not clobber live work defer on 1 OR 3; reclaim-on-idle
# acts only on a confident 0 (a blocked agent is not idle, so it is never reclaimed).
_HB_IDLE_REASON=""
_hb_agent_idle() {
  local name="$1" gap="${2:-$_HB_IDLE_SAMPLE_SEC}"
  _HB_IDLE_REASON=""

  # Native signal first: when present it is authoritative, no sampling needed.
  local native
  native=$(_hb_agent_native_state "$name") || native=""
  if [[ "$native" == "idle" ]]; then
    return 0
  elif [[ "$native" == "busy" ]]; then
    return 1
  elif [[ "$native" == blocked:* ]]; then
    _HB_IDLE_REASON="${native#blocked:}"
    return 3
  fi

  # Fallback: pane-scrape (codex/grok/agy/opencode, or native unavailable).
  # Two snapshots `gap` seconds apart; an unreadable or empty first pane means
  # there is no signal at all.
  local user="agent-${name}" first second
  first=$(sudo -u "$user" tmux capture-pane -p -t "agent-${name}" 2>/dev/null) || return 2
  if [[ -z "$first" ]]; then
    return 2
  fi
  sleep "$gap"
  second=$(sudo -u "$user" tmux capture-pane -p -t "agent-${name}" 2>/dev/null) || return 2
  # Any change between the snapshots means the agent is producing output.
  if [[ "$first" != "$second" ]]; then
    return 1
  fi
  # A static pane only counts as idle when the input prompt is visible.
  if ! grep -q '❯' <<<"$second"; then
    return 1
  fi
  return 0
}

# Flip one in_progress task back to todo. Clears started_at so its age and the
# per-task nudge counter both restart cleanly, and stamps updated_at. Best-effort
# (a dead db or already-moved task is harmless). Logs why.
_hb_reclaim_to_todo() {
  local name="$1" id="$2" why="$3"
  # Guarded on status='in_progress' so a task that already moved on is left
  # alone; a db error is swallowed on purpose (best-effort).
  local requeue_sql="UPDATE tasks SET status='todo', started_at=NULL, updated_at=datetime('now')
      WHERE id=${id} AND status='in_progress';"
  db "$requeue_sql" 2>/dev/null || true
  # The log line is written regardless of whether the UPDATE matched a row.
  _hb_log "[$name] reclaimed DIVE-${id} -> todo ($why)"
}

# Unwedge this agent's stuck in_progress tasks. Three escalating rules, cheapest
# first, so a single stalled task can't block an agent's whole queue for hours:
#
#   (a) orphan-by-restart  -> todo. The claude process that would be doing the
#       work started AFTER the task did, so the session that claimed it is gone
#       (rotation/restart/crash/context-reset). Deterministic, instant — this is
#       the common case and needs no idle guessing.
#   (b) idle stall         -> todo. Same process still running, but the task has
#       sat in_progress past _HB_STALL_MIN_MINUTES AND the agent is idle now:
#       it claimed the task then walked away. Gated on a confident idle reading
#       so we never reclaim work that's actively in flight.
#   (c) hard cap           -> cancel. in_progress past everyMin*_HB_STALE_MULT
#       (floored): the deterministic runaway backstop. /goal clear then cancel
#       with an auto-result so the board shows it terminated, not silently stuck.
#
# (a)/(b) reclaim (the work still needs doing); only (c) cancels. Echoes
# "<reclaimed> <cancelled>". Uses started_at (falls back to created_at).
_hb_reclaim() {
  local name="$1" everyMin="$2"
  # Hard-cap budget in minutes: everyMin * multiplier, never below the floor.
  local budget=$(( everyMin * _HB_STALE_MULT ))
  if (( budget < _HB_STALE_MIN_MINUTES )); then
    budget=$_HB_STALE_MIN_MINUTES
  fi
  # Epoch start of the agent's live claude process; empty when unavailable, in
  # which case rule (a) simply does not apply.
  local proc_start; proc_start=$(_hb_claude_started "$name" 2>/dev/null || true)

  # Snapshot the in_progress rows BEFORE acting on any of them. The loop body
  # runs sudo/tmux/claude (via _hb_send_line and _hb_agent_idle); if the rows
  # were streamed on the loop's stdin, any of those children that reads stdin
  # would swallow the remaining rows and those tasks would be skipped for this
  # tick. Reading everything up front also means the SELECT has finished before
  # the first UPDATE is issued.
  # Row format: <id>|<epoch of started_at or created_at>|<age in whole minutes>
  local -a rows=()
  mapfile -t rows < <(db "SELECT id || '|' ||
                 strftime('%s', COALESCE(started_at, created_at)) || '|' ||
                 CAST((julianday('now') - julianday(COALESCE(started_at, created_at))) * 1440 AS INTEGER)
               FROM tasks
               WHERE assignee=$(sqlq "$name") AND status='in_progress';" 2>/dev/null || true)

  local reclaimed=0 cancelled=0 row id started_epoch age_min
  if (( ${#rows[@]} == 0 )); then
    printf '%s %s\n' "$reclaimed" "$cancelled"
    return 0
  fi

  for row in "${rows[@]}"; do
    IFS='|' read -r id started_epoch age_min <<<"$row"
    [[ -n "$id" ]] || continue
    # Defensive: a non-numeric age would be an arithmetic error below.
    [[ "$age_min" =~ ^-?[0-9]+$ ]] || age_min=0

    # (a) the claiming session is gone — process is newer than the claim.
    if [[ -n "$proc_start" && -n "$started_epoch" ]] \
       && (( proc_start > started_epoch + _HB_PROC_SKEW_SEC )); then
      _hb_reclaim_to_todo "$name" "$id" "claiming session gone (claude restarted $(( (proc_start - started_epoch) / 60 ))m after the claim)"
      reclaimed=$((reclaimed + 1)); continue
    fi
    # (c) hard cap before stall: a very old task is cancelled, not re-queued.
    if (( age_min >= budget )); then
      _hb_send_line "$name" "/goal clear" || true
      db "UPDATE tasks SET status='cancelled', done_at=datetime('now'),
            result='auto-cancelled by heartbeat: in_progress exceeded ${budget}m time budget'
          WHERE id=${id} AND status='in_progress';" 2>/dev/null || true
      _hb_log "[$name] reaped stale in_progress DIVE-${id} (>${budget}m)"
      cancelled=$((cancelled + 1)); continue
    fi
    # (b) idle stall — only if past grace AND a confident idle reading (rc 0).
    if (( age_min >= _HB_STALL_MIN_MINUTES )) && _hb_agent_idle "$name"; then
      _hb_reclaim_to_todo "$name" "$id" "idle ${age_min}m with the task still open (claimed then went idle)"
      reclaimed=$((reclaimed + 1)); continue
    fi
  done
  printf '%s %s\n' "$reclaimed" "$cancelled"
}

# Wake one agent: ensure it's running, optionally clear context, send the nudge.
# $3 is the concrete DIVE id (highest-priority todo) the tick picked for this
# agent — scoping the /goal to one known id makes its completion check reliable
# (a freeform "your tasks" condition is ambiguous to the goal evaluator).
# Returns 0 on a delivered nudge, nonzero on any failure (so the caller skips
# marking lastRunAt and retries next tick).
_hb_wake() {
  local name="$1" fresh="$2" task_id="$3"
  # The systemd unit, the Linux user and the tmux session all derive from the
  # agent name; user and session share the same "agent-<name>" string.
  local unit="5dive-agent@${name}.service" owner="agent-${name}"

  # Service not active: start it, then poll for the tmux session (up to 30
  # checks, 2s apart) before giving up.
  if ! systemctl is-active --quiet "$unit"; then
    if ! systemctl start "$unit" 2>/dev/null; then
      _hb_log "[$name] systemctl start failed"
      return 1
    fi
    local attempt=0
    while (( attempt < 30 )); do
      if sudo -u "$owner" tmux has-session -t "$owner" 2>/dev/null; then
        break
      fi
      sleep 2
      attempt=$((attempt + 1))
    done
  fi

  # Whether it was already running or just started, a session must exist now.
  if ! sudo -u "$owner" tmux has-session -t "$owner" 2>/dev/null; then
    _hb_log "[$name] no tmux session after start"
    return 1
  fi

  # Optional context reset; the pause gives /clear time to land before the nudge.
  if [[ "$fresh" == "true" ]]; then
    if ! _hb_send_line "$name" "/clear"; then
      _hb_log "[$name] /clear failed"
      return 1
    fi
    sleep 4
  fi

  # The /goal is scoped to exactly one task id. Its closing "Stop after 6 turns"
  # is only a soft, model-judged limit; the deterministic stop is the heartbeat's
  # stale-in_progress reaper.
  local nudge="/goal Task DIVE-${task_id} shows status done or cancelled, or is blocked with a human gate filed, on the 5dive board (verify ONLY by running: 5dive task show DIVE-${task_id}). To achieve it: claim it with '5dive task start DIVE-${task_id}', do the work, then close it with '5dive task done DIVE-${task_id} --result=\"<one or two self-contained sentences — any output the creator needs to see; the dashboard and creator read this>\"'. If it needs a human decision, approval, a secret, or a manual step only a person can do, do NOT cancel — file a gate that pings the owner: '5dive task need DIVE-${task_id} --type=decision --ask=\"<what you need from them>\"' (use --type=approval|secret|manual as fits). Keep the ask to ONE crisp question + ~1 line of essential context — put heavy detail in the task BODY, not the ask — and ALWAYS surface your recommended choice with --recommend=\"<option>\" (and --options=A|B for a decision) so the owner sees the advised answer first. Only if the task is genuinely irrelevant or impossible, run '5dive task cancel DIVE-${task_id} --result=\"<why>\"'. Work ONLY this one task — do not start any other. Stop after 6 turns."
  if ! _hb_send_line "$name" "$nudge"; then
    _hb_log "[$name] nudge send failed"
    return 1
  fi
  return 0
}

# DIVE-138 step 2: materialize due recurring TEMPLATES into standard todos. Runs
# as its own pass at the TOP of the tick (before the wake loop) so a freshly
# cloned todo is eligible to be picked up in the SAME tick. The caller isolates
# it (|| log) so a materializer failure can NEVER abort the wake loop — the
# heartbeat-never-woke bug class.
#
# For each kind='recurring' template: fire when its cron matches `now` AND it
# hasn't already fired THIS minute (last_fired_at guard — stops a double-fire if
# two ticks land in the same matching minute). DEDUP (skip-if-open): don't
# materialize if an unfinished instance from this template already exists, so
# dailies don't pile up when the assignee is behind. On fire: clone
# title/body/priority/assignee/created_by/fresh into a kind='standard' todo
# stamped with from_template_id, then stamp the template's last_fired_at.
#
# V1 LIMITATION: no catch-up for ticks the host missed — if the box was down over
# a scheduled minute, that occurrence is skipped, not backfilled. Acceptable for
# coarse (daily/hourly) recurring jobs; minute granularity finer than the tick
# interval can also be missed. Both documented in the CHANGELOG.
_hb_materialize_recurring() {
  local now="$1"
  local minute_floor tmpl cron_expr fired_at n_open made=0
  # Start of the current UTC minute in the same 'YYYY-MM-DD HH:MM:SS' text form
  # that last_fired_at is stamped with, so the two compare as plain strings.
  minute_floor=$(date -u -d "@${now}" +'%Y-%m-%d %H:%M:00')

  while IFS=$'\t' read -r tmpl cron_expr fired_at; do
    if [[ -z "$tmpl" ]]; then
      continue
    fi
    if ! _cron_matches "$cron_expr" "$now"; then
      continue
    fi
    # Double-fire guard: a stamp at or after the minute start means another tick
    # already materialized this template within the same minute.
    if [[ -n "$fired_at" && ! "$fired_at" < "$minute_floor" ]]; then
      continue
    fi
    # Skip-if-open dedup. A failed count query reads as "1 open", i.e. it errs
    # towards NOT creating a duplicate.
    n_open=$(db "SELECT COUNT(*) FROM tasks WHERE from_template_id=${tmpl} AND status NOT IN ('done','cancelled');" 2>/dev/null || echo 1)
    if [[ "${n_open:-1}" != "0" ]]; then
      _hb_log "[materializer] DIVE-${tmpl} due but an open instance exists — skip"
      continue
    fi
    # Clone the template into a standard todo and stamp the template, both in a
    # single db call.
    if ! db "INSERT INTO tasks (title, body, priority, assignee, created_by, kind, from_template_id, fresh)
           SELECT title, body, priority, assignee, created_by, 'standard', id, fresh FROM tasks WHERE id=${tmpl};
           UPDATE tasks SET last_fired_at=datetime('now') WHERE id=${tmpl};" >/dev/null 2>&1; then
      _hb_log "[materializer] DIVE-${tmpl} insert failed"
      continue
    fi
    made=$((made + 1))
    _hb_log "[materializer] DIVE-${tmpl} fired -> new standard todo"
  done < <(db "SELECT id, schedule, COALESCE(last_fired_at,'') FROM tasks WHERE kind='recurring' AND schedule IS NOT NULL;" 2>/dev/null | tr '|' '\t')

  _hb_log "[materializer] pass done — ${made} materialized"
  return 0
}

# Run one heartbeat pass over every heartbeat-enabled agent in the registry.
# Requires root. Recurring templates are materialized first; then, per agent and
# in this order:
#   1. reclaim/cancel stuck in_progress tasks (every tick, not throttled);
#   2. skip if the cadence hasn't elapsed, unless an urgent/high todo was
#      created since the last wake;
#   3. skip while any task of this agent is still in_progress;
#   4. pick the single highest-priority standard todo (skip if none);
#   5. apply the same-account spread;
#   6. defer if the agent reads as active or blocked;
#   7. wake it against that task and record the run.
# Finishes by reporting the tick's counters through `ok` (prose summary plus a
# jq object: checked/woke/reclaimed/reaped/starved and a `skipped` breakdown).
cmd_heartbeat_tick() {
  require_root "heartbeat tick"
  tasks_db_init
  # One registry snapshot and one timestamp are used for the whole tick.
  local reg now; reg=$(registry_read); now=$(date +%s)
  local checked=0 woke=0 reaped=0 reclaimed=0 starved=0 sk_notdue=0 sk_busy=0 sk_nowork=0 sk_fail=0 sk_spread=0 sk_active=0
  # DIVE-138: materialize due recurring templates FIRST so a freshly-cloned todo
  # is eligible for the wake loop below this same tick. Isolated — a failure here
  # must never abort the wake loop.
  _hb_materialize_recurring "$now" || _hb_log "[materializer] pass errored (non-fatal)"
  # Accounts already woken during THIS tick. The $reg snapshot is read once up
  # front, so a wake we do mid-loop isn't visible to later iterations via the
  # registry — this map carries that within-tick fact so two same-account agents
  # can't both wake on one tick.
  local -A in_tick_woke=()
  local name
  # Process oldest-waiting first (smallest lastRunAt). When two same-account
  # agents contend for the one wake slot, the one that has waited longest wins,
  # so neither can be starved by a fresher sibling repeatedly taking the slot.
  while IFS= read -r name; do
    [[ -n "$name" ]] || continue
    checked=$((checked + 1))
    # Per-agent heartbeat settings from the snapshot: cadence in minutes
    # (defaults to $_HB_DEFAULT_EVERY), last wake epoch (defaults to 0), and the
    # fresh flag (defaults to true when the key is absent).
    local everyMin lastRun fresh
    everyMin=$(jq -r --arg n "$name" '.agents[$n].heartbeat.everyMin // '"$_HB_DEFAULT_EVERY" <<<"$reg")
    lastRun=$(jq -r --arg n "$name"  '.agents[$n].heartbeat.lastRunAt // 0' <<<"$reg")
    fresh=$(jq -r --arg n "$name"    '(.agents[$n].heartbeat | if has("fresh") then .fresh else true end)' <<<"$reg")

    # Unwedge stuck in_progress first, every tick (NOT gated by everyMin): an
    # orphaned/stalled/runaway task must clear promptly regardless of the wake
    # throttle, or it blocks the agent's whole queue (the busy-guard below).
    local n_reclaimed n_cancelled
    # _hb_reclaim prints "<reclaimed> <cancelled>"; a failed read leaves both
    # empty and the :-0 defaults below absorb that.
    read -r n_reclaimed n_cancelled < <(_hb_reclaim "$name" "$everyMin") || true
    reclaimed=$((reclaimed + ${n_reclaimed:-0})); reaped=$((reaped + ${n_cancelled:-0}))

    if (( now - lastRun < everyMin * 60 )); then
      # Wake-on-enqueue: don't make an urgent/high task wait out the full cadence.
      # If one landed in this agent's queue since its last wake, allow an early
      # wake this tick (still gated by busy/spread/idle below). created_at is UTC
      # text; strftime('%s') makes it an epoch comparable to lastRun.
      local hot
      hot=$(db "SELECT COUNT(*) FROM tasks
                WHERE assignee=$(sqlq "$name") AND status='todo' AND kind='standard'
                  AND priority IN ('urgent','high')
                  AND CAST(strftime('%s', created_at) AS INTEGER) > ${lastRun};" 2>/dev/null || echo 0)
      if [[ "${hot:-0}" != "0" ]]; then
        _hb_log "[$name] early wake — ${hot} urgent/high task(s) queued since last wake"
      else
        # Minutes remaining are rounded up (+59 before the integer divide).
        sk_notdue=$((sk_notdue + 1)); _hb_log "[$name] not due ($(( (lastRun + everyMin*60 - now + 59) / 60 ))m left)"; continue
      fi
    fi
    # Busy-guard: counted AFTER the reclaim pass above, so only tasks that
    # survived it still block the wake.
    local inprog
    inprog=$(db "SELECT COUNT(*) FROM tasks WHERE assignee=$(sqlq "$name") AND status='in_progress';" 2>/dev/null || echo 0)
    if [[ "${inprog:-0}" != "0" ]]; then
      sk_busy=$((sk_busy + 1)); _hb_log "[$name] busy — $inprog in_progress, skip"; continue
    fi
    # Pick the single highest-priority todo and wake the agent against that exact
    # id — the /goal condition needs a concrete DIVE-N to evaluate reliably.
    # Ties within a priority go to the lowest id.
    local task_id
    task_id=$(db "SELECT id FROM tasks WHERE assignee=$(sqlq "$name") AND status='todo' AND kind='standard'
                  ORDER BY CASE priority WHEN 'urgent' THEN 0 WHEN 'high' THEN 1 WHEN 'medium' THEN 2 ELSE 3 END, id
                  LIMIT 1;" 2>/dev/null || echo "")
    if [[ -z "$task_id" ]]; then
      sk_nowork=$((sk_nowork + 1)); _hb_log "[$name] no todo — stay idle"; continue
    fi

    # --- Same-account spread ---------------------------------------------------
    # Never start two agents that share an Anthropic account close together: a
    # simultaneous session start bursts the shared account and trips a 429. The
    # account's most-recent wake is derived from existing lastRunAt values (plus
    # any wake done earlier this tick) — no extra state. Require an even slice of
    # the cadence between same-account wakes: gap = everyMin / agents-on-account
    # (2 agents @ 60m -> 30m apart, 3 -> 20m), and it self-heals as agents join.
    # Single-agent accounts are never deferred. A deferred agent is left due and
    # retried next tick, sliding later until it clears the gap, so the phases
    # converge to even spacing on their own. Agents with no authProfile get a
    # per-name sentinel account, so they never contend with anyone.
    local acct acct_count
    acct=$(jq -r --arg n "$name" '.agents[$n].authProfile // ("@self:" + $n)' <<<"$reg")
    acct_count=$(jq -r --arg a "$acct" '
      [.agents | to_entries[]
       | select(.value.heartbeat.enabled == true)
       | (.value.authProfile // ("@self:" + .key))
       | select(. == $a)] | length' <<<"$reg")
    if (( acct_count > 1 )); then
      local acct_last gap
      # Latest lastRunAt among the OTHER enabled agents on this account.
      acct_last=$(jq -r --arg a "$acct" --arg n "$name" '
        [.agents | to_entries[]
         | select(.value.heartbeat.enabled == true)
         | select(.key != $n)
         | select((.value.authProfile // ("@self:" + .key)) == $a)
         | (.value.heartbeat.lastRunAt // 0)] | max // 0' <<<"$reg")
      # NOTE(review): the associative subscript is evaluated inside (( )), so the
      # account name passes through arithmetic-context expansion — presumably
      # authProfile values are plain identifiers; confirm.
      if [[ -n "${in_tick_woke[$acct]:-}" ]] && (( in_tick_woke[$acct] > acct_last )); then
        acct_last=${in_tick_woke[$acct]}
      fi
      gap=$(( everyMin * 60 / acct_count ))
      if (( now - acct_last < gap )); then
        sk_spread=$((sk_spread + 1))
        _hb_log "[$name] spread-defer — account '$acct' (${acct_count} agents) last woke $(( (now - acct_last) / 60 ))m ago, need a $(( gap / 60 ))m gap; retry next tick"
        continue
      fi
    fi

    # No-clobber: never /clear + nudge an agent that's mid-turn, in a live
    # conversation (e.g. the orchestrator talking to a human), or blocked on a
    # prompt. The busy-guard above only catches an open *task*; this catches
    # working/interactive/blocked state with no task — a fresh nudge would /clear
    # it out from under the work or bury a pending permission prompt. Defer on a
    # confident "active" (rc 1) or "blocked" (rc 3); unknown (rc 2 — no signal)
    # falls through so the wake can still (re)start a stopped session.
    local idle_rc=0; _hb_agent_idle "$name" || idle_rc=$?
    if (( idle_rc == 3 )); then
      sk_active=$((sk_active + 1))
      _hb_log "[$name] WARN: blocked (${_HB_IDLE_REASON:-input needed}) — surfacing, defer nudge this tick (needs attention, not reclaim)"
      continue
    fi
    if (( idle_rc == 1 )); then
      sk_active=$((sk_active + 1)); _hb_log "[$name] active (mid-turn/conversation) — defer nudge this tick"; continue
    fi

    # Per-task fresh override (DIVE-138): a materialized recurring instance can
    # carry fresh=1 to force a clean /clear before its turn, regardless of the
    # agent-level heartbeat fresh setting. NULL/0 falls back to the agent default.
    local eff_fresh="$fresh" task_fresh
    task_fresh=$(db "SELECT COALESCE(fresh,'') FROM tasks WHERE id=${task_id};" 2>/dev/null || echo "")
    [[ "$task_fresh" == "1" ]] && eff_fresh="true"
    _hb_log "[$name] due + todo DIVE-${task_id} — waking (fresh=${eff_fresh})"
    if _hb_wake "$name" "$eff_fresh" "$task_id"; then
      in_tick_woke[$acct]=$now   # claim the account's slot for the rest of this tick
      # _hb_mark_run's stdout is taken as this task's nudge count (presumably it
      # also stamps lastRunAt — confirm in _hb_mark_run).
      # NOTE(review): this assignment is unguarded; if the script's `set -e` is
      # in effect here, a failing with_registry_lock ends the tick at this point,
      # skipping the remaining agents and the final `ok` — confirm that is intended.
      local nudge_n
      nudge_n=$(with_registry_lock _hb_mark_run "$name" "$now" "$task_id")
      woke=$((woke + 1)); _hb_log "[$name] nudged (/goal DIVE-${task_id}, nudge #${nudge_n:-?})"
      # Nudged repeatedly but the task never left todo → it's being starved
      # (e.g. listen-loop watchdog yanking the agent before `task start` runs).
      # Surface it instead of silently re-nudging every tick forever.
      if [[ "${nudge_n:-0}" =~ ^[0-9]+$ ]] && (( nudge_n >= _HB_STARVE_AFTER )); then
        starved=$((starved + 1))
        _hb_log "[$name] WARN: DIVE-${task_id} nudged ${nudge_n}x but still todo (never started) — possible listen-loop starvation; check the agent's task-claim path"
      fi
    else
      # No run is recorded on failure, so the agent stays due for the next tick.
      sk_fail=$((sk_fail + 1)); _hb_log "[$name] wake failed — will retry next tick"
    fi
  done < <(jq -r '.agents | to_entries
                  | map(select(.value.heartbeat.enabled == true))
                  | sort_by(.value.heartbeat.lastRunAt // 0)
                  | .[].key' <<<"$reg")

  # Counters travel as --arg strings and are converted back to numbers in the filter.
  ok "heartbeat tick: woke ${woke} / reclaimed ${reclaimed} / reaped ${reaped} / starved ${starved} / spread-deferred ${sk_spread} / active-deferred ${sk_active} / checked ${checked}" \
     '{checked:($c|tonumber), woke:($w|tonumber), reclaimed:($rc|tonumber), reaped:($r|tonumber), starved:($st|tonumber),
       skipped:{notDue:($nd|tonumber), busy:($b|tonumber), noWork:($nw|tonumber), spread:($sp|tonumber), active:($ac|tonumber), failed:($sf|tonumber)}}' \
     --arg c "$checked" --arg w "$woke" --arg rc "$reclaimed" --arg r "$reaped" --arg st "$starved" --arg nd "$sk_notdue" --arg b "$sk_busy" --arg nw "$sk_nowork" --arg sp "$sk_spread" --arg ac "$sk_active" --arg sf "$sk_fail"
}

# -------- self-update (fetch installer + --upgrade, then restart agents) --------
#
# `5dive self-update` is the on-demand counterpart to the managed nightly
# soft-update, for OSS self-hosted boxes that have no scheduler of their own.
# It does two things:
#
#   1. Fetches install.sh and runs `--upgrade` — refreshes the 5dive CLI,
#      5dive-agent-start, hooks, skills, the systemd template, and the plugins
#      (via 5dive-refresh-plugins.sh). This reuses the same installer that
#      `uninstall` shells out to, so there's a single source of truth for
#      "what gets updated" rather than a second copy that drifts.
#
#   2. Restarts every running agent so the refreshed plugins/CLIs actually
#      load. A live agent keeps its old plugin (and shared CLI binary) in
#      memory until it restarts — that's the usual reason a plugin "still
#      shows the old version" after an upgrade.
#
# The agent AI CLIs themselves (claude/codex/grok/antigravity) self-update via
# their own vendor autoupdaters; the restart in step 2 is what loads the latest
# shared binary into each agent. Managed boxes have their own scheduler so they
# don't need this, but running it there is harmless — `--upgrade` and the
# restart loop are both idempotent.

# json_array <items...> — print the arguments as one compact JSON array of
# strings. With no arguments it prints "[]" directly: feeding printf an empty
# argument list would still produce one blank line, i.e. a stray "" element.
json_array() {
  (( $# > 0 )) || { echo '[]'; return 0; }
  # One raw input line per item, collected into a single array.
  printf '%s\n' "$@" | jq -cRn '[inputs]'
}

# cmd_self_update — fetch install.sh, run it with --upgrade, then restart every
# running 5dive-agent@ unit so refreshed plugins/CLIs are loaded.
# Arguments: none (anything else is a usage error).
# Output:    `ok` envelope with {restarted, restarted_count, failed}; progress
#            and installer chatter go to stderr.
# Errors:    E_NOT_FOUND when curl is missing; E_GENERIC when the temp file,
#            the fetch, or the upgrade fails.
cmd_self_update() {
  [[ $# -eq 0 ]] || fail "$E_USAGE" "self-update takes no arguments"
  command -v curl >/dev/null 2>&1 || fail "$E_NOT_FOUND" "curl is required for 5dive self-update"

  local installer
  installer=$(mktemp) || fail "$E_GENERIC" "failed to create temp file"

  # The temp installer is removed explicitly on every path. A RETURN trap is not
  # enough here: it only runs when the function returns, so it is skipped
  # whenever `fail` terminates the process (assumed — `fail` is defined
  # elsewhere), which leaked the file on each failed fetch/upgrade.
  step "Fetching installer"
  if ! curl -fsSL "https://raw.githubusercontent.com/5dive-com/5dive/main/install.sh" -o "$installer"; then
    rm -f -- "$installer"
    fail "$E_GENERIC" "failed to fetch installer"
  fi

  step "Upgrading 5dive CLI + plugins"
  # Send installer chatter to stderr so JSON stdout stays parseable. Capture the
  # status instead of failing inline so the temp file is gone before `fail` runs.
  local upgrade_rc=0
  bash "$installer" --upgrade >&2 || upgrade_rc=$?
  rm -f -- "$installer"
  (( upgrade_rc == 0 )) || fail "$E_GENERIC" "upgrade failed"

  # Restart running agents so the refreshed plugins/CLIs load. Best-effort per
  # unit — one failed restart shouldn't abort the rest.
  local -a restarted=() failed=()
  local unit name
  if command -v systemctl >/dev/null 2>&1; then
    while read -r unit; do
      [[ -z "$unit" ]] && continue
      # "5dive-agent@<name>.service" -> "<name>"
      name="${unit#5dive-agent@}"; name="${name%.service}"
      if systemctl restart "$unit" 2>/dev/null; then
        step "restarted $name"
        restarted+=("$name")
      else
        warn "failed to restart agent '$name'"
        failed+=("$name")
      fi
    done < <(systemctl list-units '5dive-agent@*' --state=running --no-legend --plain 2>/dev/null | awk '{print $1}')
  fi

  local r f prose
  r=$(json_array "${restarted[@]}")
  f=$(json_array "${failed[@]}")
  prose="self-update complete — ${#restarted[@]} agent(s) restarted"
  (( ${#failed[@]} )) && prose+=", ${#failed[@]} failed to restart"
  ok "$prose" \
     '{restarted:$r, restarted_count:($r|length), failed:$f}' \
     --argjson r "$r" --argjson f "$f"
}

# version_lt A B — succeeds only when version A sorts strictly before version B
# under `sort -V` (version-aware ordering); identical versions are never "older".
version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  # A is older exactly when it comes out first in version order.
  [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" == "$1" ]]
}

# Staleness window in seconds (36 hours). cmd_update_check reports a box that is
# behind the published release as "stale" once its last recorded nightly
# soft-update finished longer ago than this. One nightly soft-update (every 24h)
# should close the gap, so anything past ~1.5 days means the auto-update isn't
# keeping up.
readonly UPDATE_STALE_AFTER_SECS=$((36 * 3600))

# cmd_update_check — read-only (no root, no mutation) version probe for the
# dashboard maintenance tile. Compares the installed CLI to the published
# release and reads the last nightly soft-update result, then reports whether
# the box is GENUINELY stale (behind AND the auto-update isn't catching up) vs
# merely a release or two behind with a healthy nightly that'll close the gap.
cmd_update_check() {
  # Arguments: none. Output: `ok` envelope with
  #   {current, latest, behind, stale, lastUpdateOk, lastUpdateAt}
  # where lastUpdateOk/lastUpdateAt are null when the soft-update log is absent
  # or carries no matching marker. Fails with E_NOT_FOUND when curl is missing
  # and E_GENERIC when the published version cannot be determined.
  [[ $# -eq 0 ]] || fail "$E_USAGE" "update --check takes no arguments"
  command -v curl >/dev/null 2>&1 || fail "$E_NOT_FOUND" "curl is required for update --check"

  # Published version = the FIVE_VERSION line of the CLI on main. `|| true`
  # covers pipefail (curl error, or grep -m1 closing the pipe early); the
  # emptiness check below is the real gate.
  local current="$FIVE_VERSION" latest
  latest=$(curl -fsSL "https://raw.githubusercontent.com/5dive-com/5dive/main/5dive" 2>/dev/null \
    | grep -m1 -oP '(?<=^readonly FIVE_VERSION=")[^"]+') \
    || true
  [[ -n "$latest" ]] || fail "$E_GENERIC" "could not determine the latest published version"

  local behind=false
  version_lt "$current" "$latest" && behind=true

  # Inspect the last managed nightly soft-update run (managed boxes log to
  # /tmp/claude-soft-updates.log). Best-effort: absent log → unknown.
  local log="/tmp/claude-soft-updates.log"
  local last_ok_json="null" last_at_json="null" last_epoch=""
  if [[ -r "$log" ]]; then
    local start_line
    # `|| true`: with pipefail a grep that finds no marker makes the whole
    # pipeline (and so this assignment) fail, which under `set -e` would abort
    # the command instead of reporting "unknown".
    start_line=$(grep -n "soft updates start" "$log" | tail -1 | cut -d: -f1) || true
    if [[ -n "$start_line" ]]; then
      # Feed grep through process substitution rather than a pipeline: grep -q
      # exits on the first match, and with pipefail the resulting SIGPIPE on
      # `tail` would turn a positive match into a non-zero pipeline status —
      # i.e. a failed run misreported as OK.
      if grep -q "CLI upgrade via install.5dive.com failed" < <(tail -n "+${start_line}" "$log"); then
        last_ok_json="false"
      else
        last_ok_json="true"
      fi
    fi
    local last_at
    # Timestamp of the most recent "soft updates done" line; same `|| true`
    # reasoning as above when no run has completed yet.
    last_at=$(grep -oE "[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:+-]+ soft updates done" "$log" \
      | tail -1 | grep -oE "^[^ ]+") || true
    if [[ -n "$last_at" ]]; then
      # Safe to quote by hand: the regex above only admits digits and ":+-T".
      last_at_json="\"$last_at\""
      last_epoch=$(date -d "$last_at" +%s 2>/dev/null || echo "")
    fi
  fi

  # "stale" = behind AND the nightly auto-update isn't closing the gap: it
  # failed, never ran on record, or hasn't run inside the staleness window.
  local stale=false now
  now=$(date +%s)
  if [[ "$behind" == true ]]; then
    if [[ "$last_ok_json" == "false" || -z "$last_epoch" ]]; then
      stale=true
    elif (( now - last_epoch > UPDATE_STALE_AFTER_SECS )); then
      stale=true
    fi
  fi

  local prose
  if [[ "$behind" == true ]]; then
    prose="CLI $current is behind (latest $latest)"
    [[ "$stale" == true ]] && prose+=" — stale, update recommended"
  else
    prose="CLI $current is up to date"
  fi

  ok "$prose" \
     '{current:$cur, latest:$lat, behind:$beh, stale:$stl, lastUpdateOk:$luo, lastUpdateAt:$lua}' \
     --arg cur "$current" --arg lat "$latest" \
     --argjson beh "$behind" --argjson stl "$stale" \
     --argjson luo "$last_ok_json" --argjson lua "$last_at_json"
}
# -------- top-level dispatch --------

usage() {
  cat <<USAGE
5dive — 5dive agent manager

Global flags:
  --json                              Emit machine-readable output on stdout
                                      ({ok:true,data:...} | {ok:false,error:{...}}).
                                      Works on any subcommand below.

Maintenance:
  5dive --version                                    # print version
  5dive init                                         # interactive first-run wizard
  5dive self-update                                  # update the CLI + plugins, then restart agents
                                                     # (alias: 5dive update). On-demand upgrade for
                                                     # self-hosted boxes; managed boxes update nightly.
  5dive update --check                               # read-only: is the CLI behind/stale? (no root)
  5dive uninstall [--purge] [--yes]                  # remove 5dive (--purge also wipes state + user)

Live view:
  5dive watch [--interval=N]                         # htop-style live view of every agent;
                                                     # ↑↓ select, ↵ attach, r refresh, q quit.

Compose (declarative agents via 5dive.yaml):
  5dive up   [-f file]                               # bring up agents declared in spec (idempotent)
  5dive down [-f file]                               # tear down declared agents
  5dive ps   [-f file]                               # show declared agents' state
  5dive export [-o file]                             # dump the live fleet to a v2 5dive.yaml
  5dive team import <slug|path> [--auth-profile=]    # provision a whole company template in one call
  5dive team ls                                      # list bundled team templates
  # Default file: 5dive.yaml or 5dive.yml in cwd.
  # Schema (v1) — see 'agents' map keys: type, channels, telegram_token,
  # discord_token, workdir, skills, no_skills, defer_auth, isolation,
  # auth_profile, provider, api_key. Strings expand "\${ENV_VAR}" from the
  # process env (missing vars fail loudly).

Agents:
  5dive agent list
  5dive agent info <name>                            # type, CLI version, selected model, channel + state
  5dive agent types
  5dive agent create <name> --type=<type> [--channels=none|telegram|discord]
                            [--telegram-token=<bot-token>] [--discord-token=<token>]
                            [--workdir=<path>] [--auth-profile=<name>]
                            [--provider=<id> --api-key=<key|->]
                            [--with-skills=<spec>[,<spec>...]] [--no-skills]
                            [--no-team-bot] [--defer-auth]
                            # When the box has a shared team bot configured
                            # (team-bot shared persists it), new no-bot agents
                            # auto-attach: own forum topic, send-only on the
                            # shared token. --no-team-bot opts the agent out.
                            # spec: <id> (defaults to ${DEFAULT_SKILL_SOURCE}) or <owner/repo>:<id>
                            # provider: hermes/openclaw only — BYO API key for one of
                            # ${!BYO_PROVIDER_LABEL[*]}. Mutually exclusive with --defer-auth.
                            # When called by another agent on a claude-typed agent,
                            # defaults to --with-skills=5dive-cli so the new agent
                            # inherits inter-agent comms knowledge. Use --no-skills
                            # to opt out. --defer-auth skips the auth gate so the
                            # agent can be created before credentials exist; useful
                            # when the agent's own first-run UI handles sign-in.
  5dive agent clone <src> <dst> [--channels=...] [--telegram-token=...]
                                [--discord-token=...] [--workdir=...]
  5dive agent start <name>
  5dive agent stop <name>
  5dive agent restart <name>
  5dive agent rm <name>
  5dive agent config <name> set channels=<none|telegram|discord>
  5dive agent config <name> set workdir=<path>       # tmux cwd; "default" clears override
  5dive agent config <name> set auth-profile=<name>  # swap profile; "default" clears override
  5dive agent config <name> set model=<id>           # runtime model (claude/codex/grok/antigravity)
  5dive agent config <name> set effort=<low|medium|high|xhigh|max>
                                                     # claude only — reasoning effort (effortLevel);
                                                     # xhigh/max are Opus-tier (Sonnet caps at high)
  5dive agent config <name> set telegram.token=<bot-token>
                                                     # combine with channels=telegram to attach a Telegram bot
                                                     # post-create (also runs install_channel_for_agent so the
                                                     # claude plugin / openclaw channels.add / hermes ~/.hermes/.env
                                                     # land in step with the registry).
  5dive agent config <name> set discord.token=<token>
  5dive agent config <name> set telegram.home-channel=<chat-id>
                                                     # hermes only — chat id the gateway posts unsolicited
                                                     # messages to; ignored by claude/openclaw.
  5dive agent config <name> set telegram.allowed-users=<id1,id2,...>
                                                     # comma-separated numeric user ids; seeds
                                                     # access.json/openclaw.allowFrom/hermes env so the bot
                                                     # forwards DMs from these users without a pair-code gate.
  5dive agent pair <name> [--code=<code> | --user-id=<id> [--chat-id=<id>]]
                                                     # telegram/discord pairing. --code accepts the bot reply or
                                                     # bare pairing code. --user-id seeds access.json directly
                                                     # (auto-detected via telegram-discover; chat_id defaults
                                                     # to user_id for private DMs).
  5dive agent telegram-discover {--token=<bot-token>|--agent=<name>} [--poll-secs=N]
                                                     # long-polls Telegram getUpdates (timeout N, max 90s).
                                                     # --agent reads the token from the agent's connector env
                                                     # file (so the dashboard can discover without handling the
                                                     # token client-side). On first inbound message returns
                                                     # {found:true, userId, chatId, username, firstName};
                                                     # otherwise {found:false} — callers re-poll until found.
  5dive agent telegram-getme --token=<bot-token>     # fast getMe lookup; returns {botId, username, firstName}.
  5dive agent telegram-info <name> [--refresh]       # name-based getMe; reads token from /etc/5dive/connectors,
                                                     # caches botUsername in the registry. Used by the dashboard
                                                     # to backfill @handles for agents created before the
                                                     # botUsername-on-create change. --refresh forces re-fetch.
  5dive agent telegram-access get <name>             # read access.json: who can DM the bot, group settings.
  5dive agent telegram-access set <name>             # write access.json from {dmPolicy,allowFrom,groups} JSON
                                                     # piped on stdin. Plugin re-reads per-message — no restart.
  5dive agent telegram-pending-ignore <name> <code>  # drop a pending pairing without approving (dashboard inbox).
  5dive agent telegram-resolve-handle <name> <@handle>
                                                     # getChat for @handle via the agent's bot token; returns
                                                     # {id,isBot,displayName} so the dashboard can add bots by
                                                     # handle instead of numeric id.
  5dive agent <name> tui                             # attach your terminal to the agent's tmux session
  5dive agent logs <name> [--follow] [--lines=N] [--tmux]
  5dive agent send <name> <text...> [--from=<sender>] [--raw]
                                    [--reply-to-chat=<id> [--reply-to-msg=<id>]]
                                                     # inject a message (tmux send-keys + Enter).
                                                     # When called from another agent, auto-wraps as
                                                     # [5dive-msg from=<caller> id=<id>] so the
                                                     # receiver sees who's pinging it. --raw skips wrapping.
                                                     # --reply-to-chat adds a hint telling the receiver
                                                     # to reply directly in that Telegram/Discord chat
                                                     # via its own bot (see SKILL.md).
  5dive agent ask <name> <text...> [--from=<sender>] [--timeout=120] [--idle-secs=5] [--poll-secs=2]
                                   [--reply-to-chat=<id> [--reply-to-msg=<id>]]
                                                     # synchronous send + wait. Polls scrollback after
                                                     # the marker line until it stops growing for
                                                     # --idle-secs, then prints the reply body.
  5dive agent stats <name>                           # state, restart count, last exit
  5dive agent install <type>                         # install the CLI for a type if missing
  5dive agent set-account <agent> <account|default>  # rebind to a named account; "default" clears

Default workdir: ${DEFAULT_WORKDIR}

Accounts (a named auth profile — group sign-ins so multiple agents share one login):
  5dive account list                                   # name, types signed in, # agents bound
  5dive account show <name>                            # detail incl. env keys present
  5dive account usage                                  # per-account 5h/7d limit usage (dashboard dots + /usage)
  5dive account add <name>                             # create empty account; sign in next
  5dive account login <name> --type=<type>             # interactive TTY login into an account
  5dive account rename <old> <new>                     # repoints all bound agents + restarts them
  5dive account remove <name>                          # refuses if any agents still bound

Auth (lower-level; the dashboard uses these — prefer 'account' for human-driven flows):
  5dive agent auth status [--probe] [--type=<type>]    # real --print probe reveals stale creds
  5dive agent auth login <type>                        # interactive TTY (hands off this process)
  5dive agent auth set <type> --api-key=<key|-> [--auth-profile=<name>] [--provider=<id>]
                                                       # --provider=<id> required for hermes/openclaw;
                                                       # id is one of: ${!BYO_PROVIDER_LABEL[*]}
  5dive agent auth start <type> [--auth-profile=<name>]      # non-TTY device-code: returns session id
  5dive agent auth poll <session_id>                         # {state, url, error}
  5dive agent auth submit <session_id> --code=<callback>     # paste the claude callback code
  5dive agent auth cancel <session_id>

Tasks (shared queue, sqlite — any agent, no sudo):
  5dive task add <title...> [--priority=low|medium|high|urgent] [--assignee=<agent>] [--parent=<id>]
  5dive task ls [--mine] [--status=<s>] [--all]      # open work, priority-ordered
  5dive task show|start|done|cancel|rm <id|DIVE-N>
  5dive task assign <id|DIVE-N> <agent>
  5dive task block <id|DIVE-N> --by=<id|DIVE-N>
  # full surface: 5dive task --help

Org chart (who reports to whom):
  5dive org set <agent> --manager=<agent> [--role=<text>] [--title=<text>]
  5dive org tree | show <agent> | ls | rm <agent>
  # full surface: 5dive org --help

Heartbeat (wake an agent only when it has queued tasks, one per tick):
  5dive heartbeat on  <name> [--every=<dur>] [--no-fresh]   # enrol (default 30m, /clear before each task)
  5dive heartbeat off <name>
  5dive heartbeat ls                                        # enrolled agents + next-wake + queued count
  5dive heartbeat tick                                      # cron driver (root); wakes due agents that have work
  # full surface: 5dive heartbeat --help

Health:
  5dive doctor [--repair] [--category=deps|types|auth|registry|shelld]
    Walks deps (tmux/jq/bun/python3/nvm/node/npm), type bins, live auth
    probes, registry integrity, and shelld reachability. --repair attempts
    reversible fixes (apt installs, type installer recipes, bun, shelld
    restart, registry reseed). Output envelope always {ok:true,data:{...}};
    branch on data.summary.errors in CI.

Types: ${!TYPE_BIN[*]}

Exit codes (also surfaced as error.code in --json mode):
  0 ok       2 usage       3 validation   4 not_found    5 conflict
  6 auth_required  7 not_installed  8 not_running  9 pairing
  10 permission  11 timeout         1 generic
USAGE
}

# Entry point: strips the global --json flag, answers --version inline, then
# dispatches on the first remaining argument (and nested subcommands) to the
# matching cmd_* handler.
# Globals written: JSON_MODE (when --json is present), AUDIT_CMD / AUDIT_ARGS
#   (set for the commands the EXIT trap should record in the audit log).
# Arguments: the raw CLI argv.
# Exits with E_USAGE (via usage/fail) on a missing or unknown command.
main() {
  # Global --json: strip every occurrence before dispatch so each subcommand
  # gets the same arg shape regardless of where the flag was placed.
  local -a rest=()
  local a
  for a in "$@"; do
    if [[ "$a" == "--json" ]]; then
      JSON_MODE=1
      continue
    fi
    rest+=("$a")
  done
  # The ${rest[@]+...} guard keeps `set -u` happy on bash versions that treat
  # an empty array expansion as an unbound variable.
  set -- "${rest[@]+"${rest[@]}"}"

  [[ $# -gt 0 ]] || { usage; exit "$E_USAGE"; }
  local top="$1"; shift
  # Handle --version / -v / version before the dispatch table so it stays a
  # zero-dependency one-liner check (reviewers grep for it first).
  case "$top" in
    -v|--version|version)
      if [[ "${JSON_MODE:-0}" == 1 ]]; then
        printf '{"ok":true,"data":{"version":"%s"}}\n' "$FIVE_VERSION"
      else
        echo "5dive $FIVE_VERSION"
      fi
      exit 0
      ;;
  esac
  # Mutating commands run under with_registry_lock so adduser/registry_write
  # can't race across concurrent dashboard clicks. Read-only commands (list,
  # logs, stats, types, auth status/poll) bypass the lock and the audit log.
  case "$top" in
    agent)
      [[ $# -gt 0 ]] || { usage; exit "$E_USAGE"; }
      local sub="$1"; shift
      case "$sub" in
        list)    cmd_list "$@" ;;
        info)    cmd_info "$@" ;;
        types)   cmd_types "$@" ;;
        logs)    cmd_logs "$@" ;;
        send)    cmd_send "$@" ;;
        ask)     cmd_ask "$@" ;;
        stats)   cmd_stats "$@" ;;
        create)
          AUDIT_CMD="agent create"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_create "$@" ;;
        clone)
          AUDIT_CMD="agent clone"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_clone "$@" ;;
        start)
          AUDIT_CMD="agent start"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_start "$@" ;;
        stop)
          AUDIT_CMD="agent stop"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_stop "$@" ;;
        restart)
          AUDIT_CMD="agent restart"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_restart "$@" ;;
        rm)
          AUDIT_CMD="agent rm"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_rm "$@" ;;
        config)
          AUDIT_CMD="agent config"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_config "$@" ;;
        pair)
          AUDIT_CMD="agent pair"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_pair "$@" ;;
        telegram-discover)
          # Read-only Telegram getUpdates poll — no registry mutation, no
          # state changes. Bot token would clutter the audit log if it were
          # passed verbatim, so skip auditing too (the post-pair allowlist
          # write is auditable on its own through cmd_pair).
          cmd_telegram_discover "$@" ;;
        telegram-getme)
          # Read-only bot identity lookup. Same audit/lock rationale as
          # telegram-discover.
          cmd_telegram_getme "$@" ;;
        telegram-info)
          # Mostly read; cache miss takes the registry lock internally to
          # write back the resolved botUsername. No audit — backfill is
          # idempotent and not worth log noise.
          cmd_telegram_info "$@" ;;
        telegram-access)
          [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive agent telegram-access get|set <name>"
          local accesscmd="$1"; shift
          case "$accesscmd" in
            get) cmd_telegram_access_get "$@" ;;  # read-only, no audit
            set)
              AUDIT_CMD="agent telegram-access set"; AUDIT_ARGS=("$@")
              cmd_telegram_access_set "$@" ;;
            *) fail "$E_USAGE" "unknown telegram-access command: $accesscmd" ;;
          esac ;;
        telegram-pending-ignore)
          AUDIT_CMD="agent telegram-pending-ignore"; AUDIT_ARGS=("$@")
          cmd_telegram_pending_ignore "$@" ;;
        telegram-resolve-handle)
          # Read-only getChat lookup against Telegram. Bot token stays
          # server-side; skip audit so handle probes don't spam the log.
          cmd_telegram_resolve_handle "$@" ;;
        topic)
          # DIVE-159 team-bot: get/set the agent's forum-topic mapping in the
          # registry. get is read-only; set takes the registry lock internally.
          [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive agent topic get|set <name> [--thread-id=N --chat-id=N]"
          local topiccmd="$1"; shift
          case "$topiccmd" in
            get) cmd_agent_topic_get "$@" ;;  # read-only, no audit
            set)
              AUDIT_CMD="agent topic set"; AUDIT_ARGS=("$@")
              with_registry_lock cmd_agent_topic_set "$@" ;;
            *) fail "$E_USAGE" "unknown topic command: $topiccmd" ;;
          esac ;;
        team-bot)
          # DIVE-159: provision/inspect the customer's team group (personal-bot
          # model — a forum topic per agent). status is read-only; provision
          # writes access.json + registry teamTopic (registry lock taken inside).
          AUDIT_CMD="agent team-bot"; AUDIT_ARGS=("$@")
          cmd_agent_team_bot "$@" ;;
        install)
          AUDIT_CMD="agent install"; AUDIT_ARGS=("$@")
          cmd_install "$@" ;;   # no registry mutation; auditable install recipe
        set-account)
          AUDIT_CMD="agent set-account"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_agent_set_account "$@" ;;
        rotation)
          [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive agent rotation get|set|rotate|cooldown|clear-cooldown <agent> [...]"
          local rotcmd="$1"; shift
          case "$rotcmd" in
            get) cmd_agent_rotation_get "$@" ;;  # read-only, no lock/audit
            set)
              AUDIT_CMD="agent rotation set"; AUDIT_ARGS=("$@")
              with_registry_lock cmd_agent_rotation_set "$@" ;;
            rotate)
              AUDIT_CMD="agent rotation rotate"; AUDIT_ARGS=("$@")
              with_registry_lock cmd_agent_rotation_rotate "$@" ;;
            cooldown)
              AUDIT_CMD="agent rotation cooldown"; AUDIT_ARGS=("$@")
              with_registry_lock cmd_agent_rotation_cooldown "$@" ;;
            clear-cooldown)
              AUDIT_CMD="agent rotation clear-cooldown"; AUDIT_ARGS=("$@")
              with_registry_lock cmd_agent_rotation_clear_cooldown "$@" ;;
            *) fail "$E_USAGE" "unknown rotation command: $rotcmd (get|set|rotate|cooldown|clear-cooldown)" ;;
          esac ;;
        skill)
          AUDIT_CMD="agent skill"; AUDIT_ARGS=("$@")
          cmd_skill "$@" ;;     # add/list/rm operate on the agent type's skills dir
        auth)
          [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive agent auth status|login|set|start|poll|submit|cancel"
          local authcmd="$1"; shift
          case "$authcmd" in
            status) cmd_auth_status "$@" ;;
            poll)   cmd_auth_poll "$@" ;;
            login)
              # exec-handoff — EXIT trap won't fire, so log the intent now.
              audit_log "agent auth login" "started" 0 -- "$@"
              cmd_auth_login "$@" ;;
            set)
              AUDIT_CMD="agent auth set"; AUDIT_ARGS=("$@")
              cmd_auth_set "$@" ;;
            start)
              AUDIT_CMD="agent auth start"; AUDIT_ARGS=("$@")
              cmd_auth_start "$@" ;;
            submit)
              AUDIT_CMD="agent auth submit"; AUDIT_ARGS=("$@")
              cmd_auth_submit "$@" ;;
            cancel)
              AUDIT_CMD="agent auth cancel"; AUDIT_ARGS=("$@")
              cmd_auth_cancel "$@" ;;
            *) fail "$E_USAGE" "unknown auth command: $authcmd" ;;
          esac ;;
        *)
          # `5dive agent <name> tui` — name-first form for terminal attach.
          if [[ "${1:-}" == "tui" ]]; then
            cmd_tui "$sub"
          else
            fail "$E_USAGE" "unknown agent command: $sub"
          fi ;;
      esac ;;
    account)
      [[ $# -gt 0 ]] || fail "$E_USAGE" "usage: 5dive account list|show|usage|add|rename|remove|login|set-active-provider"
      local acctcmd="$1"; shift
      case "$acctcmd" in
        list)   cmd_account_list "$@" ;;
        show)   cmd_account_show "$@" ;;
        usage)  cmd_account_usage "$@" ;;
        add)
          AUDIT_CMD="account add"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_account_add "$@" ;;
        rename)
          AUDIT_CMD="account rename"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_account_rename "$@" ;;
        remove|rm)
          AUDIT_CMD="account remove"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_account_remove "$@" ;;
        login)
          # exec-handoff like `agent auth login` — log intent now, the
          # EXIT trap won't fire after exec.
          audit_log "account login" "started" 0 -- "$@"
          cmd_account_login "$@" ;;
        set-active-provider)
          AUDIT_CMD="account set-active-provider"; AUDIT_ARGS=("$@")
          with_registry_lock cmd_account_set_active_provider "$@" ;;
        *) fail "$E_USAGE" "unknown account command: $acctcmd" ;;
      esac ;;
    doctor)
      # Only audit when --repair is set (read-only runs would spam the log).
      for a in "$@"; do
        if [[ "$a" == "--repair" ]]; then
          AUDIT_CMD="doctor"; AUDIT_ARGS=("$@")
          break
        fi
      done
      cmd_doctor "$@" ;;
    paperclip-seed)
      # Internal: backfill /home/claude/.<type>/ symlinks from registered
      # agents so paperclipai (running as user `claude`) sees the same auth
      # the agents use. Called from update.sh; safe to invoke manually too.
      ensure_state
      paperclip_seed_all_from_registry
      ok "paperclip credentials seeded from registry" '{seeded:true}' ;;
    watch)
      # Live multi-agent dashboard (htop-style). Read-only — no audit, no lock.
      cmd_watch "$@" ;;
    task)
      # Shared task queue (sqlite). Group-writable store, so no root/lock and
      # no audit — these are high-frequency, low-risk ops any agent runs. SQLite
      # serializes its own writes (busy_timeout) so with_registry_lock isn't needed.
      cmd_task "$@" ;;
    org)
      # Agent org chart (sqlite, same store as tasks). Read/write, no audit/lock.
      cmd_org "$@" ;;
    heartbeat)
      # Wake-on-work scheduler. on/off mutate the registry (lock taken inside
      # cmd_heartbeat); tick is the root cron driver; ls is read-only. No audit
      # — tick fires every few minutes and would flood the log; the wakes it
      # triggers are visible via each agent's own transcript.
      cmd_heartbeat "$@" ;;
    init)
      # Interactive first-run wizard: pick a type → install → auth → create
      # → "send hello". Calls back into the same CLI for each step.
      AUDIT_CMD="init"; AUDIT_ARGS=("$@")
      cmd_init "$@" ;;
    up)
      # Compose-style: bring up agents declared in 5dive.yaml. Mutating but
      # the per-agent `agent create` calls take the registry lock + audit
      # themselves, so no need to wrap here.
      AUDIT_CMD="up"; AUDIT_ARGS=("$@")
      cmd_compose_up "$@" ;;
    down)
      AUDIT_CMD="down"; AUDIT_ARGS=("$@")
      cmd_compose_down "$@" ;;
    ps)
      # Read-only — no audit, no lock.
      cmd_compose_ps "$@" ;;
    export)
      # Read-only — dump the live fleet to a v2 5dive.yaml.
      cmd_compose_export "$@" ;;
    team)
      # Provision a whole company-structure template (wraps `up`); the per-agent
      # create calls take the lock + audit themselves.
      AUDIT_CMD="team"; AUDIT_ARGS=("$@")
      cmd_team "$@" ;;
    uninstall)
      # Thin wrapper: fetch install.sh and exec --uninstall. Keeps a single
      # source of truth for what gets removed (install.sh) and dodges the
      # "old bundles ship stale uninstall logic" problem.
      [[ $EUID -eq 0 ]] || fail "$E_PERMISSION" "uninstall must run as root (sudo 5dive uninstall)"
      local installer
      if command -v curl >/dev/null 2>&1; then
        installer=$(mktemp)
        if ! curl -fsSL "https://raw.githubusercontent.com/5dive-com/5dive/main/install.sh" -o "$installer"; then
          # fail exits immediately, so drop the (empty or partial) download
          # first — otherwise every failed fetch leaks a temp file.
          rm -f -- "$installer"
          fail "$E_GENERIC" "failed to fetch installer"
        fi
        chmod +x "$installer"
        # exec replaces this process, so nothing after this line runs here.
        exec bash "$installer" --uninstall "$@"
      else
        fail "$E_NOT_FOUND" "curl is required for 5dive uninstall"
      fi ;;
    self-update|self_update|update)
      # `--check` is a read-only version probe (no root, no mutation): compares
      # the installed CLI to the published release so the dashboard maintenance
      # tile can show a "your CLI is behind — update now" prompt. Everything
      # else in this branch mutates the box, so it stays root-gated.
      if [[ "${1:-}" == "--check" ]]; then
        shift
        cmd_update_check "$@"
      else
        # On-demand "update everything + reload" for OSS self-hosters with no
        # scheduler: runs install.sh --upgrade (CLI + plugins) then restarts
        # running agents so the changes load. Mirrors the managed nightly.
        [[ $EUID -eq 0 ]] || fail "$E_PERMISSION" "self-update must run as root (sudo 5dive self-update)"
        AUDIT_CMD="self-update"; AUDIT_ARGS=("$@")
        cmd_self_update "$@"
      fi ;;
    -h|--help|help) usage ;;
    *) fail "$E_USAGE" "unknown command: $top" ;;
  esac
}

# EXIT trap picks up AUDIT_CMD set by the dispatcher + real exit code and
# appends one NDJSON line to the audit log. Installed once at script load so
# every code path (including fail/exit) passes through it.
# The trap is registered before main runs so that exits inside main are
# covered. Branches that hand off via exec replace this process, which is why
# main logs the login handoffs up front instead of relying on the trap.
trap on_exit_audit EXIT

# Run the dispatcher with the script's original arguments.
main "$@"
