#!/usr/bin/env bash
#
# Runs a batch of BuildBuddy CI lanes as concurrent background jobs.
#
# Lanes are taken from the positional arguments or, when none are given, from
# the newline-separated LANES environment variable.
#
# Logs are written under MEERKAT_BUILDBUDDY_LOG_ROOT, which defaults to
# ${RUNNER_TEMP:-/tmp}/buildbuddy-logs and is exported so child processes see
# the same directory.
set -euo pipefail

# Lane reaping relies on `wait -n -p VAR`; the -p option was added in bash 5.1,
# so bash 5.0 must be rejected as well.
if ((BASH_VERSINFO[0] < 5 || (BASH_VERSINFO[0] == 5 && BASH_VERSINFO[1] < 1))); then
  echo "error: scripts/buildbuddy-ci-lane-batch requires bash 5.1+ for wait -n -p diagnostics" >&2
  exit 2
fi

log_root="${MEERKAT_BUILDBUDDY_LOG_ROOT:-${RUNNER_TEMP:-/tmp}/buildbuddy-logs}"
export MEERKAT_BUILDBUDDY_LOG_ROOT="${log_root}"
mkdir -p "${log_root}"

# Strips leading and trailing whitespace from a lane name.
# Arguments: $1 - raw lane text
# Outputs:   the trimmed text on stdout, without a trailing newline
trim_lane() {
  local text="$1"
  # Everything before the first non-space character is leading whitespace.
  local leading_ws="${text%%[![:space:]]*}"
  text="${text#"${leading_ws}"}"
  # Everything after the last non-space character is trailing whitespace.
  local trailing_ws="${text##*[![:space:]]}"
  text="${text%"${trailing_ws}"}"
  printf '%s' "${text}"
}

# Build the list of lanes to run. Candidates come from the positional
# arguments, or from the lines of LANES when no arguments were given. Each
# candidate is trimmed; blank entries and entries starting with '#' are dropped.
lanes=()
lane_candidates=()
if (($# > 0)); then
  lane_candidates=("$@")
else
  mapfile -t lane_candidates <<< "${LANES:-}"
fi

for lane in "${lane_candidates[@]}"; do
  lane="$(trim_lane "${lane}")"
  case "${lane}" in
    '' | '#'*)
      continue
      ;;
  esac
  lanes+=("${lane}")
done
unset lane_candidates

# An empty selection is not an error: report it and finish successfully.
if [[ "${#lanes[@]}" -eq 0 ]]; then
  printf '%s\n' "No BuildBuddy lanes selected."
  exit 0
fi

# Upper bound on concurrently running lanes. Defaults to "all lanes at once";
# anything that is not a positive decimal integer falls back to that default.
max_jobs="${MEERKAT_BUILDBUDDY_BATCH_JOBS:-${#lanes[@]}}"
if [[ "${max_jobs}" =~ ^[0-9]+$ ]] && ((10#${max_jobs} >= 1)); then
  # Normalize to base 10. A zero-padded value such as "08" would otherwise be
  # parsed as an invalid octal literal by every later (( )) comparison, which
  # fails the comparison and prevents the dispatch loop from starting lanes.
  max_jobs="$((10#${max_jobs}))"
else
  echo "Invalid MEERKAT_BUILDBUDDY_BATCH_JOBS='${max_jobs}', using ${#lanes[@]}." >&2
  max_jobs="${#lanes[@]}"
fi

# Shared batch state.
#   active_pids    PIDs of lane wrappers that have been started but not reaped
#   pid_lanes      wrapper PID -> lane name
#   pid_logs       wrapper PID -> wrapper log path
#   next_lane      index into lanes of the next lane to start
#   watchdog_pid   PID of the SLO watchdog subshell ("" when not running)
#   heartbeat_pid  PID of the heartbeat subshell ("" when not running)
declare -a active_pids=()
declare -A pid_lanes=() pid_logs=()
next_lane=0
watchdog_pid=""
heartbeat_pid=""

# Stops the SLO watchdog, if one is running, and forgets its PID.
# Globals: watchdog_pid (read, cleared)
# Failures of kill/wait are ignored because the watchdog may already be gone.
cleanup_watchdog() {
  [[ -n "${watchdog_pid}" ]] || return 0

  kill "${watchdog_pid}" 2>/dev/null || true
  wait "${watchdog_pid}" 2>/dev/null || true
  watchdog_pid=""
}

# Stops the heartbeat subshell, if one is running, and forgets its PID.
# Globals: heartbeat_pid (read, cleared)
# Failures of kill/wait are ignored because the heartbeat may already be gone.
cleanup_heartbeat() {
  [[ -n "${heartbeat_pid}" ]] || return 0

  kill "${heartbeat_pid}" 2>/dev/null || true
  wait "${heartbeat_pid}" 2>/dev/null || true
  heartbeat_pid=""
}

# Drops a finished lane from the active set.
# Globals:   active_pids (rewritten without the PID), log_root (read)
# Arguments: $1 - PID of the lane wrapper that is no longer active
# Also deletes the lane's active-<pid> marker file under log_root.
remove_active_pid() {
  local done_pid="$1"
  local idx

  rm -f "${log_root}/active-${done_pid}"

  for idx in "${!active_pids[@]}"; do
    if [[ "${active_pids[idx]}" == "${done_pid}" ]]; then
      unset 'active_pids[idx]'
    fi
  done
  # Re-pack so the array has no holes left behind by unset.
  active_pids=("${active_pids[@]}")
}

# Prints how many trailing log lines diagnostics should show.
# Globals: MEERKAT_BUILDBUDDY_BATCH_TAIL_LINES (read; optional)
# Outputs: a positive decimal integer on stdout, without a trailing newline.
#          Unset, non-numeric or zero values fall back to 120.
tail_line_count() {
  local lines="${MEERKAT_BUILDBUDDY_BATCH_TAIL_LINES:-120}"
  if [[ "${lines}" =~ ^[0-9]+$ ]] && ((10#${lines} >= 1)); then
    # Force base 10 so zero-padded input such as "08" is not rejected by the
    # arithmetic evaluator as an invalid octal literal.
    lines="$((10#${lines}))"
  else
    lines=120
  fi
  printf '%s' "${lines}"
}

# Prints the tail of a log file to stderr inside a ::group:: / ::endgroup:: pair.
# Arguments: $1 - group title
#            $2 - path of the log file
#            $3 - number of lines to show (optional; defaults to tail_line_count)
# Outputs:   group markers and log lines on stderr; nothing when $2 is not a
#            regular file.
print_log_tail() {
  local title="$1"
  local path="$2"
  local lines="${3:-$(tail_line_count)}"
  if [[ ! -f "${path}" ]]; then
    return 0
  fi

  echo "::group::${title}" >&2
  # Use the standard `-n COUNT` form rather than the obsolescent `-COUNT`, and
  # end option parsing so a path starting with '-' is not read as an option.
  # A failing tail must not abort the dump, hence the `|| true`.
  tail -n "${lines}" -- "${path}" >&2 || true
  echo "::endgroup::" >&2
}

# Dumps a lane's summary.tsv to stderr inside a ::group:: / ::endgroup:: pair.
# Globals:   log_root (read)
# Arguments: $1 - lane name
# Outputs:   nothing when <log_root>/<lane>/summary.tsv is not a regular file.
print_lane_summary() {
  local summary_path="${log_root}/$1/summary.tsv"
  [[ -f "${summary_path}" ]] || return 0

  {
    echo "::group::BuildBuddy lane $1 summary"
    cat "${summary_path}" || true
    echo "::endgroup::"
  } >&2
}

# Prints everything known about one lane: the batch wrapper log tail, the
# lane summary, and the tail of every *.log file in the lane's log directory.
# Globals:   log_root (read)
# Arguments: $1 - lane name
#            $2 - wrapper log path (optional; empty means "no wrapper log")
#            $3 - lines per log tail (optional; defaults to tail_line_count)
# Outputs:   diagnostics on stderr; a ::warning:: when there is neither a
#            wrapper log argument nor any detailed log.
print_lane_diagnostics() {
  local lane="$1"
  local wrapper_log="${2:-}"
  local lines="${3:-$(tail_line_count)}"
  local lane_dir="${log_root}/${lane}"
  local -a detail_logs=()
  local detail_log

  if [[ -n "${wrapper_log}" ]]; then
    print_log_tail "Batch wrapper for BuildBuddy lane ${lane}" "${wrapper_log}" "${lines}"
  fi

  print_lane_summary "${lane}"

  # nullglob makes an unmatched pattern expand to nothing instead of itself.
  shopt -s nullglob
  detail_logs=("${lane_dir}"/*.log)
  shopt -u nullglob

  for detail_log in "${detail_logs[@]}"; do
    print_log_tail "Detailed BuildBuddy log ${lane}/${detail_log##*/}" "${detail_log}" "${lines}"
  done

  if ((${#detail_logs[@]} == 0)) && [[ -z "${wrapper_log}" ]]; then
    echo "::warning::No BuildBuddy logs found for active lane ${lane} under ${log_root}." >&2
  fi
}

# Tells whether a lane's summary.tsv records only successful steps.
# Globals:   log_root (read)
# Arguments: $1 - lane name
# Returns:   0 when the summary has at least one data row (line 2 onwards,
#            two or more tab-separated fields) and the second field of every
#            such row is exactly "0"; 1 otherwise, including a missing file.
lane_summary_succeeded() {
  local summary_path="${log_root}/$1/summary.tsv"
  if [[ ! -f "${summary_path}" ]]; then
    return 1
  fi

  awk -F '\t' '
    NR > 1 && NF >= 2 {
      seen++
      if ($2 != "0") {
        bad = 1
      }
    }
    END {
      if (seen > 0 && !bad) {
        exit 0
      }
      exit 1
    }
  ' "${summary_path}"
}

# Succeeds only when the lane wrapper has finished cleanly, which is signalled
# by the <log_root>/<lane>/lane_done sentinel file. While the sentinel is
# absent, at least one substep of a multi-step lane (e.g. fmt-lint) is still
# running, even if summary.tsv already carries a 0 row from an earlier
# substep. This guard keeps the SLO watchdog from issuing a blocking `wait`
# on a lane that is still running and thereby missing the CI deadline.
# Globals:   log_root (read)
# Arguments: $1 - lane name
lane_wrapper_done() {
  local sentinel="${log_root}/$1/lane_done"
  if [[ -f "${sentinel}" ]]; then
    return 0
  fi
  return 1
}

# Reaps active lanes that have already finished successfully so they are not
# reported as timed out.
# Globals:   active_pids, pid_lanes, pid_logs (read); active_pids is shrunk
#            through remove_active_pid for every lane that is reaped
# Arguments: $1 - reason inserted into the log messages (e.g. "the SLO deadline")
# Outputs:   notices and warnings on stderr; the pass message and the lane's
#            PASS/SKIP lines (or its last 40 log lines) on stdout
# Note:      errexit is switched off around `wait` and switched back on
#            unconditionally afterwards.
reconcile_successful_active_lanes() {
  local reason="$1"
  # Iterate over a snapshot: remove_active_pid rewrites active_pids in the loop.
  local pids=("${active_pids[@]}")
  local pid lane lane_log lane_status

  for pid in "${pids[@]}"; do
    lane="${pid_lanes[${pid}]:-unknown}"
    lane_log="${pid_logs[${pid}]:-}"
    # Lanes without an all-green summary stay active for the caller to handle.
    if ! lane_summary_succeeded "${lane}"; then
      continue
    fi
    # A green summary is not enough: without the sentinel the wrapper may
    # still be running, and the `wait` below would block on it.
    if ! lane_wrapper_done "${lane}"; then
      echo "::warning::BuildBuddy lane ${lane} has a successful summary row but no lane_done sentinel before ${reason}; substeps may still be running, leaving it active for dump/kill." >&2
      continue
    fi

    echo "::notice::BuildBuddy lane ${lane} recorded a successful summary before ${reason}; reaping it instead of treating it as timed out." >&2
    # Capture the wrapper's exit status instead of letting errexit abort.
    set +e
    wait "${pid}"
    lane_status="$?"
    set -e

    if [[ "${lane_status}" == "0" ]]; then
      remove_active_pid "${pid}"
      echo "BuildBuddy lane ${lane} passed."
      if [[ -n "${lane_log}" && -f "${lane_log}" ]]; then
        # Prefer the PASS/SKIP lines; fall back to the end of the log.
        grep -E '^(PASS|SKIP) ' "${lane_log}" || tail -40 "${lane_log}" || true
      fi
    else
      # The lane stays in active_pids so the caller still dumps and kills it.
      echo "::warning::BuildBuddy lane ${lane} had a successful summary but wrapper exited with ${lane_status}; keeping it active for diagnostics." >&2
    fi
  done
}

# Succeeds once every selected lane has been started.
# Globals: next_lane, lanes (read)
all_lanes_dispatched() {
  local total="${#lanes[@]}"
  if ((next_lane < total)); then
    return 1
  fi
  return 0
}

# Prints diagnostics for every lane that is still active.
# Globals: active_pids, pid_lanes, pid_logs (read)
# A PID without a recorded lane is reported as "unknown"; a PID without a
# recorded log is passed on with an empty log path.
print_active_tails() {
  local idx active_pid
  for idx in "${!active_pids[@]}"; do
    active_pid="${active_pids[idx]}"
    print_lane_diagnostics \
      "${pid_lanes[${active_pid}]:-unknown}" \
      "${pid_logs[${active_pid}]:-}"
  done
}

# Prints diagnostics for every lane that has an active-<pid> marker file under
# log_root. Each marker holds one "<lane>\t<wrapper log>" line.
# Globals: log_root (read)
# Marker files can disappear between the glob and the read because another
# process removes them when a lane finishes, so every iteration starts from
# empty values and markers that yield no lane name are skipped. Without the
# reset, a vanished marker would repeat the previous lane's diagnostics.
print_active_tails_from_files() {
  local active_file lane lane_log
  local -a active_files=()

  shopt -s nullglob
  active_files=("${log_root}"/active-*)
  shopt -u nullglob

  for active_file in "${active_files[@]}"; do
    lane=""
    lane_log=""
    # The group keeps a failed redirection (missing file) quiet and non-fatal.
    { IFS=$'\t' read -r lane lane_log <"${active_file}"; } 2>/dev/null || true
    if [[ -z "${lane}" ]]; then
      continue
    fi
    print_lane_diagnostics "${lane}" "${lane_log}"
  done
}

# Stops every active lane wrapper: SIGTERM first, SIGKILL two seconds later,
# then reaps the wrappers and removes all active-* marker files.
# Globals: active_pids, log_root (read)
# Does nothing when no lane is active. Signal and wait failures are ignored
# because a wrapper may already have exited.
kill_active_lanes() {
  local lane_pid
  ((${#active_pids[@]} > 0)) || return 0

  kill -TERM "${active_pids[@]}" 2>/dev/null || true
  sleep 2
  kill -KILL "${active_pids[@]}" 2>/dev/null || true

  for lane_pid in "${active_pids[@]}"; do
    wait "${lane_pid}" 2>/dev/null || true
  done
  rm -f "${log_root}"/active-* 2>/dev/null || true
}

# Signal handler for TERM and INT: dumps the logs of the active lanes, stops
# them and the helper subshells, then exits with 143.
terminate_batch() {
  # Back to default handling so a second signal cannot re-enter this handler.
  trap - TERM INT
  printf '%s\n' "::warning::BuildBuddy lane batch received a termination signal; dumping active lane logs." >&2

  print_active_tails
  kill_active_lanes
  cleanup_heartbeat
  cleanup_watchdog

  exit 143
}

# Signal handler for USR1, which the SLO watchdog sends at the deadline.
# Lanes that already finished successfully are reaped first. If that leaves
# nothing active and every lane was dispatched, the batch exits 0; otherwise
# the remaining lanes are dumped and killed and the batch exits 124.
# Globals: active_pids (read)
deadline_batch() {
  local exit_code=124

  trap - USR1
  printf '%s\n' "::warning::BuildBuddy lane batch reached the GCP CI SLO deadline; reconciling active lane summaries before failing." >&2
  reconcile_successful_active_lanes "the SLO deadline"

  if all_lanes_dispatched && ((${#active_pids[@]} == 0)); then
    exit_code=0
    printf '%s\n' "::notice::All BuildBuddy lanes completed successfully before the SLO watchdog was reaped; treating the batch as successful." >&2
  else
    printf '%s\n' "::error::BuildBuddy lane batch reached the GCP CI SLO deadline; dumping active lane logs." >&2
    print_active_tails
    kill_active_lanes
  fi

  cleanup_heartbeat
  cleanup_watchdog
  exit "${exit_code}"
}

# Install the handlers before the watchdog and any lane are started:
# TERM/INT dump and kill the active lanes, USR1 is the watchdog's deadline
# notification.
trap terminate_batch TERM INT
trap deadline_batch USR1

# Starts a background watchdog that sends USR1 to this script shortly before
# the CI SLO deadline.
# Globals:   CI_STARTED_AT_EPOCH (read) - epoch seconds at which CI started
#            MEERKAT_GCP_BUILDBUDDY_CI_MAX_SECONDS (read) - SLO budget in seconds
#            MEERKAT_BUILDBUDDY_BATCH_SLO_GRACE_SECONDS (read; default 45) -
#              how long before the deadline the signal is sent
#            watchdog_pid (written when a watchdog is started)
# Outputs:   the remaining time on stdout, or a ::error:: on stderr
# Returns:   0 without starting anything when either required variable is not
#            a decimal integer or the budget is zero. Exits the script with
#            124 when the deadline has already passed.
start_slo_watchdog() {
  local started="${CI_STARTED_AT_EPOCH:-}"
  local budget="${MEERKAT_GCP_BUILDBUDDY_CI_MAX_SECONDS:-}"
  # The arithmetic test only runs once both values are known to be digits.
  if ! [[ "${started}" =~ ^[0-9]+$ && "${budget}" =~ ^[0-9]+$ ]] || ((10#${budget} == 0)); then
    return 0
  fi

  local grace="${MEERKAT_BUILDBUDDY_BATCH_SLO_GRACE_SECONDS:-45}"
  if ! [[ "${grace}" =~ ^[0-9]+$ ]]; then
    grace=45
  fi

  # 10# forces base 10 so zero-padded values are not parsed as octal.
  local deadline_epoch="$((10#${started} + 10#${budget} - 10#${grace}))"
  local now
  now="$(date +%s)"
  local seconds_until_deadline="$((deadline_epoch - now))"
  if ((seconds_until_deadline <= 0)); then
    echo "::error::BuildBuddy lane batch started after the GCP CI SLO deadline." >&2
    exit 124
  fi

  echo "SLO watchdog will stop this lane batch in ${seconds_until_deadline}s."
  (
    # Run the timer as a tracked child and take it down when the watchdog is
    # told to stop. If the watchdog simply died on TERM, the sleep would be
    # orphaned and keep the inherited stdout/stderr open until it expired.
    # The handler only sets a flag, so a TERM that arrives before sleep_pid is
    # assigned is still honoured by the checks below.
    stop_requested=0
    trap 'stop_requested=1' TERM
    sleep "${seconds_until_deadline}" &
    sleep_pid="$!"

    sleep_status=0
    if ((stop_requested == 0)); then
      # A trapped TERM interrupts this wait.
      wait "${sleep_pid}" || sleep_status="$?"
    fi

    if ((stop_requested != 0)); then
      kill "${sleep_pid}" 2>/dev/null || true
      exit 0
    fi
    # The timer ended abnormally: do not raise a premature deadline.
    if ((sleep_status != 0)); then
      exit "${sleep_status}"
    fi

    # In a subshell $$ is still the PID of the main script.
    kill -USR1 "$$" 2>/dev/null || true
  ) &
  watchdog_pid="$!"
}

# Arm the SLO watchdog before any lane is started. This call can exit the
# script with 124 when the deadline has already passed.
start_slo_watchdog

# Starts a background subshell that periodically dumps diagnostics for every
# lane that still has an active-<pid> marker file under log_root.
# Globals: MEERKAT_BUILDBUDDY_BATCH_HEARTBEAT_SECONDS (read; default 60;
#            values that are not a positive integer fall back to 60)
#          log_root (read), heartbeat_pid (written)
# Outputs: heartbeat groups on stderr, written by the subshell
start_heartbeat() {
  local interval="${MEERKAT_BUILDBUDDY_BATCH_HEARTBEAT_SECONDS:-60}"
  if ! [[ "${interval}" =~ ^[0-9]+$ ]] || ((interval < 1)); then
    interval=60
  fi

  (
    # The loop has no exit condition of its own; it runs until the subshell
    # is killed.
    # NOTE(review): the kill targets only this subshell, so an in-flight
    # `sleep` child presumably lingers for up to one interval - confirm
    # whether that matters for CI teardown.
    while true; do
      sleep "${interval}"
      # The subshell works from the marker files rather than active_pids,
      # because it only holds a copy of the parent's variables from the time
      # it was forked. active_files lives only in this subshell.
      shopt -s nullglob
      active_files=("${log_root}"/active-*)
      shopt -u nullglob
      # Stay quiet while no lane is active.
      if ((${#active_files[@]} == 0)); then
        continue
      fi
      echo "::group::Active BuildBuddy lane heartbeat" >&2
      print_active_tails_from_files
      echo "::endgroup::" >&2
    done
  ) &
  heartbeat_pid="$!"
}

# Start the periodic heartbeat before the first lane is dispatched.
start_heartbeat

# Launches one lane in a background subshell and records it as active.
# Globals:   log_root (read); active_pids, pid_lanes, pid_logs (updated)
# Arguments: $1 - lane name, passed on to scripts/buildbuddy-ci-lane
# Outputs:   a "Started ..." line on stdout. All lane output goes to
#            <log_root>/batch-<lane>.log, and an active-<pid> marker holding
#            "<lane>\t<log path>" is written under log_root.
# The subshell exits with the lane script's exit status.
start_lane() {
  local lane="$1"
  local lane_log="${log_root}/batch-${lane}.log"
  local lane_pid

  (
    printf '::group::BuildBuddy lane %s\n' "${lane}"
    printf 'BuildBuddy lane %s started at %s.\n' "${lane}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    # Record a failing lane's status instead of letting errexit end the
    # subshell before the trailer lines are written.
    lane_status=0
    scripts/buildbuddy-ci-lane "${lane}" || lane_status="$?"
    printf 'BuildBuddy lane %s exited with %s at %s.\n' "${lane}" "${lane_status}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    printf '::endgroup::\n'
    exit "${lane_status}"
  ) >"${lane_log}" 2>&1 &

  lane_pid="$!"
  active_pids+=("${lane_pid}")
  pid_lanes["${lane_pid}"]="${lane}"
  pid_logs["${lane_pid}"]="${lane_log}"
  printf '%s\t%s\n' "${lane}" "${lane_log}" >"${log_root}/active-${lane_pid}"
  echo "Started BuildBuddy lane ${lane} (pid ${lane_pid})."
}

# Waits for one active lane to finish and handles its result.
# Globals: active_pids, pid_lanes, pid_logs (read); active_pids is shrunk
#          through remove_active_pid
# Outputs: the pass message and the lane's PASS/SKIP lines (or its last 40 log
#          lines) on stdout; failure diagnostics on stderr
# Returns: 0 when a lane passed, or when the wait was interrupted by a signal
#          and should simply be retried by the caller.
# Exits:   with the lane's status when a lane failed, after dumping and
#          killing the remaining lanes; with the status of `wait` (or 1) when
#          no lane could be reaped at all.
# Note:    errexit is switched off around `wait` and switched back on
#          unconditionally afterwards.
reap_one_lane() {
  local done_pid=""
  local lane_status
  # Capture the lane's exit status instead of letting errexit abort.
  set +e
  wait -n -p done_pid "${active_pids[@]}"
  lane_status="$?"
  set -e

  # `wait -p` unsets the variable before assigning it, so after a failed wait
  # done_pid may be unset; expand it with a default to stay safe under nounset.
  if [[ -z "${done_pid:-}" ]]; then
    if ((lane_status > 128)); then
      # Interrupted by a signal: nothing was reaped, let the caller retry.
      return 0
    fi
    # Nothing can be reaped (unsupported option, or none of the PIDs is a
    # child). Returning would make the caller spin forever on active_pids.
    echo "::error::BuildBuddy lane batch could not reap any active lane (wait exited with ${lane_status})." >&2
    print_active_tails
    kill_active_lanes
    cleanup_heartbeat
    cleanup_watchdog
    if ((lane_status == 0)); then
      lane_status=1
    fi
    exit "${lane_status}"
  fi

  local lane="${pid_lanes[${done_pid}]:-unknown}"
  local lane_log="${pid_logs[${done_pid}]:-}"
  remove_active_pid "${done_pid}"

  if [[ "${lane_status}" == "0" ]]; then
    echo "BuildBuddy lane ${lane} passed."
    if [[ -n "${lane_log}" && -f "${lane_log}" ]]; then
      # Prefer the PASS/SKIP lines; fall back to the end of the log.
      grep -E '^(PASS|SKIP) ' "${lane_log}" || tail -n 40 "${lane_log}" || true
    fi
    return 0
  fi

  # First failure aborts the batch: show the failed lane, then whatever the
  # other lanes were doing, and stop them.
  echo "::error::BuildBuddy lane ${lane} failed with ${lane_status}." >&2
  print_lane_diagnostics "${lane}" "${lane_log}" 180
  print_active_tails
  kill_active_lanes
  cleanup_heartbeat
  cleanup_watchdog
  exit "${lane_status}"
}

printf 'Running %s BuildBuddy lanes with up to %s local submitters.\n' "${#lanes[@]}" "${max_jobs}"

# Scheduler: keep up to max_jobs lanes running, reaping one finished lane at a
# time, until every lane has been started and none is left active.
until ((next_lane >= ${#lanes[@]} && ${#active_pids[@]} == 0)); do
  # Top up the pool while there are lanes left and free slots.
  while ((next_lane < ${#lanes[@]} && ${#active_pids[@]} < max_jobs)); do
    start_lane "${lanes[next_lane]}"
    next_lane=$((next_lane + 1))
  done

  # Block until one of the running lanes finishes.
  if ((${#active_pids[@]} != 0)); then
    reap_one_lane
  fi
done

# Every lane passed: stop the helper subshells so they do not outlive the
# batch.
cleanup_heartbeat
cleanup_watchdog
