#!/bin/bash
# knowing-pre-edit: Claude Code PreToolUse hook for automatic context injection.
#
# Injects graph-aware context before file edits. Measures token overhead
# and hit rate for A/B evaluation.
#
# Configuration:
#   KNOWING_HOOKS=off          Disable entirely (default: on)
#   KNOWING_HOOKS_DB=<path>    Database path (default: knowing.db)
#   KNOWING_HOOKS_BUDGET=<n>   Token budget for injected context (default: 400)
#   KNOWING_HOOKS_FORMAT=<fmt> Output format passed to `knowing context -format`: gcf|kwf|xml (default: gcf)
#   KNOWING_HOOKS_LOG=<path>   Metrics log path (default: .knowing-hooks.jsonl)
#   KNOWING_HOOKS_VERBOSE=1    Print injection to stderr for debugging
#
# Install in .claude/settings.local.json:
#   "hooks": {
#     "PreToolUse": [{
#       "matcher": "Edit|Write",
#       "command": "./hooks/knowing-pre-edit"
#     }]
#   }
#
# The hook reads the tool call from stdin as JSON. The tool arguments are taken
# from the "tool_input" field (an "input" field is accepted as a fallback).
# It extracts the file path, queries knowing for context, and prints a
# hookSpecificOutput JSON object whose additionalContext field carries that context.

# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -e -u -o pipefail

# Kill switch: KNOWING_HOOKS=off turns the whole hook into a silent no-op.
case "${KNOWING_HOOKS:-on}" in
  off) exit 0 ;;
esac

# Settings. Each one can be overridden through its KNOWING_HOOKS_* variable;
# the value after ":-" is used when the variable is unset or empty.
VERBOSE=${KNOWING_HOOKS_VERBOSE:-0}
LOG=${KNOWING_HOOKS_LOG:-.knowing-hooks.jsonl}
FORMAT=${KNOWING_HOOKS_FORMAT:-gcf}
BUDGET=${KNOWING_HOOKS_BUDGET:-400}
DB=${KNOWING_HOOKS_DB:-knowing.db}

# Slurp the hook payload from stdin.
INPUT=$(cat)

# Derive the target file path and a symbol query from the payload.
# The tool arguments are read from "tool_input" (falling back to "input") and
# the path from "file_path" (falling back to "path"). Identifiers found in
# old_string become the query, so the lookup is seeded by the code being edited.
# The helper prints "<file_path>|||<query>", or a bare "|||" on any failure.
# The program is single-quoted for the shell, so it must not contain
# apostrophes.
PARSED=$(echo "$INPUT" | python3 -c '
import json
import re
import sys

# Patterns whose first capture group is a declared name.
DECLARATIONS = (
    r"func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(",   # func Name( or func (recv) Name(
    r"type\s+(\w+)\s+(?:struct|interface)",  # type Name struct / interface
    r"(?:var|const)\s+(\w+)",                # var Name / const Name
)
# Very common identifiers that are dropped from the query.
SKIP = {"Context", "Error", "String", "Close", "Start", "Stop", "New", "Get", "Set"}

try:
    payload = json.load(sys.stdin)
    args = payload.get("tool_input", payload.get("input", {}))
    target = args.get("file_path", args.get("path", ""))
    snippet = args.get("old_string", "")

    found = set()
    if snippet:
        for pattern in DECLARATIONS:
            found.update(hit.group(1) for hit in re.finditer(pattern, snippet))
        # Method calls .Name( : only capitalised names longer than two chars.
        found.update(
            called
            for called in re.findall(r"\.(\w+)\s*\(", snippet)
            if len(called) > 2 and called[0].isupper()
        )
        # Qualified calls pkg.Func( : keep the part after the dot.
        found.update(hit.group(2) for hit in re.finditer(r"(\w+)\.(\w+)\s*\(", snippet))
        # Nothing matched at all: fall back to capitalised identifiers (3+ chars).
        if not found:
            found.update(re.findall(r"\b([A-Z]\w{2,})\b", snippet))

    # At most five names, in sorted order, separated by single spaces.
    ranked = sorted(name for name in found if name not in SKIP)
    query = " ".join(ranked[:5])
    print(f"{target}|||{query}")
except BaseException:
    print("|||")
' 2>/dev/null)

# Split on the first "|||": path on the left, query on the right.
FILE_PATH="${PARSED%%|||*}"
EDIT_SYMBOLS="${PARSED#*|||}"

# Without a target file there is nothing to look up.
[[ -n "$FILE_PATH" ]] || exit 0

# No graph database on disk means there is nothing to query: leave quietly.
[[ -f "$DB" ]] || exit 0

# Resolve the knowing executable: prefer one on PATH, then a ./knowing file in
# the current directory. If neither exists, leave quietly.
if command -v knowing >/dev/null 2>&1; then
  KNOWING="knowing"
elif [[ -f "./knowing" ]]; then
  KNOWING="./knowing"
else
  exit 0
fi

#######################################
# Derive a fallback search query from a file path.
# Arguments: $1 - path of the file being edited
# Outputs:   the path's base name with its last extension removed. When
#            stripping the extension would leave nothing (dotfiles such as
#            ".gitignore"), the full base name is printed instead so the
#            query is never empty for a non-empty file name.
#######################################
query_from_path() {
  local name=${1-}
  name=${name%/}      # tolerate a single trailing slash
  name=${name##*/}    # base name
  local stem=${name%.*}
  printf '%s\n' "${stem:-$name}"
}

# Choose the query for knowing.
# Symbols extracted from the edit take priority (edit-aware seeding); with no
# symbols, fall back to a query derived from the file name.
if [[ -n "$EDIT_SYMBOLS" ]]; then
  QUERY="$EDIT_SYMBOLS"
else
  QUERY=$(query_from_path "$FILE_PATH")
fi

#######################################
# Print the current wall-clock time as integer milliseconds since the epoch.
# On bash 5+ this reads the EPOCHREALTIME variable, so no interpreter is
# launched and interpreter start-up time does not leak into the measured
# latency. Older shells fall back to python3.
# Outputs: one line containing the millisecond timestamp
#######################################
now_ms() {
  if (( BASH_VERSINFO[0] >= 5 )) && [[ -n "${EPOCHREALTIME:-}" ]]; then
    # EPOCHREALTIME is "<seconds><sep><microseconds>"; the separator depends
    # on the locale, so drop every non-digit to get plain microseconds.
    local micros=${EPOCHREALTIME//[!0-9]/}
    printf '%s\n' "$(( micros / 1000 ))"
  else
    python3 -c "import time; print(int(time.time()*1000))"
  fi
}

START_MS=$(now_ms)

# A failing or missing knowing must not abort the hook: stderr is discarded and
# a non-zero status is ignored, leaving CONTEXT empty for the miss check.
CONTEXT=$("$KNOWING" context -task "$QUERY" -budget "$BUDGET" -format "$FORMAT" -db "$DB" 2>/dev/null) || true

END_MS=$(now_ms)
LATENCY_MS=$((END_MS - START_MS))

#######################################
# Print $1 as a double-quoted JSON string literal.
# Escapes quotes, backslashes, and control characters so that arbitrary file
# paths cannot corrupt a log line.
# Arguments: $1 - raw string
# Outputs:   the JSON string literal, including the surrounding quotes
#######################################
json_quote() {
  local raw=${1-} escaped='' ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      [[:cntrl:]])
        # Remaining control characters become \uXXXX escapes.
        printf -v ch '\\u%04x' "'$ch"
        escaped+=$ch
        ;;
      *) escaped+=$ch ;;
    esac
  done
  printf '"%s"\n' "$escaped"
}

# Decide whether knowing returned anything usable. An empty reply, or one
# containing "symbols=0" or "error:", counts as a miss.
# grep reads from a here-string rather than `echo | grep -q`: under pipefail,
# grep -q exiting on its first match can kill the writer with SIGPIPE, which
# marks the pipeline as failed and would let a miss be treated as a hit.
if [[ -z "$CONTEXT" ]] || grep -qE "symbols=0|error:" <<< "$CONTEXT"; then
  # Record the miss and exit without injecting anything. Logging is
  # best-effort: an unwritable log must not turn a clean miss into a hook
  # failure under `set -e`.
  printf '{"ts":%s,"file":%s,"event":"miss","latency_ms":%s,"tokens":0}\n' \
    "$(date +%s)" "$(json_quote "$FILE_PATH")" "$LATENCY_MS" >> "$LOG" || true
  exit 0
fi

#######################################
# Trim knowing output for hook injection.
# Reads the full context on stdin and copies it to stdout, stopping at the
# first line that starts with "## edges" and keeping only the first
# max_symbols lines that start with "@". Lines that do not start with "@"
# are always kept (until the edges marker).
# Full context is expensive in model tokens; the hook only needs the top entries.
# Arguments: $1 - maximum number of "@" lines to keep (default: 20)
# Outputs:   the trimmed context
#######################################
trim_context() {
  local max_symbols=${1:-20}
  # The program is single-quoted so the shell never interpolates into it;
  # the limit travels via argv instead.
  python3 -c '
import sys

limit = int(sys.argv[1])
kept = []
seen = 0
for line in sys.stdin.read().splitlines():
    if line.startswith("## edges"):
        break
    if line.startswith("@"):
        seen += 1
        if seen > limit:
            continue
    kept.append(line)
print("\n".join(kept))
' "$max_symbols"
}

CONTEXT=$(trim_context 20 <<< "$CONTEXT")

#######################################
# Print $1 as a double-quoted JSON string literal.
# Escapes quotes, backslashes, and control characters so that arbitrary file
# paths or format names cannot corrupt a log line.
# Arguments: $1 - raw string
# Outputs:   the JSON string literal, including the surrounding quotes
#######################################
json_quote() {
  local raw=${1-} escaped='' ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      [[:cntrl:]])
        # Remaining control characters become \uXXXX escapes.
        printf -v ch '\\u%04x' "'$ch"
        escaped+=$ch
        ;;
      *) escaped+=$ch ;;
    esac
  done
  printf '"%s"\n' "$escaped"
}

# Approximate the token count as the number of whitespace-separated words.
# tr strips the padding some wc implementations put around the number.
TOKEN_COUNT=$(wc -w <<< "$CONTEXT" | tr -d ' ')

# Log the injection event. Logging is best-effort: a failed write must not
# abort the hook under `set -e` before the context has been emitted.
printf '{"ts":%s,"file":%s,"event":"inject","latency_ms":%s,"tokens":%s,"format":%s,"budget":%s}\n' \
  "$(date +%s)" "$(json_quote "$FILE_PATH")" "$LATENCY_MS" "$TOKEN_COUNT" \
  "$(json_quote "$FORMAT")" "$BUDGET" >> "$LOG" || true

# Verbose mode: print a one-line summary to stderr for debugging.
if [[ "$VERBOSE" = "1" ]]; then
  echo "[knowing-hook] Injecting $TOKEN_COUNT tokens for $FILE_PATH (${LATENCY_MS}ms)" >&2
fi

#######################################
# Print the PreToolUse hook response as a single JSON object.
# The file path is handed to Python through argv instead of being spliced
# into the program text, so quotes or backslashes in a path can neither break
# the program nor inject code into it.
# Arguments: $1 - path of the file being edited
# Inputs:    stdin - context text to inject
# Outputs:   JSON with hookSpecificOutput.additionalContext set to a header
#            naming the file followed by the context
#######################################
emit_hook_output() {
  local file_path=${1-}
  # NOTE(review): permissionDecision "allow" is emitted on every injection;
  # per the Claude Code hooks reference this approves the tool call without a
  # permission prompt. Confirm that is intended for a context-only hook.
  python3 -c '
import json
import sys

file_path = sys.argv[1]
context = sys.stdin.read()
print(json.dumps({
    "hookSpecificOutput": {
        "hookEventName": "PreToolUse",
        "permissionDecision": "allow",
        "additionalContext": (
            f"[knowing context for {file_path}] "
            f"Graph-ranked symbols related to this file:\n{context}"
        ),
    }
}))
' "$file_path"
}

# Emit hookSpecificOutput with additionalContext (PreToolUse schema).
emit_hook_output "$FILE_PATH" <<< "$CONTEXT"
