# bench/corpora/conll2003_en — Reuters newswire NER as a precision probe.
#
# CoNLL-2003 is the canonical English NER benchmark (Reuters newswire,
# 1996–1997). Gold annotations are PER / LOC / ORG / MISC, validated by
# the original shared-task organisers. We sample 300 sentences from the
# `test` split (the standard evaluation split, ~3.5k sentences) for a
# bench cell that runs in a few minutes total across patterns + gliner
# + gliner-py.
#
# Why we include it: ai4privacy_en is synthetic, generated by an LLM
# from privacy-policy templates. We want a real-natural-text English
# corpus to test whether anonde over-flags on capitalised common nouns,
# headlines, etc. CoNLL-2003 is the obvious choice — every English NER
# paper benchmarks against it, so the precision number here is directly
# comparable to published literature.
#
# Targets mirror wikiann_de's layout so the top-level bench/Makefile
# can call `data-conll2003_en` and `corpus-conll2003_en` uniformly.

# Absolute directory containing this makefile ($(dir ...) keeps the trailing
# slash), so the paths below do not depend on the caller's working directory.
HERE   := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Directory that holds the generated artefacts, next to this makefile.
DATA   := $(addprefix $(HERE),data)
# The sampled corpus file produced by loader.py.
CORPUS := $(DATA)/corpus.jsonl

# Interpreter used to run loader.py.
PYTHON      ?= python3
# Sample size. The default of 300 keeps bench wall-clock low and matches
# wikiann_de's sample size for cross-corpus comparability.
N_SENTENCES ?= 300
# Fixed sampling seed, so anonde-ner and anonde-patterns are scored on
# the same sampled docs.
SEED        ?= 20260515

# Command-style targets; only $(CORPUS) names a real file.
.PHONY: all data clean

# First rule in this file, so it is the default goal when this makefile is
# run directly.
all: data

# Build the sampled corpus by running loader.py with the configured sample
# size and seed.
#
# loader.py is a prerequisite so that editing the loader regenerates the
# corpus. N_SENTENCES and SEED are not tracked: after changing either, run
# `make clean` first, otherwise an existing corpus is left untouched.
#
# If the loader exits non-zero, whatever it wrote to the target is removed
# and its exit status is propagated, so a failed run cannot leave behind a
# partial corpus that make would later treat as up to date.
$(CORPUS): $(HERE)loader.py
	mkdir -p $(@D)
	$(PYTHON) $< --out $@ --n $(N_SENTENCES) --seed $(SEED) || { rc=$$?; rm -f $@; exit $$rc; }

# Convenience name for the corpus file.
data: $(CORPUS)

# Remove the generated corpus; the data directory itself is left in place.
clean:
	rm -f $(CORPUS)
