   Compiling train v0.1.5 (/home/ckl/projects/arle/crates/train)
    Finished `release` profile [optimized] target(s) in 4.56s
     Running `target/release/examples/opd_step_cuda_moderate_bench`
config backend=cuda hidden=512 intermediate=1536 layers=12 vocab=32768 num_heads=8 head_dim=64 num_kv_heads=4 prompt=[1, 3, 8] rollout_len=2 lr=0.001 steps_per_run=10 warmup_runs=1 measured_runs=3
correctness cpu_losses=[0.00031420233, 0.0003192265, 0.00031694287] cuda_losses=[0.00031420228, 0.0003192261, 0.00031694295] max_relative_error=0.000001276
run=1 wall_seconds=0.504184 per_step_seconds=0.050418 steps_per_sec=19.834045 first_loss=0.000314202 last_loss=0.000315440
run=2 wall_seconds=0.504611 per_step_seconds=0.050461 steps_per_sec=19.817250 first_loss=0.000314202 last_loss=0.000315440
run=3 wall_seconds=0.505571 per_step_seconds=0.050557 steps_per_sec=19.779611 first_loss=0.000314202 last_loss=0.000315440
summary mean_steps_per_sec=19.810302 median_steps_per_sec=19.817250 sigma_steps_per_sec=0.022759 sigma_pct=0.115 mean_step_seconds=0.050479 median_step_seconds=0.050461 max_loss_relative_error_vs_cpu=0.000001276
