config backend=cuda hidden=512 intermediate=1536 layers=12 vocab=32768 num_heads=8 head_dim=64 num_kv_heads=4 prompt=[1, 3, 8] rollout_len=2 lr=0.001 steps_per_run=10 warmup_runs=1 measured_runs=3
correctness cpu_losses=[0.00031420236, 0.00031922662, 0.0003169428] cuda_losses=[0.00031420228, 0.0003192261, 0.00031694295] max_relative_error=0.000001641
run=1 wall_seconds=0.659248 per_step_seconds=0.065925 steps_per_sec=15.168797 first_loss=0.000314202 last_loss=0.000315440
run=2 wall_seconds=0.660502 per_step_seconds=0.066050 steps_per_sec=15.140001 first_loss=0.000314202 last_loss=0.000315440
run=3 wall_seconds=0.658321 per_step_seconds=0.065832 steps_per_sec=15.190160 first_loss=0.000314202 last_loss=0.000315440
summary mean_steps_per_sec=15.166319 median_steps_per_sec=15.168797 sigma_steps_per_sec=0.020552 sigma_pct=0.136 mean_step_seconds=0.065936 median_step_seconds=0.065925 max_loss_relative_error_vs_cpu=0.000001641
