   Compiling autograd v0.1.5 (/home/ckl/projects/arle/crates/autograd)
   Compiling train v0.1.5 (/home/ckl/projects/arle/crates/train)
    Finished `release` profile [optimized] target(s) in 18.90s
     Running `target/release/examples/opd_step_cuda_realckpt_train --lr 1e-7 --steps 50`
config backend=cuda model_dir=/home/ckl/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B train_steps=50 rollout_len=8 decode_len=16 lr=1.000000012e-7 default_lr=4.999999874e-5 grad_clip=1 perturb_scale=0.001 perturb_seed=0x0f0dcafe20260521 safety_first_step_max_seconds=0.5
prompt split=train index=0 ids=[1, 872, 198, 3456]
prompt split=train index=1 ids=[1, 198, 1512, 429]
prompt split=train index=2 ids=[1, 770, 3186, 25, 220]
prompt split=train index=3 ids=[1, 644, 374, 279, 1887]
prompt split=train index=4 ids=[1, 3838, 374, 264, 2077, 13]
prompt split=train index=5 ids=[1, 785, 594, 287, 374, 1690]
prompt split=train index=6 ids=[1, 3347, 11, 358, 1052, 429]
prompt split=train index=7 ids=[1, 2610, 527, 1139, 304, 279, 1670]
prompt split=heldout index=0 ids=[1, 4438, 374, 279, 2768]
prompt split=heldout index=1 ids=[1, 1516, 374, 264, 1296, 4339]
prompt split=heldout index=2 ids=[1, 785, 1401, 315, 279, 1967]
prompt split=heldout index=3 ids=[1, 3198, 279, 1296, 25, 220]
model_summary hidden=1024 intermediate=3072 layers=28 vocab=151936 num_heads=16 num_kv_heads=8 head_dim=128 tie_word_embeddings=true rope_theta=1000000 teacher_param_elements=601292800 student_model_elements=601292800 student_trainable_elements=596049920 teacher_load_seconds=7.078423 student_load_seconds=6.610343
eval_detail step=0 split=train prompt_index=0 prompt=[1, 872, 198, 3456] teacher_suffix=[888, 536, 4697, 972, 262, 584, 1099, 737, 1887, 2242, 1294, 2827, 8, 972, 286, 442] student_suffix=[888, 536, 4697, 972, 262, 584, 1099, 737, 1887, 2242, 1294, 2827, 8, 972, 286, 442] overlap_pct=100.000000 kl=5.621485744854e-3
eval_detail step=0 split=train prompt_index=1 prompt=[1, 198, 1512, 429] teacher_suffix=[374, 11, 279, 1372, 315, 3039, 279, 1372, 7952, 304, 279, 1140, 13, 2055, 11, 369] student_suffix=[374, 11, 279, 1372, 315, 3039, 279, 1372, 7952, 304, 279, 1140, 13, 2055, 11, 369] overlap_pct=100.000000 kl=1.115262489178e-2
eval_detail step=0 split=train prompt_index=2 prompt=[1, 770, 3186, 25, 220] teacher_suffix=[16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15] student_suffix=[16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15] overlap_pct=100.000000 kl=1.497549897173e-2
eval_detail step=0 split=train prompt_index=3 prompt=[1, 644, 374, 279, 1887] teacher_suffix=[2874, 369, 279, 31559, 315, 279, 8457, 11, 323, 279, 1887, 1815, 28469, 9363, 525, 279] student_suffix=[2874, 369, 279, 18003, 315, 279, 1156, 323, 2086, 2188, 315, 279, 1849, 315, 279, 8453] overlap_pct=31.250000 kl=1.330972498486e-2
eval_detail step=0 split=train prompt_index=4 prompt=[1, 3838, 374, 264, 2077, 13] teacher_suffix=[7521, 481, 362, 5714, 374, 264, 897, 429, 374, 5927, 553, 264, 729, 13, 1752, 3110] student_suffix=[3555, 374, 264, 5714, 30, 3555, 374, 264, 5714, 30, 3555, 374, 264, 5714, 30, 3555] overlap_pct=6.250000 kl=1.736336546338e-2
eval_detail step=0 split=train prompt_index=5 prompt=[1, 785, 594, 287, 374, 1690] teacher_suffix=[1, 374, 264, 4647, 429, 702, 1012, 1483, 311, 7512, 279, 1251, 315, 279, 7748, 1186] student_suffix=[1, 374, 264, 4647, 429, 19257, 311, 279, 1251, 315, 279, 5537, 315, 279, 7748, 21575] overlap_pct=56.250000 kl=2.734693989953e-2
eval_detail step=0 split=train prompt_index=6 prompt=[1, 3347, 11, 358, 1052, 429] teacher_suffix=[358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490] student_suffix=[358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490] overlap_pct=100.000000 kl=1.167650551419e-2
eval_detail step=0 split=train prompt_index=7 prompt=[1, 2610, 527, 1139, 304, 279, 1670] teacher_suffix=[477, 11, 323, 279, 62, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716] student_suffix=[477, 11, 323, 279, 62, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716] overlap_pct=100.000000 kl=1.322513795018e-2
eval_detail step=0 split=heldout prompt_index=0 prompt=[1, 4438, 374, 279, 2768] teacher_suffix=[315, 279, 3364, 13, 576, 3364, 374, 911, 279, 11618, 315, 264, 3908, 3743, 6941, 444] student_suffix=[315, 279, 3491, 13, 576, 3491, 374, 311, 1477, 279, 897, 315, 279, 7493, 13, 576] overlap_pct=31.250000 kl=1.923349009451e-2
eval_detail step=0 split=heldout prompt_index=1 prompt=[1, 1516, 374, 264, 1296, 4339] teacher_suffix=[291, 504, 279, 1467, 11, 714, 279, 1467, 374, 537, 304, 6364, 13, 5209, 14683, 279] student_suffix=[291, 504, 279, 2701, 1467, 25, 4710, 785, 2701, 374, 264, 1140, 315, 279, 1429, 3213] overlap_pct=18.750000 kl=1.733289758898e-2
eval_detail step=0 split=heldout prompt_index=2 prompt=[1, 785, 1401, 315, 279, 1967] teacher_suffix=[43059, 21938, 315, 7148, 1, 374, 264, 11245, 2168, 429, 702, 1012, 13570, 14975, 304, 279] student_suffix=[43059, 21938, 315, 7148, 1, 374, 264, 11245, 37250, 304, 279, 6424, 13, 1084, 374, 264] overlap_pct=50.000000 kl=4.475668043414e-2
eval_detail step=0 split=heldout prompt_index=3 prompt=[1, 3198, 279, 1296, 25, 220] teacher_suffix=[16, 24, 24, 15, 82, 11, 220, 17, 15, 15, 15, 82, 11, 220, 17, 15] student_suffix=[16, 24, 24, 15, 82, 11, 220, 17, 15, 15, 15, 82, 11, 220, 17, 15] overlap_pct=100.000000 kl=5.589429209807e-3
eval_summary step=0 train_overlap_pct=74.218750 heldout_overlap_pct=50.000000 train_kl=1.433391042756e-2 heldout_kl=2.172812433186e-2 eval_seconds=7.823648
train_step step=1 prompt_index=0 prompt=[1, 872, 198, 3456] loss=1.788745430531e-5 rollout_len=12 step_seconds=0.212615
train_step step=10 prompt_index=1 prompt=[1, 198, 1512, 429] loss=3.805938104051e-5 rollout_len=12 step_seconds=0.246916
train_step step=20 prompt_index=3 prompt=[1, 644, 374, 279, 1887] loss=3.934411506634e-5 rollout_len=13 step_seconds=0.249278
train_step step=30 prompt_index=5 prompt=[1, 785, 594, 287, 374, 1690] loss=3.520914106048e-5 rollout_len=14 step_seconds=0.248342
train_step step=40 prompt_index=7 prompt=[1, 2610, 527, 1139, 304, 279, 1670] loss=3.992492929683e-5 rollout_len=15 step_seconds=0.248984
train_step step=50 prompt_index=1 prompt=[1, 198, 1512, 429] loss=3.804211155511e-5 rollout_len=12 step_seconds=0.252589
eval_detail step=50 split=train prompt_index=0 prompt=[1, 872, 198, 3456] teacher_suffix=[888, 536, 4697, 972, 262, 584, 1099, 737, 1887, 2242, 1294, 2827, 8, 972, 286, 442] student_suffix=[888, 536, 4697, 972, 262, 584, 1099, 737, 1887, 2242, 1294, 2827, 8, 972, 286, 442] overlap_pct=100.000000 kl=4.205619347186e-3
eval_detail step=50 split=train prompt_index=1 prompt=[1, 198, 1512, 429] teacher_suffix=[374, 11, 279, 1372, 315, 3039, 279, 1372, 7952, 304, 279, 1140, 13, 2055, 11, 369] student_suffix=[374, 11, 279, 1372, 315, 3039, 279, 1372, 7952, 304, 279, 1140, 13, 2055, 11, 369] overlap_pct=100.000000 kl=9.303841193494e-3
eval_detail step=50 split=train prompt_index=2 prompt=[1, 770, 3186, 25, 220] teacher_suffix=[16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15] student_suffix=[16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15] overlap_pct=100.000000 kl=7.015143094485e-3
eval_detail step=50 split=train prompt_index=3 prompt=[1, 644, 374, 279, 1887] teacher_suffix=[2874, 369, 279, 31559, 315, 279, 8457, 11, 323, 279, 1887, 1815, 28469, 9363, 525, 279] student_suffix=[2874, 369, 279, 18003, 315, 279, 1156, 323, 2086, 2188, 315, 279, 1849, 315, 279, 8453] overlap_pct=31.250000 kl=1.266151387503e-2
eval_detail step=50 split=train prompt_index=4 prompt=[1, 3838, 374, 264, 2077, 13] teacher_suffix=[7521, 481, 362, 5714, 374, 264, 897, 429, 374, 5927, 553, 264, 729, 13, 1752, 3110] student_suffix=[576, 1102, 315, 264, 729, 374, 279, 897, 5927, 553, 279, 729, 1618, 13, 1752, 3110] overlap_pct=18.750000 kl=1.540783205582e-2
eval_detail step=50 split=train prompt_index=5 prompt=[1, 785, 594, 287, 374, 1690] teacher_suffix=[1, 374, 264, 4647, 429, 702, 1012, 1483, 311, 7512, 279, 1251, 315, 279, 7748, 1186] student_suffix=[1, 374, 264, 4647, 429, 19257, 311, 279, 1251, 315, 279, 5537, 315, 279, 3082, 315] overlap_pct=50.000000 kl=2.405758219685e-2
eval_detail step=50 split=train prompt_index=6 prompt=[1, 3347, 11, 358, 1052, 429] teacher_suffix=[358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490] student_suffix=[358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490, 432, 13, 358, 646, 944, 1490] overlap_pct=100.000000 kl=9.536280958109e-3
eval_detail step=50 split=train prompt_index=7 prompt=[1, 2610, 527, 1139, 304, 279, 1670] teacher_suffix=[477, 11, 323, 279, 62, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716] student_suffix=[477, 11, 323, 279, 62, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716] overlap_pct=100.000000 kl=9.135539867783e-3
eval_detail step=50 split=heldout prompt_index=0 prompt=[1, 4438, 374, 279, 2768] teacher_suffix=[315, 279, 3364, 13, 576, 3364, 374, 911, 279, 11618, 315, 264, 3908, 3743, 6941, 444] student_suffix=[315, 279, 3491, 13, 576, 3491, 374, 311, 1477, 279, 897, 315, 279, 7493, 13, 576] overlap_pct=31.250000 kl=1.847108675102e-2
eval_detail step=50 split=heldout prompt_index=1 prompt=[1, 1516, 374, 264, 1296, 4339] teacher_suffix=[291, 504, 279, 1467, 11, 714, 279, 1467, 374, 537, 304, 6364, 13, 5209, 14683, 279] student_suffix=[291, 504, 279, 2701, 1467, 25, 4710, 785, 2701, 374, 264, 1140, 315, 279, 1429, 3213] overlap_pct=18.750000 kl=1.747763554751e-2
eval_detail step=50 split=heldout prompt_index=2 prompt=[1, 785, 1401, 315, 279, 1967] teacher_suffix=[43059, 21938, 315, 7148, 1, 374, 264, 11245, 2168, 429, 702, 1012, 13570, 14975, 304, 279] student_suffix=[43059, 21938, 315, 7148, 1, 374, 264, 11245, 37250, 304, 279, 6424, 13, 1084, 374, 264] overlap_pct=50.000000 kl=4.366684810067e-2
eval_detail step=50 split=heldout prompt_index=3 prompt=[1, 3198, 279, 1296, 25, 220] teacher_suffix=[16, 24, 24, 15, 82, 11, 220, 17, 15, 15, 15, 82, 11, 220, 17, 15] student_suffix=[16, 24, 24, 15, 82, 11, 220, 17, 15, 15, 15, 82, 11, 220, 17, 15] overlap_pct=100.000000 kl=5.384251043790e-3
eval_summary step=50 train_overlap_pct=75.000000 heldout_overlap_pct=50.000000 train_kl=1.141541907359e-2 heldout_kl=2.124995536075e-2 eval_seconds=6.370272
training_summary total_steps=50 total_wall_seconds=18.775730 mean_step_seconds=0.248107 median_step_seconds=0.248859 mean_sampled_loss=3.363959320268e-5 first_sampled_loss=1.788745430531e-5 step200_sampled_loss=NaN final_sampled_loss=3.804211155511e-5 sampled_loss_reduction_200_pct=NaN sampled_loss_reduction_final_pct=-112.674822 train_kl_reduction_200_pct=NaN train_kl_reduction_final_pct=20.360748
summary_eval_row step=0 train_overlap_pct=74.218750 heldout_overlap_pct=50.000000 train_kl=1.433391042756e-2 heldout_kl=2.172812433186e-2
summary_eval_row step=50 train_overlap_pct=75.000000 heldout_overlap_pct=50.000000 train_kl=1.141541907359e-2 heldout_kl=2.124995536075e-2
