{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9974847287100251,
  "eval_steps": 100,
  "global_step": 347,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 763.18623046875,
      "epoch": 0.01437297879985627,
      "grad_norm": 0.0765276625752449,
      "kl": -6.394833326339721e-06,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 0.0157,
      "reward": 0.17431640625,
      "reward_std": 0.23442449774593116,
      "rewards/accuracy_reward": 0.08994140625,
      "rewards/format_reward": 0.084375,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 698.62314453125,
      "epoch": 0.02874595759971254,
      "grad_norm": 0.11506624519824982,
      "kl": 0.00981593132019043,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.0584,
      "reward": 0.6732421875,
      "reward_std": 0.3674958860501647,
      "rewards/accuracy_reward": 0.07646484375,
      "rewards/format_reward": 0.59677734375,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.0048828125,
      "epoch": 0.04311893639956881,
      "grad_norm": 0.050542764365673065,
      "kl": 0.02561187744140625,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.0354,
      "reward": 1.036328125,
      "reward_std": 0.2127559134736657,
      "rewards/accuracy_reward": 0.102734375,
      "rewards/format_reward": 0.93359375,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 632.689453125,
      "epoch": 0.05749191519942508,
      "grad_norm": 0.03859843313694,
      "kl": 0.0311004638671875,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 0.0249,
      "reward": 1.1552734375,
      "reward_std": 0.23164508808404208,
      "rewards/accuracy_reward": 0.20224609375,
      "rewards/format_reward": 0.95302734375,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 628.31357421875,
      "epoch": 0.07186489399928135,
      "grad_norm": 0.046529632061719894,
      "kl": 0.0368988037109375,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 0.0151,
      "reward": 1.157421875,
      "reward_std": 0.20364541225135327,
      "rewards/accuracy_reward": 0.188671875,
      "rewards/format_reward": 0.96875,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 671.5869140625,
      "epoch": 0.08623787279913762,
      "grad_norm": 0.037584338337183,
      "kl": 0.03684234619140625,
      "learning_rate": 1.7142857142857142e-05,
      "loss": 0.0213,
      "reward": 1.165234375,
      "reward_std": 0.24268896747380495,
      "rewards/accuracy_reward": 0.2177734375,
      "rewards/format_reward": 0.9474609375,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 646.013671875,
      "epoch": 0.1006108515989939,
      "grad_norm": 0.34336549043655396,
      "kl": 0.151519775390625,
      "learning_rate": 2e-05,
      "loss": 0.0359,
      "reward": 1.14091796875,
      "reward_std": 0.2653762998059392,
      "rewards/accuracy_reward": 0.2052734375,
      "rewards/format_reward": 0.93564453125,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 624.95908203125,
      "epoch": 0.11498383039885016,
      "grad_norm": 0.052160657942295074,
      "kl": 0.595263671875,
      "learning_rate": 1.9987329060020616e-05,
      "loss": 0.0668,
      "reward": 1.081640625,
      "reward_std": 0.3258050443604589,
      "rewards/accuracy_reward": 0.20439453125,
      "rewards/format_reward": 0.87724609375,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 691.025390625,
      "epoch": 0.12935680919870643,
      "grad_norm": 0.27032357454299927,
      "kl": 0.1677734375,
      "learning_rate": 1.9949348350626456e-05,
      "loss": 0.034,
      "reward": 0.9642578125,
      "reward_std": 0.4391048148274422,
      "rewards/accuracy_reward": 0.18740234375,
      "rewards/format_reward": 0.77685546875,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 440.34853515625,
      "epoch": 0.1437297879985627,
      "grad_norm": 0.6052369475364685,
      "kl": 0.800189208984375,
      "learning_rate": 1.9886154122075344e-05,
      "loss": 0.0919,
      "reward": 0.89814453125,
      "reward_std": 0.38281605690717696,
      "rewards/accuracy_reward": 0.11865234375,
      "rewards/format_reward": 0.7794921875,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 487.60771484375,
      "epoch": 0.15810276679841898,
      "grad_norm": 0.28784340620040894,
      "kl": 2.12225341796875,
      "learning_rate": 1.979790652042268e-05,
      "loss": 0.1039,
      "reward": 0.85263671875,
      "reward_std": 0.4635654494166374,
      "rewards/accuracy_reward": 0.13447265625,
      "rewards/format_reward": 0.7181640625,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 718.88359375,
      "epoch": 0.17247574559827525,
      "grad_norm": 0.38119208812713623,
      "kl": 0.38172607421875,
      "learning_rate": 1.9684829181681236e-05,
      "loss": 0.0502,
      "reward": 1.06494140625,
      "reward_std": 0.3414448471739888,
      "rewards/accuracy_reward": 0.21650390625,
      "rewards/format_reward": 0.8484375,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 621.63818359375,
      "epoch": 0.18684872439813152,
      "grad_norm": 0.3849119246006012,
      "kl": 1.819970703125,
      "learning_rate": 1.954720866508546e-05,
      "loss": 0.1892,
      "reward": 0.9689453125,
      "reward_std": 0.4041255243122578,
      "rewards/accuracy_reward": 0.16826171875,
      "rewards/format_reward": 0.80068359375,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 502.92744140625,
      "epoch": 0.2012217031979878,
      "grad_norm": 0.16367273032665253,
      "kl": 0.688922119140625,
      "learning_rate": 1.9385393726896492e-05,
      "loss": 0.0581,
      "reward": 1.1560546875,
      "reward_std": 0.22550129257142543,
      "rewards/accuracy_reward": 0.19248046875,
      "rewards/format_reward": 0.96357421875,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 563.1029296875,
      "epoch": 0.21559468199784404,
      "grad_norm": 0.1713869571685791,
      "kl": 0.0900238037109375,
      "learning_rate": 1.9199794436588244e-05,
      "loss": 0.0071,
      "reward": 1.1892578125,
      "reward_std": 0.2032089052721858,
      "rewards/accuracy_reward": 0.21513671875,
      "rewards/format_reward": 0.97412109375,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 622.5634765625,
      "epoch": 0.2299676607977003,
      "grad_norm": 0.2464917004108429,
      "kl": 0.144158935546875,
      "learning_rate": 1.899088113765426e-05,
      "loss": 0.0189,
      "reward": 1.1546875,
      "reward_std": 0.2610320156440139,
      "rewards/accuracy_reward": 0.21083984375,
      "rewards/format_reward": 0.94384765625,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 652.05927734375,
      "epoch": 0.24434063959755659,
      "grad_norm": 0.2248377948999405,
      "kl": 0.716436767578125,
      "learning_rate": 1.875918325566888e-05,
      "loss": 0.0578,
      "reward": 1.06005859375,
      "reward_std": 0.33321408815681935,
      "rewards/accuracy_reward": 0.171484375,
      "rewards/format_reward": 0.88857421875,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 669.3861328125,
      "epoch": 0.25871361839741286,
      "grad_norm": 0.27829509973526,
      "kl": 0.617529296875,
      "learning_rate": 1.8505287956623298e-05,
      "loss": 0.0585,
      "reward": 1.14755859375,
      "reward_std": 0.2751380069181323,
      "rewards/accuracy_reward": 0.20859375,
      "rewards/format_reward": 0.93896484375,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 662.1236328125,
      "epoch": 0.27308659719726913,
      "grad_norm": 0.2939702868461609,
      "kl": 0.5397705078125,
      "learning_rate": 1.8229838658936566e-05,
      "loss": 0.0555,
      "reward": 1.137890625,
      "reward_std": 0.2469838338904083,
      "rewards/accuracy_reward": 0.1900390625,
      "rewards/format_reward": 0.9478515625,
      "step": 95
    },
    {
      "epoch": 0.2874595759971254,
      "grad_norm": 0.1728806495666504,
      "learning_rate": 1.7933533402912354e-05,
      "loss": 0.103,
      "step": 100
    },
    {
      "epoch": 0.2874595759971254,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 611.2384828951579,
      "eval_kl": 0.50033329778157,
      "eval_loss": 0.06100574508309364,
      "eval_reward": 1.1420381825938566,
      "eval_reward_std": 0.27033696519433437,
      "eval_rewards/accuracy_reward": 0.2020051194539249,
      "eval_rewards/format_reward": 0.9400330631399317,
      "eval_runtime": 16336.0108,
      "eval_samples_per_second": 0.287,
      "eval_steps_per_second": 0.002,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 629.018017578125,
      "epoch": 0.3018325547969817,
      "grad_norm": 0.1207083985209465,
      "kl": 1.06016845703125,
      "learning_rate": 1.761712308177359e-05,
      "loss": 0.1074,
      "reward": 1.059326171875,
      "reward_std": 0.35213989242911337,
      "rewards/accuracy_reward": 0.18974609375,
      "rewards/format_reward": 0.869580078125,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 622.68330078125,
      "epoch": 0.31620553359683795,
      "grad_norm": 0.12369602918624878,
      "kl": 2.13466796875,
      "learning_rate": 1.7281409538757886e-05,
      "loss": 0.1546,
      "reward": 1.06484375,
      "reward_std": 0.3502559883520007,
      "rewards/accuracy_reward": 0.1806640625,
      "rewards/format_reward": 0.8841796875,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 619.0666015625,
      "epoch": 0.3305785123966942,
      "grad_norm": 0.13101035356521606,
      "kl": 0.932763671875,
      "learning_rate": 1.6927243535095995e-05,
      "loss": 0.0856,
      "reward": 1.14521484375,
      "reward_std": 0.2656426582485437,
      "rewards/accuracy_reward": 0.20322265625,
      "rewards/format_reward": 0.9419921875,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 633.12802734375,
      "epoch": 0.3449514911965505,
      "grad_norm": 0.13193248212337494,
      "kl": 0.9656982421875,
      "learning_rate": 1.655552259402295e-05,
      "loss": 0.0881,
      "reward": 1.14560546875,
      "reward_std": 0.27462361557409165,
      "rewards/accuracy_reward": 0.21337890625,
      "rewards/format_reward": 0.9322265625,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 653.04599609375,
      "epoch": 0.35932446999640677,
      "grad_norm": 0.3534374535083771,
      "kl": 1.867626953125,
      "learning_rate": 1.6167188726285433e-05,
      "loss": 0.1558,
      "reward": 1.05126953125,
      "reward_std": 0.36074890177696944,
      "rewards/accuracy_reward": 0.18544921875,
      "rewards/format_reward": 0.8658203125,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 630.14736328125,
      "epoch": 0.37369744879626304,
      "grad_norm": 2.0081052780151367,
      "kl": 1.8935546875,
      "learning_rate": 1.5763226042909455e-05,
      "loss": 0.1105,
      "reward": 1.0998046875,
      "reward_std": 0.3096121703274548,
      "rewards/accuracy_reward": 0.18486328125,
      "rewards/format_reward": 0.91494140625,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 614.62197265625,
      "epoch": 0.3880704275961193,
      "grad_norm": 0.1118120476603508,
      "kl": 0.59337158203125,
      "learning_rate": 1.5344658261278013e-05,
      "loss": 0.031,
      "reward": 1.16611328125,
      "reward_std": 0.24496497269719839,
      "rewards/accuracy_reward": 0.21005859375,
      "rewards/format_reward": 0.9560546875,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 610.4009765625,
      "epoch": 0.4024434063959756,
      "grad_norm": 0.18786092102527618,
      "kl": 0.7201416015625,
      "learning_rate": 1.4912546110838775e-05,
      "loss": 0.0608,
      "reward": 1.1451171875,
      "reward_std": 0.2563774929381907,
      "rewards/accuracy_reward": 0.2021484375,
      "rewards/format_reward": 0.94296875,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 604.14072265625,
      "epoch": 0.41681638519583186,
      "grad_norm": 0.12442336976528168,
      "kl": 0.96689453125,
      "learning_rate": 1.4467984645016259e-05,
      "loss": 0.0834,
      "reward": 1.13984375,
      "reward_std": 0.2728093104436994,
      "rewards/accuracy_reward": 0.2001953125,
      "rewards/format_reward": 0.9396484375,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 612.0369140625,
      "epoch": 0.4311893639956881,
      "grad_norm": 0.17537765204906464,
      "kl": 0.687255859375,
      "learning_rate": 1.4012100466140579e-05,
      "loss": 0.0628,
      "reward": 1.12919921875,
      "reward_std": 0.24853361072018743,
      "rewards/accuracy_reward": 0.17646484375,
      "rewards/format_reward": 0.952734375,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 609.915234375,
      "epoch": 0.44556234279554435,
      "grad_norm": 0.11783521622419357,
      "kl": 0.83641357421875,
      "learning_rate": 1.3546048870425356e-05,
      "loss": 0.0734,
      "reward": 1.12666015625,
      "reward_std": 0.264958731085062,
      "rewards/accuracy_reward": 0.18427734375,
      "rewards/format_reward": 0.9423828125,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 614.70517578125,
      "epoch": 0.4599353215954006,
      "grad_norm": 0.13742466270923615,
      "kl": 0.7468505859375,
      "learning_rate": 1.3071010920229909e-05,
      "loss": 0.0682,
      "reward": 1.122265625,
      "reward_std": 0.2766525615006685,
      "rewards/accuracy_reward": 0.18798828125,
      "rewards/format_reward": 0.93427734375,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 625.36552734375,
      "epoch": 0.4743083003952569,
      "grad_norm": 0.4238876700401306,
      "kl": 1.381640625,
      "learning_rate": 1.2588190451025209e-05,
      "loss": 0.1039,
      "reward": 1.13544921875,
      "reward_std": 0.31343956142663953,
      "rewards/accuracy_reward": 0.2201171875,
      "rewards/format_reward": 0.91533203125,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 672.3779296875,
      "epoch": 0.48868127919511317,
      "grad_norm": 0.13015827536582947,
      "kl": 1.4199462890625,
      "learning_rate": 1.2098811020648475e-05,
      "loss": 0.0989,
      "reward": 1.11416015625,
      "reward_std": 0.3195471292361617,
      "rewards/accuracy_reward": 0.208203125,
      "rewards/format_reward": 0.90595703125,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 631.84326171875,
      "epoch": 0.5030542579949695,
      "grad_norm": 0.2257327437400818,
      "kl": 1.1652099609375,
      "learning_rate": 1.1604112808577603e-05,
      "loss": 0.101,
      "reward": 1.1236328125,
      "reward_std": 0.30357036273926497,
      "rewards/accuracy_reward": 0.211328125,
      "rewards/format_reward": 0.9123046875,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 622.3265625,
      "epoch": 0.5174272367948257,
      "grad_norm": 0.11806362867355347,
      "kl": 0.7406005859375,
      "learning_rate": 1.11053494730832e-05,
      "loss": 0.0699,
      "reward": 1.1373046875,
      "reward_std": 0.25564199751242994,
      "rewards/accuracy_reward": 0.19658203125,
      "rewards/format_reward": 0.94072265625,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 652.4876953125,
      "epoch": 0.531800215594682,
      "grad_norm": 0.12807710468769073,
      "kl": 0.58621826171875,
      "learning_rate": 1.0603784974222862e-05,
      "loss": 0.0587,
      "reward": 1.173046875,
      "reward_std": 0.26026681158691645,
      "rewards/accuracy_reward": 0.2248046875,
      "rewards/format_reward": 0.9482421875,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 630.433203125,
      "epoch": 0.5461731943945383,
      "grad_norm": 0.10217402130365372,
      "kl": 0.9344970703125,
      "learning_rate": 1.0100690370728756e-05,
      "loss": 0.0809,
      "reward": 1.1609375,
      "reward_std": 0.2667428271844983,
      "rewards/accuracy_reward": 0.2150390625,
      "rewards/format_reward": 0.9458984375,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 617.68701171875,
      "epoch": 0.5605461731943946,
      "grad_norm": 0.13498954474925995,
      "kl": 0.67510986328125,
      "learning_rate": 9.597340598905851e-06,
      "loss": 0.0603,
      "reward": 1.1654296875,
      "reward_std": 0.25683426298201084,
      "rewards/accuracy_reward": 0.21796875,
      "rewards/format_reward": 0.9474609375,
      "step": 195
    },
    {
      "epoch": 0.5749191519942508,
      "grad_norm": 0.1882268339395523,
      "learning_rate": 9.095011241703623e-06,
      "loss": 0.0719,
      "step": 200
    },
    {
      "epoch": 0.5749191519942508,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 658.4643835907503,
      "eval_kl": 0.7806433980375427,
      "eval_loss": 0.06092459335923195,
      "eval_reward": 1.149637372013652,
      "eval_reward_std": 0.27955490747409467,
      "eval_rewards/accuracy_reward": 0.2150170648464164,
      "eval_rewards/format_reward": 0.9346203071672355,
      "eval_runtime": 16414.3395,
      "eval_samples_per_second": 0.286,
      "eval_steps_per_second": 0.002,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 652.56220703125,
      "epoch": 0.589292130794107,
      "grad_norm": 0.1547040194272995,
      "kl": 0.93699951171875,
      "learning_rate": 8.594975296149076e-06,
      "loss": 0.0647,
      "reward": 1.1623046875,
      "reward_std": 0.28741056518629193,
      "rewards/accuracy_reward": 0.23125,
      "rewards/format_reward": 0.9310546875,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 674.5626953125,
      "epoch": 0.6036651095939634,
      "grad_norm": 0.25151509046554565,
      "kl": 0.9999267578125,
      "learning_rate": 8.098499947332935e-06,
      "loss": 0.0775,
      "reward": 1.1466796875,
      "reward_std": 0.30369703844189644,
      "rewards/accuracy_reward": 0.22509765625,
      "rewards/format_reward": 0.92158203125,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 676.977734375,
      "epoch": 0.6180380883938196,
      "grad_norm": 0.20043928921222687,
      "kl": 0.7748779296875,
      "learning_rate": 7.606843357124426e-06,
      "loss": 0.0573,
      "reward": 1.15302734375,
      "reward_std": 0.28829708844423296,
      "rewards/accuracy_reward": 0.2244140625,
      "rewards/format_reward": 0.92861328125,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 655.955078125,
      "epoch": 0.6324110671936759,
      "grad_norm": 0.12682239711284637,
      "kl": 0.7095947265625,
      "learning_rate": 7.12125147575254e-06,
      "loss": 0.0548,
      "reward": 1.1763671875,
      "reward_std": 0.25821941047906877,
      "rewards/accuracy_reward": 0.23046875,
      "rewards/format_reward": 0.9458984375,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 647.839453125,
      "epoch": 0.6467840459935321,
      "grad_norm": 0.13890360295772552,
      "kl": 0.63245849609375,
      "learning_rate": 6.6429548843339554e-06,
      "loss": 0.0502,
      "reward": 1.1654296875,
      "reward_std": 0.2512395134195685,
      "rewards/accuracy_reward": 0.21337890625,
      "rewards/format_reward": 0.95205078125,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 632.53623046875,
      "epoch": 0.6611570247933884,
      "grad_norm": 0.15598197281360626,
      "kl": 0.87559814453125,
      "learning_rate": 6.173165676349103e-06,
      "loss": 0.0703,
      "reward": 1.155078125,
      "reward_std": 0.2729664742946625,
      "rewards/accuracy_reward": 0.213671875,
      "rewards/format_reward": 0.94140625,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 641.5361328125,
      "epoch": 0.6755300035932447,
      "grad_norm": 0.15446113049983978,
      "kl": 0.77437744140625,
      "learning_rate": 5.713074385969457e-06,
      "loss": 0.0688,
      "reward": 1.16953125,
      "reward_std": 0.28331395238637924,
      "rewards/accuracy_reward": 0.2296875,
      "rewards/format_reward": 0.93984375,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 647.3654296875,
      "epoch": 0.689902982393101,
      "grad_norm": 0.2089157998561859,
      "kl": 1.21328125,
      "learning_rate": 5.263846971020108e-06,
      "loss": 0.1016,
      "reward": 1.116796875,
      "reward_std": 0.31174491699784995,
      "rewards/accuracy_reward": 0.2029296875,
      "rewards/format_reward": 0.9138671875,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 645.523828125,
      "epoch": 0.7042759611929572,
      "grad_norm": 0.16784484684467316,
      "kl": 0.791552734375,
      "learning_rate": 4.826621858223431e-06,
      "loss": 0.0734,
      "reward": 1.143359375,
      "reward_std": 0.28859285488724706,
      "rewards/accuracy_reward": 0.2154296875,
      "rewards/format_reward": 0.9279296875,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 619.52958984375,
      "epoch": 0.7186489399928135,
      "grad_norm": 0.1753949671983719,
      "kl": 0.98125,
      "learning_rate": 4.40250705821178e-06,
      "loss": 0.0812,
      "reward": 1.1546875,
      "reward_std": 0.2736880548298359,
      "rewards/accuracy_reward": 0.2154296875,
      "rewards/format_reward": 0.9392578125,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 635.1673828125,
      "epoch": 0.7330219187926698,
      "grad_norm": 0.20336733758449554,
      "kl": 0.55863037109375,
      "learning_rate": 3.99257735762021e-06,
      "loss": 0.0458,
      "reward": 1.17392578125,
      "reward_std": 0.23981231823563576,
      "rewards/accuracy_reward": 0.21728515625,
      "rewards/format_reward": 0.956640625,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 630.9087890625,
      "epoch": 0.7473948975925261,
      "grad_norm": 0.16080701351165771,
      "kl": 0.696923828125,
      "learning_rate": 3.5978715953751207e-06,
      "loss": 0.0567,
      "reward": 1.1685546875,
      "reward_std": 0.24907034020870925,
      "rewards/accuracy_reward": 0.21376953125,
      "rewards/format_reward": 0.95478515625,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 623.18583984375,
      "epoch": 0.7617678763923823,
      "grad_norm": 0.18338614702224731,
      "kl": 1.0648681640625,
      "learning_rate": 3.2193900300810908e-06,
      "loss": 0.0778,
      "reward": 1.151953125,
      "reward_std": 0.26931764371693134,
      "rewards/accuracy_reward": 0.210546875,
      "rewards/format_reward": 0.94140625,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 634.9572265625,
      "epoch": 0.7761408551922386,
      "grad_norm": 0.13022945821285248,
      "kl": 0.7796142578125,
      "learning_rate": 2.8580918051775542e-06,
      "loss": 0.065,
      "reward": 1.165625,
      "reward_std": 0.27459610607475043,
      "rewards/accuracy_reward": 0.2244140625,
      "rewards/format_reward": 0.9412109375,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 633.88232421875,
      "epoch": 0.7905138339920948,
      "grad_norm": 0.1719103306531906,
      "kl": 0.8088623046875,
      "learning_rate": 2.514892518288988e-06,
      "loss": 0.0696,
      "reward": 1.15087890625,
      "reward_std": 0.2822716049849987,
      "rewards/accuracy_reward": 0.21640625,
      "rewards/format_reward": 0.93447265625,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 636.853125,
      "epoch": 0.8048868127919512,
      "grad_norm": 0.21337589621543884,
      "kl": 0.9040283203125,
      "learning_rate": 2.190661900928426e-06,
      "loss": 0.0753,
      "reward": 1.1412109375,
      "reward_std": 0.2784146698191762,
      "rewards/accuracy_reward": 0.2029296875,
      "rewards/format_reward": 0.93828125,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 639.5638671875,
      "epoch": 0.8192597915918074,
      "grad_norm": 0.1362425535917282,
      "kl": 0.95645751953125,
      "learning_rate": 1.8862216144342692e-06,
      "loss": 0.0749,
      "reward": 1.14130859375,
      "reward_std": 0.2679125562310219,
      "rewards/accuracy_reward": 0.20546875,
      "rewards/format_reward": 0.93583984375,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 639.32958984375,
      "epoch": 0.8336327703916637,
      "grad_norm": 0.13494881987571716,
      "kl": 0.8051513671875,
      "learning_rate": 1.6023431677260215e-06,
      "loss": 0.0684,
      "reward": 1.16240234375,
      "reward_std": 0.26225354727357625,
      "rewards/accuracy_reward": 0.21396484375,
      "rewards/format_reward": 0.9484375,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 636.16083984375,
      "epoch": 0.8480057491915199,
      "grad_norm": 0.16026277840137482,
      "kl": 0.8697265625,
      "learning_rate": 1.339745962155613e-06,
      "loss": 0.0712,
      "reward": 1.15966796875,
      "reward_std": 0.2733839010819793,
      "rewards/accuracy_reward": 0.21552734375,
      "rewards/format_reward": 0.944140625,
      "step": 295
    },
    {
      "epoch": 0.8623787279913762,
      "grad_norm": 0.155064195394516,
      "learning_rate": 1.099095468409156e-06,
      "loss": 0.0785,
      "step": 300
    },
    {
      "epoch": 0.8623787279913762,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 628.6954391531569,
      "eval_kl": 0.8880319432593856,
      "eval_loss": 0.07323075085878372,
      "eval_reward": 1.1617160836177474,
      "eval_reward_std": 0.2670084892838888,
      "eval_rewards/accuracy_reward": 0.21819005972696245,
      "eval_rewards/format_reward": 0.943526023890785,
      "eval_runtime": 16336.911,
      "eval_samples_per_second": 0.287,
      "eval_steps_per_second": 0.002,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 636.594287109375,
      "epoch": 0.8767517067912325,
      "grad_norm": 0.1458193063735962,
      "kl": 0.95950927734375,
      "learning_rate": 8.810015400790994e-07,
      "loss": 0.0809,
      "reward": 1.16162109375,
      "reward_std": 0.26864673662930727,
      "rewards/accuracy_reward": 0.2203125,
      "rewards/format_reward": 0.94130859375,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 642.09326171875,
      "epoch": 0.8911246855910887,
      "grad_norm": 0.14581456780433655,
      "kl": 0.82933349609375,
      "learning_rate": 6.860168681805946e-07,
      "loss": 0.0661,
      "reward": 1.16982421875,
      "reward_std": 0.26240854635834693,
      "rewards/accuracy_reward": 0.2216796875,
      "rewards/format_reward": 0.94814453125,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 642.76943359375,
      "epoch": 0.905497664390945,
      "grad_norm": 0.16072359681129456,
      "kl": 0.80216064453125,
      "learning_rate": 5.146355805285452e-07,
      "loss": 0.0637,
      "reward": 1.17431640625,
      "reward_std": 0.2672739554196596,
      "rewards/accuracy_reward": 0.22734375,
      "rewards/format_reward": 0.94697265625,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 644.221484375,
      "epoch": 0.9198706431908013,
      "grad_norm": 0.1719951331615448,
      "kl": 0.84737548828125,
      "learning_rate": 3.6729198952483725e-07,
      "loss": 0.0748,
      "reward": 1.158203125,
      "reward_std": 0.2642348381690681,
      "rewards/accuracy_reward": 0.2169921875,
      "rewards/format_reward": 0.9412109375,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 649.33896484375,
      "epoch": 0.9342436219906576,
      "grad_norm": 0.20107921957969666,
      "kl": 0.87275390625,
      "learning_rate": 2.4435949152906144e-07,
      "loss": 0.0757,
      "reward": 1.15966796875,
      "reward_std": 0.27580115627497437,
      "rewards/accuracy_reward": 0.22021484375,
      "rewards/format_reward": 0.939453125,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 643.75712890625,
      "epoch": 0.9486166007905138,
      "grad_norm": 0.14510348439216614,
      "kl": 0.821826171875,
      "learning_rate": 1.4614962060194303e-07,
      "loss": 0.0658,
      "reward": 1.140625,
      "reward_std": 0.2549537133425474,
      "rewards/accuracy_reward": 0.1978515625,
      "rewards/format_reward": 0.9427734375,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 643.530078125,
      "epoch": 0.9629895795903701,
      "grad_norm": 0.14030759036540985,
      "kl": 0.77998046875,
      "learning_rate": 7.291125901946027e-08,
      "loss": 0.0701,
      "reward": 1.1693359375,
      "reward_std": 0.2593334957957268,
      "rewards/accuracy_reward": 0.22529296875,
      "rewards/format_reward": 0.94404296875,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 646.19736328125,
      "epoch": 0.9773625583902263,
      "grad_norm": 0.16929689049720764,
      "kl": 0.835546875,
      "learning_rate": 2.4830006558373975e-08,
      "loss": 0.0697,
      "reward": 1.162109375,
      "reward_std": 0.26842295806854966,
      "rewards/accuracy_reward": 0.2173828125,
      "rewards/format_reward": 0.9447265625,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 642.66806640625,
      "epoch": 0.9917355371900827,
      "grad_norm": 0.17319317162036896,
      "kl": 0.85125732421875,
      "learning_rate": 2.0277101514987184e-09,
      "loss": 0.0724,
      "reward": 1.15966796875,
      "reward_std": 0.2777851399034262,
      "rewards/accuracy_reward": 0.21982421875,
      "rewards/format_reward": 0.93984375,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 643.1118812561035,
      "epoch": 0.9974847287100251,
      "kl": 0.8223876953125,
      "reward": 1.182861328125,
      "reward_std": 0.2758036791346967,
      "rewards/accuracy_reward": 0.25390625,
      "rewards/format_reward": 0.928955078125,
      "step": 347,
      "total_flos": 0.0,
      "train_loss": 0.0704507840104852,
      "train_runtime": 435678.5475,
      "train_samples_per_second": 0.204,
      "train_steps_per_second": 0.001
    }
  ],
  "logging_steps": 5,
  "max_steps": 347,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}