{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.1244167962674965,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 736.4702529907227,
      "epoch": 0.049766718506998445,
      "grad_norm": 0.2507069706916809,
      "kl": 0.0,
      "learning_rate": 7.142857142857142e-08,
      "loss": 0.0,
      "reward": 0.04415178840281442,
      "reward_std": 0.07034091584500857,
      "rewards/equation_reward_func": 0.04415178793715313,
      "rewards/format_reward_func": 0.0,
      "step": 2
    },
    {
      "completion_length": 723.1704015731812,
      "epoch": 0.09953343701399689,
      "grad_norm": 0.19884330034255981,
      "kl": 2.0936699339557663e-05,
      "learning_rate": 1.4285714285714285e-07,
      "loss": 0.0,
      "reward": 0.040647323767188936,
      "reward_std": 0.0637543131451821,
      "rewards/equation_reward_func": 0.04064732347615063,
      "rewards/format_reward_func": 0.0,
      "step": 4
    },
    {
      "completion_length": 726.5163822174072,
      "epoch": 0.14930015552099535,
      "grad_norm": 0.21145105361938477,
      "kl": 0.00019492170304147294,
      "learning_rate": 2.1428571428571426e-07,
      "loss": 0.0,
      "reward": 0.04095238326408435,
      "reward_std": 0.06441530691517983,
      "rewards/equation_reward_func": 0.040952383089461364,
      "rewards/format_reward_func": 0.0,
      "step": 6
    },
    {
      "completion_length": 737.8207015991211,
      "epoch": 0.19906687402799378,
      "grad_norm": 0.2020396590232849,
      "kl": 0.020334478189397487,
      "learning_rate": 2.857142857142857e-07,
      "loss": 0.0,
      "reward": 0.03635416827455629,
      "reward_std": 0.05670656039728783,
      "rewards/equation_reward_func": 0.03635416833276395,
      "rewards/format_reward_func": 0.0,
      "step": 8
    },
    {
      "completion_length": 718.5461435317993,
      "epoch": 0.24883359253499224,
      "grad_norm": 27.479997634887695,
      "kl": 9.990212610488015,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 0.01,
      "reward": 0.04543898967676796,
      "reward_std": 0.07194261607946828,
      "rewards/equation_reward_func": 0.04543899020063691,
      "rewards/format_reward_func": 0.0,
      "step": 10
    },
    {
      "completion_length": 721.6964378356934,
      "epoch": 0.2986003110419907,
      "grad_norm": 0.19479116797447205,
      "kl": 0.005109551766508957,
      "learning_rate": 4.285714285714285e-07,
      "loss": 0.0,
      "reward": 0.04148065741173923,
      "reward_std": 0.0677548690000549,
      "rewards/equation_reward_func": 0.04148065741173923,
      "rewards/format_reward_func": 0.0,
      "step": 12
    },
    {
      "completion_length": 725.9003086090088,
      "epoch": 0.3483670295489891,
      "grad_norm": 0.24158786237239838,
      "kl": 0.502096803898894,
      "learning_rate": 5e-07,
      "loss": 0.0005,
      "reward": 0.04821428797731642,
      "reward_std": 0.07803500922454987,
      "rewards/equation_reward_func": 0.048214288559393026,
      "rewards/format_reward_func": 0.0,
      "step": 14
    },
    {
      "completion_length": 722.5781373977661,
      "epoch": 0.39813374805598756,
      "grad_norm": 0.290544331073761,
      "kl": 0.23321715661586495,
      "learning_rate": 4.999740409224932e-07,
      "loss": 0.0002,
      "reward": 0.05134672833082732,
      "reward_std": 0.07946415679180063,
      "rewards/equation_reward_func": 0.05134672856365796,
      "rewards/format_reward_func": 0.0,
      "step": 16
    },
    {
      "completion_length": 723.7433156967163,
      "epoch": 0.447900466562986,
      "grad_norm": 4.716856479644775,
      "kl": 0.8582769820350222,
      "learning_rate": 4.998961690809627e-07,
      "loss": 0.0009,
      "reward": 0.050446430934243836,
      "reward_std": 0.07721506280358881,
      "rewards/equation_reward_func": 0.050446430992451496,
      "rewards/format_reward_func": 0.0,
      "step": 18
    },
    {
      "completion_length": 729.6198072433472,
      "epoch": 0.4976671850699845,
      "grad_norm": 0.23771615326404572,
      "kl": 0.3220994914881885,
      "learning_rate": 4.997664006472578e-07,
      "loss": 0.0003,
      "reward": 0.045706847246037796,
      "reward_std": 0.07288302374945488,
      "rewards/equation_reward_func": 0.045706847246037796,
      "rewards/format_reward_func": 0.0,
      "step": 20
    },
    {
      "completion_length": 712.8631057739258,
      "epoch": 0.5474339035769828,
      "grad_norm": 0.3750320374965668,
      "kl": 0.27479040302569047,
      "learning_rate": 4.995847625707292e-07,
      "loss": 0.0003,
      "reward": 0.05489583619055338,
      "reward_std": 0.0833382241835352,
      "rewards/equation_reward_func": 0.054895836423384026,
      "rewards/format_reward_func": 0.0,
      "step": 22
    },
    {
      "completion_length": 731.4576015472412,
      "epoch": 0.5972006220839814,
      "grad_norm": 0.2089901864528656,
      "kl": 0.12606932656490244,
      "learning_rate": 4.993512925726318e-07,
      "loss": 0.0001,
      "reward": 0.0600520860607503,
      "reward_std": 0.0899482914537657,
      "rewards/equation_reward_func": 0.06005208553688135,
      "rewards/format_reward_func": 0.0,
      "step": 24
    },
    {
      "completion_length": 706.6056671142578,
      "epoch": 0.6469673405909798,
      "grad_norm": 0.17620234191417694,
      "kl": 0.1556346261058934,
      "learning_rate": 4.990660391382923e-07,
      "loss": 0.0002,
      "reward": 0.05229166932986118,
      "reward_std": 0.07350753628998064,
      "rewards/equation_reward_func": 0.05229166956269182,
      "rewards/format_reward_func": 0.0,
      "step": 26
    },
    {
      "completion_length": 727.1808137893677,
      "epoch": 0.6967340590979783,
      "grad_norm": 0.19479602575302124,
      "kl": 0.12168441573157907,
      "learning_rate": 4.987290615070384e-07,
      "loss": 0.0001,
      "reward": 0.053683038655435666,
      "reward_std": 0.08042177859169897,
      "rewards/equation_reward_func": 0.053683039121096954,
      "rewards/format_reward_func": 0.0,
      "step": 28
    },
    {
      "completion_length": 720.8891496658325,
      "epoch": 0.7465007776049767,
      "grad_norm": 0.1857473999261856,
      "kl": 0.1632600230514072,
      "learning_rate": 4.983404296598978e-07,
      "loss": 0.0002,
      "reward": 0.05391369271092117,
      "reward_std": 0.08413292915793136,
      "rewards/equation_reward_func": 0.053913692419882864,
      "rewards/format_reward_func": 0.0,
      "step": 30
    },
    {
      "completion_length": 720.4337940216064,
      "epoch": 0.7962674961119751,
      "grad_norm": 0.23092247545719147,
      "kl": 0.15535293571883813,
      "learning_rate": 4.979002243050646e-07,
      "loss": 0.0002,
      "reward": 0.05988095561042428,
      "reward_std": 0.09168167802272364,
      "rewards/equation_reward_func": 0.05988095601787791,
      "rewards/format_reward_func": 0.0,
      "step": 32
    },
    {
      "completion_length": 718.6845378875732,
      "epoch": 0.8460342146189735,
      "grad_norm": 0.23407958447933197,
      "kl": 0.25782948260894045,
      "learning_rate": 4.974085368611381e-07,
      "loss": 0.0003,
      "reward": 0.06691220620996319,
      "reward_std": 0.09768064138188493,
      "rewards/equation_reward_func": 0.06691220562788658,
      "rewards/format_reward_func": 0.0,
      "step": 34
    },
    {
      "completion_length": 718.0454006195068,
      "epoch": 0.895800933125972,
      "grad_norm": 0.3076172471046448,
      "kl": 0.2404527408652939,
      "learning_rate": 4.968654694381379e-07,
      "loss": 0.0002,
      "reward": 0.07349702704232186,
      "reward_std": 0.10955648736853618,
      "rewards/equation_reward_func": 0.07349702733336017,
      "rewards/format_reward_func": 0.0,
      "step": 36
    },
    {
      "completion_length": 704.1488237380981,
      "epoch": 0.9455676516329704,
      "grad_norm": 0.2561110258102417,
      "kl": 0.43795167771168053,
      "learning_rate": 4.962711348162987e-07,
      "loss": 0.0004,
      "reward": 0.06241815793327987,
      "reward_std": 0.09217380215704907,
      "rewards/equation_reward_func": 0.0624181583407335,
      "rewards/format_reward_func": 0.0,
      "step": 38
    },
    {
      "completion_length": 707.3921279907227,
      "epoch": 0.995334370139969,
      "grad_norm": 0.3400561511516571,
      "kl": 0.5494289128109813,
      "learning_rate": 4.956256564226487e-07,
      "loss": 0.0005,
      "reward": 0.0764508958091028,
      "reward_std": 0.11110821401234716,
      "rewards/equation_reward_func": 0.07645089708967134,
      "rewards/format_reward_func": 0.0,
      "step": 40
    },
    {
      "completion_length": 715.0272221156529,
      "epoch": 1.0497667185069985,
      "grad_norm": 0.26081565022468567,
      "kl": 0.4236157455614635,
      "learning_rate": 4.949291683053768e-07,
      "loss": 0.0005,
      "reward": 0.07186394860701902,
      "reward_std": 0.10362207902861494,
      "rewards/equation_reward_func": 0.07186394876667432,
      "rewards/format_reward_func": 0.0,
      "step": 42
    },
    {
      "completion_length": 714.9486722946167,
      "epoch": 1.0995334370139969,
      "grad_norm": 0.29378727078437805,
      "kl": 0.3755593653768301,
      "learning_rate": 4.941818151059955e-07,
      "loss": 0.0004,
      "reward": 0.0799404798890464,
      "reward_std": 0.11443577655882109,
      "rewards/equation_reward_func": 0.07994047965621576,
      "rewards/format_reward_func": 0.0,
      "step": 44
    },
    {
      "completion_length": 727.829628944397,
      "epoch": 1.1493001555209954,
      "grad_norm": 2045.599365234375,
      "kl": 128.7541933595203,
      "learning_rate": 4.933837520293017e-07,
      "loss": 0.1288,
      "reward": 0.06808780113351531,
      "reward_std": 0.09949399236938916,
      "rewards/equation_reward_func": 0.06808780090068467,
      "rewards/format_reward_func": 0.0,
      "step": 46
    },
    {
      "completion_length": 709.632453918457,
      "epoch": 1.1990668740279937,
      "grad_norm": 0.2698291838169098,
      "kl": 0.4989726666826755,
      "learning_rate": 4.925351448111454e-07,
      "loss": 0.0005,
      "reward": 0.09389881315291859,
      "reward_std": 0.13221543522377033,
      "rewards/equation_reward_func": 0.09389881303650327,
      "rewards/format_reward_func": 0.0,
      "step": 48
    },
    {
      "completion_length": 719.485878944397,
      "epoch": 1.2488335925349923,
      "grad_norm": 0.36381521821022034,
      "kl": 0.550471473718062,
      "learning_rate": 4.91636169684011e-07,
      "loss": 0.0006,
      "reward": 0.08360863462439738,
      "reward_std": 0.11854775344545487,
      "rewards/equation_reward_func": 0.08360863421694376,
      "rewards/format_reward_func": 0.0,
      "step": 50
    },
    {
      "completion_length": 725.6599855422974,
      "epoch": 1.2986003110419908,
      "grad_norm": 0.3374347686767578,
      "kl": 0.663099701050669,
      "learning_rate": 4.906870133404186e-07,
      "loss": 0.0007,
      "reward": 0.08503720644512214,
      "reward_std": 0.12180299674218986,
      "rewards/equation_reward_func": 0.0850372067943681,
      "rewards/format_reward_func": 0.0,
      "step": 52
    },
    {
      "completion_length": 723.972484588623,
      "epoch": 1.3483670295489891,
      "grad_norm": 1.0345810651779175,
      "kl": 0.9573397457133979,
      "learning_rate": 4.896878728941531e-07,
      "loss": 0.001,
      "reward": 0.09177827867097221,
      "reward_std": 0.12253864679951221,
      "rewards/equation_reward_func": 0.09177827744861133,
      "rewards/format_reward_func": 0.0,
      "step": 54
    },
    {
      "completion_length": 712.2269496917725,
      "epoch": 1.3981337480559874,
      "grad_norm": 0.27968963980674744,
      "kl": 0.8391579431481659,
      "learning_rate": 4.886389558393284e-07,
      "loss": 0.0008,
      "reward": 0.08570684934966266,
      "reward_std": 0.1181660912843654,
      "rewards/equation_reward_func": 0.08570684841834009,
      "rewards/format_reward_func": 0.0,
      "step": 56
    },
    {
      "completion_length": 730.5327529907227,
      "epoch": 1.447900466562986,
      "grad_norm": 0.28138798475265503,
      "kl": 0.9094656470697373,
      "learning_rate": 4.875404800072976e-07,
      "loss": 0.0009,
      "reward": 0.08794643338478636,
      "reward_std": 0.12104765651747584,
      "rewards/equation_reward_func": 0.08794643309374806,
      "rewards/format_reward_func": 0.0,
      "step": 58
    },
    {
      "completion_length": 732.3861742019653,
      "epoch": 1.4976671850699845,
      "grad_norm": 0.34412360191345215,
      "kl": 1.009782899171114,
      "learning_rate": 4.86392673521415e-07,
      "loss": 0.001,
      "reward": 0.10000744601711631,
      "reward_std": 0.13957228315121029,
      "rewards/equation_reward_func": 0.10000744566787034,
      "rewards/format_reward_func": 0.0,
      "step": 60
    },
    {
      "completion_length": 725.0677175521851,
      "epoch": 1.5474339035769828,
      "grad_norm": 0.3454972207546234,
      "kl": 1.0763904643245041,
      "learning_rate": 4.851957747496606e-07,
      "loss": 0.0011,
      "reward": 0.10212798128486611,
      "reward_std": 0.13816983328433707,
      "rewards/equation_reward_func": 0.10212798012071289,
      "rewards/format_reward_func": 0.0,
      "step": 62
    },
    {
      "completion_length": 730.5171251296997,
      "epoch": 1.5972006220839814,
      "grad_norm": 0.3473067581653595,
      "kl": 1.4565551071427763,
      "learning_rate": 4.839500322551386e-07,
      "loss": 0.0015,
      "reward": 0.10485119439545088,
      "reward_std": 0.14129075466189533,
      "rewards/equation_reward_func": 0.10485119334771298,
      "rewards/format_reward_func": 0.0,
      "step": 64
    },
    {
      "completion_length": 735.0320043563843,
      "epoch": 1.64696734059098,
      "grad_norm": 0.3159619867801666,
      "kl": 1.5041364189237356,
      "learning_rate": 4.826557047444563e-07,
      "loss": 0.0015,
      "reward": 0.10093006424722262,
      "reward_std": 0.13811934839759488,
      "rewards/equation_reward_func": 0.1009300641308073,
      "rewards/format_reward_func": 0.0,
      "step": 66
    },
    {
      "completion_length": 730.7455463409424,
      "epoch": 1.6967340590979783,
      "grad_norm": 1.146909236907959,
      "kl": 2.238507369533181,
      "learning_rate": 4.813130610139993e-07,
      "loss": 0.0022,
      "reward": 0.10973958898102865,
      "reward_std": 0.13851106038782746,
      "rewards/equation_reward_func": 0.10973958781687543,
      "rewards/format_reward_func": 0.0,
      "step": 68
    },
    {
      "completion_length": 712.6971893310547,
      "epoch": 1.7465007776049766,
      "grad_norm": 7.27742338180542,
      "kl": 3.2542791040614247,
      "learning_rate": 4.799223798941089e-07,
      "loss": 0.0033,
      "reward": 0.12900298138265498,
      "reward_std": 0.15667404458508827,
      "rewards/equation_reward_func": 0.1290029831288848,
      "rewards/format_reward_func": 0.0,
      "step": 70
    },
    {
      "completion_length": 729.6331987380981,
      "epoch": 1.7962674961119751,
      "grad_norm": 10.986953735351562,
      "kl": 4.106183127500117,
      "learning_rate": 4.78483950191177e-07,
      "loss": 0.0041,
      "reward": 0.12543899397132918,
      "reward_std": 0.16567694948753342,
      "rewards/equation_reward_func": 0.12543899344746023,
      "rewards/format_reward_func": 0.0,
      "step": 72
    },
    {
      "completion_length": 737.0245656967163,
      "epoch": 1.8460342146189737,
      "grad_norm": 1.6122727394104004,
      "kl": 3.731540434062481,
      "learning_rate": 4.769980706276687e-07,
      "loss": 0.0037,
      "reward": 0.12507440976332873,
      "reward_std": 0.159569505834952,
      "rewards/equation_reward_func": 0.12507440929766744,
      "rewards/format_reward_func": 0.0,
      "step": 74
    },
    {
      "completion_length": 729.0632581710815,
      "epoch": 1.895800933125972,
      "grad_norm": 0.5852969288825989,
      "kl": 2.9793617641553283,
      "learning_rate": 4.7546504978008595e-07,
      "loss": 0.003,
      "reward": 0.12817708833608776,
      "reward_std": 0.1600989469443448,
      "rewards/equation_reward_func": 0.1281770879868418,
      "rewards/format_reward_func": 0.0,
      "step": 76
    },
    {
      "completion_length": 734.6302223205566,
      "epoch": 1.9455676516329703,
      "grad_norm": 0.9090600609779358,
      "kl": 3.139740688726306,
      "learning_rate": 4.738852060148848e-07,
      "loss": 0.0031,
      "reward": 0.13495536311529577,
      "reward_std": 0.1720278718858026,
      "rewards/equation_reward_func": 0.13495536299888045,
      "rewards/format_reward_func": 0.0,
      "step": 78
    },
    {
      "completion_length": 742.833345413208,
      "epoch": 1.995334370139969,
      "grad_norm": 0.5681818723678589,
      "kl": 3.712686972692609,
      "learning_rate": 4.722588674223593e-07,
      "loss": 0.0037,
      "reward": 0.13085565919755027,
      "reward_std": 0.15991040458902717,
      "rewards/equation_reward_func": 0.1308556593139656,
      "rewards/format_reward_func": 0.0,
      "step": 80
    },
    {
      "completion_length": 717.2042718184622,
      "epoch": 2.0248833592534994,
      "grad_norm": 1.5164953470230103,
      "kl": 5.466580171334116,
      "learning_rate": 4.70586371748506e-07,
      "loss": 0.0032,
      "reward": 0.14641604347056464,
      "reward_std": 0.18159407436063416,
      "rewards/equation_reward_func": 0.1464160444509042,
      "rewards/format_reward_func": 0.0,
      "step": 82
    },
    {
      "completion_length": 730.2589464187622,
      "epoch": 2.0746500777604977,
      "grad_norm": 0.6375504732131958,
      "kl": 4.280845553614199,
      "learning_rate": 4.6886806632488363e-07,
      "loss": 0.0043,
      "reward": 0.14213542238576338,
      "reward_std": 0.1740714008337818,
      "rewards/equation_reward_func": 0.14213542168727145,
      "rewards/format_reward_func": 0.0,
      "step": 84
    },
    {
      "completion_length": 744.4538831710815,
      "epoch": 2.124416796267496,
      "grad_norm": 0.9480769038200378,
      "kl": 7.16812994517386,
      "learning_rate": 4.6710430799648143e-07,
      "loss": 0.0072,
      "reward": 0.12831845637992956,
      "reward_std": 0.1582361755426973,
      "rewards/equation_reward_func": 0.12831845649634488,
      "rewards/format_reward_func": 0.0,
      "step": 86
    },
    {
      "completion_length": 732.5520973205566,
      "epoch": 2.1741835147744943,
      "grad_norm": 16.496623992919922,
      "kl": 10.49539315700531,
      "learning_rate": 4.652954630476127e-07,
      "loss": 0.0105,
      "reward": 0.14677828032290563,
      "reward_std": 0.1764058277476579,
      "rewards/equation_reward_func": 0.1467782796244137,
      "rewards/format_reward_func": 0.0,
      "step": 88
    },
    {
      "completion_length": 736.1361722946167,
      "epoch": 2.223950233281493,
      "grad_norm": 2.352017879486084,
      "kl": 10.109702784568071,
      "learning_rate": 4.6344190712584713e-07,
      "loss": 0.0101,
      "reward": 0.13781250565079972,
      "reward_std": 0.1627702646655962,
      "rewards/equation_reward_func": 0.13781250413740054,
      "rewards/format_reward_func": 0.0,
      "step": 90
    },
    {
      "completion_length": 749.1317129135132,
      "epoch": 2.2737169517884914,
      "grad_norm": 3.804121255874634,
      "kl": 15.052036292850971,
      "learning_rate": 4.615440251639995e-07,
      "loss": 0.0151,
      "reward": 0.14105655340244994,
      "reward_std": 0.17247924709226936,
      "rewards/equation_reward_func": 0.14105655369348824,
      "rewards/format_reward_func": 0.0,
      "step": 92
    },
    {
      "completion_length": 717.3884019851685,
      "epoch": 2.3234836702954897,
      "grad_norm": 2.226238489151001,
      "kl": 12.018643591552973,
      "learning_rate": 4.596022113001894e-07,
      "loss": 0.012,
      "reward": 0.15741816238733009,
      "reward_std": 0.17923290858743712,
      "rewards/equation_reward_func": 0.15741816128138453,
      "rewards/format_reward_func": 0.0,
      "step": 94
    },
    {
      "completion_length": 726.2500143051147,
      "epoch": 2.3732503888024885,
      "grad_norm": 2.1459925174713135,
      "kl": 12.27118530496955,
      "learning_rate": 4.576168687959895e-07,
      "loss": 0.0123,
      "reward": 0.16154762578662485,
      "reward_std": 0.18940409342758358,
      "rewards/equation_reward_func": 0.16154762508813292,
      "rewards/format_reward_func": 0.0,
      "step": 96
    },
    {
      "completion_length": 711.6696538925171,
      "epoch": 2.423017107309487,
      "grad_norm": 1.4883497953414917,
      "kl": 15.596692271530628,
      "learning_rate": 4.555884099526793e-07,
      "loss": 0.0156,
      "reward": 0.15925595845328644,
      "reward_std": 0.1815938005456701,
      "rewards/equation_reward_func": 0.1592559577547945,
      "rewards/format_reward_func": 0.0,
      "step": 98
    },
    {
      "completion_length": 719.6242723464966,
      "epoch": 2.472783825816485,
      "grad_norm": 4.10906982421875,
      "kl": 17.258602559566498,
      "learning_rate": 4.5351725602562174e-07,
      "loss": 0.0173,
      "reward": 0.17212054354604334,
      "reward_std": 0.18435519566992298,
      "rewards/equation_reward_func": 0.17212054308038205,
      "rewards/format_reward_func": 0.0,
      "step": 100
    },
    {
      "completion_length": 697.6637020111084,
      "epoch": 2.522550544323484,
      "grad_norm": 1.1079808473587036,
      "kl": 14.344636462628841,
      "learning_rate": 4.514038371367791e-07,
      "loss": 0.0143,
      "reward": 0.17430060362676159,
      "reward_std": 0.19522728596348315,
      "rewards/equation_reward_func": 0.17430060246260837,
      "rewards/format_reward_func": 0.0,
      "step": 102
    },
    {
      "completion_length": 695.2105755805969,
      "epoch": 2.5723172628304822,
      "grad_norm": 1.298901081085205,
      "kl": 15.563006613403559,
      "learning_rate": 4.4924859218538936e-07,
      "loss": 0.0156,
      "reward": 0.17871280398685485,
      "reward_std": 0.19645729020703584,
      "rewards/equation_reward_func": 0.17871280352119356,
      "rewards/format_reward_func": 0.0,
      "step": 104
    },
    {
      "completion_length": 687.2507581710815,
      "epoch": 2.6220839813374806,
      "grad_norm": 1.333657145500183,
      "kl": 14.787582196295261,
      "learning_rate": 4.470519687568185e-07,
      "loss": 0.0148,
      "reward": 0.19031250709667802,
      "reward_std": 0.2006249635014683,
      "rewards/equation_reward_func": 0.19031250721309334,
      "rewards/format_reward_func": 0.0,
      "step": 106
    },
    {
      "completion_length": 672.3839402198792,
      "epoch": 2.671850699844479,
      "grad_norm": 1.4585353136062622,
      "kl": 20.08526621758938,
      "learning_rate": 4.4481442302960923e-07,
      "loss": 0.0201,
      "reward": 0.18158482806757092,
      "reward_std": 0.1955818484420888,
      "rewards/equation_reward_func": 0.18158482783474028,
      "rewards/format_reward_func": 0.0,
      "step": 108
    },
    {
      "completion_length": 651.4077491760254,
      "epoch": 2.721617418351477,
      "grad_norm": 1.516221523284912,
      "kl": 17.027776926755905,
      "learning_rate": 4.4253641968074505e-07,
      "loss": 0.017,
      "reward": 0.1995759003330022,
      "reward_std": 0.21349556557834148,
      "rewards/equation_reward_func": 0.19957590056583285,
      "rewards/format_reward_func": 0.0,
      "step": 110
    },
    {
      "completion_length": 672.9442043304443,
      "epoch": 2.771384136858476,
      "grad_norm": 2.0658159255981445,
      "kl": 20.176754418760538,
      "learning_rate": 4.402184317891501e-07,
      "loss": 0.0202,
      "reward": 0.20375744753982872,
      "reward_std": 0.18776777852326632,
      "rewards/equation_reward_func": 0.2037574463756755,
      "rewards/format_reward_func": 0.0,
      "step": 112
    },
    {
      "completion_length": 665.7247114181519,
      "epoch": 2.8211508553654743,
      "grad_norm": 2.339445114135742,
      "kl": 22.64492540061474,
      "learning_rate": 4.37860940737443e-07,
      "loss": 0.0226,
      "reward": 0.1926413766341284,
      "reward_std": 0.2001927924575284,
      "rewards/equation_reward_func": 0.19264137593563646,
      "rewards/format_reward_func": 0.0,
      "step": 114
    },
    {
      "completion_length": 669.665937423706,
      "epoch": 2.8709175738724726,
      "grad_norm": 2.852607011795044,
      "kl": 32.22943264245987,
      "learning_rate": 4.354644361119671e-07,
      "loss": 0.0322,
      "reward": 0.19950893591158092,
      "reward_std": 0.1933421454159543,
      "rewards/equation_reward_func": 0.19950893614441156,
      "rewards/format_reward_func": 0.0,
      "step": 116
    },
    {
      "completion_length": 670.7053713798523,
      "epoch": 2.9206842923794714,
      "grad_norm": 2.6619129180908203,
      "kl": 27.73328886926174,
      "learning_rate": 4.3302941560111716e-07,
      "loss": 0.0277,
      "reward": 0.19388393545523286,
      "reward_std": 0.19777346146292984,
      "rewards/equation_reward_func": 0.1938839361537248,
      "rewards/format_reward_func": 0.0,
      "step": 118
    },
    {
      "completion_length": 676.3571548461914,
      "epoch": 2.9704510108864697,
      "grad_norm": 3.816153049468994,
      "kl": 27.2223904132843,
      "learning_rate": 4.3055638489198236e-07,
      "loss": 0.0272,
      "reward": 0.20729167491663247,
      "reward_std": 0.20934273721650243,
      "rewards/equation_reward_func": 0.20729167328681797,
      "rewards/format_reward_func": 0.0,
      "step": 120
    },
    {
      "completion_length": 659.7907361482319,
      "epoch": 3.0,
      "grad_norm": 0.624527633190155,
      "kl": 27.528421577654388,
      "learning_rate": 4.280458575653296e-07,
      "loss": 0.0163,
      "reward": 0.20659148869545838,
      "reward_std": 0.19081004316869535,
      "rewards/equation_reward_func": 0.20659148947973,
      "rewards/format_reward_func": 0.0,
      "step": 122
    },
    {
      "completion_length": 659.4025421142578,
      "epoch": 3.0497667185069983,
      "grad_norm": 3.345853567123413,
      "kl": 21.34368522465229,
      "learning_rate": 4.2549835498894665e-07,
      "loss": 0.0213,
      "reward": 0.22118304355535656,
      "reward_std": 0.21869899448938668,
      "rewards/equation_reward_func": 0.22118304437026381,
      "rewards/format_reward_func": 0.0,
      "step": 124
    },
    {
      "completion_length": 672.1183128356934,
      "epoch": 3.099533437013997,
      "grad_norm": 6.106723785400391,
      "kl": 23.556977652013302,
      "learning_rate": 4.229144062093679e-07,
      "loss": 0.0236,
      "reward": 0.21467262762598693,
      "reward_std": 0.2053254572674632,
      "rewards/equation_reward_func": 0.21467262762598693,
      "rewards/format_reward_func": 0.0,
      "step": 126
    },
    {
      "completion_length": 653.0297751426697,
      "epoch": 3.1493001555209954,
      "grad_norm": 5.746135234832764,
      "kl": 26.1618300229311,
      "learning_rate": 4.2029454784200675e-07,
      "loss": 0.0262,
      "reward": 0.21742560202255845,
      "reward_std": 0.2172505116323009,
      "rewards/equation_reward_func": 0.217425603303127,
      "rewards/format_reward_func": 0.0,
      "step": 128
    },
    {
      "completion_length": 645.058048248291,
      "epoch": 3.1990668740279937,
      "grad_norm": 60.6376953125,
      "kl": 53.1397475451231,
      "learning_rate": 4.1763932395971433e-07,
      "loss": 0.0531,
      "reward": 0.2241517937509343,
      "reward_std": 0.20952896296512336,
      "rewards/equation_reward_func": 0.22415179491508752,
      "rewards/format_reward_func": 0.0,
      "step": 130
    },
    {
      "completion_length": 632.6659345626831,
      "epoch": 3.248833592534992,
      "grad_norm": 5.82427978515625,
      "kl": 41.686398059129715,
      "learning_rate": 4.1494928597979117e-07,
      "loss": 0.0417,
      "reward": 0.22440477029886097,
      "reward_std": 0.2128691952675581,
      "rewards/equation_reward_func": 0.22440477076452225,
      "rewards/format_reward_func": 0.0,
      "step": 132
    },
    {
      "completion_length": 639.6711411476135,
      "epoch": 3.298600311041991,
      "grad_norm": 3.375183343887329,
      "kl": 36.797510489821434,
      "learning_rate": 4.122249925494726e-07,
      "loss": 0.0368,
      "reward": 0.2161235201638192,
      "reward_std": 0.20362528192345053,
      "rewards/equation_reward_func": 0.21612352062948048,
      "rewards/format_reward_func": 0.0,
      "step": 134
    },
    {
      "completion_length": 651.2276935577393,
      "epoch": 3.348367029548989,
      "grad_norm": 5.04212760925293,
      "kl": 37.60325849056244,
      "learning_rate": 4.094670094299131e-07,
      "loss": 0.0376,
      "reward": 0.22996280749794096,
      "reward_std": 0.214357816032134,
      "rewards/equation_reward_func": 0.22996280703227967,
      "rewards/format_reward_func": 0.0,
      "step": 136
    },
    {
      "completion_length": 631.5751585960388,
      "epoch": 3.3981337480559874,
      "grad_norm": 4.119243144989014,
      "kl": 43.57139265537262,
      "learning_rate": 4.066759093786931e-07,
      "loss": 0.0436,
      "reward": 0.2285714359022677,
      "reward_std": 0.21766341011971235,
      "rewards/equation_reward_func": 0.22857143532019109,
      "rewards/format_reward_func": 0.0,
      "step": 138
    },
    {
      "completion_length": 647.8214359283447,
      "epoch": 3.447900466562986,
      "grad_norm": 7.117722988128662,
      "kl": 60.4551947414875,
      "learning_rate": 4.038522720308732e-07,
      "loss": 0.0605,
      "reward": 0.21806548640597612,
      "reward_std": 0.20702184177935123,
      "rewards/equation_reward_func": 0.2180654831463471,
      "rewards/format_reward_func": 0.0,
      "step": 140
    },
    {
      "completion_length": 609.9583463668823,
      "epoch": 3.4976671850699845,
      "grad_norm": 4.748437881469727,
      "kl": 58.59304141998291,
      "learning_rate": 4.009966837786194e-07,
      "loss": 0.0586,
      "reward": 0.2300297737820074,
      "reward_std": 0.20853826915845275,
      "rewards/equation_reward_func": 0.23002976982388645,
      "rewards/format_reward_func": 0.0,
      "step": 142
    },
    {
      "completion_length": 631.8430180549622,
      "epoch": 3.547433903576983,
      "grad_norm": 8.042330741882324,
      "kl": 82.30807757377625,
      "learning_rate": 3.981097376494259e-07,
      "loss": 0.0823,
      "reward": 0.21836310264188796,
      "reward_std": 0.20933940180111676,
      "rewards/equation_reward_func": 0.21836310101207346,
      "rewards/format_reward_func": 0.0,
      "step": 144
    },
    {
      "completion_length": 624.0669736862183,
      "epoch": 3.5972006220839816,
      "grad_norm": 7.811219692230225,
      "kl": 77.89375275373459,
      "learning_rate": 3.951920331829592e-07,
      "loss": 0.0779,
      "reward": 0.2207961401436478,
      "reward_std": 0.21105306909885257,
      "rewards/equation_reward_func": 0.22079613932874054,
      "rewards/format_reward_func": 0.0,
      "step": 146
    },
    {
      "completion_length": 623.5215888023376,
      "epoch": 3.64696734059098,
      "grad_norm": 8.836230278015137,
      "kl": 65.97143815457821,
      "learning_rate": 3.922441763065506e-07,
      "loss": 0.066,
      "reward": 0.2193824496353045,
      "reward_std": 0.20604081987403333,
      "rewards/equation_reward_func": 0.21938244777265936,
      "rewards/format_reward_func": 0.0,
      "step": 148
    },
    {
      "completion_length": 634.7611751556396,
      "epoch": 3.6967340590979783,
      "grad_norm": 5.354574680328369,
      "kl": 56.36278319358826,
      "learning_rate": 3.8926677920936093e-07,
      "loss": 0.0564,
      "reward": 0.2112648879410699,
      "reward_std": 0.2029515573522076,
      "rewards/equation_reward_func": 0.21126488805748522,
      "rewards/format_reward_func": 0.0,
      "step": 150
    },
    {
      "completion_length": 636.0297775268555,
      "epoch": 3.7465007776049766,
      "grad_norm": 5.276882648468018,
      "kl": 65.72037261724472,
      "learning_rate": 3.862604602152464e-07,
      "loss": 0.0657,
      "reward": 0.20753721124492586,
      "reward_std": 0.20195745571982116,
      "rewards/equation_reward_func": 0.20753721171058714,
      "rewards/format_reward_func": 0.0,
      "step": 152
    },
    {
      "completion_length": 634.954626083374,
      "epoch": 3.796267496111975,
      "grad_norm": 8.027347564697266,
      "kl": 77.93326985836029,
      "learning_rate": 3.8322584365434934e-07,
      "loss": 0.0779,
      "reward": 0.2165699511533603,
      "reward_std": 0.2101849897298962,
      "rewards/equation_reward_func": 0.2165699495235458,
      "rewards/format_reward_func": 0.0,
      "step": 154
    },
    {
      "completion_length": 638.3660817146301,
      "epoch": 3.8460342146189737,
      "grad_norm": 4.954690456390381,
      "kl": 83.4894488453865,
      "learning_rate": 3.8016355973344173e-07,
      "loss": 0.0835,
      "reward": 0.21200893796049058,
      "reward_std": 0.21022081119008362,
      "rewards/equation_reward_func": 0.21200893679633737,
      "rewards/format_reward_func": 0.0,
      "step": 156
    },
    {
      "completion_length": 620.3281378746033,
      "epoch": 3.895800933125972,
      "grad_norm": 4.270212650299072,
      "kl": 82.2349089384079,
      "learning_rate": 3.7707424440504863e-07,
      "loss": 0.0822,
| "reward": 0.211755960714072, | |
| "reward_std": 0.20715959300287068, | |
| "rewards/equation_reward_func": 0.21175595885142684, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 632.0409350395203, | |
| "epoch": 3.9455676516329703, | |
| "grad_norm": 4.687271595001221, | |
| "kl": 90.35439342260361, | |
| "learning_rate": 3.739585392353787e-07, | |
| "loss": 0.0904, | |
| "reward": 0.21921131818089634, | |
| "reward_std": 0.20252067118417472, | |
| "rewards/equation_reward_func": 0.21921131608542055, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 630.2678661346436, | |
| "epoch": 3.995334370139969, | |
| "grad_norm": 5.595997333526611, | |
| "kl": 95.46352458000183, | |
| "learning_rate": 3.7081709127108767e-07, | |
| "loss": 0.0955, | |
| "reward": 0.22013393603265285, | |
| "reward_std": 0.2177246706560254, | |
| "rewards/equation_reward_func": 0.2201339368475601, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 632.1065288342928, | |
| "epoch": 4.024883359253499, | |
| "grad_norm": 8.787236213684082, | |
| "kl": 144.07192611694336, | |
| "learning_rate": 3.6765055290490513e-07, | |
| "loss": 0.0855, | |
| "reward": 0.20649123721216855, | |
| "reward_std": 0.21240881752026708, | |
| "rewards/equation_reward_func": 0.2064912359377271, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 619.5156345367432, | |
| "epoch": 4.074650077760498, | |
| "grad_norm": 7.552036762237549, | |
| "kl": 137.199125289917, | |
| "learning_rate": 3.644595817401501e-07, | |
| "loss": 0.1372, | |
| "reward": 0.2162797685014084, | |
| "reward_std": 0.21547920361626893, | |
| "rewards/equation_reward_func": 0.2162797685014084, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 618.7634057998657, | |
| "epoch": 4.1244167962674965, | |
| "grad_norm": 6.8007354736328125, | |
| "kl": 103.6235063970089, | |
| "learning_rate": 3.6124484045416483e-07, | |
| "loss": 0.1036, | |
| "reward": 0.23168899782467633, | |
| "reward_std": 0.21457487577572465, | |
| "rewards/equation_reward_func": 0.23168899829033762, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 637.4136991500854, | |
| "epoch": 4.174183514774494, | |
| "grad_norm": 8.004964828491211, | |
| "kl": 113.37393373250961, | |
| "learning_rate": 3.580069966606949e-07, | |
| "loss": 0.1134, | |
| "reward": 0.21156250836793333, | |
| "reward_std": 0.2123116059228778, | |
| "rewards/equation_reward_func": 0.21156250790227205, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 634.7485208511353, | |
| "epoch": 4.223950233281493, | |
| "grad_norm": 7.898318290710449, | |
| "kl": 109.72896337509155, | |
| "learning_rate": 3.547467227712444e-07, | |
| "loss": 0.1097, | |
| "reward": 0.2029910811688751, | |
| "reward_std": 0.20662414643447846, | |
| "rewards/equation_reward_func": 0.20299108081962913, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 621.2730751037598, | |
| "epoch": 4.273716951788492, | |
| "grad_norm": 7.211435317993164, | |
| "kl": 99.61057341098785, | |
| "learning_rate": 3.5146469585543386e-07, | |
| "loss": 0.0996, | |
| "reward": 0.22819941327907145, | |
| "reward_std": 0.2186455992050469, | |
| "rewards/equation_reward_func": 0.22819941234774888, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 640.9628086090088, | |
| "epoch": 4.32348367029549, | |
| "grad_norm": 7.790672302246094, | |
| "kl": 93.87813127040863, | |
| "learning_rate": 3.481615975003922e-07, | |
| "loss": 0.0939, | |
| "reward": 0.2149925670819357, | |
| "reward_std": 0.20749260939192027, | |
| "rewards/equation_reward_func": 0.2149925702251494, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 615.1093888282776, | |
| "epoch": 4.3732503888024885, | |
| "grad_norm": 22.329519271850586, | |
| "kl": 87.78260296583176, | |
| "learning_rate": 3.448381136692089e-07, | |
| "loss": 0.0878, | |
| "reward": 0.21617560542654246, | |
| "reward_std": 0.20247984025627375, | |
| "rewards/equation_reward_func": 0.2161756035638973, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 629.4829001426697, | |
| "epoch": 4.423017107309486, | |
| "grad_norm": 13.893996238708496, | |
| "kl": 98.21013808250427, | |
| "learning_rate": 3.4149493455847897e-07, | |
| "loss": 0.0982, | |
| "reward": 0.21152530901599675, | |
| "reward_std": 0.2093647257424891, | |
| "rewards/equation_reward_func": 0.21152530668769032, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 623.7224802970886, | |
| "epoch": 4.472783825816485, | |
| "grad_norm": 7.4938130378723145, | |
| "kl": 149.59339570999146, | |
| "learning_rate": 3.3813275445496766e-07, | |
| "loss": 0.1496, | |
| "reward": 0.2145535812014714, | |
| "reward_std": 0.2063142586266622, | |
| "rewards/equation_reward_func": 0.214553578523919, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 639.263400554657, | |
| "epoch": 4.522550544323484, | |
| "grad_norm": 6.325891494750977, | |
| "kl": 147.64970636367798, | |
| "learning_rate": 3.347522715914262e-07, | |
| "loss": 0.1476, | |
| "reward": 0.20923363824840635, | |
| "reward_std": 0.20685563085135072, | |
| "rewards/equation_reward_func": 0.20923363824840635, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 636.6897439956665, | |
| "epoch": 4.572317262830482, | |
| "grad_norm": 4.635812759399414, | |
| "kl": 130.48132091760635, | |
| "learning_rate": 3.313541880015877e-07, | |
| "loss": 0.1305, | |
| "reward": 0.21598215226549655, | |
| "reward_std": 0.2006415540818125, | |
| "rewards/equation_reward_func": 0.21598214923869818, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 631.9933152198792, | |
| "epoch": 4.6220839813374806, | |
| "grad_norm": 7.933198928833008, | |
| "kl": 118.75544810295105, | |
| "learning_rate": 3.279392093743747e-07, | |
| "loss": 0.1188, | |
| "reward": 0.22688244911842048, | |
| "reward_std": 0.22052743670064956, | |
| "rewards/equation_reward_func": 0.22688244772143662, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 632.7038769721985, | |
| "epoch": 4.671850699844479, | |
| "grad_norm": 6.763364791870117, | |
| "kl": 112.75827008485794, | |
| "learning_rate": 3.245080449073459e-07, | |
| "loss": 0.1128, | |
| "reward": 0.2060937569476664, | |
| "reward_std": 0.20044768252409995, | |
| "rewards/equation_reward_func": 0.2060937574133277, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 632.4464421272278, | |
| "epoch": 4.721617418351477, | |
| "grad_norm": 4.295353412628174, | |
| "kl": 108.82453501224518, | |
| "learning_rate": 3.210614071594162e-07, | |
| "loss": 0.1088, | |
| "reward": 0.20745536405593157, | |
| "reward_std": 0.21275918127503246, | |
| "rewards/equation_reward_func": 0.2074553637066856, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 634.1763515472412, | |
| "epoch": 4.771384136858476, | |
| "grad_norm": 4.46217679977417, | |
| "kl": 118.317107796669, | |
| "learning_rate": 3.1760001190287695e-07, | |
| "loss": 0.1183, | |
| "reward": 0.20520090113859624, | |
| "reward_std": 0.2021206704666838, | |
| "rewards/equation_reward_func": 0.20520090113859624, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 620.2395968437195, | |
| "epoch": 4.821150855365475, | |
| "grad_norm": 4.841196060180664, | |
| "kl": 119.24478554725647, | |
| "learning_rate": 3.141245779747502e-07, | |
| "loss": 0.1192, | |
| "reward": 0.21259673358872533, | |
| "reward_std": 0.21422103908844292, | |
| "rewards/equation_reward_func": 0.21259673358872533, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 609.0446557998657, | |
| "epoch": 4.870917573872473, | |
| "grad_norm": 4.3330559730529785, | |
| "kl": 119.67610502243042, | |
| "learning_rate": 3.106358271275056e-07, | |
| "loss": 0.1197, | |
| "reward": 0.22683036630041897, | |
| "reward_std": 0.20717181416694075, | |
| "rewards/equation_reward_func": 0.22683036653324962, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 614.8869152069092, | |
| "epoch": 4.920684292379471, | |
| "grad_norm": 92.09661102294922, | |
| "kl": 144.53644692897797, | |
| "learning_rate": 3.0713448387917227e-07, | |
| "loss": 0.1445, | |
| "reward": 0.21901042643003166, | |
| "reward_std": 0.20682094641961157, | |
| "rewards/equation_reward_func": 0.2190104245673865, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 631.4241156578064, | |
| "epoch": 4.970451010886469, | |
| "grad_norm": 6.355322360992432, | |
| "kl": 154.4233751296997, | |
| "learning_rate": 3.0362127536287636e-07, | |
| "loss": 0.1544, | |
| "reward": 0.21773066406603903, | |
| "reward_std": 0.21250074298586696, | |
| "rewards/equation_reward_func": 0.2177306618541479, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 624.7180488987973, | |
| "epoch": 5.0, | |
| "grad_norm": 5.770173072814941, | |
| "kl": 161.87928571199117, | |
| "learning_rate": 3.0009693117583523e-07, | |
| "loss": 0.0961, | |
| "reward": 0.21541354177813782, | |
| "reward_std": 0.20374000229333578, | |
| "rewards/equation_reward_func": 0.215413541386002, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 624.5647420883179, | |
| "epoch": 5.049766718506999, | |
| "grad_norm": 6.884070873260498, | |
| "kl": 157.92570447921753, | |
| "learning_rate": 2.965621832278401e-07, | |
| "loss": 0.1579, | |
| "reward": 0.22669643780682236, | |
| "reward_std": 0.20801680884324014, | |
| "rewards/equation_reward_func": 0.22669643454719335, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 614.1570081710815, | |
| "epoch": 5.099533437013997, | |
| "grad_norm": 4.670907497406006, | |
| "kl": 134.14546036720276, | |
| "learning_rate": 2.9301776558925875e-07, | |
| "loss": 0.1341, | |
| "reward": 0.2188244123244658, | |
| "reward_std": 0.20453347032889724, | |
| "rewards/equation_reward_func": 0.21882441325578839, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 614.4702506065369, | |
| "epoch": 5.149300155520995, | |
| "grad_norm": 14.716873168945312, | |
| "kl": 109.80421262979507, | |
| "learning_rate": 2.894644143385885e-07, | |
| "loss": 0.1098, | |
| "reward": 0.21839286445174366, | |
| "reward_std": 0.20062782417517155, | |
| "rewards/equation_reward_func": 0.21839286398608238, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 622.4672718048096, | |
| "epoch": 5.199066874027994, | |
| "grad_norm": 10.858051300048828, | |
| "kl": 114.28983092308044, | |
| "learning_rate": 2.859028674095937e-07, | |
| "loss": 0.1143, | |
| "reward": 0.2192782819038257, | |
| "reward_std": 0.2128367607947439, | |
| "rewards/equation_reward_func": 0.21927828167099506, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 612.6160840988159, | |
| "epoch": 5.248833592534992, | |
| "grad_norm": 3.8785901069641113, | |
| "kl": 125.06462055444717, | |
| "learning_rate": 2.823338644380566e-07, | |
| "loss": 0.1251, | |
| "reward": 0.23020090232603252, | |
| "reward_std": 0.2176531965378672, | |
| "rewards/equation_reward_func": 0.23020089999772608, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 635.8995633125305, | |
| "epoch": 5.298600311041991, | |
| "grad_norm": 5.062567234039307, | |
| "kl": 148.21274209022522, | |
| "learning_rate": 2.7875814660817504e-07, | |
| "loss": 0.1482, | |
| "reward": 0.2193973324028775, | |
| "reward_std": 0.22195886494591832, | |
| "rewards/equation_reward_func": 0.21939733054023236, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 630.8229269981384, | |
| "epoch": 5.348367029548989, | |
| "grad_norm": 5.181402206420898, | |
| "kl": 165.8618984222412, | |
| "learning_rate": 2.751764564986396e-07, | |
| "loss": 0.1659, | |
| "reward": 0.2077009006170556, | |
| "reward_std": 0.2193935844115913, | |
| "rewards/equation_reward_func": 0.2077009001513943, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 628.6517939567566, | |
| "epoch": 5.3981337480559874, | |
| "grad_norm": 4.105767726898193, | |
| "kl": 148.7712802886963, | |
| "learning_rate": 2.715895379284194e-07, | |
| "loss": 0.1488, | |
| "reward": 0.2191815583501011, | |
| "reward_std": 0.20989621221087873, | |
| "rewards/equation_reward_func": 0.21918155602179468, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 629.8006067276001, | |
| "epoch": 5.447900466562986, | |
| "grad_norm": 3.895611524581909, | |
| "kl": 142.22095596790314, | |
| "learning_rate": 2.6799813580229174e-07, | |
| "loss": 0.1422, | |
| "reward": 0.22290923492982984, | |
| "reward_std": 0.21323461562860757, | |
| "rewards/equation_reward_func": 0.2229092346969992, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 608.6183171272278, | |
| "epoch": 5.497667185069984, | |
| "grad_norm": 6.331876277923584, | |
| "kl": 135.1478552222252, | |
| "learning_rate": 2.6440299595614606e-07, | |
| "loss": 0.1351, | |
| "reward": 0.21991072362288833, | |
| "reward_std": 0.22133340197615325, | |
| "rewards/equation_reward_func": 0.21991072269156575, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 611.6756086349487, | |
| "epoch": 5.547433903576983, | |
| "grad_norm": 3.41554594039917, | |
| "kl": 135.47022581100464, | |
| "learning_rate": 2.6080486500209347e-07, | |
| "loss": 0.1355, | |
| "reward": 0.21784971025772393, | |
| "reward_std": 0.21086209290660918, | |
| "rewards/equation_reward_func": 0.2178497090935707, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 609.0922722816467, | |
| "epoch": 5.597200622083982, | |
| "grad_norm": 4.638352870941162, | |
| "kl": 149.68241280317307, | |
| "learning_rate": 2.572044901734166e-07, | |
| "loss": 0.1497, | |
| "reward": 0.22438989242073148, | |
| "reward_std": 0.2241612394573167, | |
| "rewards/equation_reward_func": 0.2243898919550702, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 629.8534321784973, | |
| "epoch": 5.6469673405909795, | |
| "grad_norm": 4.474099159240723, | |
| "kl": 164.97060561180115, | |
| "learning_rate": 2.536026191693893e-07, | |
| "loss": 0.165, | |
| "reward": 0.2060565553838387, | |
| "reward_std": 0.21067888580728322, | |
| "rewards/equation_reward_func": 0.20605655445251614, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 626.8482217788696, | |
| "epoch": 5.696734059097978, | |
| "grad_norm": 9.778329849243164, | |
| "kl": 169.21773087978363, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.1692, | |
| "reward": 0.20911459170747548, | |
| "reward_std": 0.21599237713962793, | |
| "rewards/equation_reward_func": 0.2091145912418142, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 629.8660821914673, | |
| "epoch": 5.746500777604977, | |
| "grad_norm": 5.210114479064941, | |
| "kl": 171.0250325202942, | |
| "learning_rate": 2.4639738083061073e-07, | |
| "loss": 0.171, | |
| "reward": 0.2135788791347295, | |
| "reward_std": 0.20587447995785624, | |
| "rewards/equation_reward_func": 0.21357887890189886, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 628.7165260314941, | |
| "epoch": 5.796267496111975, | |
| "grad_norm": 4.644392490386963, | |
| "kl": 149.7915449142456, | |
| "learning_rate": 2.4279550982658345e-07, | |
| "loss": 0.1498, | |
| "reward": 0.20833334070630372, | |
| "reward_std": 0.21195052459370345, | |
| "rewards/equation_reward_func": 0.20833334047347307, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 628.755964756012, | |
| "epoch": 5.846034214618974, | |
| "grad_norm": 6.456798076629639, | |
| "kl": 442.08424025774, | |
| "learning_rate": 2.3919513499790646e-07, | |
| "loss": 0.4421, | |
| "reward": 0.22005209047347307, | |
| "reward_std": 0.21488765871617943, | |
| "rewards/equation_reward_func": 0.22005209024064243, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 612.3988199234009, | |
| "epoch": 5.895800933125972, | |
| "grad_norm": 9.304161071777344, | |
| "kl": 118.21684062480927, | |
| "learning_rate": 2.3559700404385394e-07, | |
| "loss": 0.1182, | |
| "reward": 0.22447917505633086, | |
| "reward_std": 0.211615604814142, | |
| "rewards/equation_reward_func": 0.22447917482350022, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 633.3660821914673, | |
| "epoch": 5.94556765163297, | |
| "grad_norm": 5.745642185211182, | |
| "kl": 133.20424818992615, | |
| "learning_rate": 2.3200186419770823e-07, | |
| "loss": 0.1332, | |
| "reward": 0.2242708442499861, | |
| "reward_std": 0.2152464333921671, | |
| "rewards/equation_reward_func": 0.22427084331866354, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 618.1235270500183, | |
| "epoch": 5.995334370139969, | |
| "grad_norm": 4.167017936706543, | |
| "kl": 143.97905486822128, | |
| "learning_rate": 2.284104620715807e-07, | |
| "loss": 0.144, | |
| "reward": 0.22046875627711415, | |
| "reward_std": 0.21442426112480462, | |
| "rewards/equation_reward_func": 0.22046875732485205, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 634.5175580476459, | |
| "epoch": 6.024883359253499, | |
| "grad_norm": 3.44785213470459, | |
| "kl": 167.55113441065737, | |
| "learning_rate": 2.2482354350136043e-07, | |
| "loss": 0.0995, | |
| "reward": 0.21961153769179395, | |
| "reward_std": 0.2146961924276854, | |
| "rewards/equation_reward_func": 0.21961153769179395, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 634.5863180160522, | |
| "epoch": 6.074650077760498, | |
| "grad_norm": 7.954348564147949, | |
| "kl": 163.61565399169922, | |
| "learning_rate": 2.2124185339182496e-07, | |
| "loss": 0.1636, | |
| "reward": 0.23546131700277328, | |
| "reward_std": 0.2178129724925384, | |
| "rewards/equation_reward_func": 0.23546131781768054, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 610.0825996398926, | |
| "epoch": 6.1244167962674965, | |
| "grad_norm": 4.648006439208984, | |
| "kl": 167.8152883052826, | |
| "learning_rate": 2.1766613556194344e-07, | |
| "loss": 0.1678, | |
| "reward": 0.22144346224376932, | |
| "reward_std": 0.21030379901640117, | |
| "rewards/equation_reward_func": 0.22144346177810803, | |
| "rewards/format_reward_func": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 6.1244167962674965, | |
| "step": 250, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0, | |
| "train_runtime": 0.0058, | |
| "train_samples_per_second": 3851297.791, | |
| "train_steps_per_second": 17193.294 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |