Bakalarska_praca/results/checkpoint-11730/trainer_state.json

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 11730,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01278772378516624,
"grad_norm": 5291.65576171875,
"learning_rate": 1.0000000000000002e-06,
"loss": 46.8468,
"step": 10
},
{
"epoch": 0.02557544757033248,
"grad_norm": 657.4127807617188,
"learning_rate": 2.0000000000000003e-06,
"loss": 47.4188,
"step": 20
},
{
"epoch": 0.03836317135549872,
"grad_norm": 8509.5302734375,
"learning_rate": 3e-06,
"loss": 55.5624,
"step": 30
},
{
"epoch": 0.05115089514066496,
"grad_norm": 427.65924072265625,
"learning_rate": 4.000000000000001e-06,
"loss": 54.0157,
"step": 40
},
{
"epoch": 0.0639386189258312,
"grad_norm": 7448.37353515625,
"learning_rate": 5e-06,
"loss": 57.6548,
"step": 50
},
{
"epoch": 0.07672634271099744,
"grad_norm": 14357.810546875,
"learning_rate": 6e-06,
"loss": 45.0872,
"step": 60
},
{
"epoch": 0.08951406649616368,
"grad_norm": 4495.298828125,
"learning_rate": 7.000000000000001e-06,
"loss": 49.2105,
"step": 70
},
{
"epoch": 0.10230179028132992,
"grad_norm": 3922.909912109375,
"learning_rate": 8.000000000000001e-06,
"loss": 45.772,
"step": 80
},
{
"epoch": 0.11508951406649616,
"grad_norm": 47730.671875,
"learning_rate": 9e-06,
"loss": 54.9209,
"step": 90
},
{
"epoch": 0.1278772378516624,
"grad_norm": 27943.875,
"learning_rate": 1e-05,
"loss": 47.032,
"step": 100
},
{
"epoch": 0.14066496163682865,
"grad_norm": 185.7626953125,
"learning_rate": 1.1000000000000001e-05,
"loss": 55.6442,
"step": 110
},
{
"epoch": 0.1534526854219949,
"grad_norm": 4819.99365234375,
"learning_rate": 1.2e-05,
"loss": 47.2024,
"step": 120
},
{
"epoch": 0.16624040920716113,
"grad_norm": 16820.35546875,
"learning_rate": 1.3000000000000001e-05,
"loss": 47.1665,
"step": 130
},
{
"epoch": 0.17902813299232737,
"grad_norm": 408.82489013671875,
"learning_rate": 1.4000000000000001e-05,
"loss": 45.6075,
"step": 140
},
{
"epoch": 0.1918158567774936,
"grad_norm": 29451.880859375,
"learning_rate": 1.5e-05,
"loss": 50.9366,
"step": 150
},
{
"epoch": 0.20460358056265984,
"grad_norm": 28413.0390625,
"learning_rate": 1.6000000000000003e-05,
"loss": 44.6847,
"step": 160
},
{
"epoch": 0.21739130434782608,
"grad_norm": 799.9179077148438,
"learning_rate": 1.7000000000000003e-05,
"loss": 46.8802,
"step": 170
},
{
"epoch": 0.23017902813299232,
"grad_norm": 2510.53515625,
"learning_rate": 1.8e-05,
"loss": 46.2193,
"step": 180
},
{
"epoch": 0.24296675191815856,
"grad_norm": 2892.29248046875,
"learning_rate": 1.9e-05,
"loss": 41.2469,
"step": 190
},
{
"epoch": 0.2557544757033248,
"grad_norm": 738.676513671875,
"learning_rate": 2e-05,
"loss": 41.8658,
"step": 200
},
{
"epoch": 0.26854219948849106,
"grad_norm": 215.02032470703125,
"learning_rate": 2.1e-05,
"loss": 42.2854,
"step": 210
},
{
"epoch": 0.2813299232736573,
"grad_norm": 1281.8134765625,
"learning_rate": 2.2000000000000003e-05,
"loss": 48.7005,
"step": 220
},
{
"epoch": 0.29411764705882354,
"grad_norm": 677.4962158203125,
"learning_rate": 2.3000000000000003e-05,
"loss": 36.2834,
"step": 230
},
{
"epoch": 0.3069053708439898,
"grad_norm": 2442.72900390625,
"learning_rate": 2.4e-05,
"loss": 44.3992,
"step": 240
},
{
"epoch": 0.319693094629156,
"grad_norm": 135.88478088378906,
"learning_rate": 2.5e-05,
"loss": 42.5502,
"step": 250
},
{
"epoch": 0.33248081841432225,
"grad_norm": 5432.8203125,
"learning_rate": 2.6000000000000002e-05,
"loss": 41.375,
"step": 260
},
{
"epoch": 0.3452685421994885,
"grad_norm": 3573.05419921875,
"learning_rate": 2.7000000000000002e-05,
"loss": 40.6085,
"step": 270
},
{
"epoch": 0.35805626598465473,
"grad_norm": 806.6569213867188,
"learning_rate": 2.8000000000000003e-05,
"loss": 40.9676,
"step": 280
},
{
"epoch": 0.37084398976982097,
"grad_norm": 537.9384765625,
"learning_rate": 2.9e-05,
"loss": 36.0962,
"step": 290
},
{
"epoch": 0.3836317135549872,
"grad_norm": 1317.54150390625,
"learning_rate": 3e-05,
"loss": 38.838,
"step": 300
},
{
"epoch": 0.39641943734015345,
"grad_norm": 1044.3780517578125,
"learning_rate": 3.1e-05,
"loss": 35.0419,
"step": 310
},
{
"epoch": 0.4092071611253197,
"grad_norm": 6332.0888671875,
"learning_rate": 3.2000000000000005e-05,
"loss": 35.0986,
"step": 320
},
{
"epoch": 0.4219948849104859,
"grad_norm": 1020.596923828125,
"learning_rate": 3.3e-05,
"loss": 35.2819,
"step": 330
},
{
"epoch": 0.43478260869565216,
"grad_norm": 4903.22119140625,
"learning_rate": 3.4000000000000007e-05,
"loss": 27.0823,
"step": 340
},
{
"epoch": 0.4475703324808184,
"grad_norm": 705.4653930664062,
"learning_rate": 3.5e-05,
"loss": 35.8111,
"step": 350
},
{
"epoch": 0.46035805626598464,
"grad_norm": 166.60675048828125,
"learning_rate": 3.6e-05,
"loss": 32.9624,
"step": 360
},
{
"epoch": 0.4731457800511509,
"grad_norm": 1294.9737548828125,
"learning_rate": 3.7e-05,
"loss": 27.3774,
"step": 370
},
{
"epoch": 0.4859335038363171,
"grad_norm": 143.36048889160156,
"learning_rate": 3.8e-05,
"loss": 35.3593,
"step": 380
},
{
"epoch": 0.49872122762148335,
"grad_norm": 2351.956787109375,
"learning_rate": 3.9000000000000006e-05,
"loss": 31.6628,
"step": 390
},
{
"epoch": 0.5115089514066496,
"grad_norm": 1412.3145751953125,
"learning_rate": 4e-05,
"loss": 24.9052,
"step": 400
},
{
"epoch": 0.5242966751918159,
"grad_norm": 2179.52294921875,
"learning_rate": 4.1e-05,
"loss": 28.4615,
"step": 410
},
{
"epoch": 0.5370843989769821,
"grad_norm": 1550.5777587890625,
"learning_rate": 4.2e-05,
"loss": 27.8655,
"step": 420
},
{
"epoch": 0.5498721227621484,
"grad_norm": 14167.97265625,
"learning_rate": 4.3e-05,
"loss": 31.339,
"step": 430
},
{
"epoch": 0.5626598465473146,
"grad_norm": 1213.757568359375,
"learning_rate": 4.4000000000000006e-05,
"loss": 29.2414,
"step": 440
},
{
"epoch": 0.5754475703324808,
"grad_norm": 1910.325439453125,
"learning_rate": 4.5e-05,
"loss": 28.1563,
"step": 450
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2075.203369140625,
"learning_rate": 4.600000000000001e-05,
"loss": 30.5137,
"step": 460
},
{
"epoch": 0.6010230179028133,
"grad_norm": 2741.365966796875,
"learning_rate": 4.7e-05,
"loss": 23.7269,
"step": 470
},
{
"epoch": 0.6138107416879796,
"grad_norm": 6818.3935546875,
"learning_rate": 4.8e-05,
"loss": 20.7367,
"step": 480
},
{
"epoch": 0.6265984654731458,
"grad_norm": 14128.44140625,
"learning_rate": 4.9e-05,
"loss": 23.4038,
"step": 490
},
{
"epoch": 0.639386189258312,
"grad_norm": 268.3045654296875,
"learning_rate": 5e-05,
"loss": 25.6224,
"step": 500
},
{
"epoch": 0.6521739130434783,
"grad_norm": 7697.00341796875,
"learning_rate": 4.995547640249332e-05,
"loss": 24.966,
"step": 510
},
{
"epoch": 0.6649616368286445,
"grad_norm": 1550.7242431640625,
"learning_rate": 4.9910952804986644e-05,
"loss": 27.2747,
"step": 520
},
{
"epoch": 0.6777493606138107,
"grad_norm": 5949.79150390625,
"learning_rate": 4.986642920747996e-05,
"loss": 25.6169,
"step": 530
},
{
"epoch": 0.690537084398977,
"grad_norm": 966.1629638671875,
"learning_rate": 4.982190560997329e-05,
"loss": 22.0359,
"step": 540
},
{
"epoch": 0.7033248081841432,
"grad_norm": 14809.646484375,
"learning_rate": 4.977738201246661e-05,
"loss": 20.4764,
"step": 550
},
{
"epoch": 0.7161125319693095,
"grad_norm": 5175.0810546875,
"learning_rate": 4.9732858414959934e-05,
"loss": 21.0584,
"step": 560
},
{
"epoch": 0.7289002557544757,
"grad_norm": 620.6530151367188,
"learning_rate": 4.968833481745325e-05,
"loss": 20.8162,
"step": 570
},
{
"epoch": 0.7416879795396419,
"grad_norm": 545.9930419921875,
"learning_rate": 4.9643811219946576e-05,
"loss": 19.4457,
"step": 580
},
{
"epoch": 0.7544757033248082,
"grad_norm": 394.00115966796875,
"learning_rate": 4.9599287622439894e-05,
"loss": 20.3071,
"step": 590
},
{
"epoch": 0.7672634271099744,
"grad_norm": 642.2852783203125,
"learning_rate": 4.955476402493322e-05,
"loss": 19.3772,
"step": 600
},
{
"epoch": 0.7800511508951407,
"grad_norm": 775.719970703125,
"learning_rate": 4.951024042742654e-05,
"loss": 21.5308,
"step": 610
},
{
"epoch": 0.7928388746803069,
"grad_norm": 3649.2021484375,
"learning_rate": 4.946571682991986e-05,
"loss": 18.0963,
"step": 620
},
{
"epoch": 0.8056265984654731,
"grad_norm": 3447.32421875,
"learning_rate": 4.9421193232413184e-05,
"loss": 18.1472,
"step": 630
},
{
"epoch": 0.8184143222506394,
"grad_norm": 250.08575439453125,
"learning_rate": 4.93766696349065e-05,
"loss": 17.4447,
"step": 640
},
{
"epoch": 0.8312020460358056,
"grad_norm": 371.5052490234375,
"learning_rate": 4.9332146037399826e-05,
"loss": 18.3444,
"step": 650
},
{
"epoch": 0.8439897698209718,
"grad_norm": 878.1161499023438,
"learning_rate": 4.928762243989314e-05,
"loss": 17.2934,
"step": 660
},
{
"epoch": 0.8567774936061381,
"grad_norm": 1265.5709228515625,
"learning_rate": 4.924309884238647e-05,
"loss": 16.3026,
"step": 670
},
{
"epoch": 0.8695652173913043,
"grad_norm": 591.113037109375,
"learning_rate": 4.919857524487979e-05,
"loss": 16.274,
"step": 680
},
{
"epoch": 0.8823529411764706,
"grad_norm": 219.59530639648438,
"learning_rate": 4.915405164737311e-05,
"loss": 17.4999,
"step": 690
},
{
"epoch": 0.8951406649616368,
"grad_norm": 708.5332641601562,
"learning_rate": 4.9109528049866433e-05,
"loss": 15.9393,
"step": 700
},
{
"epoch": 0.907928388746803,
"grad_norm": 430.86468505859375,
"learning_rate": 4.906500445235975e-05,
"loss": 16.0164,
"step": 710
},
{
"epoch": 0.9207161125319693,
"grad_norm": 1154.94873046875,
"learning_rate": 4.9020480854853075e-05,
"loss": 18.7462,
"step": 720
},
{
"epoch": 0.9335038363171355,
"grad_norm": 405.43511962890625,
"learning_rate": 4.897595725734639e-05,
"loss": 15.9433,
"step": 730
},
{
"epoch": 0.9462915601023018,
"grad_norm": 85.01339721679688,
"learning_rate": 4.893143365983972e-05,
"loss": 15.8037,
"step": 740
},
{
"epoch": 0.959079283887468,
"grad_norm": 309.97119140625,
"learning_rate": 4.888691006233304e-05,
"loss": 14.8662,
"step": 750
},
{
"epoch": 0.9718670076726342,
"grad_norm": 554.9390869140625,
"learning_rate": 4.884238646482636e-05,
"loss": 14.4255,
"step": 760
},
{
"epoch": 0.9846547314578005,
"grad_norm": 830.1895141601562,
"learning_rate": 4.879786286731968e-05,
"loss": 14.9826,
"step": 770
},
{
"epoch": 0.9974424552429667,
"grad_norm": 133.27218627929688,
"learning_rate": 4.8753339269813e-05,
"loss": 13.5141,
"step": 780
},
{
"epoch": 1.0,
"eval_loss": 8.323198318481445,
"eval_runtime": 0.8605,
"eval_samples_per_second": 113.894,
"eval_steps_per_second": 15.108,
"step": 782
},
{
"epoch": 1.010230179028133,
"grad_norm": 2045.128173828125,
"learning_rate": 4.8708815672306325e-05,
"loss": 12.1253,
"step": 790
},
{
"epoch": 1.0230179028132993,
"grad_norm": 288.3453369140625,
"learning_rate": 4.866429207479964e-05,
"loss": 12.204,
"step": 800
},
{
"epoch": 1.0358056265984654,
"grad_norm": 507.2064208984375,
"learning_rate": 4.8619768477292966e-05,
"loss": 13.964,
"step": 810
},
{
"epoch": 1.0485933503836318,
"grad_norm": 359.73388671875,
"learning_rate": 4.857524487978629e-05,
"loss": 13.5956,
"step": 820
},
{
"epoch": 1.061381074168798,
"grad_norm": 113.6794662475586,
"learning_rate": 4.8530721282279615e-05,
"loss": 12.8579,
"step": 830
},
{
"epoch": 1.0741687979539642,
"grad_norm": 754.2025756835938,
"learning_rate": 4.848619768477293e-05,
"loss": 13.0858,
"step": 840
},
{
"epoch": 1.0869565217391304,
"grad_norm": 191.79119873046875,
"learning_rate": 4.844167408726625e-05,
"loss": 10.2861,
"step": 850
},
{
"epoch": 1.0997442455242967,
"grad_norm": 138.1201629638672,
"learning_rate": 4.8397150489759574e-05,
"loss": 9.7727,
"step": 860
},
{
"epoch": 1.1125319693094629,
"grad_norm": 201.81227111816406,
"learning_rate": 4.835262689225289e-05,
"loss": 11.2379,
"step": 870
},
{
"epoch": 1.1253196930946292,
"grad_norm": 2496.734619140625,
"learning_rate": 4.8308103294746216e-05,
"loss": 12.2517,
"step": 880
},
{
"epoch": 1.1381074168797953,
"grad_norm": 126.88189697265625,
"learning_rate": 4.826357969723954e-05,
"loss": 10.6216,
"step": 890
},
{
"epoch": 1.1508951406649617,
"grad_norm": 189.23846435546875,
"learning_rate": 4.8219056099732865e-05,
"loss": 11.7789,
"step": 900
},
{
"epoch": 1.1636828644501278,
"grad_norm": 137.6693115234375,
"learning_rate": 4.817453250222618e-05,
"loss": 10.3337,
"step": 910
},
{
"epoch": 1.1764705882352942,
"grad_norm": 233.86508178710938,
"learning_rate": 4.8130008904719506e-05,
"loss": 10.3479,
"step": 920
},
{
"epoch": 1.1892583120204603,
"grad_norm": 1433.0694580078125,
"learning_rate": 4.8085485307212824e-05,
"loss": 10.1257,
"step": 930
},
{
"epoch": 1.2020460358056266,
"grad_norm": 241.59225463867188,
"learning_rate": 4.804096170970615e-05,
"loss": 10.8335,
"step": 940
},
{
"epoch": 1.2148337595907928,
"grad_norm": 751.7616577148438,
"learning_rate": 4.7996438112199466e-05,
"loss": 9.9136,
"step": 950
},
{
"epoch": 1.227621483375959,
"grad_norm": 137.66688537597656,
"learning_rate": 4.795191451469279e-05,
"loss": 9.6519,
"step": 960
},
{
"epoch": 1.2404092071611252,
"grad_norm": 576.8855590820312,
"learning_rate": 4.7907390917186114e-05,
"loss": 8.2642,
"step": 970
},
{
"epoch": 1.2531969309462916,
"grad_norm": 59.55119705200195,
"learning_rate": 4.786286731967943e-05,
"loss": 7.5509,
"step": 980
},
{
"epoch": 1.265984654731458,
"grad_norm": 364.87939453125,
"learning_rate": 4.7818343722172756e-05,
"loss": 8.2691,
"step": 990
},
{
"epoch": 1.278772378516624,
"grad_norm": 80.77793884277344,
"learning_rate": 4.777382012466607e-05,
"loss": 8.1777,
"step": 1000
},
{
"epoch": 1.2915601023017902,
"grad_norm": 166.04991149902344,
"learning_rate": 4.77292965271594e-05,
"loss": 8.2977,
"step": 1010
},
{
"epoch": 1.3043478260869565,
"grad_norm": 201.66940307617188,
"learning_rate": 4.7684772929652715e-05,
"loss": 7.6507,
"step": 1020
},
{
"epoch": 1.317135549872123,
"grad_norm": 115.5215835571289,
"learning_rate": 4.764024933214604e-05,
"loss": 8.4317,
"step": 1030
},
{
"epoch": 1.329923273657289,
"grad_norm": 856.268310546875,
"learning_rate": 4.7595725734639364e-05,
"loss": 7.4182,
"step": 1040
},
{
"epoch": 1.3427109974424551,
"grad_norm": 164.53457641601562,
"learning_rate": 4.755120213713268e-05,
"loss": 7.0188,
"step": 1050
},
{
"epoch": 1.3554987212276215,
"grad_norm": 284.8175354003906,
"learning_rate": 4.7506678539626005e-05,
"loss": 7.4934,
"step": 1060
},
{
"epoch": 1.3682864450127878,
"grad_norm": 914.1421508789062,
"learning_rate": 4.746215494211932e-05,
"loss": 6.3777,
"step": 1070
},
{
"epoch": 1.381074168797954,
"grad_norm": 364.2792663574219,
"learning_rate": 4.741763134461265e-05,
"loss": 7.0009,
"step": 1080
},
{
"epoch": 1.39386189258312,
"grad_norm": 40.856849670410156,
"learning_rate": 4.7373107747105965e-05,
"loss": 6.5175,
"step": 1090
},
{
"epoch": 1.4066496163682864,
"grad_norm": 20.656641006469727,
"learning_rate": 4.732858414959929e-05,
"loss": 6.5319,
"step": 1100
},
{
"epoch": 1.4194373401534528,
"grad_norm": 102.03244018554688,
"learning_rate": 4.728406055209261e-05,
"loss": 5.6601,
"step": 1110
},
{
"epoch": 1.432225063938619,
"grad_norm": 166.99356079101562,
"learning_rate": 4.723953695458593e-05,
"loss": 5.4954,
"step": 1120
},
{
"epoch": 1.445012787723785,
"grad_norm": 35.748558044433594,
"learning_rate": 4.7195013357079255e-05,
"loss": 5.5329,
"step": 1130
},
{
"epoch": 1.4578005115089514,
"grad_norm": 152.98487854003906,
"learning_rate": 4.715048975957257e-05,
"loss": 5.2992,
"step": 1140
},
{
"epoch": 1.4705882352941178,
"grad_norm": 35.46538162231445,
"learning_rate": 4.7105966162065897e-05,
"loss": 4.7582,
"step": 1150
},
{
"epoch": 1.4833759590792839,
"grad_norm": 667.7198486328125,
"learning_rate": 4.7061442564559214e-05,
"loss": 5.2354,
"step": 1160
},
{
"epoch": 1.49616368286445,
"grad_norm": 36.09316635131836,
"learning_rate": 4.7016918967052545e-05,
"loss": 4.4023,
"step": 1170
},
{
"epoch": 1.5089514066496164,
"grad_norm": 24.274316787719727,
"learning_rate": 4.697239536954586e-05,
"loss": 4.3065,
"step": 1180
},
{
"epoch": 1.5217391304347827,
"grad_norm": 76.46308898925781,
"learning_rate": 4.692787177203919e-05,
"loss": 3.7555,
"step": 1190
},
{
"epoch": 1.5345268542199488,
"grad_norm": 49.68375015258789,
"learning_rate": 4.6883348174532504e-05,
"loss": 4.5423,
"step": 1200
},
{
"epoch": 1.547314578005115,
"grad_norm": 41.64923858642578,
"learning_rate": 4.683882457702582e-05,
"loss": 4.1702,
"step": 1210
},
{
"epoch": 1.5601023017902813,
"grad_norm": 296.3228759765625,
"learning_rate": 4.6794300979519146e-05,
"loss": 3.4573,
"step": 1220
},
{
"epoch": 1.5728900255754477,
"grad_norm": 338.4577331542969,
"learning_rate": 4.6749777382012464e-05,
"loss": 3.5353,
"step": 1230
},
{
"epoch": 1.5856777493606138,
"grad_norm": 28.94730567932129,
"learning_rate": 4.6705253784505795e-05,
"loss": 3.1779,
"step": 1240
},
{
"epoch": 1.59846547314578,
"grad_norm": 132.22483825683594,
"learning_rate": 4.666073018699911e-05,
"loss": 2.745,
"step": 1250
},
{
"epoch": 1.6112531969309463,
"grad_norm": 63.76390838623047,
"learning_rate": 4.6616206589492436e-05,
"loss": 3.2324,
"step": 1260
},
{
"epoch": 1.6240409207161126,
"grad_norm": 24.974475860595703,
"learning_rate": 4.6571682991985754e-05,
"loss": 2.8275,
"step": 1270
},
{
"epoch": 1.6368286445012787,
"grad_norm": 42.1992301940918,
"learning_rate": 4.652715939447908e-05,
"loss": 2.9965,
"step": 1280
},
{
"epoch": 1.6496163682864449,
"grad_norm": 55.832916259765625,
"learning_rate": 4.6482635796972396e-05,
"loss": 2.7633,
"step": 1290
},
{
"epoch": 1.6624040920716112,
"grad_norm": 21.454418182373047,
"learning_rate": 4.643811219946571e-05,
"loss": 3.3381,
"step": 1300
},
{
"epoch": 1.6751918158567776,
"grad_norm": 186.13711547851562,
"learning_rate": 4.6393588601959044e-05,
"loss": 3.3175,
"step": 1310
},
{
"epoch": 1.6879795396419437,
"grad_norm": 43.50181579589844,
"learning_rate": 4.634906500445236e-05,
"loss": 2.947,
"step": 1320
},
{
"epoch": 1.7007672634271098,
"grad_norm": 280.3487854003906,
"learning_rate": 4.6304541406945686e-05,
"loss": 2.417,
"step": 1330
},
{
"epoch": 1.7135549872122762,
"grad_norm": 13.333784103393555,
"learning_rate": 4.6260017809439003e-05,
"loss": 2.0575,
"step": 1340
},
{
"epoch": 1.7263427109974425,
"grad_norm": 20.713420867919922,
"learning_rate": 4.621549421193233e-05,
"loss": 2.3133,
"step": 1350
},
{
"epoch": 1.7391304347826086,
"grad_norm": 200.07363891601562,
"learning_rate": 4.6170970614425645e-05,
"loss": 2.5783,
"step": 1360
},
{
"epoch": 1.7519181585677748,
"grad_norm": 27.835031509399414,
"learning_rate": 4.612644701691897e-05,
"loss": 2.5367,
"step": 1370
},
{
"epoch": 1.7647058823529411,
"grad_norm": 25.559825897216797,
"learning_rate": 4.6081923419412294e-05,
"loss": 1.879,
"step": 1380
},
{
"epoch": 1.7774936061381075,
"grad_norm": 133.1987762451172,
"learning_rate": 4.603739982190561e-05,
"loss": 1.8325,
"step": 1390
},
{
"epoch": 1.7902813299232738,
"grad_norm": 25.337984085083008,
"learning_rate": 4.5992876224398935e-05,
"loss": 1.7651,
"step": 1400
},
{
"epoch": 1.80306905370844,
"grad_norm": 23.645824432373047,
"learning_rate": 4.594835262689225e-05,
"loss": 2.1557,
"step": 1410
},
{
"epoch": 1.815856777493606,
"grad_norm": 6.187915325164795,
"learning_rate": 4.590382902938558e-05,
"loss": 1.8122,
"step": 1420
},
{
"epoch": 1.8286445012787724,
"grad_norm": 32.88364791870117,
"learning_rate": 4.5859305431878895e-05,
"loss": 2.1533,
"step": 1430
},
{
"epoch": 1.8414322250639388,
"grad_norm": 12.379995346069336,
"learning_rate": 4.581478183437222e-05,
"loss": 2.1713,
"step": 1440
},
{
"epoch": 1.854219948849105,
"grad_norm": 13.056475639343262,
"learning_rate": 4.577025823686554e-05,
"loss": 1.5537,
"step": 1450
},
{
"epoch": 1.867007672634271,
"grad_norm": 25.982215881347656,
"learning_rate": 4.572573463935886e-05,
"loss": 1.92,
"step": 1460
},
{
"epoch": 1.8797953964194374,
"grad_norm": 17.912216186523438,
"learning_rate": 4.5681211041852185e-05,
"loss": 1.4743,
"step": 1470
},
{
"epoch": 1.8925831202046037,
"grad_norm": 141.81936645507812,
"learning_rate": 4.56366874443455e-05,
"loss": 1.7076,
"step": 1480
},
{
"epoch": 1.9053708439897699,
"grad_norm": 56.60566711425781,
"learning_rate": 4.559216384683883e-05,
"loss": 1.9273,
"step": 1490
},
{
"epoch": 1.918158567774936,
"grad_norm": 21.652421951293945,
"learning_rate": 4.5547640249332144e-05,
"loss": 1.4124,
"step": 1500
},
{
"epoch": 1.9309462915601023,
"grad_norm": 8.854305267333984,
"learning_rate": 4.550311665182547e-05,
"loss": 1.4905,
"step": 1510
},
{
"epoch": 1.9437340153452687,
"grad_norm": 4.608613014221191,
"learning_rate": 4.545859305431879e-05,
"loss": 2.2376,
"step": 1520
},
{
"epoch": 1.9565217391304348,
"grad_norm": 16.537403106689453,
"learning_rate": 4.541406945681212e-05,
"loss": 1.0597,
"step": 1530
},
{
"epoch": 1.969309462915601,
"grad_norm": 6.274345874786377,
"learning_rate": 4.5369545859305434e-05,
"loss": 1.8588,
"step": 1540
},
{
"epoch": 1.9820971867007673,
"grad_norm": 38.86388397216797,
"learning_rate": 4.532502226179875e-05,
"loss": 1.4454,
"step": 1550
},
{
"epoch": 1.9948849104859336,
"grad_norm": 56.21317672729492,
"learning_rate": 4.5280498664292076e-05,
"loss": 1.9605,
"step": 1560
},
{
"epoch": 2.0,
"eval_loss": 0.8149464726448059,
"eval_runtime": 0.972,
"eval_samples_per_second": 100.82,
"eval_steps_per_second": 13.374,
"step": 1564
},
{
"epoch": 2.0076726342710995,
"grad_norm": 5.580387592315674,
"learning_rate": 4.5235975066785394e-05,
"loss": 2.1328,
"step": 1570
},
{
"epoch": 2.020460358056266,
"grad_norm": 7.484273433685303,
"learning_rate": 4.519145146927872e-05,
"loss": 1.5055,
"step": 1580
},
{
"epoch": 2.0332480818414322,
"grad_norm": 39.16282653808594,
"learning_rate": 4.514692787177204e-05,
"loss": 1.8466,
"step": 1590
},
{
"epoch": 2.0460358056265986,
"grad_norm": 45.915321350097656,
"learning_rate": 4.5102404274265367e-05,
"loss": 2.0529,
"step": 1600
},
{
"epoch": 2.0588235294117645,
"grad_norm": 12.101923942565918,
"learning_rate": 4.5057880676758684e-05,
"loss": 1.6502,
"step": 1610
},
{
"epoch": 2.071611253196931,
"grad_norm": 10.51504898071289,
"learning_rate": 4.501335707925201e-05,
"loss": 1.5942,
"step": 1620
},
{
"epoch": 2.084398976982097,
"grad_norm": 2.8996851444244385,
"learning_rate": 4.4968833481745326e-05,
"loss": 1.6974,
"step": 1630
},
{
"epoch": 2.0971867007672635,
"grad_norm": 10.019527435302734,
"learning_rate": 4.492430988423865e-05,
"loss": 1.6031,
"step": 1640
},
{
"epoch": 2.10997442455243,
"grad_norm": 18.028640747070312,
"learning_rate": 4.487978628673197e-05,
"loss": 2.0213,
"step": 1650
},
{
"epoch": 2.122762148337596,
"grad_norm": 19.63820457458496,
"learning_rate": 4.483526268922529e-05,
"loss": 1.9554,
"step": 1660
},
{
"epoch": 2.135549872122762,
"grad_norm": 97.62370300292969,
"learning_rate": 4.4790739091718616e-05,
"loss": 1.0171,
"step": 1670
},
{
"epoch": 2.1483375959079285,
"grad_norm": 26.990764617919922,
"learning_rate": 4.4746215494211934e-05,
"loss": 1.763,
"step": 1680
},
{
"epoch": 2.1611253196930944,
"grad_norm": 8.061385154724121,
"learning_rate": 4.470169189670526e-05,
"loss": 1.4569,
"step": 1690
},
{
"epoch": 2.1739130434782608,
"grad_norm": 8.058719635009766,
"learning_rate": 4.4657168299198575e-05,
"loss": 1.6015,
"step": 1700
},
{
"epoch": 2.186700767263427,
"grad_norm": 16.712255477905273,
"learning_rate": 4.46126447016919e-05,
"loss": 1.2064,
"step": 1710
},
{
"epoch": 2.1994884910485935,
"grad_norm": 9.311502456665039,
"learning_rate": 4.456812110418522e-05,
"loss": 2.1118,
"step": 1720
},
{
"epoch": 2.21227621483376,
"grad_norm": 8.408000946044922,
"learning_rate": 4.452359750667854e-05,
"loss": 1.6535,
"step": 1730
},
{
"epoch": 2.2250639386189257,
"grad_norm": 18.537572860717773,
"learning_rate": 4.4479073909171866e-05,
"loss": 1.2816,
"step": 1740
},
{
"epoch": 2.237851662404092,
"grad_norm": 17.14375114440918,
"learning_rate": 4.443455031166518e-05,
"loss": 0.7858,
"step": 1750
},
{
"epoch": 2.2506393861892584,
"grad_norm": 6.235766410827637,
"learning_rate": 4.439002671415851e-05,
"loss": 1.2958,
"step": 1760
},
{
"epoch": 2.2634271099744243,
"grad_norm": 16.74968719482422,
"learning_rate": 4.4345503116651825e-05,
"loss": 0.9718,
"step": 1770
},
{
"epoch": 2.2762148337595907,
"grad_norm": 17.026458740234375,
"learning_rate": 4.430097951914515e-05,
"loss": 2.1236,
"step": 1780
},
{
"epoch": 2.289002557544757,
"grad_norm": 4.0173516273498535,
"learning_rate": 4.4256455921638467e-05,
"loss": 1.5366,
"step": 1790
},
{
"epoch": 2.3017902813299234,
"grad_norm": 10.223082542419434,
"learning_rate": 4.421193232413179e-05,
"loss": 0.8786,
"step": 1800
},
{
"epoch": 2.3145780051150897,
"grad_norm": 2.731196880340576,
"learning_rate": 4.4167408726625115e-05,
"loss": 2.3721,
"step": 1810
},
{
"epoch": 2.3273657289002556,
"grad_norm": 10.35998821258545,
"learning_rate": 4.412288512911843e-05,
"loss": 1.5431,
"step": 1820
},
{
"epoch": 2.340153452685422,
"grad_norm": 12.369463920593262,
"learning_rate": 4.407836153161176e-05,
"loss": 1.5767,
"step": 1830
},
{
"epoch": 2.3529411764705883,
"grad_norm": 4.78343391418457,
"learning_rate": 4.4033837934105074e-05,
"loss": 1.7491,
"step": 1840
},
{
"epoch": 2.3657289002557547,
"grad_norm": 6.811643123626709,
"learning_rate": 4.39893143365984e-05,
"loss": 0.9933,
"step": 1850
},
{
"epoch": 2.3785166240409206,
"grad_norm": 16.536911010742188,
"learning_rate": 4.3944790739091716e-05,
"loss": 1.2678,
"step": 1860
},
{
"epoch": 2.391304347826087,
"grad_norm": 7.726188659667969,
"learning_rate": 4.390026714158505e-05,
"loss": 1.6206,
"step": 1870
},
{
"epoch": 2.4040920716112533,
"grad_norm": 22.88146209716797,
"learning_rate": 4.3855743544078365e-05,
"loss": 1.3968,
"step": 1880
},
{
"epoch": 2.4168797953964196,
"grad_norm": 4.256078243255615,
"learning_rate": 4.381121994657169e-05,
"loss": 1.2646,
"step": 1890
},
{
"epoch": 2.4296675191815855,
"grad_norm": 39.807491302490234,
"learning_rate": 4.3766696349065006e-05,
"loss": 1.8832,
"step": 1900
},
{
"epoch": 2.442455242966752,
"grad_norm": 4.80452299118042,
"learning_rate": 4.3722172751558324e-05,
"loss": 1.6629,
"step": 1910
},
{
"epoch": 2.455242966751918,
"grad_norm": 13.139750480651855,
"learning_rate": 4.367764915405165e-05,
"loss": 1.125,
"step": 1920
},
{
"epoch": 2.4680306905370846,
"grad_norm": 14.896947860717773,
"learning_rate": 4.363312555654497e-05,
"loss": 1.202,
"step": 1930
},
{
"epoch": 2.4808184143222505,
"grad_norm": 6.931974411010742,
"learning_rate": 4.35886019590383e-05,
"loss": 1.3819,
"step": 1940
},
{
"epoch": 2.493606138107417,
"grad_norm": 17.391084671020508,
"learning_rate": 4.3544078361531614e-05,
"loss": 1.5623,
"step": 1950
},
{
"epoch": 2.506393861892583,
"grad_norm": 7.759119033813477,
"learning_rate": 4.349955476402494e-05,
"loss": 0.8815,
"step": 1960
},
{
"epoch": 2.5191815856777495,
"grad_norm": 2.0047388076782227,
"learning_rate": 4.3455031166518256e-05,
"loss": 1.0101,
"step": 1970
},
{
"epoch": 2.531969309462916,
"grad_norm": 8.238865852355957,
"learning_rate": 4.341050756901158e-05,
"loss": 1.0075,
"step": 1980
},
{
"epoch": 2.544757033248082,
"grad_norm": 7.979090213775635,
"learning_rate": 4.33659839715049e-05,
"loss": 0.8694,
"step": 1990
},
{
"epoch": 2.557544757033248,
"grad_norm": 3.4456515312194824,
"learning_rate": 4.332146037399822e-05,
"loss": 0.7287,
"step": 2000
},
{
"epoch": 2.5703324808184145,
"grad_norm": 8.840888977050781,
"learning_rate": 4.3276936776491546e-05,
"loss": 1.2007,
"step": 2010
},
{
"epoch": 2.5831202046035804,
"grad_norm": 17.507421493530273,
"learning_rate": 4.3232413178984864e-05,
"loss": 0.9524,
"step": 2020
},
{
"epoch": 2.5959079283887467,
"grad_norm": 4.879913806915283,
"learning_rate": 4.318788958147819e-05,
"loss": 0.9294,
"step": 2030
},
{
"epoch": 2.608695652173913,
"grad_norm": 15.94214153289795,
"learning_rate": 4.3143365983971505e-05,
"loss": 1.5526,
"step": 2040
},
{
"epoch": 2.6214833759590794,
"grad_norm": 10.489286422729492,
"learning_rate": 4.309884238646483e-05,
"loss": 0.6439,
"step": 2050
},
{
"epoch": 2.634271099744246,
"grad_norm": 15.586228370666504,
"learning_rate": 4.305431878895815e-05,
"loss": 1.0779,
"step": 2060
},
{
"epoch": 2.6470588235294117,
"grad_norm": 2.084414005279541,
"learning_rate": 4.300979519145147e-05,
"loss": 1.169,
"step": 2070
},
{
"epoch": 2.659846547314578,
"grad_norm": 15.87989330291748,
"learning_rate": 4.2965271593944796e-05,
"loss": 1.0841,
"step": 2080
},
{
"epoch": 2.6726342710997444,
"grad_norm": 9.806713104248047,
"learning_rate": 4.292074799643811e-05,
"loss": 1.2079,
"step": 2090
},
{
"epoch": 2.6854219948849103,
"grad_norm": 22.467456817626953,
"learning_rate": 4.287622439893144e-05,
"loss": 1.3484,
"step": 2100
},
{
"epoch": 2.6982097186700766,
"grad_norm": 10.053953170776367,
"learning_rate": 4.2831700801424755e-05,
"loss": 0.7977,
"step": 2110
},
{
"epoch": 2.710997442455243,
"grad_norm": 23.656936645507812,
"learning_rate": 4.278717720391808e-05,
"loss": 1.2581,
"step": 2120
},
{
"epoch": 2.7237851662404093,
"grad_norm": 10.244109153747559,
"learning_rate": 4.27426536064114e-05,
"loss": 1.218,
"step": 2130
},
{
"epoch": 2.7365728900255757,
"grad_norm": 6.62468957901001,
"learning_rate": 4.269813000890472e-05,
"loss": 0.7606,
"step": 2140
},
{
"epoch": 2.7493606138107416,
"grad_norm": 2.3720364570617676,
"learning_rate": 4.2653606411398045e-05,
"loss": 0.582,
"step": 2150
},
{
"epoch": 2.762148337595908,
"grad_norm": 18.96427345275879,
"learning_rate": 4.260908281389136e-05,
"loss": 1.1205,
"step": 2160
},
{
"epoch": 2.7749360613810743,
"grad_norm": 5.456460475921631,
"learning_rate": 4.256455921638469e-05,
"loss": 1.0613,
"step": 2170
},
{
"epoch": 2.78772378516624,
"grad_norm": 6.44590950012207,
"learning_rate": 4.2520035618878004e-05,
"loss": 1.4278,
"step": 2180
},
{
"epoch": 2.8005115089514065,
"grad_norm": 13.97479248046875,
"learning_rate": 4.247551202137133e-05,
"loss": 1.2871,
"step": 2190
},
{
"epoch": 2.813299232736573,
"grad_norm": 7.399073123931885,
"learning_rate": 4.2430988423864646e-05,
"loss": 0.867,
"step": 2200
},
{
"epoch": 2.8260869565217392,
"grad_norm": 3.542203187942505,
"learning_rate": 4.238646482635798e-05,
"loss": 0.7119,
"step": 2210
},
{
"epoch": 2.8388746803069056,
"grad_norm": 1.8208072185516357,
"learning_rate": 4.2341941228851295e-05,
"loss": 1.0378,
"step": 2220
},
{
"epoch": 2.8516624040920715,
"grad_norm": 11.78288745880127,
"learning_rate": 4.229741763134462e-05,
"loss": 1.0857,
"step": 2230
},
{
"epoch": 2.864450127877238,
"grad_norm": 2.7360832691192627,
"learning_rate": 4.2252894033837936e-05,
"loss": 0.7267,
"step": 2240
},
{
"epoch": 2.877237851662404,
"grad_norm": 17.203723907470703,
"learning_rate": 4.2208370436331254e-05,
"loss": 1.222,
"step": 2250
},
{
"epoch": 2.89002557544757,
"grad_norm": 11.256026268005371,
"learning_rate": 4.216384683882458e-05,
"loss": 1.175,
"step": 2260
},
{
"epoch": 2.9028132992327365,
"grad_norm": 8.434975624084473,
"learning_rate": 4.2119323241317896e-05,
"loss": 0.8133,
"step": 2270
},
{
"epoch": 2.915601023017903,
"grad_norm": 8.774354934692383,
"learning_rate": 4.207479964381123e-05,
"loss": 0.9086,
"step": 2280
},
{
"epoch": 2.928388746803069,
"grad_norm": 6.846632480621338,
"learning_rate": 4.2030276046304544e-05,
"loss": 1.4552,
"step": 2290
},
{
"epoch": 2.9411764705882355,
"grad_norm": 11.470861434936523,
"learning_rate": 4.198575244879787e-05,
"loss": 1.0146,
"step": 2300
},
{
"epoch": 2.9539641943734014,
"grad_norm": 14.26189136505127,
"learning_rate": 4.1941228851291186e-05,
"loss": 0.9995,
"step": 2310
},
{
"epoch": 2.9667519181585678,
"grad_norm": 6.346394062042236,
"learning_rate": 4.189670525378451e-05,
"loss": 1.0182,
"step": 2320
},
{
"epoch": 2.979539641943734,
"grad_norm": 3.880919933319092,
"learning_rate": 4.185218165627783e-05,
"loss": 0.4778,
"step": 2330
},
{
"epoch": 2.9923273657289,
"grad_norm": 19.28879737854004,
"learning_rate": 4.180765805877115e-05,
"loss": 1.0621,
"step": 2340
},
{
"epoch": 3.0,
"eval_loss": 0.46207094192504883,
"eval_runtime": 0.9749,
"eval_samples_per_second": 100.52,
"eval_steps_per_second": 13.334,
"step": 2346
},
{
"epoch": 3.0051150895140664,
"grad_norm": 12.311324119567871,
"learning_rate": 4.1763134461264476e-05,
"loss": 1.1394,
"step": 2350
},
{
"epoch": 3.0179028132992327,
"grad_norm": 6.373651027679443,
"learning_rate": 4.1718610863757794e-05,
"loss": 1.3079,
"step": 2360
},
{
"epoch": 3.030690537084399,
"grad_norm": 20.532047271728516,
"learning_rate": 4.167408726625112e-05,
"loss": 1.5355,
"step": 2370
},
{
"epoch": 3.0434782608695654,
"grad_norm": 21.725034713745117,
"learning_rate": 4.1629563668744435e-05,
"loss": 0.7584,
"step": 2380
},
{
"epoch": 3.0562659846547313,
"grad_norm": 0.31800711154937744,
"learning_rate": 4.158504007123776e-05,
"loss": 0.7678,
"step": 2390
},
{
"epoch": 3.0690537084398977,
"grad_norm": 7.7792744636535645,
"learning_rate": 4.154051647373108e-05,
"loss": 0.876,
"step": 2400
},
{
"epoch": 3.081841432225064,
"grad_norm": 0.5140416026115417,
"learning_rate": 4.14959928762244e-05,
"loss": 1.2692,
"step": 2410
},
{
"epoch": 3.0946291560102304,
"grad_norm": 2.118436813354492,
"learning_rate": 4.1451469278717726e-05,
"loss": 0.463,
"step": 2420
},
{
"epoch": 3.1074168797953963,
"grad_norm": 10.993577003479004,
"learning_rate": 4.140694568121104e-05,
"loss": 0.5364,
"step": 2430
},
{
"epoch": 3.1202046035805626,
"grad_norm": 3.5870437622070312,
"learning_rate": 4.136242208370437e-05,
"loss": 1.6276,
"step": 2440
},
{
"epoch": 3.132992327365729,
"grad_norm": 83.34903717041016,
"learning_rate": 4.1317898486197685e-05,
"loss": 0.8779,
"step": 2450
},
{
"epoch": 3.1457800511508953,
"grad_norm": 7.2245941162109375,
"learning_rate": 4.127337488869101e-05,
"loss": 1.0543,
"step": 2460
},
{
"epoch": 3.1585677749360612,
"grad_norm": 2.145303249359131,
"learning_rate": 4.122885129118433e-05,
"loss": 1.2492,
"step": 2470
},
{
"epoch": 3.1713554987212276,
"grad_norm": 10.122068405151367,
"learning_rate": 4.118432769367765e-05,
"loss": 1.0897,
"step": 2480
},
{
"epoch": 3.184143222506394,
"grad_norm": 10.550411224365234,
"learning_rate": 4.1139804096170975e-05,
"loss": 0.8078,
"step": 2490
},
{
"epoch": 3.1969309462915603,
"grad_norm": 1.527519941329956,
"learning_rate": 4.109528049866429e-05,
"loss": 0.4696,
"step": 2500
},
{
"epoch": 3.209718670076726,
"grad_norm": 11.756329536437988,
"learning_rate": 4.105075690115762e-05,
"loss": 0.7907,
"step": 2510
},
{
"epoch": 3.2225063938618925,
"grad_norm": 14.163141250610352,
"learning_rate": 4.1006233303650935e-05,
"loss": 0.6561,
"step": 2520
},
{
"epoch": 3.235294117647059,
"grad_norm": 1.5355645418167114,
"learning_rate": 4.096170970614426e-05,
"loss": 0.3889,
"step": 2530
},
{
"epoch": 3.2480818414322252,
"grad_norm": 2.9927313327789307,
"learning_rate": 4.0917186108637576e-05,
"loss": 0.9113,
"step": 2540
},
{
"epoch": 3.260869565217391,
"grad_norm": 3.701603889465332,
"learning_rate": 4.08726625111309e-05,
"loss": 0.7163,
"step": 2550
},
{
"epoch": 3.2736572890025575,
"grad_norm": 2.665637731552124,
"learning_rate": 4.0828138913624225e-05,
"loss": 0.7217,
"step": 2560
},
{
"epoch": 3.286445012787724,
"grad_norm": 4.589602947235107,
"learning_rate": 4.078361531611755e-05,
"loss": 1.2966,
"step": 2570
},
{
"epoch": 3.29923273657289,
"grad_norm": 129.0029754638672,
"learning_rate": 4.0739091718610867e-05,
"loss": 0.6644,
"step": 2580
},
{
"epoch": 3.312020460358056,
"grad_norm": 5.486814498901367,
"learning_rate": 4.0694568121104184e-05,
"loss": 0.4016,
"step": 2590
},
{
"epoch": 3.3248081841432224,
"grad_norm": 0.4740515351295471,
"learning_rate": 4.065004452359751e-05,
"loss": 0.9797,
"step": 2600
},
{
"epoch": 3.337595907928389,
"grad_norm": 11.610332489013672,
"learning_rate": 4.0605520926090826e-05,
"loss": 1.2847,
"step": 2610
},
{
"epoch": 3.350383631713555,
"grad_norm": 99.53129577636719,
"learning_rate": 4.056099732858415e-05,
"loss": 1.3681,
"step": 2620
},
{
"epoch": 3.363171355498721,
"grad_norm": 15.345809936523438,
"learning_rate": 4.0516473731077474e-05,
"loss": 1.4627,
"step": 2630
},
{
"epoch": 3.3759590792838874,
"grad_norm": 3.9375596046447754,
"learning_rate": 4.04719501335708e-05,
"loss": 0.8383,
"step": 2640
},
{
"epoch": 3.3887468030690537,
"grad_norm": 14.221712112426758,
"learning_rate": 4.0427426536064116e-05,
"loss": 0.841,
"step": 2650
},
{
"epoch": 3.40153452685422,
"grad_norm": 9.657092094421387,
"learning_rate": 4.038290293855744e-05,
"loss": 0.7076,
"step": 2660
},
{
"epoch": 3.414322250639386,
"grad_norm": 15.229437828063965,
"learning_rate": 4.033837934105076e-05,
"loss": 0.7533,
"step": 2670
},
{
"epoch": 3.4271099744245523,
"grad_norm": 3.810375928878784,
"learning_rate": 4.029385574354408e-05,
"loss": 0.9037,
"step": 2680
},
{
"epoch": 3.4398976982097187,
"grad_norm": 3.3026020526885986,
"learning_rate": 4.02493321460374e-05,
"loss": 0.2881,
"step": 2690
},
{
"epoch": 3.452685421994885,
"grad_norm": 4.372268199920654,
"learning_rate": 4.0204808548530724e-05,
"loss": 0.7749,
"step": 2700
},
{
"epoch": 3.4654731457800514,
"grad_norm": 11.79289722442627,
"learning_rate": 4.016028495102405e-05,
"loss": 1.1843,
"step": 2710
},
{
"epoch": 3.4782608695652173,
"grad_norm": 6.681139945983887,
"learning_rate": 4.0115761353517366e-05,
"loss": 1.1241,
"step": 2720
},
{
"epoch": 3.4910485933503836,
"grad_norm": 13.276257514953613,
"learning_rate": 4.007123775601069e-05,
"loss": 1.1697,
"step": 2730
},
{
"epoch": 3.50383631713555,
"grad_norm": 4.372821807861328,
"learning_rate": 4.002671415850401e-05,
"loss": 0.7218,
"step": 2740
},
{
"epoch": 3.516624040920716,
"grad_norm": 2.8409574031829834,
"learning_rate": 3.998219056099733e-05,
"loss": 0.813,
"step": 2750
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.350940465927124,
"learning_rate": 3.993766696349065e-05,
"loss": 1.0097,
"step": 2760
},
{
"epoch": 3.5421994884910486,
"grad_norm": 12.746123313903809,
"learning_rate": 3.989314336598397e-05,
"loss": 0.8977,
"step": 2770
},
{
"epoch": 3.554987212276215,
"grad_norm": 5.898983001708984,
"learning_rate": 3.98486197684773e-05,
"loss": 0.8431,
"step": 2780
},
{
"epoch": 3.5677749360613813,
"grad_norm": 11.514519691467285,
"learning_rate": 3.9804096170970615e-05,
"loss": 1.2064,
"step": 2790
},
{
"epoch": 3.580562659846547,
"grad_norm": 2.989314556121826,
"learning_rate": 3.975957257346394e-05,
"loss": 0.8261,
"step": 2800
},
{
"epoch": 3.5933503836317136,
"grad_norm": 16.665599822998047,
"learning_rate": 3.971504897595726e-05,
"loss": 0.832,
"step": 2810
},
{
"epoch": 3.60613810741688,
"grad_norm": 0.406387597322464,
"learning_rate": 3.967052537845058e-05,
"loss": 0.3909,
"step": 2820
},
{
"epoch": 3.618925831202046,
"grad_norm": 2.7753970623016357,
"learning_rate": 3.96260017809439e-05,
"loss": 0.5041,
"step": 2830
},
{
"epoch": 3.631713554987212,
"grad_norm": 1.3813972473144531,
"learning_rate": 3.958147818343722e-05,
"loss": 0.7192,
"step": 2840
},
{
"epoch": 3.6445012787723785,
"grad_norm": 2.6115665435791016,
"learning_rate": 3.953695458593055e-05,
"loss": 1.1741,
"step": 2850
},
{
"epoch": 3.657289002557545,
"grad_norm": 9.115361213684082,
"learning_rate": 3.9492430988423865e-05,
"loss": 0.6617,
"step": 2860
},
{
"epoch": 3.670076726342711,
"grad_norm": 6.27280330657959,
"learning_rate": 3.944790739091719e-05,
"loss": 1.1405,
"step": 2870
},
{
"epoch": 3.682864450127877,
"grad_norm": 0.9005927443504333,
"learning_rate": 3.9403383793410506e-05,
"loss": 0.8749,
"step": 2880
},
{
"epoch": 3.6956521739130435,
"grad_norm": 1.6038532257080078,
"learning_rate": 3.935886019590383e-05,
"loss": 1.1794,
"step": 2890
},
{
"epoch": 3.70843989769821,
"grad_norm": 7.583934307098389,
"learning_rate": 3.931433659839715e-05,
"loss": 0.6017,
"step": 2900
},
{
"epoch": 3.7212276214833757,
"grad_norm": 0.9503026008605957,
"learning_rate": 3.926981300089048e-05,
"loss": 0.5053,
"step": 2910
},
{
"epoch": 3.734015345268542,
"grad_norm": 7.907811164855957,
"learning_rate": 3.92252894033838e-05,
"loss": 1.3511,
"step": 2920
},
{
"epoch": 3.7468030690537084,
"grad_norm": 10.489314079284668,
"learning_rate": 3.918076580587712e-05,
"loss": 0.508,
"step": 2930
},
{
"epoch": 3.7595907928388748,
"grad_norm": 7.821562767028809,
"learning_rate": 3.913624220837044e-05,
"loss": 1.0142,
"step": 2940
},
{
"epoch": 3.772378516624041,
"grad_norm": 12.84817123413086,
"learning_rate": 3.9091718610863756e-05,
"loss": 0.9375,
"step": 2950
},
{
"epoch": 3.785166240409207,
"grad_norm": 2.9846205711364746,
"learning_rate": 3.904719501335708e-05,
"loss": 0.8996,
"step": 2960
},
{
"epoch": 3.7979539641943734,
"grad_norm": 9.234708786010742,
"learning_rate": 3.90026714158504e-05,
"loss": 1.1906,
"step": 2970
},
{
"epoch": 3.8107416879795397,
"grad_norm": 1.4158365726470947,
"learning_rate": 3.895814781834373e-05,
"loss": 1.1976,
"step": 2980
},
{
"epoch": 3.8235294117647056,
"grad_norm": 8.958301544189453,
"learning_rate": 3.8913624220837046e-05,
"loss": 0.4074,
"step": 2990
},
{
"epoch": 3.836317135549872,
"grad_norm": 8.71174144744873,
"learning_rate": 3.886910062333037e-05,
"loss": 0.5862,
"step": 3000
},
{
"epoch": 3.8491048593350383,
"grad_norm": 0.366887629032135,
"learning_rate": 3.882457702582369e-05,
"loss": 0.3793,
"step": 3010
},
{
"epoch": 3.8618925831202047,
"grad_norm": 6.718595504760742,
"learning_rate": 3.878005342831701e-05,
"loss": 0.5965,
"step": 3020
},
{
"epoch": 3.874680306905371,
"grad_norm": 1.7599328756332397,
"learning_rate": 3.873552983081033e-05,
"loss": 0.7439,
"step": 3030
},
{
"epoch": 3.887468030690537,
"grad_norm": 4.962011337280273,
"learning_rate": 3.869100623330365e-05,
"loss": 1.2969,
"step": 3040
},
{
"epoch": 3.9002557544757033,
"grad_norm": 6.866610527038574,
"learning_rate": 3.864648263579698e-05,
"loss": 0.7168,
"step": 3050
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.8201662302017212,
"learning_rate": 3.8601959038290296e-05,
"loss": 0.2705,
"step": 3060
},
{
"epoch": 3.9258312020460355,
"grad_norm": 0.8354922533035278,
"learning_rate": 3.855743544078362e-05,
"loss": 0.7996,
"step": 3070
},
{
"epoch": 3.938618925831202,
"grad_norm": 2.216919422149658,
"learning_rate": 3.851291184327694e-05,
"loss": 0.6187,
"step": 3080
},
{
"epoch": 3.9514066496163682,
"grad_norm": 9.284915924072266,
"learning_rate": 3.846838824577026e-05,
"loss": 0.7844,
"step": 3090
},
{
"epoch": 3.9641943734015346,
"grad_norm": 3.3350045680999756,
"learning_rate": 3.842386464826358e-05,
"loss": 0.9315,
"step": 3100
},
{
"epoch": 3.976982097186701,
"grad_norm": 12.905816078186035,
"learning_rate": 3.8379341050756903e-05,
"loss": 0.8972,
"step": 3110
},
{
"epoch": 3.9897698209718673,
"grad_norm": 2.008113384246826,
"learning_rate": 3.833481745325023e-05,
"loss": 0.6918,
"step": 3120
},
{
"epoch": 4.0,
"eval_loss": 0.356393963098526,
"eval_runtime": 0.9935,
"eval_samples_per_second": 98.639,
"eval_steps_per_second": 13.085,
"step": 3128
},
{
"epoch": 4.002557544757034,
"grad_norm": 5.132030487060547,
"learning_rate": 3.8290293855743545e-05,
"loss": 0.8732,
"step": 3130
},
{
"epoch": 4.015345268542199,
"grad_norm": 7.4329400062561035,
"learning_rate": 3.824577025823687e-05,
"loss": 0.7742,
"step": 3140
},
{
"epoch": 4.028132992327365,
"grad_norm": 7.301215648651123,
"learning_rate": 3.820124666073019e-05,
"loss": 1.0238,
"step": 3150
},
{
"epoch": 4.040920716112532,
"grad_norm": 2.81072735786438,
"learning_rate": 3.815672306322351e-05,
"loss": 1.1583,
"step": 3160
},
{
"epoch": 4.053708439897698,
"grad_norm": 11.35189437866211,
"learning_rate": 3.811219946571683e-05,
"loss": 0.6364,
"step": 3170
},
{
"epoch": 4.0664961636828645,
"grad_norm": 1.3151694536209106,
"learning_rate": 3.806767586821015e-05,
"loss": 0.4789,
"step": 3180
},
{
"epoch": 4.079283887468031,
"grad_norm": 7.979901313781738,
"learning_rate": 3.802315227070348e-05,
"loss": 0.7974,
"step": 3190
},
{
"epoch": 4.092071611253197,
"grad_norm": 1.1267353296279907,
"learning_rate": 3.7978628673196795e-05,
"loss": 0.883,
"step": 3200
},
{
"epoch": 4.1048593350383635,
"grad_norm": 1.3425774574279785,
"learning_rate": 3.793410507569012e-05,
"loss": 0.8678,
"step": 3210
},
{
"epoch": 4.117647058823529,
"grad_norm": 13.647799491882324,
"learning_rate": 3.7889581478183437e-05,
"loss": 0.6365,
"step": 3220
},
{
"epoch": 4.130434782608695,
"grad_norm": 8.646533012390137,
"learning_rate": 3.784505788067676e-05,
"loss": 0.742,
"step": 3230
},
{
"epoch": 4.143222506393862,
"grad_norm": 14.215703964233398,
"learning_rate": 3.780053428317008e-05,
"loss": 1.4076,
"step": 3240
},
{
"epoch": 4.156010230179028,
"grad_norm": 5.101785659790039,
"learning_rate": 3.77560106856634e-05,
"loss": 0.5596,
"step": 3250
},
{
"epoch": 4.168797953964194,
"grad_norm": 3.0319337844848633,
"learning_rate": 3.771148708815673e-05,
"loss": 0.9117,
"step": 3260
},
{
"epoch": 4.181585677749361,
"grad_norm": 19.691225051879883,
"learning_rate": 3.766696349065005e-05,
"loss": 0.6898,
"step": 3270
},
{
"epoch": 4.194373401534527,
"grad_norm": 0.6862250566482544,
"learning_rate": 3.762243989314337e-05,
"loss": 0.755,
"step": 3280
},
{
"epoch": 4.207161125319693,
"grad_norm": 4.289923191070557,
"learning_rate": 3.7577916295636686e-05,
"loss": 0.6671,
"step": 3290
},
{
"epoch": 4.21994884910486,
"grad_norm": 1.4601982831954956,
"learning_rate": 3.753339269813001e-05,
"loss": 0.3689,
"step": 3300
},
{
"epoch": 4.232736572890025,
"grad_norm": 9.796490669250488,
"learning_rate": 3.748886910062333e-05,
"loss": 0.5699,
"step": 3310
},
{
"epoch": 4.245524296675192,
"grad_norm": 11.114338874816895,
"learning_rate": 3.744434550311665e-05,
"loss": 0.8346,
"step": 3320
},
{
"epoch": 4.258312020460358,
"grad_norm": 3.114365339279175,
"learning_rate": 3.7399821905609976e-05,
"loss": 0.6354,
"step": 3330
},
{
"epoch": 4.271099744245524,
"grad_norm": 2.011147975921631,
"learning_rate": 3.73552983081033e-05,
"loss": 0.6387,
"step": 3340
},
{
"epoch": 4.283887468030691,
"grad_norm": 0.5666208267211914,
"learning_rate": 3.731077471059662e-05,
"loss": 0.6872,
"step": 3350
},
{
"epoch": 4.296675191815857,
"grad_norm": 2.4012691974639893,
"learning_rate": 3.726625111308994e-05,
"loss": 0.7966,
"step": 3360
},
{
"epoch": 4.309462915601023,
"grad_norm": 2.9042274951934814,
"learning_rate": 3.722172751558326e-05,
"loss": 0.4021,
"step": 3370
},
{
"epoch": 4.322250639386189,
"grad_norm": 15.42292308807373,
"learning_rate": 3.7177203918076584e-05,
"loss": 0.5537,
"step": 3380
},
{
"epoch": 4.335038363171355,
"grad_norm": 5.009647369384766,
"learning_rate": 3.71326803205699e-05,
"loss": 0.6362,
"step": 3390
},
{
"epoch": 4.3478260869565215,
"grad_norm": 1.7608520984649658,
"learning_rate": 3.7088156723063226e-05,
"loss": 1.0878,
"step": 3400
},
{
"epoch": 4.360613810741688,
"grad_norm": 0.519935667514801,
"learning_rate": 3.704363312555655e-05,
"loss": 0.6975,
"step": 3410
},
{
"epoch": 4.373401534526854,
"grad_norm": 4.793030738830566,
"learning_rate": 3.699910952804987e-05,
"loss": 0.7543,
"step": 3420
},
{
"epoch": 4.3861892583120206,
"grad_norm": 7.277567386627197,
"learning_rate": 3.695458593054319e-05,
"loss": 0.7289,
"step": 3430
},
{
"epoch": 4.398976982097187,
"grad_norm": 2.9830596446990967,
"learning_rate": 3.691006233303651e-05,
"loss": 0.7286,
"step": 3440
},
{
"epoch": 4.411764705882353,
"grad_norm": 3.031907320022583,
"learning_rate": 3.6865538735529834e-05,
"loss": 0.5704,
"step": 3450
},
{
"epoch": 4.42455242966752,
"grad_norm": 11.57882308959961,
"learning_rate": 3.682101513802315e-05,
"loss": 0.4617,
"step": 3460
},
{
"epoch": 4.437340153452685,
"grad_norm": 10.076362609863281,
"learning_rate": 3.6776491540516475e-05,
"loss": 0.5074,
"step": 3470
},
{
"epoch": 4.450127877237851,
"grad_norm": 3.9779062271118164,
"learning_rate": 3.67319679430098e-05,
"loss": 1.1924,
"step": 3480
},
{
"epoch": 4.462915601023018,
"grad_norm": 2.4633734226226807,
"learning_rate": 3.668744434550312e-05,
"loss": 0.8225,
"step": 3490
},
{
"epoch": 4.475703324808184,
"grad_norm": 4.295853137969971,
"learning_rate": 3.664292074799644e-05,
"loss": 0.2683,
"step": 3500
},
{
"epoch": 4.4884910485933505,
"grad_norm": 11.388009071350098,
"learning_rate": 3.659839715048976e-05,
"loss": 1.0016,
"step": 3510
},
{
"epoch": 4.501278772378517,
"grad_norm": 1.5823692083358765,
"learning_rate": 3.655387355298308e-05,
"loss": 0.5582,
"step": 3520
},
{
"epoch": 4.514066496163683,
"grad_norm": 14.604119300842285,
"learning_rate": 3.65093499554764e-05,
"loss": 0.8775,
"step": 3530
},
{
"epoch": 4.526854219948849,
"grad_norm": 2.4483489990234375,
"learning_rate": 3.6464826357969725e-05,
"loss": 0.7407,
"step": 3540
},
{
"epoch": 4.539641943734015,
"grad_norm": 12.382391929626465,
"learning_rate": 3.642030276046305e-05,
"loss": 0.3615,
"step": 3550
},
{
"epoch": 4.552429667519181,
"grad_norm": 11.958683013916016,
"learning_rate": 3.637577916295637e-05,
"loss": 0.9151,
"step": 3560
},
{
"epoch": 4.565217391304348,
"grad_norm": 4.6142897605896,
"learning_rate": 3.633125556544969e-05,
"loss": 0.6341,
"step": 3570
},
{
"epoch": 4.578005115089514,
"grad_norm": 10.04975700378418,
"learning_rate": 3.628673196794301e-05,
"loss": 0.5591,
"step": 3580
},
{
"epoch": 4.59079283887468,
"grad_norm": 12.153048515319824,
"learning_rate": 3.624220837043633e-05,
"loss": 1.2612,
"step": 3590
},
{
"epoch": 4.603580562659847,
"grad_norm": 8.2946138381958,
"learning_rate": 3.619768477292965e-05,
"loss": 0.667,
"step": 3600
},
{
"epoch": 4.616368286445013,
"grad_norm": 8.6572265625,
"learning_rate": 3.615316117542298e-05,
"loss": 0.3353,
"step": 3610
},
{
"epoch": 4.629156010230179,
"grad_norm": 18.10332679748535,
"learning_rate": 3.61086375779163e-05,
"loss": 0.6227,
"step": 3620
},
{
"epoch": 4.641943734015345,
"grad_norm": 10.11817455291748,
"learning_rate": 3.6064113980409616e-05,
"loss": 0.9023,
"step": 3630
},
{
"epoch": 4.654731457800511,
"grad_norm": 9.292647361755371,
"learning_rate": 3.601959038290294e-05,
"loss": 0.5075,
"step": 3640
},
{
"epoch": 4.667519181585678,
"grad_norm": 14.415812492370605,
"learning_rate": 3.597506678539626e-05,
"loss": 0.8366,
"step": 3650
},
{
"epoch": 4.680306905370844,
"grad_norm": 10.451414108276367,
"learning_rate": 3.593054318788958e-05,
"loss": 0.599,
"step": 3660
},
{
"epoch": 4.69309462915601,
"grad_norm": 3.4356963634490967,
"learning_rate": 3.58860195903829e-05,
"loss": 0.4089,
"step": 3670
},
{
"epoch": 4.705882352941177,
"grad_norm": 13.768664360046387,
"learning_rate": 3.584149599287623e-05,
"loss": 1.0491,
"step": 3680
},
{
"epoch": 4.718670076726343,
"grad_norm": 1.4625777006149292,
"learning_rate": 3.579697239536955e-05,
"loss": 0.6929,
"step": 3690
},
{
"epoch": 4.731457800511509,
"grad_norm": 3.810811758041382,
"learning_rate": 3.575244879786287e-05,
"loss": 0.7565,
"step": 3700
},
{
"epoch": 4.744245524296675,
"grad_norm": 10.068679809570312,
"learning_rate": 3.570792520035619e-05,
"loss": 0.6768,
"step": 3710
},
{
"epoch": 4.757033248081841,
"grad_norm": 7.575318813323975,
"learning_rate": 3.5663401602849514e-05,
"loss": 0.8091,
"step": 3720
},
{
"epoch": 4.7698209718670075,
"grad_norm": 0.5645831823348999,
"learning_rate": 3.561887800534283e-05,
"loss": 0.6504,
"step": 3730
},
{
"epoch": 4.782608695652174,
"grad_norm": 11.733582496643066,
"learning_rate": 3.557435440783615e-05,
"loss": 0.76,
"step": 3740
},
{
"epoch": 4.79539641943734,
"grad_norm": 11.055776596069336,
"learning_rate": 3.552983081032948e-05,
"loss": 0.6466,
"step": 3750
},
{
"epoch": 4.8081841432225065,
"grad_norm": 2.825686454772949,
"learning_rate": 3.54853072128228e-05,
"loss": 0.4032,
"step": 3760
},
{
"epoch": 4.820971867007673,
"grad_norm": 1.2281103134155273,
"learning_rate": 3.544078361531612e-05,
"loss": 0.3983,
"step": 3770
},
{
"epoch": 4.833759590792839,
"grad_norm": 0.7035624384880066,
"learning_rate": 3.539626001780944e-05,
"loss": 0.4677,
"step": 3780
},
{
"epoch": 4.846547314578006,
"grad_norm": 12.086211204528809,
"learning_rate": 3.5351736420302764e-05,
"loss": 0.8847,
"step": 3790
},
{
"epoch": 4.859335038363171,
"grad_norm": 1.0651588439941406,
"learning_rate": 3.530721282279608e-05,
"loss": 0.6754,
"step": 3800
},
{
"epoch": 4.872122762148337,
"grad_norm": 1.574463129043579,
"learning_rate": 3.5262689225289405e-05,
"loss": 0.6701,
"step": 3810
},
{
"epoch": 4.884910485933504,
"grad_norm": 12.231953620910645,
"learning_rate": 3.521816562778273e-05,
"loss": 0.8218,
"step": 3820
},
{
"epoch": 4.89769820971867,
"grad_norm": 4.2090535163879395,
"learning_rate": 3.517364203027605e-05,
"loss": 0.6251,
"step": 3830
},
{
"epoch": 4.910485933503836,
"grad_norm": 4.578638076782227,
"learning_rate": 3.512911843276937e-05,
"loss": 0.7583,
"step": 3840
},
{
"epoch": 4.923273657289003,
"grad_norm": 3.189852237701416,
"learning_rate": 3.508459483526269e-05,
"loss": 0.2964,
"step": 3850
},
{
"epoch": 4.936061381074169,
"grad_norm": 2.299405097961426,
"learning_rate": 3.504007123775601e-05,
"loss": 0.4669,
"step": 3860
},
{
"epoch": 4.948849104859335,
"grad_norm": 2.1393966674804688,
"learning_rate": 3.499554764024933e-05,
"loss": 0.5147,
"step": 3870
},
{
"epoch": 4.961636828644501,
"grad_norm": 10.183451652526855,
"learning_rate": 3.4951024042742655e-05,
"loss": 0.6936,
"step": 3880
},
{
"epoch": 4.974424552429667,
"grad_norm": 0.556858479976654,
"learning_rate": 3.490650044523598e-05,
"loss": 0.7699,
"step": 3890
},
{
"epoch": 4.987212276214834,
"grad_norm": 7.906371116638184,
"learning_rate": 3.48619768477293e-05,
"loss": 0.5674,
"step": 3900
},
{
"epoch": 5.0,
"grad_norm": 9.858052253723145,
"learning_rate": 3.481745325022262e-05,
"loss": 0.5803,
"step": 3910
},
{
"epoch": 5.0,
"eval_loss": 0.308910071849823,
"eval_runtime": 0.8202,
"eval_samples_per_second": 119.488,
"eval_steps_per_second": 15.85,
"step": 3910
},
{
"epoch": 5.012787723785166,
"grad_norm": 4.255712985992432,
"learning_rate": 3.477292965271594e-05,
"loss": 0.7186,
"step": 3920
},
{
"epoch": 5.025575447570333,
"grad_norm": 0.03182278946042061,
"learning_rate": 3.472840605520926e-05,
"loss": 0.764,
"step": 3930
},
{
"epoch": 5.038363171355499,
"grad_norm": 1.7552779912948608,
"learning_rate": 3.468388245770258e-05,
"loss": 0.5904,
"step": 3940
},
{
"epoch": 5.051150895140665,
"grad_norm": 0.7406250238418579,
"learning_rate": 3.4639358860195904e-05,
"loss": 0.8398,
"step": 3950
},
{
"epoch": 5.063938618925831,
"grad_norm": 1.4716426134109497,
"learning_rate": 3.459483526268923e-05,
"loss": 0.5454,
"step": 3960
},
{
"epoch": 5.076726342710997,
"grad_norm": 2.5447280406951904,
"learning_rate": 3.455031166518255e-05,
"loss": 0.4558,
"step": 3970
},
{
"epoch": 5.089514066496164,
"grad_norm": 4.706602096557617,
"learning_rate": 3.450578806767587e-05,
"loss": 0.3279,
"step": 3980
},
{
"epoch": 5.10230179028133,
"grad_norm": 3.110121250152588,
"learning_rate": 3.446126447016919e-05,
"loss": 0.3657,
"step": 3990
},
{
"epoch": 5.115089514066496,
"grad_norm": 0.3601504862308502,
"learning_rate": 3.441674087266251e-05,
"loss": 0.4001,
"step": 4000
},
{
"epoch": 5.127877237851663,
"grad_norm": 1.9790247678756714,
"learning_rate": 3.437221727515583e-05,
"loss": 0.1499,
"step": 4010
},
{
"epoch": 5.140664961636829,
"grad_norm": 5.311811447143555,
"learning_rate": 3.4327693677649154e-05,
"loss": 0.5795,
"step": 4020
},
{
"epoch": 5.153452685421995,
"grad_norm": 6.410403728485107,
"learning_rate": 3.428317008014248e-05,
"loss": 0.4079,
"step": 4030
},
{
"epoch": 5.166240409207161,
"grad_norm": 3.0055534839630127,
"learning_rate": 3.42386464826358e-05,
"loss": 0.4303,
"step": 4040
},
{
"epoch": 5.179028132992327,
"grad_norm": 9.70368766784668,
"learning_rate": 3.419412288512912e-05,
"loss": 0.7073,
"step": 4050
},
{
"epoch": 5.1918158567774935,
"grad_norm": 14.46500301361084,
"learning_rate": 3.4149599287622444e-05,
"loss": 0.9534,
"step": 4060
},
{
"epoch": 5.20460358056266,
"grad_norm": 1.8181160688400269,
"learning_rate": 3.410507569011576e-05,
"loss": 0.5392,
"step": 4070
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.3840659558773041,
"learning_rate": 3.406055209260908e-05,
"loss": 0.3438,
"step": 4080
},
{
"epoch": 5.2301790281329925,
"grad_norm": 4.869695663452148,
"learning_rate": 3.4016028495102404e-05,
"loss": 0.9978,
"step": 4090
},
{
"epoch": 5.242966751918159,
"grad_norm": 7.061380386352539,
"learning_rate": 3.397150489759573e-05,
"loss": 0.4002,
"step": 4100
},
{
"epoch": 5.255754475703325,
"grad_norm": 5.828289031982422,
"learning_rate": 3.392698130008905e-05,
"loss": 0.7073,
"step": 4110
},
{
"epoch": 5.268542199488491,
"grad_norm": 10.163734436035156,
"learning_rate": 3.388245770258237e-05,
"loss": 0.9711,
"step": 4120
},
{
"epoch": 5.281329923273657,
"grad_norm": 9.201620101928711,
"learning_rate": 3.3837934105075694e-05,
"loss": 0.5375,
"step": 4130
},
{
"epoch": 5.294117647058823,
"grad_norm": 1.4918196201324463,
"learning_rate": 3.379341050756901e-05,
"loss": 0.5364,
"step": 4140
},
{
"epoch": 5.30690537084399,
"grad_norm": 12.544402122497559,
"learning_rate": 3.3748886910062336e-05,
"loss": 0.7528,
"step": 4150
},
{
"epoch": 5.319693094629156,
"grad_norm": 3.5984480381011963,
"learning_rate": 3.370436331255565e-05,
"loss": 0.591,
"step": 4160
},
{
"epoch": 5.332480818414322,
"grad_norm": 3.8022067546844482,
"learning_rate": 3.365983971504898e-05,
"loss": 0.9976,
"step": 4170
},
{
"epoch": 5.345268542199489,
"grad_norm": 8.02660846710205,
"learning_rate": 3.36153161175423e-05,
"loss": 0.4874,
"step": 4180
},
{
"epoch": 5.358056265984655,
"grad_norm": 4.031845569610596,
"learning_rate": 3.357079252003562e-05,
"loss": 0.4577,
"step": 4190
},
{
"epoch": 5.370843989769821,
"grad_norm": 9.381792068481445,
"learning_rate": 3.352626892252894e-05,
"loss": 0.5708,
"step": 4200
},
{
"epoch": 5.383631713554987,
"grad_norm": 10.603078842163086,
"learning_rate": 3.348174532502226e-05,
"loss": 1.0511,
"step": 4210
},
{
"epoch": 5.396419437340153,
"grad_norm": 3.2949740886688232,
"learning_rate": 3.3437221727515585e-05,
"loss": 0.5616,
"step": 4220
},
{
"epoch": 5.40920716112532,
"grad_norm": 12.314652442932129,
"learning_rate": 3.33926981300089e-05,
"loss": 0.6216,
"step": 4230
},
{
"epoch": 5.421994884910486,
"grad_norm": 15.642436981201172,
"learning_rate": 3.334817453250223e-05,
"loss": 0.478,
"step": 4240
},
{
"epoch": 5.434782608695652,
"grad_norm": 11.80217456817627,
"learning_rate": 3.330365093499555e-05,
"loss": 0.7626,
"step": 4250
},
{
"epoch": 5.447570332480819,
"grad_norm": 32.96918487548828,
"learning_rate": 3.325912733748887e-05,
"loss": 1.0794,
"step": 4260
},
{
"epoch": 5.460358056265985,
"grad_norm": 12.918391227722168,
"learning_rate": 3.321460373998219e-05,
"loss": 0.5586,
"step": 4270
},
{
"epoch": 5.4731457800511505,
"grad_norm": 11.27306079864502,
"learning_rate": 3.317008014247551e-05,
"loss": 0.7194,
"step": 4280
},
{
"epoch": 5.485933503836317,
"grad_norm": 0.12751297652721405,
"learning_rate": 3.3125556544968835e-05,
"loss": 0.5606,
"step": 4290
},
{
"epoch": 5.498721227621483,
"grad_norm": 10.48075008392334,
"learning_rate": 3.308103294746215e-05,
"loss": 0.493,
"step": 4300
},
{
"epoch": 5.5115089514066495,
"grad_norm": 4.982821941375732,
"learning_rate": 3.303650934995548e-05,
"loss": 0.4245,
"step": 4310
},
{
"epoch": 5.524296675191816,
"grad_norm": 6.689045429229736,
"learning_rate": 3.29919857524488e-05,
"loss": 0.8354,
"step": 4320
},
{
"epoch": 5.537084398976982,
"grad_norm": 15.300660133361816,
"learning_rate": 3.294746215494212e-05,
"loss": 0.9657,
"step": 4330
},
{
"epoch": 5.549872122762149,
"grad_norm": 5.424358367919922,
"learning_rate": 3.290293855743544e-05,
"loss": 0.7693,
"step": 4340
},
{
"epoch": 5.562659846547315,
"grad_norm": 2.7610678672790527,
"learning_rate": 3.285841495992876e-05,
"loss": 0.9893,
"step": 4350
},
{
"epoch": 5.57544757033248,
"grad_norm": 4.664830684661865,
"learning_rate": 3.2813891362422084e-05,
"loss": 0.84,
"step": 4360
},
{
"epoch": 5.588235294117647,
"grad_norm": 2.3301279544830322,
"learning_rate": 3.27693677649154e-05,
"loss": 0.3313,
"step": 4370
},
{
"epoch": 5.601023017902813,
"grad_norm": 1.1856235265731812,
"learning_rate": 3.272484416740873e-05,
"loss": 0.4543,
"step": 4380
},
{
"epoch": 5.6138107416879794,
"grad_norm": 3.703605890274048,
"learning_rate": 3.268032056990205e-05,
"loss": 1.0115,
"step": 4390
},
{
"epoch": 5.626598465473146,
"grad_norm": 2.387458562850952,
"learning_rate": 3.2635796972395374e-05,
"loss": 0.4687,
"step": 4400
},
{
"epoch": 5.639386189258312,
"grad_norm": 7.303229808807373,
"learning_rate": 3.259127337488869e-05,
"loss": 0.5117,
"step": 4410
},
{
"epoch": 5.6521739130434785,
"grad_norm": 7.670161724090576,
"learning_rate": 3.2546749777382016e-05,
"loss": 0.3712,
"step": 4420
},
{
"epoch": 5.664961636828645,
"grad_norm": 6.873749256134033,
"learning_rate": 3.2502226179875334e-05,
"loss": 0.5735,
"step": 4430
},
{
"epoch": 5.677749360613811,
"grad_norm": 4.9012956619262695,
"learning_rate": 3.245770258236865e-05,
"loss": 0.7233,
"step": 4440
},
{
"epoch": 5.690537084398977,
"grad_norm": 3.3855135440826416,
"learning_rate": 3.241317898486198e-05,
"loss": 0.5361,
"step": 4450
},
{
"epoch": 5.703324808184143,
"grad_norm": 3.381133794784546,
"learning_rate": 3.23686553873553e-05,
"loss": 0.4158,
"step": 4460
},
{
"epoch": 5.716112531969309,
"grad_norm": 1.0660206079483032,
"learning_rate": 3.2324131789848624e-05,
"loss": 0.7874,
"step": 4470
},
{
"epoch": 5.728900255754476,
"grad_norm": 0.3694034516811371,
"learning_rate": 3.227960819234194e-05,
"loss": 0.4555,
"step": 4480
},
{
"epoch": 5.741687979539642,
"grad_norm": 5.135284900665283,
"learning_rate": 3.2235084594835266e-05,
"loss": 0.661,
"step": 4490
},
{
"epoch": 5.754475703324808,
"grad_norm": 8.120854377746582,
"learning_rate": 3.219056099732858e-05,
"loss": 0.7841,
"step": 4500
},
{
"epoch": 5.767263427109975,
"grad_norm": 4.406229019165039,
"learning_rate": 3.214603739982191e-05,
"loss": 0.7135,
"step": 4510
},
{
"epoch": 5.78005115089514,
"grad_norm": 16.42761993408203,
"learning_rate": 3.210151380231523e-05,
"loss": 0.5225,
"step": 4520
},
{
"epoch": 5.792838874680307,
"grad_norm": 4.247344970703125,
"learning_rate": 3.205699020480855e-05,
"loss": 0.509,
"step": 4530
},
{
"epoch": 5.805626598465473,
"grad_norm": 0.8684327006340027,
"learning_rate": 3.2012466607301873e-05,
"loss": 0.3948,
"step": 4540
},
{
"epoch": 5.818414322250639,
"grad_norm": 3.883173942565918,
"learning_rate": 3.196794300979519e-05,
"loss": 0.4688,
"step": 4550
},
{
"epoch": 5.831202046035806,
"grad_norm": 2.6903090476989746,
"learning_rate": 3.1923419412288515e-05,
"loss": 0.5118,
"step": 4560
},
{
"epoch": 5.843989769820972,
"grad_norm": 13.025918960571289,
"learning_rate": 3.187889581478183e-05,
"loss": 0.7126,
"step": 4570
},
{
"epoch": 5.856777493606138,
"grad_norm": 5.017139911651611,
"learning_rate": 3.183437221727516e-05,
"loss": 0.4241,
"step": 4580
},
{
"epoch": 5.869565217391305,
"grad_norm": 0.0795026496052742,
"learning_rate": 3.178984861976848e-05,
"loss": 0.367,
"step": 4590
},
{
"epoch": 5.882352941176471,
"grad_norm": 6.228437423706055,
"learning_rate": 3.17453250222618e-05,
"loss": 0.4223,
"step": 4600
},
{
"epoch": 5.8951406649616365,
"grad_norm": 13.155862808227539,
"learning_rate": 3.170080142475512e-05,
"loss": 0.8122,
"step": 4610
},
{
"epoch": 5.907928388746803,
"grad_norm": 1.7071353197097778,
"learning_rate": 3.165627782724844e-05,
"loss": 0.5693,
"step": 4620
},
{
"epoch": 5.920716112531969,
"grad_norm": 29.22410011291504,
"learning_rate": 3.1611754229741765e-05,
"loss": 0.5403,
"step": 4630
},
{
"epoch": 5.9335038363171355,
"grad_norm": 5.154636383056641,
"learning_rate": 3.156723063223508e-05,
"loss": 0.7832,
"step": 4640
},
{
"epoch": 5.946291560102302,
"grad_norm": 4.094592571258545,
"learning_rate": 3.1522707034728406e-05,
"loss": 0.2902,
"step": 4650
},
{
"epoch": 5.959079283887468,
"grad_norm": 8.081905364990234,
"learning_rate": 3.147818343722173e-05,
"loss": 0.6333,
"step": 4660
},
{
"epoch": 5.971867007672635,
"grad_norm": 0.0020114060025662184,
"learning_rate": 3.1433659839715055e-05,
"loss": 0.6819,
"step": 4670
},
{
"epoch": 5.9846547314578,
"grad_norm": 1.4741204977035522,
"learning_rate": 3.138913624220837e-05,
"loss": 0.6854,
"step": 4680
},
{
"epoch": 5.997442455242966,
"grad_norm": 2.4684624671936035,
"learning_rate": 3.134461264470169e-05,
"loss": 0.7138,
"step": 4690
},
{
"epoch": 6.0,
"eval_loss": 0.2926405668258667,
"eval_runtime": 0.9804,
"eval_samples_per_second": 99.964,
"eval_steps_per_second": 13.261,
"step": 4692
},
{
"epoch": 6.010230179028133,
"grad_norm": 5.0837931632995605,
"learning_rate": 3.1300089047195014e-05,
"loss": 0.4942,
"step": 4700
},
{
"epoch": 6.023017902813299,
"grad_norm": 8.149367332458496,
"learning_rate": 3.125556544968833e-05,
"loss": 0.4184,
"step": 4710
},
{
"epoch": 6.035805626598465,
"grad_norm": 0.08973731100559235,
"learning_rate": 3.121104185218166e-05,
"loss": 0.5757,
"step": 4720
},
{
"epoch": 6.048593350383632,
"grad_norm": 0.09891729801893234,
"learning_rate": 3.116651825467498e-05,
"loss": 0.5934,
"step": 4730
},
{
"epoch": 6.061381074168798,
"grad_norm": 0.7006903886795044,
"learning_rate": 3.1121994657168305e-05,
"loss": 0.3696,
"step": 4740
},
{
"epoch": 6.0741687979539645,
"grad_norm": 7.207807540893555,
"learning_rate": 3.107747105966162e-05,
"loss": 0.9691,
"step": 4750
},
{
"epoch": 6.086956521739131,
"grad_norm": 2.0564119815826416,
"learning_rate": 3.1032947462154946e-05,
"loss": 0.2924,
"step": 4760
},
{
"epoch": 6.099744245524296,
"grad_norm": 4.980827808380127,
"learning_rate": 3.0988423864648264e-05,
"loss": 0.6964,
"step": 4770
},
{
"epoch": 6.112531969309463,
"grad_norm": 0.09228426963090897,
"learning_rate": 3.094390026714158e-05,
"loss": 0.4555,
"step": 4780
},
{
"epoch": 6.125319693094629,
"grad_norm": 5.13065242767334,
"learning_rate": 3.089937666963491e-05,
"loss": 0.7797,
"step": 4790
},
{
"epoch": 6.138107416879795,
"grad_norm": 2.9278452396392822,
"learning_rate": 3.085485307212823e-05,
"loss": 0.1351,
"step": 4800
},
{
"epoch": 6.150895140664962,
"grad_norm": 1.4354749917984009,
"learning_rate": 3.0810329474621554e-05,
"loss": 0.2317,
"step": 4810
},
{
"epoch": 6.163682864450128,
"grad_norm": 3.3318662643432617,
"learning_rate": 3.076580587711487e-05,
"loss": 0.4199,
"step": 4820
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.6563153266906738,
"learning_rate": 3.0721282279608196e-05,
"loss": 0.9203,
"step": 4830
},
{
"epoch": 6.189258312020461,
"grad_norm": 8.87299633026123,
"learning_rate": 3.067675868210151e-05,
"loss": 0.5379,
"step": 4840
},
{
"epoch": 6.202046035805626,
"grad_norm": 7.299469470977783,
"learning_rate": 3.063223508459484e-05,
"loss": 0.2947,
"step": 4850
},
{
"epoch": 6.2148337595907925,
"grad_norm": 5.325803279876709,
"learning_rate": 3.058771148708816e-05,
"loss": 0.7735,
"step": 4860
},
{
"epoch": 6.227621483375959,
"grad_norm": 9.854231834411621,
"learning_rate": 3.054318788958148e-05,
"loss": 1.1524,
"step": 4870
},
{
"epoch": 6.240409207161125,
"grad_norm": 4.8076372146606445,
"learning_rate": 3.0498664292074804e-05,
"loss": 0.3987,
"step": 4880
},
{
"epoch": 6.253196930946292,
"grad_norm": 5.543219566345215,
"learning_rate": 3.0454140694568124e-05,
"loss": 0.9449,
"step": 4890
},
{
"epoch": 6.265984654731458,
"grad_norm": 4.439374923706055,
"learning_rate": 3.0409617097061442e-05,
"loss": 0.7293,
"step": 4900
},
{
"epoch": 6.278772378516624,
"grad_norm": 0.6048958897590637,
"learning_rate": 3.0365093499554763e-05,
"loss": 0.9003,
"step": 4910
},
{
"epoch": 6.291560102301791,
"grad_norm": 7.064619541168213,
"learning_rate": 3.0320569902048084e-05,
"loss": 0.4962,
"step": 4920
},
{
"epoch": 6.304347826086957,
"grad_norm": 0.19604283571243286,
"learning_rate": 3.027604630454141e-05,
"loss": 0.3643,
"step": 4930
},
{
"epoch": 6.3171355498721224,
"grad_norm": 0.40240225195884705,
"learning_rate": 3.0231522707034732e-05,
"loss": 0.5883,
"step": 4940
},
{
"epoch": 6.329923273657289,
"grad_norm": 7.794588565826416,
"learning_rate": 3.0186999109528053e-05,
"loss": 0.5402,
"step": 4950
},
{
"epoch": 6.342710997442455,
"grad_norm": 1.4699820280075073,
"learning_rate": 3.0142475512021374e-05,
"loss": 0.2695,
"step": 4960
},
{
"epoch": 6.3554987212276215,
"grad_norm": 2.457961082458496,
"learning_rate": 3.0097951914514695e-05,
"loss": 0.3105,
"step": 4970
},
{
"epoch": 6.368286445012788,
"grad_norm": 6.704987525939941,
"learning_rate": 3.0053428317008016e-05,
"loss": 0.3334,
"step": 4980
},
{
"epoch": 6.381074168797954,
"grad_norm": 3.568899154663086,
"learning_rate": 3.0008904719501337e-05,
"loss": 0.5752,
"step": 4990
},
{
"epoch": 6.3938618925831205,
"grad_norm": 6.0692853927612305,
"learning_rate": 2.996438112199466e-05,
"loss": 0.6972,
"step": 5000
},
{
"epoch": 6.406649616368286,
"grad_norm": 0.15085622668266296,
"learning_rate": 2.9919857524487982e-05,
"loss": 0.2433,
"step": 5010
},
{
"epoch": 6.419437340153452,
"grad_norm": 9.642597198486328,
"learning_rate": 2.9875333926981303e-05,
"loss": 0.6038,
"step": 5020
},
{
"epoch": 6.432225063938619,
"grad_norm": 0.5172861218452454,
"learning_rate": 2.9830810329474623e-05,
"loss": 0.7294,
"step": 5030
},
{
"epoch": 6.445012787723785,
"grad_norm": 6.880235195159912,
"learning_rate": 2.9786286731967944e-05,
"loss": 0.8727,
"step": 5040
},
{
"epoch": 6.457800511508951,
"grad_norm": 9.20561695098877,
"learning_rate": 2.9741763134461265e-05,
"loss": 0.4577,
"step": 5050
},
{
"epoch": 6.470588235294118,
"grad_norm": 8.940627098083496,
"learning_rate": 2.9697239536954586e-05,
"loss": 0.7648,
"step": 5060
},
{
"epoch": 6.483375959079284,
"grad_norm": 2.744727611541748,
"learning_rate": 2.965271593944791e-05,
"loss": 0.9997,
"step": 5070
},
{
"epoch": 6.4961636828644505,
"grad_norm": 10.972113609313965,
"learning_rate": 2.960819234194123e-05,
"loss": 0.5985,
"step": 5080
},
{
"epoch": 6.508951406649617,
"grad_norm": 7.300151348114014,
"learning_rate": 2.9563668744434552e-05,
"loss": 0.7946,
"step": 5090
},
{
"epoch": 6.521739130434782,
"grad_norm": 9.26161003112793,
"learning_rate": 2.9519145146927873e-05,
"loss": 0.9692,
"step": 5100
},
{
"epoch": 6.534526854219949,
"grad_norm": 0.12795042991638184,
"learning_rate": 2.9474621549421194e-05,
"loss": 0.472,
"step": 5110
},
{
"epoch": 6.547314578005115,
"grad_norm": 5.266800880432129,
"learning_rate": 2.9430097951914515e-05,
"loss": 0.5951,
"step": 5120
},
{
"epoch": 6.560102301790281,
"grad_norm": 5.73391056060791,
"learning_rate": 2.9385574354407836e-05,
"loss": 0.5553,
"step": 5130
},
{
"epoch": 6.572890025575448,
"grad_norm": 4.122284889221191,
"learning_rate": 2.9341050756901163e-05,
"loss": 0.6291,
"step": 5140
},
{
"epoch": 6.585677749360614,
"grad_norm": 1.5334197282791138,
"learning_rate": 2.929652715939448e-05,
"loss": 0.5834,
"step": 5150
},
{
"epoch": 6.59846547314578,
"grad_norm": 2.3212168216705322,
"learning_rate": 2.92520035618878e-05,
"loss": 0.471,
"step": 5160
},
{
"epoch": 6.611253196930946,
"grad_norm": 7.480304718017578,
"learning_rate": 2.9207479964381123e-05,
"loss": 0.6053,
"step": 5170
},
{
"epoch": 6.624040920716112,
"grad_norm": 0.005698173772543669,
"learning_rate": 2.9162956366874443e-05,
"loss": 0.2767,
"step": 5180
},
{
"epoch": 6.6368286445012785,
"grad_norm": 5.176445960998535,
"learning_rate": 2.9118432769367764e-05,
"loss": 0.4619,
"step": 5190
},
{
"epoch": 6.649616368286445,
"grad_norm": 8.181547164916992,
"learning_rate": 2.9073909171861085e-05,
"loss": 0.8741,
"step": 5200
},
{
"epoch": 6.662404092071611,
"grad_norm": 6.4060211181640625,
"learning_rate": 2.9029385574354413e-05,
"loss": 0.4839,
"step": 5210
},
{
"epoch": 6.675191815856778,
"grad_norm": 3.9297218322753906,
"learning_rate": 2.8984861976847734e-05,
"loss": 0.4571,
"step": 5220
},
{
"epoch": 6.687979539641944,
"grad_norm": 9.544256210327148,
"learning_rate": 2.8940338379341055e-05,
"loss": 0.3365,
"step": 5230
},
{
"epoch": 6.70076726342711,
"grad_norm": 3.5246851444244385,
"learning_rate": 2.8895814781834375e-05,
"loss": 0.4995,
"step": 5240
},
{
"epoch": 6.713554987212277,
"grad_norm": 4.475048542022705,
"learning_rate": 2.8851291184327693e-05,
"loss": 0.6951,
"step": 5250
},
{
"epoch": 6.726342710997442,
"grad_norm": 7.613154888153076,
"learning_rate": 2.8806767586821014e-05,
"loss": 0.1763,
"step": 5260
},
{
"epoch": 6.739130434782608,
"grad_norm": 5.281979084014893,
"learning_rate": 2.8762243989314335e-05,
"loss": 0.4828,
"step": 5270
},
{
"epoch": 6.751918158567775,
"grad_norm": 3.480308771133423,
"learning_rate": 2.8717720391807662e-05,
"loss": 0.3431,
"step": 5280
},
{
"epoch": 6.764705882352941,
"grad_norm": 1.8283220529556274,
"learning_rate": 2.8673196794300983e-05,
"loss": 0.3137,
"step": 5290
},
{
"epoch": 6.7774936061381075,
"grad_norm": 0.4503525197505951,
"learning_rate": 2.8628673196794304e-05,
"loss": 0.582,
"step": 5300
},
{
"epoch": 6.790281329923274,
"grad_norm": 8.414701461791992,
"learning_rate": 2.8584149599287625e-05,
"loss": 0.6779,
"step": 5310
},
{
"epoch": 6.80306905370844,
"grad_norm": 9.155427932739258,
"learning_rate": 2.8539626001780946e-05,
"loss": 0.2913,
"step": 5320
},
{
"epoch": 6.8158567774936065,
"grad_norm": 0.7775105237960815,
"learning_rate": 2.8495102404274267e-05,
"loss": 0.5001,
"step": 5330
},
{
"epoch": 6.828644501278772,
"grad_norm": 2.658113956451416,
"learning_rate": 2.8450578806767588e-05,
"loss": 0.6299,
"step": 5340
},
{
"epoch": 6.841432225063938,
"grad_norm": 0.30859649181365967,
"learning_rate": 2.8406055209260912e-05,
"loss": 0.314,
"step": 5350
},
{
"epoch": 6.854219948849105,
"grad_norm": 1.178870677947998,
"learning_rate": 2.8361531611754233e-05,
"loss": 0.5978,
"step": 5360
},
{
"epoch": 6.867007672634271,
"grad_norm": 0.010601550340652466,
"learning_rate": 2.8317008014247554e-05,
"loss": 0.5799,
"step": 5370
},
{
"epoch": 6.879795396419437,
"grad_norm": 0.43929383158683777,
"learning_rate": 2.8272484416740874e-05,
"loss": 0.3263,
"step": 5380
},
{
"epoch": 6.892583120204604,
"grad_norm": 0.5252712965011597,
"learning_rate": 2.8227960819234195e-05,
"loss": 0.6099,
"step": 5390
},
{
"epoch": 6.90537084398977,
"grad_norm": 5.2658233642578125,
"learning_rate": 2.8183437221727516e-05,
"loss": 0.8632,
"step": 5400
},
{
"epoch": 6.918158567774936,
"grad_norm": 1.1721999645233154,
"learning_rate": 2.8138913624220837e-05,
"loss": 0.3408,
"step": 5410
},
{
"epoch": 6.930946291560103,
"grad_norm": 1.8380037546157837,
"learning_rate": 2.809439002671416e-05,
"loss": 0.6238,
"step": 5420
},
{
"epoch": 6.943734015345268,
"grad_norm": 2.2290737628936768,
"learning_rate": 2.8049866429207482e-05,
"loss": 0.4239,
"step": 5430
},
{
"epoch": 6.956521739130435,
"grad_norm": 14.996088027954102,
"learning_rate": 2.8005342831700803e-05,
"loss": 0.8926,
"step": 5440
},
{
"epoch": 6.969309462915601,
"grad_norm": 18.02151107788086,
"learning_rate": 2.7960819234194124e-05,
"loss": 0.7365,
"step": 5450
},
{
"epoch": 6.982097186700767,
"grad_norm": 9.642180442810059,
"learning_rate": 2.7916295636687445e-05,
"loss": 0.4924,
"step": 5460
},
{
"epoch": 6.994884910485934,
"grad_norm": 1.6962766647338867,
"learning_rate": 2.7871772039180766e-05,
"loss": 0.3256,
"step": 5470
},
{
"epoch": 7.0,
"eval_loss": 0.2672092914581299,
"eval_runtime": 0.9877,
"eval_samples_per_second": 99.225,
"eval_steps_per_second": 13.162,
"step": 5474
},
{
"epoch": 7.0076726342711,
"grad_norm": 2.8432042598724365,
"learning_rate": 2.7827248441674087e-05,
"loss": 0.2708,
"step": 5480
},
{
"epoch": 7.020460358056266,
"grad_norm": 6.073406219482422,
"learning_rate": 2.7782724844167414e-05,
"loss": 0.5792,
"step": 5490
},
{
"epoch": 7.033248081841432,
"grad_norm": 4.646320819854736,
"learning_rate": 2.7738201246660732e-05,
"loss": 0.2623,
"step": 5500
},
{
"epoch": 7.046035805626598,
"grad_norm": 6.342520236968994,
"learning_rate": 2.7693677649154053e-05,
"loss": 0.4399,
"step": 5510
},
{
"epoch": 7.0588235294117645,
"grad_norm": 8.213132858276367,
"learning_rate": 2.7649154051647374e-05,
"loss": 0.5036,
"step": 5520
},
{
"epoch": 7.071611253196931,
"grad_norm": 4.454671382904053,
"learning_rate": 2.7604630454140694e-05,
"loss": 0.6741,
"step": 5530
},
{
"epoch": 7.084398976982097,
"grad_norm": 0.019239643588662148,
"learning_rate": 2.7560106856634015e-05,
"loss": 0.3737,
"step": 5540
},
{
"epoch": 7.0971867007672635,
"grad_norm": 5.077779293060303,
"learning_rate": 2.7515583259127336e-05,
"loss": 0.8875,
"step": 5550
},
{
"epoch": 7.10997442455243,
"grad_norm": 3.9947173595428467,
"learning_rate": 2.7471059661620664e-05,
"loss": 0.3211,
"step": 5560
},
{
"epoch": 7.122762148337596,
"grad_norm": 8.417941093444824,
"learning_rate": 2.7426536064113985e-05,
"loss": 0.5168,
"step": 5570
},
{
"epoch": 7.135549872122763,
"grad_norm": 6.5370774269104,
"learning_rate": 2.7382012466607306e-05,
"loss": 0.5258,
"step": 5580
},
{
"epoch": 7.148337595907928,
"grad_norm": 5.088172912597656,
"learning_rate": 2.7337488869100626e-05,
"loss": 0.2866,
"step": 5590
},
{
"epoch": 7.161125319693094,
"grad_norm": 1.4238532781600952,
"learning_rate": 2.7292965271593944e-05,
"loss": 0.6187,
"step": 5600
},
{
"epoch": 7.173913043478261,
"grad_norm": 0.9053813219070435,
"learning_rate": 2.7248441674087265e-05,
"loss": 0.4809,
"step": 5610
},
{
"epoch": 7.186700767263427,
"grad_norm": 0.27804991602897644,
"learning_rate": 2.7203918076580586e-05,
"loss": 0.5625,
"step": 5620
},
{
"epoch": 7.1994884910485935,
"grad_norm": 1.3559807538986206,
"learning_rate": 2.7159394479073913e-05,
"loss": 0.4713,
"step": 5630
},
{
"epoch": 7.21227621483376,
"grad_norm": 3.623849868774414,
"learning_rate": 2.7114870881567234e-05,
"loss": 0.3054,
"step": 5640
},
{
"epoch": 7.225063938618926,
"grad_norm": 0.9959923624992371,
"learning_rate": 2.7070347284060555e-05,
"loss": 0.4528,
"step": 5650
},
{
"epoch": 7.2378516624040925,
"grad_norm": 1.5169447660446167,
"learning_rate": 2.7025823686553876e-05,
"loss": 0.2952,
"step": 5660
},
{
"epoch": 7.250639386189258,
"grad_norm": 2.1620376110076904,
"learning_rate": 2.6981300089047197e-05,
"loss": 0.3941,
"step": 5670
},
{
"epoch": 7.263427109974424,
"grad_norm": 0.8747214674949646,
"learning_rate": 2.6936776491540518e-05,
"loss": 0.6892,
"step": 5680
},
{
"epoch": 7.276214833759591,
"grad_norm": 5.04610013961792,
"learning_rate": 2.689225289403384e-05,
"loss": 0.4836,
"step": 5690
},
{
"epoch": 7.289002557544757,
"grad_norm": 1.117069959640503,
"learning_rate": 2.6847729296527163e-05,
"loss": 0.309,
"step": 5700
},
{
"epoch": 7.301790281329923,
"grad_norm": 5.628134727478027,
"learning_rate": 2.6803205699020484e-05,
"loss": 0.5118,
"step": 5710
},
{
"epoch": 7.31457800511509,
"grad_norm": 0.10263155400753021,
"learning_rate": 2.6758682101513805e-05,
"loss": 0.6117,
"step": 5720
},
{
"epoch": 7.327365728900256,
"grad_norm": 4.31287145614624,
"learning_rate": 2.6714158504007125e-05,
"loss": 0.3162,
"step": 5730
},
{
"epoch": 7.340153452685422,
"grad_norm": 1.4107064008712769,
"learning_rate": 2.6669634906500446e-05,
"loss": 0.8187,
"step": 5740
},
{
"epoch": 7.352941176470588,
"grad_norm": 4.531869411468506,
"learning_rate": 2.6625111308993767e-05,
"loss": 0.7014,
"step": 5750
},
{
"epoch": 7.365728900255754,
"grad_norm": 3.742617130279541,
"learning_rate": 2.6580587711487088e-05,
"loss": 0.4181,
"step": 5760
},
{
"epoch": 7.378516624040921,
"grad_norm": 0.5917960405349731,
"learning_rate": 2.6536064113980412e-05,
"loss": 0.5423,
"step": 5770
},
{
"epoch": 7.391304347826087,
"grad_norm": 10.565349578857422,
"learning_rate": 2.6491540516473733e-05,
"loss": 0.767,
"step": 5780
},
{
"epoch": 7.404092071611253,
"grad_norm": 1.371715784072876,
"learning_rate": 2.6447016918967054e-05,
"loss": 0.7285,
"step": 5790
},
{
"epoch": 7.41687979539642,
"grad_norm": 0.1052427589893341,
"learning_rate": 2.6402493321460375e-05,
"loss": 0.6098,
"step": 5800
},
{
"epoch": 7.429667519181586,
"grad_norm": 2.0800373554229736,
"learning_rate": 2.6357969723953696e-05,
"loss": 0.5199,
"step": 5810
},
{
"epoch": 7.442455242966752,
"grad_norm": 0.35600629448890686,
"learning_rate": 2.6313446126447017e-05,
"loss": 0.4764,
"step": 5820
},
{
"epoch": 7.455242966751918,
"grad_norm": 0.7949437499046326,
"learning_rate": 2.6268922528940338e-05,
"loss": 0.5367,
"step": 5830
},
{
"epoch": 7.468030690537084,
"grad_norm": 10.288331031799316,
"learning_rate": 2.6224398931433662e-05,
"loss": 0.8891,
"step": 5840
},
{
"epoch": 7.4808184143222505,
"grad_norm": 6.744941711425781,
"learning_rate": 2.6179875333926983e-05,
"loss": 0.6405,
"step": 5850
},
{
"epoch": 7.493606138107417,
"grad_norm": 15.893798828125,
"learning_rate": 2.6135351736420304e-05,
"loss": 0.6464,
"step": 5860
},
{
"epoch": 7.506393861892583,
"grad_norm": 0.04780289903283119,
"learning_rate": 2.6090828138913624e-05,
"loss": 0.5096,
"step": 5870
},
{
"epoch": 7.5191815856777495,
"grad_norm": 7.183560848236084,
"learning_rate": 2.6046304541406945e-05,
"loss": 0.402,
"step": 5880
},
{
"epoch": 7.531969309462916,
"grad_norm": 0.7552804350852966,
"learning_rate": 2.6001780943900266e-05,
"loss": 0.4915,
"step": 5890
},
{
"epoch": 7.544757033248082,
"grad_norm": 4.60145902633667,
"learning_rate": 2.5957257346393587e-05,
"loss": 0.5564,
"step": 5900
},
{
"epoch": 7.557544757033249,
"grad_norm": 3.2626659870147705,
"learning_rate": 2.5912733748886915e-05,
"loss": 0.6087,
"step": 5910
},
{
"epoch": 7.570332480818414,
"grad_norm": 3.3970375061035156,
"learning_rate": 2.5868210151380236e-05,
"loss": 0.5361,
"step": 5920
},
{
"epoch": 7.58312020460358,
"grad_norm": 1.0720630884170532,
"learning_rate": 2.5823686553873557e-05,
"loss": 0.5002,
"step": 5930
},
{
"epoch": 7.595907928388747,
"grad_norm": 0.7739387154579163,
"learning_rate": 2.5779162956366877e-05,
"loss": 0.5824,
"step": 5940
},
{
"epoch": 7.608695652173913,
"grad_norm": 12.361614227294922,
"learning_rate": 2.5734639358860195e-05,
"loss": 0.6861,
"step": 5950
},
{
"epoch": 7.621483375959079,
"grad_norm": 5.803438186645508,
"learning_rate": 2.5690115761353516e-05,
"loss": 0.4415,
"step": 5960
},
{
"epoch": 7.634271099744246,
"grad_norm": 0.18312691152095795,
"learning_rate": 2.5645592163846837e-05,
"loss": 0.4464,
"step": 5970
},
{
"epoch": 7.647058823529412,
"grad_norm": 6.178823947906494,
"learning_rate": 2.5601068566340164e-05,
"loss": 0.6077,
"step": 5980
},
{
"epoch": 7.659846547314578,
"grad_norm": 1.4608851671218872,
"learning_rate": 2.5556544968833485e-05,
"loss": 0.4675,
"step": 5990
},
{
"epoch": 7.672634271099744,
"grad_norm": 0.0074475789442658424,
"learning_rate": 2.5512021371326806e-05,
"loss": 0.1926,
"step": 6000
},
{
"epoch": 7.68542199488491,
"grad_norm": 4.761837959289551,
"learning_rate": 2.5467497773820127e-05,
"loss": 0.7276,
"step": 6010
},
{
"epoch": 7.698209718670077,
"grad_norm": 1.3654205799102783,
"learning_rate": 2.5422974176313448e-05,
"loss": 0.4568,
"step": 6020
},
{
"epoch": 7.710997442455243,
"grad_norm": 0.9348598122596741,
"learning_rate": 2.537845057880677e-05,
"loss": 0.2002,
"step": 6030
},
{
"epoch": 7.723785166240409,
"grad_norm": 9.504705429077148,
"learning_rate": 2.533392698130009e-05,
"loss": 0.492,
"step": 6040
},
{
"epoch": 7.736572890025576,
"grad_norm": 2.3037102222442627,
"learning_rate": 2.5289403383793414e-05,
"loss": 0.4039,
"step": 6050
},
{
"epoch": 7.749360613810742,
"grad_norm": 0.9193987250328064,
"learning_rate": 2.5244879786286735e-05,
"loss": 0.3467,
"step": 6060
},
{
"epoch": 7.762148337595908,
"grad_norm": 7.834420680999756,
"learning_rate": 2.5200356188780056e-05,
"loss": 0.3421,
"step": 6070
},
{
"epoch": 7.774936061381074,
"grad_norm": 6.3842878341674805,
"learning_rate": 2.5155832591273376e-05,
"loss": 0.6775,
"step": 6080
},
{
"epoch": 7.78772378516624,
"grad_norm": 0.7432451844215393,
"learning_rate": 2.5111308993766697e-05,
"loss": 0.5759,
"step": 6090
},
{
"epoch": 7.8005115089514065,
"grad_norm": 9.213702201843262,
"learning_rate": 2.5066785396260018e-05,
"loss": 0.737,
"step": 6100
},
{
"epoch": 7.813299232736573,
"grad_norm": 4.210599422454834,
"learning_rate": 2.502226179875334e-05,
"loss": 0.472,
"step": 6110
},
{
"epoch": 7.826086956521739,
"grad_norm": 5.298123836517334,
"learning_rate": 2.497773820124666e-05,
"loss": 0.5179,
"step": 6120
},
{
"epoch": 7.838874680306906,
"grad_norm": 6.451060771942139,
"learning_rate": 2.493321460373998e-05,
"loss": 0.3755,
"step": 6130
},
{
"epoch": 7.851662404092072,
"grad_norm": 10.77486801147461,
"learning_rate": 2.4888691006233305e-05,
"loss": 0.2594,
"step": 6140
},
{
"epoch": 7.864450127877237,
"grad_norm": 1.9036270380020142,
"learning_rate": 2.4844167408726626e-05,
"loss": 0.1492,
"step": 6150
},
{
"epoch": 7.877237851662404,
"grad_norm": 0.526852011680603,
"learning_rate": 2.4799643811219947e-05,
"loss": 0.4159,
"step": 6160
},
{
"epoch": 7.89002557544757,
"grad_norm": 9.612473487854004,
"learning_rate": 2.475512021371327e-05,
"loss": 0.5882,
"step": 6170
},
{
"epoch": 7.9028132992327365,
"grad_norm": 2.0128557682037354,
"learning_rate": 2.4710596616206592e-05,
"loss": 0.3371,
"step": 6180
},
{
"epoch": 7.915601023017903,
"grad_norm": 12.005789756774902,
"learning_rate": 2.4666073018699913e-05,
"loss": 0.5436,
"step": 6190
},
{
"epoch": 7.928388746803069,
"grad_norm": 2.172546625137329,
"learning_rate": 2.4621549421193234e-05,
"loss": 0.7374,
"step": 6200
},
{
"epoch": 7.9411764705882355,
"grad_norm": 9.670553207397461,
"learning_rate": 2.4577025823686555e-05,
"loss": 0.8065,
"step": 6210
},
{
"epoch": 7.953964194373402,
"grad_norm": 5.221970081329346,
"learning_rate": 2.4532502226179875e-05,
"loss": 0.535,
"step": 6220
},
{
"epoch": 7.966751918158568,
"grad_norm": 7.669173240661621,
"learning_rate": 2.4487978628673196e-05,
"loss": 0.6655,
"step": 6230
},
{
"epoch": 7.979539641943734,
"grad_norm": 2.3195643424987793,
"learning_rate": 2.444345503116652e-05,
"loss": 0.8417,
"step": 6240
},
{
"epoch": 7.9923273657289,
"grad_norm": 3.1944046020507812,
"learning_rate": 2.439893143365984e-05,
"loss": 0.4028,
"step": 6250
},
{
"epoch": 8.0,
"eval_loss": 0.25757405161857605,
"eval_runtime": 0.9949,
"eval_samples_per_second": 98.507,
"eval_steps_per_second": 13.067,
"step": 6256
},
{
"epoch": 8.005115089514067,
"grad_norm": 13.27065658569336,
"learning_rate": 2.4354407836153162e-05,
"loss": 0.3224,
"step": 6260
},
{
"epoch": 8.017902813299234,
"grad_norm": 7.944550037384033,
"learning_rate": 2.4309884238646483e-05,
"loss": 0.7011,
"step": 6270
},
{
"epoch": 8.030690537084398,
"grad_norm": 6.031693458557129,
"learning_rate": 2.4265360641139808e-05,
"loss": 0.4088,
"step": 6280
},
{
"epoch": 8.043478260869565,
"grad_norm": 13.39229679107666,
"learning_rate": 2.4220837043633125e-05,
"loss": 0.5026,
"step": 6290
},
{
"epoch": 8.05626598465473,
"grad_norm": 11.215279579162598,
"learning_rate": 2.4176313446126446e-05,
"loss": 0.5442,
"step": 6300
},
{
"epoch": 8.069053708439897,
"grad_norm": 6.160999298095703,
"learning_rate": 2.413178984861977e-05,
"loss": 0.3378,
"step": 6310
},
{
"epoch": 8.081841432225064,
"grad_norm": 4.923915863037109,
"learning_rate": 2.408726625111309e-05,
"loss": 0.2429,
"step": 6320
},
{
"epoch": 8.09462915601023,
"grad_norm": 7.593574523925781,
"learning_rate": 2.4042742653606412e-05,
"loss": 0.6619,
"step": 6330
},
{
"epoch": 8.107416879795396,
"grad_norm": 16.820772171020508,
"learning_rate": 2.3998219056099733e-05,
"loss": 0.4277,
"step": 6340
},
{
"epoch": 8.120204603580563,
"grad_norm": 0.1135517805814743,
"learning_rate": 2.3953695458593057e-05,
"loss": 0.2982,
"step": 6350
},
{
"epoch": 8.132992327365729,
"grad_norm": 2.25067138671875,
"learning_rate": 2.3909171861086378e-05,
"loss": 0.3036,
"step": 6360
},
{
"epoch": 8.145780051150895,
"grad_norm": 1.080831527709961,
"learning_rate": 2.38646482635797e-05,
"loss": 0.2754,
"step": 6370
},
{
"epoch": 8.158567774936062,
"grad_norm": 2.591698169708252,
"learning_rate": 2.382012466607302e-05,
"loss": 0.4765,
"step": 6380
},
{
"epoch": 8.171355498721228,
"grad_norm": 2.097321033477783,
"learning_rate": 2.377560106856634e-05,
"loss": 0.167,
"step": 6390
},
{
"epoch": 8.184143222506394,
"grad_norm": 0.8653718829154968,
"learning_rate": 2.373107747105966e-05,
"loss": 0.5256,
"step": 6400
},
{
"epoch": 8.19693094629156,
"grad_norm": 1.544346809387207,
"learning_rate": 2.3686553873552982e-05,
"loss": 0.4664,
"step": 6410
},
{
"epoch": 8.209718670076727,
"grad_norm": 1.176542043685913,
"learning_rate": 2.3642030276046307e-05,
"loss": 0.8735,
"step": 6420
},
{
"epoch": 8.222506393861893,
"grad_norm": 0.06903265416622162,
"learning_rate": 2.3597506678539627e-05,
"loss": 0.4592,
"step": 6430
},
{
"epoch": 8.235294117647058,
"grad_norm": 7.001086711883545,
"learning_rate": 2.3552983081032948e-05,
"loss": 0.4594,
"step": 6440
},
{
"epoch": 8.248081841432224,
"grad_norm": 0.028170613572001457,
"learning_rate": 2.3508459483526273e-05,
"loss": 0.4996,
"step": 6450
},
{
"epoch": 8.26086956521739,
"grad_norm": 8.073884010314941,
"learning_rate": 2.3463935886019593e-05,
"loss": 0.5063,
"step": 6460
},
{
"epoch": 8.273657289002557,
"grad_norm": 34.870094299316406,
"learning_rate": 2.341941228851291e-05,
"loss": 0.5557,
"step": 6470
},
{
"epoch": 8.286445012787723,
"grad_norm": 4.562534809112549,
"learning_rate": 2.3374888691006232e-05,
"loss": 0.0968,
"step": 6480
},
{
"epoch": 8.29923273657289,
"grad_norm": 0.04304574057459831,
"learning_rate": 2.3330365093499556e-05,
"loss": 0.2486,
"step": 6490
},
{
"epoch": 8.312020460358056,
"grad_norm": 7.053394794464111,
"learning_rate": 2.3285841495992877e-05,
"loss": 0.505,
"step": 6500
},
{
"epoch": 8.324808184143222,
"grad_norm": 9.683124542236328,
"learning_rate": 2.3241317898486198e-05,
"loss": 0.6914,
"step": 6510
},
{
"epoch": 8.337595907928389,
"grad_norm": 6.789582252502441,
"learning_rate": 2.3196794300979522e-05,
"loss": 0.4666,
"step": 6520
},
{
"epoch": 8.350383631713555,
"grad_norm": 4.472419261932373,
"learning_rate": 2.3152270703472843e-05,
"loss": 0.381,
"step": 6530
},
{
"epoch": 8.363171355498721,
"grad_norm": 10.337482452392578,
"learning_rate": 2.3107747105966164e-05,
"loss": 0.5651,
"step": 6540
},
{
"epoch": 8.375959079283888,
"grad_norm": 1.6700971126556396,
"learning_rate": 2.3063223508459485e-05,
"loss": 0.4265,
"step": 6550
},
{
"epoch": 8.388746803069054,
"grad_norm": 1.6565567255020142,
"learning_rate": 2.3018699910952806e-05,
"loss": 0.3659,
"step": 6560
},
{
"epoch": 8.40153452685422,
"grad_norm": 8.731335639953613,
"learning_rate": 2.2974176313446126e-05,
"loss": 0.4689,
"step": 6570
},
{
"epoch": 8.414322250639387,
"grad_norm": 8.600381851196289,
"learning_rate": 2.2929652715939447e-05,
"loss": 0.8944,
"step": 6580
},
{
"epoch": 8.427109974424553,
"grad_norm": 10.421477317810059,
"learning_rate": 2.288512911843277e-05,
"loss": 0.4084,
"step": 6590
},
{
"epoch": 8.43989769820972,
"grad_norm": 0.0770091563463211,
"learning_rate": 2.2840605520926092e-05,
"loss": 0.4492,
"step": 6600
},
{
"epoch": 8.452685421994884,
"grad_norm": 0.04360765591263771,
"learning_rate": 2.2796081923419413e-05,
"loss": 0.6236,
"step": 6610
},
{
"epoch": 8.46547314578005,
"grad_norm": 5.0289626121521,
"learning_rate": 2.2751558325912734e-05,
"loss": 0.5115,
"step": 6620
},
{
"epoch": 8.478260869565217,
"grad_norm": 5.965487957000732,
"learning_rate": 2.270703472840606e-05,
"loss": 0.7749,
"step": 6630
},
{
"epoch": 8.491048593350383,
"grad_norm": 2.7336697578430176,
"learning_rate": 2.2662511130899376e-05,
"loss": 0.3559,
"step": 6640
},
{
"epoch": 8.50383631713555,
"grad_norm": 0.37265846133232117,
"learning_rate": 2.2617987533392697e-05,
"loss": 0.5246,
"step": 6650
},
{
"epoch": 8.516624040920716,
"grad_norm": 14.674330711364746,
"learning_rate": 2.257346393588602e-05,
"loss": 0.6532,
"step": 6660
},
{
"epoch": 8.529411764705882,
"grad_norm": 6.816307544708252,
"learning_rate": 2.2528940338379342e-05,
"loss": 0.2127,
"step": 6670
},
{
"epoch": 8.542199488491049,
"grad_norm": 9.356270790100098,
"learning_rate": 2.2484416740872663e-05,
"loss": 0.5478,
"step": 6680
},
{
"epoch": 8.554987212276215,
"grad_norm": 1.5472646951675415,
"learning_rate": 2.2439893143365984e-05,
"loss": 0.4139,
"step": 6690
},
{
"epoch": 8.567774936061381,
"grad_norm": 8.75796127319336,
"learning_rate": 2.2395369545859308e-05,
"loss": 0.5247,
"step": 6700
},
{
"epoch": 8.580562659846548,
"grad_norm": 2.2100446224212646,
"learning_rate": 2.235084594835263e-05,
"loss": 0.4889,
"step": 6710
},
{
"epoch": 8.593350383631714,
"grad_norm": 4.241175174713135,
"learning_rate": 2.230632235084595e-05,
"loss": 0.3766,
"step": 6720
},
{
"epoch": 8.60613810741688,
"grad_norm": 0.9913263916969299,
"learning_rate": 2.226179875333927e-05,
"loss": 0.2065,
"step": 6730
},
{
"epoch": 8.618925831202047,
"grad_norm": 4.9824442863464355,
"learning_rate": 2.221727515583259e-05,
"loss": 0.3993,
"step": 6740
},
{
"epoch": 8.631713554987213,
"grad_norm": 4.201189041137695,
"learning_rate": 2.2172751558325912e-05,
"loss": 0.3504,
"step": 6750
},
{
"epoch": 8.644501278772378,
"grad_norm": 6.985659599304199,
"learning_rate": 2.2128227960819233e-05,
"loss": 0.6149,
"step": 6760
},
{
"epoch": 8.657289002557544,
"grad_norm": 2.4724535942077637,
"learning_rate": 2.2083704363312558e-05,
"loss": 0.6447,
"step": 6770
},
{
"epoch": 8.67007672634271,
"grad_norm": 10.667593002319336,
"learning_rate": 2.203918076580588e-05,
"loss": 0.6225,
"step": 6780
},
{
"epoch": 8.682864450127877,
"grad_norm": 8.419607162475586,
"learning_rate": 2.19946571682992e-05,
"loss": 0.5348,
"step": 6790
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.02665606141090393,
"learning_rate": 2.1950133570792524e-05,
"loss": 0.607,
"step": 6800
},
{
"epoch": 8.70843989769821,
"grad_norm": 6.91499662399292,
"learning_rate": 2.1905609973285844e-05,
"loss": 0.5484,
"step": 6810
},
{
"epoch": 8.721227621483376,
"grad_norm": 3.57250714302063,
"learning_rate": 2.1861086375779162e-05,
"loss": 0.1793,
"step": 6820
},
{
"epoch": 8.734015345268542,
"grad_norm": 2.3197195529937744,
"learning_rate": 2.1816562778272486e-05,
"loss": 0.4837,
"step": 6830
},
{
"epoch": 8.746803069053708,
"grad_norm": 6.936196327209473,
"learning_rate": 2.1772039180765807e-05,
"loss": 0.4397,
"step": 6840
},
{
"epoch": 8.759590792838875,
"grad_norm": 1.3397008180618286,
"learning_rate": 2.1727515583259128e-05,
"loss": 0.7489,
"step": 6850
},
{
"epoch": 8.772378516624041,
"grad_norm": 1.0735710859298706,
"learning_rate": 2.168299198575245e-05,
"loss": 0.4262,
"step": 6860
},
{
"epoch": 8.785166240409207,
"grad_norm": 2.5606706142425537,
"learning_rate": 2.1638468388245773e-05,
"loss": 0.4965,
"step": 6870
},
{
"epoch": 8.797953964194374,
"grad_norm": 3.640894889831543,
"learning_rate": 2.1593944790739094e-05,
"loss": 0.8421,
"step": 6880
},
{
"epoch": 8.81074168797954,
"grad_norm": 4.286122798919678,
"learning_rate": 2.1549421193232415e-05,
"loss": 0.35,
"step": 6890
},
{
"epoch": 8.823529411764707,
"grad_norm": 5.966396331787109,
"learning_rate": 2.1504897595725736e-05,
"loss": 0.5545,
"step": 6900
},
{
"epoch": 8.836317135549873,
"grad_norm": 0.748144268989563,
"learning_rate": 2.1460373998219057e-05,
"loss": 0.5225,
"step": 6910
},
{
"epoch": 8.84910485933504,
"grad_norm": 1.9871532917022705,
"learning_rate": 2.1415850400712377e-05,
"loss": 0.4146,
"step": 6920
},
{
"epoch": 8.861892583120204,
"grad_norm": 6.072329521179199,
"learning_rate": 2.13713268032057e-05,
"loss": 0.2274,
"step": 6930
},
{
"epoch": 8.87468030690537,
"grad_norm": 3.948272466659546,
"learning_rate": 2.1326803205699023e-05,
"loss": 0.5427,
"step": 6940
},
{
"epoch": 8.887468030690536,
"grad_norm": 4.448310375213623,
"learning_rate": 2.1282279608192343e-05,
"loss": 0.3465,
"step": 6950
},
{
"epoch": 8.900255754475703,
"grad_norm": 2.879702568054199,
"learning_rate": 2.1237756010685664e-05,
"loss": 0.4667,
"step": 6960
},
{
"epoch": 8.91304347826087,
"grad_norm": 5.268418788909912,
"learning_rate": 2.119323241317899e-05,
"loss": 0.4483,
"step": 6970
},
{
"epoch": 8.925831202046036,
"grad_norm": 10.505093574523926,
"learning_rate": 2.114870881567231e-05,
"loss": 0.3888,
"step": 6980
},
{
"epoch": 8.938618925831202,
"grad_norm": 0.7593215107917786,
"learning_rate": 2.1104185218165627e-05,
"loss": 0.1886,
"step": 6990
},
{
"epoch": 8.951406649616368,
"grad_norm": 9.993316650390625,
"learning_rate": 2.1059661620658948e-05,
"loss": 0.6512,
"step": 7000
},
{
"epoch": 8.964194373401535,
"grad_norm": 0.009588400833308697,
"learning_rate": 2.1015138023152272e-05,
"loss": 0.4605,
"step": 7010
},
{
"epoch": 8.976982097186701,
"grad_norm": 0.6530927419662476,
"learning_rate": 2.0970614425645593e-05,
"loss": 0.35,
"step": 7020
},
{
"epoch": 8.989769820971867,
"grad_norm": 0.30554449558258057,
"learning_rate": 2.0926090828138914e-05,
"loss": 0.728,
"step": 7030
},
{
"epoch": 9.0,
"eval_loss": 0.25451144576072693,
"eval_runtime": 0.9753,
"eval_samples_per_second": 100.482,
"eval_steps_per_second": 13.329,
"step": 7038
},
{
"epoch": 9.002557544757034,
"grad_norm": 1.9777408838272095,
"learning_rate": 2.0881567230632238e-05,
"loss": 0.4878,
"step": 7040
},
{
"epoch": 9.0153452685422,
"grad_norm": 0.005459375213831663,
"learning_rate": 2.083704363312556e-05,
"loss": 0.3858,
"step": 7050
},
{
"epoch": 9.028132992327366,
"grad_norm": 1.3175240755081177,
"learning_rate": 2.079252003561888e-05,
"loss": 0.5197,
"step": 7060
},
{
"epoch": 9.040920716112533,
"grad_norm": 12.455684661865234,
"learning_rate": 2.07479964381122e-05,
"loss": 0.3106,
"step": 7070
},
{
"epoch": 9.053708439897699,
"grad_norm": 1.8953137397766113,
"learning_rate": 2.070347284060552e-05,
"loss": 0.675,
"step": 7080
},
{
"epoch": 9.066496163682864,
"grad_norm": 0.25667712092399597,
"learning_rate": 2.0658949243098843e-05,
"loss": 0.5439,
"step": 7090
},
{
"epoch": 9.07928388746803,
"grad_norm": 1.5751233100891113,
"learning_rate": 2.0614425645592163e-05,
"loss": 0.4046,
"step": 7100
},
{
"epoch": 9.092071611253196,
"grad_norm": 0.4334716200828552,
"learning_rate": 2.0569902048085488e-05,
"loss": 0.7046,
"step": 7110
},
{
"epoch": 9.104859335038363,
"grad_norm": 6.322332859039307,
"learning_rate": 2.052537845057881e-05,
"loss": 0.421,
"step": 7120
},
{
"epoch": 9.117647058823529,
"grad_norm": 7.568769454956055,
"learning_rate": 2.048085485307213e-05,
"loss": 0.3811,
"step": 7130
},
{
"epoch": 9.130434782608695,
"grad_norm": 4.8575334548950195,
"learning_rate": 2.043633125556545e-05,
"loss": 0.5919,
"step": 7140
},
{
"epoch": 9.143222506393862,
"grad_norm": 2.816992998123169,
"learning_rate": 2.0391807658058775e-05,
"loss": 0.4486,
"step": 7150
},
{
"epoch": 9.156010230179028,
"grad_norm": 0.024615757167339325,
"learning_rate": 2.0347284060552092e-05,
"loss": 0.4216,
"step": 7160
},
{
"epoch": 9.168797953964194,
"grad_norm": 13.121672630310059,
"learning_rate": 2.0302760463045413e-05,
"loss": 0.5703,
"step": 7170
},
{
"epoch": 9.18158567774936,
"grad_norm": 2.4997825622558594,
"learning_rate": 2.0258236865538737e-05,
"loss": 0.3972,
"step": 7180
},
{
"epoch": 9.194373401534527,
"grad_norm": 0.672987163066864,
"learning_rate": 2.0213713268032058e-05,
"loss": 0.3447,
"step": 7190
},
{
"epoch": 9.207161125319693,
"grad_norm": 0.3629339337348938,
"learning_rate": 2.016918967052538e-05,
"loss": 0.4216,
"step": 7200
},
{
"epoch": 9.21994884910486,
"grad_norm": 0.4874151647090912,
"learning_rate": 2.01246660730187e-05,
"loss": 0.4348,
"step": 7210
},
{
"epoch": 9.232736572890026,
"grad_norm": 3.5700504779815674,
"learning_rate": 2.0080142475512024e-05,
"loss": 0.7219,
"step": 7220
},
{
"epoch": 9.245524296675192,
"grad_norm": 7.892800331115723,
"learning_rate": 2.0035618878005345e-05,
"loss": 0.6184,
"step": 7230
},
{
"epoch": 9.258312020460359,
"grad_norm": 6.028756618499756,
"learning_rate": 1.9991095280498666e-05,
"loss": 0.2687,
"step": 7240
},
{
"epoch": 9.271099744245525,
"grad_norm": 7.790216445922852,
"learning_rate": 1.9946571682991987e-05,
"loss": 0.6128,
"step": 7250
},
{
"epoch": 9.28388746803069,
"grad_norm": 0.28933781385421753,
"learning_rate": 1.9902048085485308e-05,
"loss": 0.3225,
"step": 7260
},
{
"epoch": 9.296675191815856,
"grad_norm": 3.2421770095825195,
"learning_rate": 1.985752448797863e-05,
"loss": 0.4387,
"step": 7270
},
{
"epoch": 9.309462915601022,
"grad_norm": 0.5195609331130981,
"learning_rate": 1.981300089047195e-05,
"loss": 0.4904,
"step": 7280
},
{
"epoch": 9.322250639386189,
"grad_norm": 1.5599663257598877,
"learning_rate": 1.9768477292965274e-05,
"loss": 0.2514,
"step": 7290
},
{
"epoch": 9.335038363171355,
"grad_norm": 3.887960195541382,
"learning_rate": 1.9723953695458594e-05,
"loss": 0.4621,
"step": 7300
},
{
"epoch": 9.347826086956522,
"grad_norm": 2.202698230743408,
"learning_rate": 1.9679430097951915e-05,
"loss": 0.2866,
"step": 7310
},
{
"epoch": 9.360613810741688,
"grad_norm": 0.9991092085838318,
"learning_rate": 1.963490650044524e-05,
"loss": 0.2594,
"step": 7320
},
{
"epoch": 9.373401534526854,
"grad_norm": 0.00048051172052510083,
"learning_rate": 1.959038290293856e-05,
"loss": 0.2282,
"step": 7330
},
{
"epoch": 9.38618925831202,
"grad_norm": 10.449288368225098,
"learning_rate": 1.9545859305431878e-05,
"loss": 0.4363,
"step": 7340
},
{
"epoch": 9.398976982097187,
"grad_norm": 6.0978569984436035,
"learning_rate": 1.95013357079252e-05,
"loss": 0.4363,
"step": 7350
},
{
"epoch": 9.411764705882353,
"grad_norm": 13.751110076904297,
"learning_rate": 1.9456812110418523e-05,
"loss": 0.5147,
"step": 7360
},
{
"epoch": 9.42455242966752,
"grad_norm": 1.5191411972045898,
"learning_rate": 1.9412288512911844e-05,
"loss": 0.2839,
"step": 7370
},
{
"epoch": 9.437340153452686,
"grad_norm": 0.6311479806900024,
"learning_rate": 1.9367764915405165e-05,
"loss": 0.4466,
"step": 7380
},
{
"epoch": 9.450127877237852,
"grad_norm": 2.6968650817871094,
"learning_rate": 1.932324131789849e-05,
"loss": 0.3821,
"step": 7390
},
{
"epoch": 9.462915601023019,
"grad_norm": 2.450756549835205,
"learning_rate": 1.927871772039181e-05,
"loss": 0.4983,
"step": 7400
},
{
"epoch": 9.475703324808185,
"grad_norm": 2.733262538909912,
"learning_rate": 1.923419412288513e-05,
"loss": 0.3849,
"step": 7410
},
{
"epoch": 9.48849104859335,
"grad_norm": 1.4597264528274536,
"learning_rate": 1.9189670525378452e-05,
"loss": 0.1508,
"step": 7420
},
{
"epoch": 9.501278772378516,
"grad_norm": 2.9642603397369385,
"learning_rate": 1.9145146927871773e-05,
"loss": 0.093,
"step": 7430
},
{
"epoch": 9.514066496163682,
"grad_norm": 2.610978603363037,
"learning_rate": 1.9100623330365093e-05,
"loss": 0.4723,
"step": 7440
},
{
"epoch": 9.526854219948849,
"grad_norm": 4.64446496963501,
"learning_rate": 1.9056099732858414e-05,
"loss": 0.5699,
"step": 7450
},
{
"epoch": 9.539641943734015,
"grad_norm": 2.086815357208252,
"learning_rate": 1.901157613535174e-05,
"loss": 0.3695,
"step": 7460
},
{
"epoch": 9.552429667519181,
"grad_norm": 5.845988750457764,
"learning_rate": 1.896705253784506e-05,
"loss": 0.5565,
"step": 7470
},
{
"epoch": 9.565217391304348,
"grad_norm": 1.279890537261963,
"learning_rate": 1.892252894033838e-05,
"loss": 0.4849,
"step": 7480
},
{
"epoch": 9.578005115089514,
"grad_norm": 0.4533943235874176,
"learning_rate": 1.88780053428317e-05,
"loss": 0.3737,
"step": 7490
},
{
"epoch": 9.59079283887468,
"grad_norm": 2.034996271133423,
"learning_rate": 1.8833481745325026e-05,
"loss": 0.6928,
"step": 7500
},
{
"epoch": 9.603580562659847,
"grad_norm": 0.6041597723960876,
"learning_rate": 1.8788958147818343e-05,
"loss": 0.382,
"step": 7510
},
{
"epoch": 9.616368286445013,
"grad_norm": 8.170845031738281,
"learning_rate": 1.8744434550311664e-05,
"loss": 0.4653,
"step": 7520
},
{
"epoch": 9.62915601023018,
"grad_norm": 0.5266035199165344,
"learning_rate": 1.8699910952804988e-05,
"loss": 0.5368,
"step": 7530
},
{
"epoch": 9.641943734015346,
"grad_norm": 9.099715232849121,
"learning_rate": 1.865538735529831e-05,
"loss": 0.5995,
"step": 7540
},
{
"epoch": 9.654731457800512,
"grad_norm": 0.364877313375473,
"learning_rate": 1.861086375779163e-05,
"loss": 0.5252,
"step": 7550
},
{
"epoch": 9.667519181585678,
"grad_norm": 0.291299045085907,
"learning_rate": 1.856634016028495e-05,
"loss": 0.5014,
"step": 7560
},
{
"epoch": 9.680306905370845,
"grad_norm": 10.438728332519531,
"learning_rate": 1.8521816562778275e-05,
"loss": 0.5446,
"step": 7570
},
{
"epoch": 9.693094629156011,
"grad_norm": 1.1329154968261719,
"learning_rate": 1.8477292965271596e-05,
"loss": 0.5574,
"step": 7580
},
{
"epoch": 9.705882352941176,
"grad_norm": 0.9410423040390015,
"learning_rate": 1.8432769367764917e-05,
"loss": 0.6805,
"step": 7590
},
{
"epoch": 9.718670076726342,
"grad_norm": 13.469749450683594,
"learning_rate": 1.8388245770258238e-05,
"loss": 0.7721,
"step": 7600
},
{
"epoch": 9.731457800511508,
"grad_norm": 12.993340492248535,
"learning_rate": 1.834372217275156e-05,
"loss": 0.4185,
"step": 7610
},
{
"epoch": 9.744245524296675,
"grad_norm": 0.5286237001419067,
"learning_rate": 1.829919857524488e-05,
"loss": 0.2877,
"step": 7620
},
{
"epoch": 9.757033248081841,
"grad_norm": 7.435681343078613,
"learning_rate": 1.82546749777382e-05,
"loss": 0.6757,
"step": 7630
},
{
"epoch": 9.769820971867007,
"grad_norm": 2.0388455390930176,
"learning_rate": 1.8210151380231525e-05,
"loss": 0.4057,
"step": 7640
},
{
"epoch": 9.782608695652174,
"grad_norm": 2.632791757583618,
"learning_rate": 1.8165627782724845e-05,
"loss": 0.5251,
"step": 7650
},
{
"epoch": 9.79539641943734,
"grad_norm": 0.16664129495620728,
"learning_rate": 1.8121104185218166e-05,
"loss": 0.252,
"step": 7660
},
{
"epoch": 9.808184143222507,
"grad_norm": 0.003070043632760644,
"learning_rate": 1.807658058771149e-05,
"loss": 0.7866,
"step": 7670
},
{
"epoch": 9.820971867007673,
"grad_norm": 5.778616428375244,
"learning_rate": 1.8032056990204808e-05,
"loss": 0.4152,
"step": 7680
},
{
"epoch": 9.83375959079284,
"grad_norm": 0.07909457385540009,
"learning_rate": 1.798753339269813e-05,
"loss": 0.3552,
"step": 7690
},
{
"epoch": 9.846547314578006,
"grad_norm": 8.888358116149902,
"learning_rate": 1.794300979519145e-05,
"loss": 0.3736,
"step": 7700
},
{
"epoch": 9.859335038363172,
"grad_norm": 11.318596839904785,
"learning_rate": 1.7898486197684774e-05,
"loss": 0.4869,
"step": 7710
},
{
"epoch": 9.872122762148338,
"grad_norm": 0.5175634026527405,
"learning_rate": 1.7853962600178095e-05,
"loss": 0.2856,
"step": 7720
},
{
"epoch": 9.884910485933505,
"grad_norm": 7.7081379890441895,
"learning_rate": 1.7809439002671416e-05,
"loss": 0.3757,
"step": 7730
},
{
"epoch": 9.89769820971867,
"grad_norm": 7.398036956787109,
"learning_rate": 1.776491540516474e-05,
"loss": 0.6392,
"step": 7740
},
{
"epoch": 9.910485933503836,
"grad_norm": 0.41945111751556396,
"learning_rate": 1.772039180765806e-05,
"loss": 0.3977,
"step": 7750
},
{
"epoch": 9.923273657289002,
"grad_norm": 0.057822152972221375,
"learning_rate": 1.7675868210151382e-05,
"loss": 0.6515,
"step": 7760
},
{
"epoch": 9.936061381074168,
"grad_norm": 2.6882266998291016,
"learning_rate": 1.7631344612644703e-05,
"loss": 0.4997,
"step": 7770
},
{
"epoch": 9.948849104859335,
"grad_norm": 1.6198879480361938,
"learning_rate": 1.7586821015138024e-05,
"loss": 0.364,
"step": 7780
},
{
"epoch": 9.961636828644501,
"grad_norm": 4.458327293395996,
"learning_rate": 1.7542297417631344e-05,
"loss": 0.5551,
"step": 7790
},
{
"epoch": 9.974424552429667,
"grad_norm": 9.222285270690918,
"learning_rate": 1.7497773820124665e-05,
"loss": 0.5828,
"step": 7800
},
{
"epoch": 9.987212276214834,
"grad_norm": 8.898882865905762,
"learning_rate": 1.745325022261799e-05,
"loss": 0.4641,
"step": 7810
},
{
"epoch": 10.0,
"grad_norm": 8.259880065917969,
"learning_rate": 1.740872662511131e-05,
"loss": 0.4681,
"step": 7820
},
{
"epoch": 10.0,
"eval_loss": 0.24825578927993774,
"eval_runtime": 0.8161,
"eval_samples_per_second": 120.081,
"eval_steps_per_second": 15.929,
"step": 7820
},
{
"epoch": 10.012787723785166,
"grad_norm": 11.80496597290039,
"learning_rate": 1.736420302760463e-05,
"loss": 0.4748,
"step": 7830
},
{
"epoch": 10.025575447570333,
"grad_norm": 0.5205655694007874,
"learning_rate": 1.7319679430097952e-05,
"loss": 0.593,
"step": 7840
},
{
"epoch": 10.038363171355499,
"grad_norm": 1.392087459564209,
"learning_rate": 1.7275155832591277e-05,
"loss": 0.5828,
"step": 7850
},
{
"epoch": 10.051150895140665,
"grad_norm": 6.822452545166016,
"learning_rate": 1.7230632235084594e-05,
"loss": 0.4847,
"step": 7860
},
{
"epoch": 10.063938618925832,
"grad_norm": 4.766864776611328,
"learning_rate": 1.7186108637577915e-05,
"loss": 0.3282,
"step": 7870
},
{
"epoch": 10.076726342710998,
"grad_norm": 9.132004737854004,
"learning_rate": 1.714158504007124e-05,
"loss": 0.4287,
"step": 7880
},
{
"epoch": 10.089514066496164,
"grad_norm": 0.027325913310050964,
"learning_rate": 1.709706144256456e-05,
"loss": 0.2849,
"step": 7890
},
{
"epoch": 10.10230179028133,
"grad_norm": 3.668032169342041,
"learning_rate": 1.705253784505788e-05,
"loss": 0.2403,
"step": 7900
},
{
"epoch": 10.115089514066495,
"grad_norm": 0.08681362867355347,
"learning_rate": 1.7008014247551202e-05,
"loss": 0.6101,
"step": 7910
},
{
"epoch": 10.127877237851662,
"grad_norm": 0.0037536800373345613,
"learning_rate": 1.6963490650044526e-05,
"loss": 0.2353,
"step": 7920
},
{
"epoch": 10.140664961636828,
"grad_norm": 4.245620250701904,
"learning_rate": 1.6918967052537847e-05,
"loss": 0.4882,
"step": 7930
},
{
"epoch": 10.153452685421994,
"grad_norm": 3.511157751083374,
"learning_rate": 1.6874443455031168e-05,
"loss": 0.2636,
"step": 7940
},
{
"epoch": 10.16624040920716,
"grad_norm": 0.33653515577316284,
"learning_rate": 1.682991985752449e-05,
"loss": 0.2167,
"step": 7950
},
{
"epoch": 10.179028132992327,
"grad_norm": 2.5212366580963135,
"learning_rate": 1.678539626001781e-05,
"loss": 0.3128,
"step": 7960
},
{
"epoch": 10.191815856777493,
"grad_norm": 0.11155296862125397,
"learning_rate": 1.674087266251113e-05,
"loss": 0.1596,
"step": 7970
},
{
"epoch": 10.20460358056266,
"grad_norm": 4.983904838562012,
"learning_rate": 1.669634906500445e-05,
"loss": 0.4555,
"step": 7980
},
{
"epoch": 10.217391304347826,
"grad_norm": 2.878169536590576,
"learning_rate": 1.6651825467497776e-05,
"loss": 0.4383,
"step": 7990
},
{
"epoch": 10.230179028132993,
"grad_norm": 2.0731053352355957,
"learning_rate": 1.6607301869991096e-05,
"loss": 0.4476,
"step": 8000
},
{
"epoch": 10.242966751918159,
"grad_norm": 1.8638560771942139,
"learning_rate": 1.6562778272484417e-05,
"loss": 0.6536,
"step": 8010
},
{
"epoch": 10.255754475703325,
"grad_norm": 3.486363172531128,
"learning_rate": 1.651825467497774e-05,
"loss": 0.2838,
"step": 8020
},
{
"epoch": 10.268542199488492,
"grad_norm": 1.8177838325500488,
"learning_rate": 1.647373107747106e-05,
"loss": 0.5319,
"step": 8030
},
{
"epoch": 10.281329923273658,
"grad_norm": 2.788775682449341,
"learning_rate": 1.642920747996438e-05,
"loss": 0.2796,
"step": 8040
},
{
"epoch": 10.294117647058824,
"grad_norm": 3.992607355117798,
"learning_rate": 1.63846838824577e-05,
"loss": 0.3864,
"step": 8050
},
{
"epoch": 10.30690537084399,
"grad_norm": 1.2651846408843994,
"learning_rate": 1.6340160284951025e-05,
"loss": 0.5798,
"step": 8060
},
{
"epoch": 10.319693094629155,
"grad_norm": 5.219095706939697,
"learning_rate": 1.6295636687444346e-05,
"loss": 0.6797,
"step": 8070
},
{
"epoch": 10.332480818414322,
"grad_norm": 9.029091835021973,
"learning_rate": 1.6251113089937667e-05,
"loss": 0.7711,
"step": 8080
},
{
"epoch": 10.345268542199488,
"grad_norm": 1.4664132595062256,
"learning_rate": 1.620658949243099e-05,
"loss": 0.6837,
"step": 8090
},
{
"epoch": 10.358056265984654,
"grad_norm": 1.5009568929672241,
"learning_rate": 1.6162065894924312e-05,
"loss": 0.2394,
"step": 8100
},
{
"epoch": 10.37084398976982,
"grad_norm": 3.754551410675049,
"learning_rate": 1.6117542297417633e-05,
"loss": 0.5243,
"step": 8110
},
{
"epoch": 10.383631713554987,
"grad_norm": 0.4475407302379608,
"learning_rate": 1.6073018699910954e-05,
"loss": 0.4114,
"step": 8120
},
{
"epoch": 10.396419437340153,
"grad_norm": 0.2138330191373825,
"learning_rate": 1.6028495102404275e-05,
"loss": 0.4244,
"step": 8130
},
{
"epoch": 10.40920716112532,
"grad_norm": 0.3168162703514099,
"learning_rate": 1.5983971504897595e-05,
"loss": 0.3199,
"step": 8140
},
{
"epoch": 10.421994884910486,
"grad_norm": 7.511226654052734,
"learning_rate": 1.5939447907390916e-05,
"loss": 0.8374,
"step": 8150
},
{
"epoch": 10.434782608695652,
"grad_norm": 1.9676591157913208,
"learning_rate": 1.589492430988424e-05,
"loss": 0.3264,
"step": 8160
},
{
"epoch": 10.447570332480819,
"grad_norm": 0.2713398039340973,
"learning_rate": 1.585040071237756e-05,
"loss": 0.1938,
"step": 8170
},
{
"epoch": 10.460358056265985,
"grad_norm": 3.062129020690918,
"learning_rate": 1.5805877114870882e-05,
"loss": 0.3254,
"step": 8180
},
{
"epoch": 10.473145780051151,
"grad_norm": 6.017972946166992,
"learning_rate": 1.5761353517364203e-05,
"loss": 0.298,
"step": 8190
},
{
"epoch": 10.485933503836318,
"grad_norm": 4.598347187042236,
"learning_rate": 1.5716829919857528e-05,
"loss": 0.5199,
"step": 8200
},
{
"epoch": 10.498721227621484,
"grad_norm": 4.863534927368164,
"learning_rate": 1.5672306322350845e-05,
"loss": 0.6863,
"step": 8210
},
{
"epoch": 10.51150895140665,
"grad_norm": 2.691654682159424,
"learning_rate": 1.5627782724844166e-05,
"loss": 0.3303,
"step": 8220
},
{
"epoch": 10.524296675191817,
"grad_norm": 4.457635879516602,
"learning_rate": 1.558325912733749e-05,
"loss": 0.4365,
"step": 8230
},
{
"epoch": 10.537084398976981,
"grad_norm": 5.06966495513916,
"learning_rate": 1.553873552983081e-05,
"loss": 0.3855,
"step": 8240
},
{
"epoch": 10.549872122762148,
"grad_norm": 1.14505934715271,
"learning_rate": 1.5494211932324132e-05,
"loss": 0.3378,
"step": 8250
},
{
"epoch": 10.562659846547314,
"grad_norm": 2.8286354541778564,
"learning_rate": 1.5449688334817456e-05,
"loss": 0.3858,
"step": 8260
},
{
"epoch": 10.57544757033248,
"grad_norm": 0.6444804668426514,
"learning_rate": 1.5405164737310777e-05,
"loss": 0.3181,
"step": 8270
},
{
"epoch": 10.588235294117647,
"grad_norm": 1.543545126914978,
"learning_rate": 1.5360641139804098e-05,
"loss": 0.5232,
"step": 8280
},
{
"epoch": 10.601023017902813,
"grad_norm": 1.701751708984375,
"learning_rate": 1.531611754229742e-05,
"loss": 0.4632,
"step": 8290
},
{
"epoch": 10.61381074168798,
"grad_norm": 3.7274434566497803,
"learning_rate": 1.527159394479074e-05,
"loss": 0.3775,
"step": 8300
},
{
"epoch": 10.626598465473146,
"grad_norm": 3.264287233352661,
"learning_rate": 1.5227070347284062e-05,
"loss": 0.5725,
"step": 8310
},
{
"epoch": 10.639386189258312,
"grad_norm": 1.2630383968353271,
"learning_rate": 1.5182546749777381e-05,
"loss": 0.2777,
"step": 8320
},
{
"epoch": 10.652173913043478,
"grad_norm": 1.6009305715560913,
"learning_rate": 1.5138023152270706e-05,
"loss": 0.6674,
"step": 8330
},
{
"epoch": 10.664961636828645,
"grad_norm": 3.4451444149017334,
"learning_rate": 1.5093499554764027e-05,
"loss": 0.3947,
"step": 8340
},
{
"epoch": 10.677749360613811,
"grad_norm": 11.27043342590332,
"learning_rate": 1.5048975957257347e-05,
"loss": 0.6128,
"step": 8350
},
{
"epoch": 10.690537084398978,
"grad_norm": 5.958801746368408,
"learning_rate": 1.5004452359750668e-05,
"loss": 0.4243,
"step": 8360
},
{
"epoch": 10.703324808184144,
"grad_norm": 1.0724457502365112,
"learning_rate": 1.4959928762243991e-05,
"loss": 0.2594,
"step": 8370
},
{
"epoch": 10.71611253196931,
"grad_norm": 0.009859677404165268,
"learning_rate": 1.4915405164737312e-05,
"loss": 0.3739,
"step": 8380
},
{
"epoch": 10.728900255754475,
"grad_norm": 0.07025722414255142,
"learning_rate": 1.4870881567230633e-05,
"loss": 0.4447,
"step": 8390
},
{
"epoch": 10.741687979539641,
"grad_norm": 0.9080604314804077,
"learning_rate": 1.4826357969723955e-05,
"loss": 0.3147,
"step": 8400
},
{
"epoch": 10.754475703324808,
"grad_norm": 4.071467399597168,
"learning_rate": 1.4781834372217276e-05,
"loss": 0.808,
"step": 8410
},
{
"epoch": 10.767263427109974,
"grad_norm": 1.357560634613037,
"learning_rate": 1.4737310774710597e-05,
"loss": 0.4693,
"step": 8420
},
{
"epoch": 10.78005115089514,
"grad_norm": 9.695436477661133,
"learning_rate": 1.4692787177203918e-05,
"loss": 0.7551,
"step": 8430
},
{
"epoch": 10.792838874680307,
"grad_norm": 7.561251163482666,
"learning_rate": 1.464826357969724e-05,
"loss": 0.5718,
"step": 8440
},
{
"epoch": 10.805626598465473,
"grad_norm": 5.493609428405762,
"learning_rate": 1.4603739982190561e-05,
"loss": 0.5034,
"step": 8450
},
{
"epoch": 10.81841432225064,
"grad_norm": 1.3392680883407593,
"learning_rate": 1.4559216384683882e-05,
"loss": 0.9159,
"step": 8460
},
{
"epoch": 10.831202046035806,
"grad_norm": 0.01922355219721794,
"learning_rate": 1.4514692787177206e-05,
"loss": 0.0897,
"step": 8470
},
{
"epoch": 10.843989769820972,
"grad_norm": 4.552350044250488,
"learning_rate": 1.4470169189670527e-05,
"loss": 0.4221,
"step": 8480
},
{
"epoch": 10.856777493606138,
"grad_norm": 0.30915215611457825,
"learning_rate": 1.4425645592163846e-05,
"loss": 0.2513,
"step": 8490
},
{
"epoch": 10.869565217391305,
"grad_norm": 7.572796821594238,
"learning_rate": 1.4381121994657167e-05,
"loss": 0.6666,
"step": 8500
},
{
"epoch": 10.882352941176471,
"grad_norm": 3.443830966949463,
"learning_rate": 1.4336598397150492e-05,
"loss": 0.2706,
"step": 8510
},
{
"epoch": 10.895140664961637,
"grad_norm": 2.4022560119628906,
"learning_rate": 1.4292074799643812e-05,
"loss": 0.4621,
"step": 8520
},
{
"epoch": 10.907928388746804,
"grad_norm": 1.282814860343933,
"learning_rate": 1.4247551202137133e-05,
"loss": 0.2189,
"step": 8530
},
{
"epoch": 10.92071611253197,
"grad_norm": 0.48741990327835083,
"learning_rate": 1.4203027604630456e-05,
"loss": 0.6214,
"step": 8540
},
{
"epoch": 10.933503836317136,
"grad_norm": 5.9614434242248535,
"learning_rate": 1.4158504007123777e-05,
"loss": 0.3349,
"step": 8550
},
{
"epoch": 10.946291560102301,
"grad_norm": 1.9144011735916138,
"learning_rate": 1.4113980409617098e-05,
"loss": 0.4819,
"step": 8560
},
{
"epoch": 10.959079283887467,
"grad_norm": 1.8512147665023804,
"learning_rate": 1.4069456812110419e-05,
"loss": 0.3379,
"step": 8570
},
{
"epoch": 10.971867007672634,
"grad_norm": 5.208499908447266,
"learning_rate": 1.4024933214603741e-05,
"loss": 0.1968,
"step": 8580
},
{
"epoch": 10.9846547314578,
"grad_norm": 0.16035664081573486,
"learning_rate": 1.3980409617097062e-05,
"loss": 0.4305,
"step": 8590
},
{
"epoch": 10.997442455242966,
"grad_norm": 7.619056224822998,
"learning_rate": 1.3935886019590383e-05,
"loss": 0.6934,
"step": 8600
},
{
"epoch": 11.0,
"eval_loss": 0.23721352219581604,
"eval_runtime": 0.9685,
"eval_samples_per_second": 101.184,
"eval_steps_per_second": 13.422,
"step": 8602
},
{
"epoch": 11.010230179028133,
"grad_norm": 6.740601062774658,
"learning_rate": 1.3891362422083707e-05,
"loss": 0.459,
"step": 8610
},
{
"epoch": 11.023017902813299,
"grad_norm": 0.46416130661964417,
"learning_rate": 1.3846838824577026e-05,
"loss": 0.3525,
"step": 8620
},
{
"epoch": 11.035805626598465,
"grad_norm": 7.859157562255859,
"learning_rate": 1.3802315227070347e-05,
"loss": 0.4976,
"step": 8630
},
{
"epoch": 11.048593350383632,
"grad_norm": 0.018627116456627846,
"learning_rate": 1.3757791629563668e-05,
"loss": 0.2593,
"step": 8640
},
{
"epoch": 11.061381074168798,
"grad_norm": 0.6259503960609436,
"learning_rate": 1.3713268032056992e-05,
"loss": 0.4457,
"step": 8650
},
{
"epoch": 11.074168797953964,
"grad_norm": 0.4090126156806946,
"learning_rate": 1.3668744434550313e-05,
"loss": 0.4761,
"step": 8660
},
{
"epoch": 11.08695652173913,
"grad_norm": 0.9712822437286377,
"learning_rate": 1.3624220837043632e-05,
"loss": 0.3631,
"step": 8670
},
{
"epoch": 11.099744245524297,
"grad_norm": 5.4518046379089355,
"learning_rate": 1.3579697239536957e-05,
"loss": 0.3915,
"step": 8680
},
{
"epoch": 11.112531969309464,
"grad_norm": 11.45752239227295,
"learning_rate": 1.3535173642030278e-05,
"loss": 0.4239,
"step": 8690
},
{
"epoch": 11.12531969309463,
"grad_norm": 0.3467662036418915,
"learning_rate": 1.3490650044523598e-05,
"loss": 0.6385,
"step": 8700
},
{
"epoch": 11.138107416879796,
"grad_norm": 0.231705442070961,
"learning_rate": 1.344612644701692e-05,
"loss": 0.3373,
"step": 8710
},
{
"epoch": 11.15089514066496,
"grad_norm": 7.90257453918457,
"learning_rate": 1.3401602849510242e-05,
"loss": 0.5158,
"step": 8720
},
{
"epoch": 11.163682864450127,
"grad_norm": 0.020741138607263565,
"learning_rate": 1.3357079252003563e-05,
"loss": 0.4563,
"step": 8730
},
{
"epoch": 11.176470588235293,
"grad_norm": 9.415164947509766,
"learning_rate": 1.3312555654496884e-05,
"loss": 0.5224,
"step": 8740
},
{
"epoch": 11.18925831202046,
"grad_norm": 11.169380187988281,
"learning_rate": 1.3268032056990206e-05,
"loss": 0.477,
"step": 8750
},
{
"epoch": 11.202046035805626,
"grad_norm": 6.20239782333374,
"learning_rate": 1.3223508459483527e-05,
"loss": 0.41,
"step": 8760
},
{
"epoch": 11.214833759590793,
"grad_norm": 11.079065322875977,
"learning_rate": 1.3178984861976848e-05,
"loss": 0.5932,
"step": 8770
},
{
"epoch": 11.227621483375959,
"grad_norm": 0.30395638942718506,
"learning_rate": 1.3134461264470169e-05,
"loss": 0.4147,
"step": 8780
},
{
"epoch": 11.240409207161125,
"grad_norm": 1.8934444189071655,
"learning_rate": 1.3089937666963491e-05,
"loss": 0.4328,
"step": 8790
},
{
"epoch": 11.253196930946292,
"grad_norm": 8.08961009979248,
"learning_rate": 1.3045414069456812e-05,
"loss": 0.5754,
"step": 8800
},
{
"epoch": 11.265984654731458,
"grad_norm": 0.6705631017684937,
"learning_rate": 1.3000890471950133e-05,
"loss": 0.4323,
"step": 8810
},
{
"epoch": 11.278772378516624,
"grad_norm": 11.220414161682129,
"learning_rate": 1.2956366874443457e-05,
"loss": 0.5894,
"step": 8820
},
{
"epoch": 11.29156010230179,
"grad_norm": 8.43408489227295,
"learning_rate": 1.2911843276936778e-05,
"loss": 0.6214,
"step": 8830
},
{
"epoch": 11.304347826086957,
"grad_norm": 1.6605695486068726,
"learning_rate": 1.2867319679430097e-05,
"loss": 0.2958,
"step": 8840
},
{
"epoch": 11.317135549872123,
"grad_norm": 5.640063762664795,
"learning_rate": 1.2822796081923418e-05,
"loss": 0.433,
"step": 8850
},
{
"epoch": 11.32992327365729,
"grad_norm": 2.0308477878570557,
"learning_rate": 1.2778272484416743e-05,
"loss": 0.7216,
"step": 8860
},
{
"epoch": 11.342710997442456,
"grad_norm": 0.7579576373100281,
"learning_rate": 1.2733748886910063e-05,
"loss": 0.1198,
"step": 8870
},
{
"epoch": 11.355498721227622,
"grad_norm": 5.299111366271973,
"learning_rate": 1.2689225289403384e-05,
"loss": 0.2332,
"step": 8880
},
{
"epoch": 11.368286445012787,
"grad_norm": 4.141612529754639,
"learning_rate": 1.2644701691896707e-05,
"loss": 0.3213,
"step": 8890
},
{
"epoch": 11.381074168797953,
"grad_norm": 5.730710983276367,
"learning_rate": 1.2600178094390028e-05,
"loss": 0.3246,
"step": 8900
},
{
"epoch": 11.39386189258312,
"grad_norm": 2.543301582336426,
"learning_rate": 1.2555654496883349e-05,
"loss": 0.4036,
"step": 8910
},
{
"epoch": 11.406649616368286,
"grad_norm": 0.17073774337768555,
"learning_rate": 1.251113089937667e-05,
"loss": 0.6859,
"step": 8920
},
{
"epoch": 11.419437340153452,
"grad_norm": 0.9711840152740479,
"learning_rate": 1.246660730186999e-05,
"loss": 0.3302,
"step": 8930
},
{
"epoch": 11.432225063938619,
"grad_norm": 11.181052207946777,
"learning_rate": 1.2422083704363313e-05,
"loss": 0.5149,
"step": 8940
},
{
"epoch": 11.445012787723785,
"grad_norm": 0.5489789247512817,
"learning_rate": 1.2377560106856636e-05,
"loss": 0.767,
"step": 8950
},
{
"epoch": 11.457800511508951,
"grad_norm": 2.264814615249634,
"learning_rate": 1.2333036509349956e-05,
"loss": 0.1734,
"step": 8960
},
{
"epoch": 11.470588235294118,
"grad_norm": 2.2042288780212402,
"learning_rate": 1.2288512911843277e-05,
"loss": 0.5622,
"step": 8970
},
{
"epoch": 11.483375959079284,
"grad_norm": 0.05039510875940323,
"learning_rate": 1.2243989314336598e-05,
"loss": 0.801,
"step": 8980
},
{
"epoch": 11.49616368286445,
"grad_norm": 8.120935440063477,
"learning_rate": 1.219946571682992e-05,
"loss": 0.3447,
"step": 8990
},
{
"epoch": 11.508951406649617,
"grad_norm": 6.028167724609375,
"learning_rate": 1.2154942119323242e-05,
"loss": 0.6221,
"step": 9000
},
{
"epoch": 11.521739130434783,
"grad_norm": 0.06073115020990372,
"learning_rate": 1.2110418521816562e-05,
"loss": 0.1343,
"step": 9010
},
{
"epoch": 11.53452685421995,
"grad_norm": 1.3959262371063232,
"learning_rate": 1.2065894924309885e-05,
"loss": 0.213,
"step": 9020
},
{
"epoch": 11.547314578005116,
"grad_norm": 0.5288462042808533,
"learning_rate": 1.2021371326803206e-05,
"loss": 0.5034,
"step": 9030
},
{
"epoch": 11.56010230179028,
"grad_norm": 5.911684989929199,
"learning_rate": 1.1976847729296529e-05,
"loss": 0.3314,
"step": 9040
},
{
"epoch": 11.572890025575447,
"grad_norm": 2.7740604877471924,
"learning_rate": 1.193232413178985e-05,
"loss": 0.4807,
"step": 9050
},
{
"epoch": 11.585677749360613,
"grad_norm": 0.6244329810142517,
"learning_rate": 1.188780053428317e-05,
"loss": 0.1838,
"step": 9060
},
{
"epoch": 11.59846547314578,
"grad_norm": 2.633812189102173,
"learning_rate": 1.1843276936776491e-05,
"loss": 0.4862,
"step": 9070
},
{
"epoch": 11.611253196930946,
"grad_norm": 10.810276985168457,
"learning_rate": 1.1798753339269814e-05,
"loss": 0.4788,
"step": 9080
},
{
"epoch": 11.624040920716112,
"grad_norm": 2.0004940032958984,
"learning_rate": 1.1754229741763136e-05,
"loss": 0.3771,
"step": 9090
},
{
"epoch": 11.636828644501279,
"grad_norm": 0.30808359384536743,
"learning_rate": 1.1709706144256455e-05,
"loss": 0.335,
"step": 9100
},
{
"epoch": 11.649616368286445,
"grad_norm": 5.277163028717041,
"learning_rate": 1.1665182546749778e-05,
"loss": 0.4029,
"step": 9110
},
{
"epoch": 11.662404092071611,
"grad_norm": 0.022072020918130875,
"learning_rate": 1.1620658949243099e-05,
"loss": 0.4137,
"step": 9120
},
{
"epoch": 11.675191815856778,
"grad_norm": 3.9940779209136963,
"learning_rate": 1.1576135351736421e-05,
"loss": 0.514,
"step": 9130
},
{
"epoch": 11.687979539641944,
"grad_norm": 0.14408734440803528,
"learning_rate": 1.1531611754229742e-05,
"loss": 0.1492,
"step": 9140
},
{
"epoch": 11.70076726342711,
"grad_norm": 2.5065701007843018,
"learning_rate": 1.1487088156723063e-05,
"loss": 0.4501,
"step": 9150
},
{
"epoch": 11.713554987212277,
"grad_norm": 3.592348098754883,
"learning_rate": 1.1442564559216386e-05,
"loss": 0.2521,
"step": 9160
},
{
"epoch": 11.726342710997443,
"grad_norm": 4.004711151123047,
"learning_rate": 1.1398040961709707e-05,
"loss": 0.3974,
"step": 9170
},
{
"epoch": 11.73913043478261,
"grad_norm": 8.429434776306152,
"learning_rate": 1.135351736420303e-05,
"loss": 0.4025,
"step": 9180
},
{
"epoch": 11.751918158567776,
"grad_norm": 5.526464462280273,
"learning_rate": 1.1308993766696348e-05,
"loss": 0.479,
"step": 9190
},
{
"epoch": 11.764705882352942,
"grad_norm": 0.15864869952201843,
"learning_rate": 1.1264470169189671e-05,
"loss": 0.7387,
"step": 9200
},
{
"epoch": 11.777493606138107,
"grad_norm": 0.2955300509929657,
"learning_rate": 1.1219946571682992e-05,
"loss": 0.2657,
"step": 9210
},
{
"epoch": 11.790281329923273,
"grad_norm": 1.9746955633163452,
"learning_rate": 1.1175422974176314e-05,
"loss": 0.3473,
"step": 9220
},
{
"epoch": 11.80306905370844,
"grad_norm": 0.009776926599442959,
"learning_rate": 1.1130899376669635e-05,
"loss": 0.4985,
"step": 9230
},
{
"epoch": 11.815856777493606,
"grad_norm": 0.8222724199295044,
"learning_rate": 1.1086375779162956e-05,
"loss": 0.1595,
"step": 9240
},
{
"epoch": 11.828644501278772,
"grad_norm": 2.545729875564575,
"learning_rate": 1.1041852181656279e-05,
"loss": 0.2437,
"step": 9250
},
{
"epoch": 11.841432225063938,
"grad_norm": 1.1305873394012451,
"learning_rate": 1.09973285841496e-05,
"loss": 0.4101,
"step": 9260
},
{
"epoch": 11.854219948849105,
"grad_norm": 1.6995846033096313,
"learning_rate": 1.0952804986642922e-05,
"loss": 0.3361,
"step": 9270
},
{
"epoch": 11.867007672634271,
"grad_norm": 1.532027244567871,
"learning_rate": 1.0908281389136243e-05,
"loss": 0.2246,
"step": 9280
},
{
"epoch": 11.879795396419437,
"grad_norm": 0.10980970412492752,
"learning_rate": 1.0863757791629564e-05,
"loss": 0.1659,
"step": 9290
},
{
"epoch": 11.892583120204604,
"grad_norm": 1.9785058498382568,
"learning_rate": 1.0819234194122887e-05,
"loss": 0.4823,
"step": 9300
},
{
"epoch": 11.90537084398977,
"grad_norm": 2.5999562740325928,
"learning_rate": 1.0774710596616207e-05,
"loss": 0.2816,
"step": 9310
},
{
"epoch": 11.918158567774936,
"grad_norm": 7.072868824005127,
"learning_rate": 1.0730186999109528e-05,
"loss": 0.4803,
"step": 9320
},
{
"epoch": 11.930946291560103,
"grad_norm": 0.15491001307964325,
"learning_rate": 1.068566340160285e-05,
"loss": 0.6164,
"step": 9330
},
{
"epoch": 11.94373401534527,
"grad_norm": 5.728983402252197,
"learning_rate": 1.0641139804096172e-05,
"loss": 0.323,
"step": 9340
},
{
"epoch": 11.956521739130435,
"grad_norm": 2.930337905883789,
"learning_rate": 1.0596616206589494e-05,
"loss": 0.3664,
"step": 9350
},
{
"epoch": 11.969309462915602,
"grad_norm": 1.9165003299713135,
"learning_rate": 1.0552092609082813e-05,
"loss": 0.8193,
"step": 9360
},
{
"epoch": 11.982097186700766,
"grad_norm": 1.9771157503128052,
"learning_rate": 1.0507569011576136e-05,
"loss": 0.3378,
"step": 9370
},
{
"epoch": 11.994884910485933,
"grad_norm": 0.9281581044197083,
"learning_rate": 1.0463045414069457e-05,
"loss": 0.5581,
"step": 9380
},
{
"epoch": 12.0,
"eval_loss": 0.23496317863464355,
"eval_runtime": 0.9841,
"eval_samples_per_second": 99.579,
"eval_steps_per_second": 13.21,
"step": 9384
},
{
"epoch": 12.007672634271099,
"grad_norm": 2.9337079524993896,
"learning_rate": 1.041852181656278e-05,
"loss": 0.3057,
"step": 9390
},
{
"epoch": 12.020460358056265,
"grad_norm": 0.2717020809650421,
"learning_rate": 1.03739982190561e-05,
"loss": 0.5384,
"step": 9400
},
{
"epoch": 12.033248081841432,
"grad_norm": 0.10977739095687866,
"learning_rate": 1.0329474621549421e-05,
"loss": 0.461,
"step": 9410
},
{
"epoch": 12.046035805626598,
"grad_norm": 1.6865909099578857,
"learning_rate": 1.0284951024042744e-05,
"loss": 0.2365,
"step": 9420
},
{
"epoch": 12.058823529411764,
"grad_norm": 0.12684215605258942,
"learning_rate": 1.0240427426536065e-05,
"loss": 0.3649,
"step": 9430
},
{
"epoch": 12.07161125319693,
"grad_norm": 0.053901903331279755,
"learning_rate": 1.0195903829029387e-05,
"loss": 0.4141,
"step": 9440
},
{
"epoch": 12.084398976982097,
"grad_norm": 0.8139101266860962,
"learning_rate": 1.0151380231522706e-05,
"loss": 0.3258,
"step": 9450
},
{
"epoch": 12.097186700767264,
"grad_norm": 7.989099685801193e-05,
"learning_rate": 1.0106856634016029e-05,
"loss": 0.5458,
"step": 9460
},
{
"epoch": 12.10997442455243,
"grad_norm": 1.4419445991516113,
"learning_rate": 1.006233303650935e-05,
"loss": 0.4101,
"step": 9470
},
{
"epoch": 12.122762148337596,
"grad_norm": 8.941499710083008,
"learning_rate": 1.0017809439002672e-05,
"loss": 0.3453,
"step": 9480
},
{
"epoch": 12.135549872122763,
"grad_norm": 0.2980097532272339,
"learning_rate": 9.973285841495993e-06,
"loss": 0.1952,
"step": 9490
},
{
"epoch": 12.148337595907929,
"grad_norm": 7.380556583404541,
"learning_rate": 9.928762243989314e-06,
"loss": 0.3819,
"step": 9500
},
{
"epoch": 12.161125319693095,
"grad_norm": 0.3187771439552307,
"learning_rate": 9.884238646482637e-06,
"loss": 0.3993,
"step": 9510
},
{
"epoch": 12.173913043478262,
"grad_norm": 3.5564401149749756,
"learning_rate": 9.839715048975958e-06,
"loss": 0.4138,
"step": 9520
},
{
"epoch": 12.186700767263428,
"grad_norm": 0.04383537545800209,
"learning_rate": 9.79519145146928e-06,
"loss": 0.2085,
"step": 9530
},
{
"epoch": 12.199488491048593,
"grad_norm": 8.45487117767334,
"learning_rate": 9.7506678539626e-06,
"loss": 0.3879,
"step": 9540
},
{
"epoch": 12.212276214833759,
"grad_norm": 0.07102257758378983,
"learning_rate": 9.706144256455922e-06,
"loss": 0.2747,
"step": 9550
},
{
"epoch": 12.225063938618925,
"grad_norm": 8.977646827697754,
"learning_rate": 9.661620658949245e-06,
"loss": 0.5841,
"step": 9560
},
{
"epoch": 12.237851662404092,
"grad_norm": 6.482713222503662,
"learning_rate": 9.617097061442565e-06,
"loss": 0.5037,
"step": 9570
},
{
"epoch": 12.250639386189258,
"grad_norm": 0.6768947243690491,
"learning_rate": 9.572573463935886e-06,
"loss": 0.629,
"step": 9580
},
{
"epoch": 12.263427109974424,
"grad_norm": 2.5135774194495752e-05,
"learning_rate": 9.528049866429207e-06,
"loss": 0.3964,
"step": 9590
},
{
"epoch": 12.27621483375959,
"grad_norm": 7.924706935882568,
"learning_rate": 9.48352626892253e-06,
"loss": 0.4249,
"step": 9600
},
{
"epoch": 12.289002557544757,
"grad_norm": 0.7605132460594177,
"learning_rate": 9.43900267141585e-06,
"loss": 0.2857,
"step": 9610
},
{
"epoch": 12.301790281329923,
"grad_norm": 0.39697137475013733,
"learning_rate": 9.394479073909172e-06,
"loss": 0.2537,
"step": 9620
},
{
"epoch": 12.31457800511509,
"grad_norm": 0.43480339646339417,
"learning_rate": 9.349955476402494e-06,
"loss": 0.2062,
"step": 9630
},
{
"epoch": 12.327365728900256,
"grad_norm": 2.8665430545806885,
"learning_rate": 9.305431878895815e-06,
"loss": 0.5541,
"step": 9640
},
{
"epoch": 12.340153452685422,
"grad_norm": 1.6264675855636597,
"learning_rate": 9.260908281389138e-06,
"loss": 0.366,
"step": 9650
},
{
"epoch": 12.352941176470589,
"grad_norm": 9.299280166625977,
"learning_rate": 9.216384683882458e-06,
"loss": 0.6633,
"step": 9660
},
{
"epoch": 12.365728900255755,
"grad_norm": 0.8438981175422668,
"learning_rate": 9.17186108637578e-06,
"loss": 0.5287,
"step": 9670
},
{
"epoch": 12.378516624040921,
"grad_norm": 13.061861038208008,
"learning_rate": 9.1273374888691e-06,
"loss": 0.3324,
"step": 9680
},
{
"epoch": 12.391304347826088,
"grad_norm": 3.5456817150115967,
"learning_rate": 9.082813891362423e-06,
"loss": 0.3801,
"step": 9690
},
{
"epoch": 12.404092071611252,
"grad_norm": 5.760250091552734,
"learning_rate": 9.038290293855745e-06,
"loss": 0.4844,
"step": 9700
},
{
"epoch": 12.416879795396419,
"grad_norm": 6.475959777832031,
"learning_rate": 8.993766696349064e-06,
"loss": 0.4455,
"step": 9710
},
{
"epoch": 12.429667519181585,
"grad_norm": 3.8550329208374023,
"learning_rate": 8.949243098842387e-06,
"loss": 0.4175,
"step": 9720
},
{
"epoch": 12.442455242966751,
"grad_norm": 2.0850658416748047,
"learning_rate": 8.904719501335708e-06,
"loss": 0.361,
"step": 9730
},
{
"epoch": 12.455242966751918,
"grad_norm": 4.074941158294678,
"learning_rate": 8.86019590382903e-06,
"loss": 0.2447,
"step": 9740
},
{
"epoch": 12.468030690537084,
"grad_norm": 1.6458179950714111,
"learning_rate": 8.815672306322351e-06,
"loss": 0.4367,
"step": 9750
},
{
"epoch": 12.48081841432225,
"grad_norm": 1.9982742071151733,
"learning_rate": 8.771148708815672e-06,
"loss": 0.3444,
"step": 9760
},
{
"epoch": 12.493606138107417,
"grad_norm": 6.526026725769043,
"learning_rate": 8.726625111308995e-06,
"loss": 0.5819,
"step": 9770
},
{
"epoch": 12.506393861892583,
"grad_norm": 9.310763359069824,
"learning_rate": 8.682101513802316e-06,
"loss": 0.3059,
"step": 9780
},
{
"epoch": 12.51918158567775,
"grad_norm": 1.0071550607681274,
"learning_rate": 8.637577916295638e-06,
"loss": 0.3726,
"step": 9790
},
{
"epoch": 12.531969309462916,
"grad_norm": 0.882957935333252,
"learning_rate": 8.593054318788957e-06,
"loss": 0.4725,
"step": 9800
},
{
"epoch": 12.544757033248082,
"grad_norm": 1.2012654542922974,
"learning_rate": 8.54853072128228e-06,
"loss": 0.2205,
"step": 9810
},
{
"epoch": 12.557544757033249,
"grad_norm": 1.7305279970169067,
"learning_rate": 8.504007123775601e-06,
"loss": 0.4537,
"step": 9820
},
{
"epoch": 12.570332480818415,
"grad_norm": 4.674372673034668,
"learning_rate": 8.459483526268923e-06,
"loss": 0.4568,
"step": 9830
},
{
"epoch": 12.583120204603581,
"grad_norm": 6.6475138664245605,
"learning_rate": 8.414959928762244e-06,
"loss": 0.3144,
"step": 9840
},
{
"epoch": 12.595907928388748,
"grad_norm": 0.38528770208358765,
"learning_rate": 8.370436331255565e-06,
"loss": 0.4792,
"step": 9850
},
{
"epoch": 12.608695652173914,
"grad_norm": 0.9754725694656372,
"learning_rate": 8.325912733748888e-06,
"loss": 0.5235,
"step": 9860
},
{
"epoch": 12.621483375959079,
"grad_norm": 4.076246738433838,
"learning_rate": 8.281389136242209e-06,
"loss": 0.348,
"step": 9870
},
{
"epoch": 12.634271099744245,
"grad_norm": 1.0100876092910767,
"learning_rate": 8.23686553873553e-06,
"loss": 0.5218,
"step": 9880
},
{
"epoch": 12.647058823529411,
"grad_norm": 17.8681640625,
"learning_rate": 8.19234194122885e-06,
"loss": 0.6763,
"step": 9890
},
{
"epoch": 12.659846547314578,
"grad_norm": 6.97352933883667,
"learning_rate": 8.147818343722173e-06,
"loss": 0.4291,
"step": 9900
},
{
"epoch": 12.672634271099744,
"grad_norm": 0.3931565582752228,
"learning_rate": 8.103294746215496e-06,
"loss": 0.4243,
"step": 9910
},
{
"epoch": 12.68542199488491,
"grad_norm": 2.1343562602996826,
"learning_rate": 8.058771148708816e-06,
"loss": 0.3933,
"step": 9920
},
{
"epoch": 12.698209718670077,
"grad_norm": 5.404961109161377,
"learning_rate": 8.014247551202137e-06,
"loss": 0.4113,
"step": 9930
},
{
"epoch": 12.710997442455243,
"grad_norm": 0.09293472766876221,
"learning_rate": 7.969723953695458e-06,
"loss": 0.351,
"step": 9940
},
{
"epoch": 12.72378516624041,
"grad_norm": 0.13212403655052185,
"learning_rate": 7.92520035618878e-06,
"loss": 0.462,
"step": 9950
},
{
"epoch": 12.736572890025576,
"grad_norm": 5.489344120025635,
"learning_rate": 7.880676758682102e-06,
"loss": 0.3703,
"step": 9960
},
{
"epoch": 12.749360613810742,
"grad_norm": 1.962679386138916,
"learning_rate": 7.836153161175422e-06,
"loss": 0.2338,
"step": 9970
},
{
"epoch": 12.762148337595908,
"grad_norm": 9.600525856018066,
"learning_rate": 7.791629563668745e-06,
"loss": 0.43,
"step": 9980
},
{
"epoch": 12.774936061381075,
"grad_norm": 1.3438434600830078,
"learning_rate": 7.747105966162066e-06,
"loss": 0.2854,
"step": 9990
},
{
"epoch": 12.787723785166241,
"grad_norm": 0.0004410437832120806,
"learning_rate": 7.702582368655389e-06,
"loss": 0.3249,
"step": 10000
},
{
"epoch": 12.800511508951407,
"grad_norm": 0.4983418881893158,
"learning_rate": 7.65805877114871e-06,
"loss": 0.3296,
"step": 10010
},
{
"epoch": 12.813299232736572,
"grad_norm": 0.41612160205841064,
"learning_rate": 7.613535173642031e-06,
"loss": 0.5248,
"step": 10020
},
{
"epoch": 12.826086956521738,
"grad_norm": 13.50173568725586,
"learning_rate": 7.569011576135353e-06,
"loss": 0.5579,
"step": 10030
},
{
"epoch": 12.838874680306905,
"grad_norm": 3.2554118633270264,
"learning_rate": 7.524487978628674e-06,
"loss": 0.6241,
"step": 10040
},
{
"epoch": 12.851662404092071,
"grad_norm": 1.226417064666748,
"learning_rate": 7.4799643811219954e-06,
"loss": 0.2834,
"step": 10050
},
{
"epoch": 12.864450127877237,
"grad_norm": 2.9790737628936768,
"learning_rate": 7.435440783615316e-06,
"loss": 0.4032,
"step": 10060
},
{
"epoch": 12.877237851662404,
"grad_norm": 13.057442665100098,
"learning_rate": 7.390917186108638e-06,
"loss": 0.247,
"step": 10070
},
{
"epoch": 12.89002557544757,
"grad_norm": 5.512662410736084,
"learning_rate": 7.346393588601959e-06,
"loss": 0.3662,
"step": 10080
},
{
"epoch": 12.902813299232736,
"grad_norm": 1.990576148033142,
"learning_rate": 7.301869991095281e-06,
"loss": 0.5448,
"step": 10090
},
{
"epoch": 12.915601023017903,
"grad_norm": 0.43409115076065063,
"learning_rate": 7.257346393588603e-06,
"loss": 0.6568,
"step": 10100
},
{
"epoch": 12.92838874680307,
"grad_norm": 1.7592841386795044,
"learning_rate": 7.212822796081923e-06,
"loss": 0.4341,
"step": 10110
},
{
"epoch": 12.941176470588236,
"grad_norm": 5.928600788116455,
"learning_rate": 7.168299198575246e-06,
"loss": 0.6926,
"step": 10120
},
{
"epoch": 12.953964194373402,
"grad_norm": 0.49512559175491333,
"learning_rate": 7.123775601068567e-06,
"loss": 0.3134,
"step": 10130
},
{
"epoch": 12.966751918158568,
"grad_norm": 0.061214692890644073,
"learning_rate": 7.079252003561888e-06,
"loss": 0.8422,
"step": 10140
},
{
"epoch": 12.979539641943735,
"grad_norm": 10.013786315917969,
"learning_rate": 7.034728406055209e-06,
"loss": 0.4474,
"step": 10150
},
{
"epoch": 12.992327365728901,
"grad_norm": 2.202415943145752,
"learning_rate": 6.990204808548531e-06,
"loss": 0.23,
"step": 10160
},
{
"epoch": 13.0,
"eval_loss": 0.23341350257396698,
"eval_runtime": 0.9774,
"eval_samples_per_second": 100.265,
"eval_steps_per_second": 13.301,
"step": 10166
},
{
"epoch": 13.005115089514067,
"grad_norm": 8.305723190307617,
"learning_rate": 6.9456812110418536e-06,
"loss": 0.5032,
"step": 10170
},
{
"epoch": 13.017902813299234,
"grad_norm": 9.274243354797363,
"learning_rate": 6.901157613535174e-06,
"loss": 0.2192,
"step": 10180
},
{
"epoch": 13.030690537084398,
"grad_norm": 2.3518593311309814,
"learning_rate": 6.856634016028496e-06,
"loss": 0.5402,
"step": 10190
},
{
"epoch": 13.043478260869565,
"grad_norm": 0.012316963635385036,
"learning_rate": 6.812110418521816e-06,
"loss": 0.4061,
"step": 10200
},
{
"epoch": 13.05626598465473,
"grad_norm": 6.4899582862854,
"learning_rate": 6.767586821015139e-06,
"loss": 0.3366,
"step": 10210
},
{
"epoch": 13.069053708439897,
"grad_norm": 0.12233175337314606,
"learning_rate": 6.72306322350846e-06,
"loss": 0.1374,
"step": 10220
},
{
"epoch": 13.081841432225064,
"grad_norm": 0.4392085075378418,
"learning_rate": 6.678539626001781e-06,
"loss": 0.1306,
"step": 10230
},
{
"epoch": 13.09462915601023,
"grad_norm": 13.07235050201416,
"learning_rate": 6.634016028495103e-06,
"loss": 0.4273,
"step": 10240
},
{
"epoch": 13.107416879795396,
"grad_norm": 7.395537376403809,
"learning_rate": 6.589492430988424e-06,
"loss": 0.5841,
"step": 10250
},
{
"epoch": 13.120204603580563,
"grad_norm": 10.473769187927246,
"learning_rate": 6.544968833481746e-06,
"loss": 0.6586,
"step": 10260
},
{
"epoch": 13.132992327365729,
"grad_norm": 1.897230863571167,
"learning_rate": 6.5004452359750666e-06,
"loss": 0.5209,
"step": 10270
},
{
"epoch": 13.145780051150895,
"grad_norm": 4.206538200378418,
"learning_rate": 6.455921638468389e-06,
"loss": 0.3086,
"step": 10280
},
{
"epoch": 13.158567774936062,
"grad_norm": 5.3881001472473145,
"learning_rate": 6.411398040961709e-06,
"loss": 0.614,
"step": 10290
},
{
"epoch": 13.171355498721228,
"grad_norm": 0.24799497425556183,
"learning_rate": 6.366874443455032e-06,
"loss": 0.0553,
"step": 10300
},
{
"epoch": 13.184143222506394,
"grad_norm": 7.40368127822876,
"learning_rate": 6.3223508459483535e-06,
"loss": 0.5088,
"step": 10310
},
{
"epoch": 13.19693094629156,
"grad_norm": 0.08739714324474335,
"learning_rate": 6.277827248441674e-06,
"loss": 0.092,
"step": 10320
},
{
"epoch": 13.209718670076727,
"grad_norm": 1.746079921722412,
"learning_rate": 6.233303650934995e-06,
"loss": 0.2567,
"step": 10330
},
{
"epoch": 13.222506393861893,
"grad_norm": 0.45384278893470764,
"learning_rate": 6.188780053428318e-06,
"loss": 0.0995,
"step": 10340
},
{
"epoch": 13.235294117647058,
"grad_norm": 1.0237295627593994,
"learning_rate": 6.144256455921639e-06,
"loss": 0.6154,
"step": 10350
},
{
"epoch": 13.248081841432224,
"grad_norm": 6.016015529632568,
"learning_rate": 6.09973285841496e-06,
"loss": 0.623,
"step": 10360
},
{
"epoch": 13.26086956521739,
"grad_norm": 3.6509177684783936,
"learning_rate": 6.055209260908281e-06,
"loss": 0.3978,
"step": 10370
},
{
"epoch": 13.273657289002557,
"grad_norm": 3.9235923290252686,
"learning_rate": 6.010685663401603e-06,
"loss": 0.8585,
"step": 10380
},
{
"epoch": 13.286445012787723,
"grad_norm": 4.775753974914551,
"learning_rate": 5.966162065894925e-06,
"loss": 0.3978,
"step": 10390
},
{
"epoch": 13.29923273657289,
"grad_norm": 11.553483009338379,
"learning_rate": 5.9216384683882456e-06,
"loss": 0.424,
"step": 10400
},
{
"epoch": 13.312020460358056,
"grad_norm": 3.354985237121582,
"learning_rate": 5.877114870881568e-06,
"loss": 0.539,
"step": 10410
},
{
"epoch": 13.324808184143222,
"grad_norm": 0.004566879011690617,
"learning_rate": 5.832591273374889e-06,
"loss": 0.5421,
"step": 10420
},
{
"epoch": 13.337595907928389,
"grad_norm": 13.376380920410156,
"learning_rate": 5.788067675868211e-06,
"loss": 0.8866,
"step": 10430
},
{
"epoch": 13.350383631713555,
"grad_norm": 5.068173408508301,
"learning_rate": 5.743544078361532e-06,
"loss": 0.4386,
"step": 10440
},
{
"epoch": 13.363171355498721,
"grad_norm": 0.2643067538738251,
"learning_rate": 5.699020480854853e-06,
"loss": 0.4168,
"step": 10450
},
{
"epoch": 13.375959079283888,
"grad_norm": 6.765013694763184,
"learning_rate": 5.654496883348174e-06,
"loss": 0.4382,
"step": 10460
},
{
"epoch": 13.388746803069054,
"grad_norm": 0.811938464641571,
"learning_rate": 5.609973285841496e-06,
"loss": 0.4302,
"step": 10470
},
{
"epoch": 13.40153452685422,
"grad_norm": 2.1787633895874023,
"learning_rate": 5.565449688334818e-06,
"loss": 0.4329,
"step": 10480
},
{
"epoch": 13.414322250639387,
"grad_norm": 0.008785980753600597,
"learning_rate": 5.520926090828139e-06,
"loss": 0.2213,
"step": 10490
},
{
"epoch": 13.427109974424553,
"grad_norm": 3.6294312477111816,
"learning_rate": 5.476402493321461e-06,
"loss": 0.7337,
"step": 10500
},
{
"epoch": 13.43989769820972,
"grad_norm": 1.247524619102478,
"learning_rate": 5.431878895814782e-06,
"loss": 0.4815,
"step": 10510
},
{
"epoch": 13.452685421994884,
"grad_norm": 0.05368124693632126,
"learning_rate": 5.387355298308104e-06,
"loss": 0.3219,
"step": 10520
},
{
"epoch": 13.46547314578005,
"grad_norm": 3.9428138732910156,
"learning_rate": 5.342831700801425e-06,
"loss": 0.6114,
"step": 10530
},
{
"epoch": 13.478260869565217,
"grad_norm": 8.967109680175781,
"learning_rate": 5.298308103294747e-06,
"loss": 0.6697,
"step": 10540
},
{
"epoch": 13.491048593350383,
"grad_norm": 4.612414836883545,
"learning_rate": 5.253784505788068e-06,
"loss": 0.2481,
"step": 10550
},
{
"epoch": 13.50383631713555,
"grad_norm": 0.3696252107620239,
"learning_rate": 5.20926090828139e-06,
"loss": 0.4671,
"step": 10560
},
{
"epoch": 13.516624040920716,
"grad_norm": 2.8999485969543457,
"learning_rate": 5.164737310774711e-06,
"loss": 0.2148,
"step": 10570
},
{
"epoch": 13.529411764705882,
"grad_norm": 0.0017122033750638366,
"learning_rate": 5.120213713268032e-06,
"loss": 0.328,
"step": 10580
},
{
"epoch": 13.542199488491049,
"grad_norm": 0.07302047312259674,
"learning_rate": 5.075690115761353e-06,
"loss": 0.2222,
"step": 10590
},
{
"epoch": 13.554987212276215,
"grad_norm": 77.11892700195312,
"learning_rate": 5.031166518254675e-06,
"loss": 0.3489,
"step": 10600
},
{
"epoch": 13.567774936061381,
"grad_norm": 13.24903678894043,
"learning_rate": 4.986642920747997e-06,
"loss": 0.3993,
"step": 10610
},
{
"epoch": 13.580562659846548,
"grad_norm": 0.006769936066120863,
"learning_rate": 4.942119323241318e-06,
"loss": 0.2839,
"step": 10620
},
{
"epoch": 13.593350383631714,
"grad_norm": 6.966930389404297,
"learning_rate": 4.89759572573464e-06,
"loss": 0.5219,
"step": 10630
},
{
"epoch": 13.60613810741688,
"grad_norm": 5.570155620574951,
"learning_rate": 4.853072128227961e-06,
"loss": 0.2924,
"step": 10640
},
{
"epoch": 13.618925831202047,
"grad_norm": 8.221465110778809,
"learning_rate": 4.808548530721283e-06,
"loss": 0.4148,
"step": 10650
},
{
"epoch": 13.631713554987213,
"grad_norm": 6.763041019439697,
"learning_rate": 4.764024933214604e-06,
"loss": 0.3678,
"step": 10660
},
{
"epoch": 13.644501278772378,
"grad_norm": 5.139638423919678,
"learning_rate": 4.719501335707925e-06,
"loss": 0.3983,
"step": 10670
},
{
"epoch": 13.657289002557544,
"grad_norm": 0.2467830628156662,
"learning_rate": 4.674977738201247e-06,
"loss": 0.4656,
"step": 10680
},
{
"epoch": 13.67007672634271,
"grad_norm": 2.647254705429077,
"learning_rate": 4.630454140694569e-06,
"loss": 0.5215,
"step": 10690
},
{
"epoch": 13.682864450127877,
"grad_norm": 8.770064353942871,
"learning_rate": 4.58593054318789e-06,
"loss": 0.4681,
"step": 10700
},
{
"epoch": 13.695652173913043,
"grad_norm": 0.30153679847717285,
"learning_rate": 4.541406945681211e-06,
"loss": 0.2378,
"step": 10710
},
{
"epoch": 13.70843989769821,
"grad_norm": 0.015129966661334038,
"learning_rate": 4.496883348174532e-06,
"loss": 0.2995,
"step": 10720
},
{
"epoch": 13.721227621483376,
"grad_norm": 8.25349235534668,
"learning_rate": 4.452359750667854e-06,
"loss": 0.5158,
"step": 10730
},
{
"epoch": 13.734015345268542,
"grad_norm": 2.6685609817504883,
"learning_rate": 4.407836153161176e-06,
"loss": 0.3549,
"step": 10740
},
{
"epoch": 13.746803069053708,
"grad_norm": 0.4903467297554016,
"learning_rate": 4.363312555654497e-06,
"loss": 0.1934,
"step": 10750
},
{
"epoch": 13.759590792838875,
"grad_norm": 0.016465764492750168,
"learning_rate": 4.318788958147819e-06,
"loss": 0.3642,
"step": 10760
},
{
"epoch": 13.772378516624041,
"grad_norm": 11.288249015808105,
"learning_rate": 4.27426536064114e-06,
"loss": 0.6398,
"step": 10770
},
{
"epoch": 13.785166240409207,
"grad_norm": 0.20837096869945526,
"learning_rate": 4.229741763134462e-06,
"loss": 0.1693,
"step": 10780
},
{
"epoch": 13.797953964194374,
"grad_norm": 0.0036407741717994213,
"learning_rate": 4.185218165627783e-06,
"loss": 0.1372,
"step": 10790
},
{
"epoch": 13.81074168797954,
"grad_norm": 3.989978790283203,
"learning_rate": 4.140694568121104e-06,
"loss": 0.3317,
"step": 10800
},
{
"epoch": 13.823529411764707,
"grad_norm": 0.8736965656280518,
"learning_rate": 4.096170970614425e-06,
"loss": 0.2362,
"step": 10810
},
{
"epoch": 13.836317135549873,
"grad_norm": 0.37958985567092896,
"learning_rate": 4.051647373107748e-06,
"loss": 0.2248,
"step": 10820
},
{
"epoch": 13.84910485933504,
"grad_norm": 0.4776633381843567,
"learning_rate": 4.007123775601069e-06,
"loss": 0.2542,
"step": 10830
},
{
"epoch": 13.861892583120204,
"grad_norm": 2.976607084274292,
"learning_rate": 3.96260017809439e-06,
"loss": 0.3775,
"step": 10840
},
{
"epoch": 13.87468030690537,
"grad_norm": 2.79518723487854,
"learning_rate": 3.918076580587711e-06,
"loss": 0.283,
"step": 10850
},
{
"epoch": 13.887468030690536,
"grad_norm": 7.698398590087891,
"learning_rate": 3.873552983081033e-06,
"loss": 0.3615,
"step": 10860
},
{
"epoch": 13.900255754475703,
"grad_norm": 5.496623992919922,
"learning_rate": 3.829029385574355e-06,
"loss": 0.6363,
"step": 10870
},
{
"epoch": 13.91304347826087,
"grad_norm": 2.927433490753174,
"learning_rate": 3.7845057880676764e-06,
"loss": 0.45,
"step": 10880
},
{
"epoch": 13.925831202046036,
"grad_norm": 0.43909209966659546,
"learning_rate": 3.7399821905609977e-06,
"loss": 0.5708,
"step": 10890
},
{
"epoch": 13.938618925831202,
"grad_norm": 0.40781036019325256,
"learning_rate": 3.695458593054319e-06,
"loss": 0.2988,
"step": 10900
},
{
"epoch": 13.951406649616368,
"grad_norm": 0.11614171415567398,
"learning_rate": 3.6509349955476403e-06,
"loss": 0.3176,
"step": 10910
},
{
"epoch": 13.964194373401535,
"grad_norm": 7.913348197937012,
"learning_rate": 3.6064113980409616e-06,
"loss": 0.3468,
"step": 10920
},
{
"epoch": 13.976982097186701,
"grad_norm": 0.017280923202633858,
"learning_rate": 3.5618878005342833e-06,
"loss": 0.4185,
"step": 10930
},
{
"epoch": 13.989769820971867,
"grad_norm": 9.15585994720459,
"learning_rate": 3.5173642030276046e-06,
"loss": 0.5216,
"step": 10940
},
{
"epoch": 14.0,
"eval_loss": 0.23057223856449127,
"eval_runtime": 0.9717,
"eval_samples_per_second": 100.856,
"eval_steps_per_second": 13.379,
"step": 10948
},
{
"epoch": 14.002557544757034,
"grad_norm": 9.044001579284668,
"learning_rate": 3.4728406055209268e-06,
"loss": 0.5606,
"step": 10950
},
{
"epoch": 14.0153452685422,
"grad_norm": 1.5585741996765137,
"learning_rate": 3.428317008014248e-06,
"loss": 0.1146,
"step": 10960
},
{
"epoch": 14.028132992327366,
"grad_norm": 0.24437369406223297,
"learning_rate": 3.3837934105075694e-06,
"loss": 0.7053,
"step": 10970
},
{
"epoch": 14.040920716112533,
"grad_norm": 0.48745203018188477,
"learning_rate": 3.3392698130008907e-06,
"loss": 0.5357,
"step": 10980
},
{
"epoch": 14.053708439897699,
"grad_norm": 0.3812559247016907,
"learning_rate": 3.294746215494212e-06,
"loss": 0.2654,
"step": 10990
},
{
"epoch": 14.066496163682864,
"grad_norm": 4.564992904663086,
"learning_rate": 3.2502226179875333e-06,
"loss": 0.6689,
"step": 11000
},
{
"epoch": 14.07928388746803,
"grad_norm": 7.109955310821533,
"learning_rate": 3.2056990204808546e-06,
"loss": 0.4216,
"step": 11010
},
{
"epoch": 14.092071611253196,
"grad_norm": 3.396707773208618,
"learning_rate": 3.1611754229741767e-06,
"loss": 0.4376,
"step": 11020
},
{
"epoch": 14.104859335038363,
"grad_norm": 2.241431951522827,
"learning_rate": 3.1166518254674976e-06,
"loss": 0.4878,
"step": 11030
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.006932465359568596,
"learning_rate": 3.0721282279608193e-06,
"loss": 0.6128,
"step": 11040
},
{
"epoch": 14.130434782608695,
"grad_norm": 3.4660990238189697,
"learning_rate": 3.0276046304541406e-06,
"loss": 0.3435,
"step": 11050
},
{
"epoch": 14.143222506393862,
"grad_norm": 2.24495792388916,
"learning_rate": 2.9830810329474623e-06,
"loss": 0.3927,
"step": 11060
},
{
"epoch": 14.156010230179028,
"grad_norm": 11.082945823669434,
"learning_rate": 2.938557435440784e-06,
"loss": 0.2957,
"step": 11070
},
{
"epoch": 14.168797953964194,
"grad_norm": 7.921354293823242,
"learning_rate": 2.8940338379341054e-06,
"loss": 0.4532,
"step": 11080
},
{
"epoch": 14.18158567774936,
"grad_norm": 7.9056806564331055,
"learning_rate": 2.8495102404274267e-06,
"loss": 0.5782,
"step": 11090
},
{
"epoch": 14.194373401534527,
"grad_norm": 9.842604637145996,
"learning_rate": 2.804986642920748e-06,
"loss": 0.4558,
"step": 11100
},
{
"epoch": 14.207161125319693,
"grad_norm": 9.401638984680176,
"learning_rate": 2.7604630454140697e-06,
"loss": 0.3192,
"step": 11110
},
{
"epoch": 14.21994884910486,
"grad_norm": 1.4201183319091797,
"learning_rate": 2.715939447907391e-06,
"loss": 0.1412,
"step": 11120
},
{
"epoch": 14.232736572890026,
"grad_norm": 3.4593756198883057,
"learning_rate": 2.6714158504007123e-06,
"loss": 0.3824,
"step": 11130
},
{
"epoch": 14.245524296675192,
"grad_norm": 7.897853851318359,
"learning_rate": 2.626892252894034e-06,
"loss": 0.3943,
"step": 11140
},
{
"epoch": 14.258312020460359,
"grad_norm": 3.6695263385772705,
"learning_rate": 2.5823686553873553e-06,
"loss": 0.2146,
"step": 11150
},
{
"epoch": 14.271099744245525,
"grad_norm": 1.173981785774231,
"learning_rate": 2.5378450578806766e-06,
"loss": 0.3024,
"step": 11160
},
{
"epoch": 14.28388746803069,
"grad_norm": 0.7691462635993958,
"learning_rate": 2.4933214603739983e-06,
"loss": 0.2728,
"step": 11170
},
{
"epoch": 14.296675191815856,
"grad_norm": 6.5430169105529785,
"learning_rate": 2.44879786286732e-06,
"loss": 0.2054,
"step": 11180
},
{
"epoch": 14.309462915601022,
"grad_norm": 6.1036505699157715,
"learning_rate": 2.4042742653606414e-06,
"loss": 0.5137,
"step": 11190
},
{
"epoch": 14.322250639386189,
"grad_norm": 6.128425598144531,
"learning_rate": 2.3597506678539627e-06,
"loss": 0.727,
"step": 11200
},
{
"epoch": 14.335038363171355,
"grad_norm": 5.091713905334473,
"learning_rate": 2.3152270703472844e-06,
"loss": 0.405,
"step": 11210
},
{
"epoch": 14.347826086956522,
"grad_norm": 5.891820430755615,
"learning_rate": 2.2707034728406057e-06,
"loss": 0.3249,
"step": 11220
},
{
"epoch": 14.360613810741688,
"grad_norm": 7.751905918121338,
"learning_rate": 2.226179875333927e-06,
"loss": 0.375,
"step": 11230
},
{
"epoch": 14.373401534526854,
"grad_norm": 2.656019687652588,
"learning_rate": 2.1816562778272487e-06,
"loss": 0.3961,
"step": 11240
},
{
"epoch": 14.38618925831202,
"grad_norm": 4.47562313079834,
"learning_rate": 2.13713268032057e-06,
"loss": 0.2487,
"step": 11250
},
{
"epoch": 14.398976982097187,
"grad_norm": 0.4639877676963806,
"learning_rate": 2.0926090828138913e-06,
"loss": 0.3217,
"step": 11260
},
{
"epoch": 14.411764705882353,
"grad_norm": 3.0707197189331055,
"learning_rate": 2.0480854853072126e-06,
"loss": 0.2258,
"step": 11270
},
{
"epoch": 14.42455242966752,
"grad_norm": 1.7559298276901245,
"learning_rate": 2.0035618878005343e-06,
"loss": 0.3735,
"step": 11280
},
{
"epoch": 14.437340153452686,
"grad_norm": 3.869406223297119,
"learning_rate": 1.9590382902938556e-06,
"loss": 0.3116,
"step": 11290
},
{
"epoch": 14.450127877237852,
"grad_norm": 0.0035378236789256334,
"learning_rate": 1.9145146927871773e-06,
"loss": 0.5503,
"step": 11300
},
{
"epoch": 14.462915601023019,
"grad_norm": 10.881844520568848,
"learning_rate": 1.8699910952804989e-06,
"loss": 0.2782,
"step": 11310
},
{
"epoch": 14.475703324808185,
"grad_norm": 0.003400342771783471,
"learning_rate": 1.8254674977738202e-06,
"loss": 0.5413,
"step": 11320
},
{
"epoch": 14.48849104859335,
"grad_norm": 0.07425220310688019,
"learning_rate": 1.7809439002671417e-06,
"loss": 0.2179,
"step": 11330
},
{
"epoch": 14.501278772378516,
"grad_norm": 8.549428939819336,
"learning_rate": 1.7364203027604634e-06,
"loss": 0.4874,
"step": 11340
},
{
"epoch": 14.514066496163682,
"grad_norm": 2.6542978286743164,
"learning_rate": 1.6918967052537847e-06,
"loss": 0.4715,
"step": 11350
},
{
"epoch": 14.526854219948849,
"grad_norm": 2.429234266281128,
"learning_rate": 1.647373107747106e-06,
"loss": 0.2447,
"step": 11360
},
{
"epoch": 14.539641943734015,
"grad_norm": 0.5469067096710205,
"learning_rate": 1.6028495102404273e-06,
"loss": 0.2469,
"step": 11370
},
{
"epoch": 14.552429667519181,
"grad_norm": 3.950800657272339,
"learning_rate": 1.5583259127337488e-06,
"loss": 0.5352,
"step": 11380
},
{
"epoch": 14.565217391304348,
"grad_norm": 7.118297576904297,
"learning_rate": 1.5138023152270703e-06,
"loss": 0.5058,
"step": 11390
},
{
"epoch": 14.578005115089514,
"grad_norm": 0.0335397832095623,
"learning_rate": 1.469278717720392e-06,
"loss": 0.1879,
"step": 11400
},
{
"epoch": 14.59079283887468,
"grad_norm": 3.543941020965576,
"learning_rate": 1.4247551202137133e-06,
"loss": 0.2616,
"step": 11410
},
{
"epoch": 14.603580562659847,
"grad_norm": 8.556031227111816,
"learning_rate": 1.3802315227070348e-06,
"loss": 0.9001,
"step": 11420
},
{
"epoch": 14.616368286445013,
"grad_norm": 6.954402446746826,
"learning_rate": 1.3357079252003561e-06,
"loss": 0.5127,
"step": 11430
},
{
"epoch": 14.62915601023018,
"grad_norm": 1.0035245418548584,
"learning_rate": 1.2911843276936777e-06,
"loss": 0.3251,
"step": 11440
},
{
"epoch": 14.641943734015346,
"grad_norm": 1.676684021949768,
"learning_rate": 1.2466607301869992e-06,
"loss": 0.1386,
"step": 11450
},
{
"epoch": 14.654731457800512,
"grad_norm": 0.003588082268834114,
"learning_rate": 1.2021371326803207e-06,
"loss": 0.4051,
"step": 11460
},
{
"epoch": 14.667519181585678,
"grad_norm": 0.2613386809825897,
"learning_rate": 1.1576135351736422e-06,
"loss": 0.2099,
"step": 11470
},
{
"epoch": 14.680306905370845,
"grad_norm": 2.826719045639038,
"learning_rate": 1.1130899376669635e-06,
"loss": 0.4122,
"step": 11480
},
{
"epoch": 14.693094629156011,
"grad_norm": 0.5583800673484802,
"learning_rate": 1.068566340160285e-06,
"loss": 0.8122,
"step": 11490
},
{
"epoch": 14.705882352941176,
"grad_norm": 4.03103494644165,
"learning_rate": 1.0240427426536063e-06,
"loss": 0.6181,
"step": 11500
},
{
"epoch": 14.718670076726342,
"grad_norm": 1.6675188541412354,
"learning_rate": 9.795191451469278e-07,
"loss": 0.123,
"step": 11510
},
{
"epoch": 14.731457800511508,
"grad_norm": 4.71064567565918,
"learning_rate": 9.349955476402494e-07,
"loss": 0.2775,
"step": 11520
},
{
"epoch": 14.744245524296675,
"grad_norm": 12.104289054870605,
"learning_rate": 8.904719501335708e-07,
"loss": 0.6795,
"step": 11530
},
{
"epoch": 14.757033248081841,
"grad_norm": 5.273158550262451,
"learning_rate": 8.459483526268923e-07,
"loss": 0.3161,
"step": 11540
},
{
"epoch": 14.769820971867007,
"grad_norm": 4.798052787780762,
"learning_rate": 8.014247551202136e-07,
"loss": 0.7235,
"step": 11550
},
{
"epoch": 14.782608695652174,
"grad_norm": 0.33628830313682556,
"learning_rate": 7.569011576135352e-07,
"loss": 0.2189,
"step": 11560
},
{
"epoch": 14.79539641943734,
"grad_norm": 3.923007011413574,
"learning_rate": 7.123775601068567e-07,
"loss": 0.3109,
"step": 11570
},
{
"epoch": 14.808184143222507,
"grad_norm": 1.1927415132522583,
"learning_rate": 6.678539626001781e-07,
"loss": 0.3873,
"step": 11580
},
{
"epoch": 14.820971867007673,
"grad_norm": 0.0009544580243527889,
"learning_rate": 6.233303650934996e-07,
"loss": 0.203,
"step": 11590
},
{
"epoch": 14.83375959079284,
"grad_norm": 5.8102641105651855,
"learning_rate": 5.788067675868211e-07,
"loss": 0.3458,
"step": 11600
},
{
"epoch": 14.846547314578006,
"grad_norm": 0.0030290207359939814,
"learning_rate": 5.342831700801425e-07,
"loss": 0.6794,
"step": 11610
},
{
"epoch": 14.859335038363172,
"grad_norm": 9.012167930603027,
"learning_rate": 4.897595725734639e-07,
"loss": 0.4832,
"step": 11620
},
{
"epoch": 14.872122762148338,
"grad_norm": 18.302406311035156,
"learning_rate": 4.452359750667854e-07,
"loss": 0.6115,
"step": 11630
},
{
"epoch": 14.884910485933505,
"grad_norm": 3.3673505783081055,
"learning_rate": 4.007123775601068e-07,
"loss": 0.1894,
"step": 11640
},
{
"epoch": 14.89769820971867,
"grad_norm": 0.5946460366249084,
"learning_rate": 3.5618878005342833e-07,
"loss": 0.2352,
"step": 11650
},
{
"epoch": 14.910485933503836,
"grad_norm": 5.374065399169922,
"learning_rate": 3.116651825467498e-07,
"loss": 0.5661,
"step": 11660
},
{
"epoch": 14.923273657289002,
"grad_norm": 0.6203376054763794,
"learning_rate": 2.6714158504007125e-07,
"loss": 0.2715,
"step": 11670
},
{
"epoch": 14.936061381074168,
"grad_norm": 0.9134934544563293,
"learning_rate": 2.226179875333927e-07,
"loss": 0.3172,
"step": 11680
},
{
"epoch": 14.948849104859335,
"grad_norm": 0.07863820344209671,
"learning_rate": 1.7809439002671417e-07,
"loss": 0.4238,
"step": 11690
},
{
"epoch": 14.961636828644501,
"grad_norm": 3.0305428504943848,
"learning_rate": 1.3357079252003563e-07,
"loss": 0.5941,
"step": 11700
},
{
"epoch": 14.974424552429667,
"grad_norm": 3.0656425952911377,
"learning_rate": 8.904719501335708e-08,
"loss": 0.3601,
"step": 11710
},
{
"epoch": 14.987212276214834,
"grad_norm": 1.1194722652435303,
"learning_rate": 4.452359750667854e-08,
"loss": 0.3946,
"step": 11720
},
{
"epoch": 15.0,
"grad_norm": 0.5027629137039185,
"learning_rate": 0.0,
"loss": 0.5649,
"step": 11730
},
{
"epoch": 15.0,
"eval_loss": 0.22965534031391144,
"eval_runtime": 0.8342,
"eval_samples_per_second": 117.472,
"eval_steps_per_second": 15.583,
"step": 11730
}
],
"logging_steps": 10,
"max_steps": 11730,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1550558468505600.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}