Bakalarska_praca/results2/checkpoint-23460/trainer_state.json

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 23460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01278772378516624,
"grad_norm": 5291.65576171875,
"learning_rate": 1.0000000000000002e-06,
"loss": 46.8468,
"step": 10
},
{
"epoch": 0.02557544757033248,
"grad_norm": 657.4127807617188,
"learning_rate": 2.0000000000000003e-06,
"loss": 47.4188,
"step": 20
},
{
"epoch": 0.03836317135549872,
"grad_norm": 8509.5302734375,
"learning_rate": 3e-06,
"loss": 55.5624,
"step": 30
},
{
"epoch": 0.05115089514066496,
"grad_norm": 427.65924072265625,
"learning_rate": 4.000000000000001e-06,
"loss": 54.0157,
"step": 40
},
{
"epoch": 0.0639386189258312,
"grad_norm": 7448.37353515625,
"learning_rate": 5e-06,
"loss": 57.6548,
"step": 50
},
{
"epoch": 0.07672634271099744,
"grad_norm": 14357.810546875,
"learning_rate": 6e-06,
"loss": 45.0872,
"step": 60
},
{
"epoch": 0.08951406649616368,
"grad_norm": 4495.298828125,
"learning_rate": 7.000000000000001e-06,
"loss": 49.2105,
"step": 70
},
{
"epoch": 0.10230179028132992,
"grad_norm": 3922.909912109375,
"learning_rate": 8.000000000000001e-06,
"loss": 45.772,
"step": 80
},
{
"epoch": 0.11508951406649616,
"grad_norm": 47730.671875,
"learning_rate": 9e-06,
"loss": 54.9209,
"step": 90
},
{
"epoch": 0.1278772378516624,
"grad_norm": 27943.875,
"learning_rate": 1e-05,
"loss": 47.032,
"step": 100
},
{
"epoch": 0.14066496163682865,
"grad_norm": 185.7626953125,
"learning_rate": 1.1000000000000001e-05,
"loss": 55.6442,
"step": 110
},
{
"epoch": 0.1534526854219949,
"grad_norm": 4819.99365234375,
"learning_rate": 1.2e-05,
"loss": 47.2024,
"step": 120
},
{
"epoch": 0.16624040920716113,
"grad_norm": 16820.35546875,
"learning_rate": 1.3000000000000001e-05,
"loss": 47.1665,
"step": 130
},
{
"epoch": 0.17902813299232737,
"grad_norm": 408.82489013671875,
"learning_rate": 1.4000000000000001e-05,
"loss": 45.6075,
"step": 140
},
{
"epoch": 0.1918158567774936,
"grad_norm": 29451.880859375,
"learning_rate": 1.5e-05,
"loss": 50.9366,
"step": 150
},
{
"epoch": 0.20460358056265984,
"grad_norm": 28413.0390625,
"learning_rate": 1.6000000000000003e-05,
"loss": 44.6847,
"step": 160
},
{
"epoch": 0.21739130434782608,
"grad_norm": 799.9179077148438,
"learning_rate": 1.7000000000000003e-05,
"loss": 46.8802,
"step": 170
},
{
"epoch": 0.23017902813299232,
"grad_norm": 2510.53515625,
"learning_rate": 1.8e-05,
"loss": 46.2193,
"step": 180
},
{
"epoch": 0.24296675191815856,
"grad_norm": 2892.29248046875,
"learning_rate": 1.9e-05,
"loss": 41.2469,
"step": 190
},
{
"epoch": 0.2557544757033248,
"grad_norm": 738.676513671875,
"learning_rate": 2e-05,
"loss": 41.8658,
"step": 200
},
{
"epoch": 0.26854219948849106,
"grad_norm": 215.02032470703125,
"learning_rate": 2.1e-05,
"loss": 42.2854,
"step": 210
},
{
"epoch": 0.2813299232736573,
"grad_norm": 1281.8134765625,
"learning_rate": 2.2000000000000003e-05,
"loss": 48.7005,
"step": 220
},
{
"epoch": 0.29411764705882354,
"grad_norm": 677.4962158203125,
"learning_rate": 2.3000000000000003e-05,
"loss": 36.2834,
"step": 230
},
{
"epoch": 0.3069053708439898,
"grad_norm": 2442.72900390625,
"learning_rate": 2.4e-05,
"loss": 44.3992,
"step": 240
},
{
"epoch": 0.319693094629156,
"grad_norm": 135.88478088378906,
"learning_rate": 2.5e-05,
"loss": 42.5502,
"step": 250
},
{
"epoch": 0.33248081841432225,
"grad_norm": 5432.8203125,
"learning_rate": 2.6000000000000002e-05,
"loss": 41.375,
"step": 260
},
{
"epoch": 0.3452685421994885,
"grad_norm": 3573.05419921875,
"learning_rate": 2.7000000000000002e-05,
"loss": 40.6085,
"step": 270
},
{
"epoch": 0.35805626598465473,
"grad_norm": 806.6569213867188,
"learning_rate": 2.8000000000000003e-05,
"loss": 40.9676,
"step": 280
},
{
"epoch": 0.37084398976982097,
"grad_norm": 537.9384765625,
"learning_rate": 2.9e-05,
"loss": 36.0962,
"step": 290
},
{
"epoch": 0.3836317135549872,
"grad_norm": 1317.54150390625,
"learning_rate": 3e-05,
"loss": 38.838,
"step": 300
},
{
"epoch": 0.39641943734015345,
"grad_norm": 1044.3780517578125,
"learning_rate": 3.1e-05,
"loss": 35.0419,
"step": 310
},
{
"epoch": 0.4092071611253197,
"grad_norm": 6332.0888671875,
"learning_rate": 3.2000000000000005e-05,
"loss": 35.0986,
"step": 320
},
{
"epoch": 0.4219948849104859,
"grad_norm": 1020.596923828125,
"learning_rate": 3.3e-05,
"loss": 35.2819,
"step": 330
},
{
"epoch": 0.43478260869565216,
"grad_norm": 4903.22119140625,
"learning_rate": 3.4000000000000007e-05,
"loss": 27.0823,
"step": 340
},
{
"epoch": 0.4475703324808184,
"grad_norm": 705.4653930664062,
"learning_rate": 3.5e-05,
"loss": 35.8111,
"step": 350
},
{
"epoch": 0.46035805626598464,
"grad_norm": 166.60675048828125,
"learning_rate": 3.6e-05,
"loss": 32.9624,
"step": 360
},
{
"epoch": 0.4731457800511509,
"grad_norm": 1294.9737548828125,
"learning_rate": 3.7e-05,
"loss": 27.3774,
"step": 370
},
{
"epoch": 0.4859335038363171,
"grad_norm": 143.36048889160156,
"learning_rate": 3.8e-05,
"loss": 35.3593,
"step": 380
},
{
"epoch": 0.49872122762148335,
"grad_norm": 2351.956787109375,
"learning_rate": 3.9000000000000006e-05,
"loss": 31.6628,
"step": 390
},
{
"epoch": 0.5115089514066496,
"grad_norm": 1412.3145751953125,
"learning_rate": 4e-05,
"loss": 24.9052,
"step": 400
},
{
"epoch": 0.5242966751918159,
"grad_norm": 2179.52294921875,
"learning_rate": 4.1e-05,
"loss": 28.4615,
"step": 410
},
{
"epoch": 0.5370843989769821,
"grad_norm": 1550.5777587890625,
"learning_rate": 4.2e-05,
"loss": 27.8655,
"step": 420
},
{
"epoch": 0.5498721227621484,
"grad_norm": 14167.97265625,
"learning_rate": 4.3e-05,
"loss": 31.339,
"step": 430
},
{
"epoch": 0.5626598465473146,
"grad_norm": 1213.757568359375,
"learning_rate": 4.4000000000000006e-05,
"loss": 29.2414,
"step": 440
},
{
"epoch": 0.5754475703324808,
"grad_norm": 1910.325439453125,
"learning_rate": 4.5e-05,
"loss": 28.1563,
"step": 450
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2075.203369140625,
"learning_rate": 4.600000000000001e-05,
"loss": 30.5137,
"step": 460
},
{
"epoch": 0.6010230179028133,
"grad_norm": 2741.365966796875,
"learning_rate": 4.7e-05,
"loss": 23.7269,
"step": 470
},
{
"epoch": 0.6138107416879796,
"grad_norm": 6818.3935546875,
"learning_rate": 4.8e-05,
"loss": 20.7367,
"step": 480
},
{
"epoch": 0.6265984654731458,
"grad_norm": 14128.44140625,
"learning_rate": 4.9e-05,
"loss": 23.4038,
"step": 490
},
{
"epoch": 0.639386189258312,
"grad_norm": 268.3045654296875,
"learning_rate": 5e-05,
"loss": 25.6224,
"step": 500
},
{
"epoch": 0.6521739130434783,
"grad_norm": 8016.40673828125,
"learning_rate": 4.9978222996515684e-05,
"loss": 24.9662,
"step": 510
},
{
"epoch": 0.6649616368286445,
"grad_norm": 2014.55078125,
"learning_rate": 4.995644599303136e-05,
"loss": 27.3142,
"step": 520
},
{
"epoch": 0.6777493606138107,
"grad_norm": 2465.661376953125,
"learning_rate": 4.993466898954704e-05,
"loss": 24.8702,
"step": 530
},
{
"epoch": 0.690537084398977,
"grad_norm": 1037.0167236328125,
"learning_rate": 4.991289198606272e-05,
"loss": 22.7556,
"step": 540
},
{
"epoch": 0.7033248081841432,
"grad_norm": 11746.3017578125,
"learning_rate": 4.9891114982578404e-05,
"loss": 20.6752,
"step": 550
},
{
"epoch": 0.7161125319693095,
"grad_norm": 1548.5609130859375,
"learning_rate": 4.986933797909408e-05,
"loss": 21.2267,
"step": 560
},
{
"epoch": 0.7289002557544757,
"grad_norm": 2361.02392578125,
"learning_rate": 4.984756097560976e-05,
"loss": 20.3482,
"step": 570
},
{
"epoch": 0.7416879795396419,
"grad_norm": 11062.6640625,
"learning_rate": 4.9825783972125436e-05,
"loss": 19.0836,
"step": 580
},
{
"epoch": 0.7544757033248082,
"grad_norm": 553.4859619140625,
"learning_rate": 4.980400696864112e-05,
"loss": 19.9769,
"step": 590
},
{
"epoch": 0.7672634271099744,
"grad_norm": 497.746337890625,
"learning_rate": 4.978222996515679e-05,
"loss": 18.8568,
"step": 600
},
{
"epoch": 0.7800511508951407,
"grad_norm": 930.5474243164062,
"learning_rate": 4.9760452961672475e-05,
"loss": 20.3492,
"step": 610
},
{
"epoch": 0.7928388746803069,
"grad_norm": 585.6785278320312,
"learning_rate": 4.9738675958188156e-05,
"loss": 16.9816,
"step": 620
},
{
"epoch": 0.8056265984654731,
"grad_norm": 1422.135009765625,
"learning_rate": 4.971689895470383e-05,
"loss": 17.72,
"step": 630
},
{
"epoch": 0.8184143222506394,
"grad_norm": 375.1902770996094,
"learning_rate": 4.969512195121951e-05,
"loss": 17.7093,
"step": 640
},
{
"epoch": 0.8312020460358056,
"grad_norm": 4669.2646484375,
"learning_rate": 4.9673344947735195e-05,
"loss": 16.5356,
"step": 650
},
{
"epoch": 0.8439897698209718,
"grad_norm": 3034.7099609375,
"learning_rate": 4.965156794425087e-05,
"loss": 17.4828,
"step": 660
},
{
"epoch": 0.8567774936061381,
"grad_norm": 2345.01708984375,
"learning_rate": 4.962979094076655e-05,
"loss": 15.835,
"step": 670
},
{
"epoch": 0.8695652173913043,
"grad_norm": 363.0520935058594,
"learning_rate": 4.960801393728223e-05,
"loss": 15.788,
"step": 680
},
{
"epoch": 0.8823529411764706,
"grad_norm": 491.0669250488281,
"learning_rate": 4.9586236933797915e-05,
"loss": 16.4292,
"step": 690
},
{
"epoch": 0.8951406649616368,
"grad_norm": 415.5701904296875,
"learning_rate": 4.956445993031359e-05,
"loss": 15.2496,
"step": 700
},
{
"epoch": 0.907928388746803,
"grad_norm": 361.4362487792969,
"learning_rate": 4.954268292682927e-05,
"loss": 14.1571,
"step": 710
},
{
"epoch": 0.9207161125319693,
"grad_norm": 3015.107177734375,
"learning_rate": 4.952090592334495e-05,
"loss": 17.2341,
"step": 720
},
{
"epoch": 0.9335038363171355,
"grad_norm": 86.11393737792969,
"learning_rate": 4.9499128919860635e-05,
"loss": 14.5296,
"step": 730
},
{
"epoch": 0.9462915601023018,
"grad_norm": 47.42806625366211,
"learning_rate": 4.947735191637631e-05,
"loss": 14.4388,
"step": 740
},
{
"epoch": 0.959079283887468,
"grad_norm": 150.82362365722656,
"learning_rate": 4.945557491289199e-05,
"loss": 13.377,
"step": 750
},
{
"epoch": 0.9718670076726342,
"grad_norm": 136.4108428955078,
"learning_rate": 4.943379790940767e-05,
"loss": 13.7265,
"step": 760
},
{
"epoch": 0.9846547314578005,
"grad_norm": 374.7745056152344,
"learning_rate": 4.941202090592335e-05,
"loss": 13.6729,
"step": 770
},
{
"epoch": 0.9974424552429667,
"grad_norm": 809.5958862304688,
"learning_rate": 4.9390243902439024e-05,
"loss": 12.0761,
"step": 780
},
{
"epoch": 1.0,
"eval_loss": 7.0809831619262695,
"eval_runtime": 0.8598,
"eval_samples_per_second": 113.984,
"eval_steps_per_second": 15.12,
"step": 782
},
{
"epoch": 1.010230179028133,
"grad_norm": 3061.148193359375,
"learning_rate": 4.9368466898954705e-05,
"loss": 11.7078,
"step": 790
},
{
"epoch": 1.0230179028132993,
"grad_norm": 134.5844268798828,
"learning_rate": 4.934668989547039e-05,
"loss": 10.9549,
"step": 800
},
{
"epoch": 1.0358056265984654,
"grad_norm": 300.20782470703125,
"learning_rate": 4.932491289198606e-05,
"loss": 12.3417,
"step": 810
},
{
"epoch": 1.0485933503836318,
"grad_norm": 156.21311950683594,
"learning_rate": 4.9303135888501744e-05,
"loss": 11.6636,
"step": 820
},
{
"epoch": 1.061381074168798,
"grad_norm": 164.83450317382812,
"learning_rate": 4.9281358885017425e-05,
"loss": 11.6409,
"step": 830
},
{
"epoch": 1.0741687979539642,
"grad_norm": 137.92869567871094,
"learning_rate": 4.92595818815331e-05,
"loss": 11.9903,
"step": 840
},
{
"epoch": 1.0869565217391304,
"grad_norm": 103.73356628417969,
"learning_rate": 4.923780487804878e-05,
"loss": 8.9094,
"step": 850
},
{
"epoch": 1.0997442455242967,
"grad_norm": 64.21759033203125,
"learning_rate": 4.9216027874564464e-05,
"loss": 9.6173,
"step": 860
},
{
"epoch": 1.1125319693094629,
"grad_norm": 695.460205078125,
"learning_rate": 4.9194250871080146e-05,
"loss": 9.6963,
"step": 870
},
{
"epoch": 1.1253196930946292,
"grad_norm": 3314.6865234375,
"learning_rate": 4.917247386759582e-05,
"loss": 10.868,
"step": 880
},
{
"epoch": 1.1381074168797953,
"grad_norm": 148.59613037109375,
"learning_rate": 4.91506968641115e-05,
"loss": 8.8247,
"step": 890
},
{
"epoch": 1.1508951406649617,
"grad_norm": 330.5743103027344,
"learning_rate": 4.9128919860627184e-05,
"loss": 10.6854,
"step": 900
},
{
"epoch": 1.1636828644501278,
"grad_norm": 40.26800537109375,
"learning_rate": 4.910714285714286e-05,
"loss": 9.2873,
"step": 910
},
{
"epoch": 1.1764705882352942,
"grad_norm": 91.26012420654297,
"learning_rate": 4.908536585365854e-05,
"loss": 10.0678,
"step": 920
},
{
"epoch": 1.1892583120204603,
"grad_norm": 516.1912841796875,
"learning_rate": 4.9063588850174216e-05,
"loss": 9.0135,
"step": 930
},
{
"epoch": 1.2020460358056266,
"grad_norm": 70.36582946777344,
"learning_rate": 4.90418118466899e-05,
"loss": 8.7321,
"step": 940
},
{
"epoch": 1.2148337595907928,
"grad_norm": 80.45369720458984,
"learning_rate": 4.902003484320557e-05,
"loss": 8.8967,
"step": 950
},
{
"epoch": 1.227621483375959,
"grad_norm": 1352.676513671875,
"learning_rate": 4.8998257839721254e-05,
"loss": 8.731,
"step": 960
},
{
"epoch": 1.2404092071611252,
"grad_norm": 572.7108764648438,
"learning_rate": 4.8976480836236936e-05,
"loss": 7.8549,
"step": 970
},
{
"epoch": 1.2531969309462916,
"grad_norm": 34.12448501586914,
"learning_rate": 4.895470383275261e-05,
"loss": 6.734,
"step": 980
},
{
"epoch": 1.265984654731458,
"grad_norm": 55.90411376953125,
"learning_rate": 4.893292682926829e-05,
"loss": 6.6569,
"step": 990
},
{
"epoch": 1.278772378516624,
"grad_norm": 23.259788513183594,
"learning_rate": 4.8911149825783975e-05,
"loss": 6.8574,
"step": 1000
},
{
"epoch": 1.2915601023017902,
"grad_norm": 112.81903839111328,
"learning_rate": 4.8889372822299656e-05,
"loss": 7.3217,
"step": 1010
},
{
"epoch": 1.3043478260869565,
"grad_norm": 46.77792739868164,
"learning_rate": 4.886759581881533e-05,
"loss": 6.389,
"step": 1020
},
{
"epoch": 1.317135549872123,
"grad_norm": 290.9834899902344,
"learning_rate": 4.884581881533101e-05,
"loss": 7.2083,
"step": 1030
},
{
"epoch": 1.329923273657289,
"grad_norm": 683.7643432617188,
"learning_rate": 4.8824041811846695e-05,
"loss": 6.0531,
"step": 1040
},
{
"epoch": 1.3427109974424551,
"grad_norm": 177.46051025390625,
"learning_rate": 4.880226480836237e-05,
"loss": 5.2371,
"step": 1050
},
{
"epoch": 1.3554987212276215,
"grad_norm": 67.67430877685547,
"learning_rate": 4.878048780487805e-05,
"loss": 5.6214,
"step": 1060
},
{
"epoch": 1.3682864450127878,
"grad_norm": 151.2257080078125,
"learning_rate": 4.875871080139373e-05,
"loss": 4.9522,
"step": 1070
},
{
"epoch": 1.381074168797954,
"grad_norm": 89.38740539550781,
"learning_rate": 4.8736933797909415e-05,
"loss": 5.1789,
"step": 1080
},
{
"epoch": 1.39386189258312,
"grad_norm": 150.41270446777344,
"learning_rate": 4.871515679442509e-05,
"loss": 4.3908,
"step": 1090
},
{
"epoch": 1.4066496163682864,
"grad_norm": 18.98243522644043,
"learning_rate": 4.869337979094077e-05,
"loss": 4.7472,
"step": 1100
},
{
"epoch": 1.4194373401534528,
"grad_norm": 21.242076873779297,
"learning_rate": 4.867160278745645e-05,
"loss": 4.2189,
"step": 1110
},
{
"epoch": 1.432225063938619,
"grad_norm": 125.7092056274414,
"learning_rate": 4.864982578397213e-05,
"loss": 4.3067,
"step": 1120
},
{
"epoch": 1.445012787723785,
"grad_norm": 33.442665100097656,
"learning_rate": 4.86280487804878e-05,
"loss": 4.1315,
"step": 1130
},
{
"epoch": 1.4578005115089514,
"grad_norm": 192.08267211914062,
"learning_rate": 4.8606271777003485e-05,
"loss": 3.8742,
"step": 1140
},
{
"epoch": 1.4705882352941178,
"grad_norm": 86.93074798583984,
"learning_rate": 4.858449477351917e-05,
"loss": 3.7819,
"step": 1150
},
{
"epoch": 1.4833759590792839,
"grad_norm": 28.52533531188965,
"learning_rate": 4.856271777003484e-05,
"loss": 3.8282,
"step": 1160
},
{
"epoch": 1.49616368286445,
"grad_norm": 179.64064025878906,
"learning_rate": 4.8540940766550524e-05,
"loss": 3.0471,
"step": 1170
},
{
"epoch": 1.5089514066496164,
"grad_norm": 45.106510162353516,
"learning_rate": 4.8519163763066205e-05,
"loss": 3.5098,
"step": 1180
},
{
"epoch": 1.5217391304347827,
"grad_norm": 97.1431884765625,
"learning_rate": 4.849738675958189e-05,
"loss": 2.8188,
"step": 1190
},
{
"epoch": 1.5345268542199488,
"grad_norm": 385.2419128417969,
"learning_rate": 4.847560975609756e-05,
"loss": 3.1416,
"step": 1200
},
{
"epoch": 1.547314578005115,
"grad_norm": 20.7208194732666,
"learning_rate": 4.8453832752613244e-05,
"loss": 2.9539,
"step": 1210
},
{
"epoch": 1.5601023017902813,
"grad_norm": 30.659557342529297,
"learning_rate": 4.8432055749128926e-05,
"loss": 2.7002,
"step": 1220
},
{
"epoch": 1.5728900255754477,
"grad_norm": 23.814847946166992,
"learning_rate": 4.84102787456446e-05,
"loss": 2.5541,
"step": 1230
},
{
"epoch": 1.5856777493606138,
"grad_norm": 14.44255256652832,
"learning_rate": 4.838850174216028e-05,
"loss": 2.2534,
"step": 1240
},
{
"epoch": 1.59846547314578,
"grad_norm": 41.14075469970703,
"learning_rate": 4.8366724738675964e-05,
"loss": 2.0536,
"step": 1250
},
{
"epoch": 1.6112531969309463,
"grad_norm": 198.03524780273438,
"learning_rate": 4.8344947735191646e-05,
"loss": 2.8067,
"step": 1260
},
{
"epoch": 1.6240409207161126,
"grad_norm": 35.26166915893555,
"learning_rate": 4.832317073170732e-05,
"loss": 2.0826,
"step": 1270
},
{
"epoch": 1.6368286445012787,
"grad_norm": 11.258594512939453,
"learning_rate": 4.8301393728222996e-05,
"loss": 2.1802,
"step": 1280
},
{
"epoch": 1.6496163682864449,
"grad_norm": 31.164817810058594,
"learning_rate": 4.827961672473868e-05,
"loss": 2.2419,
"step": 1290
},
{
"epoch": 1.6624040920716112,
"grad_norm": 60.672122955322266,
"learning_rate": 4.825783972125435e-05,
"loss": 2.8254,
"step": 1300
},
{
"epoch": 1.6751918158567776,
"grad_norm": 370.33953857421875,
"learning_rate": 4.8236062717770034e-05,
"loss": 2.7776,
"step": 1310
},
{
"epoch": 1.6879795396419437,
"grad_norm": 758.9866333007812,
"learning_rate": 4.8214285714285716e-05,
"loss": 2.4551,
"step": 1320
},
{
"epoch": 1.7007672634271098,
"grad_norm": 41.34950256347656,
"learning_rate": 4.81925087108014e-05,
"loss": 1.8235,
"step": 1330
},
{
"epoch": 1.7135549872122762,
"grad_norm": 16.059200286865234,
"learning_rate": 4.817073170731707e-05,
"loss": 1.7094,
"step": 1340
},
{
"epoch": 1.7263427109974425,
"grad_norm": 9.618565559387207,
"learning_rate": 4.8148954703832754e-05,
"loss": 1.8371,
"step": 1350
},
{
"epoch": 1.7391304347826086,
"grad_norm": 50.72758102416992,
"learning_rate": 4.8127177700348436e-05,
"loss": 2.2624,
"step": 1360
},
{
"epoch": 1.7519181585677748,
"grad_norm": 56.254966735839844,
"learning_rate": 4.810540069686411e-05,
"loss": 2.118,
"step": 1370
},
{
"epoch": 1.7647058823529411,
"grad_norm": 14.840316772460938,
"learning_rate": 4.808362369337979e-05,
"loss": 1.4653,
"step": 1380
},
{
"epoch": 1.7774936061381075,
"grad_norm": 33.320518493652344,
"learning_rate": 4.8061846689895475e-05,
"loss": 1.47,
"step": 1390
},
{
"epoch": 1.7902813299232738,
"grad_norm": 38.10365676879883,
"learning_rate": 4.8040069686411156e-05,
"loss": 1.4439,
"step": 1400
},
{
"epoch": 1.80306905370844,
"grad_norm": 27.954633712768555,
"learning_rate": 4.801829268292683e-05,
"loss": 1.7437,
"step": 1410
},
{
"epoch": 1.815856777493606,
"grad_norm": 5.291255950927734,
"learning_rate": 4.799651567944251e-05,
"loss": 1.4063,
"step": 1420
},
{
"epoch": 1.8286445012787724,
"grad_norm": 209.9783477783203,
"learning_rate": 4.7974738675958195e-05,
"loss": 1.7947,
"step": 1430
},
{
"epoch": 1.8414322250639388,
"grad_norm": 9.803009986877441,
"learning_rate": 4.795296167247387e-05,
"loss": 1.9148,
"step": 1440
},
{
"epoch": 1.854219948849105,
"grad_norm": 11.24997615814209,
"learning_rate": 4.793118466898955e-05,
"loss": 1.1532,
"step": 1450
},
{
"epoch": 1.867007672634271,
"grad_norm": 45.51220703125,
"learning_rate": 4.7909407665505226e-05,
"loss": 1.5639,
"step": 1460
},
{
"epoch": 1.8797953964194374,
"grad_norm": 16.30727767944336,
"learning_rate": 4.788763066202091e-05,
"loss": 1.1181,
"step": 1470
},
{
"epoch": 1.8925831202046037,
"grad_norm": 136.94105529785156,
"learning_rate": 4.786585365853658e-05,
"loss": 1.3579,
"step": 1480
},
{
"epoch": 1.9053708439897699,
"grad_norm": 11.275238037109375,
"learning_rate": 4.7844076655052265e-05,
"loss": 1.7074,
"step": 1490
},
{
"epoch": 1.918158567774936,
"grad_norm": 25.395980834960938,
"learning_rate": 4.782229965156795e-05,
"loss": 1.0997,
"step": 1500
},
{
"epoch": 1.9309462915601023,
"grad_norm": 23.93743324279785,
"learning_rate": 4.780052264808362e-05,
"loss": 1.3025,
"step": 1510
},
{
"epoch": 1.9437340153452687,
"grad_norm": 2.8010175228118896,
"learning_rate": 4.77787456445993e-05,
"loss": 1.8831,
"step": 1520
},
{
"epoch": 1.9565217391304348,
"grad_norm": 23.10536766052246,
"learning_rate": 4.7756968641114985e-05,
"loss": 0.8556,
"step": 1530
},
{
"epoch": 1.969309462915601,
"grad_norm": 10.993846893310547,
"learning_rate": 4.773519163763067e-05,
"loss": 1.5948,
"step": 1540
},
{
"epoch": 1.9820971867007673,
"grad_norm": 11.615630149841309,
"learning_rate": 4.771341463414634e-05,
"loss": 1.1638,
"step": 1550
},
{
"epoch": 1.9948849104859336,
"grad_norm": 58.213191986083984,
"learning_rate": 4.7691637630662024e-05,
"loss": 1.6968,
"step": 1560
},
{
"epoch": 2.0,
"eval_loss": 0.5625534653663635,
"eval_runtime": 0.9781,
"eval_samples_per_second": 100.193,
"eval_steps_per_second": 13.291,
"step": 1564
},
{
"epoch": 2.0076726342710995,
"grad_norm": 3.010547161102295,
"learning_rate": 4.7669860627177705e-05,
"loss": 1.8739,
"step": 1570
},
{
"epoch": 2.020460358056266,
"grad_norm": 3.986027956008911,
"learning_rate": 4.764808362369339e-05,
"loss": 1.2554,
"step": 1580
},
{
"epoch": 2.0332480818414322,
"grad_norm": 6.573552131652832,
"learning_rate": 4.762630662020906e-05,
"loss": 1.5013,
"step": 1590
},
{
"epoch": 2.0460358056265986,
"grad_norm": 86.19767761230469,
"learning_rate": 4.7604529616724744e-05,
"loss": 1.7618,
"step": 1600
},
{
"epoch": 2.0588235294117645,
"grad_norm": 9.659221649169922,
"learning_rate": 4.7582752613240426e-05,
"loss": 1.2333,
"step": 1610
},
{
"epoch": 2.071611253196931,
"grad_norm": 11.018202781677246,
"learning_rate": 4.75609756097561e-05,
"loss": 1.3108,
"step": 1620
},
{
"epoch": 2.084398976982097,
"grad_norm": 5.9316725730896,
"learning_rate": 4.7539198606271775e-05,
"loss": 1.4457,
"step": 1630
},
{
"epoch": 2.0971867007672635,
"grad_norm": 6.019528865814209,
"learning_rate": 4.751742160278746e-05,
"loss": 1.3538,
"step": 1640
},
{
"epoch": 2.10997442455243,
"grad_norm": 15.951410293579102,
"learning_rate": 4.749564459930314e-05,
"loss": 1.7894,
"step": 1650
},
{
"epoch": 2.122762148337596,
"grad_norm": 17.659423828125,
"learning_rate": 4.7473867595818814e-05,
"loss": 1.6854,
"step": 1660
},
{
"epoch": 2.135549872122762,
"grad_norm": 6.573909282684326,
"learning_rate": 4.7452090592334496e-05,
"loss": 0.7702,
"step": 1670
},
{
"epoch": 2.1483375959079285,
"grad_norm": 19.226573944091797,
"learning_rate": 4.743031358885018e-05,
"loss": 1.5234,
"step": 1680
},
{
"epoch": 2.1611253196930944,
"grad_norm": 7.366319179534912,
"learning_rate": 4.740853658536585e-05,
"loss": 1.167,
"step": 1690
},
{
"epoch": 2.1739130434782608,
"grad_norm": 18.000076293945312,
"learning_rate": 4.7386759581881534e-05,
"loss": 1.3387,
"step": 1700
},
{
"epoch": 2.186700767263427,
"grad_norm": 23.309993743896484,
"learning_rate": 4.7364982578397216e-05,
"loss": 0.9478,
"step": 1710
},
{
"epoch": 2.1994884910485935,
"grad_norm": 11.471582412719727,
"learning_rate": 4.73432055749129e-05,
"loss": 1.8921,
"step": 1720
},
{
"epoch": 2.21227621483376,
"grad_norm": 6.368335723876953,
"learning_rate": 4.732142857142857e-05,
"loss": 1.3705,
"step": 1730
},
{
"epoch": 2.2250639386189257,
"grad_norm": 37.961761474609375,
"learning_rate": 4.7299651567944254e-05,
"loss": 1.0206,
"step": 1740
},
{
"epoch": 2.237851662404092,
"grad_norm": 2.239867925643921,
"learning_rate": 4.7277874564459936e-05,
"loss": 0.4718,
"step": 1750
},
{
"epoch": 2.2506393861892584,
"grad_norm": 3.7665998935699463,
"learning_rate": 4.725609756097561e-05,
"loss": 1.0531,
"step": 1760
},
{
"epoch": 2.2634271099744243,
"grad_norm": 6.445348262786865,
"learning_rate": 4.723432055749129e-05,
"loss": 0.7531,
"step": 1770
},
{
"epoch": 2.2762148337595907,
"grad_norm": 15.123578071594238,
"learning_rate": 4.7212543554006975e-05,
"loss": 1.8705,
"step": 1780
},
{
"epoch": 2.289002557544757,
"grad_norm": 0.8845268487930298,
"learning_rate": 4.719076655052265e-05,
"loss": 1.3302,
"step": 1790
},
{
"epoch": 2.3017902813299234,
"grad_norm": 4.527200698852539,
"learning_rate": 4.716898954703833e-05,
"loss": 0.6225,
"step": 1800
},
{
"epoch": 2.3145780051150897,
"grad_norm": 1.2486788034439087,
"learning_rate": 4.7147212543554006e-05,
"loss": 2.2166,
"step": 1810
},
{
"epoch": 2.3273657289002556,
"grad_norm": 5.734748363494873,
"learning_rate": 4.712543554006969e-05,
"loss": 1.3255,
"step": 1820
},
{
"epoch": 2.340153452685422,
"grad_norm": 11.985346794128418,
"learning_rate": 4.710365853658536e-05,
"loss": 1.3059,
"step": 1830
},
{
"epoch": 2.3529411764705883,
"grad_norm": 3.916868209838867,
"learning_rate": 4.7081881533101045e-05,
"loss": 1.5318,
"step": 1840
},
{
"epoch": 2.3657289002557547,
"grad_norm": 7.840545654296875,
"learning_rate": 4.7060104529616726e-05,
"loss": 0.7891,
"step": 1850
},
{
"epoch": 2.3785166240409206,
"grad_norm": 14.123540878295898,
"learning_rate": 4.703832752613241e-05,
"loss": 1.0555,
"step": 1860
},
{
"epoch": 2.391304347826087,
"grad_norm": 7.954935550689697,
"learning_rate": 4.701655052264808e-05,
"loss": 1.4652,
"step": 1870
},
{
"epoch": 2.4040920716112533,
"grad_norm": 14.094103813171387,
"learning_rate": 4.6994773519163765e-05,
"loss": 1.1833,
"step": 1880
},
{
"epoch": 2.4168797953964196,
"grad_norm": 4.043068885803223,
"learning_rate": 4.697299651567945e-05,
"loss": 1.0861,
"step": 1890
},
{
"epoch": 2.4296675191815855,
"grad_norm": 2.52896785736084,
"learning_rate": 4.695121951219512e-05,
"loss": 1.6684,
"step": 1900
},
{
"epoch": 2.442455242966752,
"grad_norm": 1.6495180130004883,
"learning_rate": 4.69294425087108e-05,
"loss": 1.4837,
"step": 1910
},
{
"epoch": 2.455242966751918,
"grad_norm": 12.262441635131836,
"learning_rate": 4.6907665505226485e-05,
"loss": 0.9211,
"step": 1920
},
{
"epoch": 2.4680306905370846,
"grad_norm": 20.890920639038086,
"learning_rate": 4.688588850174217e-05,
"loss": 0.9938,
"step": 1930
},
{
"epoch": 2.4808184143222505,
"grad_norm": 91.71107482910156,
"learning_rate": 4.686411149825784e-05,
"loss": 1.2399,
"step": 1940
},
{
"epoch": 2.493606138107417,
"grad_norm": 15.037610054016113,
"learning_rate": 4.6842334494773524e-05,
"loss": 1.3903,
"step": 1950
},
{
"epoch": 2.506393861892583,
"grad_norm": 13.86916732788086,
"learning_rate": 4.6820557491289205e-05,
"loss": 0.7887,
"step": 1960
},
{
"epoch": 2.5191815856777495,
"grad_norm": 1.1754240989685059,
"learning_rate": 4.679878048780488e-05,
"loss": 0.8378,
"step": 1970
},
{
"epoch": 2.531969309462916,
"grad_norm": 5.272491455078125,
"learning_rate": 4.6777003484320555e-05,
"loss": 0.8505,
"step": 1980
},
{
"epoch": 2.544757033248082,
"grad_norm": 5.7034831047058105,
"learning_rate": 4.675522648083624e-05,
"loss": 0.7224,
"step": 1990
},
{
"epoch": 2.557544757033248,
"grad_norm": 0.7890897393226624,
"learning_rate": 4.673344947735192e-05,
"loss": 0.6189,
"step": 2000
},
{
"epoch": 2.5703324808184145,
"grad_norm": 12.357426643371582,
"learning_rate": 4.6711672473867594e-05,
"loss": 1.0881,
"step": 2010
},
{
"epoch": 2.5831202046035804,
"grad_norm": 23.227792739868164,
"learning_rate": 4.6689895470383275e-05,
"loss": 0.8534,
"step": 2020
},
{
"epoch": 2.5959079283887467,
"grad_norm": 10.504785537719727,
"learning_rate": 4.666811846689896e-05,
"loss": 0.8183,
"step": 2030
},
{
"epoch": 2.608695652173913,
"grad_norm": 15.565208435058594,
"learning_rate": 4.664634146341464e-05,
"loss": 1.4254,
"step": 2040
},
{
"epoch": 2.6214833759590794,
"grad_norm": 1.6157219409942627,
"learning_rate": 4.6624564459930314e-05,
"loss": 0.5524,
"step": 2050
},
{
"epoch": 2.634271099744246,
"grad_norm": 19.441726684570312,
"learning_rate": 4.6602787456445996e-05,
"loss": 0.9524,
"step": 2060
},
{
"epoch": 2.6470588235294117,
"grad_norm": 7.794475078582764,
"learning_rate": 4.658101045296168e-05,
"loss": 1.0205,
"step": 2070
},
{
"epoch": 2.659846547314578,
"grad_norm": 15.814881324768066,
"learning_rate": 4.655923344947735e-05,
"loss": 0.9889,
"step": 2080
},
{
"epoch": 2.6726342710997444,
"grad_norm": 0.5755274891853333,
"learning_rate": 4.6537456445993034e-05,
"loss": 1.1448,
"step": 2090
},
{
"epoch": 2.6854219948849103,
"grad_norm": 17.273611068725586,
"learning_rate": 4.6515679442508716e-05,
"loss": 1.3059,
"step": 2100
},
{
"epoch": 2.6982097186700766,
"grad_norm": 3.9910295009613037,
"learning_rate": 4.64939024390244e-05,
"loss": 0.683,
"step": 2110
},
{
"epoch": 2.710997442455243,
"grad_norm": 14.555076599121094,
"learning_rate": 4.647212543554007e-05,
"loss": 1.1196,
"step": 2120
},
{
"epoch": 2.7237851662404093,
"grad_norm": 23.410642623901367,
"learning_rate": 4.6450348432055754e-05,
"loss": 1.1257,
"step": 2130
},
{
"epoch": 2.7365728900255757,
"grad_norm": 4.720057010650635,
"learning_rate": 4.642857142857143e-05,
"loss": 0.7206,
"step": 2140
},
{
"epoch": 2.7493606138107416,
"grad_norm": 1.535508632659912,
"learning_rate": 4.640679442508711e-05,
"loss": 0.6184,
"step": 2150
},
{
"epoch": 2.762148337595908,
"grad_norm": 14.630488395690918,
"learning_rate": 4.6385017421602786e-05,
"loss": 1.093,
"step": 2160
},
{
"epoch": 2.7749360613810743,
"grad_norm": 5.701061248779297,
"learning_rate": 4.636324041811847e-05,
"loss": 1.0357,
"step": 2170
},
{
"epoch": 2.78772378516624,
"grad_norm": 2.6190364360809326,
"learning_rate": 4.634146341463415e-05,
"loss": 1.3345,
"step": 2180
},
{
"epoch": 2.8005115089514065,
"grad_norm": 14.732501029968262,
"learning_rate": 4.6319686411149825e-05,
"loss": 1.1697,
"step": 2190
},
{
"epoch": 2.813299232736573,
"grad_norm": 54.13779830932617,
"learning_rate": 4.6297909407665506e-05,
"loss": 0.8014,
"step": 2200
},
{
"epoch": 2.8260869565217392,
"grad_norm": 3.1863250732421875,
"learning_rate": 4.627613240418119e-05,
"loss": 0.6579,
"step": 2210
},
{
"epoch": 2.8388746803069056,
"grad_norm": 0.7445744872093201,
"learning_rate": 4.625435540069686e-05,
"loss": 0.9363,
"step": 2220
},
{
"epoch": 2.8516624040920715,
"grad_norm": 12.610774040222168,
"learning_rate": 4.6232578397212545e-05,
"loss": 1.0073,
"step": 2230
},
{
"epoch": 2.864450127877238,
"grad_norm": 1.9731221199035645,
"learning_rate": 4.6210801393728226e-05,
"loss": 0.6766,
"step": 2240
},
{
"epoch": 2.877237851662404,
"grad_norm": 18.617942810058594,
"learning_rate": 4.618902439024391e-05,
"loss": 1.124,
"step": 2250
},
{
"epoch": 2.89002557544757,
"grad_norm": 15.15342903137207,
"learning_rate": 4.616724738675958e-05,
"loss": 1.0936,
"step": 2260
},
{
"epoch": 2.9028132992327365,
"grad_norm": 6.208033561706543,
"learning_rate": 4.6145470383275265e-05,
"loss": 0.8369,
"step": 2270
},
{
"epoch": 2.915601023017903,
"grad_norm": 5.944622993469238,
"learning_rate": 4.612369337979095e-05,
"loss": 0.8107,
"step": 2280
},
{
"epoch": 2.928388746803069,
"grad_norm": 12.07383918762207,
"learning_rate": 4.610191637630662e-05,
"loss": 1.4551,
"step": 2290
},
{
"epoch": 2.9411764705882355,
"grad_norm": 12.542791366577148,
"learning_rate": 4.6080139372822303e-05,
"loss": 0.9312,
"step": 2300
},
{
"epoch": 2.9539641943734014,
"grad_norm": 6.652247905731201,
"learning_rate": 4.6058362369337985e-05,
"loss": 0.944,
"step": 2310
},
{
"epoch": 2.9667519181585678,
"grad_norm": 0.610918402671814,
"learning_rate": 4.603658536585366e-05,
"loss": 0.9986,
"step": 2320
},
{
"epoch": 2.979539641943734,
"grad_norm": 0.7279015183448792,
"learning_rate": 4.6014808362369335e-05,
"loss": 0.4389,
"step": 2330
},
{
"epoch": 2.9923273657289,
"grad_norm": 25.247034072875977,
"learning_rate": 4.599303135888502e-05,
"loss": 1.0318,
"step": 2340
},
{
"epoch": 3.0,
"eval_loss": 0.445333868265152,
"eval_runtime": 0.967,
"eval_samples_per_second": 101.343,
"eval_steps_per_second": 13.443,
"step": 2346
},
{
"epoch": 3.0051150895140664,
"grad_norm": 18.33305549621582,
"learning_rate": 4.59712543554007e-05,
"loss": 1.0371,
"step": 2350
},
{
"epoch": 3.0179028132992327,
"grad_norm": 5.7453107833862305,
"learning_rate": 4.594947735191638e-05,
"loss": 1.2382,
"step": 2360
},
{
"epoch": 3.030690537084399,
"grad_norm": 24.08013153076172,
"learning_rate": 4.5927700348432055e-05,
"loss": 1.5925,
"step": 2370
},
{
"epoch": 3.0434782608695654,
"grad_norm": 16.559703826904297,
"learning_rate": 4.590592334494774e-05,
"loss": 0.7614,
"step": 2380
},
{
"epoch": 3.0562659846547313,
"grad_norm": 0.3231087625026703,
"learning_rate": 4.588414634146342e-05,
"loss": 0.6892,
"step": 2390
},
{
"epoch": 3.0690537084398977,
"grad_norm": 7.988368511199951,
"learning_rate": 4.5862369337979094e-05,
"loss": 0.8234,
"step": 2400
},
{
"epoch": 3.081841432225064,
"grad_norm": 0.9878854155540466,
"learning_rate": 4.5840592334494776e-05,
"loss": 1.2006,
"step": 2410
},
{
"epoch": 3.0946291560102304,
"grad_norm": 2.3492844104766846,
"learning_rate": 4.581881533101046e-05,
"loss": 0.4687,
"step": 2420
},
{
"epoch": 3.1074168797953963,
"grad_norm": 10.556395530700684,
"learning_rate": 4.579703832752614e-05,
"loss": 0.5203,
"step": 2430
},
{
"epoch": 3.1202046035805626,
"grad_norm": 2.9563450813293457,
"learning_rate": 4.5775261324041814e-05,
"loss": 1.575,
"step": 2440
},
{
"epoch": 3.132992327365729,
"grad_norm": 238.0205078125,
"learning_rate": 4.5753484320557496e-05,
"loss": 0.9001,
"step": 2450
},
{
"epoch": 3.1457800511508953,
"grad_norm": 5.97659158706665,
"learning_rate": 4.573170731707318e-05,
"loss": 1.0491,
"step": 2460
},
{
"epoch": 3.1585677749360612,
"grad_norm": 2.6114704608917236,
"learning_rate": 4.570993031358885e-05,
"loss": 1.176,
"step": 2470
},
{
"epoch": 3.1713554987212276,
"grad_norm": 11.491422653198242,
"learning_rate": 4.5688153310104534e-05,
"loss": 1.0819,
"step": 2480
},
{
"epoch": 3.184143222506394,
"grad_norm": 15.428108215332031,
"learning_rate": 4.566637630662021e-05,
"loss": 0.7879,
"step": 2490
},
{
"epoch": 3.1969309462915603,
"grad_norm": 1.30459725856781,
"learning_rate": 4.564459930313589e-05,
"loss": 0.432,
"step": 2500
},
{
"epoch": 3.209718670076726,
"grad_norm": 10.579148292541504,
"learning_rate": 4.5622822299651566e-05,
"loss": 0.8243,
"step": 2510
},
{
"epoch": 3.2225063938618925,
"grad_norm": 15.585744857788086,
"learning_rate": 4.560104529616725e-05,
"loss": 0.6711,
"step": 2520
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.4204480051994324,
"learning_rate": 4.557926829268293e-05,
"loss": 0.3794,
"step": 2530
},
{
"epoch": 3.2480818414322252,
"grad_norm": 2.049827814102173,
"learning_rate": 4.5557491289198604e-05,
"loss": 0.8661,
"step": 2540
},
{
"epoch": 3.260869565217391,
"grad_norm": 4.234211444854736,
"learning_rate": 4.5535714285714286e-05,
"loss": 0.6852,
"step": 2550
},
{
"epoch": 3.2736572890025575,
"grad_norm": 6.04722785949707,
"learning_rate": 4.551393728222997e-05,
"loss": 0.6973,
"step": 2560
},
{
"epoch": 3.286445012787724,
"grad_norm": 2.9452807903289795,
"learning_rate": 4.549216027874565e-05,
"loss": 1.2999,
"step": 2570
},
{
"epoch": 3.29923273657289,
"grad_norm": 5.985997200012207,
"learning_rate": 4.5470383275261325e-05,
"loss": 0.6524,
"step": 2580
},
{
"epoch": 3.312020460358056,
"grad_norm": 17.464967727661133,
"learning_rate": 4.5448606271777006e-05,
"loss": 0.414,
"step": 2590
},
{
"epoch": 3.3248081841432224,
"grad_norm": 0.4771990180015564,
"learning_rate": 4.542682926829269e-05,
"loss": 0.9716,
"step": 2600
},
{
"epoch": 3.337595907928389,
"grad_norm": 12.75200080871582,
"learning_rate": 4.540505226480836e-05,
"loss": 1.2216,
"step": 2610
},
{
"epoch": 3.350383631713555,
"grad_norm": 137.2834014892578,
"learning_rate": 4.5383275261324045e-05,
"loss": 1.339,
"step": 2620
},
{
"epoch": 3.363171355498721,
"grad_norm": 16.52651596069336,
"learning_rate": 4.5361498257839727e-05,
"loss": 1.3891,
"step": 2630
},
{
"epoch": 3.3759590792838874,
"grad_norm": 3.5017669200897217,
"learning_rate": 4.533972125435541e-05,
"loss": 0.8694,
"step": 2640
},
{
"epoch": 3.3887468030690537,
"grad_norm": 14.319966316223145,
"learning_rate": 4.531794425087108e-05,
"loss": 0.8546,
"step": 2650
},
{
"epoch": 3.40153452685422,
"grad_norm": 13.259135246276855,
"learning_rate": 4.529616724738676e-05,
"loss": 0.706,
"step": 2660
},
{
"epoch": 3.414322250639386,
"grad_norm": 12.226325035095215,
"learning_rate": 4.527439024390244e-05,
"loss": 0.6859,
"step": 2670
},
{
"epoch": 3.4271099744245523,
"grad_norm": 3.982544422149658,
"learning_rate": 4.5252613240418115e-05,
"loss": 0.8816,
"step": 2680
},
{
"epoch": 3.4398976982097187,
"grad_norm": 3.3434112071990967,
"learning_rate": 4.52308362369338e-05,
"loss": 0.2865,
"step": 2690
},
{
"epoch": 3.452685421994885,
"grad_norm": 1.9963968992233276,
"learning_rate": 4.520905923344948e-05,
"loss": 0.7698,
"step": 2700
},
{
"epoch": 3.4654731457800514,
"grad_norm": 11.71841812133789,
"learning_rate": 4.518728222996516e-05,
"loss": 1.1581,
"step": 2710
},
{
"epoch": 3.4782608695652173,
"grad_norm": 7.631697654724121,
"learning_rate": 4.5165505226480835e-05,
"loss": 1.0637,
"step": 2720
},
{
"epoch": 3.4910485933503836,
"grad_norm": 14.784839630126953,
"learning_rate": 4.514372822299652e-05,
"loss": 1.0658,
"step": 2730
},
{
"epoch": 3.50383631713555,
"grad_norm": 5.46140193939209,
"learning_rate": 4.51219512195122e-05,
"loss": 0.6536,
"step": 2740
},
{
"epoch": 3.516624040920716,
"grad_norm": 2.2502548694610596,
"learning_rate": 4.5100174216027874e-05,
"loss": 0.7903,
"step": 2750
},
{
"epoch": 3.5294117647058822,
"grad_norm": 2.154597043991089,
"learning_rate": 4.5078397212543555e-05,
"loss": 0.9926,
"step": 2760
},
{
"epoch": 3.5421994884910486,
"grad_norm": 13.152908325195312,
"learning_rate": 4.505662020905924e-05,
"loss": 0.8718,
"step": 2770
},
{
"epoch": 3.554987212276215,
"grad_norm": 5.203449249267578,
"learning_rate": 4.503484320557492e-05,
"loss": 0.8298,
"step": 2780
},
{
"epoch": 3.5677749360613813,
"grad_norm": 11.65567684173584,
"learning_rate": 4.5013066202090594e-05,
"loss": 1.1689,
"step": 2790
},
{
"epoch": 3.580562659846547,
"grad_norm": 2.916771411895752,
"learning_rate": 4.4991289198606276e-05,
"loss": 0.817,
"step": 2800
},
{
"epoch": 3.5933503836317136,
"grad_norm": 14.278409957885742,
"learning_rate": 4.496951219512196e-05,
"loss": 0.8092,
"step": 2810
},
{
"epoch": 3.60613810741688,
"grad_norm": 0.2527465522289276,
"learning_rate": 4.494773519163763e-05,
"loss": 0.3738,
"step": 2820
},
{
"epoch": 3.618925831202046,
"grad_norm": 0.8273968696594238,
"learning_rate": 4.4925958188153314e-05,
"loss": 0.4944,
"step": 2830
},
{
"epoch": 3.631713554987212,
"grad_norm": 2.636253833770752,
"learning_rate": 4.490418118466899e-05,
"loss": 0.6911,
"step": 2840
},
{
"epoch": 3.6445012787723785,
"grad_norm": 3.27070951461792,
"learning_rate": 4.488240418118467e-05,
"loss": 1.1412,
"step": 2850
},
{
"epoch": 3.657289002557545,
"grad_norm": 7.859626770019531,
"learning_rate": 4.4860627177700346e-05,
"loss": 0.6205,
"step": 2860
},
{
"epoch": 3.670076726342711,
"grad_norm": 4.144435882568359,
"learning_rate": 4.483885017421603e-05,
"loss": 1.1247,
"step": 2870
},
{
"epoch": 3.682864450127877,
"grad_norm": 0.911280632019043,
"learning_rate": 4.481707317073171e-05,
"loss": 0.8172,
"step": 2880
},
{
"epoch": 3.6956521739130435,
"grad_norm": 1.5579833984375,
"learning_rate": 4.479529616724739e-05,
"loss": 1.186,
"step": 2890
},
{
"epoch": 3.70843989769821,
"grad_norm": 7.854421615600586,
"learning_rate": 4.4773519163763066e-05,
"loss": 0.5919,
"step": 2900
},
{
"epoch": 3.7212276214833757,
"grad_norm": 3.1011366844177246,
"learning_rate": 4.475174216027875e-05,
"loss": 0.4518,
"step": 2910
},
{
"epoch": 3.734015345268542,
"grad_norm": 5.562145709991455,
"learning_rate": 4.472996515679443e-05,
"loss": 1.296,
"step": 2920
},
{
"epoch": 3.7468030690537084,
"grad_norm": 10.683934211730957,
"learning_rate": 4.4708188153310104e-05,
"loss": 0.5236,
"step": 2930
},
{
"epoch": 3.7595907928388748,
"grad_norm": 6.5699076652526855,
"learning_rate": 4.4686411149825786e-05,
"loss": 0.9732,
"step": 2940
},
{
"epoch": 3.772378516624041,
"grad_norm": 19.995325088500977,
"learning_rate": 4.466463414634147e-05,
"loss": 0.9127,
"step": 2950
},
{
"epoch": 3.785166240409207,
"grad_norm": 4.819314479827881,
"learning_rate": 4.464285714285715e-05,
"loss": 0.8555,
"step": 2960
},
{
"epoch": 3.7979539641943734,
"grad_norm": 7.258398532867432,
"learning_rate": 4.4621080139372825e-05,
"loss": 1.1759,
"step": 2970
},
{
"epoch": 3.8107416879795397,
"grad_norm": 1.841517448425293,
"learning_rate": 4.4599303135888506e-05,
"loss": 1.1711,
"step": 2980
},
{
"epoch": 3.8235294117647056,
"grad_norm": 2.7056972980499268,
"learning_rate": 4.457752613240419e-05,
"loss": 0.3712,
"step": 2990
},
{
"epoch": 3.836317135549872,
"grad_norm": 12.133644104003906,
"learning_rate": 4.455574912891986e-05,
"loss": 0.5701,
"step": 3000
},
{
"epoch": 3.8491048593350383,
"grad_norm": 5.757213115692139,
"learning_rate": 4.453397212543554e-05,
"loss": 0.3663,
"step": 3010
},
{
"epoch": 3.8618925831202047,
"grad_norm": 6.096423149108887,
"learning_rate": 4.451219512195122e-05,
"loss": 0.5234,
"step": 3020
},
{
"epoch": 3.874680306905371,
"grad_norm": 1.7655915021896362,
"learning_rate": 4.44904181184669e-05,
"loss": 0.7285,
"step": 3030
},
{
"epoch": 3.887468030690537,
"grad_norm": 4.150766849517822,
"learning_rate": 4.4468641114982576e-05,
"loss": 1.2942,
"step": 3040
},
{
"epoch": 3.9002557544757033,
"grad_norm": 0.9318215250968933,
"learning_rate": 4.444686411149826e-05,
"loss": 0.6762,
"step": 3050
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.7870925068855286,
"learning_rate": 4.442508710801394e-05,
"loss": 0.2768,
"step": 3060
},
{
"epoch": 3.9258312020460355,
"grad_norm": 0.7156265377998352,
"learning_rate": 4.4403310104529615e-05,
"loss": 0.7751,
"step": 3070
},
{
"epoch": 3.938618925831202,
"grad_norm": 2.052534580230713,
"learning_rate": 4.43815331010453e-05,
"loss": 0.6196,
"step": 3080
},
{
"epoch": 3.9514066496163682,
"grad_norm": 13.939319610595703,
"learning_rate": 4.435975609756098e-05,
"loss": 0.7721,
"step": 3090
},
{
"epoch": 3.9641943734015346,
"grad_norm": 2.3193609714508057,
"learning_rate": 4.433797909407666e-05,
"loss": 0.8792,
"step": 3100
},
{
"epoch": 3.976982097186701,
"grad_norm": 13.679859161376953,
"learning_rate": 4.4316202090592335e-05,
"loss": 0.8644,
"step": 3110
},
{
"epoch": 3.9897698209718673,
"grad_norm": 1.552064299583435,
"learning_rate": 4.429442508710802e-05,
"loss": 0.711,
"step": 3120
},
{
"epoch": 4.0,
"eval_loss": 0.3499666452407837,
"eval_runtime": 0.9683,
"eval_samples_per_second": 101.204,
"eval_steps_per_second": 13.425,
"step": 3128
},
{
"epoch": 4.002557544757034,
"grad_norm": 3.0008726119995117,
"learning_rate": 4.42726480836237e-05,
"loss": 0.9301,
"step": 3130
},
{
"epoch": 4.015345268542199,
"grad_norm": 6.133752822875977,
"learning_rate": 4.4250871080139374e-05,
"loss": 0.7322,
"step": 3140
},
{
"epoch": 4.028132992327365,
"grad_norm": 7.614703178405762,
"learning_rate": 4.4229094076655055e-05,
"loss": 1.0492,
"step": 3150
},
{
"epoch": 4.040920716112532,
"grad_norm": 2.1636641025543213,
"learning_rate": 4.420731707317074e-05,
"loss": 1.1332,
"step": 3160
},
{
"epoch": 4.053708439897698,
"grad_norm": 9.834653854370117,
"learning_rate": 4.418554006968641e-05,
"loss": 0.6502,
"step": 3170
},
{
"epoch": 4.0664961636828645,
"grad_norm": 1.8648459911346436,
"learning_rate": 4.4163763066202094e-05,
"loss": 0.5078,
"step": 3180
},
{
"epoch": 4.079283887468031,
"grad_norm": 8.35239315032959,
"learning_rate": 4.414198606271777e-05,
"loss": 0.7993,
"step": 3190
},
{
"epoch": 4.092071611253197,
"grad_norm": 0.7734344601631165,
"learning_rate": 4.412020905923345e-05,
"loss": 0.8665,
"step": 3200
},
{
"epoch": 4.1048593350383635,
"grad_norm": 1.2159333229064941,
"learning_rate": 4.409843205574913e-05,
"loss": 0.8147,
"step": 3210
},
{
"epoch": 4.117647058823529,
"grad_norm": 14.401145935058594,
"learning_rate": 4.407665505226481e-05,
"loss": 0.6267,
"step": 3220
},
{
"epoch": 4.130434782608695,
"grad_norm": 9.61027717590332,
"learning_rate": 4.405487804878049e-05,
"loss": 0.7327,
"step": 3230
},
{
"epoch": 4.143222506393862,
"grad_norm": 15.293912887573242,
"learning_rate": 4.403310104529617e-05,
"loss": 1.3573,
"step": 3240
},
{
"epoch": 4.156010230179028,
"grad_norm": 4.653273582458496,
"learning_rate": 4.4011324041811846e-05,
"loss": 0.5447,
"step": 3250
},
{
"epoch": 4.168797953964194,
"grad_norm": 3.500906467437744,
"learning_rate": 4.398954703832753e-05,
"loss": 0.9046,
"step": 3260
},
{
"epoch": 4.181585677749361,
"grad_norm": 13.659477233886719,
"learning_rate": 4.396777003484321e-05,
"loss": 0.6891,
"step": 3270
},
{
"epoch": 4.194373401534527,
"grad_norm": 0.9972584247589111,
"learning_rate": 4.394599303135889e-05,
"loss": 0.7421,
"step": 3280
},
{
"epoch": 4.207161125319693,
"grad_norm": 6.448363304138184,
"learning_rate": 4.3924216027874566e-05,
"loss": 0.6541,
"step": 3290
},
{
"epoch": 4.21994884910486,
"grad_norm": 2.5608551502227783,
"learning_rate": 4.390243902439025e-05,
"loss": 0.3677,
"step": 3300
},
{
"epoch": 4.232736572890025,
"grad_norm": 6.504319667816162,
"learning_rate": 4.388066202090593e-05,
"loss": 0.5709,
"step": 3310
},
{
"epoch": 4.245524296675192,
"grad_norm": 9.659370422363281,
"learning_rate": 4.3858885017421604e-05,
"loss": 0.7813,
"step": 3320
},
{
"epoch": 4.258312020460358,
"grad_norm": 3.0793986320495605,
"learning_rate": 4.3837108013937286e-05,
"loss": 0.5772,
"step": 3330
},
{
"epoch": 4.271099744245524,
"grad_norm": 1.2446378469467163,
"learning_rate": 4.381533101045297e-05,
"loss": 0.6206,
"step": 3340
},
{
"epoch": 4.283887468030691,
"grad_norm": 0.565462589263916,
"learning_rate": 4.379355400696864e-05,
"loss": 0.6753,
"step": 3350
},
{
"epoch": 4.296675191815857,
"grad_norm": 1.6904747486114502,
"learning_rate": 4.377177700348432e-05,
"loss": 0.7721,
"step": 3360
},
{
"epoch": 4.309462915601023,
"grad_norm": 2.9521095752716064,
"learning_rate": 4.375e-05,
"loss": 0.3611,
"step": 3370
},
{
"epoch": 4.322250639386189,
"grad_norm": 6.3527703285217285,
"learning_rate": 4.372822299651568e-05,
"loss": 0.5197,
"step": 3380
},
{
"epoch": 4.335038363171355,
"grad_norm": 3.786268949508667,
"learning_rate": 4.3706445993031356e-05,
"loss": 0.6294,
"step": 3390
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.7279968857765198,
"learning_rate": 4.368466898954704e-05,
"loss": 1.0253,
"step": 3400
},
{
"epoch": 4.360613810741688,
"grad_norm": 0.4943724274635315,
"learning_rate": 4.366289198606272e-05,
"loss": 0.668,
"step": 3410
},
{
"epoch": 4.373401534526854,
"grad_norm": 5.150463581085205,
"learning_rate": 4.36411149825784e-05,
"loss": 0.7717,
"step": 3420
},
{
"epoch": 4.3861892583120206,
"grad_norm": 8.849224090576172,
"learning_rate": 4.3619337979094076e-05,
"loss": 0.713,
"step": 3430
},
{
"epoch": 4.398976982097187,
"grad_norm": 2.910844087600708,
"learning_rate": 4.359756097560976e-05,
"loss": 0.7032,
"step": 3440
},
{
"epoch": 4.411764705882353,
"grad_norm": 3.0307321548461914,
"learning_rate": 4.357578397212544e-05,
"loss": 0.5685,
"step": 3450
},
{
"epoch": 4.42455242966752,
"grad_norm": 11.50002670288086,
"learning_rate": 4.3554006968641115e-05,
"loss": 0.4806,
"step": 3460
},
{
"epoch": 4.437340153452685,
"grad_norm": 9.223997116088867,
"learning_rate": 4.35322299651568e-05,
"loss": 0.4831,
"step": 3470
},
{
"epoch": 4.450127877237851,
"grad_norm": 4.3660197257995605,
"learning_rate": 4.351045296167248e-05,
"loss": 1.1399,
"step": 3480
},
{
"epoch": 4.462915601023018,
"grad_norm": 9.756316184997559,
"learning_rate": 4.348867595818816e-05,
"loss": 0.8305,
"step": 3490
},
{
"epoch": 4.475703324808184,
"grad_norm": 4.104464530944824,
"learning_rate": 4.3466898954703835e-05,
"loss": 0.2647,
"step": 3500
},
{
"epoch": 4.4884910485933505,
"grad_norm": 8.359086036682129,
"learning_rate": 4.344512195121952e-05,
"loss": 0.9752,
"step": 3510
},
{
"epoch": 4.501278772378517,
"grad_norm": 2.668789863586426,
"learning_rate": 4.342334494773519e-05,
"loss": 0.5549,
"step": 3520
},
{
"epoch": 4.514066496163683,
"grad_norm": 11.178609848022461,
"learning_rate": 4.3401567944250874e-05,
"loss": 0.8338,
"step": 3530
},
{
"epoch": 4.526854219948849,
"grad_norm": 29.559751510620117,
"learning_rate": 4.337979094076655e-05,
"loss": 0.7025,
"step": 3540
},
{
"epoch": 4.539641943734015,
"grad_norm": 13.457611083984375,
"learning_rate": 4.335801393728223e-05,
"loss": 0.3561,
"step": 3550
},
{
"epoch": 4.552429667519181,
"grad_norm": 8.7972993850708,
"learning_rate": 4.333623693379791e-05,
"loss": 0.8934,
"step": 3560
},
{
"epoch": 4.565217391304348,
"grad_norm": 4.532437324523926,
"learning_rate": 4.331445993031359e-05,
"loss": 0.6173,
"step": 3570
},
{
"epoch": 4.578005115089514,
"grad_norm": 9.719355583190918,
"learning_rate": 4.329268292682927e-05,
"loss": 0.5608,
"step": 3580
},
{
"epoch": 4.59079283887468,
"grad_norm": 10.024927139282227,
"learning_rate": 4.327090592334495e-05,
"loss": 1.2261,
"step": 3590
},
{
"epoch": 4.603580562659847,
"grad_norm": 8.742612838745117,
"learning_rate": 4.324912891986063e-05,
"loss": 0.6297,
"step": 3600
},
{
"epoch": 4.616368286445013,
"grad_norm": 4.9773077964782715,
"learning_rate": 4.322735191637631e-05,
"loss": 0.3228,
"step": 3610
},
{
"epoch": 4.629156010230179,
"grad_norm": 18.090656280517578,
"learning_rate": 4.320557491289199e-05,
"loss": 0.6028,
"step": 3620
},
{
"epoch": 4.641943734015345,
"grad_norm": 10.379446029663086,
"learning_rate": 4.318379790940767e-05,
"loss": 0.9047,
"step": 3630
},
{
"epoch": 4.654731457800511,
"grad_norm": 10.179900169372559,
"learning_rate": 4.3162020905923346e-05,
"loss": 0.4727,
"step": 3640
},
{
"epoch": 4.667519181585678,
"grad_norm": 9.0844144821167,
"learning_rate": 4.314024390243903e-05,
"loss": 0.8556,
"step": 3650
},
{
"epoch": 4.680306905370844,
"grad_norm": 8.229222297668457,
"learning_rate": 4.311846689895471e-05,
"loss": 0.5654,
"step": 3660
},
{
"epoch": 4.69309462915601,
"grad_norm": 2.8442490100860596,
"learning_rate": 4.309668989547039e-05,
"loss": 0.3812,
"step": 3670
},
{
"epoch": 4.705882352941177,
"grad_norm": 14.046589851379395,
"learning_rate": 4.3074912891986066e-05,
"loss": 1.0255,
"step": 3680
},
{
"epoch": 4.718670076726343,
"grad_norm": 1.3973981142044067,
"learning_rate": 4.305313588850175e-05,
"loss": 0.6829,
"step": 3690
},
{
"epoch": 4.731457800511509,
"grad_norm": 4.366006374359131,
"learning_rate": 4.303135888501742e-05,
"loss": 0.7007,
"step": 3700
},
{
"epoch": 4.744245524296675,
"grad_norm": 9.728074073791504,
"learning_rate": 4.30095818815331e-05,
"loss": 0.6583,
"step": 3710
},
{
"epoch": 4.757033248081841,
"grad_norm": 7.475890159606934,
"learning_rate": 4.298780487804878e-05,
"loss": 0.7781,
"step": 3720
},
{
"epoch": 4.7698209718670075,
"grad_norm": 0.4485682249069214,
"learning_rate": 4.296602787456446e-05,
"loss": 0.6201,
"step": 3730
},
{
"epoch": 4.782608695652174,
"grad_norm": 13.75088882446289,
"learning_rate": 4.294425087108014e-05,
"loss": 0.8055,
"step": 3740
},
{
"epoch": 4.79539641943734,
"grad_norm": 10.574711799621582,
"learning_rate": 4.292247386759582e-05,
"loss": 0.6285,
"step": 3750
},
{
"epoch": 4.8081841432225065,
"grad_norm": 24.870988845825195,
"learning_rate": 4.29006968641115e-05,
"loss": 0.3683,
"step": 3760
},
{
"epoch": 4.820971867007673,
"grad_norm": 2.3566386699676514,
"learning_rate": 4.287891986062718e-05,
"loss": 0.397,
"step": 3770
},
{
"epoch": 4.833759590792839,
"grad_norm": 0.8658470511436462,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.4488,
"step": 3780
},
{
"epoch": 4.846547314578006,
"grad_norm": 10.955270767211914,
"learning_rate": 4.283536585365854e-05,
"loss": 0.8643,
"step": 3790
},
{
"epoch": 4.859335038363171,
"grad_norm": 0.5680179595947266,
"learning_rate": 4.281358885017422e-05,
"loss": 0.6409,
"step": 3800
},
{
"epoch": 4.872122762148337,
"grad_norm": 0.8693060874938965,
"learning_rate": 4.27918118466899e-05,
"loss": 0.6387,
"step": 3810
},
{
"epoch": 4.884910485933504,
"grad_norm": 7.04971170425415,
"learning_rate": 4.2770034843205577e-05,
"loss": 0.8097,
"step": 3820
},
{
"epoch": 4.89769820971867,
"grad_norm": 11.212493896484375,
"learning_rate": 4.274825783972126e-05,
"loss": 0.6476,
"step": 3830
},
{
"epoch": 4.910485933503836,
"grad_norm": 3.368633270263672,
"learning_rate": 4.272648083623694e-05,
"loss": 0.7214,
"step": 3840
},
{
"epoch": 4.923273657289003,
"grad_norm": 3.019455671310425,
"learning_rate": 4.2704703832752615e-05,
"loss": 0.3225,
"step": 3850
},
{
"epoch": 4.936061381074169,
"grad_norm": 2.099149465560913,
"learning_rate": 4.26829268292683e-05,
"loss": 0.4397,
"step": 3860
},
{
"epoch": 4.948849104859335,
"grad_norm": 3.1926653385162354,
"learning_rate": 4.266114982578397e-05,
"loss": 0.522,
"step": 3870
},
{
"epoch": 4.961636828644501,
"grad_norm": 9.113776206970215,
"learning_rate": 4.2639372822299653e-05,
"loss": 0.6778,
"step": 3880
},
{
"epoch": 4.974424552429667,
"grad_norm": 0.5909568071365356,
"learning_rate": 4.261759581881533e-05,
"loss": 0.7403,
"step": 3890
},
{
"epoch": 4.987212276214834,
"grad_norm": 7.393810749053955,
"learning_rate": 4.259581881533101e-05,
"loss": 0.6318,
"step": 3900
},
{
"epoch": 5.0,
"grad_norm": 8.818605422973633,
"learning_rate": 4.257404181184669e-05,
"loss": 0.5702,
"step": 3910
},
{
"epoch": 5.0,
"eval_loss": 0.2965039610862732,
"eval_runtime": 0.8112,
"eval_samples_per_second": 120.812,
"eval_steps_per_second": 16.026,
"step": 3910
},
{
"epoch": 5.012787723785166,
"grad_norm": 6.940534591674805,
"learning_rate": 4.255226480836237e-05,
"loss": 0.6806,
"step": 3920
},
{
"epoch": 5.025575447570333,
"grad_norm": 0.07264159619808197,
"learning_rate": 4.253048780487805e-05,
"loss": 0.7137,
"step": 3930
},
{
"epoch": 5.038363171355499,
"grad_norm": 0.6328474283218384,
"learning_rate": 4.250871080139373e-05,
"loss": 0.5638,
"step": 3940
},
{
"epoch": 5.051150895140665,
"grad_norm": 0.7308364510536194,
"learning_rate": 4.248693379790941e-05,
"loss": 0.7956,
"step": 3950
},
{
"epoch": 5.063938618925831,
"grad_norm": 1.2225631475448608,
"learning_rate": 4.246515679442509e-05,
"loss": 0.5388,
"step": 3960
},
{
"epoch": 5.076726342710997,
"grad_norm": 3.707005023956299,
"learning_rate": 4.244337979094077e-05,
"loss": 0.4415,
"step": 3970
},
{
"epoch": 5.089514066496164,
"grad_norm": 5.035332679748535,
"learning_rate": 4.242160278745645e-05,
"loss": 0.3167,
"step": 3980
},
{
"epoch": 5.10230179028133,
"grad_norm": 8.26025676727295,
"learning_rate": 4.239982578397213e-05,
"loss": 0.3634,
"step": 3990
},
{
"epoch": 5.115089514066496,
"grad_norm": 0.4538026452064514,
"learning_rate": 4.237804878048781e-05,
"loss": 0.4083,
"step": 4000
},
{
"epoch": 5.127877237851663,
"grad_norm": 2.740149736404419,
"learning_rate": 4.235627177700349e-05,
"loss": 0.1352,
"step": 4010
},
{
"epoch": 5.140664961636829,
"grad_norm": 2.6353447437286377,
"learning_rate": 4.233449477351917e-05,
"loss": 0.5863,
"step": 4020
},
{
"epoch": 5.153452685421995,
"grad_norm": 2.091527223587036,
"learning_rate": 4.2312717770034846e-05,
"loss": 0.398,
"step": 4030
},
{
"epoch": 5.166240409207161,
"grad_norm": 2.8681185245513916,
"learning_rate": 4.229094076655053e-05,
"loss": 0.4215,
"step": 4040
},
{
"epoch": 5.179028132992327,
"grad_norm": 8.741495132446289,
"learning_rate": 4.22691637630662e-05,
"loss": 0.6843,
"step": 4050
},
{
"epoch": 5.1918158567774935,
"grad_norm": 9.90984058380127,
"learning_rate": 4.2247386759581884e-05,
"loss": 0.9233,
"step": 4060
},
{
"epoch": 5.20460358056266,
"grad_norm": 2.038926601409912,
"learning_rate": 4.222560975609756e-05,
"loss": 0.5204,
"step": 4070
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.1147674098610878,
"learning_rate": 4.220383275261324e-05,
"loss": 0.342,
"step": 4080
},
{
"epoch": 5.2301790281329925,
"grad_norm": 51.04707717895508,
"learning_rate": 4.218205574912892e-05,
"loss": 0.969,
"step": 4090
},
{
"epoch": 5.242966751918159,
"grad_norm": 0.40241485834121704,
"learning_rate": 4.21602787456446e-05,
"loss": 0.3971,
"step": 4100
},
{
"epoch": 5.255754475703325,
"grad_norm": 1.017622709274292,
"learning_rate": 4.213850174216028e-05,
"loss": 0.6763,
"step": 4110
},
{
"epoch": 5.268542199488491,
"grad_norm": 11.975500106811523,
"learning_rate": 4.211672473867596e-05,
"loss": 0.9518,
"step": 4120
},
{
"epoch": 5.281329923273657,
"grad_norm": 8.183752059936523,
"learning_rate": 4.209494773519164e-05,
"loss": 0.5027,
"step": 4130
},
{
"epoch": 5.294117647058823,
"grad_norm": 2.0323333740234375,
"learning_rate": 4.207317073170732e-05,
"loss": 0.5327,
"step": 4140
},
{
"epoch": 5.30690537084399,
"grad_norm": 9.58705997467041,
"learning_rate": 4.2051393728223e-05,
"loss": 0.7124,
"step": 4150
},
{
"epoch": 5.319693094629156,
"grad_norm": 2.840604066848755,
"learning_rate": 4.202961672473868e-05,
"loss": 0.5643,
"step": 4160
},
{
"epoch": 5.332480818414322,
"grad_norm": 4.099714756011963,
"learning_rate": 4.2007839721254356e-05,
"loss": 0.9582,
"step": 4170
},
{
"epoch": 5.345268542199489,
"grad_norm": 10.31187629699707,
"learning_rate": 4.198606271777004e-05,
"loss": 0.494,
"step": 4180
},
{
"epoch": 5.358056265984655,
"grad_norm": 3.8303794860839844,
"learning_rate": 4.196428571428572e-05,
"loss": 0.4447,
"step": 4190
},
{
"epoch": 5.370843989769821,
"grad_norm": 7.313268661499023,
"learning_rate": 4.1942508710801395e-05,
"loss": 0.5681,
"step": 4200
},
{
"epoch": 5.383631713554987,
"grad_norm": 9.197836875915527,
"learning_rate": 4.1920731707317077e-05,
"loss": 1.0159,
"step": 4210
},
{
"epoch": 5.396419437340153,
"grad_norm": 3.0132012367248535,
"learning_rate": 4.189895470383275e-05,
"loss": 0.533,
"step": 4220
},
{
"epoch": 5.40920716112532,
"grad_norm": 9.22429084777832,
"learning_rate": 4.187717770034843e-05,
"loss": 0.5966,
"step": 4230
},
{
"epoch": 5.421994884910486,
"grad_norm": 13.826210975646973,
"learning_rate": 4.185540069686411e-05,
"loss": 0.4478,
"step": 4240
},
{
"epoch": 5.434782608695652,
"grad_norm": 11.591147422790527,
"learning_rate": 4.183362369337979e-05,
"loss": 0.7064,
"step": 4250
},
{
"epoch": 5.447570332480819,
"grad_norm": 12.315413475036621,
"learning_rate": 4.181184668989547e-05,
"loss": 1.0166,
"step": 4260
},
{
"epoch": 5.460358056265985,
"grad_norm": 10.030841827392578,
"learning_rate": 4.1790069686411153e-05,
"loss": 0.537,
"step": 4270
},
{
"epoch": 5.4731457800511505,
"grad_norm": 11.139789581298828,
"learning_rate": 4.176829268292683e-05,
"loss": 0.6873,
"step": 4280
},
{
"epoch": 5.485933503836317,
"grad_norm": 0.15211737155914307,
"learning_rate": 4.174651567944251e-05,
"loss": 0.5206,
"step": 4290
},
{
"epoch": 5.498721227621483,
"grad_norm": 12.38772964477539,
"learning_rate": 4.172473867595819e-05,
"loss": 0.4893,
"step": 4300
},
{
"epoch": 5.5115089514066495,
"grad_norm": 8.11055850982666,
"learning_rate": 4.170296167247387e-05,
"loss": 0.424,
"step": 4310
},
{
"epoch": 5.524296675191816,
"grad_norm": 18.673229217529297,
"learning_rate": 4.168118466898955e-05,
"loss": 0.8048,
"step": 4320
},
{
"epoch": 5.537084398976982,
"grad_norm": 15.203030586242676,
"learning_rate": 4.165940766550523e-05,
"loss": 0.9721,
"step": 4330
},
{
"epoch": 5.549872122762149,
"grad_norm": 5.294841766357422,
"learning_rate": 4.163763066202091e-05,
"loss": 0.7572,
"step": 4340
},
{
"epoch": 5.562659846547315,
"grad_norm": 3.338132858276367,
"learning_rate": 4.161585365853659e-05,
"loss": 0.911,
"step": 4350
},
{
"epoch": 5.57544757033248,
"grad_norm": 0.901577889919281,
"learning_rate": 4.159407665505227e-05,
"loss": 0.7884,
"step": 4360
},
{
"epoch": 5.588235294117647,
"grad_norm": 3.517756223678589,
"learning_rate": 4.157229965156795e-05,
"loss": 0.324,
"step": 4370
},
{
"epoch": 5.601023017902813,
"grad_norm": 0.7338356971740723,
"learning_rate": 4.1550522648083626e-05,
"loss": 0.4658,
"step": 4380
},
{
"epoch": 5.6138107416879794,
"grad_norm": 3.6566998958587646,
"learning_rate": 4.152874564459931e-05,
"loss": 0.9993,
"step": 4390
},
{
"epoch": 5.626598465473146,
"grad_norm": 2.2101399898529053,
"learning_rate": 4.150696864111498e-05,
"loss": 0.4704,
"step": 4400
},
{
"epoch": 5.639386189258312,
"grad_norm": 4.851430892944336,
"learning_rate": 4.1485191637630664e-05,
"loss": 0.4984,
"step": 4410
},
{
"epoch": 5.6521739130434785,
"grad_norm": 5.132842063903809,
"learning_rate": 4.146341463414634e-05,
"loss": 0.3548,
"step": 4420
},
{
"epoch": 5.664961636828645,
"grad_norm": 4.944459438323975,
"learning_rate": 4.144163763066202e-05,
"loss": 0.556,
"step": 4430
},
{
"epoch": 5.677749360613811,
"grad_norm": 2.2115225791931152,
"learning_rate": 4.14198606271777e-05,
"loss": 0.6975,
"step": 4440
},
{
"epoch": 5.690537084398977,
"grad_norm": 2.7226309776306152,
"learning_rate": 4.1398083623693384e-05,
"loss": 0.4694,
"step": 4450
},
{
"epoch": 5.703324808184143,
"grad_norm": 7.218973159790039,
"learning_rate": 4.137630662020906e-05,
"loss": 0.3918,
"step": 4460
},
{
"epoch": 5.716112531969309,
"grad_norm": 0.470683753490448,
"learning_rate": 4.135452961672474e-05,
"loss": 0.719,
"step": 4470
},
{
"epoch": 5.728900255754476,
"grad_norm": 0.4735340178012848,
"learning_rate": 4.133275261324042e-05,
"loss": 0.4324,
"step": 4480
},
{
"epoch": 5.741687979539642,
"grad_norm": 2.4921889305114746,
"learning_rate": 4.13109756097561e-05,
"loss": 0.6334,
"step": 4490
},
{
"epoch": 5.754475703324808,
"grad_norm": 7.904917240142822,
"learning_rate": 4.128919860627178e-05,
"loss": 0.7718,
"step": 4500
},
{
"epoch": 5.767263427109975,
"grad_norm": 5.907204627990723,
"learning_rate": 4.126742160278746e-05,
"loss": 0.6752,
"step": 4510
},
{
"epoch": 5.78005115089514,
"grad_norm": 22.04534912109375,
"learning_rate": 4.124564459930314e-05,
"loss": 0.5174,
"step": 4520
},
{
"epoch": 5.792838874680307,
"grad_norm": 5.223736763000488,
"learning_rate": 4.122386759581882e-05,
"loss": 0.4966,
"step": 4530
},
{
"epoch": 5.805626598465473,
"grad_norm": 1.3607535362243652,
"learning_rate": 4.12020905923345e-05,
"loss": 0.3876,
"step": 4540
},
{
"epoch": 5.818414322250639,
"grad_norm": 3.3651671409606934,
"learning_rate": 4.1180313588850175e-05,
"loss": 0.4492,
"step": 4550
},
{
"epoch": 5.831202046035806,
"grad_norm": 3.5978453159332275,
"learning_rate": 4.1158536585365856e-05,
"loss": 0.5008,
"step": 4560
},
{
"epoch": 5.843989769820972,
"grad_norm": 15.397109031677246,
"learning_rate": 4.113675958188153e-05,
"loss": 0.6904,
"step": 4570
},
{
"epoch": 5.856777493606138,
"grad_norm": 4.293745040893555,
"learning_rate": 4.111498257839721e-05,
"loss": 0.3919,
"step": 4580
},
{
"epoch": 5.869565217391305,
"grad_norm": 0.40358975529670715,
"learning_rate": 4.1093205574912895e-05,
"loss": 0.372,
"step": 4590
},
{
"epoch": 5.882352941176471,
"grad_norm": 6.08713960647583,
"learning_rate": 4.107142857142857e-05,
"loss": 0.4161,
"step": 4600
},
{
"epoch": 5.8951406649616365,
"grad_norm": 14.7813138961792,
"learning_rate": 4.104965156794425e-05,
"loss": 0.7926,
"step": 4610
},
{
"epoch": 5.907928388746803,
"grad_norm": 1.5015966892242432,
"learning_rate": 4.102787456445993e-05,
"loss": 0.564,
"step": 4620
},
{
"epoch": 5.920716112531969,
"grad_norm": 5.556253910064697,
"learning_rate": 4.100609756097561e-05,
"loss": 0.4083,
"step": 4630
},
{
"epoch": 5.9335038363171355,
"grad_norm": 4.575002670288086,
"learning_rate": 4.098432055749129e-05,
"loss": 0.7552,
"step": 4640
},
{
"epoch": 5.946291560102302,
"grad_norm": 7.121840476989746,
"learning_rate": 4.096254355400697e-05,
"loss": 0.2914,
"step": 4650
},
{
"epoch": 5.959079283887468,
"grad_norm": 8.14986801147461,
"learning_rate": 4.0940766550522653e-05,
"loss": 0.5904,
"step": 4660
},
{
"epoch": 5.971867007672635,
"grad_norm": 0.0028777301777154207,
"learning_rate": 4.091898954703833e-05,
"loss": 0.6281,
"step": 4670
},
{
"epoch": 5.9846547314578,
"grad_norm": 0.5826151371002197,
"learning_rate": 4.089721254355401e-05,
"loss": 0.6602,
"step": 4680
},
{
"epoch": 5.997442455242966,
"grad_norm": 3.5384085178375244,
"learning_rate": 4.087543554006969e-05,
"loss": 0.6806,
"step": 4690
},
{
"epoch": 6.0,
"eval_loss": 0.2815360426902771,
"eval_runtime": 0.985,
"eval_samples_per_second": 99.492,
"eval_steps_per_second": 13.198,
"step": 4692
},
{
"epoch": 6.010230179028133,
"grad_norm": 4.524169921875,
"learning_rate": 4.085365853658537e-05,
"loss": 0.4968,
"step": 4700
},
{
"epoch": 6.023017902813299,
"grad_norm": 7.202708721160889,
"learning_rate": 4.083188153310105e-05,
"loss": 0.3843,
"step": 4710
},
{
"epoch": 6.035805626598465,
"grad_norm": 0.15170826017856598,
"learning_rate": 4.081010452961673e-05,
"loss": 0.5316,
"step": 4720
},
{
"epoch": 6.048593350383632,
"grad_norm": 0.06313107907772064,
"learning_rate": 4.0788327526132405e-05,
"loss": 0.5724,
"step": 4730
},
{
"epoch": 6.061381074168798,
"grad_norm": 0.613458514213562,
"learning_rate": 4.076655052264808e-05,
"loss": 0.3703,
"step": 4740
},
{
"epoch": 6.0741687979539645,
"grad_norm": 5.777957916259766,
"learning_rate": 4.074477351916376e-05,
"loss": 0.9007,
"step": 4750
},
{
"epoch": 6.086956521739131,
"grad_norm": 2.215762138366699,
"learning_rate": 4.0722996515679444e-05,
"loss": 0.2629,
"step": 4760
},
{
"epoch": 6.099744245524296,
"grad_norm": 66.70785522460938,
"learning_rate": 4.070121951219512e-05,
"loss": 0.682,
"step": 4770
},
{
"epoch": 6.112531969309463,
"grad_norm": 0.04602469503879547,
"learning_rate": 4.06794425087108e-05,
"loss": 0.4461,
"step": 4780
},
{
"epoch": 6.125319693094629,
"grad_norm": 4.179677486419678,
"learning_rate": 4.065766550522648e-05,
"loss": 0.7186,
"step": 4790
},
{
"epoch": 6.138107416879795,
"grad_norm": 1.071020483970642,
"learning_rate": 4.0635888501742164e-05,
"loss": 0.1355,
"step": 4800
},
{
"epoch": 6.150895140664962,
"grad_norm": 1.201188564300537,
"learning_rate": 4.061411149825784e-05,
"loss": 0.2015,
"step": 4810
},
{
"epoch": 6.163682864450128,
"grad_norm": 3.307131290435791,
"learning_rate": 4.059233449477352e-05,
"loss": 0.3906,
"step": 4820
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.4368687868118286,
"learning_rate": 4.05705574912892e-05,
"loss": 0.8862,
"step": 4830
},
{
"epoch": 6.189258312020461,
"grad_norm": 8.978657722473145,
"learning_rate": 4.0548780487804884e-05,
"loss": 0.5239,
"step": 4840
},
{
"epoch": 6.202046035805626,
"grad_norm": 9.51962661743164,
"learning_rate": 4.052700348432056e-05,
"loss": 0.2829,
"step": 4850
},
{
"epoch": 6.2148337595907925,
"grad_norm": 5.0570807456970215,
"learning_rate": 4.050522648083624e-05,
"loss": 0.743,
"step": 4860
},
{
"epoch": 6.227621483375959,
"grad_norm": 10.152450561523438,
"learning_rate": 4.048344947735192e-05,
"loss": 1.0856,
"step": 4870
},
{
"epoch": 6.240409207161125,
"grad_norm": 4.6442084312438965,
"learning_rate": 4.04616724738676e-05,
"loss": 0.3863,
"step": 4880
},
{
"epoch": 6.253196930946292,
"grad_norm": 5.835144996643066,
"learning_rate": 4.043989547038328e-05,
"loss": 0.9037,
"step": 4890
},
{
"epoch": 6.265984654731458,
"grad_norm": 5.151255130767822,
"learning_rate": 4.0418118466898954e-05,
"loss": 0.6878,
"step": 4900
},
{
"epoch": 6.278772378516624,
"grad_norm": 0.5118243098258972,
"learning_rate": 4.0396341463414636e-05,
"loss": 0.8633,
"step": 4910
},
{
"epoch": 6.291560102301791,
"grad_norm": 6.059098243713379,
"learning_rate": 4.037456445993031e-05,
"loss": 0.4825,
"step": 4920
},
{
"epoch": 6.304347826086957,
"grad_norm": 0.20288658142089844,
"learning_rate": 4.035278745644599e-05,
"loss": 0.3513,
"step": 4930
},
{
"epoch": 6.3171355498721224,
"grad_norm": 0.656362771987915,
"learning_rate": 4.0331010452961675e-05,
"loss": 0.5211,
"step": 4940
},
{
"epoch": 6.329923273657289,
"grad_norm": 8.95826244354248,
"learning_rate": 4.030923344947735e-05,
"loss": 0.5175,
"step": 4950
},
{
"epoch": 6.342710997442455,
"grad_norm": 0.8376319408416748,
"learning_rate": 4.028745644599303e-05,
"loss": 0.2661,
"step": 4960
},
{
"epoch": 6.3554987212276215,
"grad_norm": 2.73271107673645,
"learning_rate": 4.026567944250871e-05,
"loss": 0.3016,
"step": 4970
},
{
"epoch": 6.368286445012788,
"grad_norm": 8.35566520690918,
"learning_rate": 4.0243902439024395e-05,
"loss": 0.3719,
"step": 4980
},
{
"epoch": 6.381074168797954,
"grad_norm": 5.748067378997803,
"learning_rate": 4.022212543554007e-05,
"loss": 0.5457,
"step": 4990
},
{
"epoch": 6.3938618925831205,
"grad_norm": 3.471608877182007,
"learning_rate": 4.020034843205575e-05,
"loss": 0.6285,
"step": 5000
},
{
"epoch": 6.406649616368286,
"grad_norm": 0.2684521973133087,
"learning_rate": 4.017857142857143e-05,
"loss": 0.2251,
"step": 5010
},
{
"epoch": 6.419437340153452,
"grad_norm": 9.472249984741211,
"learning_rate": 4.015679442508711e-05,
"loss": 0.5518,
"step": 5020
},
{
"epoch": 6.432225063938619,
"grad_norm": 0.46002939343452454,
"learning_rate": 4.013501742160279e-05,
"loss": 0.6917,
"step": 5030
},
{
"epoch": 6.445012787723785,
"grad_norm": 6.503478527069092,
"learning_rate": 4.011324041811847e-05,
"loss": 0.8197,
"step": 5040
},
{
"epoch": 6.457800511508951,
"grad_norm": 7.9507036209106445,
"learning_rate": 4.0091463414634153e-05,
"loss": 0.4216,
"step": 5050
},
{
"epoch": 6.470588235294118,
"grad_norm": 7.5792412757873535,
"learning_rate": 4.006968641114983e-05,
"loss": 0.714,
"step": 5060
},
{
"epoch": 6.483375959079284,
"grad_norm": 3.8649039268493652,
"learning_rate": 4.004790940766551e-05,
"loss": 0.9393,
"step": 5070
},
{
"epoch": 6.4961636828644505,
"grad_norm": 10.020180702209473,
"learning_rate": 4.0026132404181185e-05,
"loss": 0.5577,
"step": 5080
},
{
"epoch": 6.508951406649617,
"grad_norm": 8.545390129089355,
"learning_rate": 4.000435540069686e-05,
"loss": 0.7541,
"step": 5090
},
{
"epoch": 6.521739130434782,
"grad_norm": 8.494593620300293,
"learning_rate": 3.998257839721254e-05,
"loss": 0.948,
"step": 5100
},
{
"epoch": 6.534526854219949,
"grad_norm": 4.0168352127075195,
"learning_rate": 3.9960801393728224e-05,
"loss": 0.428,
"step": 5110
},
{
"epoch": 6.547314578005115,
"grad_norm": 4.527425289154053,
"learning_rate": 3.9939024390243905e-05,
"loss": 0.598,
"step": 5120
},
{
"epoch": 6.560102301790281,
"grad_norm": 6.2703537940979,
"learning_rate": 3.991724738675958e-05,
"loss": 0.5118,
"step": 5130
},
{
"epoch": 6.572890025575448,
"grad_norm": 3.926626682281494,
"learning_rate": 3.989547038327526e-05,
"loss": 0.6562,
"step": 5140
},
{
"epoch": 6.585677749360614,
"grad_norm": 1.1774628162384033,
"learning_rate": 3.9873693379790944e-05,
"loss": 0.5402,
"step": 5150
},
{
"epoch": 6.59846547314578,
"grad_norm": 4.933875560760498,
"learning_rate": 3.985191637630662e-05,
"loss": 0.4469,
"step": 5160
},
{
"epoch": 6.611253196930946,
"grad_norm": 4.4834184646606445,
"learning_rate": 3.98301393728223e-05,
"loss": 0.5847,
"step": 5170
},
{
"epoch": 6.624040920716112,
"grad_norm": 0.006693384610116482,
"learning_rate": 3.980836236933798e-05,
"loss": 0.245,
"step": 5180
},
{
"epoch": 6.6368286445012785,
"grad_norm": 5.899873733520508,
"learning_rate": 3.9786585365853664e-05,
"loss": 0.4535,
"step": 5190
},
{
"epoch": 6.649616368286445,
"grad_norm": 8.414173126220703,
"learning_rate": 3.976480836236934e-05,
"loss": 0.8365,
"step": 5200
},
{
"epoch": 6.662404092071611,
"grad_norm": 5.519989967346191,
"learning_rate": 3.974303135888502e-05,
"loss": 0.4639,
"step": 5210
},
{
"epoch": 6.675191815856778,
"grad_norm": 3.984368085861206,
"learning_rate": 3.97212543554007e-05,
"loss": 0.4365,
"step": 5220
},
{
"epoch": 6.687979539641944,
"grad_norm": 7.652017593383789,
"learning_rate": 3.9699477351916384e-05,
"loss": 0.3163,
"step": 5230
},
{
"epoch": 6.70076726342711,
"grad_norm": 1.4251798391342163,
"learning_rate": 3.967770034843206e-05,
"loss": 0.4581,
"step": 5240
},
{
"epoch": 6.713554987212277,
"grad_norm": 4.420569896697998,
"learning_rate": 3.9655923344947734e-05,
"loss": 0.6572,
"step": 5250
},
{
"epoch": 6.726342710997442,
"grad_norm": 5.152069091796875,
"learning_rate": 3.9634146341463416e-05,
"loss": 0.1541,
"step": 5260
},
{
"epoch": 6.739130434782608,
"grad_norm": 3.8654263019561768,
"learning_rate": 3.961236933797909e-05,
"loss": 0.4638,
"step": 5270
},
{
"epoch": 6.751918158567775,
"grad_norm": 2.7325217723846436,
"learning_rate": 3.959059233449477e-05,
"loss": 0.3195,
"step": 5280
},
{
"epoch": 6.764705882352941,
"grad_norm": 1.6292638778686523,
"learning_rate": 3.9568815331010454e-05,
"loss": 0.3065,
"step": 5290
},
{
"epoch": 6.7774936061381075,
"grad_norm": 3.9360060691833496,
"learning_rate": 3.9547038327526136e-05,
"loss": 0.5371,
"step": 5300
},
{
"epoch": 6.790281329923274,
"grad_norm": 8.34693431854248,
"learning_rate": 3.952526132404181e-05,
"loss": 0.6615,
"step": 5310
},
{
"epoch": 6.80306905370844,
"grad_norm": 5.481771469116211,
"learning_rate": 3.950348432055749e-05,
"loss": 0.2692,
"step": 5320
},
{
"epoch": 6.8158567774936065,
"grad_norm": 0.8621829152107239,
"learning_rate": 3.9481707317073175e-05,
"loss": 0.4334,
"step": 5330
},
{
"epoch": 6.828644501278772,
"grad_norm": 2.479590654373169,
"learning_rate": 3.945993031358885e-05,
"loss": 0.5803,
"step": 5340
},
{
"epoch": 6.841432225063938,
"grad_norm": 2.5847859382629395,
"learning_rate": 3.943815331010453e-05,
"loss": 0.2913,
"step": 5350
},
{
"epoch": 6.854219948849105,
"grad_norm": 0.5854724645614624,
"learning_rate": 3.941637630662021e-05,
"loss": 0.494,
"step": 5360
},
{
"epoch": 6.867007672634271,
"grad_norm": 0.038145843893289566,
"learning_rate": 3.9394599303135895e-05,
"loss": 0.5454,
"step": 5370
},
{
"epoch": 6.879795396419437,
"grad_norm": 0.38679739832878113,
"learning_rate": 3.937282229965157e-05,
"loss": 0.2765,
"step": 5380
},
{
"epoch": 6.892583120204604,
"grad_norm": 0.7759974598884583,
"learning_rate": 3.935104529616725e-05,
"loss": 0.5901,
"step": 5390
},
{
"epoch": 6.90537084398977,
"grad_norm": 3.629659652709961,
"learning_rate": 3.932926829268293e-05,
"loss": 0.8057,
"step": 5400
},
{
"epoch": 6.918158567774936,
"grad_norm": 0.9093683362007141,
"learning_rate": 3.930749128919861e-05,
"loss": 0.2888,
"step": 5410
},
{
"epoch": 6.930946291560103,
"grad_norm": 1.096061110496521,
"learning_rate": 3.928571428571429e-05,
"loss": 0.6122,
"step": 5420
},
{
"epoch": 6.943734015345268,
"grad_norm": 1.6472046375274658,
"learning_rate": 3.9263937282229965e-05,
"loss": 0.3792,
"step": 5430
},
{
"epoch": 6.956521739130435,
"grad_norm": 12.249022483825684,
"learning_rate": 3.924216027874565e-05,
"loss": 0.8298,
"step": 5440
},
{
"epoch": 6.969309462915601,
"grad_norm": 14.496586799621582,
"learning_rate": 3.922038327526132e-05,
"loss": 0.6759,
"step": 5450
},
{
"epoch": 6.982097186700767,
"grad_norm": 7.142574310302734,
"learning_rate": 3.9198606271777003e-05,
"loss": 0.4921,
"step": 5460
},
{
"epoch": 6.994884910485934,
"grad_norm": 1.1950016021728516,
"learning_rate": 3.9176829268292685e-05,
"loss": 0.3,
"step": 5470
},
{
"epoch": 7.0,
"eval_loss": 0.2577835023403168,
"eval_runtime": 0.976,
"eval_samples_per_second": 100.408,
"eval_steps_per_second": 13.319,
"step": 5474
},
{
"epoch": 7.0076726342711,
"grad_norm": 1.785232424736023,
"learning_rate": 3.915505226480836e-05,
"loss": 0.2541,
"step": 5480
},
{
"epoch": 7.020460358056266,
"grad_norm": 5.674078464508057,
"learning_rate": 3.913327526132404e-05,
"loss": 0.5632,
"step": 5490
},
{
"epoch": 7.033248081841432,
"grad_norm": 4.966092109680176,
"learning_rate": 3.9111498257839724e-05,
"loss": 0.2459,
"step": 5500
},
{
"epoch": 7.046035805626598,
"grad_norm": 5.083249092102051,
"learning_rate": 3.9089721254355405e-05,
"loss": 0.4098,
"step": 5510
},
{
"epoch": 7.0588235294117645,
"grad_norm": 7.468497276306152,
"learning_rate": 3.906794425087108e-05,
"loss": 0.4966,
"step": 5520
},
{
"epoch": 7.071611253196931,
"grad_norm": 2.9461817741394043,
"learning_rate": 3.904616724738676e-05,
"loss": 0.624,
"step": 5530
},
{
"epoch": 7.084398976982097,
"grad_norm": 0.009090915322303772,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.3501,
"step": 5540
},
{
"epoch": 7.0971867007672635,
"grad_norm": 5.245519638061523,
"learning_rate": 3.900261324041812e-05,
"loss": 0.8218,
"step": 5550
},
{
"epoch": 7.10997442455243,
"grad_norm": 5.124762535095215,
"learning_rate": 3.89808362369338e-05,
"loss": 0.2849,
"step": 5560
},
{
"epoch": 7.122762148337596,
"grad_norm": 7.454417705535889,
"learning_rate": 3.895905923344948e-05,
"loss": 0.4737,
"step": 5570
},
{
"epoch": 7.135549872122763,
"grad_norm": 5.743339538574219,
"learning_rate": 3.8937282229965164e-05,
"loss": 0.4892,
"step": 5580
},
{
"epoch": 7.148337595907928,
"grad_norm": 6.936069488525391,
"learning_rate": 3.891550522648084e-05,
"loss": 0.2564,
"step": 5590
},
{
"epoch": 7.161125319693094,
"grad_norm": 1.4387210607528687,
"learning_rate": 3.8893728222996514e-05,
"loss": 0.57,
"step": 5600
},
{
"epoch": 7.173913043478261,
"grad_norm": 0.5554598569869995,
"learning_rate": 3.8871951219512196e-05,
"loss": 0.4544,
"step": 5610
},
{
"epoch": 7.186700767263427,
"grad_norm": 0.05604790896177292,
"learning_rate": 3.885017421602787e-05,
"loss": 0.532,
"step": 5620
},
{
"epoch": 7.1994884910485935,
"grad_norm": 1.4710054397583008,
"learning_rate": 3.882839721254355e-05,
"loss": 0.441,
"step": 5630
},
{
"epoch": 7.21227621483376,
"grad_norm": 3.6691761016845703,
"learning_rate": 3.8806620209059234e-05,
"loss": 0.2938,
"step": 5640
},
{
"epoch": 7.225063938618926,
"grad_norm": 0.6838319897651672,
"learning_rate": 3.8784843205574916e-05,
"loss": 0.4238,
"step": 5650
},
{
"epoch": 7.2378516624040925,
"grad_norm": 1.6891676187515259,
"learning_rate": 3.876306620209059e-05,
"loss": 0.3278,
"step": 5660
},
{
"epoch": 7.250639386189258,
"grad_norm": 1.7671685218811035,
"learning_rate": 3.874128919860627e-05,
"loss": 0.3804,
"step": 5670
},
{
"epoch": 7.263427109974424,
"grad_norm": 0.5737038850784302,
"learning_rate": 3.8719512195121954e-05,
"loss": 0.6363,
"step": 5680
},
{
"epoch": 7.276214833759591,
"grad_norm": 4.315406799316406,
"learning_rate": 3.8697735191637636e-05,
"loss": 0.4526,
"step": 5690
},
{
"epoch": 7.289002557544757,
"grad_norm": 1.207388162612915,
"learning_rate": 3.867595818815331e-05,
"loss": 0.2925,
"step": 5700
},
{
"epoch": 7.301790281329923,
"grad_norm": 4.993164539337158,
"learning_rate": 3.865418118466899e-05,
"loss": 0.4904,
"step": 5710
},
{
"epoch": 7.31457800511509,
"grad_norm": 0.005563805811107159,
"learning_rate": 3.8632404181184675e-05,
"loss": 0.5732,
"step": 5720
},
{
"epoch": 7.327365728900256,
"grad_norm": 5.448471546173096,
"learning_rate": 3.861062717770035e-05,
"loss": 0.281,
"step": 5730
},
{
"epoch": 7.340153452685422,
"grad_norm": 1.2096017599105835,
"learning_rate": 3.858885017421603e-05,
"loss": 0.711,
"step": 5740
},
{
"epoch": 7.352941176470588,
"grad_norm": 3.929307222366333,
"learning_rate": 3.856707317073171e-05,
"loss": 0.6578,
"step": 5750
},
{
"epoch": 7.365728900255754,
"grad_norm": 3.536635160446167,
"learning_rate": 3.854529616724739e-05,
"loss": 0.3298,
"step": 5760
},
{
"epoch": 7.378516624040921,
"grad_norm": 0.848988950252533,
"learning_rate": 3.852351916376307e-05,
"loss": 0.4895,
"step": 5770
},
{
"epoch": 7.391304347826087,
"grad_norm": 8.26038646697998,
"learning_rate": 3.8501742160278745e-05,
"loss": 0.7074,
"step": 5780
},
{
"epoch": 7.404092071611253,
"grad_norm": 0.8512731194496155,
"learning_rate": 3.8479965156794427e-05,
"loss": 0.6852,
"step": 5790
},
{
"epoch": 7.41687979539642,
"grad_norm": 0.020886391401290894,
"learning_rate": 3.84581881533101e-05,
"loss": 0.5686,
"step": 5800
},
{
"epoch": 7.429667519181586,
"grad_norm": 2.0098299980163574,
"learning_rate": 3.843641114982578e-05,
"loss": 0.4906,
"step": 5810
},
{
"epoch": 7.442455242966752,
"grad_norm": 0.9866635799407959,
"learning_rate": 3.8414634146341465e-05,
"loss": 0.4961,
"step": 5820
},
{
"epoch": 7.455242966751918,
"grad_norm": 0.8240039944648743,
"learning_rate": 3.839285714285715e-05,
"loss": 0.4916,
"step": 5830
},
{
"epoch": 7.468030690537084,
"grad_norm": 4.04268217086792,
"learning_rate": 3.837108013937282e-05,
"loss": 0.7945,
"step": 5840
},
{
"epoch": 7.4808184143222505,
"grad_norm": 5.761824131011963,
"learning_rate": 3.8349303135888503e-05,
"loss": 0.6028,
"step": 5850
},
{
"epoch": 7.493606138107417,
"grad_norm": 14.809423446655273,
"learning_rate": 3.8327526132404185e-05,
"loss": 0.6021,
"step": 5860
},
{
"epoch": 7.506393861892583,
"grad_norm": 4.42140007019043,
"learning_rate": 3.830574912891986e-05,
"loss": 0.5033,
"step": 5870
},
{
"epoch": 7.5191815856777495,
"grad_norm": 5.502748966217041,
"learning_rate": 3.828397212543554e-05,
"loss": 0.2886,
"step": 5880
},
{
"epoch": 7.531969309462916,
"grad_norm": 0.987762451171875,
"learning_rate": 3.8262195121951224e-05,
"loss": 0.4663,
"step": 5890
},
{
"epoch": 7.544757033248082,
"grad_norm": 4.237513542175293,
"learning_rate": 3.8240418118466905e-05,
"loss": 0.5514,
"step": 5900
},
{
"epoch": 7.557544757033249,
"grad_norm": 3.0318350791931152,
"learning_rate": 3.821864111498258e-05,
"loss": 0.5669,
"step": 5910
},
{
"epoch": 7.570332480818414,
"grad_norm": 4.003817081451416,
"learning_rate": 3.819686411149826e-05,
"loss": 0.5131,
"step": 5920
},
{
"epoch": 7.58312020460358,
"grad_norm": 9.188379287719727,
"learning_rate": 3.8175087108013944e-05,
"loss": 0.4796,
"step": 5930
},
{
"epoch": 7.595907928388747,
"grad_norm": 0.7134082317352295,
"learning_rate": 3.815331010452962e-05,
"loss": 0.5296,
"step": 5940
},
{
"epoch": 7.608695652173913,
"grad_norm": 12.00949764251709,
"learning_rate": 3.8131533101045294e-05,
"loss": 0.6011,
"step": 5950
},
{
"epoch": 7.621483375959079,
"grad_norm": 4.412923336029053,
"learning_rate": 3.8109756097560976e-05,
"loss": 0.3967,
"step": 5960
},
{
"epoch": 7.634271099744246,
"grad_norm": 0.03846118226647377,
"learning_rate": 3.808797909407666e-05,
"loss": 0.4239,
"step": 5970
},
{
"epoch": 7.647058823529412,
"grad_norm": 6.899741172790527,
"learning_rate": 3.806620209059233e-05,
"loss": 0.5772,
"step": 5980
},
{
"epoch": 7.659846547314578,
"grad_norm": 1.2849925756454468,
"learning_rate": 3.8044425087108014e-05,
"loss": 0.4367,
"step": 5990
},
{
"epoch": 7.672634271099744,
"grad_norm": 0.005242053419351578,
"learning_rate": 3.8022648083623696e-05,
"loss": 0.1701,
"step": 6000
},
{
"epoch": 7.68542199488491,
"grad_norm": 5.540919780731201,
"learning_rate": 3.800087108013937e-05,
"loss": 0.6158,
"step": 6010
},
{
"epoch": 7.698209718670077,
"grad_norm": 0.8520113825798035,
"learning_rate": 3.797909407665505e-05,
"loss": 0.4181,
"step": 6020
},
{
"epoch": 7.710997442455243,
"grad_norm": 3.9124886989593506,
"learning_rate": 3.7957317073170734e-05,
"loss": 0.1994,
"step": 6030
},
{
"epoch": 7.723785166240409,
"grad_norm": 10.283561706542969,
"learning_rate": 3.7935540069686416e-05,
"loss": 0.4738,
"step": 6040
},
{
"epoch": 7.736572890025576,
"grad_norm": 0.4710548222064972,
"learning_rate": 3.791376306620209e-05,
"loss": 0.3861,
"step": 6050
},
{
"epoch": 7.749360613810742,
"grad_norm": 0.9403038024902344,
"learning_rate": 3.789198606271777e-05,
"loss": 0.2973,
"step": 6060
},
{
"epoch": 7.762148337595908,
"grad_norm": 7.959447860717773,
"learning_rate": 3.7870209059233454e-05,
"loss": 0.3342,
"step": 6070
},
{
"epoch": 7.774936061381074,
"grad_norm": 4.493699073791504,
"learning_rate": 3.7848432055749136e-05,
"loss": 0.645,
"step": 6080
},
{
"epoch": 7.78772378516624,
"grad_norm": 0.9736508131027222,
"learning_rate": 3.782665505226481e-05,
"loss": 0.5379,
"step": 6090
},
{
"epoch": 7.8005115089514065,
"grad_norm": 8.09215259552002,
"learning_rate": 3.780487804878049e-05,
"loss": 0.7049,
"step": 6100
},
{
"epoch": 7.813299232736573,
"grad_norm": 4.407011032104492,
"learning_rate": 3.778310104529617e-05,
"loss": 0.4454,
"step": 6110
},
{
"epoch": 7.826086956521739,
"grad_norm": 7.577841758728027,
"learning_rate": 3.776132404181185e-05,
"loss": 0.4755,
"step": 6120
},
{
"epoch": 7.838874680306906,
"grad_norm": 5.885177135467529,
"learning_rate": 3.7739547038327525e-05,
"loss": 0.3473,
"step": 6130
},
{
"epoch": 7.851662404092072,
"grad_norm": 3.9344217777252197,
"learning_rate": 3.7717770034843206e-05,
"loss": 0.2407,
"step": 6140
},
{
"epoch": 7.864450127877237,
"grad_norm": 0.8838673830032349,
"learning_rate": 3.769599303135889e-05,
"loss": 0.1372,
"step": 6150
},
{
"epoch": 7.877237851662404,
"grad_norm": 0.38971105217933655,
"learning_rate": 3.767421602787456e-05,
"loss": 0.3899,
"step": 6160
},
{
"epoch": 7.89002557544757,
"grad_norm": 15.273054122924805,
"learning_rate": 3.7652439024390245e-05,
"loss": 0.5726,
"step": 6170
},
{
"epoch": 7.9028132992327365,
"grad_norm": 0.9236469864845276,
"learning_rate": 3.7630662020905927e-05,
"loss": 0.3092,
"step": 6180
},
{
"epoch": 7.915601023017903,
"grad_norm": 9.27222728729248,
"learning_rate": 3.76088850174216e-05,
"loss": 0.503,
"step": 6190
},
{
"epoch": 7.928388746803069,
"grad_norm": 2.053985357284546,
"learning_rate": 3.758710801393728e-05,
"loss": 0.6894,
"step": 6200
},
{
"epoch": 7.9411764705882355,
"grad_norm": 8.98082447052002,
"learning_rate": 3.7565331010452965e-05,
"loss": 0.7604,
"step": 6210
},
{
"epoch": 7.953964194373402,
"grad_norm": 6.407993793487549,
"learning_rate": 3.754355400696865e-05,
"loss": 0.5022,
"step": 6220
},
{
"epoch": 7.966751918158568,
"grad_norm": 3.8097591400146484,
"learning_rate": 3.752177700348432e-05,
"loss": 0.6324,
"step": 6230
},
{
"epoch": 7.979539641943734,
"grad_norm": 3.067627429962158,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.7819,
"step": 6240
},
{
"epoch": 7.9923273657289,
"grad_norm": 3.3114781379699707,
"learning_rate": 3.7478222996515685e-05,
"loss": 0.3791,
"step": 6250
},
{
"epoch": 8.0,
"eval_loss": 0.24141965806484222,
"eval_runtime": 0.9886,
"eval_samples_per_second": 99.127,
"eval_steps_per_second": 13.15,
"step": 6256
},
{
"epoch": 8.005115089514067,
"grad_norm": 10.103604316711426,
"learning_rate": 3.745644599303136e-05,
"loss": 0.2827,
"step": 6260
},
{
"epoch": 8.017902813299234,
"grad_norm": 7.1643476486206055,
"learning_rate": 3.743466898954704e-05,
"loss": 0.6589,
"step": 6270
},
{
"epoch": 8.030690537084398,
"grad_norm": 5.2952423095703125,
"learning_rate": 3.741289198606272e-05,
"loss": 0.3989,
"step": 6280
},
{
"epoch": 8.043478260869565,
"grad_norm": 10.544265747070312,
"learning_rate": 3.73911149825784e-05,
"loss": 0.4586,
"step": 6290
},
{
"epoch": 8.05626598465473,
"grad_norm": 7.280800819396973,
"learning_rate": 3.7369337979094074e-05,
"loss": 0.5087,
"step": 6300
},
{
"epoch": 8.069053708439897,
"grad_norm": 4.117053031921387,
"learning_rate": 3.7347560975609755e-05,
"loss": 0.3276,
"step": 6310
},
{
"epoch": 8.081841432225064,
"grad_norm": 8.452136993408203,
"learning_rate": 3.732578397212544e-05,
"loss": 0.2142,
"step": 6320
},
{
"epoch": 8.09462915601023,
"grad_norm": 8.656983375549316,
"learning_rate": 3.730400696864111e-05,
"loss": 0.6328,
"step": 6330
},
{
"epoch": 8.107416879795396,
"grad_norm": 15.760722160339355,
"learning_rate": 3.7282229965156794e-05,
"loss": 0.3794,
"step": 6340
},
{
"epoch": 8.120204603580563,
"grad_norm": 0.8831185698509216,
"learning_rate": 3.7260452961672476e-05,
"loss": 0.2596,
"step": 6350
},
{
"epoch": 8.132992327365729,
"grad_norm": 3.0778136253356934,
"learning_rate": 3.723867595818816e-05,
"loss": 0.2729,
"step": 6360
},
{
"epoch": 8.145780051150895,
"grad_norm": 0.35585105419158936,
"learning_rate": 3.721689895470383e-05,
"loss": 0.2476,
"step": 6370
},
{
"epoch": 8.158567774936062,
"grad_norm": 1.6337690353393555,
"learning_rate": 3.7195121951219514e-05,
"loss": 0.4862,
"step": 6380
},
{
"epoch": 8.171355498721228,
"grad_norm": 2.012357711791992,
"learning_rate": 3.7173344947735196e-05,
"loss": 0.1733,
"step": 6390
},
{
"epoch": 8.184143222506394,
"grad_norm": 1.2589486837387085,
"learning_rate": 3.715156794425087e-05,
"loss": 0.514,
"step": 6400
},
{
"epoch": 8.19693094629156,
"grad_norm": 1.7067822217941284,
"learning_rate": 3.712979094076655e-05,
"loss": 0.4357,
"step": 6410
},
{
"epoch": 8.209718670076727,
"grad_norm": 0.9603003263473511,
"learning_rate": 3.7108013937282234e-05,
"loss": 0.7706,
"step": 6420
},
{
"epoch": 8.222506393861893,
"grad_norm": 0.010923515073955059,
"learning_rate": 3.7086236933797916e-05,
"loss": 0.4297,
"step": 6430
},
{
"epoch": 8.235294117647058,
"grad_norm": 6.709939956665039,
"learning_rate": 3.706445993031359e-05,
"loss": 0.4281,
"step": 6440
},
{
"epoch": 8.248081841432224,
"grad_norm": 0.016056306660175323,
"learning_rate": 3.704268292682927e-05,
"loss": 0.4342,
"step": 6450
},
{
"epoch": 8.26086956521739,
"grad_norm": 7.876967430114746,
"learning_rate": 3.702090592334495e-05,
"loss": 0.439,
"step": 6460
},
{
"epoch": 8.273657289002557,
"grad_norm": 0.3735317587852478,
"learning_rate": 3.699912891986063e-05,
"loss": 0.5231,
"step": 6470
},
{
"epoch": 8.286445012787723,
"grad_norm": 4.260231018066406,
"learning_rate": 3.6977351916376304e-05,
"loss": 0.0909,
"step": 6480
},
{
"epoch": 8.29923273657289,
"grad_norm": 0.04777367785573006,
"learning_rate": 3.6955574912891986e-05,
"loss": 0.2107,
"step": 6490
},
{
"epoch": 8.312020460358056,
"grad_norm": 7.151200294494629,
"learning_rate": 3.693379790940767e-05,
"loss": 0.4714,
"step": 6500
},
{
"epoch": 8.324808184143222,
"grad_norm": 10.462479591369629,
"learning_rate": 3.691202090592334e-05,
"loss": 0.6321,
"step": 6510
},
{
"epoch": 8.337595907928389,
"grad_norm": 6.835601329803467,
"learning_rate": 3.6890243902439025e-05,
"loss": 0.4306,
"step": 6520
},
{
"epoch": 8.350383631713555,
"grad_norm": 3.190450429916382,
"learning_rate": 3.6868466898954706e-05,
"loss": 0.3589,
"step": 6530
},
{
"epoch": 8.363171355498721,
"grad_norm": 8.741924285888672,
"learning_rate": 3.684668989547039e-05,
"loss": 0.5098,
"step": 6540
},
{
"epoch": 8.375959079283888,
"grad_norm": 1.7585572004318237,
"learning_rate": 3.682491289198606e-05,
"loss": 0.3946,
"step": 6550
},
{
"epoch": 8.388746803069054,
"grad_norm": 1.7931690216064453,
"learning_rate": 3.6803135888501745e-05,
"loss": 0.3422,
"step": 6560
},
{
"epoch": 8.40153452685422,
"grad_norm": 7.819911956787109,
"learning_rate": 3.6781358885017427e-05,
"loss": 0.4142,
"step": 6570
},
{
"epoch": 8.414322250639387,
"grad_norm": 6.050047874450684,
"learning_rate": 3.67595818815331e-05,
"loss": 0.8177,
"step": 6580
},
{
"epoch": 8.427109974424553,
"grad_norm": 12.437430381774902,
"learning_rate": 3.673780487804878e-05,
"loss": 0.3595,
"step": 6590
},
{
"epoch": 8.43989769820972,
"grad_norm": 0.0015749474987387657,
"learning_rate": 3.6716027874564465e-05,
"loss": 0.4131,
"step": 6600
},
{
"epoch": 8.452685421994884,
"grad_norm": 0.042624592781066895,
"learning_rate": 3.669425087108015e-05,
"loss": 0.5854,
"step": 6610
},
{
"epoch": 8.46547314578005,
"grad_norm": 5.490470886230469,
"learning_rate": 3.667247386759582e-05,
"loss": 0.4621,
"step": 6620
},
{
"epoch": 8.478260869565217,
"grad_norm": 5.620114326477051,
"learning_rate": 3.66506968641115e-05,
"loss": 0.6975,
"step": 6630
},
{
"epoch": 8.491048593350383,
"grad_norm": 2.929370641708374,
"learning_rate": 3.662891986062718e-05,
"loss": 0.3325,
"step": 6640
},
{
"epoch": 8.50383631713555,
"grad_norm": 0.27810731530189514,
"learning_rate": 3.6607142857142853e-05,
"loss": 0.4809,
"step": 6650
},
{
"epoch": 8.516624040920716,
"grad_norm": 14.763570785522461,
"learning_rate": 3.6585365853658535e-05,
"loss": 0.5995,
"step": 6660
},
{
"epoch": 8.529411764705882,
"grad_norm": 4.998637676239014,
"learning_rate": 3.656358885017422e-05,
"loss": 0.1925,
"step": 6670
},
{
"epoch": 8.542199488491049,
"grad_norm": 11.907294273376465,
"learning_rate": 3.65418118466899e-05,
"loss": 0.5023,
"step": 6680
},
{
"epoch": 8.554987212276215,
"grad_norm": 1.786744236946106,
"learning_rate": 3.6520034843205574e-05,
"loss": 0.4069,
"step": 6690
},
{
"epoch": 8.567774936061381,
"grad_norm": 8.682933807373047,
"learning_rate": 3.6498257839721255e-05,
"loss": 0.5076,
"step": 6700
},
{
"epoch": 8.580562659846548,
"grad_norm": 2.2297000885009766,
"learning_rate": 3.647648083623694e-05,
"loss": 0.4252,
"step": 6710
},
{
"epoch": 8.593350383631714,
"grad_norm": 4.3645100593566895,
"learning_rate": 3.645470383275261e-05,
"loss": 0.3388,
"step": 6720
},
{
"epoch": 8.60613810741688,
"grad_norm": 0.9659554958343506,
"learning_rate": 3.6432926829268294e-05,
"loss": 0.1987,
"step": 6730
},
{
"epoch": 8.618925831202047,
"grad_norm": 4.049929141998291,
"learning_rate": 3.6411149825783976e-05,
"loss": 0.3661,
"step": 6740
},
{
"epoch": 8.631713554987213,
"grad_norm": 1.467517614364624,
"learning_rate": 3.638937282229966e-05,
"loss": 0.3241,
"step": 6750
},
{
"epoch": 8.644501278772378,
"grad_norm": 7.087103843688965,
"learning_rate": 3.636759581881533e-05,
"loss": 0.5732,
"step": 6760
},
{
"epoch": 8.657289002557544,
"grad_norm": 2.423686981201172,
"learning_rate": 3.6345818815331014e-05,
"loss": 0.569,
"step": 6770
},
{
"epoch": 8.67007672634271,
"grad_norm": 11.937353134155273,
"learning_rate": 3.6324041811846696e-05,
"loss": 0.5757,
"step": 6780
},
{
"epoch": 8.682864450127877,
"grad_norm": 8.299654006958008,
"learning_rate": 3.630226480836237e-05,
"loss": 0.4712,
"step": 6790
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.01959991082549095,
"learning_rate": 3.628048780487805e-05,
"loss": 0.555,
"step": 6800
},
{
"epoch": 8.70843989769821,
"grad_norm": 5.581100940704346,
"learning_rate": 3.625871080139373e-05,
"loss": 0.5037,
"step": 6810
},
{
"epoch": 8.721227621483376,
"grad_norm": 2.0534441471099854,
"learning_rate": 3.623693379790941e-05,
"loss": 0.1514,
"step": 6820
},
{
"epoch": 8.734015345268542,
"grad_norm": 5.216137409210205,
"learning_rate": 3.6215156794425084e-05,
"loss": 0.4657,
"step": 6830
},
{
"epoch": 8.746803069053708,
"grad_norm": 5.7269487380981445,
"learning_rate": 3.6193379790940766e-05,
"loss": 0.4067,
"step": 6840
},
{
"epoch": 8.759590792838875,
"grad_norm": 0.33077237010002136,
"learning_rate": 3.617160278745645e-05,
"loss": 0.6687,
"step": 6850
},
{
"epoch": 8.772378516624041,
"grad_norm": 1.5920321941375732,
"learning_rate": 3.614982578397213e-05,
"loss": 0.3967,
"step": 6860
},
{
"epoch": 8.785166240409207,
"grad_norm": 1.9924708604812622,
"learning_rate": 3.6128048780487804e-05,
"loss": 0.4591,
"step": 6870
},
{
"epoch": 8.797953964194374,
"grad_norm": 2.579571008682251,
"learning_rate": 3.6106271777003486e-05,
"loss": 0.7944,
"step": 6880
},
{
"epoch": 8.81074168797954,
"grad_norm": 4.239784240722656,
"learning_rate": 3.608449477351917e-05,
"loss": 0.3443,
"step": 6890
},
{
"epoch": 8.823529411764707,
"grad_norm": 6.115857124328613,
"learning_rate": 3.606271777003484e-05,
"loss": 0.5023,
"step": 6900
},
{
"epoch": 8.836317135549873,
"grad_norm": 0.575232744216919,
"learning_rate": 3.6040940766550525e-05,
"loss": 0.4865,
"step": 6910
},
{
"epoch": 8.84910485933504,
"grad_norm": 1.7253854274749756,
"learning_rate": 3.6019163763066206e-05,
"loss": 0.3874,
"step": 6920
},
{
"epoch": 8.861892583120204,
"grad_norm": 4.649746417999268,
"learning_rate": 3.599738675958189e-05,
"loss": 0.2017,
"step": 6930
},
{
"epoch": 8.87468030690537,
"grad_norm": 3.3735687732696533,
"learning_rate": 3.597560975609756e-05,
"loss": 0.4938,
"step": 6940
},
{
"epoch": 8.887468030690536,
"grad_norm": 0.6998658180236816,
"learning_rate": 3.5953832752613245e-05,
"loss": 0.2944,
"step": 6950
},
{
"epoch": 8.900255754475703,
"grad_norm": 2.798309326171875,
"learning_rate": 3.5932055749128927e-05,
"loss": 0.4214,
"step": 6960
},
{
"epoch": 8.91304347826087,
"grad_norm": 3.0269863605499268,
"learning_rate": 3.59102787456446e-05,
"loss": 0.4129,
"step": 6970
},
{
"epoch": 8.925831202046036,
"grad_norm": 6.951076030731201,
"learning_rate": 3.5888501742160277e-05,
"loss": 0.356,
"step": 6980
},
{
"epoch": 8.938618925831202,
"grad_norm": 0.4412369728088379,
"learning_rate": 3.586672473867596e-05,
"loss": 0.165,
"step": 6990
},
{
"epoch": 8.951406649616368,
"grad_norm": 10.887850761413574,
"learning_rate": 3.584494773519164e-05,
"loss": 0.5921,
"step": 7000
},
{
"epoch": 8.964194373401535,
"grad_norm": 0.00953485444188118,
"learning_rate": 3.5823170731707315e-05,
"loss": 0.4405,
"step": 7010
},
{
"epoch": 8.976982097186701,
"grad_norm": 0.8474636077880859,
"learning_rate": 3.5801393728223e-05,
"loss": 0.3183,
"step": 7020
},
{
"epoch": 8.989769820971867,
"grad_norm": 0.3193589746952057,
"learning_rate": 3.577961672473868e-05,
"loss": 0.6581,
"step": 7030
},
{
"epoch": 9.0,
"eval_loss": 0.2372424602508545,
"eval_runtime": 0.9906,
"eval_samples_per_second": 98.934,
"eval_steps_per_second": 13.124,
"step": 7038
},
{
"epoch": 9.002557544757034,
"grad_norm": 1.6099143028259277,
"learning_rate": 3.5757839721254353e-05,
"loss": 0.4287,
"step": 7040
},
{
"epoch": 9.0153452685422,
"grad_norm": 0.0028387894853949547,
"learning_rate": 3.5736062717770035e-05,
"loss": 0.3471,
"step": 7050
},
{
"epoch": 9.028132992327366,
"grad_norm": 0.9083518385887146,
"learning_rate": 3.571428571428572e-05,
"loss": 0.4602,
"step": 7060
},
{
"epoch": 9.040920716112533,
"grad_norm": 10.855413436889648,
"learning_rate": 3.56925087108014e-05,
"loss": 0.3018,
"step": 7070
},
{
"epoch": 9.053708439897699,
"grad_norm": 5.899298667907715,
"learning_rate": 3.5670731707317074e-05,
"loss": 0.6242,
"step": 7080
},
{
"epoch": 9.066496163682864,
"grad_norm": 0.3065991699695587,
"learning_rate": 3.5648954703832755e-05,
"loss": 0.4585,
"step": 7090
},
{
"epoch": 9.07928388746803,
"grad_norm": 1.1200615167617798,
"learning_rate": 3.562717770034844e-05,
"loss": 0.384,
"step": 7100
},
{
"epoch": 9.092071611253196,
"grad_norm": 0.787402331829071,
"learning_rate": 3.560540069686411e-05,
"loss": 0.6417,
"step": 7110
},
{
"epoch": 9.104859335038363,
"grad_norm": 5.251519680023193,
"learning_rate": 3.5583623693379794e-05,
"loss": 0.3815,
"step": 7120
},
{
"epoch": 9.117647058823529,
"grad_norm": 6.498902797698975,
"learning_rate": 3.5561846689895476e-05,
"loss": 0.348,
"step": 7130
},
{
"epoch": 9.130434782608695,
"grad_norm": 7.617058753967285,
"learning_rate": 3.554006968641115e-05,
"loss": 0.5356,
"step": 7140
},
{
"epoch": 9.143222506393862,
"grad_norm": 3.143923759460449,
"learning_rate": 3.551829268292683e-05,
"loss": 0.3764,
"step": 7150
},
{
"epoch": 9.156010230179028,
"grad_norm": 0.016743259504437447,
"learning_rate": 3.549651567944251e-05,
"loss": 0.3662,
"step": 7160
},
{
"epoch": 9.168797953964194,
"grad_norm": 6.293381214141846,
"learning_rate": 3.547473867595819e-05,
"loss": 0.5453,
"step": 7170
},
{
"epoch": 9.18158567774936,
"grad_norm": 2.0475428104400635,
"learning_rate": 3.5452961672473864e-05,
"loss": 0.3636,
"step": 7180
},
{
"epoch": 9.194373401534527,
"grad_norm": 0.79485684633255,
"learning_rate": 3.5431184668989546e-05,
"loss": 0.3049,
"step": 7190
},
{
"epoch": 9.207161125319693,
"grad_norm": 1.2061353921890259,
"learning_rate": 3.540940766550523e-05,
"loss": 0.3841,
"step": 7200
},
{
"epoch": 9.21994884910486,
"grad_norm": 0.933066189289093,
"learning_rate": 3.538763066202091e-05,
"loss": 0.3891,
"step": 7210
},
{
"epoch": 9.232736572890026,
"grad_norm": 3.883514642715454,
"learning_rate": 3.5365853658536584e-05,
"loss": 0.6795,
"step": 7220
},
{
"epoch": 9.245524296675192,
"grad_norm": 8.447504997253418,
"learning_rate": 3.5344076655052266e-05,
"loss": 0.5821,
"step": 7230
},
{
"epoch": 9.258312020460359,
"grad_norm": 2.884052276611328,
"learning_rate": 3.532229965156795e-05,
"loss": 0.2316,
"step": 7240
},
{
"epoch": 9.271099744245525,
"grad_norm": 9.32747745513916,
"learning_rate": 3.530052264808362e-05,
"loss": 0.5633,
"step": 7250
},
{
"epoch": 9.28388746803069,
"grad_norm": 0.020528344437479973,
"learning_rate": 3.5278745644599304e-05,
"loss": 0.2843,
"step": 7260
},
{
"epoch": 9.296675191815856,
"grad_norm": 2.6097218990325928,
"learning_rate": 3.5256968641114986e-05,
"loss": 0.3732,
"step": 7270
},
{
"epoch": 9.309462915601022,
"grad_norm": 0.0003839631099253893,
"learning_rate": 3.523519163763067e-05,
"loss": 0.4346,
"step": 7280
},
{
"epoch": 9.322250639386189,
"grad_norm": 2.2322895526885986,
"learning_rate": 3.521341463414634e-05,
"loss": 0.2317,
"step": 7290
},
{
"epoch": 9.335038363171355,
"grad_norm": 3.4575886726379395,
"learning_rate": 3.5191637630662025e-05,
"loss": 0.3988,
"step": 7300
},
{
"epoch": 9.347826086956522,
"grad_norm": 1.4904476404190063,
"learning_rate": 3.5169860627177706e-05,
"loss": 0.2786,
"step": 7310
},
{
"epoch": 9.360613810741688,
"grad_norm": 0.7082493305206299,
"learning_rate": 3.514808362369338e-05,
"loss": 0.2075,
"step": 7320
},
{
"epoch": 9.373401534526854,
"grad_norm": 0.00038726787897758186,
"learning_rate": 3.5126306620209056e-05,
"loss": 0.222,
"step": 7330
},
{
"epoch": 9.38618925831202,
"grad_norm": 11.110411643981934,
"learning_rate": 3.510452961672474e-05,
"loss": 0.3972,
"step": 7340
},
{
"epoch": 9.398976982097187,
"grad_norm": 5.460088729858398,
"learning_rate": 3.508275261324042e-05,
"loss": 0.3556,
"step": 7350
},
{
"epoch": 9.411764705882353,
"grad_norm": 10.77302074432373,
"learning_rate": 3.5060975609756095e-05,
"loss": 0.4483,
"step": 7360
},
{
"epoch": 9.42455242966752,
"grad_norm": 1.2308753728866577,
"learning_rate": 3.5039198606271777e-05,
"loss": 0.2582,
"step": 7370
},
{
"epoch": 9.437340153452686,
"grad_norm": 0.862062394618988,
"learning_rate": 3.501742160278746e-05,
"loss": 0.4087,
"step": 7380
},
{
"epoch": 9.450127877237852,
"grad_norm": 1.5446736812591553,
"learning_rate": 3.499564459930314e-05,
"loss": 0.332,
"step": 7390
},
{
"epoch": 9.462915601023019,
"grad_norm": 1.6241346597671509,
"learning_rate": 3.4973867595818815e-05,
"loss": 0.474,
"step": 7400
},
{
"epoch": 9.475703324808185,
"grad_norm": 3.712130546569824,
"learning_rate": 3.49520905923345e-05,
"loss": 0.3702,
"step": 7410
},
{
"epoch": 9.48849104859335,
"grad_norm": 1.5162075757980347,
"learning_rate": 3.493031358885018e-05,
"loss": 0.1353,
"step": 7420
},
{
"epoch": 9.501278772378516,
"grad_norm": 2.307133197784424,
"learning_rate": 3.4908536585365853e-05,
"loss": 0.2684,
"step": 7430
},
{
"epoch": 9.514066496163682,
"grad_norm": 2.630873680114746,
"learning_rate": 3.4886759581881535e-05,
"loss": 0.4246,
"step": 7440
},
{
"epoch": 9.526854219948849,
"grad_norm": 9.756599426269531,
"learning_rate": 3.486498257839722e-05,
"loss": 0.5116,
"step": 7450
},
{
"epoch": 9.539641943734015,
"grad_norm": 1.998191237449646,
"learning_rate": 3.48432055749129e-05,
"loss": 0.3179,
"step": 7460
},
{
"epoch": 9.552429667519181,
"grad_norm": 4.331905364990234,
"learning_rate": 3.4821428571428574e-05,
"loss": 0.4869,
"step": 7470
},
{
"epoch": 9.565217391304348,
"grad_norm": 1.298435926437378,
"learning_rate": 3.4799651567944255e-05,
"loss": 0.4402,
"step": 7480
},
{
"epoch": 9.578005115089514,
"grad_norm": 0.7783128619194031,
"learning_rate": 3.477787456445993e-05,
"loss": 0.3408,
"step": 7490
},
{
"epoch": 9.59079283887468,
"grad_norm": 1.8130773305892944,
"learning_rate": 3.475609756097561e-05,
"loss": 0.6058,
"step": 7500
},
{
"epoch": 9.603580562659847,
"grad_norm": 0.29691851139068604,
"learning_rate": 3.473432055749129e-05,
"loss": 0.3485,
"step": 7510
},
{
"epoch": 9.616368286445013,
"grad_norm": 6.944264888763428,
"learning_rate": 3.471254355400697e-05,
"loss": 0.4315,
"step": 7520
},
{
"epoch": 9.62915601023018,
"grad_norm": 0.29563087224960327,
"learning_rate": 3.469076655052265e-05,
"loss": 0.4645,
"step": 7530
},
{
"epoch": 9.641943734015346,
"grad_norm": 6.2144951820373535,
"learning_rate": 3.4668989547038326e-05,
"loss": 0.4299,
"step": 7540
},
{
"epoch": 9.654731457800512,
"grad_norm": 0.41491979360580444,
"learning_rate": 3.464721254355401e-05,
"loss": 0.4714,
"step": 7550
},
{
"epoch": 9.667519181585678,
"grad_norm": 0.21521031856536865,
"learning_rate": 3.462543554006969e-05,
"loss": 0.4527,
"step": 7560
},
{
"epoch": 9.680306905370845,
"grad_norm": 10.179862976074219,
"learning_rate": 3.4603658536585364e-05,
"loss": 0.4961,
"step": 7570
},
{
"epoch": 9.693094629156011,
"grad_norm": 1.0263521671295166,
"learning_rate": 3.4581881533101046e-05,
"loss": 0.4905,
"step": 7580
},
{
"epoch": 9.705882352941176,
"grad_norm": 5.564786911010742,
"learning_rate": 3.456010452961673e-05,
"loss": 0.5835,
"step": 7590
},
{
"epoch": 9.718670076726342,
"grad_norm": 12.091937065124512,
"learning_rate": 3.453832752613241e-05,
"loss": 0.7117,
"step": 7600
},
{
"epoch": 9.731457800511508,
"grad_norm": 10.530780792236328,
"learning_rate": 3.4516550522648084e-05,
"loss": 0.3978,
"step": 7610
},
{
"epoch": 9.744245524296675,
"grad_norm": 1.0764374732971191,
"learning_rate": 3.4494773519163766e-05,
"loss": 0.2973,
"step": 7620
},
{
"epoch": 9.757033248081841,
"grad_norm": 5.413555145263672,
"learning_rate": 3.447299651567945e-05,
"loss": 0.5868,
"step": 7630
},
{
"epoch": 9.769820971867007,
"grad_norm": 0.3791005611419678,
"learning_rate": 3.445121951219512e-05,
"loss": 0.3802,
"step": 7640
},
{
"epoch": 9.782608695652174,
"grad_norm": 2.3650548458099365,
"learning_rate": 3.4429442508710804e-05,
"loss": 0.4553,
"step": 7650
},
{
"epoch": 9.79539641943734,
"grad_norm": 0.16670426726341248,
"learning_rate": 3.4407665505226486e-05,
"loss": 0.227,
"step": 7660
},
{
"epoch": 9.808184143222507,
"grad_norm": 0.002888306276872754,
"learning_rate": 3.438588850174216e-05,
"loss": 0.7055,
"step": 7670
},
{
"epoch": 9.820971867007673,
"grad_norm": 5.335592269897461,
"learning_rate": 3.4364111498257836e-05,
"loss": 0.3602,
"step": 7680
},
{
"epoch": 9.83375959079284,
"grad_norm": 0.004552973434329033,
"learning_rate": 3.434233449477352e-05,
"loss": 0.3308,
"step": 7690
},
{
"epoch": 9.846547314578006,
"grad_norm": 3.599313974380493,
"learning_rate": 3.43205574912892e-05,
"loss": 0.3185,
"step": 7700
},
{
"epoch": 9.859335038363172,
"grad_norm": 9.506612777709961,
"learning_rate": 3.429878048780488e-05,
"loss": 0.4221,
"step": 7710
},
{
"epoch": 9.872122762148338,
"grad_norm": 0.04120413213968277,
"learning_rate": 3.4277003484320556e-05,
"loss": 0.255,
"step": 7720
},
{
"epoch": 9.884910485933505,
"grad_norm": 6.27550745010376,
"learning_rate": 3.425522648083624e-05,
"loss": 0.3397,
"step": 7730
},
{
"epoch": 9.89769820971867,
"grad_norm": 6.480957508087158,
"learning_rate": 3.423344947735192e-05,
"loss": 0.5441,
"step": 7740
},
{
"epoch": 9.910485933503836,
"grad_norm": 0.825150728225708,
"learning_rate": 3.4211672473867595e-05,
"loss": 0.3759,
"step": 7750
},
{
"epoch": 9.923273657289002,
"grad_norm": 0.0003031272499356419,
"learning_rate": 3.4189895470383277e-05,
"loss": 0.564,
"step": 7760
},
{
"epoch": 9.936061381074168,
"grad_norm": 2.838111639022827,
"learning_rate": 3.416811846689896e-05,
"loss": 0.453,
"step": 7770
},
{
"epoch": 9.948849104859335,
"grad_norm": 1.2337100505828857,
"learning_rate": 3.414634146341464e-05,
"loss": 0.3176,
"step": 7780
},
{
"epoch": 9.961636828644501,
"grad_norm": 3.4663679599761963,
"learning_rate": 3.4124564459930315e-05,
"loss": 0.4674,
"step": 7790
},
{
"epoch": 9.974424552429667,
"grad_norm": 7.805672645568848,
"learning_rate": 3.4102787456446e-05,
"loss": 0.5197,
"step": 7800
},
{
"epoch": 9.987212276214834,
"grad_norm": 6.521031379699707,
"learning_rate": 3.408101045296168e-05,
"loss": 0.4132,
"step": 7810
},
{
"epoch": 10.0,
"grad_norm": 8.708837509155273,
"learning_rate": 3.4059233449477354e-05,
"loss": 0.4148,
"step": 7820
},
{
"epoch": 10.0,
"eval_loss": 0.2310720980167389,
"eval_runtime": 0.8084,
"eval_samples_per_second": 121.233,
"eval_steps_per_second": 16.082,
"step": 7820
},
{
"epoch": 10.012787723785166,
"grad_norm": 7.328716278076172,
"learning_rate": 3.4037456445993035e-05,
"loss": 0.4179,
"step": 7830
},
{
"epoch": 10.025575447570333,
"grad_norm": 0.4229029417037964,
"learning_rate": 3.401567944250871e-05,
"loss": 0.5226,
"step": 7840
},
{
"epoch": 10.038363171355499,
"grad_norm": 0.0002528049808461219,
"learning_rate": 3.399390243902439e-05,
"loss": 0.5042,
"step": 7850
},
{
"epoch": 10.051150895140665,
"grad_norm": 5.877325057983398,
"learning_rate": 3.397212543554007e-05,
"loss": 0.4143,
"step": 7860
},
{
"epoch": 10.063938618925832,
"grad_norm": 3.365591049194336,
"learning_rate": 3.395034843205575e-05,
"loss": 0.2714,
"step": 7870
},
{
"epoch": 10.076726342710998,
"grad_norm": 8.59089469909668,
"learning_rate": 3.392857142857143e-05,
"loss": 0.3792,
"step": 7880
},
{
"epoch": 10.089514066496164,
"grad_norm": 0.040207698941230774,
"learning_rate": 3.3906794425087105e-05,
"loss": 0.2498,
"step": 7890
},
{
"epoch": 10.10230179028133,
"grad_norm": 5.780106544494629,
"learning_rate": 3.388501742160279e-05,
"loss": 0.2073,
"step": 7900
},
{
"epoch": 10.115089514066495,
"grad_norm": 0.04502753168344498,
"learning_rate": 3.386324041811847e-05,
"loss": 0.5198,
"step": 7910
},
{
"epoch": 10.127877237851662,
"grad_norm": 0.007558781187981367,
"learning_rate": 3.384146341463415e-05,
"loss": 0.209,
"step": 7920
},
{
"epoch": 10.140664961636828,
"grad_norm": 4.91458797454834,
"learning_rate": 3.3819686411149826e-05,
"loss": 0.4293,
"step": 7930
},
{
"epoch": 10.153452685421994,
"grad_norm": 4.223980903625488,
"learning_rate": 3.379790940766551e-05,
"loss": 0.2377,
"step": 7940
},
{
"epoch": 10.16624040920716,
"grad_norm": 0.48757821321487427,
"learning_rate": 3.377613240418119e-05,
"loss": 0.1887,
"step": 7950
},
{
"epoch": 10.179028132992327,
"grad_norm": 2.029522180557251,
"learning_rate": 3.3754355400696864e-05,
"loss": 0.2748,
"step": 7960
},
{
"epoch": 10.191815856777493,
"grad_norm": 0.03896741569042206,
"learning_rate": 3.3732578397212546e-05,
"loss": 0.1531,
"step": 7970
},
{
"epoch": 10.20460358056266,
"grad_norm": 4.019787311553955,
"learning_rate": 3.371080139372823e-05,
"loss": 0.413,
"step": 7980
},
{
"epoch": 10.217391304347826,
"grad_norm": 3.299919366836548,
"learning_rate": 3.368902439024391e-05,
"loss": 0.4001,
"step": 7990
},
{
"epoch": 10.230179028132993,
"grad_norm": 1.2256722450256348,
"learning_rate": 3.3667247386759584e-05,
"loss": 0.3925,
"step": 8000
},
{
"epoch": 10.242966751918159,
"grad_norm": 1.500207781791687,
"learning_rate": 3.3645470383275266e-05,
"loss": 0.5539,
"step": 8010
},
{
"epoch": 10.255754475703325,
"grad_norm": 3.64996600151062,
"learning_rate": 3.362369337979094e-05,
"loss": 0.2435,
"step": 8020
},
{
"epoch": 10.268542199488492,
"grad_norm": 1.5223026275634766,
"learning_rate": 3.3601916376306616e-05,
"loss": 0.4816,
"step": 8030
},
{
"epoch": 10.281329923273658,
"grad_norm": 11.364663124084473,
"learning_rate": 3.35801393728223e-05,
"loss": 0.2599,
"step": 8040
},
{
"epoch": 10.294117647058824,
"grad_norm": 6.048238277435303,
"learning_rate": 3.355836236933798e-05,
"loss": 0.3296,
"step": 8050
},
{
"epoch": 10.30690537084399,
"grad_norm": 1.025272011756897,
"learning_rate": 3.353658536585366e-05,
"loss": 0.5319,
"step": 8060
},
{
"epoch": 10.319693094629155,
"grad_norm": 5.667033672332764,
"learning_rate": 3.3514808362369336e-05,
"loss": 0.6212,
"step": 8070
},
{
"epoch": 10.332480818414322,
"grad_norm": 8.041871070861816,
"learning_rate": 3.349303135888502e-05,
"loss": 0.6741,
"step": 8080
},
{
"epoch": 10.345268542199488,
"grad_norm": 1.6329516172409058,
"learning_rate": 3.34712543554007e-05,
"loss": 0.5887,
"step": 8090
},
{
"epoch": 10.358056265984654,
"grad_norm": 1.5292503833770752,
"learning_rate": 3.344947735191638e-05,
"loss": 0.1805,
"step": 8100
},
{
"epoch": 10.37084398976982,
"grad_norm": 3.6555302143096924,
"learning_rate": 3.3427700348432056e-05,
"loss": 0.4751,
"step": 8110
},
{
"epoch": 10.383631713554987,
"grad_norm": 0.032321881502866745,
"learning_rate": 3.340592334494774e-05,
"loss": 0.3751,
"step": 8120
},
{
"epoch": 10.396419437340153,
"grad_norm": 0.11594846844673157,
"learning_rate": 3.338414634146342e-05,
"loss": 0.3831,
"step": 8130
},
{
"epoch": 10.40920716112532,
"grad_norm": 0.2702908217906952,
"learning_rate": 3.3362369337979095e-05,
"loss": 0.2761,
"step": 8140
},
{
"epoch": 10.421994884910486,
"grad_norm": 5.8537726402282715,
"learning_rate": 3.3340592334494777e-05,
"loss": 0.7554,
"step": 8150
},
{
"epoch": 10.434782608695652,
"grad_norm": 1.9350101947784424,
"learning_rate": 3.331881533101046e-05,
"loss": 0.2913,
"step": 8160
},
{
"epoch": 10.447570332480819,
"grad_norm": 0.1800796538591385,
"learning_rate": 3.329703832752613e-05,
"loss": 0.1576,
"step": 8170
},
{
"epoch": 10.460358056265985,
"grad_norm": 2.1546852588653564,
"learning_rate": 3.3275261324041815e-05,
"loss": 0.2741,
"step": 8180
},
{
"epoch": 10.473145780051151,
"grad_norm": 11.017895698547363,
"learning_rate": 3.325348432055749e-05,
"loss": 0.2629,
"step": 8190
},
{
"epoch": 10.485933503836318,
"grad_norm": 3.735138416290283,
"learning_rate": 3.323170731707317e-05,
"loss": 0.4408,
"step": 8200
},
{
"epoch": 10.498721227621484,
"grad_norm": 4.445374965667725,
"learning_rate": 3.320993031358885e-05,
"loss": 0.5969,
"step": 8210
},
{
"epoch": 10.51150895140665,
"grad_norm": 1.9418877363204956,
"learning_rate": 3.318815331010453e-05,
"loss": 0.2827,
"step": 8220
},
{
"epoch": 10.524296675191817,
"grad_norm": 6.815186977386475,
"learning_rate": 3.316637630662021e-05,
"loss": 0.3757,
"step": 8230
},
{
"epoch": 10.537084398976981,
"grad_norm": 6.0457353591918945,
"learning_rate": 3.314459930313589e-05,
"loss": 0.3467,
"step": 8240
},
{
"epoch": 10.549872122762148,
"grad_norm": 1.9271039962768555,
"learning_rate": 3.312282229965157e-05,
"loss": 0.3033,
"step": 8250
},
{
"epoch": 10.562659846547314,
"grad_norm": 1.7230960130691528,
"learning_rate": 3.310104529616725e-05,
"loss": 0.3376,
"step": 8260
},
{
"epoch": 10.57544757033248,
"grad_norm": 0.8747263550758362,
"learning_rate": 3.307926829268293e-05,
"loss": 0.2699,
"step": 8270
},
{
"epoch": 10.588235294117647,
"grad_norm": 4.551005840301514,
"learning_rate": 3.3057491289198605e-05,
"loss": 0.4694,
"step": 8280
},
{
"epoch": 10.601023017902813,
"grad_norm": 1.4506466388702393,
"learning_rate": 3.303571428571429e-05,
"loss": 0.4233,
"step": 8290
},
{
"epoch": 10.61381074168798,
"grad_norm": 2.718492269515991,
"learning_rate": 3.301393728222997e-05,
"loss": 0.312,
"step": 8300
},
{
"epoch": 10.626598465473146,
"grad_norm": 4.134568214416504,
"learning_rate": 3.299216027874565e-05,
"loss": 0.5228,
"step": 8310
},
{
"epoch": 10.639386189258312,
"grad_norm": 1.0367114543914795,
"learning_rate": 3.2970383275261326e-05,
"loss": 0.2426,
"step": 8320
},
{
"epoch": 10.652173913043478,
"grad_norm": 0.3512336313724518,
"learning_rate": 3.294860627177701e-05,
"loss": 0.5909,
"step": 8330
},
{
"epoch": 10.664961636828645,
"grad_norm": 5.459338665008545,
"learning_rate": 3.292682926829269e-05,
"loss": 0.337,
"step": 8340
},
{
"epoch": 10.677749360613811,
"grad_norm": 7.244568824768066,
"learning_rate": 3.2905052264808364e-05,
"loss": 0.5357,
"step": 8350
},
{
"epoch": 10.690537084398978,
"grad_norm": 3.035902976989746,
"learning_rate": 3.288327526132404e-05,
"loss": 0.3594,
"step": 8360
},
{
"epoch": 10.703324808184144,
"grad_norm": 11.743247985839844,
"learning_rate": 3.286149825783972e-05,
"loss": 0.2371,
"step": 8370
},
{
"epoch": 10.71611253196931,
"grad_norm": 0.0029970197938382626,
"learning_rate": 3.28397212543554e-05,
"loss": 0.3303,
"step": 8380
},
{
"epoch": 10.728900255754475,
"grad_norm": 0.00909477099776268,
"learning_rate": 3.281794425087108e-05,
"loss": 0.3742,
"step": 8390
},
{
"epoch": 10.741687979539641,
"grad_norm": 0.9811226725578308,
"learning_rate": 3.279616724738676e-05,
"loss": 0.2562,
"step": 8400
},
{
"epoch": 10.754475703324808,
"grad_norm": 5.737852096557617,
"learning_rate": 3.277439024390244e-05,
"loss": 0.7256,
"step": 8410
},
{
"epoch": 10.767263427109974,
"grad_norm": 0.34903228282928467,
"learning_rate": 3.2752613240418116e-05,
"loss": 0.4239,
"step": 8420
},
{
"epoch": 10.78005115089514,
"grad_norm": 8.985549926757812,
"learning_rate": 3.27308362369338e-05,
"loss": 0.6547,
"step": 8430
},
{
"epoch": 10.792838874680307,
"grad_norm": 6.798750877380371,
"learning_rate": 3.270905923344948e-05,
"loss": 0.499,
"step": 8440
},
{
"epoch": 10.805626598465473,
"grad_norm": 4.940242767333984,
"learning_rate": 3.268728222996516e-05,
"loss": 0.4255,
"step": 8450
},
{
"epoch": 10.81841432225064,
"grad_norm": 1.2830392122268677,
"learning_rate": 3.2665505226480836e-05,
"loss": 0.7948,
"step": 8460
},
{
"epoch": 10.831202046035806,
"grad_norm": 0.4314042031764984,
"learning_rate": 3.264372822299652e-05,
"loss": 0.0659,
"step": 8470
},
{
"epoch": 10.843989769820972,
"grad_norm": 0.1115281879901886,
"learning_rate": 3.26219512195122e-05,
"loss": 0.405,
"step": 8480
},
{
"epoch": 10.856777493606138,
"grad_norm": 0.07167865335941315,
"learning_rate": 3.2600174216027875e-05,
"loss": 0.2189,
"step": 8490
},
{
"epoch": 10.869565217391305,
"grad_norm": 4.274066925048828,
"learning_rate": 3.2578397212543556e-05,
"loss": 0.6027,
"step": 8500
},
{
"epoch": 10.882352941176471,
"grad_norm": 3.2567176818847656,
"learning_rate": 3.255662020905924e-05,
"loss": 0.2483,
"step": 8510
},
{
"epoch": 10.895140664961637,
"grad_norm": 2.4175426959991455,
"learning_rate": 3.253484320557491e-05,
"loss": 0.3907,
"step": 8520
},
{
"epoch": 10.907928388746804,
"grad_norm": 0.1821274608373642,
"learning_rate": 3.2513066202090595e-05,
"loss": 0.1762,
"step": 8530
},
{
"epoch": 10.92071611253197,
"grad_norm": 0.4083157479763031,
"learning_rate": 3.249128919860627e-05,
"loss": 0.5483,
"step": 8540
},
{
"epoch": 10.933503836317136,
"grad_norm": 0.8208944201469421,
"learning_rate": 3.246951219512195e-05,
"loss": 0.3091,
"step": 8550
},
{
"epoch": 10.946291560102301,
"grad_norm": 1.555967926979065,
"learning_rate": 3.244773519163763e-05,
"loss": 0.4429,
"step": 8560
},
{
"epoch": 10.959079283887467,
"grad_norm": 1.8339693546295166,
"learning_rate": 3.242595818815331e-05,
"loss": 0.3064,
"step": 8570
},
{
"epoch": 10.971867007672634,
"grad_norm": 5.510306358337402,
"learning_rate": 3.240418118466899e-05,
"loss": 0.1709,
"step": 8580
},
{
"epoch": 10.9846547314578,
"grad_norm": 0.10631673038005829,
"learning_rate": 3.238240418118467e-05,
"loss": 0.3792,
"step": 8590
},
{
"epoch": 10.997442455242966,
"grad_norm": 6.853298664093018,
"learning_rate": 3.236062717770035e-05,
"loss": 0.565,
"step": 8600
},
{
"epoch": 11.0,
"eval_loss": 0.21491144597530365,
"eval_runtime": 0.9865,
"eval_samples_per_second": 99.337,
"eval_steps_per_second": 13.177,
"step": 8602
},
{
"epoch": 11.010230179028133,
"grad_norm": 6.32133150100708,
"learning_rate": 3.233885017421603e-05,
"loss": 0.4006,
"step": 8610
},
{
"epoch": 11.023017902813299,
"grad_norm": 0.444341242313385,
"learning_rate": 3.231707317073171e-05,
"loss": 0.3297,
"step": 8620
},
{
"epoch": 11.035805626598465,
"grad_norm": 8.10943603515625,
"learning_rate": 3.229529616724739e-05,
"loss": 0.4144,
"step": 8630
},
{
"epoch": 11.048593350383632,
"grad_norm": 0.13110394775867462,
"learning_rate": 3.227351916376307e-05,
"loss": 0.2261,
"step": 8640
},
{
"epoch": 11.061381074168798,
"grad_norm": 4.035839080810547,
"learning_rate": 3.225174216027875e-05,
"loss": 0.3694,
"step": 8650
},
{
"epoch": 11.074168797953964,
"grad_norm": 0.32012709975242615,
"learning_rate": 3.222996515679443e-05,
"loss": 0.4123,
"step": 8660
},
{
"epoch": 11.08695652173913,
"grad_norm": 1.2858375310897827,
"learning_rate": 3.2208188153310105e-05,
"loss": 0.3132,
"step": 8670
},
{
"epoch": 11.099744245524297,
"grad_norm": 5.57534122467041,
"learning_rate": 3.218641114982579e-05,
"loss": 0.3462,
"step": 8680
},
{
"epoch": 11.112531969309464,
"grad_norm": 9.02668571472168,
"learning_rate": 3.216463414634147e-05,
"loss": 0.3644,
"step": 8690
},
{
"epoch": 11.12531969309463,
"grad_norm": 0.16235436499118805,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.563,
"step": 8700
},
{
"epoch": 11.138107416879796,
"grad_norm": 0.20466744899749756,
"learning_rate": 3.212108013937282e-05,
"loss": 0.4855,
"step": 8710
},
{
"epoch": 11.15089514066496,
"grad_norm": 7.841794967651367,
"learning_rate": 3.20993031358885e-05,
"loss": 0.4489,
"step": 8720
},
{
"epoch": 11.163682864450127,
"grad_norm": 0.008980615995824337,
"learning_rate": 3.207752613240418e-05,
"loss": 0.3862,
"step": 8730
},
{
"epoch": 11.176470588235293,
"grad_norm": 8.154345512390137,
"learning_rate": 3.205574912891986e-05,
"loss": 0.4611,
"step": 8740
},
{
"epoch": 11.18925831202046,
"grad_norm": 8.8815336227417,
"learning_rate": 3.203397212543554e-05,
"loss": 0.4231,
"step": 8750
},
{
"epoch": 11.202046035805626,
"grad_norm": 3.500678777694702,
"learning_rate": 3.201219512195122e-05,
"loss": 0.3599,
"step": 8760
},
{
"epoch": 11.214833759590793,
"grad_norm": 6.8923845291137695,
"learning_rate": 3.19904181184669e-05,
"loss": 0.5404,
"step": 8770
},
{
"epoch": 11.227621483375959,
"grad_norm": 0.22492839395999908,
"learning_rate": 3.196864111498258e-05,
"loss": 0.3662,
"step": 8780
},
{
"epoch": 11.240409207161125,
"grad_norm": 1.0206634998321533,
"learning_rate": 3.194686411149826e-05,
"loss": 0.3528,
"step": 8790
},
{
"epoch": 11.253196930946292,
"grad_norm": 8.273615837097168,
"learning_rate": 3.192508710801394e-05,
"loss": 0.507,
"step": 8800
},
{
"epoch": 11.265984654731458,
"grad_norm": 0.680773138999939,
"learning_rate": 3.1903310104529616e-05,
"loss": 0.3614,
"step": 8810
},
{
"epoch": 11.278772378516624,
"grad_norm": 8.943563461303711,
"learning_rate": 3.18815331010453e-05,
"loss": 0.502,
"step": 8820
},
{
"epoch": 11.29156010230179,
"grad_norm": 10.697273254394531,
"learning_rate": 3.185975609756098e-05,
"loss": 0.5452,
"step": 8830
},
{
"epoch": 11.304347826086957,
"grad_norm": 2.5585360527038574,
"learning_rate": 3.183797909407666e-05,
"loss": 0.2662,
"step": 8840
},
{
"epoch": 11.317135549872123,
"grad_norm": 5.41624641418457,
"learning_rate": 3.1816202090592336e-05,
"loss": 0.3855,
"step": 8850
},
{
"epoch": 11.32992327365729,
"grad_norm": 2.3647260665893555,
"learning_rate": 3.179442508710802e-05,
"loss": 0.6495,
"step": 8860
},
{
"epoch": 11.342710997442456,
"grad_norm": 0.32295748591423035,
"learning_rate": 3.177264808362369e-05,
"loss": 0.0998,
"step": 8870
},
{
"epoch": 11.355498721227622,
"grad_norm": 5.3191680908203125,
"learning_rate": 3.1750871080139375e-05,
"loss": 0.2025,
"step": 8880
},
{
"epoch": 11.368286445012787,
"grad_norm": 4.418299674987793,
"learning_rate": 3.172909407665505e-05,
"loss": 0.2674,
"step": 8890
},
{
"epoch": 11.381074168797953,
"grad_norm": 6.424844264984131,
"learning_rate": 3.170731707317073e-05,
"loss": 0.2655,
"step": 8900
},
{
"epoch": 11.39386189258312,
"grad_norm": 2.5015451908111572,
"learning_rate": 3.168554006968641e-05,
"loss": 0.3653,
"step": 8910
},
{
"epoch": 11.406649616368286,
"grad_norm": 0.09301682561635971,
"learning_rate": 3.166376306620209e-05,
"loss": 0.6217,
"step": 8920
},
{
"epoch": 11.419437340153452,
"grad_norm": 0.0004050794232171029,
"learning_rate": 3.164198606271777e-05,
"loss": 0.2637,
"step": 8930
},
{
"epoch": 11.432225063938619,
"grad_norm": 3.0390357971191406,
"learning_rate": 3.162020905923345e-05,
"loss": 0.4496,
"step": 8940
},
{
"epoch": 11.445012787723785,
"grad_norm": 0.20196415483951569,
"learning_rate": 3.159843205574913e-05,
"loss": 0.6409,
"step": 8950
},
{
"epoch": 11.457800511508951,
"grad_norm": 1.8359960317611694,
"learning_rate": 3.157665505226481e-05,
"loss": 0.147,
"step": 8960
},
{
"epoch": 11.470588235294118,
"grad_norm": 1.8633346557617188,
"learning_rate": 3.155487804878049e-05,
"loss": 0.4681,
"step": 8970
},
{
"epoch": 11.483375959079284,
"grad_norm": 0.3808380365371704,
"learning_rate": 3.153310104529617e-05,
"loss": 0.6801,
"step": 8980
},
{
"epoch": 11.49616368286445,
"grad_norm": 10.161221504211426,
"learning_rate": 3.151132404181185e-05,
"loss": 0.2983,
"step": 8990
},
{
"epoch": 11.508951406649617,
"grad_norm": 8.241488456726074,
"learning_rate": 3.148954703832753e-05,
"loss": 0.5304,
"step": 9000
},
{
"epoch": 11.521739130434783,
"grad_norm": 0.01585804484784603,
"learning_rate": 3.146777003484321e-05,
"loss": 0.0876,
"step": 9010
},
{
"epoch": 11.53452685421995,
"grad_norm": 1.2980036735534668,
"learning_rate": 3.144599303135889e-05,
"loss": 0.1861,
"step": 9020
},
{
"epoch": 11.547314578005116,
"grad_norm": 0.14101967215538025,
"learning_rate": 3.142421602787457e-05,
"loss": 0.4377,
"step": 9030
},
{
"epoch": 11.56010230179028,
"grad_norm": 15.285452842712402,
"learning_rate": 3.140243902439025e-05,
"loss": 0.2893,
"step": 9040
},
{
"epoch": 11.572890025575447,
"grad_norm": 2.7861084938049316,
"learning_rate": 3.1380662020905924e-05,
"loss": 0.4202,
"step": 9050
},
{
"epoch": 11.585677749360613,
"grad_norm": 0.6793758869171143,
"learning_rate": 3.13588850174216e-05,
"loss": 0.1564,
"step": 9060
},
{
"epoch": 11.59846547314578,
"grad_norm": 1.9136697053909302,
"learning_rate": 3.133710801393728e-05,
"loss": 0.3935,
"step": 9070
},
{
"epoch": 11.611253196930946,
"grad_norm": 11.733519554138184,
"learning_rate": 3.131533101045296e-05,
"loss": 0.4423,
"step": 9080
},
{
"epoch": 11.624040920716112,
"grad_norm": 1.6007936000823975,
"learning_rate": 3.1293554006968644e-05,
"loss": 0.3038,
"step": 9090
},
{
"epoch": 11.636828644501279,
"grad_norm": 0.06947501748800278,
"learning_rate": 3.127177700348432e-05,
"loss": 0.3009,
"step": 9100
},
{
"epoch": 11.649616368286445,
"grad_norm": 1.7004108428955078,
"learning_rate": 3.125e-05,
"loss": 0.3394,
"step": 9110
},
{
"epoch": 11.662404092071611,
"grad_norm": 0.46246737241744995,
"learning_rate": 3.122822299651568e-05,
"loss": 0.3468,
"step": 9120
},
{
"epoch": 11.675191815856778,
"grad_norm": 2.2499334812164307,
"learning_rate": 3.120644599303136e-05,
"loss": 0.4545,
"step": 9130
},
{
"epoch": 11.687979539641944,
"grad_norm": 0.09588778764009476,
"learning_rate": 3.118466898954704e-05,
"loss": 0.1164,
"step": 9140
},
{
"epoch": 11.70076726342711,
"grad_norm": 2.123534679412842,
"learning_rate": 3.116289198606272e-05,
"loss": 0.3877,
"step": 9150
},
{
"epoch": 11.713554987212277,
"grad_norm": 3.080892562866211,
"learning_rate": 3.11411149825784e-05,
"loss": 0.2416,
"step": 9160
},
{
"epoch": 11.726342710997443,
"grad_norm": 3.8951287269592285,
"learning_rate": 3.111933797909408e-05,
"loss": 0.3623,
"step": 9170
},
{
"epoch": 11.73913043478261,
"grad_norm": 9.134366035461426,
"learning_rate": 3.109756097560976e-05,
"loss": 0.3535,
"step": 9180
},
{
"epoch": 11.751918158567776,
"grad_norm": 6.711736679077148,
"learning_rate": 3.107578397212544e-05,
"loss": 0.3882,
"step": 9190
},
{
"epoch": 11.764705882352942,
"grad_norm": 0.008113594725728035,
"learning_rate": 3.1054006968641116e-05,
"loss": 0.6576,
"step": 9200
},
{
"epoch": 11.777493606138107,
"grad_norm": 0.04739408195018768,
"learning_rate": 3.10322299651568e-05,
"loss": 0.2207,
"step": 9210
},
{
"epoch": 11.790281329923273,
"grad_norm": 2.5379507541656494,
"learning_rate": 3.101045296167247e-05,
"loss": 0.2856,
"step": 9220
},
{
"epoch": 11.80306905370844,
"grad_norm": 0.0009813779033720493,
"learning_rate": 3.0988675958188155e-05,
"loss": 0.4471,
"step": 9230
},
{
"epoch": 11.815856777493606,
"grad_norm": 0.4562146067619324,
"learning_rate": 3.096689895470383e-05,
"loss": 0.1575,
"step": 9240
},
{
"epoch": 11.828644501278772,
"grad_norm": 2.216144323348999,
"learning_rate": 3.094512195121951e-05,
"loss": 0.201,
"step": 9250
},
{
"epoch": 11.841432225063938,
"grad_norm": 0.7896146178245544,
"learning_rate": 3.092334494773519e-05,
"loss": 0.3322,
"step": 9260
},
{
"epoch": 11.854219948849105,
"grad_norm": 0.8330498933792114,
"learning_rate": 3.090156794425087e-05,
"loss": 0.2876,
"step": 9270
},
{
"epoch": 11.867007672634271,
"grad_norm": 1.164717674255371,
"learning_rate": 3.087979094076655e-05,
"loss": 0.1853,
"step": 9280
},
{
"epoch": 11.879795396419437,
"grad_norm": 0.029109498485922813,
"learning_rate": 3.085801393728223e-05,
"loss": 0.1326,
"step": 9290
},
{
"epoch": 11.892583120204604,
"grad_norm": 1.593198537826538,
"learning_rate": 3.083623693379791e-05,
"loss": 0.4159,
"step": 9300
},
{
"epoch": 11.90537084398977,
"grad_norm": 1.5863924026489258,
"learning_rate": 3.081445993031359e-05,
"loss": 0.2288,
"step": 9310
},
{
"epoch": 11.918158567774936,
"grad_norm": 8.302711486816406,
"learning_rate": 3.079268292682927e-05,
"loss": 0.4357,
"step": 9320
},
{
"epoch": 11.930946291560103,
"grad_norm": 0.07162554562091827,
"learning_rate": 3.077090592334495e-05,
"loss": 0.5203,
"step": 9330
},
{
"epoch": 11.94373401534527,
"grad_norm": 5.237524509429932,
"learning_rate": 3.074912891986063e-05,
"loss": 0.2773,
"step": 9340
},
{
"epoch": 11.956521739130435,
"grad_norm": 2.659153699874878,
"learning_rate": 3.072735191637631e-05,
"loss": 0.3073,
"step": 9350
},
{
"epoch": 11.969309462915602,
"grad_norm": 0.9340159296989441,
"learning_rate": 3.070557491289199e-05,
"loss": 0.7157,
"step": 9360
},
{
"epoch": 11.982097186700766,
"grad_norm": 1.4989861249923706,
"learning_rate": 3.068379790940767e-05,
"loss": 0.2925,
"step": 9370
},
{
"epoch": 11.994884910485933,
"grad_norm": 0.7795642614364624,
"learning_rate": 3.066202090592335e-05,
"loss": 0.4561,
"step": 9380
},
{
"epoch": 12.0,
"eval_loss": 0.21105726063251495,
"eval_runtime": 0.9833,
"eval_samples_per_second": 99.663,
"eval_steps_per_second": 13.221,
"step": 9384
},
{
"epoch": 12.007672634271099,
"grad_norm": 2.518749475479126,
"learning_rate": 3.064024390243903e-05,
"loss": 0.2376,
"step": 9390
},
{
"epoch": 12.020460358056265,
"grad_norm": 0.2805574834346771,
"learning_rate": 3.0618466898954704e-05,
"loss": 0.457,
"step": 9400
},
{
"epoch": 12.033248081841432,
"grad_norm": 0.059038929641246796,
"learning_rate": 3.0596689895470385e-05,
"loss": 0.3972,
"step": 9410
},
{
"epoch": 12.046035805626598,
"grad_norm": 1.0303517580032349,
"learning_rate": 3.057491289198606e-05,
"loss": 0.1829,
"step": 9420
},
{
"epoch": 12.058823529411764,
"grad_norm": 0.005017964635044336,
"learning_rate": 3.055313588850174e-05,
"loss": 0.3087,
"step": 9430
},
{
"epoch": 12.07161125319693,
"grad_norm": 0.027756422758102417,
"learning_rate": 3.0531358885017424e-05,
"loss": 0.3635,
"step": 9440
},
{
"epoch": 12.084398976982097,
"grad_norm": 1.1101689338684082,
"learning_rate": 3.05095818815331e-05,
"loss": 0.2655,
"step": 9450
},
{
"epoch": 12.097186700767264,
"grad_norm": 0.0013217816594988108,
"learning_rate": 3.048780487804878e-05,
"loss": 0.5037,
"step": 9460
},
{
"epoch": 12.10997442455243,
"grad_norm": 1.3896468877792358,
"learning_rate": 3.0466027874564462e-05,
"loss": 0.3393,
"step": 9470
},
{
"epoch": 12.122762148337596,
"grad_norm": 9.710427284240723,
"learning_rate": 3.0444250871080144e-05,
"loss": 0.3086,
"step": 9480
},
{
"epoch": 12.135549872122763,
"grad_norm": 0.02628585882484913,
"learning_rate": 3.042247386759582e-05,
"loss": 0.1551,
"step": 9490
},
{
"epoch": 12.148337595907929,
"grad_norm": 4.432460784912109,
"learning_rate": 3.04006968641115e-05,
"loss": 0.3154,
"step": 9500
},
{
"epoch": 12.161125319693095,
"grad_norm": 0.4521609842777252,
"learning_rate": 3.0378919860627182e-05,
"loss": 0.3278,
"step": 9510
},
{
"epoch": 12.173913043478262,
"grad_norm": 2.970489978790283,
"learning_rate": 3.0357142857142857e-05,
"loss": 0.351,
"step": 9520
},
{
"epoch": 12.186700767263428,
"grad_norm": 0.3053510785102844,
"learning_rate": 3.0335365853658536e-05,
"loss": 0.1863,
"step": 9530
},
{
"epoch": 12.199488491048593,
"grad_norm": 8.950643539428711,
"learning_rate": 3.0313588850174217e-05,
"loss": 0.2964,
"step": 9540
},
{
"epoch": 12.212276214833759,
"grad_norm": 0.10913731902837753,
"learning_rate": 3.02918118466899e-05,
"loss": 0.2332,
"step": 9550
},
{
"epoch": 12.225063938618925,
"grad_norm": 9.739930152893066,
"learning_rate": 3.0270034843205574e-05,
"loss": 0.4687,
"step": 9560
},
{
"epoch": 12.237851662404092,
"grad_norm": 6.348082542419434,
"learning_rate": 3.0248257839721256e-05,
"loss": 0.4921,
"step": 9570
},
{
"epoch": 12.250639386189258,
"grad_norm": 0.6640710830688477,
"learning_rate": 3.0226480836236938e-05,
"loss": 0.5136,
"step": 9580
},
{
"epoch": 12.263427109974424,
"grad_norm": 0.0002884014684241265,
"learning_rate": 3.0204703832752613e-05,
"loss": 0.3254,
"step": 9590
},
{
"epoch": 12.27621483375959,
"grad_norm": 8.660523414611816,
"learning_rate": 3.0182926829268294e-05,
"loss": 0.3539,
"step": 9600
},
{
"epoch": 12.289002557544757,
"grad_norm": 25.596059799194336,
"learning_rate": 3.0161149825783973e-05,
"loss": 0.2523,
"step": 9610
},
{
"epoch": 12.301790281329923,
"grad_norm": 0.1933545619249344,
"learning_rate": 3.0139372822299655e-05,
"loss": 0.2112,
"step": 9620
},
{
"epoch": 12.31457800511509,
"grad_norm": 0.2891666293144226,
"learning_rate": 3.011759581881533e-05,
"loss": 0.1593,
"step": 9630
},
{
"epoch": 12.327365728900256,
"grad_norm": 2.7486932277679443,
"learning_rate": 3.009581881533101e-05,
"loss": 0.4398,
"step": 9640
},
{
"epoch": 12.340153452685422,
"grad_norm": 0.8518115282058716,
"learning_rate": 3.0074041811846693e-05,
"loss": 0.2917,
"step": 9650
},
{
"epoch": 12.352941176470589,
"grad_norm": 9.21524429321289,
"learning_rate": 3.0052264808362368e-05,
"loss": 0.573,
"step": 9660
},
{
"epoch": 12.365728900255755,
"grad_norm": 0.5691689848899841,
"learning_rate": 3.003048780487805e-05,
"loss": 0.4763,
"step": 9670
},
{
"epoch": 12.378516624040921,
"grad_norm": 9.370003700256348,
"learning_rate": 3.000871080139373e-05,
"loss": 0.2633,
"step": 9680
},
{
"epoch": 12.391304347826088,
"grad_norm": 17.42327880859375,
"learning_rate": 2.998693379790941e-05,
"loss": 0.2829,
"step": 9690
},
{
"epoch": 12.404092071611252,
"grad_norm": 3.5533621311187744,
"learning_rate": 2.9965156794425088e-05,
"loss": 0.386,
"step": 9700
},
{
"epoch": 12.416879795396419,
"grad_norm": 6.838221073150635,
"learning_rate": 2.9943379790940767e-05,
"loss": 0.3875,
"step": 9710
},
{
"epoch": 12.429667519181585,
"grad_norm": 4.223023414611816,
"learning_rate": 2.9921602787456448e-05,
"loss": 0.3517,
"step": 9720
},
{
"epoch": 12.442455242966751,
"grad_norm": 1.94353449344635,
"learning_rate": 2.989982578397213e-05,
"loss": 0.3197,
"step": 9730
},
{
"epoch": 12.455242966751918,
"grad_norm": 2.8062820434570312,
"learning_rate": 2.9878048780487805e-05,
"loss": 0.1984,
"step": 9740
},
{
"epoch": 12.468030690537084,
"grad_norm": 1.3602440357208252,
"learning_rate": 2.9856271777003487e-05,
"loss": 0.3751,
"step": 9750
},
{
"epoch": 12.48081841432225,
"grad_norm": 1.7058247327804565,
"learning_rate": 2.983449477351917e-05,
"loss": 0.2914,
"step": 9760
},
{
"epoch": 12.493606138107417,
"grad_norm": 6.558533191680908,
"learning_rate": 2.9812717770034843e-05,
"loss": 0.4901,
"step": 9770
},
{
"epoch": 12.506393861892583,
"grad_norm": 16.329387664794922,
"learning_rate": 2.9790940766550525e-05,
"loss": 0.2571,
"step": 9780
},
{
"epoch": 12.51918158567775,
"grad_norm": 1.404144287109375,
"learning_rate": 2.9769163763066204e-05,
"loss": 0.315,
"step": 9790
},
{
"epoch": 12.531969309462916,
"grad_norm": 1.7159531116485596,
"learning_rate": 2.9747386759581885e-05,
"loss": 0.3895,
"step": 9800
},
{
"epoch": 12.544757033248082,
"grad_norm": 1.3657723665237427,
"learning_rate": 2.972560975609756e-05,
"loss": 0.1817,
"step": 9810
},
{
"epoch": 12.557544757033249,
"grad_norm": 1.2785223722457886,
"learning_rate": 2.9703832752613242e-05,
"loss": 0.3728,
"step": 9820
},
{
"epoch": 12.570332480818415,
"grad_norm": 4.575535774230957,
"learning_rate": 2.9682055749128924e-05,
"loss": 0.3806,
"step": 9830
},
{
"epoch": 12.583120204603581,
"grad_norm": 5.597801208496094,
"learning_rate": 2.96602787456446e-05,
"loss": 0.2533,
"step": 9840
},
{
"epoch": 12.595907928388748,
"grad_norm": 0.038497261703014374,
"learning_rate": 2.963850174216028e-05,
"loss": 0.3971,
"step": 9850
},
{
"epoch": 12.608695652173914,
"grad_norm": 1.231428861618042,
"learning_rate": 2.9616724738675962e-05,
"loss": 0.4486,
"step": 9860
},
{
"epoch": 12.621483375959079,
"grad_norm": 3.3973329067230225,
"learning_rate": 2.959494773519164e-05,
"loss": 0.2941,
"step": 9870
},
{
"epoch": 12.634271099744245,
"grad_norm": 0.8516256213188171,
"learning_rate": 2.9573170731707316e-05,
"loss": 0.449,
"step": 9880
},
{
"epoch": 12.647058823529411,
"grad_norm": 10.69143295288086,
"learning_rate": 2.9551393728222997e-05,
"loss": 0.5526,
"step": 9890
},
{
"epoch": 12.659846547314578,
"grad_norm": 9.207474708557129,
"learning_rate": 2.952961672473868e-05,
"loss": 0.3616,
"step": 9900
},
{
"epoch": 12.672634271099744,
"grad_norm": 0.3940543532371521,
"learning_rate": 2.9507839721254354e-05,
"loss": 0.3483,
"step": 9910
},
{
"epoch": 12.68542199488491,
"grad_norm": 1.1689401865005493,
"learning_rate": 2.9486062717770036e-05,
"loss": 0.3215,
"step": 9920
},
{
"epoch": 12.698209718670077,
"grad_norm": 3.243196487426758,
"learning_rate": 2.9464285714285718e-05,
"loss": 0.335,
"step": 9930
},
{
"epoch": 12.710997442455243,
"grad_norm": 0.01855871081352234,
"learning_rate": 2.9442508710801396e-05,
"loss": 0.2783,
"step": 9940
},
{
"epoch": 12.72378516624041,
"grad_norm": 0.066720150411129,
"learning_rate": 2.9420731707317074e-05,
"loss": 0.3905,
"step": 9950
},
{
"epoch": 12.736572890025576,
"grad_norm": 3.417693853378296,
"learning_rate": 2.9398954703832753e-05,
"loss": 0.2852,
"step": 9960
},
{
"epoch": 12.749360613810742,
"grad_norm": 1.7551138401031494,
"learning_rate": 2.9377177700348434e-05,
"loss": 0.2011,
"step": 9970
},
{
"epoch": 12.762148337595908,
"grad_norm": 6.556581497192383,
"learning_rate": 2.935540069686411e-05,
"loss": 0.3613,
"step": 9980
},
{
"epoch": 12.774936061381075,
"grad_norm": 0.9520956873893738,
"learning_rate": 2.933362369337979e-05,
"loss": 0.2462,
"step": 9990
},
{
"epoch": 12.787723785166241,
"grad_norm": 2.980120734719094e-05,
"learning_rate": 2.9311846689895473e-05,
"loss": 0.2549,
"step": 10000
},
{
"epoch": 12.800511508951407,
"grad_norm": 0.20872947573661804,
"learning_rate": 2.9290069686411155e-05,
"loss": 0.2909,
"step": 10010
},
{
"epoch": 12.813299232736572,
"grad_norm": 0.3826689124107361,
"learning_rate": 2.926829268292683e-05,
"loss": 0.4378,
"step": 10020
},
{
"epoch": 12.826086956521738,
"grad_norm": 8.614890098571777,
"learning_rate": 2.924651567944251e-05,
"loss": 0.4704,
"step": 10030
},
{
"epoch": 12.838874680306905,
"grad_norm": 3.4223175048828125,
"learning_rate": 2.922473867595819e-05,
"loss": 0.526,
"step": 10040
},
{
"epoch": 12.851662404092071,
"grad_norm": 1.2012652158737183,
"learning_rate": 2.9202961672473868e-05,
"loss": 0.2264,
"step": 10050
},
{
"epoch": 12.864450127877237,
"grad_norm": 2.6389079093933105,
"learning_rate": 2.9181184668989546e-05,
"loss": 0.3546,
"step": 10060
},
{
"epoch": 12.877237851662404,
"grad_norm": 1.9273570775985718,
"learning_rate": 2.9159407665505228e-05,
"loss": 0.1862,
"step": 10070
},
{
"epoch": 12.89002557544757,
"grad_norm": 3.80769419670105,
"learning_rate": 2.913763066202091e-05,
"loss": 0.2989,
"step": 10080
},
{
"epoch": 12.902813299232736,
"grad_norm": 1.0025523900985718,
"learning_rate": 2.9115853658536585e-05,
"loss": 0.4393,
"step": 10090
},
{
"epoch": 12.915601023017903,
"grad_norm": 0.254962682723999,
"learning_rate": 2.9094076655052267e-05,
"loss": 0.5615,
"step": 10100
},
{
"epoch": 12.92838874680307,
"grad_norm": 0.8748624324798584,
"learning_rate": 2.9072299651567948e-05,
"loss": 0.3632,
"step": 10110
},
{
"epoch": 12.941176470588236,
"grad_norm": 2.6783287525177,
"learning_rate": 2.9050522648083623e-05,
"loss": 0.5758,
"step": 10120
},
{
"epoch": 12.953964194373402,
"grad_norm": 0.5378952622413635,
"learning_rate": 2.9028745644599305e-05,
"loss": 0.2729,
"step": 10130
},
{
"epoch": 12.966751918158568,
"grad_norm": 0.02223265916109085,
"learning_rate": 2.9006968641114983e-05,
"loss": 0.7055,
"step": 10140
},
{
"epoch": 12.979539641943735,
"grad_norm": 6.515031814575195,
"learning_rate": 2.8985191637630665e-05,
"loss": 0.3679,
"step": 10150
},
{
"epoch": 12.992327365728901,
"grad_norm": 1.4198124408721924,
"learning_rate": 2.896341463414634e-05,
"loss": 0.1918,
"step": 10160
},
{
"epoch": 13.0,
"eval_loss": 0.21145901083946228,
"eval_runtime": 0.9726,
"eval_samples_per_second": 100.758,
"eval_steps_per_second": 13.366,
"step": 10166
},
{
"epoch": 13.005115089514067,
"grad_norm": 7.412026882171631,
"learning_rate": 2.8941637630662022e-05,
"loss": 0.3834,
"step": 10170
},
{
"epoch": 13.017902813299234,
"grad_norm": 5.850493907928467,
"learning_rate": 2.8919860627177704e-05,
"loss": 0.1763,
"step": 10180
},
{
"epoch": 13.030690537084398,
"grad_norm": 2.427426338195801,
"learning_rate": 2.8898083623693385e-05,
"loss": 0.4808,
"step": 10190
},
{
"epoch": 13.043478260869565,
"grad_norm": 0.0027793091721832752,
"learning_rate": 2.887630662020906e-05,
"loss": 0.3402,
"step": 10200
},
{
"epoch": 13.05626598465473,
"grad_norm": 4.905974388122559,
"learning_rate": 2.885452961672474e-05,
"loss": 0.2913,
"step": 10210
},
{
"epoch": 13.069053708439897,
"grad_norm": 0.002724498976022005,
"learning_rate": 2.883275261324042e-05,
"loss": 0.1021,
"step": 10220
},
{
"epoch": 13.081841432225064,
"grad_norm": 1.5594589710235596,
"learning_rate": 2.8810975609756095e-05,
"loss": 0.0938,
"step": 10230
},
{
"epoch": 13.09462915601023,
"grad_norm": 8.709442138671875,
"learning_rate": 2.8789198606271777e-05,
"loss": 0.3254,
"step": 10240
},
{
"epoch": 13.107416879795396,
"grad_norm": 9.628579139709473,
"learning_rate": 2.876742160278746e-05,
"loss": 0.4707,
"step": 10250
},
{
"epoch": 13.120204603580563,
"grad_norm": 7.752285003662109,
"learning_rate": 2.874564459930314e-05,
"loss": 0.5434,
"step": 10260
},
{
"epoch": 13.132992327365729,
"grad_norm": 1.0182121992111206,
"learning_rate": 2.8723867595818816e-05,
"loss": 0.4221,
"step": 10270
},
{
"epoch": 13.145780051150895,
"grad_norm": 6.204538345336914,
"learning_rate": 2.8702090592334497e-05,
"loss": 0.25,
"step": 10280
},
{
"epoch": 13.158567774936062,
"grad_norm": 4.693891525268555,
"learning_rate": 2.8680313588850176e-05,
"loss": 0.5462,
"step": 10290
},
{
"epoch": 13.171355498721228,
"grad_norm": 0.026867728680372238,
"learning_rate": 2.8658536585365854e-05,
"loss": 0.0342,
"step": 10300
},
{
"epoch": 13.184143222506394,
"grad_norm": 0.9486016035079956,
"learning_rate": 2.8636759581881532e-05,
"loss": 0.4285,
"step": 10310
},
{
"epoch": 13.19693094629156,
"grad_norm": 0.1023603156208992,
"learning_rate": 2.8614982578397214e-05,
"loss": 0.0734,
"step": 10320
},
{
"epoch": 13.209718670076727,
"grad_norm": 2.3830373287200928,
"learning_rate": 2.8593205574912896e-05,
"loss": 0.219,
"step": 10330
},
{
"epoch": 13.222506393861893,
"grad_norm": 0.04898781701922417,
"learning_rate": 2.857142857142857e-05,
"loss": 0.0627,
"step": 10340
},
{
"epoch": 13.235294117647058,
"grad_norm": 0.0003403863520361483,
"learning_rate": 2.8549651567944253e-05,
"loss": 0.5191,
"step": 10350
},
{
"epoch": 13.248081841432224,
"grad_norm": 6.243635654449463,
"learning_rate": 2.8527874564459934e-05,
"loss": 0.5085,
"step": 10360
},
{
"epoch": 13.26086956521739,
"grad_norm": 3.0353028774261475,
"learning_rate": 2.850609756097561e-05,
"loss": 0.3381,
"step": 10370
},
{
"epoch": 13.273657289002557,
"grad_norm": 3.330402374267578,
"learning_rate": 2.848432055749129e-05,
"loss": 0.7394,
"step": 10380
},
{
"epoch": 13.286445012787723,
"grad_norm": 8.248026847839355,
"learning_rate": 2.846254355400697e-05,
"loss": 0.3344,
"step": 10390
},
{
"epoch": 13.29923273657289,
"grad_norm": 9.930607795715332,
"learning_rate": 2.844076655052265e-05,
"loss": 0.3615,
"step": 10400
},
{
"epoch": 13.312020460358056,
"grad_norm": 3.8150908946990967,
"learning_rate": 2.8418989547038326e-05,
"loss": 0.4423,
"step": 10410
},
{
"epoch": 13.324808184143222,
"grad_norm": 0.0005007564323022962,
"learning_rate": 2.8397212543554008e-05,
"loss": 0.4379,
"step": 10420
},
{
"epoch": 13.337595907928389,
"grad_norm": 10.748580932617188,
"learning_rate": 2.837543554006969e-05,
"loss": 0.7275,
"step": 10430
},
{
"epoch": 13.350383631713555,
"grad_norm": 2.6858019828796387,
"learning_rate": 2.8353658536585365e-05,
"loss": 0.3561,
"step": 10440
},
{
"epoch": 13.363171355498721,
"grad_norm": 0.2765854299068451,
"learning_rate": 2.8331881533101046e-05,
"loss": 0.3783,
"step": 10450
},
{
"epoch": 13.375959079283888,
"grad_norm": 7.886977672576904,
"learning_rate": 2.8310104529616728e-05,
"loss": 0.3486,
"step": 10460
},
{
"epoch": 13.388746803069054,
"grad_norm": 0.49375006556510925,
"learning_rate": 2.8288327526132406e-05,
"loss": 0.348,
"step": 10470
},
{
"epoch": 13.40153452685422,
"grad_norm": 1.5718512535095215,
"learning_rate": 2.826655052264808e-05,
"loss": 0.3416,
"step": 10480
},
{
"epoch": 13.414322250639387,
"grad_norm": 0.06271866708993912,
"learning_rate": 2.8244773519163763e-05,
"loss": 0.1687,
"step": 10490
},
{
"epoch": 13.427109974424553,
"grad_norm": 3.52473521232605,
"learning_rate": 2.8222996515679445e-05,
"loss": 0.5941,
"step": 10500
},
{
"epoch": 13.43989769820972,
"grad_norm": 0.5428152084350586,
"learning_rate": 2.820121951219512e-05,
"loss": 0.4202,
"step": 10510
},
{
"epoch": 13.452685421994884,
"grad_norm": 0.05799586698412895,
"learning_rate": 2.81794425087108e-05,
"loss": 0.2371,
"step": 10520
},
{
"epoch": 13.46547314578005,
"grad_norm": 3.599447250366211,
"learning_rate": 2.8157665505226483e-05,
"loss": 0.5036,
"step": 10530
},
{
"epoch": 13.478260869565217,
"grad_norm": 2.957773208618164,
"learning_rate": 2.8135888501742165e-05,
"loss": 0.5503,
"step": 10540
},
{
"epoch": 13.491048593350383,
"grad_norm": 3.7019054889678955,
"learning_rate": 2.811411149825784e-05,
"loss": 0.175,
"step": 10550
},
{
"epoch": 13.50383631713555,
"grad_norm": 0.025149650871753693,
"learning_rate": 2.809233449477352e-05,
"loss": 0.3618,
"step": 10560
},
{
"epoch": 13.516624040920716,
"grad_norm": 5.196008205413818,
"learning_rate": 2.80705574912892e-05,
"loss": 0.1966,
"step": 10570
},
{
"epoch": 13.529411764705882,
"grad_norm": 0.31828802824020386,
"learning_rate": 2.8048780487804882e-05,
"loss": 0.2693,
"step": 10580
},
{
"epoch": 13.542199488491049,
"grad_norm": 0.01727711223065853,
"learning_rate": 2.8027003484320557e-05,
"loss": 0.1533,
"step": 10590
},
{
"epoch": 13.554987212276215,
"grad_norm": 67.53722381591797,
"learning_rate": 2.800522648083624e-05,
"loss": 0.2984,
"step": 10600
},
{
"epoch": 13.567774936061381,
"grad_norm": 5.63265323638916,
"learning_rate": 2.798344947735192e-05,
"loss": 0.3842,
"step": 10610
},
{
"epoch": 13.580562659846548,
"grad_norm": 0.00870454777032137,
"learning_rate": 2.7961672473867595e-05,
"loss": 0.2308,
"step": 10620
},
{
"epoch": 13.593350383631714,
"grad_norm": 5.576174259185791,
"learning_rate": 2.7939895470383277e-05,
"loss": 0.4208,
"step": 10630
},
{
"epoch": 13.60613810741688,
"grad_norm": 6.635630130767822,
"learning_rate": 2.7918118466898955e-05,
"loss": 0.2455,
"step": 10640
},
{
"epoch": 13.618925831202047,
"grad_norm": 3.7037808895111084,
"learning_rate": 2.7896341463414637e-05,
"loss": 0.3545,
"step": 10650
},
{
"epoch": 13.631713554987213,
"grad_norm": 8.49842357635498,
"learning_rate": 2.7874564459930312e-05,
"loss": 0.291,
"step": 10660
},
{
"epoch": 13.644501278772378,
"grad_norm": 2.637025833129883,
"learning_rate": 2.7852787456445994e-05,
"loss": 0.3177,
"step": 10670
},
{
"epoch": 13.657289002557544,
"grad_norm": 1.063649296760559,
"learning_rate": 2.7831010452961676e-05,
"loss": 0.3922,
"step": 10680
},
{
"epoch": 13.67007672634271,
"grad_norm": 2.770068883895874,
"learning_rate": 2.780923344947735e-05,
"loss": 0.4757,
"step": 10690
},
{
"epoch": 13.682864450127877,
"grad_norm": 6.572999000549316,
"learning_rate": 2.7787456445993032e-05,
"loss": 0.3792,
"step": 10700
},
{
"epoch": 13.695652173913043,
"grad_norm": 0.05444970726966858,
"learning_rate": 2.7765679442508714e-05,
"loss": 0.179,
"step": 10710
},
{
"epoch": 13.70843989769821,
"grad_norm": 0.025916630402207375,
"learning_rate": 2.7743902439024393e-05,
"loss": 0.2356,
"step": 10720
},
{
"epoch": 13.721227621483376,
"grad_norm": 5.852172374725342,
"learning_rate": 2.772212543554007e-05,
"loss": 0.4052,
"step": 10730
},
{
"epoch": 13.734015345268542,
"grad_norm": 1.6785579919815063,
"learning_rate": 2.770034843205575e-05,
"loss": 0.2606,
"step": 10740
},
{
"epoch": 13.746803069053708,
"grad_norm": 0.3501584231853485,
"learning_rate": 2.767857142857143e-05,
"loss": 0.1516,
"step": 10750
},
{
"epoch": 13.759590792838875,
"grad_norm": 0.0033246350940316916,
"learning_rate": 2.7656794425087106e-05,
"loss": 0.2981,
"step": 10760
},
{
"epoch": 13.772378516624041,
"grad_norm": 10.814009666442871,
"learning_rate": 2.7635017421602788e-05,
"loss": 0.5273,
"step": 10770
},
{
"epoch": 13.785166240409207,
"grad_norm": 0.16116586327552795,
"learning_rate": 2.761324041811847e-05,
"loss": 0.1441,
"step": 10780
},
{
"epoch": 13.797953964194374,
"grad_norm": 0.0063223871402442455,
"learning_rate": 2.759146341463415e-05,
"loss": 0.1038,
"step": 10790
},
{
"epoch": 13.81074168797954,
"grad_norm": 3.8742804527282715,
"learning_rate": 2.7569686411149826e-05,
"loss": 0.2702,
"step": 10800
},
{
"epoch": 13.823529411764707,
"grad_norm": 0.9244747757911682,
"learning_rate": 2.7547909407665508e-05,
"loss": 0.1768,
"step": 10810
},
{
"epoch": 13.836317135549873,
"grad_norm": 0.29645875096321106,
"learning_rate": 2.7526132404181186e-05,
"loss": 0.1818,
"step": 10820
},
{
"epoch": 13.84910485933504,
"grad_norm": 1.829161286354065,
"learning_rate": 2.750435540069686e-05,
"loss": 0.2051,
"step": 10830
},
{
"epoch": 13.861892583120204,
"grad_norm": 2.0264694690704346,
"learning_rate": 2.7482578397212543e-05,
"loss": 0.328,
"step": 10840
},
{
"epoch": 13.87468030690537,
"grad_norm": 1.9954897165298462,
"learning_rate": 2.7460801393728225e-05,
"loss": 0.2223,
"step": 10850
},
{
"epoch": 13.887468030690536,
"grad_norm": 7.2884521484375,
"learning_rate": 2.7439024390243906e-05,
"loss": 0.3175,
"step": 10860
},
{
"epoch": 13.900255754475703,
"grad_norm": 3.5259621143341064,
"learning_rate": 2.741724738675958e-05,
"loss": 0.5335,
"step": 10870
},
{
"epoch": 13.91304347826087,
"grad_norm": 1.8546830415725708,
"learning_rate": 2.7395470383275263e-05,
"loss": 0.3467,
"step": 10880
},
{
"epoch": 13.925831202046036,
"grad_norm": 0.04471950605511665,
"learning_rate": 2.7373693379790945e-05,
"loss": 0.5066,
"step": 10890
},
{
"epoch": 13.938618925831202,
"grad_norm": 0.1593080461025238,
"learning_rate": 2.735191637630662e-05,
"loss": 0.2206,
"step": 10900
},
{
"epoch": 13.951406649616368,
"grad_norm": 0.12783415615558624,
"learning_rate": 2.7330139372822298e-05,
"loss": 0.2484,
"step": 10910
},
{
"epoch": 13.964194373401535,
"grad_norm": 6.850285530090332,
"learning_rate": 2.730836236933798e-05,
"loss": 0.2997,
"step": 10920
},
{
"epoch": 13.976982097186701,
"grad_norm": 0.002649192698299885,
"learning_rate": 2.7286585365853662e-05,
"loss": 0.323,
"step": 10930
},
{
"epoch": 13.989769820971867,
"grad_norm": 6.734117031097412,
"learning_rate": 2.7264808362369337e-05,
"loss": 0.4467,
"step": 10940
},
{
"epoch": 14.0,
"eval_loss": 0.20195920765399933,
"eval_runtime": 0.9722,
"eval_samples_per_second": 100.807,
"eval_steps_per_second": 13.372,
"step": 10948
},
{
"epoch": 14.002557544757034,
"grad_norm": 9.695572853088379,
"learning_rate": 2.724303135888502e-05,
"loss": 0.4267,
"step": 10950
},
{
"epoch": 14.0153452685422,
"grad_norm": 1.4536675214767456,
"learning_rate": 2.72212543554007e-05,
"loss": 0.0807,
"step": 10960
},
{
"epoch": 14.028132992327366,
"grad_norm": 0.2081240564584732,
"learning_rate": 2.7199477351916382e-05,
"loss": 0.5784,
"step": 10970
},
{
"epoch": 14.040920716112533,
"grad_norm": 0.38465332984924316,
"learning_rate": 2.7177700348432057e-05,
"loss": 0.4252,
"step": 10980
},
{
"epoch": 14.053708439897699,
"grad_norm": 0.16156940162181854,
"learning_rate": 2.7155923344947735e-05,
"loss": 0.2058,
"step": 10990
},
{
"epoch": 14.066496163682864,
"grad_norm": 4.0370917320251465,
"learning_rate": 2.7134146341463417e-05,
"loss": 0.5383,
"step": 11000
},
{
"epoch": 14.07928388746803,
"grad_norm": 12.344548225402832,
"learning_rate": 2.7112369337979092e-05,
"loss": 0.3364,
"step": 11010
},
{
"epoch": 14.092071611253196,
"grad_norm": 4.156929969787598,
"learning_rate": 2.7090592334494774e-05,
"loss": 0.3422,
"step": 11020
},
{
"epoch": 14.104859335038363,
"grad_norm": 1.435106873512268,
"learning_rate": 2.7068815331010456e-05,
"loss": 0.3734,
"step": 11030
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.010196288116276264,
"learning_rate": 2.7047038327526137e-05,
"loss": 0.4882,
"step": 11040
},
{
"epoch": 14.130434782608695,
"grad_norm": 3.2925307750701904,
"learning_rate": 2.7025261324041812e-05,
"loss": 0.2681,
"step": 11050
},
{
"epoch": 14.143222506393862,
"grad_norm": 0.34978732466697693,
"learning_rate": 2.7003484320557494e-05,
"loss": 0.3192,
"step": 11060
},
{
"epoch": 14.156010230179028,
"grad_norm": 8.82948112487793,
"learning_rate": 2.6981707317073172e-05,
"loss": 0.2327,
"step": 11070
},
{
"epoch": 14.168797953964194,
"grad_norm": 7.798665523529053,
"learning_rate": 2.695993031358885e-05,
"loss": 0.3433,
"step": 11080
},
{
"epoch": 14.18158567774936,
"grad_norm": 5.831617832183838,
"learning_rate": 2.693815331010453e-05,
"loss": 0.4675,
"step": 11090
},
{
"epoch": 14.194373401534527,
"grad_norm": 7.787278652191162,
"learning_rate": 2.691637630662021e-05,
"loss": 0.3635,
"step": 11100
},
{
"epoch": 14.207161125319693,
"grad_norm": 2.973318576812744,
"learning_rate": 2.6894599303135893e-05,
"loss": 0.2639,
"step": 11110
},
{
"epoch": 14.21994884910486,
"grad_norm": 1.2989455461502075,
"learning_rate": 2.6872822299651568e-05,
"loss": 0.1029,
"step": 11120
},
{
"epoch": 14.232736572890026,
"grad_norm": 4.172037124633789,
"learning_rate": 2.685104529616725e-05,
"loss": 0.3039,
"step": 11130
},
{
"epoch": 14.245524296675192,
"grad_norm": 5.493216037750244,
"learning_rate": 2.682926829268293e-05,
"loss": 0.3115,
"step": 11140
},
{
"epoch": 14.258312020460359,
"grad_norm": 2.573470115661621,
"learning_rate": 2.6807491289198606e-05,
"loss": 0.1641,
"step": 11150
},
{
"epoch": 14.271099744245525,
"grad_norm": 1.0581598281860352,
"learning_rate": 2.6785714285714288e-05,
"loss": 0.2258,
"step": 11160
},
{
"epoch": 14.28388746803069,
"grad_norm": 0.6617485880851746,
"learning_rate": 2.6763937282229966e-05,
"loss": 0.1989,
"step": 11170
},
{
"epoch": 14.296675191815856,
"grad_norm": 3.649031162261963,
"learning_rate": 2.6742160278745648e-05,
"loss": 0.1438,
"step": 11180
},
{
"epoch": 14.309462915601022,
"grad_norm": 5.712010383605957,
"learning_rate": 2.6720383275261323e-05,
"loss": 0.404,
"step": 11190
},
{
"epoch": 14.322250639386189,
"grad_norm": 4.470798492431641,
"learning_rate": 2.6698606271777005e-05,
"loss": 0.6005,
"step": 11200
},
{
"epoch": 14.335038363171355,
"grad_norm": 5.101674556732178,
"learning_rate": 2.6676829268292686e-05,
"loss": 0.3332,
"step": 11210
},
{
"epoch": 14.347826086956522,
"grad_norm": 5.864815711975098,
"learning_rate": 2.665505226480836e-05,
"loss": 0.25,
"step": 11220
},
{
"epoch": 14.360613810741688,
"grad_norm": 4.044086933135986,
"learning_rate": 2.6633275261324043e-05,
"loss": 0.284,
"step": 11230
},
{
"epoch": 14.373401534526854,
"grad_norm": 1.0305874347686768,
"learning_rate": 2.6611498257839725e-05,
"loss": 0.3079,
"step": 11240
},
{
"epoch": 14.38618925831202,
"grad_norm": 2.8594095706939697,
"learning_rate": 2.6589721254355403e-05,
"loss": 0.194,
"step": 11250
},
{
"epoch": 14.398976982097187,
"grad_norm": 0.24175876379013062,
"learning_rate": 2.6567944250871078e-05,
"loss": 0.2451,
"step": 11260
},
{
"epoch": 14.411764705882353,
"grad_norm": 1.0513581037521362,
"learning_rate": 2.654616724738676e-05,
"loss": 0.1766,
"step": 11270
},
{
"epoch": 14.42455242966752,
"grad_norm": 2.0483906269073486,
"learning_rate": 2.652439024390244e-05,
"loss": 0.2997,
"step": 11280
},
{
"epoch": 14.437340153452686,
"grad_norm": 6.397331714630127,
"learning_rate": 2.6502613240418117e-05,
"loss": 0.2371,
"step": 11290
},
{
"epoch": 14.450127877237852,
"grad_norm": 0.0007806714274920523,
"learning_rate": 2.6480836236933798e-05,
"loss": 0.4092,
"step": 11300
},
{
"epoch": 14.462915601023019,
"grad_norm": 9.310996055603027,
"learning_rate": 2.645905923344948e-05,
"loss": 0.1948,
"step": 11310
},
{
"epoch": 14.475703324808185,
"grad_norm": 0.010096575133502483,
"learning_rate": 2.6437282229965162e-05,
"loss": 0.4244,
"step": 11320
},
{
"epoch": 14.48849104859335,
"grad_norm": 0.21134832501411438,
"learning_rate": 2.6415505226480837e-05,
"loss": 0.1581,
"step": 11330
},
{
"epoch": 14.501278772378516,
"grad_norm": 4.014981746673584,
"learning_rate": 2.6393728222996515e-05,
"loss": 0.3811,
"step": 11340
},
{
"epoch": 14.514066496163682,
"grad_norm": 2.516115427017212,
"learning_rate": 2.6371951219512197e-05,
"loss": 0.358,
"step": 11350
},
{
"epoch": 14.526854219948849,
"grad_norm": 1.9139074087142944,
"learning_rate": 2.6350174216027872e-05,
"loss": 0.1782,
"step": 11360
},
{
"epoch": 14.539641943734015,
"grad_norm": 0.18447408080101013,
"learning_rate": 2.6328397212543554e-05,
"loss": 0.2078,
"step": 11370
},
{
"epoch": 14.552429667519181,
"grad_norm": 2.9752326011657715,
"learning_rate": 2.6306620209059235e-05,
"loss": 0.4245,
"step": 11380
},
{
"epoch": 14.565217391304348,
"grad_norm": 6.934181213378906,
"learning_rate": 2.6284843205574917e-05,
"loss": 0.4025,
"step": 11390
},
{
"epoch": 14.578005115089514,
"grad_norm": 0.07944456487894058,
"learning_rate": 2.6263066202090592e-05,
"loss": 0.1398,
"step": 11400
},
{
"epoch": 14.59079283887468,
"grad_norm": 2.6303796768188477,
"learning_rate": 2.6241289198606274e-05,
"loss": 0.1872,
"step": 11410
},
{
"epoch": 14.603580562659847,
"grad_norm": 9.203404426574707,
"learning_rate": 2.6219512195121952e-05,
"loss": 0.7493,
"step": 11420
},
{
"epoch": 14.616368286445013,
"grad_norm": 5.315173149108887,
"learning_rate": 2.6197735191637634e-05,
"loss": 0.3889,
"step": 11430
},
{
"epoch": 14.62915601023018,
"grad_norm": 1.0642848014831543,
"learning_rate": 2.617595818815331e-05,
"loss": 0.2307,
"step": 11440
},
{
"epoch": 14.641943734015346,
"grad_norm": 1.01585853099823,
"learning_rate": 2.615418118466899e-05,
"loss": 0.1184,
"step": 11450
},
{
"epoch": 14.654731457800512,
"grad_norm": 0.000667865970171988,
"learning_rate": 2.6132404181184672e-05,
"loss": 0.3112,
"step": 11460
},
{
"epoch": 14.667519181585678,
"grad_norm": 0.2515814006328583,
"learning_rate": 2.6110627177700347e-05,
"loss": 0.1608,
"step": 11470
},
{
"epoch": 14.680306905370845,
"grad_norm": 1.5224660634994507,
"learning_rate": 2.608885017421603e-05,
"loss": 0.2974,
"step": 11480
},
{
"epoch": 14.693094629156011,
"grad_norm": 0.15977902710437775,
"learning_rate": 2.606707317073171e-05,
"loss": 0.6651,
"step": 11490
},
{
"epoch": 14.705882352941176,
"grad_norm": 4.893415451049805,
"learning_rate": 2.604529616724739e-05,
"loss": 0.4917,
"step": 11500
},
{
"epoch": 14.718670076726342,
"grad_norm": 1.119374394416809,
"learning_rate": 2.6023519163763068e-05,
"loss": 0.0827,
"step": 11510
},
{
"epoch": 14.731457800511508,
"grad_norm": 1.0671442747116089,
"learning_rate": 2.6001742160278746e-05,
"loss": 0.2147,
"step": 11520
},
{
"epoch": 14.744245524296675,
"grad_norm": 9.146306037902832,
"learning_rate": 2.5979965156794428e-05,
"loss": 0.5441,
"step": 11530
},
{
"epoch": 14.757033248081841,
"grad_norm": 3.6414523124694824,
"learning_rate": 2.5958188153310103e-05,
"loss": 0.2364,
"step": 11540
},
{
"epoch": 14.769820971867007,
"grad_norm": 4.342695236206055,
"learning_rate": 2.5936411149825784e-05,
"loss": 0.596,
"step": 11550
},
{
"epoch": 14.782608695652174,
"grad_norm": 0.005332822445780039,
"learning_rate": 2.5914634146341466e-05,
"loss": 0.1528,
"step": 11560
},
{
"epoch": 14.79539641943734,
"grad_norm": 2.429649591445923,
"learning_rate": 2.5892857142857148e-05,
"loss": 0.2345,
"step": 11570
},
{
"epoch": 14.808184143222507,
"grad_norm": 1.8359285593032837,
"learning_rate": 2.5871080139372823e-05,
"loss": 0.3112,
"step": 11580
},
{
"epoch": 14.820971867007673,
"grad_norm": 0.00027430267073214054,
"learning_rate": 2.5849303135888505e-05,
"loss": 0.1532,
"step": 11590
},
{
"epoch": 14.83375959079284,
"grad_norm": 10.834996223449707,
"learning_rate": 2.5827526132404183e-05,
"loss": 0.2621,
"step": 11600
},
{
"epoch": 14.846547314578006,
"grad_norm": 0.0005706704687327147,
"learning_rate": 2.5805749128919858e-05,
"loss": 0.543,
"step": 11610
},
{
"epoch": 14.859335038363172,
"grad_norm": 10.665911674499512,
"learning_rate": 2.578397212543554e-05,
"loss": 0.3869,
"step": 11620
},
{
"epoch": 14.872122762148338,
"grad_norm": 8.830192565917969,
"learning_rate": 2.576219512195122e-05,
"loss": 0.4839,
"step": 11630
},
{
"epoch": 14.884910485933505,
"grad_norm": 4.080071926116943,
"learning_rate": 2.5740418118466903e-05,
"loss": 0.1488,
"step": 11640
},
{
"epoch": 14.89769820971867,
"grad_norm": 0.40564408898353577,
"learning_rate": 2.5718641114982578e-05,
"loss": 0.1852,
"step": 11650
},
{
"epoch": 14.910485933503836,
"grad_norm": 2.581697940826416,
"learning_rate": 2.569686411149826e-05,
"loss": 0.4561,
"step": 11660
},
{
"epoch": 14.923273657289002,
"grad_norm": 0.11887872964143753,
"learning_rate": 2.567508710801394e-05,
"loss": 0.2166,
"step": 11670
},
{
"epoch": 14.936061381074168,
"grad_norm": 1.9810340404510498,
"learning_rate": 2.5653310104529617e-05,
"loss": 0.2498,
"step": 11680
},
{
"epoch": 14.948849104859335,
"grad_norm": 0.0242207869887352,
"learning_rate": 2.5631533101045295e-05,
"loss": 0.3475,
"step": 11690
},
{
"epoch": 14.961636828644501,
"grad_norm": 3.3264272212982178,
"learning_rate": 2.5609756097560977e-05,
"loss": 0.513,
"step": 11700
},
{
"epoch": 14.974424552429667,
"grad_norm": 2.6883902549743652,
"learning_rate": 2.558797909407666e-05,
"loss": 0.2488,
"step": 11710
},
{
"epoch": 14.987212276214834,
"grad_norm": 1.0811492204666138,
"learning_rate": 2.5566202090592333e-05,
"loss": 0.2837,
"step": 11720
},
{
"epoch": 15.0,
"grad_norm": 3.0630617141723633,
"learning_rate": 2.5544425087108015e-05,
"loss": 0.4542,
"step": 11730
},
{
"epoch": 15.0,
"eval_loss": 0.1965622454881668,
"eval_runtime": 0.8126,
"eval_samples_per_second": 120.604,
"eval_steps_per_second": 15.998,
"step": 11730
},
{
"epoch": 15.012787723785166,
"grad_norm": 2.454094171524048,
"learning_rate": 2.5522648083623697e-05,
"loss": 0.226,
"step": 11740
},
{
"epoch": 15.025575447570333,
"grad_norm": 3.766000986099243,
"learning_rate": 2.5500871080139372e-05,
"loss": 0.2517,
"step": 11750
},
{
"epoch": 15.038363171355499,
"grad_norm": 7.74129056930542,
"learning_rate": 2.5479094076655054e-05,
"loss": 0.2052,
"step": 11760
},
{
"epoch": 15.051150895140665,
"grad_norm": 2.924163341522217,
"learning_rate": 2.5457317073170732e-05,
"loss": 0.361,
"step": 11770
},
{
"epoch": 15.063938618925832,
"grad_norm": 0.38849493861198425,
"learning_rate": 2.5435540069686414e-05,
"loss": 0.2116,
"step": 11780
},
{
"epoch": 15.076726342710998,
"grad_norm": 0.18012334406375885,
"learning_rate": 2.541376306620209e-05,
"loss": 0.3311,
"step": 11790
},
{
"epoch": 15.089514066496164,
"grad_norm": 5.127551078796387,
"learning_rate": 2.539198606271777e-05,
"loss": 0.3043,
"step": 11800
},
{
"epoch": 15.10230179028133,
"grad_norm": 2.3291471004486084,
"learning_rate": 2.5370209059233452e-05,
"loss": 0.457,
"step": 11810
},
{
"epoch": 15.115089514066495,
"grad_norm": 3.040384531021118,
"learning_rate": 2.5348432055749134e-05,
"loss": 0.3165,
"step": 11820
},
{
"epoch": 15.127877237851662,
"grad_norm": 0.054389629513025284,
"learning_rate": 2.532665505226481e-05,
"loss": 0.0757,
"step": 11830
},
{
"epoch": 15.140664961636828,
"grad_norm": 8.579995155334473,
"learning_rate": 2.530487804878049e-05,
"loss": 0.3138,
"step": 11840
},
{
"epoch": 15.153452685421994,
"grad_norm": 0.020280305296182632,
"learning_rate": 2.528310104529617e-05,
"loss": 0.1894,
"step": 11850
},
{
"epoch": 15.16624040920716,
"grad_norm": 6.280925750732422,
"learning_rate": 2.5261324041811847e-05,
"loss": 0.4913,
"step": 11860
},
{
"epoch": 15.179028132992327,
"grad_norm": 3.4848694801330566,
"learning_rate": 2.5239547038327526e-05,
"loss": 0.2397,
"step": 11870
},
{
"epoch": 15.191815856777493,
"grad_norm": 5.22654390335083,
"learning_rate": 2.5217770034843207e-05,
"loss": 0.3758,
"step": 11880
},
{
"epoch": 15.20460358056266,
"grad_norm": 3.6274683475494385,
"learning_rate": 2.519599303135889e-05,
"loss": 0.2949,
"step": 11890
},
{
"epoch": 15.217391304347826,
"grad_norm": 0.8676568269729614,
"learning_rate": 2.5174216027874564e-05,
"loss": 0.1737,
"step": 11900
},
{
"epoch": 15.230179028132993,
"grad_norm": 0.00011830198491225019,
"learning_rate": 2.5152439024390246e-05,
"loss": 0.0692,
"step": 11910
},
{
"epoch": 15.242966751918159,
"grad_norm": 0.6105664372444153,
"learning_rate": 2.5130662020905928e-05,
"loss": 0.2105,
"step": 11920
},
{
"epoch": 15.255754475703325,
"grad_norm": 2.1029629707336426,
"learning_rate": 2.5108885017421603e-05,
"loss": 0.3843,
"step": 11930
},
{
"epoch": 15.268542199488492,
"grad_norm": 0.1929955929517746,
"learning_rate": 2.5087108013937284e-05,
"loss": 0.2277,
"step": 11940
},
{
"epoch": 15.281329923273658,
"grad_norm": 0.02147739939391613,
"learning_rate": 2.5065331010452963e-05,
"loss": 0.1369,
"step": 11950
},
{
"epoch": 15.294117647058824,
"grad_norm": 3.043717861175537,
"learning_rate": 2.5043554006968644e-05,
"loss": 0.4935,
"step": 11960
},
{
"epoch": 15.30690537084399,
"grad_norm": 0.0011144510935992002,
"learning_rate": 2.502177700348432e-05,
"loss": 0.3493,
"step": 11970
},
{
"epoch": 15.319693094629155,
"grad_norm": 5.858348846435547,
"learning_rate": 2.5e-05,
"loss": 0.2531,
"step": 11980
},
{
"epoch": 15.332480818414322,
"grad_norm": 0.5781923532485962,
"learning_rate": 2.497822299651568e-05,
"loss": 0.2783,
"step": 11990
},
{
"epoch": 15.345268542199488,
"grad_norm": 0.5104002356529236,
"learning_rate": 2.495644599303136e-05,
"loss": 0.5572,
"step": 12000
},
{
"epoch": 15.358056265984654,
"grad_norm": 0.00011129306221846491,
"learning_rate": 2.493466898954704e-05,
"loss": 0.2961,
"step": 12010
},
{
"epoch": 15.37084398976982,
"grad_norm": 14.113851547241211,
"learning_rate": 2.4912891986062718e-05,
"loss": 0.4369,
"step": 12020
},
{
"epoch": 15.383631713554987,
"grad_norm": 1.8240095376968384,
"learning_rate": 2.4891114982578396e-05,
"loss": 0.2643,
"step": 12030
},
{
"epoch": 15.396419437340153,
"grad_norm": 0.23419003188610077,
"learning_rate": 2.4869337979094078e-05,
"loss": 0.2411,
"step": 12040
},
{
"epoch": 15.40920716112532,
"grad_norm": 10.807489395141602,
"learning_rate": 2.4847560975609756e-05,
"loss": 0.1637,
"step": 12050
},
{
"epoch": 15.421994884910486,
"grad_norm": 18.70671844482422,
"learning_rate": 2.4825783972125435e-05,
"loss": 0.2905,
"step": 12060
},
{
"epoch": 15.434782608695652,
"grad_norm": 0.11549190431833267,
"learning_rate": 2.4804006968641117e-05,
"loss": 0.3249,
"step": 12070
},
{
"epoch": 15.447570332480819,
"grad_norm": 0.6824242472648621,
"learning_rate": 2.4782229965156795e-05,
"loss": 0.3821,
"step": 12080
},
{
"epoch": 15.460358056265985,
"grad_norm": 2.258577823638916,
"learning_rate": 2.4760452961672477e-05,
"loss": 0.2022,
"step": 12090
},
{
"epoch": 15.473145780051151,
"grad_norm": 4.128283500671387,
"learning_rate": 2.4738675958188155e-05,
"loss": 0.4868,
"step": 12100
},
{
"epoch": 15.485933503836318,
"grad_norm": 2.9728426933288574,
"learning_rate": 2.4716898954703833e-05,
"loss": 0.2863,
"step": 12110
},
{
"epoch": 15.498721227621484,
"grad_norm": 2.3259143829345703,
"learning_rate": 2.4695121951219512e-05,
"loss": 0.4513,
"step": 12120
},
{
"epoch": 15.51150895140665,
"grad_norm": 0.002854045946151018,
"learning_rate": 2.4673344947735194e-05,
"loss": 0.3729,
"step": 12130
},
{
"epoch": 15.524296675191817,
"grad_norm": 6.9137091636657715,
"learning_rate": 2.4651567944250872e-05,
"loss": 0.4284,
"step": 12140
},
{
"epoch": 15.537084398976981,
"grad_norm": 0.3184433579444885,
"learning_rate": 2.462979094076655e-05,
"loss": 0.347,
"step": 12150
},
{
"epoch": 15.549872122762148,
"grad_norm": 0.01655314862728119,
"learning_rate": 2.4608013937282232e-05,
"loss": 0.2761,
"step": 12160
},
{
"epoch": 15.562659846547314,
"grad_norm": 0.7789549231529236,
"learning_rate": 2.458623693379791e-05,
"loss": 0.2362,
"step": 12170
},
{
"epoch": 15.57544757033248,
"grad_norm": 0.0008993714000098407,
"learning_rate": 2.4564459930313592e-05,
"loss": 0.1348,
"step": 12180
},
{
"epoch": 15.588235294117647,
"grad_norm": 6.437202453613281,
"learning_rate": 2.454268292682927e-05,
"loss": 0.3716,
"step": 12190
},
{
"epoch": 15.601023017902813,
"grad_norm": 2.607696294784546,
"learning_rate": 2.452090592334495e-05,
"loss": 0.2855,
"step": 12200
},
{
"epoch": 15.61381074168798,
"grad_norm": 3.7107765674591064,
"learning_rate": 2.4499128919860627e-05,
"loss": 0.1729,
"step": 12210
},
{
"epoch": 15.626598465473146,
"grad_norm": 10.343840599060059,
"learning_rate": 2.4477351916376306e-05,
"loss": 0.2955,
"step": 12220
},
{
"epoch": 15.639386189258312,
"grad_norm": 2.9948103427886963,
"learning_rate": 2.4455574912891987e-05,
"loss": 0.1801,
"step": 12230
},
{
"epoch": 15.652173913043478,
"grad_norm": 7.791348457336426,
"learning_rate": 2.4433797909407666e-05,
"loss": 0.1949,
"step": 12240
},
{
"epoch": 15.664961636828645,
"grad_norm": 4.23246955871582,
"learning_rate": 2.4412020905923347e-05,
"loss": 0.4701,
"step": 12250
},
{
"epoch": 15.677749360613811,
"grad_norm": 0.00041086444980464876,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.2601,
"step": 12260
},
{
"epoch": 15.690537084398978,
"grad_norm": 0.3954007625579834,
"learning_rate": 2.4368466898954707e-05,
"loss": 0.4062,
"step": 12270
},
{
"epoch": 15.703324808184144,
"grad_norm": 0.017659878358244896,
"learning_rate": 2.4346689895470386e-05,
"loss": 0.283,
"step": 12280
},
{
"epoch": 15.71611253196931,
"grad_norm": 2.2281081676483154,
"learning_rate": 2.4324912891986064e-05,
"loss": 0.1874,
"step": 12290
},
{
"epoch": 15.728900255754475,
"grad_norm": 0.07254109531641006,
"learning_rate": 2.4303135888501743e-05,
"loss": 0.1974,
"step": 12300
},
{
"epoch": 15.741687979539641,
"grad_norm": 5.276591777801514,
"learning_rate": 2.428135888501742e-05,
"loss": 0.3396,
"step": 12310
},
{
"epoch": 15.754475703324808,
"grad_norm": 3.9964542388916016,
"learning_rate": 2.4259581881533103e-05,
"loss": 0.166,
"step": 12320
},
{
"epoch": 15.767263427109974,
"grad_norm": 10.958535194396973,
"learning_rate": 2.423780487804878e-05,
"loss": 0.3589,
"step": 12330
},
{
"epoch": 15.78005115089514,
"grad_norm": 1.8940563201904297,
"learning_rate": 2.4216027874564463e-05,
"loss": 0.5555,
"step": 12340
},
{
"epoch": 15.792838874680307,
"grad_norm": 7.3639421463012695,
"learning_rate": 2.419425087108014e-05,
"loss": 0.5132,
"step": 12350
},
{
"epoch": 15.805626598465473,
"grad_norm": 1.4476559162139893,
"learning_rate": 2.4172473867595823e-05,
"loss": 0.1616,
"step": 12360
},
{
"epoch": 15.81841432225064,
"grad_norm": 0.613338828086853,
"learning_rate": 2.4150696864111498e-05,
"loss": 0.1627,
"step": 12370
},
{
"epoch": 15.831202046035806,
"grad_norm": 1.008769154548645,
"learning_rate": 2.4128919860627176e-05,
"loss": 0.4227,
"step": 12380
},
{
"epoch": 15.843989769820972,
"grad_norm": 0.0030397542286664248,
"learning_rate": 2.4107142857142858e-05,
"loss": 0.4402,
"step": 12390
},
{
"epoch": 15.856777493606138,
"grad_norm": 22.37423324584961,
"learning_rate": 2.4085365853658536e-05,
"loss": 0.4533,
"step": 12400
},
{
"epoch": 15.869565217391305,
"grad_norm": 5.366839408874512,
"learning_rate": 2.4063588850174218e-05,
"loss": 0.2505,
"step": 12410
},
{
"epoch": 15.882352941176471,
"grad_norm": 0.02170138992369175,
"learning_rate": 2.4041811846689896e-05,
"loss": 0.3758,
"step": 12420
},
{
"epoch": 15.895140664961637,
"grad_norm": 5.612401962280273,
"learning_rate": 2.4020034843205578e-05,
"loss": 0.2679,
"step": 12430
},
{
"epoch": 15.907928388746804,
"grad_norm": 1.2804334163665771,
"learning_rate": 2.3998257839721257e-05,
"loss": 0.1707,
"step": 12440
},
{
"epoch": 15.92071611253197,
"grad_norm": 0.618193507194519,
"learning_rate": 2.3976480836236935e-05,
"loss": 0.4114,
"step": 12450
},
{
"epoch": 15.933503836317136,
"grad_norm": 0.7701372504234314,
"learning_rate": 2.3954703832752613e-05,
"loss": 0.157,
"step": 12460
},
{
"epoch": 15.946291560102301,
"grad_norm": 4.087001323699951,
"learning_rate": 2.393292682926829e-05,
"loss": 0.3672,
"step": 12470
},
{
"epoch": 15.959079283887467,
"grad_norm": 1.3045196533203125,
"learning_rate": 2.3911149825783973e-05,
"loss": 0.3114,
"step": 12480
},
{
"epoch": 15.971867007672634,
"grad_norm": 0.04855654016137123,
"learning_rate": 2.388937282229965e-05,
"loss": 0.4491,
"step": 12490
},
{
"epoch": 15.9846547314578,
"grad_norm": 3.1147804260253906,
"learning_rate": 2.3867595818815333e-05,
"loss": 0.274,
"step": 12500
},
{
"epoch": 15.997442455242966,
"grad_norm": 0.048398375511169434,
"learning_rate": 2.3845818815331012e-05,
"loss": 0.4065,
"step": 12510
},
{
"epoch": 16.0,
"eval_loss": 0.1960146725177765,
"eval_runtime": 0.9892,
"eval_samples_per_second": 99.075,
"eval_steps_per_second": 13.143,
"step": 12512
},
{
"epoch": 16.010230179028135,
"grad_norm": 4.201949596405029,
"learning_rate": 2.3824041811846694e-05,
"loss": 0.2956,
"step": 12520
},
{
"epoch": 16.0230179028133,
"grad_norm": 0.5891295671463013,
"learning_rate": 2.3802264808362372e-05,
"loss": 0.3316,
"step": 12530
},
{
"epoch": 16.035805626598467,
"grad_norm": 7.009029388427734,
"learning_rate": 2.378048780487805e-05,
"loss": 0.3448,
"step": 12540
},
{
"epoch": 16.04859335038363,
"grad_norm": 2.449805498123169,
"learning_rate": 2.375871080139373e-05,
"loss": 0.1357,
"step": 12550
},
{
"epoch": 16.061381074168796,
"grad_norm": 1.8380799293518066,
"learning_rate": 2.3736933797909407e-05,
"loss": 0.2085,
"step": 12560
},
{
"epoch": 16.074168797953963,
"grad_norm": 5.865932941436768,
"learning_rate": 2.371515679442509e-05,
"loss": 0.3385,
"step": 12570
},
{
"epoch": 16.08695652173913,
"grad_norm": 1.848507285118103,
"learning_rate": 2.3693379790940767e-05,
"loss": 0.2499,
"step": 12580
},
{
"epoch": 16.099744245524295,
"grad_norm": 1.9168906211853027,
"learning_rate": 2.367160278745645e-05,
"loss": 0.241,
"step": 12590
},
{
"epoch": 16.11253196930946,
"grad_norm": 0.5782299041748047,
"learning_rate": 2.3649825783972127e-05,
"loss": 0.2668,
"step": 12600
},
{
"epoch": 16.125319693094628,
"grad_norm": 0.9813456535339355,
"learning_rate": 2.3628048780487806e-05,
"loss": 0.1909,
"step": 12610
},
{
"epoch": 16.138107416879794,
"grad_norm": 0.9238201975822449,
"learning_rate": 2.3606271777003487e-05,
"loss": 0.3034,
"step": 12620
},
{
"epoch": 16.15089514066496,
"grad_norm": 0.00906333327293396,
"learning_rate": 2.3584494773519166e-05,
"loss": 0.5375,
"step": 12630
},
{
"epoch": 16.163682864450127,
"grad_norm": 1.1421220302581787,
"learning_rate": 2.3562717770034844e-05,
"loss": 0.2129,
"step": 12640
},
{
"epoch": 16.176470588235293,
"grad_norm": 0.0030761337839066982,
"learning_rate": 2.3540940766550522e-05,
"loss": 0.2951,
"step": 12650
},
{
"epoch": 16.18925831202046,
"grad_norm": 0.29623809456825256,
"learning_rate": 2.3519163763066204e-05,
"loss": 0.3342,
"step": 12660
},
{
"epoch": 16.202046035805626,
"grad_norm": 0.129877507686615,
"learning_rate": 2.3497386759581882e-05,
"loss": 0.2952,
"step": 12670
},
{
"epoch": 16.214833759590793,
"grad_norm": 0.6856746077537537,
"learning_rate": 2.347560975609756e-05,
"loss": 0.3252,
"step": 12680
},
{
"epoch": 16.22762148337596,
"grad_norm": 3.098801374435425,
"learning_rate": 2.3453832752613243e-05,
"loss": 0.1322,
"step": 12690
},
{
"epoch": 16.240409207161125,
"grad_norm": 0.560373067855835,
"learning_rate": 2.343205574912892e-05,
"loss": 0.2272,
"step": 12700
},
{
"epoch": 16.25319693094629,
"grad_norm": 0.7054154276847839,
"learning_rate": 2.3410278745644603e-05,
"loss": 0.4074,
"step": 12710
},
{
"epoch": 16.265984654731458,
"grad_norm": 5.791378498077393,
"learning_rate": 2.3388501742160278e-05,
"loss": 0.4496,
"step": 12720
},
{
"epoch": 16.278772378516624,
"grad_norm": 0.18869848549365997,
"learning_rate": 2.336672473867596e-05,
"loss": 0.1884,
"step": 12730
},
{
"epoch": 16.29156010230179,
"grad_norm": 5.4613776206970215,
"learning_rate": 2.3344947735191638e-05,
"loss": 0.4337,
"step": 12740
},
{
"epoch": 16.304347826086957,
"grad_norm": 6.673255920410156,
"learning_rate": 2.332317073170732e-05,
"loss": 0.3041,
"step": 12750
},
{
"epoch": 16.317135549872123,
"grad_norm": 15.871744155883789,
"learning_rate": 2.3301393728222998e-05,
"loss": 0.4633,
"step": 12760
},
{
"epoch": 16.32992327365729,
"grad_norm": 0.38828790187835693,
"learning_rate": 2.3279616724738676e-05,
"loss": 0.2658,
"step": 12770
},
{
"epoch": 16.342710997442456,
"grad_norm": 1.8368539810180664,
"learning_rate": 2.3257839721254358e-05,
"loss": 0.0951,
"step": 12780
},
{
"epoch": 16.355498721227622,
"grad_norm": 6.898545265197754,
"learning_rate": 2.3236062717770036e-05,
"loss": 0.3854,
"step": 12790
},
{
"epoch": 16.36828644501279,
"grad_norm": 0.2560237646102905,
"learning_rate": 2.3214285714285715e-05,
"loss": 0.3077,
"step": 12800
},
{
"epoch": 16.381074168797955,
"grad_norm": 0.23725102841854095,
"learning_rate": 2.3192508710801393e-05,
"loss": 0.2606,
"step": 12810
},
{
"epoch": 16.39386189258312,
"grad_norm": 0.25493350625038147,
"learning_rate": 2.3170731707317075e-05,
"loss": 0.2807,
"step": 12820
},
{
"epoch": 16.406649616368288,
"grad_norm": 2.778068780899048,
"learning_rate": 2.3148954703832753e-05,
"loss": 0.2054,
"step": 12830
},
{
"epoch": 16.419437340153454,
"grad_norm": 0.4015319049358368,
"learning_rate": 2.312717770034843e-05,
"loss": 0.5631,
"step": 12840
},
{
"epoch": 16.43222506393862,
"grad_norm": 8.717150688171387,
"learning_rate": 2.3105400696864113e-05,
"loss": 0.3589,
"step": 12850
},
{
"epoch": 16.445012787723787,
"grad_norm": 0.0001172411284642294,
"learning_rate": 2.308362369337979e-05,
"loss": 0.2292,
"step": 12860
},
{
"epoch": 16.45780051150895,
"grad_norm": 4.406980514526367,
"learning_rate": 2.3061846689895473e-05,
"loss": 0.18,
"step": 12870
},
{
"epoch": 16.470588235294116,
"grad_norm": 2.2067298889160156,
"learning_rate": 2.3040069686411152e-05,
"loss": 0.4195,
"step": 12880
},
{
"epoch": 16.483375959079282,
"grad_norm": 0.9050547480583191,
"learning_rate": 2.301829268292683e-05,
"loss": 0.3378,
"step": 12890
},
{
"epoch": 16.49616368286445,
"grad_norm": 2.681492567062378,
"learning_rate": 2.299651567944251e-05,
"loss": 0.2948,
"step": 12900
},
{
"epoch": 16.508951406649615,
"grad_norm": 6.462623596191406,
"learning_rate": 2.297473867595819e-05,
"loss": 0.4256,
"step": 12910
},
{
"epoch": 16.52173913043478,
"grad_norm": 3.2028021812438965,
"learning_rate": 2.295296167247387e-05,
"loss": 0.2487,
"step": 12920
},
{
"epoch": 16.534526854219948,
"grad_norm": 0.28855952620506287,
"learning_rate": 2.2931184668989547e-05,
"loss": 0.4793,
"step": 12930
},
{
"epoch": 16.547314578005114,
"grad_norm": 0.00012032359518343583,
"learning_rate": 2.290940766550523e-05,
"loss": 0.355,
"step": 12940
},
{
"epoch": 16.56010230179028,
"grad_norm": 0.13559375703334808,
"learning_rate": 2.2887630662020907e-05,
"loss": 0.2055,
"step": 12950
},
{
"epoch": 16.572890025575447,
"grad_norm": 6.483852386474609,
"learning_rate": 2.286585365853659e-05,
"loss": 0.2742,
"step": 12960
},
{
"epoch": 16.585677749360613,
"grad_norm": 4.124710559844971,
"learning_rate": 2.2844076655052267e-05,
"loss": 0.3295,
"step": 12970
},
{
"epoch": 16.59846547314578,
"grad_norm": 1.22804856300354,
"learning_rate": 2.2822299651567945e-05,
"loss": 0.2733,
"step": 12980
},
{
"epoch": 16.611253196930946,
"grad_norm": 0.1256943941116333,
"learning_rate": 2.2800522648083624e-05,
"loss": 0.2037,
"step": 12990
},
{
"epoch": 16.624040920716112,
"grad_norm": 0.0005564725724980235,
"learning_rate": 2.2778745644599302e-05,
"loss": 0.2921,
"step": 13000
},
{
"epoch": 16.63682864450128,
"grad_norm": 0.0033420585095882416,
"learning_rate": 2.2756968641114984e-05,
"loss": 0.3267,
"step": 13010
},
{
"epoch": 16.649616368286445,
"grad_norm": 1.830318808555603,
"learning_rate": 2.2735191637630662e-05,
"loss": 0.3211,
"step": 13020
},
{
"epoch": 16.66240409207161,
"grad_norm": 3.8310177326202393,
"learning_rate": 2.2713414634146344e-05,
"loss": 0.1711,
"step": 13030
},
{
"epoch": 16.675191815856778,
"grad_norm": 1.3652631044387817,
"learning_rate": 2.2691637630662022e-05,
"loss": 0.3631,
"step": 13040
},
{
"epoch": 16.687979539641944,
"grad_norm": 0.04906712844967842,
"learning_rate": 2.2669860627177704e-05,
"loss": 0.0488,
"step": 13050
},
{
"epoch": 16.70076726342711,
"grad_norm": 7.600613594055176,
"learning_rate": 2.264808362369338e-05,
"loss": 0.4641,
"step": 13060
},
{
"epoch": 16.713554987212277,
"grad_norm": 0.00042603735346347094,
"learning_rate": 2.2626306620209057e-05,
"loss": 0.2161,
"step": 13070
},
{
"epoch": 16.726342710997443,
"grad_norm": 0.19117648899555206,
"learning_rate": 2.260452961672474e-05,
"loss": 0.2927,
"step": 13080
},
{
"epoch": 16.73913043478261,
"grad_norm": 2.7685117721557617,
"learning_rate": 2.2582752613240418e-05,
"loss": 0.1958,
"step": 13090
},
{
"epoch": 16.751918158567776,
"grad_norm": 2.0074069499969482,
"learning_rate": 2.25609756097561e-05,
"loss": 0.3821,
"step": 13100
},
{
"epoch": 16.764705882352942,
"grad_norm": 0.046160824596881866,
"learning_rate": 2.2539198606271778e-05,
"loss": 0.1671,
"step": 13110
},
{
"epoch": 16.77749360613811,
"grad_norm": 0.10031644254922867,
"learning_rate": 2.251742160278746e-05,
"loss": 0.0973,
"step": 13120
},
{
"epoch": 16.790281329923275,
"grad_norm": 2.997203826904297,
"learning_rate": 2.2495644599303138e-05,
"loss": 0.3202,
"step": 13130
},
{
"epoch": 16.80306905370844,
"grad_norm": 0.17301268875598907,
"learning_rate": 2.2473867595818816e-05,
"loss": 0.2823,
"step": 13140
},
{
"epoch": 16.815856777493607,
"grad_norm": 0.2966526448726654,
"learning_rate": 2.2452090592334494e-05,
"loss": 0.2609,
"step": 13150
},
{
"epoch": 16.828644501278774,
"grad_norm": 3.0557382106781006,
"learning_rate": 2.2430313588850173e-05,
"loss": 0.3246,
"step": 13160
},
{
"epoch": 16.84143222506394,
"grad_norm": 1.249798059463501,
"learning_rate": 2.2408536585365855e-05,
"loss": 0.3694,
"step": 13170
},
{
"epoch": 16.854219948849106,
"grad_norm": 0.13164706528186798,
"learning_rate": 2.2386759581881533e-05,
"loss": 0.2964,
"step": 13180
},
{
"epoch": 16.867007672634273,
"grad_norm": 0.051667943596839905,
"learning_rate": 2.2364982578397215e-05,
"loss": 0.1921,
"step": 13190
},
{
"epoch": 16.87979539641944,
"grad_norm": 0.7596039772033691,
"learning_rate": 2.2343205574912893e-05,
"loss": 0.3181,
"step": 13200
},
{
"epoch": 16.892583120204602,
"grad_norm": 11.275361061096191,
"learning_rate": 2.2321428571428575e-05,
"loss": 0.3087,
"step": 13210
},
{
"epoch": 16.90537084398977,
"grad_norm": 1.354041337966919,
"learning_rate": 2.2299651567944253e-05,
"loss": 0.073,
"step": 13220
},
{
"epoch": 16.918158567774935,
"grad_norm": 2.936005115509033,
"learning_rate": 2.227787456445993e-05,
"loss": 0.4707,
"step": 13230
},
{
"epoch": 16.9309462915601,
"grad_norm": 7.00629186630249,
"learning_rate": 2.225609756097561e-05,
"loss": 0.3539,
"step": 13240
},
{
"epoch": 16.943734015345267,
"grad_norm": 0.3057442307472229,
"learning_rate": 2.2234320557491288e-05,
"loss": 0.2228,
"step": 13250
},
{
"epoch": 16.956521739130434,
"grad_norm": 1.9461525678634644,
"learning_rate": 2.221254355400697e-05,
"loss": 0.2704,
"step": 13260
},
{
"epoch": 16.9693094629156,
"grad_norm": 0.01109783910214901,
"learning_rate": 2.219076655052265e-05,
"loss": 0.1429,
"step": 13270
},
{
"epoch": 16.982097186700766,
"grad_norm": 0.3377877175807953,
"learning_rate": 2.216898954703833e-05,
"loss": 0.206,
"step": 13280
},
{
"epoch": 16.994884910485933,
"grad_norm": 1.623797059059143,
"learning_rate": 2.214721254355401e-05,
"loss": 0.3466,
"step": 13290
},
{
"epoch": 17.0,
"eval_loss": 0.194735586643219,
"eval_runtime": 0.984,
"eval_samples_per_second": 99.592,
"eval_steps_per_second": 13.211,
"step": 13294
},
{
"epoch": 17.0076726342711,
"grad_norm": 0.4254322350025177,
"learning_rate": 2.2125435540069687e-05,
"loss": 0.3234,
"step": 13300
},
{
"epoch": 17.020460358056265,
"grad_norm": 0.6410069465637207,
"learning_rate": 2.210365853658537e-05,
"loss": 0.4415,
"step": 13310
},
{
"epoch": 17.033248081841432,
"grad_norm": 7.966799736022949,
"learning_rate": 2.2081881533101047e-05,
"loss": 0.1735,
"step": 13320
},
{
"epoch": 17.046035805626598,
"grad_norm": 1.0562230348587036,
"learning_rate": 2.2060104529616725e-05,
"loss": 0.217,
"step": 13330
},
{
"epoch": 17.058823529411764,
"grad_norm": 3.787612199783325,
"learning_rate": 2.2038327526132404e-05,
"loss": 0.2052,
"step": 13340
},
{
"epoch": 17.07161125319693,
"grad_norm": 4.378226280212402,
"learning_rate": 2.2016550522648085e-05,
"loss": 0.1748,
"step": 13350
},
{
"epoch": 17.084398976982097,
"grad_norm": 2.6178719997406006,
"learning_rate": 2.1994773519163764e-05,
"loss": 0.3225,
"step": 13360
},
{
"epoch": 17.097186700767264,
"grad_norm": 4.570335865020752,
"learning_rate": 2.1972996515679445e-05,
"loss": 0.342,
"step": 13370
},
{
"epoch": 17.10997442455243,
"grad_norm": 2.0175881385803223,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.2828,
"step": 13380
},
{
"epoch": 17.122762148337596,
"grad_norm": 4.092813014984131,
"learning_rate": 2.1929442508710802e-05,
"loss": 0.4931,
"step": 13390
},
{
"epoch": 17.135549872122763,
"grad_norm": 2.1676254272460938,
"learning_rate": 2.1907665505226484e-05,
"loss": 0.0911,
"step": 13400
},
{
"epoch": 17.14833759590793,
"grad_norm": 2.225700855255127,
"learning_rate": 2.188588850174216e-05,
"loss": 0.3082,
"step": 13410
},
{
"epoch": 17.161125319693095,
"grad_norm": 1.6372703313827515,
"learning_rate": 2.186411149825784e-05,
"loss": 0.1567,
"step": 13420
},
{
"epoch": 17.17391304347826,
"grad_norm": 0.0013261646963655949,
"learning_rate": 2.184233449477352e-05,
"loss": 0.0525,
"step": 13430
},
{
"epoch": 17.186700767263428,
"grad_norm": 7.990118980407715,
"learning_rate": 2.18205574912892e-05,
"loss": 0.4464,
"step": 13440
},
{
"epoch": 17.199488491048594,
"grad_norm": 3.268831968307495,
"learning_rate": 2.179878048780488e-05,
"loss": 0.1397,
"step": 13450
},
{
"epoch": 17.21227621483376,
"grad_norm": 0.16767235100269318,
"learning_rate": 2.1777003484320557e-05,
"loss": 0.2333,
"step": 13460
},
{
"epoch": 17.225063938618927,
"grad_norm": 3.0834221839904785,
"learning_rate": 2.175522648083624e-05,
"loss": 0.2366,
"step": 13470
},
{
"epoch": 17.237851662404093,
"grad_norm": 13.640128135681152,
"learning_rate": 2.1733449477351918e-05,
"loss": 0.163,
"step": 13480
},
{
"epoch": 17.25063938618926,
"grad_norm": 0.00588564807549119,
"learning_rate": 2.1711672473867596e-05,
"loss": 0.1748,
"step": 13490
},
{
"epoch": 17.263427109974426,
"grad_norm": 6.2895660400390625,
"learning_rate": 2.1689895470383274e-05,
"loss": 0.4067,
"step": 13500
},
{
"epoch": 17.276214833759592,
"grad_norm": 3.481398105621338,
"learning_rate": 2.1668118466898956e-05,
"loss": 0.538,
"step": 13510
},
{
"epoch": 17.289002557544755,
"grad_norm": 0.00020840627257712185,
"learning_rate": 2.1646341463414634e-05,
"loss": 0.2197,
"step": 13520
},
{
"epoch": 17.30179028132992,
"grad_norm": 12.076574325561523,
"learning_rate": 2.1624564459930316e-05,
"loss": 0.3942,
"step": 13530
},
{
"epoch": 17.314578005115088,
"grad_norm": 8.081069946289062,
"learning_rate": 2.1602787456445995e-05,
"loss": 0.3668,
"step": 13540
},
{
"epoch": 17.327365728900254,
"grad_norm": 4.490804195404053,
"learning_rate": 2.1581010452961673e-05,
"loss": 0.1649,
"step": 13550
},
{
"epoch": 17.34015345268542,
"grad_norm": 2.62864351272583,
"learning_rate": 2.1559233449477355e-05,
"loss": 0.2097,
"step": 13560
},
{
"epoch": 17.352941176470587,
"grad_norm": 0.35655027627944946,
"learning_rate": 2.1537456445993033e-05,
"loss": 0.2966,
"step": 13570
},
{
"epoch": 17.365728900255753,
"grad_norm": 0.05890129879117012,
"learning_rate": 2.151567944250871e-05,
"loss": 0.1949,
"step": 13580
},
{
"epoch": 17.37851662404092,
"grad_norm": 0.0837198868393898,
"learning_rate": 2.149390243902439e-05,
"loss": 0.2269,
"step": 13590
},
{
"epoch": 17.391304347826086,
"grad_norm": 10.314323425292969,
"learning_rate": 2.147212543554007e-05,
"loss": 0.4396,
"step": 13600
},
{
"epoch": 17.404092071611252,
"grad_norm": 3.623154878616333,
"learning_rate": 2.145034843205575e-05,
"loss": 0.1614,
"step": 13610
},
{
"epoch": 17.41687979539642,
"grad_norm": 4.475399971008301,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.1741,
"step": 13620
},
{
"epoch": 17.429667519181585,
"grad_norm": 0.008211358450353146,
"learning_rate": 2.140679442508711e-05,
"loss": 0.0813,
"step": 13630
},
{
"epoch": 17.44245524296675,
"grad_norm": 8.496784210205078,
"learning_rate": 2.1385017421602788e-05,
"loss": 0.3875,
"step": 13640
},
{
"epoch": 17.455242966751918,
"grad_norm": 5.246673583984375,
"learning_rate": 2.136324041811847e-05,
"loss": 0.3415,
"step": 13650
},
{
"epoch": 17.468030690537084,
"grad_norm": 0.3365817368030548,
"learning_rate": 2.134146341463415e-05,
"loss": 0.1968,
"step": 13660
},
{
"epoch": 17.48081841432225,
"grad_norm": 0.7856032848358154,
"learning_rate": 2.1319686411149827e-05,
"loss": 0.3339,
"step": 13670
},
{
"epoch": 17.493606138107417,
"grad_norm": 6.534681797027588,
"learning_rate": 2.1297909407665505e-05,
"loss": 0.3993,
"step": 13680
},
{
"epoch": 17.506393861892583,
"grad_norm": 1.3871843814849854,
"learning_rate": 2.1276132404181183e-05,
"loss": 0.1953,
"step": 13690
},
{
"epoch": 17.51918158567775,
"grad_norm": 3.838477849960327,
"learning_rate": 2.1254355400696865e-05,
"loss": 0.3942,
"step": 13700
},
{
"epoch": 17.531969309462916,
"grad_norm": 0.18229646980762482,
"learning_rate": 2.1232578397212544e-05,
"loss": 0.1537,
"step": 13710
},
{
"epoch": 17.544757033248082,
"grad_norm": 3.601231575012207,
"learning_rate": 2.1210801393728225e-05,
"loss": 0.2614,
"step": 13720
},
{
"epoch": 17.55754475703325,
"grad_norm": 0.9129027724266052,
"learning_rate": 2.1189024390243904e-05,
"loss": 0.6262,
"step": 13730
},
{
"epoch": 17.570332480818415,
"grad_norm": 1.753222942352295,
"learning_rate": 2.1167247386759585e-05,
"loss": 0.231,
"step": 13740
},
{
"epoch": 17.58312020460358,
"grad_norm": 0.5044955611228943,
"learning_rate": 2.1145470383275264e-05,
"loss": 0.1774,
"step": 13750
},
{
"epoch": 17.595907928388748,
"grad_norm": 0.09469438344240189,
"learning_rate": 2.1123693379790942e-05,
"loss": 0.2984,
"step": 13760
},
{
"epoch": 17.608695652173914,
"grad_norm": 5.122965335845947,
"learning_rate": 2.110191637630662e-05,
"loss": 0.4116,
"step": 13770
},
{
"epoch": 17.62148337595908,
"grad_norm": 0.004069724585860968,
"learning_rate": 2.10801393728223e-05,
"loss": 0.1216,
"step": 13780
},
{
"epoch": 17.634271099744247,
"grad_norm": 0.0013519871281459928,
"learning_rate": 2.105836236933798e-05,
"loss": 0.2065,
"step": 13790
},
{
"epoch": 17.647058823529413,
"grad_norm": 2.187586545944214,
"learning_rate": 2.103658536585366e-05,
"loss": 0.2592,
"step": 13800
},
{
"epoch": 17.65984654731458,
"grad_norm": 9.104143142700195,
"learning_rate": 2.101480836236934e-05,
"loss": 0.3444,
"step": 13810
},
{
"epoch": 17.672634271099746,
"grad_norm": 5.176393985748291,
"learning_rate": 2.099303135888502e-05,
"loss": 0.2548,
"step": 13820
},
{
"epoch": 17.685421994884912,
"grad_norm": 7.437559127807617,
"learning_rate": 2.0971254355400697e-05,
"loss": 0.2961,
"step": 13830
},
{
"epoch": 17.69820971867008,
"grad_norm": 0.008558416739106178,
"learning_rate": 2.0949477351916376e-05,
"loss": 0.2467,
"step": 13840
},
{
"epoch": 17.710997442455245,
"grad_norm": 1.2789453268051147,
"learning_rate": 2.0927700348432054e-05,
"loss": 0.1578,
"step": 13850
},
{
"epoch": 17.723785166240408,
"grad_norm": 2.9721248149871826,
"learning_rate": 2.0905923344947736e-05,
"loss": 0.268,
"step": 13860
},
{
"epoch": 17.736572890025574,
"grad_norm": 0.1331997960805893,
"learning_rate": 2.0884146341463414e-05,
"loss": 0.2809,
"step": 13870
},
{
"epoch": 17.74936061381074,
"grad_norm": 7.518838405609131,
"learning_rate": 2.0862369337979096e-05,
"loss": 0.3814,
"step": 13880
},
{
"epoch": 17.762148337595907,
"grad_norm": 2.332446813583374,
"learning_rate": 2.0840592334494774e-05,
"loss": 0.2716,
"step": 13890
},
{
"epoch": 17.774936061381073,
"grad_norm": 8.556915283203125,
"learning_rate": 2.0818815331010456e-05,
"loss": 0.1651,
"step": 13900
},
{
"epoch": 17.78772378516624,
"grad_norm": 1.377729058265686,
"learning_rate": 2.0797038327526134e-05,
"loss": 0.3073,
"step": 13910
},
{
"epoch": 17.800511508951406,
"grad_norm": 0.5971033573150635,
"learning_rate": 2.0775261324041813e-05,
"loss": 0.2833,
"step": 13920
},
{
"epoch": 17.813299232736572,
"grad_norm": 7.453729152679443,
"learning_rate": 2.075348432055749e-05,
"loss": 0.4556,
"step": 13930
},
{
"epoch": 17.82608695652174,
"grad_norm": 0.14529769122600555,
"learning_rate": 2.073170731707317e-05,
"loss": 0.1734,
"step": 13940
},
{
"epoch": 17.838874680306905,
"grad_norm": 11.693350791931152,
"learning_rate": 2.070993031358885e-05,
"loss": 0.2128,
"step": 13950
},
{
"epoch": 17.85166240409207,
"grad_norm": 0.5937496423721313,
"learning_rate": 2.068815331010453e-05,
"loss": 0.2481,
"step": 13960
},
{
"epoch": 17.864450127877237,
"grad_norm": 5.276167869567871,
"learning_rate": 2.066637630662021e-05,
"loss": 0.2194,
"step": 13970
},
{
"epoch": 17.877237851662404,
"grad_norm": 0.024612775072455406,
"learning_rate": 2.064459930313589e-05,
"loss": 0.2629,
"step": 13980
},
{
"epoch": 17.89002557544757,
"grad_norm": 0.25338679552078247,
"learning_rate": 2.062282229965157e-05,
"loss": 0.2496,
"step": 13990
},
{
"epoch": 17.902813299232736,
"grad_norm": 9.272095680236816,
"learning_rate": 2.060104529616725e-05,
"loss": 0.3914,
"step": 14000
},
{
"epoch": 17.915601023017903,
"grad_norm": 2.9164888858795166,
"learning_rate": 2.0579268292682928e-05,
"loss": 0.4612,
"step": 14010
},
{
"epoch": 17.92838874680307,
"grad_norm": 5.0970282554626465,
"learning_rate": 2.0557491289198607e-05,
"loss": 0.534,
"step": 14020
},
{
"epoch": 17.941176470588236,
"grad_norm": 0.003662322647869587,
"learning_rate": 2.0535714285714285e-05,
"loss": 0.3885,
"step": 14030
},
{
"epoch": 17.953964194373402,
"grad_norm": 3.319575071334839,
"learning_rate": 2.0513937282229967e-05,
"loss": 0.3294,
"step": 14040
},
{
"epoch": 17.966751918158568,
"grad_norm": 0.11396234482526779,
"learning_rate": 2.0492160278745645e-05,
"loss": 0.3142,
"step": 14050
},
{
"epoch": 17.979539641943735,
"grad_norm": 1.887491226196289,
"learning_rate": 2.0470383275261327e-05,
"loss": 0.2062,
"step": 14060
},
{
"epoch": 17.9923273657289,
"grad_norm": 0.5869520902633667,
"learning_rate": 2.0448606271777005e-05,
"loss": 0.3147,
"step": 14070
},
{
"epoch": 18.0,
"eval_loss": 0.1915951371192932,
"eval_runtime": 0.9799,
"eval_samples_per_second": 100.008,
"eval_steps_per_second": 13.266,
"step": 14076
},
{
"epoch": 18.005115089514067,
"grad_norm": 0.9173471331596375,
"learning_rate": 2.0426829268292683e-05,
"loss": 0.2309,
"step": 14080
},
{
"epoch": 18.017902813299234,
"grad_norm": 10.534737586975098,
"learning_rate": 2.0405052264808365e-05,
"loss": 0.2463,
"step": 14090
},
{
"epoch": 18.0306905370844,
"grad_norm": 0.6548416614532471,
"learning_rate": 2.038327526132404e-05,
"loss": 0.2417,
"step": 14100
},
{
"epoch": 18.043478260869566,
"grad_norm": 1.446814775466919,
"learning_rate": 2.0361498257839722e-05,
"loss": 0.1992,
"step": 14110
},
{
"epoch": 18.056265984654733,
"grad_norm": 5.1453633308410645,
"learning_rate": 2.03397212543554e-05,
"loss": 0.187,
"step": 14120
},
{
"epoch": 18.0690537084399,
"grad_norm": 0.3860657215118408,
"learning_rate": 2.0317944250871082e-05,
"loss": 0.2015,
"step": 14130
},
{
"epoch": 18.081841432225065,
"grad_norm": 7.620376110076904,
"learning_rate": 2.029616724738676e-05,
"loss": 0.3419,
"step": 14140
},
{
"epoch": 18.09462915601023,
"grad_norm": 3.2555360794067383,
"learning_rate": 2.0274390243902442e-05,
"loss": 0.2185,
"step": 14150
},
{
"epoch": 18.107416879795398,
"grad_norm": 6.394047260284424,
"learning_rate": 2.025261324041812e-05,
"loss": 0.2663,
"step": 14160
},
{
"epoch": 18.120204603580564,
"grad_norm": 1.8313113451004028,
"learning_rate": 2.02308362369338e-05,
"loss": 0.0787,
"step": 14170
},
{
"epoch": 18.132992327365727,
"grad_norm": 2.6229982376098633,
"learning_rate": 2.0209059233449477e-05,
"loss": 0.219,
"step": 14180
},
{
"epoch": 18.145780051150894,
"grad_norm": 0.548910915851593,
"learning_rate": 2.0187282229965156e-05,
"loss": 0.1215,
"step": 14190
},
{
"epoch": 18.15856777493606,
"grad_norm": 0.660483717918396,
"learning_rate": 2.0165505226480837e-05,
"loss": 0.3008,
"step": 14200
},
{
"epoch": 18.171355498721226,
"grad_norm": 2.018397092819214,
"learning_rate": 2.0143728222996516e-05,
"loss": 0.2721,
"step": 14210
},
{
"epoch": 18.184143222506393,
"grad_norm": 0.0916806161403656,
"learning_rate": 2.0121951219512197e-05,
"loss": 0.4191,
"step": 14220
},
{
"epoch": 18.19693094629156,
"grad_norm": 0.08776136487722397,
"learning_rate": 2.0100174216027876e-05,
"loss": 0.2072,
"step": 14230
},
{
"epoch": 18.209718670076725,
"grad_norm": 5.849903583526611,
"learning_rate": 2.0078397212543554e-05,
"loss": 0.4458,
"step": 14240
},
{
"epoch": 18.22250639386189,
"grad_norm": 4.089534759521484,
"learning_rate": 2.0056620209059236e-05,
"loss": 0.1859,
"step": 14250
},
{
"epoch": 18.235294117647058,
"grad_norm": 11.935927391052246,
"learning_rate": 2.0034843205574914e-05,
"loss": 0.4495,
"step": 14260
},
{
"epoch": 18.248081841432224,
"grad_norm": 7.223516464233398,
"learning_rate": 2.0013066202090593e-05,
"loss": 0.2481,
"step": 14270
},
{
"epoch": 18.26086956521739,
"grad_norm": 4.914455890655518,
"learning_rate": 1.999128919860627e-05,
"loss": 0.3013,
"step": 14280
},
{
"epoch": 18.273657289002557,
"grad_norm": 0.18564479053020477,
"learning_rate": 1.9969512195121953e-05,
"loss": 0.2039,
"step": 14290
},
{
"epoch": 18.286445012787723,
"grad_norm": 0.6266302466392517,
"learning_rate": 1.994773519163763e-05,
"loss": 0.3876,
"step": 14300
},
{
"epoch": 18.29923273657289,
"grad_norm": 5.617627143859863,
"learning_rate": 1.992595818815331e-05,
"loss": 0.4202,
"step": 14310
},
{
"epoch": 18.312020460358056,
"grad_norm": 7.912176609039307,
"learning_rate": 1.990418118466899e-05,
"loss": 0.4303,
"step": 14320
},
{
"epoch": 18.324808184143222,
"grad_norm": 0.9023106694221497,
"learning_rate": 1.988240418118467e-05,
"loss": 0.1635,
"step": 14330
},
{
"epoch": 18.33759590792839,
"grad_norm": 6.940431594848633,
"learning_rate": 1.986062717770035e-05,
"loss": 0.3362,
"step": 14340
},
{
"epoch": 18.350383631713555,
"grad_norm": 0.8240020871162415,
"learning_rate": 1.983885017421603e-05,
"loss": 0.3445,
"step": 14350
},
{
"epoch": 18.36317135549872,
"grad_norm": 9.887919425964355,
"learning_rate": 1.9817073170731708e-05,
"loss": 0.3103,
"step": 14360
},
{
"epoch": 18.375959079283888,
"grad_norm": 0.005527616012841463,
"learning_rate": 1.9795296167247386e-05,
"loss": 0.645,
"step": 14370
},
{
"epoch": 18.388746803069054,
"grad_norm": 2.5189778804779053,
"learning_rate": 1.9773519163763068e-05,
"loss": 0.3151,
"step": 14380
},
{
"epoch": 18.40153452685422,
"grad_norm": 0.030117256566882133,
"learning_rate": 1.9751742160278746e-05,
"loss": 0.2635,
"step": 14390
},
{
"epoch": 18.414322250639387,
"grad_norm": 0.8564471006393433,
"learning_rate": 1.9729965156794425e-05,
"loss": 0.3881,
"step": 14400
},
{
"epoch": 18.427109974424553,
"grad_norm": 0.00839280616492033,
"learning_rate": 1.9708188153310107e-05,
"loss": 0.2388,
"step": 14410
},
{
"epoch": 18.43989769820972,
"grad_norm": 1.13720703125,
"learning_rate": 1.9686411149825785e-05,
"loss": 0.4222,
"step": 14420
},
{
"epoch": 18.452685421994886,
"grad_norm": 1.0654269456863403,
"learning_rate": 1.9664634146341467e-05,
"loss": 0.5875,
"step": 14430
},
{
"epoch": 18.465473145780052,
"grad_norm": 0.056377239525318146,
"learning_rate": 1.9642857142857145e-05,
"loss": 0.2721,
"step": 14440
},
{
"epoch": 18.47826086956522,
"grad_norm": 3.4963326454162598,
"learning_rate": 1.9621080139372823e-05,
"loss": 0.3812,
"step": 14450
},
{
"epoch": 18.491048593350385,
"grad_norm": 2.1274666786193848,
"learning_rate": 1.9599303135888502e-05,
"loss": 0.2163,
"step": 14460
},
{
"epoch": 18.50383631713555,
"grad_norm": 2.24591064453125,
"learning_rate": 1.957752613240418e-05,
"loss": 0.1823,
"step": 14470
},
{
"epoch": 18.516624040920718,
"grad_norm": 0.928048849105835,
"learning_rate": 1.9555749128919862e-05,
"loss": 0.3446,
"step": 14480
},
{
"epoch": 18.529411764705884,
"grad_norm": 5.6801225582603365e-05,
"learning_rate": 1.953397212543554e-05,
"loss": 0.3851,
"step": 14490
},
{
"epoch": 18.54219948849105,
"grad_norm": 0.3094902038574219,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.2043,
"step": 14500
},
{
"epoch": 18.554987212276213,
"grad_norm": 1.9753358364105225,
"learning_rate": 1.94904181184669e-05,
"loss": 0.1487,
"step": 14510
},
{
"epoch": 18.56777493606138,
"grad_norm": 8.858641624450684,
"learning_rate": 1.9468641114982582e-05,
"loss": 0.1643,
"step": 14520
},
{
"epoch": 18.580562659846546,
"grad_norm": 1.9973317384719849,
"learning_rate": 1.9446864111498257e-05,
"loss": 0.2102,
"step": 14530
},
{
"epoch": 18.593350383631712,
"grad_norm": 0.3067806363105774,
"learning_rate": 1.9425087108013935e-05,
"loss": 0.271,
"step": 14540
},
{
"epoch": 18.60613810741688,
"grad_norm": 2.4207940101623535,
"learning_rate": 1.9403310104529617e-05,
"loss": 0.2622,
"step": 14550
},
{
"epoch": 18.618925831202045,
"grad_norm": 2.6167995929718018,
"learning_rate": 1.9381533101045295e-05,
"loss": 0.1524,
"step": 14560
},
{
"epoch": 18.63171355498721,
"grad_norm": 0.04882597178220749,
"learning_rate": 1.9359756097560977e-05,
"loss": 0.1838,
"step": 14570
},
{
"epoch": 18.644501278772378,
"grad_norm": 0.4400959610939026,
"learning_rate": 1.9337979094076656e-05,
"loss": 0.1957,
"step": 14580
},
{
"epoch": 18.657289002557544,
"grad_norm": 0.2390824854373932,
"learning_rate": 1.9316202090592337e-05,
"loss": 0.3095,
"step": 14590
},
{
"epoch": 18.67007672634271,
"grad_norm": 1.1464831829071045,
"learning_rate": 1.9294425087108016e-05,
"loss": 0.2338,
"step": 14600
},
{
"epoch": 18.682864450127877,
"grad_norm": 3.7461390495300293,
"learning_rate": 1.9272648083623694e-05,
"loss": 0.1344,
"step": 14610
},
{
"epoch": 18.695652173913043,
"grad_norm": 6.968299865722656,
"learning_rate": 1.9250871080139372e-05,
"loss": 0.3831,
"step": 14620
},
{
"epoch": 18.70843989769821,
"grad_norm": 2.9434783458709717,
"learning_rate": 1.922909407665505e-05,
"loss": 0.0752,
"step": 14630
},
{
"epoch": 18.721227621483376,
"grad_norm": 2.873483419418335,
"learning_rate": 1.9207317073170733e-05,
"loss": 0.1213,
"step": 14640
},
{
"epoch": 18.734015345268542,
"grad_norm": 5.209991455078125,
"learning_rate": 1.918554006968641e-05,
"loss": 0.5809,
"step": 14650
},
{
"epoch": 18.74680306905371,
"grad_norm": 4.765650749206543,
"learning_rate": 1.9163763066202093e-05,
"loss": 0.3331,
"step": 14660
},
{
"epoch": 18.759590792838875,
"grad_norm": 1.3975070714950562,
"learning_rate": 1.914198606271777e-05,
"loss": 0.3194,
"step": 14670
},
{
"epoch": 18.77237851662404,
"grad_norm": 0.0020695773418992758,
"learning_rate": 1.9120209059233453e-05,
"loss": 0.2082,
"step": 14680
},
{
"epoch": 18.785166240409207,
"grad_norm": 5.4186787605285645,
"learning_rate": 1.909843205574913e-05,
"loss": 0.0826,
"step": 14690
},
{
"epoch": 18.797953964194374,
"grad_norm": 0.49431663751602173,
"learning_rate": 1.907665505226481e-05,
"loss": 0.2201,
"step": 14700
},
{
"epoch": 18.81074168797954,
"grad_norm": 3.665900468826294,
"learning_rate": 1.9054878048780488e-05,
"loss": 0.2726,
"step": 14710
},
{
"epoch": 18.823529411764707,
"grad_norm": 9.916034698486328,
"learning_rate": 1.9033101045296166e-05,
"loss": 0.3124,
"step": 14720
},
{
"epoch": 18.836317135549873,
"grad_norm": 8.723508834838867,
"learning_rate": 1.9011324041811848e-05,
"loss": 0.4384,
"step": 14730
},
{
"epoch": 18.84910485933504,
"grad_norm": 4.647964000701904,
"learning_rate": 1.8989547038327526e-05,
"loss": 0.4025,
"step": 14740
},
{
"epoch": 18.861892583120206,
"grad_norm": 0.7404060363769531,
"learning_rate": 1.8967770034843208e-05,
"loss": 0.1928,
"step": 14750
},
{
"epoch": 18.874680306905372,
"grad_norm": 2.1434133052825928,
"learning_rate": 1.8945993031358886e-05,
"loss": 0.219,
"step": 14760
},
{
"epoch": 18.88746803069054,
"grad_norm": 4.017099857330322,
"learning_rate": 1.8924216027874568e-05,
"loss": 0.1563,
"step": 14770
},
{
"epoch": 18.900255754475705,
"grad_norm": 1.0902212858200073,
"learning_rate": 1.8902439024390246e-05,
"loss": 0.2891,
"step": 14780
},
{
"epoch": 18.91304347826087,
"grad_norm": 2.023188352584839,
"learning_rate": 1.8880662020905925e-05,
"loss": 0.0701,
"step": 14790
},
{
"epoch": 18.925831202046037,
"grad_norm": 0.004225098993629217,
"learning_rate": 1.8858885017421603e-05,
"loss": 0.1588,
"step": 14800
},
{
"epoch": 18.938618925831204,
"grad_norm": 4.455468654632568,
"learning_rate": 1.883710801393728e-05,
"loss": 0.1735,
"step": 14810
},
{
"epoch": 18.95140664961637,
"grad_norm": 1.2001502513885498,
"learning_rate": 1.8815331010452963e-05,
"loss": 0.1153,
"step": 14820
},
{
"epoch": 18.964194373401533,
"grad_norm": 4.822994709014893,
"learning_rate": 1.879355400696864e-05,
"loss": 0.3452,
"step": 14830
},
{
"epoch": 18.9769820971867,
"grad_norm": 1.7732210159301758,
"learning_rate": 1.8771777003484323e-05,
"loss": 0.2798,
"step": 14840
},
{
"epoch": 18.989769820971865,
"grad_norm": 1.7861824035644531,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.2973,
"step": 14850
},
{
"epoch": 19.0,
"eval_loss": 0.18999117612838745,
"eval_runtime": 0.9813,
"eval_samples_per_second": 99.865,
"eval_steps_per_second": 13.247,
"step": 14858
},
{
"epoch": 19.002557544757032,
"grad_norm": 0.8770705461502075,
"learning_rate": 1.872822299651568e-05,
"loss": 0.1288,
"step": 14860
},
{
"epoch": 19.015345268542198,
"grad_norm": 6.593090057373047,
"learning_rate": 1.870644599303136e-05,
"loss": 0.2963,
"step": 14870
},
{
"epoch": 19.028132992327365,
"grad_norm": 0.974862813949585,
"learning_rate": 1.8684668989547037e-05,
"loss": 0.3377,
"step": 14880
},
{
"epoch": 19.04092071611253,
"grad_norm": 2.2275137901306152,
"learning_rate": 1.866289198606272e-05,
"loss": 0.2541,
"step": 14890
},
{
"epoch": 19.053708439897697,
"grad_norm": 1.2926586866378784,
"learning_rate": 1.8641114982578397e-05,
"loss": 0.1452,
"step": 14900
},
{
"epoch": 19.066496163682864,
"grad_norm": 3.027857780456543,
"learning_rate": 1.861933797909408e-05,
"loss": 0.155,
"step": 14910
},
{
"epoch": 19.07928388746803,
"grad_norm": 1.4662635326385498,
"learning_rate": 1.8597560975609757e-05,
"loss": 0.136,
"step": 14920
},
{
"epoch": 19.092071611253196,
"grad_norm": 1.220117211341858,
"learning_rate": 1.8575783972125435e-05,
"loss": 0.4804,
"step": 14930
},
{
"epoch": 19.104859335038363,
"grad_norm": 2.4428813457489014,
"learning_rate": 1.8554006968641117e-05,
"loss": 0.2189,
"step": 14940
},
{
"epoch": 19.11764705882353,
"grad_norm": 5.772051811218262,
"learning_rate": 1.8532229965156795e-05,
"loss": 0.2831,
"step": 14950
},
{
"epoch": 19.130434782608695,
"grad_norm": 0.0032378036994487047,
"learning_rate": 1.8510452961672474e-05,
"loss": 0.3129,
"step": 14960
},
{
"epoch": 19.14322250639386,
"grad_norm": 2.335304021835327,
"learning_rate": 1.8488675958188152e-05,
"loss": 0.3056,
"step": 14970
},
{
"epoch": 19.156010230179028,
"grad_norm": 3.2122292518615723,
"learning_rate": 1.8466898954703834e-05,
"loss": 0.347,
"step": 14980
},
{
"epoch": 19.168797953964194,
"grad_norm": 4.042379379272461,
"learning_rate": 1.8445121951219512e-05,
"loss": 0.1863,
"step": 14990
},
{
"epoch": 19.18158567774936,
"grad_norm": 2.36295747756958,
"learning_rate": 1.8423344947735194e-05,
"loss": 0.4143,
"step": 15000
},
{
"epoch": 19.194373401534527,
"grad_norm": 2.999772787094116,
"learning_rate": 1.8401567944250872e-05,
"loss": 0.2193,
"step": 15010
},
{
"epoch": 19.207161125319693,
"grad_norm": 1.0328400135040283,
"learning_rate": 1.837979094076655e-05,
"loss": 0.3591,
"step": 15020
},
{
"epoch": 19.21994884910486,
"grad_norm": 0.0005838721990585327,
"learning_rate": 1.8358013937282233e-05,
"loss": 0.351,
"step": 15030
},
{
"epoch": 19.232736572890026,
"grad_norm": 10.181427955627441,
"learning_rate": 1.833623693379791e-05,
"loss": 0.0954,
"step": 15040
},
{
"epoch": 19.245524296675192,
"grad_norm": 6.923588080098853e-05,
"learning_rate": 1.831445993031359e-05,
"loss": 0.2054,
"step": 15050
},
{
"epoch": 19.25831202046036,
"grad_norm": 3.050046920776367,
"learning_rate": 1.8292682926829268e-05,
"loss": 0.3645,
"step": 15060
},
{
"epoch": 19.271099744245525,
"grad_norm": 0.18810555338859558,
"learning_rate": 1.827090592334495e-05,
"loss": 0.0994,
"step": 15070
},
{
"epoch": 19.28388746803069,
"grad_norm": 4.1756439208984375,
"learning_rate": 1.8249128919860628e-05,
"loss": 0.2741,
"step": 15080
},
{
"epoch": 19.296675191815858,
"grad_norm": 0.8578528165817261,
"learning_rate": 1.8227351916376306e-05,
"loss": 0.1935,
"step": 15090
},
{
"epoch": 19.309462915601024,
"grad_norm": 1.3327559232711792,
"learning_rate": 1.8205574912891988e-05,
"loss": 0.241,
"step": 15100
},
{
"epoch": 19.32225063938619,
"grad_norm": 5.43163537979126,
"learning_rate": 1.8183797909407666e-05,
"loss": 0.3708,
"step": 15110
},
{
"epoch": 19.335038363171357,
"grad_norm": 0.055509597063064575,
"learning_rate": 1.8162020905923348e-05,
"loss": 0.4344,
"step": 15120
},
{
"epoch": 19.347826086956523,
"grad_norm": 5.035632610321045,
"learning_rate": 1.8140243902439026e-05,
"loss": 0.2767,
"step": 15130
},
{
"epoch": 19.36061381074169,
"grad_norm": 5.456521511077881,
"learning_rate": 1.8118466898954705e-05,
"loss": 0.3031,
"step": 15140
},
{
"epoch": 19.373401534526856,
"grad_norm": 0.0010591910686343908,
"learning_rate": 1.8096689895470383e-05,
"loss": 0.1793,
"step": 15150
},
{
"epoch": 19.38618925831202,
"grad_norm": 3.1625192165374756,
"learning_rate": 1.8074912891986065e-05,
"loss": 0.1645,
"step": 15160
},
{
"epoch": 19.398976982097185,
"grad_norm": 4.394965648651123,
"learning_rate": 1.8053135888501743e-05,
"loss": 0.2339,
"step": 15170
},
{
"epoch": 19.41176470588235,
"grad_norm": 6.195003032684326,
"learning_rate": 1.803135888501742e-05,
"loss": 0.3896,
"step": 15180
},
{
"epoch": 19.424552429667518,
"grad_norm": 0.0053399535827338696,
"learning_rate": 1.8009581881533103e-05,
"loss": 0.339,
"step": 15190
},
{
"epoch": 19.437340153452684,
"grad_norm": 0.00421350309625268,
"learning_rate": 1.798780487804878e-05,
"loss": 0.1229,
"step": 15200
},
{
"epoch": 19.45012787723785,
"grad_norm": 0.14334776997566223,
"learning_rate": 1.7966027874564463e-05,
"loss": 0.1785,
"step": 15210
},
{
"epoch": 19.462915601023017,
"grad_norm": 2.6690614223480225,
"learning_rate": 1.7944250871080138e-05,
"loss": 0.3094,
"step": 15220
},
{
"epoch": 19.475703324808183,
"grad_norm": 0.42609289288520813,
"learning_rate": 1.792247386759582e-05,
"loss": 0.323,
"step": 15230
},
{
"epoch": 19.48849104859335,
"grad_norm": 0.5637320876121521,
"learning_rate": 1.79006968641115e-05,
"loss": 0.3952,
"step": 15240
},
{
"epoch": 19.501278772378516,
"grad_norm": 5.911632537841797,
"learning_rate": 1.7878919860627177e-05,
"loss": 0.2469,
"step": 15250
},
{
"epoch": 19.514066496163682,
"grad_norm": 4.086767673492432,
"learning_rate": 1.785714285714286e-05,
"loss": 0.3565,
"step": 15260
},
{
"epoch": 19.52685421994885,
"grad_norm": 1.9716821908950806,
"learning_rate": 1.7835365853658537e-05,
"loss": 0.095,
"step": 15270
},
{
"epoch": 19.539641943734015,
"grad_norm": 0.5544086694717407,
"learning_rate": 1.781358885017422e-05,
"loss": 0.2009,
"step": 15280
},
{
"epoch": 19.55242966751918,
"grad_norm": 0.000488213641801849,
"learning_rate": 1.7791811846689897e-05,
"loss": 0.4178,
"step": 15290
},
{
"epoch": 19.565217391304348,
"grad_norm": 1.7613449096679688,
"learning_rate": 1.7770034843205575e-05,
"loss": 0.1983,
"step": 15300
},
{
"epoch": 19.578005115089514,
"grad_norm": 0.23536629974842072,
"learning_rate": 1.7748257839721254e-05,
"loss": 0.1895,
"step": 15310
},
{
"epoch": 19.59079283887468,
"grad_norm": 0.29210370779037476,
"learning_rate": 1.7726480836236932e-05,
"loss": 0.2842,
"step": 15320
},
{
"epoch": 19.603580562659847,
"grad_norm": 0.4455021023750305,
"learning_rate": 1.7704703832752614e-05,
"loss": 0.176,
"step": 15330
},
{
"epoch": 19.616368286445013,
"grad_norm": 2.8317060470581055,
"learning_rate": 1.7682926829268292e-05,
"loss": 0.3407,
"step": 15340
},
{
"epoch": 19.62915601023018,
"grad_norm": 3.654068946838379,
"learning_rate": 1.7661149825783974e-05,
"loss": 0.2485,
"step": 15350
},
{
"epoch": 19.641943734015346,
"grad_norm": 4.583719730377197,
"learning_rate": 1.7639372822299652e-05,
"loss": 0.2926,
"step": 15360
},
{
"epoch": 19.654731457800512,
"grad_norm": 0.38903215527534485,
"learning_rate": 1.7617595818815334e-05,
"loss": 0.1887,
"step": 15370
},
{
"epoch": 19.66751918158568,
"grad_norm": 7.089865684509277,
"learning_rate": 1.7595818815331012e-05,
"loss": 0.2881,
"step": 15380
},
{
"epoch": 19.680306905370845,
"grad_norm": 5.161704063415527,
"learning_rate": 1.757404181184669e-05,
"loss": 0.2531,
"step": 15390
},
{
"epoch": 19.69309462915601,
"grad_norm": 2.1623754501342773,
"learning_rate": 1.755226480836237e-05,
"loss": 0.2547,
"step": 15400
},
{
"epoch": 19.705882352941178,
"grad_norm": 2.177671432495117,
"learning_rate": 1.7530487804878047e-05,
"loss": 0.301,
"step": 15410
},
{
"epoch": 19.718670076726344,
"grad_norm": 11.073813438415527,
"learning_rate": 1.750871080139373e-05,
"loss": 0.4999,
"step": 15420
},
{
"epoch": 19.73145780051151,
"grad_norm": 5.834652423858643,
"learning_rate": 1.7486933797909408e-05,
"loss": 0.3861,
"step": 15430
},
{
"epoch": 19.744245524296677,
"grad_norm": 4.418552398681641,
"learning_rate": 1.746515679442509e-05,
"loss": 0.1229,
"step": 15440
},
{
"epoch": 19.757033248081843,
"grad_norm": 2.298755168914795,
"learning_rate": 1.7443379790940768e-05,
"loss": 0.1937,
"step": 15450
},
{
"epoch": 19.76982097186701,
"grad_norm": 0.003441237611696124,
"learning_rate": 1.742160278745645e-05,
"loss": 0.1667,
"step": 15460
},
{
"epoch": 19.782608695652176,
"grad_norm": 0.5872055888175964,
"learning_rate": 1.7399825783972128e-05,
"loss": 0.1502,
"step": 15470
},
{
"epoch": 19.79539641943734,
"grad_norm": 7.624274253845215,
"learning_rate": 1.7378048780487806e-05,
"loss": 0.363,
"step": 15480
},
{
"epoch": 19.808184143222505,
"grad_norm": 5.002297401428223,
"learning_rate": 1.7356271777003484e-05,
"loss": 0.2403,
"step": 15490
},
{
"epoch": 19.82097186700767,
"grad_norm": 2.922607898712158,
"learning_rate": 1.7334494773519163e-05,
"loss": 0.0844,
"step": 15500
},
{
"epoch": 19.833759590792837,
"grad_norm": 5.784374713897705,
"learning_rate": 1.7312717770034845e-05,
"loss": 0.5496,
"step": 15510
},
{
"epoch": 19.846547314578004,
"grad_norm": 0.09805575013160706,
"learning_rate": 1.7290940766550523e-05,
"loss": 0.0923,
"step": 15520
},
{
"epoch": 19.85933503836317,
"grad_norm": 0.28828638792037964,
"learning_rate": 1.7269163763066205e-05,
"loss": 0.2232,
"step": 15530
},
{
"epoch": 19.872122762148337,
"grad_norm": 1.204892635345459,
"learning_rate": 1.7247386759581883e-05,
"loss": 0.2714,
"step": 15540
},
{
"epoch": 19.884910485933503,
"grad_norm": 3.157284736633301,
"learning_rate": 1.722560975609756e-05,
"loss": 0.1934,
"step": 15550
},
{
"epoch": 19.89769820971867,
"grad_norm": 0.35447245836257935,
"learning_rate": 1.7203832752613243e-05,
"loss": 0.1453,
"step": 15560
},
{
"epoch": 19.910485933503836,
"grad_norm": 5.422603130340576,
"learning_rate": 1.7182055749128918e-05,
"loss": 0.1164,
"step": 15570
},
{
"epoch": 19.923273657289002,
"grad_norm": 0.6058647036552429,
"learning_rate": 1.71602787456446e-05,
"loss": 0.3696,
"step": 15580
},
{
"epoch": 19.93606138107417,
"grad_norm": 2.1776912212371826,
"learning_rate": 1.7138501742160278e-05,
"loss": 0.3658,
"step": 15590
},
{
"epoch": 19.948849104859335,
"grad_norm": 0.62024986743927,
"learning_rate": 1.711672473867596e-05,
"loss": 0.2492,
"step": 15600
},
{
"epoch": 19.9616368286445,
"grad_norm": 6.89608907699585,
"learning_rate": 1.7094947735191638e-05,
"loss": 0.4042,
"step": 15610
},
{
"epoch": 19.974424552429667,
"grad_norm": 3.269071340560913,
"learning_rate": 1.707317073170732e-05,
"loss": 0.1557,
"step": 15620
},
{
"epoch": 19.987212276214834,
"grad_norm": 0.1415141373872757,
"learning_rate": 1.7051393728223e-05,
"loss": 0.0946,
"step": 15630
},
{
"epoch": 20.0,
"grad_norm": 0.5312731862068176,
"learning_rate": 1.7029616724738677e-05,
"loss": 0.207,
"step": 15640
},
{
"epoch": 20.0,
"eval_loss": 0.18845418095588684,
"eval_runtime": 0.8176,
"eval_samples_per_second": 119.862,
"eval_steps_per_second": 15.9,
"step": 15640
},
{
"epoch": 20.012787723785166,
"grad_norm": 5.064207553863525,
"learning_rate": 1.7007839721254355e-05,
"loss": 0.3085,
"step": 15650
},
{
"epoch": 20.025575447570333,
"grad_norm": 0.05143703147768974,
"learning_rate": 1.6986062717770033e-05,
"loss": 0.2168,
"step": 15660
},
{
"epoch": 20.0383631713555,
"grad_norm": 5.019478797912598,
"learning_rate": 1.6964285714285715e-05,
"loss": 0.3274,
"step": 15670
},
{
"epoch": 20.051150895140665,
"grad_norm": 0.4289814531803131,
"learning_rate": 1.6942508710801394e-05,
"loss": 0.2389,
"step": 15680
},
{
"epoch": 20.06393861892583,
"grad_norm": 0.20377962291240692,
"learning_rate": 1.6920731707317075e-05,
"loss": 0.2123,
"step": 15690
},
{
"epoch": 20.076726342710998,
"grad_norm": 2.430692672729492,
"learning_rate": 1.6898954703832754e-05,
"loss": 0.226,
"step": 15700
},
{
"epoch": 20.089514066496164,
"grad_norm": 0.026158737018704414,
"learning_rate": 1.6877177700348432e-05,
"loss": 0.2226,
"step": 15710
},
{
"epoch": 20.10230179028133,
"grad_norm": 0.08608794212341309,
"learning_rate": 1.6855400696864114e-05,
"loss": 0.2112,
"step": 15720
},
{
"epoch": 20.115089514066497,
"grad_norm": 0.00048582549788989127,
"learning_rate": 1.6833623693379792e-05,
"loss": 0.1476,
"step": 15730
},
{
"epoch": 20.127877237851663,
"grad_norm": 0.0019287772011011839,
"learning_rate": 1.681184668989547e-05,
"loss": 0.0918,
"step": 15740
},
{
"epoch": 20.14066496163683,
"grad_norm": 4.9623442464508116e-05,
"learning_rate": 1.679006968641115e-05,
"loss": 0.2505,
"step": 15750
},
{
"epoch": 20.153452685421996,
"grad_norm": 5.41326904296875,
"learning_rate": 1.676829268292683e-05,
"loss": 0.3093,
"step": 15760
},
{
"epoch": 20.166240409207163,
"grad_norm": 5.913987159729004,
"learning_rate": 1.674651567944251e-05,
"loss": 0.2159,
"step": 15770
},
{
"epoch": 20.17902813299233,
"grad_norm": 0.5496914386749268,
"learning_rate": 1.672473867595819e-05,
"loss": 0.3173,
"step": 15780
},
{
"epoch": 20.191815856777495,
"grad_norm": 0.007048303727060556,
"learning_rate": 1.670296167247387e-05,
"loss": 0.2433,
"step": 15790
},
{
"epoch": 20.20460358056266,
"grad_norm": 4.826021671295166,
"learning_rate": 1.6681184668989547e-05,
"loss": 0.4035,
"step": 15800
},
{
"epoch": 20.217391304347824,
"grad_norm": 0.10413353145122528,
"learning_rate": 1.665940766550523e-05,
"loss": 0.2547,
"step": 15810
},
{
"epoch": 20.23017902813299,
"grad_norm": 0.14804640412330627,
"learning_rate": 1.6637630662020908e-05,
"loss": 0.5118,
"step": 15820
},
{
"epoch": 20.242966751918157,
"grad_norm": 3.711113691329956,
"learning_rate": 1.6615853658536586e-05,
"loss": 0.291,
"step": 15830
},
{
"epoch": 20.255754475703323,
"grad_norm": 5.286841869354248,
"learning_rate": 1.6594076655052264e-05,
"loss": 0.3255,
"step": 15840
},
{
"epoch": 20.26854219948849,
"grad_norm": 1.9062235355377197,
"learning_rate": 1.6572299651567946e-05,
"loss": 0.4051,
"step": 15850
},
{
"epoch": 20.281329923273656,
"grad_norm": 0.0009508104994893074,
"learning_rate": 1.6550522648083624e-05,
"loss": 0.2144,
"step": 15860
},
{
"epoch": 20.294117647058822,
"grad_norm": 0.877557098865509,
"learning_rate": 1.6528745644599303e-05,
"loss": 0.1162,
"step": 15870
},
{
"epoch": 20.30690537084399,
"grad_norm": 5.116530895233154,
"learning_rate": 1.6506968641114984e-05,
"loss": 0.3343,
"step": 15880
},
{
"epoch": 20.319693094629155,
"grad_norm": 2.592881679534912,
"learning_rate": 1.6485191637630663e-05,
"loss": 0.164,
"step": 15890
},
{
"epoch": 20.33248081841432,
"grad_norm": 1.5526633262634277,
"learning_rate": 1.6463414634146345e-05,
"loss": 0.2699,
"step": 15900
},
{
"epoch": 20.345268542199488,
"grad_norm": 3.142625093460083,
"learning_rate": 1.644163763066202e-05,
"loss": 0.1924,
"step": 15910
},
{
"epoch": 20.358056265984654,
"grad_norm": 0.4076800048351288,
"learning_rate": 1.64198606271777e-05,
"loss": 0.1584,
"step": 15920
},
{
"epoch": 20.37084398976982,
"grad_norm": 5.538966655731201,
"learning_rate": 1.639808362369338e-05,
"loss": 0.2156,
"step": 15930
},
{
"epoch": 20.383631713554987,
"grad_norm": 3.252673625946045,
"learning_rate": 1.6376306620209058e-05,
"loss": 0.1807,
"step": 15940
},
{
"epoch": 20.396419437340153,
"grad_norm": 0.481976181268692,
"learning_rate": 1.635452961672474e-05,
"loss": 0.1951,
"step": 15950
},
{
"epoch": 20.40920716112532,
"grad_norm": 0.04137945547699928,
"learning_rate": 1.6332752613240418e-05,
"loss": 0.2324,
"step": 15960
},
{
"epoch": 20.421994884910486,
"grad_norm": 6.7527360916137695,
"learning_rate": 1.63109756097561e-05,
"loss": 0.2467,
"step": 15970
},
{
"epoch": 20.434782608695652,
"grad_norm": 5.791774272918701,
"learning_rate": 1.6289198606271778e-05,
"loss": 0.2561,
"step": 15980
},
{
"epoch": 20.44757033248082,
"grad_norm": 0.04831215366721153,
"learning_rate": 1.6267421602787457e-05,
"loss": 0.4145,
"step": 15990
},
{
"epoch": 20.460358056265985,
"grad_norm": 0.03270556032657623,
"learning_rate": 1.6245644599303135e-05,
"loss": 0.2934,
"step": 16000
},
{
"epoch": 20.47314578005115,
"grad_norm": 4.968059539794922,
"learning_rate": 1.6223867595818817e-05,
"loss": 0.458,
"step": 16010
},
{
"epoch": 20.485933503836318,
"grad_norm": 0.7192120552062988,
"learning_rate": 1.6202090592334495e-05,
"loss": 0.3466,
"step": 16020
},
{
"epoch": 20.498721227621484,
"grad_norm": 7.932967662811279,
"learning_rate": 1.6180313588850173e-05,
"loss": 0.3495,
"step": 16030
},
{
"epoch": 20.51150895140665,
"grad_norm": 4.001988410949707,
"learning_rate": 1.6158536585365855e-05,
"loss": 0.2856,
"step": 16040
},
{
"epoch": 20.524296675191817,
"grad_norm": 0.0006555592408403754,
"learning_rate": 1.6136759581881533e-05,
"loss": 0.3303,
"step": 16050
},
{
"epoch": 20.537084398976983,
"grad_norm": 8.240303039550781,
"learning_rate": 1.6114982578397215e-05,
"loss": 0.1341,
"step": 16060
},
{
"epoch": 20.54987212276215,
"grad_norm": 2.4732863903045654,
"learning_rate": 1.6093205574912894e-05,
"loss": 0.2962,
"step": 16070
},
{
"epoch": 20.562659846547316,
"grad_norm": 4.178097248077393,
"learning_rate": 1.6071428571428572e-05,
"loss": 0.1694,
"step": 16080
},
{
"epoch": 20.575447570332482,
"grad_norm": 0.32051607966423035,
"learning_rate": 1.604965156794425e-05,
"loss": 0.1181,
"step": 16090
},
{
"epoch": 20.58823529411765,
"grad_norm": 8.6662015914917,
"learning_rate": 1.602787456445993e-05,
"loss": 0.3845,
"step": 16100
},
{
"epoch": 20.601023017902815,
"grad_norm": 2.2716774940490723,
"learning_rate": 1.600609756097561e-05,
"loss": 0.2206,
"step": 16110
},
{
"epoch": 20.61381074168798,
"grad_norm": 8.84775161743164,
"learning_rate": 1.598432055749129e-05,
"loss": 0.4821,
"step": 16120
},
{
"epoch": 20.626598465473144,
"grad_norm": 0.0991571843624115,
"learning_rate": 1.596254355400697e-05,
"loss": 0.391,
"step": 16130
},
{
"epoch": 20.63938618925831,
"grad_norm": 0.002819400280714035,
"learning_rate": 1.594076655052265e-05,
"loss": 0.3196,
"step": 16140
},
{
"epoch": 20.652173913043477,
"grad_norm": 11.331056594848633,
"learning_rate": 1.591898954703833e-05,
"loss": 0.3294,
"step": 16150
},
{
"epoch": 20.664961636828643,
"grad_norm": 0.011265520937740803,
"learning_rate": 1.589721254355401e-05,
"loss": 0.2331,
"step": 16160
},
{
"epoch": 20.67774936061381,
"grad_norm": 1.6835376024246216,
"learning_rate": 1.5875435540069687e-05,
"loss": 0.1845,
"step": 16170
},
{
"epoch": 20.690537084398976,
"grad_norm": 0.2547198235988617,
"learning_rate": 1.5853658536585366e-05,
"loss": 0.1338,
"step": 16180
},
{
"epoch": 20.703324808184142,
"grad_norm": 0.7425469756126404,
"learning_rate": 1.5831881533101044e-05,
"loss": 0.1229,
"step": 16190
},
{
"epoch": 20.71611253196931,
"grad_norm": 6.955141067504883,
"learning_rate": 1.5810104529616726e-05,
"loss": 0.4242,
"step": 16200
},
{
"epoch": 20.728900255754475,
"grad_norm": 6.0939741134643555,
"learning_rate": 1.5788327526132404e-05,
"loss": 0.3109,
"step": 16210
},
{
"epoch": 20.74168797953964,
"grad_norm": 0.008783875964581966,
"learning_rate": 1.5766550522648086e-05,
"loss": 0.3216,
"step": 16220
},
{
"epoch": 20.754475703324808,
"grad_norm": 0.00038275119732134044,
"learning_rate": 1.5744773519163764e-05,
"loss": 0.1876,
"step": 16230
},
{
"epoch": 20.767263427109974,
"grad_norm": 0.7452578544616699,
"learning_rate": 1.5722996515679446e-05,
"loss": 0.1195,
"step": 16240
},
{
"epoch": 20.78005115089514,
"grad_norm": 0.0018406964372843504,
"learning_rate": 1.5701219512195124e-05,
"loss": 0.2781,
"step": 16250
},
{
"epoch": 20.792838874680307,
"grad_norm": 2.2064616680145264,
"learning_rate": 1.56794425087108e-05,
"loss": 0.148,
"step": 16260
},
{
"epoch": 20.805626598465473,
"grad_norm": 0.5615038871765137,
"learning_rate": 1.565766550522648e-05,
"loss": 0.2922,
"step": 16270
},
{
"epoch": 20.81841432225064,
"grad_norm": 0.5706607103347778,
"learning_rate": 1.563588850174216e-05,
"loss": 0.2429,
"step": 16280
},
{
"epoch": 20.831202046035806,
"grad_norm": 9.093474388122559,
"learning_rate": 1.561411149825784e-05,
"loss": 0.2946,
"step": 16290
},
{
"epoch": 20.843989769820972,
"grad_norm": 0.25896137952804565,
"learning_rate": 1.559233449477352e-05,
"loss": 0.1847,
"step": 16300
},
{
"epoch": 20.85677749360614,
"grad_norm": 1.9687992334365845,
"learning_rate": 1.55705574912892e-05,
"loss": 0.3932,
"step": 16310
},
{
"epoch": 20.869565217391305,
"grad_norm": 7.081894397735596,
"learning_rate": 1.554878048780488e-05,
"loss": 0.3664,
"step": 16320
},
{
"epoch": 20.88235294117647,
"grad_norm": 0.6577122211456299,
"learning_rate": 1.5527003484320558e-05,
"loss": 0.3076,
"step": 16330
},
{
"epoch": 20.895140664961637,
"grad_norm": 2.853107452392578,
"learning_rate": 1.5505226480836236e-05,
"loss": 0.2537,
"step": 16340
},
{
"epoch": 20.907928388746804,
"grad_norm": 0.6096291542053223,
"learning_rate": 1.5483449477351915e-05,
"loss": 0.1874,
"step": 16350
},
{
"epoch": 20.92071611253197,
"grad_norm": 3.1739935874938965,
"learning_rate": 1.5461672473867596e-05,
"loss": 0.2585,
"step": 16360
},
{
"epoch": 20.933503836317136,
"grad_norm": 0.1571437120437622,
"learning_rate": 1.5439895470383275e-05,
"loss": 0.1351,
"step": 16370
},
{
"epoch": 20.946291560102303,
"grad_norm": 0.07612061500549316,
"learning_rate": 1.5418118466898957e-05,
"loss": 0.1971,
"step": 16380
},
{
"epoch": 20.95907928388747,
"grad_norm": 0.008349299430847168,
"learning_rate": 1.5396341463414635e-05,
"loss": 0.241,
"step": 16390
},
{
"epoch": 20.971867007672635,
"grad_norm": 0.2986217737197876,
"learning_rate": 1.5374564459930317e-05,
"loss": 0.209,
"step": 16400
},
{
"epoch": 20.984654731457802,
"grad_norm": 0.2554277777671814,
"learning_rate": 1.5352787456445995e-05,
"loss": 0.2472,
"step": 16410
},
{
"epoch": 20.997442455242968,
"grad_norm": 0.0008612891542725265,
"learning_rate": 1.5331010452961673e-05,
"loss": 0.1471,
"step": 16420
},
{
"epoch": 21.0,
"eval_loss": 0.1870102435350418,
"eval_runtime": 0.9883,
"eval_samples_per_second": 99.165,
"eval_steps_per_second": 13.155,
"step": 16422
},
{
"epoch": 21.010230179028135,
"grad_norm": 3.3676488399505615,
"learning_rate": 1.5309233449477352e-05,
"loss": 0.2031,
"step": 16430
},
{
"epoch": 21.0230179028133,
"grad_norm": 0.06823177635669708,
"learning_rate": 1.528745644599303e-05,
"loss": 0.2932,
"step": 16440
},
{
"epoch": 21.035805626598467,
"grad_norm": 2.5173239707946777,
"learning_rate": 1.5265679442508712e-05,
"loss": 0.0969,
"step": 16450
},
{
"epoch": 21.04859335038363,
"grad_norm": 0.048735931515693665,
"learning_rate": 1.524390243902439e-05,
"loss": 0.2956,
"step": 16460
},
{
"epoch": 21.061381074168796,
"grad_norm": 7.9867262840271,
"learning_rate": 1.5222125435540072e-05,
"loss": 0.2633,
"step": 16470
},
{
"epoch": 21.074168797953963,
"grad_norm": 0.06308836489915848,
"learning_rate": 1.520034843205575e-05,
"loss": 0.2263,
"step": 16480
},
{
"epoch": 21.08695652173913,
"grad_norm": 8.272692680358887,
"learning_rate": 1.5178571428571429e-05,
"loss": 0.4894,
"step": 16490
},
{
"epoch": 21.099744245524295,
"grad_norm": 7.9979376792907715,
"learning_rate": 1.5156794425087109e-05,
"loss": 0.3538,
"step": 16500
},
{
"epoch": 21.11253196930946,
"grad_norm": 10.409687995910645,
"learning_rate": 1.5135017421602787e-05,
"loss": 0.4088,
"step": 16510
},
{
"epoch": 21.125319693094628,
"grad_norm": 0.15129724144935608,
"learning_rate": 1.5113240418118469e-05,
"loss": 0.1055,
"step": 16520
},
{
"epoch": 21.138107416879794,
"grad_norm": 8.348666191101074,
"learning_rate": 1.5091463414634147e-05,
"loss": 0.5288,
"step": 16530
},
{
"epoch": 21.15089514066496,
"grad_norm": 0.0020471445750445127,
"learning_rate": 1.5069686411149827e-05,
"loss": 0.1927,
"step": 16540
},
{
"epoch": 21.163682864450127,
"grad_norm": 0.013132164254784584,
"learning_rate": 1.5047909407665506e-05,
"loss": 0.0957,
"step": 16550
},
{
"epoch": 21.176470588235293,
"grad_norm": 0.1892154961824417,
"learning_rate": 1.5026132404181184e-05,
"loss": 0.1987,
"step": 16560
},
{
"epoch": 21.18925831202046,
"grad_norm": 2.367501974105835,
"learning_rate": 1.5004355400696866e-05,
"loss": 0.147,
"step": 16570
},
{
"epoch": 21.202046035805626,
"grad_norm": 0.15295153856277466,
"learning_rate": 1.4982578397212544e-05,
"loss": 0.1955,
"step": 16580
},
{
"epoch": 21.214833759590793,
"grad_norm": 0.3686113655567169,
"learning_rate": 1.4960801393728224e-05,
"loss": 0.3367,
"step": 16590
},
{
"epoch": 21.22762148337596,
"grad_norm": 7.953524589538574,
"learning_rate": 1.4939024390243902e-05,
"loss": 0.0922,
"step": 16600
},
{
"epoch": 21.240409207161125,
"grad_norm": 9.946479797363281,
"learning_rate": 1.4917247386759584e-05,
"loss": 0.298,
"step": 16610
},
{
"epoch": 21.25319693094629,
"grad_norm": 9.536918640136719,
"learning_rate": 1.4895470383275263e-05,
"loss": 0.2268,
"step": 16620
},
{
"epoch": 21.265984654731458,
"grad_norm": 7.6481194496154785,
"learning_rate": 1.4873693379790943e-05,
"loss": 0.5011,
"step": 16630
},
{
"epoch": 21.278772378516624,
"grad_norm": 0.00011884182458743453,
"learning_rate": 1.4851916376306621e-05,
"loss": 0.2538,
"step": 16640
},
{
"epoch": 21.29156010230179,
"grad_norm": 1.6625375747680664,
"learning_rate": 1.48301393728223e-05,
"loss": 0.2028,
"step": 16650
},
{
"epoch": 21.304347826086957,
"grad_norm": 0.04297441244125366,
"learning_rate": 1.4808362369337981e-05,
"loss": 0.1176,
"step": 16660
},
{
"epoch": 21.317135549872123,
"grad_norm": 2.1983182430267334,
"learning_rate": 1.4786585365853658e-05,
"loss": 0.2772,
"step": 16670
},
{
"epoch": 21.32992327365729,
"grad_norm": 0.23334167897701263,
"learning_rate": 1.476480836236934e-05,
"loss": 0.3034,
"step": 16680
},
{
"epoch": 21.342710997442456,
"grad_norm": 6.521374702453613,
"learning_rate": 1.4743031358885018e-05,
"loss": 0.4314,
"step": 16690
},
{
"epoch": 21.355498721227622,
"grad_norm": 3.0311429500579834,
"learning_rate": 1.4721254355400698e-05,
"loss": 0.1632,
"step": 16700
},
{
"epoch": 21.36828644501279,
"grad_norm": 1.5404332876205444,
"learning_rate": 1.4699477351916376e-05,
"loss": 0.1377,
"step": 16710
},
{
"epoch": 21.381074168797955,
"grad_norm": 3.868997573852539,
"learning_rate": 1.4677700348432055e-05,
"loss": 0.1567,
"step": 16720
},
{
"epoch": 21.39386189258312,
"grad_norm": 0.3404499888420105,
"learning_rate": 1.4655923344947736e-05,
"loss": 0.2714,
"step": 16730
},
{
"epoch": 21.406649616368288,
"grad_norm": 2.7388010025024414,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.1416,
"step": 16740
},
{
"epoch": 21.419437340153454,
"grad_norm": 0.022615350782871246,
"learning_rate": 1.4612369337979095e-05,
"loss": 0.1607,
"step": 16750
},
{
"epoch": 21.43222506393862,
"grad_norm": 7.353601932525635,
"learning_rate": 1.4590592334494773e-05,
"loss": 0.4509,
"step": 16760
},
{
"epoch": 21.445012787723787,
"grad_norm": 5.572024345397949,
"learning_rate": 1.4568815331010455e-05,
"loss": 0.2129,
"step": 16770
},
{
"epoch": 21.45780051150895,
"grad_norm": 2.0802969932556152,
"learning_rate": 1.4547038327526133e-05,
"loss": 0.3515,
"step": 16780
},
{
"epoch": 21.470588235294116,
"grad_norm": 10.334312438964844,
"learning_rate": 1.4525261324041812e-05,
"loss": 0.4857,
"step": 16790
},
{
"epoch": 21.483375959079282,
"grad_norm": 32.95657730102539,
"learning_rate": 1.4503484320557492e-05,
"loss": 0.2048,
"step": 16800
},
{
"epoch": 21.49616368286445,
"grad_norm": 2.74238657951355,
"learning_rate": 1.448170731707317e-05,
"loss": 0.1129,
"step": 16810
},
{
"epoch": 21.508951406649615,
"grad_norm": 0.7832550406455994,
"learning_rate": 1.4459930313588852e-05,
"loss": 0.165,
"step": 16820
},
{
"epoch": 21.52173913043478,
"grad_norm": 0.30068516731262207,
"learning_rate": 1.443815331010453e-05,
"loss": 0.442,
"step": 16830
},
{
"epoch": 21.534526854219948,
"grad_norm": 2.967395782470703,
"learning_rate": 1.441637630662021e-05,
"loss": 0.205,
"step": 16840
},
{
"epoch": 21.547314578005114,
"grad_norm": 0.000712763809133321,
"learning_rate": 1.4394599303135889e-05,
"loss": 0.299,
"step": 16850
},
{
"epoch": 21.56010230179028,
"grad_norm": 3.7180092334747314,
"learning_rate": 1.437282229965157e-05,
"loss": 0.3639,
"step": 16860
},
{
"epoch": 21.572890025575447,
"grad_norm": 1.8158949613571167,
"learning_rate": 1.4351045296167249e-05,
"loss": 0.4136,
"step": 16870
},
{
"epoch": 21.585677749360613,
"grad_norm": 7.350913047790527,
"learning_rate": 1.4329268292682927e-05,
"loss": 0.5053,
"step": 16880
},
{
"epoch": 21.59846547314578,
"grad_norm": 1.9012134075164795,
"learning_rate": 1.4307491289198607e-05,
"loss": 0.2196,
"step": 16890
},
{
"epoch": 21.611253196930946,
"grad_norm": 1.2609905004501343,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.1712,
"step": 16900
},
{
"epoch": 21.624040920716112,
"grad_norm": 9.658625602722168,
"learning_rate": 1.4263937282229967e-05,
"loss": 0.258,
"step": 16910
},
{
"epoch": 21.63682864450128,
"grad_norm": 1.7282674312591553,
"learning_rate": 1.4242160278745646e-05,
"loss": 0.3437,
"step": 16920
},
{
"epoch": 21.649616368286445,
"grad_norm": 0.8424760699272156,
"learning_rate": 1.4220383275261326e-05,
"loss": 0.2512,
"step": 16930
},
{
"epoch": 21.66240409207161,
"grad_norm": 0.018984561786055565,
"learning_rate": 1.4198606271777004e-05,
"loss": 0.1251,
"step": 16940
},
{
"epoch": 21.675191815856778,
"grad_norm": 6.375213146209717,
"learning_rate": 1.4176829268292682e-05,
"loss": 0.2275,
"step": 16950
},
{
"epoch": 21.687979539641944,
"grad_norm": 2.3540890216827393,
"learning_rate": 1.4155052264808364e-05,
"loss": 0.2892,
"step": 16960
},
{
"epoch": 21.70076726342711,
"grad_norm": 2.8791966438293457,
"learning_rate": 1.413327526132404e-05,
"loss": 0.3235,
"step": 16970
},
{
"epoch": 21.713554987212277,
"grad_norm": 4.614317893981934,
"learning_rate": 1.4111498257839722e-05,
"loss": 0.2368,
"step": 16980
},
{
"epoch": 21.726342710997443,
"grad_norm": 0.14197991788387299,
"learning_rate": 1.40897212543554e-05,
"loss": 0.077,
"step": 16990
},
{
"epoch": 21.73913043478261,
"grad_norm": 13.631002426147461,
"learning_rate": 1.4067944250871083e-05,
"loss": 0.1937,
"step": 17000
},
{
"epoch": 21.751918158567776,
"grad_norm": 5.711042404174805,
"learning_rate": 1.404616724738676e-05,
"loss": 0.2572,
"step": 17010
},
{
"epoch": 21.764705882352942,
"grad_norm": 3.289583206176758,
"learning_rate": 1.4024390243902441e-05,
"loss": 0.279,
"step": 17020
},
{
"epoch": 21.77749360613811,
"grad_norm": 1.9635261297225952,
"learning_rate": 1.400261324041812e-05,
"loss": 0.4068,
"step": 17030
},
{
"epoch": 21.790281329923275,
"grad_norm": 3.4247848987579346,
"learning_rate": 1.3980836236933798e-05,
"loss": 0.2091,
"step": 17040
},
{
"epoch": 21.80306905370844,
"grad_norm": 0.00046931157703511417,
"learning_rate": 1.3959059233449478e-05,
"loss": 0.1642,
"step": 17050
},
{
"epoch": 21.815856777493607,
"grad_norm": 2.2263681888580322,
"learning_rate": 1.3937282229965156e-05,
"loss": 0.2707,
"step": 17060
},
{
"epoch": 21.828644501278774,
"grad_norm": 4.848337173461914,
"learning_rate": 1.3915505226480838e-05,
"loss": 0.2175,
"step": 17070
},
{
"epoch": 21.84143222506394,
"grad_norm": 0.009802093729376793,
"learning_rate": 1.3893728222996516e-05,
"loss": 0.191,
"step": 17080
},
{
"epoch": 21.854219948849106,
"grad_norm": 0.4041551649570465,
"learning_rate": 1.3871951219512196e-05,
"loss": 0.1496,
"step": 17090
},
{
"epoch": 21.867007672634273,
"grad_norm": 2.5238401889801025,
"learning_rate": 1.3850174216027875e-05,
"loss": 0.1683,
"step": 17100
},
{
"epoch": 21.87979539641944,
"grad_norm": 1.646201491355896,
"learning_rate": 1.3828397212543553e-05,
"loss": 0.3284,
"step": 17110
},
{
"epoch": 21.892583120204602,
"grad_norm": 12.850646018981934,
"learning_rate": 1.3806620209059235e-05,
"loss": 0.2655,
"step": 17120
},
{
"epoch": 21.90537084398977,
"grad_norm": 0.1326649785041809,
"learning_rate": 1.3784843205574913e-05,
"loss": 0.2353,
"step": 17130
},
{
"epoch": 21.918158567774935,
"grad_norm": 0.05697041004896164,
"learning_rate": 1.3763066202090593e-05,
"loss": 0.2626,
"step": 17140
},
{
"epoch": 21.9309462915601,
"grad_norm": 2.5305371284484863,
"learning_rate": 1.3741289198606271e-05,
"loss": 0.2074,
"step": 17150
},
{
"epoch": 21.943734015345267,
"grad_norm": 1.367523193359375,
"learning_rate": 1.3719512195121953e-05,
"loss": 0.0912,
"step": 17160
},
{
"epoch": 21.956521739130434,
"grad_norm": 4.560670375823975,
"learning_rate": 1.3697735191637632e-05,
"loss": 0.2482,
"step": 17170
},
{
"epoch": 21.9693094629156,
"grad_norm": 0.6231527924537659,
"learning_rate": 1.367595818815331e-05,
"loss": 0.2624,
"step": 17180
},
{
"epoch": 21.982097186700766,
"grad_norm": 6.82766056060791,
"learning_rate": 1.365418118466899e-05,
"loss": 0.2555,
"step": 17190
},
{
"epoch": 21.994884910485933,
"grad_norm": 0.7137308120727539,
"learning_rate": 1.3632404181184668e-05,
"loss": 0.1362,
"step": 17200
},
{
"epoch": 22.0,
"eval_loss": 0.18604347109794617,
"eval_runtime": 0.9758,
"eval_samples_per_second": 100.43,
"eval_steps_per_second": 13.322,
"step": 17204
},
{
"epoch": 22.0076726342711,
"grad_norm": 3.093580961227417,
"learning_rate": 1.361062717770035e-05,
"loss": 0.2791,
"step": 17210
},
{
"epoch": 22.020460358056265,
"grad_norm": 0.4014206528663635,
"learning_rate": 1.3588850174216028e-05,
"loss": 0.3631,
"step": 17220
},
{
"epoch": 22.033248081841432,
"grad_norm": 8.501093543600291e-05,
"learning_rate": 1.3567073170731709e-05,
"loss": 0.281,
"step": 17230
},
{
"epoch": 22.046035805626598,
"grad_norm": 8.508047103881836,
"learning_rate": 1.3545296167247387e-05,
"loss": 0.2143,
"step": 17240
},
{
"epoch": 22.058823529411764,
"grad_norm": 6.361041069030762,
"learning_rate": 1.3523519163763069e-05,
"loss": 0.3231,
"step": 17250
},
{
"epoch": 22.07161125319693,
"grad_norm": 6.605595111846924,
"learning_rate": 1.3501742160278747e-05,
"loss": 0.5286,
"step": 17260
},
{
"epoch": 22.084398976982097,
"grad_norm": 0.00041541698738001287,
"learning_rate": 1.3479965156794425e-05,
"loss": 0.3769,
"step": 17270
},
{
"epoch": 22.097186700767264,
"grad_norm": 0.15628832578659058,
"learning_rate": 1.3458188153310105e-05,
"loss": 0.1301,
"step": 17280
},
{
"epoch": 22.10997442455243,
"grad_norm": 2.120054244995117,
"learning_rate": 1.3436411149825784e-05,
"loss": 0.1104,
"step": 17290
},
{
"epoch": 22.122762148337596,
"grad_norm": 3.07108473777771,
"learning_rate": 1.3414634146341466e-05,
"loss": 0.1603,
"step": 17300
},
{
"epoch": 22.135549872122763,
"grad_norm": 5.679004669189453,
"learning_rate": 1.3392857142857144e-05,
"loss": 0.2435,
"step": 17310
},
{
"epoch": 22.14833759590793,
"grad_norm": 5.24109411239624,
"learning_rate": 1.3371080139372824e-05,
"loss": 0.234,
"step": 17320
},
{
"epoch": 22.161125319693095,
"grad_norm": 4.5727949142456055,
"learning_rate": 1.3349303135888502e-05,
"loss": 0.1631,
"step": 17330
},
{
"epoch": 22.17391304347826,
"grad_norm": 3.552060127258301,
"learning_rate": 1.332752613240418e-05,
"loss": 0.148,
"step": 17340
},
{
"epoch": 22.186700767263428,
"grad_norm": 0.00046669490984641016,
"learning_rate": 1.3305749128919862e-05,
"loss": 0.2155,
"step": 17350
},
{
"epoch": 22.199488491048594,
"grad_norm": 0.26570141315460205,
"learning_rate": 1.3283972125435539e-05,
"loss": 0.2783,
"step": 17360
},
{
"epoch": 22.21227621483376,
"grad_norm": 8.340713500976562,
"learning_rate": 1.326219512195122e-05,
"loss": 0.2665,
"step": 17370
},
{
"epoch": 22.225063938618927,
"grad_norm": 0.003582471050322056,
"learning_rate": 1.3240418118466899e-05,
"loss": 0.2328,
"step": 17380
},
{
"epoch": 22.237851662404093,
"grad_norm": 0.14131903648376465,
"learning_rate": 1.3218641114982581e-05,
"loss": 0.2196,
"step": 17390
},
{
"epoch": 22.25063938618926,
"grad_norm": 0.5135777592658997,
"learning_rate": 1.3196864111498258e-05,
"loss": 0.2071,
"step": 17400
},
{
"epoch": 22.263427109974426,
"grad_norm": 0.34391576051712036,
"learning_rate": 1.3175087108013936e-05,
"loss": 0.0953,
"step": 17410
},
{
"epoch": 22.276214833759592,
"grad_norm": 2.3617889881134033,
"learning_rate": 1.3153310104529618e-05,
"loss": 0.1522,
"step": 17420
},
{
"epoch": 22.289002557544755,
"grad_norm": 0.0006938801379874349,
"learning_rate": 1.3131533101045296e-05,
"loss": 0.1555,
"step": 17430
},
{
"epoch": 22.30179028132992,
"grad_norm": 2.4944310188293457,
"learning_rate": 1.3109756097560976e-05,
"loss": 0.278,
"step": 17440
},
{
"epoch": 22.314578005115088,
"grad_norm": 1.586595892906189,
"learning_rate": 1.3087979094076654e-05,
"loss": 0.2938,
"step": 17450
},
{
"epoch": 22.327365728900254,
"grad_norm": 0.518889307975769,
"learning_rate": 1.3066202090592336e-05,
"loss": 0.2189,
"step": 17460
},
{
"epoch": 22.34015345268542,
"grad_norm": 3.496354111121036e-05,
"learning_rate": 1.3044425087108015e-05,
"loss": 0.3248,
"step": 17470
},
{
"epoch": 22.352941176470587,
"grad_norm": 0.38596707582473755,
"learning_rate": 1.3022648083623695e-05,
"loss": 0.3765,
"step": 17480
},
{
"epoch": 22.365728900255753,
"grad_norm": 0.0012932750396430492,
"learning_rate": 1.3000871080139373e-05,
"loss": 0.2076,
"step": 17490
},
{
"epoch": 22.37851662404092,
"grad_norm": 0.022681573405861855,
"learning_rate": 1.2979094076655051e-05,
"loss": 0.0535,
"step": 17500
},
{
"epoch": 22.391304347826086,
"grad_norm": 0.041287824511528015,
"learning_rate": 1.2957317073170733e-05,
"loss": 0.2799,
"step": 17510
},
{
"epoch": 22.404092071611252,
"grad_norm": 0.015387671068310738,
"learning_rate": 1.2935540069686411e-05,
"loss": 0.4086,
"step": 17520
},
{
"epoch": 22.41687979539642,
"grad_norm": 0.161788210272789,
"learning_rate": 1.2913763066202091e-05,
"loss": 0.0525,
"step": 17530
},
{
"epoch": 22.429667519181585,
"grad_norm": 0.8831532001495361,
"learning_rate": 1.289198606271777e-05,
"loss": 0.3695,
"step": 17540
},
{
"epoch": 22.44245524296675,
"grad_norm": 0.14793658256530762,
"learning_rate": 1.2870209059233452e-05,
"loss": 0.1574,
"step": 17550
},
{
"epoch": 22.455242966751918,
"grad_norm": 1.977602243423462,
"learning_rate": 1.284843205574913e-05,
"loss": 0.3135,
"step": 17560
},
{
"epoch": 22.468030690537084,
"grad_norm": 1.72086763381958,
"learning_rate": 1.2826655052264808e-05,
"loss": 0.2518,
"step": 17570
},
{
"epoch": 22.48081841432225,
"grad_norm": 1.3766134977340698,
"learning_rate": 1.2804878048780488e-05,
"loss": 0.2637,
"step": 17580
},
{
"epoch": 22.493606138107417,
"grad_norm": 1.531388282775879,
"learning_rate": 1.2783101045296167e-05,
"loss": 0.2058,
"step": 17590
},
{
"epoch": 22.506393861892583,
"grad_norm": 0.3944302499294281,
"learning_rate": 1.2761324041811848e-05,
"loss": 0.0879,
"step": 17600
},
{
"epoch": 22.51918158567775,
"grad_norm": 2.6415324211120605,
"learning_rate": 1.2739547038327527e-05,
"loss": 0.3183,
"step": 17610
},
{
"epoch": 22.531969309462916,
"grad_norm": 0.9195997714996338,
"learning_rate": 1.2717770034843207e-05,
"loss": 0.1717,
"step": 17620
},
{
"epoch": 22.544757033248082,
"grad_norm": 0.0030467098113149405,
"learning_rate": 1.2695993031358885e-05,
"loss": 0.2261,
"step": 17630
},
{
"epoch": 22.55754475703325,
"grad_norm": 9.557846069335938,
"learning_rate": 1.2674216027874567e-05,
"loss": 0.3103,
"step": 17640
},
{
"epoch": 22.570332480818415,
"grad_norm": 4.649169921875,
"learning_rate": 1.2652439024390245e-05,
"loss": 0.4754,
"step": 17650
},
{
"epoch": 22.58312020460358,
"grad_norm": 0.0008290003752335906,
"learning_rate": 1.2630662020905924e-05,
"loss": 0.3525,
"step": 17660
},
{
"epoch": 22.595907928388748,
"grad_norm": 4.795851230621338,
"learning_rate": 1.2608885017421604e-05,
"loss": 0.2181,
"step": 17670
},
{
"epoch": 22.608695652173914,
"grad_norm": 5.425566673278809,
"learning_rate": 1.2587108013937282e-05,
"loss": 0.4152,
"step": 17680
},
{
"epoch": 22.62148337595908,
"grad_norm": 10.171377182006836,
"learning_rate": 1.2565331010452964e-05,
"loss": 0.1547,
"step": 17690
},
{
"epoch": 22.634271099744247,
"grad_norm": 16.753875732421875,
"learning_rate": 1.2543554006968642e-05,
"loss": 0.3034,
"step": 17700
},
{
"epoch": 22.647058823529413,
"grad_norm": 4.119002819061279,
"learning_rate": 1.2521777003484322e-05,
"loss": 0.2585,
"step": 17710
},
{
"epoch": 22.65984654731458,
"grad_norm": 0.46646326780319214,
"learning_rate": 1.25e-05,
"loss": 0.1874,
"step": 17720
},
{
"epoch": 22.672634271099746,
"grad_norm": 4.02834939956665,
"learning_rate": 1.247822299651568e-05,
"loss": 0.1695,
"step": 17730
},
{
"epoch": 22.685421994884912,
"grad_norm": 0.11288446187973022,
"learning_rate": 1.2456445993031359e-05,
"loss": 0.2845,
"step": 17740
},
{
"epoch": 22.69820971867008,
"grad_norm": 0.019553814083337784,
"learning_rate": 1.2434668989547039e-05,
"loss": 0.2784,
"step": 17750
},
{
"epoch": 22.710997442455245,
"grad_norm": 0.01391393318772316,
"learning_rate": 1.2412891986062717e-05,
"loss": 0.12,
"step": 17760
},
{
"epoch": 22.723785166240408,
"grad_norm": 2.1759300231933594,
"learning_rate": 1.2391114982578397e-05,
"loss": 0.0794,
"step": 17770
},
{
"epoch": 22.736572890025574,
"grad_norm": 0.0018889792263507843,
"learning_rate": 1.2369337979094078e-05,
"loss": 0.1777,
"step": 17780
},
{
"epoch": 22.74936061381074,
"grad_norm": 0.005631202831864357,
"learning_rate": 1.2347560975609756e-05,
"loss": 0.3136,
"step": 17790
},
{
"epoch": 22.762148337595907,
"grad_norm": 9.254101753234863,
"learning_rate": 1.2325783972125436e-05,
"loss": 0.313,
"step": 17800
},
{
"epoch": 22.774936061381073,
"grad_norm": 1.4763479232788086,
"learning_rate": 1.2304006968641116e-05,
"loss": 0.1645,
"step": 17810
},
{
"epoch": 22.78772378516624,
"grad_norm": 0.2448328286409378,
"learning_rate": 1.2282229965156796e-05,
"loss": 0.1259,
"step": 17820
},
{
"epoch": 22.800511508951406,
"grad_norm": 0.14641886949539185,
"learning_rate": 1.2260452961672474e-05,
"loss": 0.261,
"step": 17830
},
{
"epoch": 22.813299232736572,
"grad_norm": 0.4757079482078552,
"learning_rate": 1.2238675958188153e-05,
"loss": 0.2618,
"step": 17840
},
{
"epoch": 22.82608695652174,
"grad_norm": 0.00018090580124408007,
"learning_rate": 1.2216898954703833e-05,
"loss": 0.2549,
"step": 17850
},
{
"epoch": 22.838874680306905,
"grad_norm": 0.9151415228843689,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.2213,
"step": 17860
},
{
"epoch": 22.85166240409207,
"grad_norm": 6.071374893188477,
"learning_rate": 1.2173344947735193e-05,
"loss": 0.38,
"step": 17870
},
{
"epoch": 22.864450127877237,
"grad_norm": 8.787729263305664,
"learning_rate": 1.2151567944250871e-05,
"loss": 0.1553,
"step": 17880
},
{
"epoch": 22.877237851662404,
"grad_norm": 0.00015920742589514703,
"learning_rate": 1.2129790940766551e-05,
"loss": 0.3884,
"step": 17890
},
{
"epoch": 22.89002557544757,
"grad_norm": 1.9935282468795776,
"learning_rate": 1.2108013937282231e-05,
"loss": 0.3219,
"step": 17900
},
{
"epoch": 22.902813299232736,
"grad_norm": 1.669886589050293,
"learning_rate": 1.2086236933797911e-05,
"loss": 0.1544,
"step": 17910
},
{
"epoch": 22.915601023017903,
"grad_norm": 5.934450149536133,
"learning_rate": 1.2064459930313588e-05,
"loss": 0.1828,
"step": 17920
},
{
"epoch": 22.92838874680307,
"grad_norm": 0.0014017786597833037,
"learning_rate": 1.2042682926829268e-05,
"loss": 0.2292,
"step": 17930
},
{
"epoch": 22.941176470588236,
"grad_norm": 0.057678937911987305,
"learning_rate": 1.2020905923344948e-05,
"loss": 0.1398,
"step": 17940
},
{
"epoch": 22.953964194373402,
"grad_norm": 5.233877658843994,
"learning_rate": 1.1999128919860628e-05,
"loss": 0.2214,
"step": 17950
},
{
"epoch": 22.966751918158568,
"grad_norm": 5.719663619995117,
"learning_rate": 1.1977351916376307e-05,
"loss": 0.4731,
"step": 17960
},
{
"epoch": 22.979539641943735,
"grad_norm": 3.918996572494507,
"learning_rate": 1.1955574912891987e-05,
"loss": 0.1611,
"step": 17970
},
{
"epoch": 22.9923273657289,
"grad_norm": 4.605996608734131,
"learning_rate": 1.1933797909407667e-05,
"loss": 0.1637,
"step": 17980
},
{
"epoch": 23.0,
"eval_loss": 0.18160267174243927,
"eval_runtime": 0.9793,
"eval_samples_per_second": 100.07,
"eval_steps_per_second": 13.275,
"step": 17986
},
{
"epoch": 23.005115089514067,
"grad_norm": 0.10118851810693741,
"learning_rate": 1.1912020905923347e-05,
"loss": 0.1527,
"step": 17990
},
{
"epoch": 23.017902813299234,
"grad_norm": 2.708587646484375,
"learning_rate": 1.1890243902439025e-05,
"loss": 0.13,
"step": 18000
},
{
"epoch": 23.0306905370844,
"grad_norm": 6.671478748321533,
"learning_rate": 1.1868466898954703e-05,
"loss": 0.3244,
"step": 18010
},
{
"epoch": 23.043478260869566,
"grad_norm": 5.663084983825684,
"learning_rate": 1.1846689895470384e-05,
"loss": 0.2156,
"step": 18020
},
{
"epoch": 23.056265984654733,
"grad_norm": 3.7614872455596924,
"learning_rate": 1.1824912891986064e-05,
"loss": 0.1401,
"step": 18030
},
{
"epoch": 23.0690537084399,
"grad_norm": 0.7713243365287781,
"learning_rate": 1.1803135888501744e-05,
"loss": 0.1942,
"step": 18040
},
{
"epoch": 23.081841432225065,
"grad_norm": 4.182374686934054e-05,
"learning_rate": 1.1781358885017422e-05,
"loss": 0.1929,
"step": 18050
},
{
"epoch": 23.09462915601023,
"grad_norm": 4.289588451385498,
"learning_rate": 1.1759581881533102e-05,
"loss": 0.1914,
"step": 18060
},
{
"epoch": 23.107416879795398,
"grad_norm": 2.5513131618499756,
"learning_rate": 1.173780487804878e-05,
"loss": 0.1723,
"step": 18070
},
{
"epoch": 23.120204603580564,
"grad_norm": 1.6158396005630493,
"learning_rate": 1.171602787456446e-05,
"loss": 0.2443,
"step": 18080
},
{
"epoch": 23.132992327365727,
"grad_norm": 6.644562244415283,
"learning_rate": 1.1694250871080139e-05,
"loss": 0.2504,
"step": 18090
},
{
"epoch": 23.145780051150894,
"grad_norm": 3.37807035446167,
"learning_rate": 1.1672473867595819e-05,
"loss": 0.2057,
"step": 18100
},
{
"epoch": 23.15856777493606,
"grad_norm": 1.8405261039733887,
"learning_rate": 1.1650696864111499e-05,
"loss": 0.2303,
"step": 18110
},
{
"epoch": 23.171355498721226,
"grad_norm": 0.35012125968933105,
"learning_rate": 1.1628919860627179e-05,
"loss": 0.1204,
"step": 18120
},
{
"epoch": 23.184143222506393,
"grad_norm": 7.988998889923096,
"learning_rate": 1.1607142857142857e-05,
"loss": 0.2576,
"step": 18130
},
{
"epoch": 23.19693094629156,
"grad_norm": 0.010078749619424343,
"learning_rate": 1.1585365853658537e-05,
"loss": 0.1435,
"step": 18140
},
{
"epoch": 23.209718670076725,
"grad_norm": 0.5293736457824707,
"learning_rate": 1.1563588850174216e-05,
"loss": 0.1967,
"step": 18150
},
{
"epoch": 23.22250639386189,
"grad_norm": 0.9761539697647095,
"learning_rate": 1.1541811846689896e-05,
"loss": 0.1991,
"step": 18160
},
{
"epoch": 23.235294117647058,
"grad_norm": 1.45791494846344,
"learning_rate": 1.1520034843205576e-05,
"loss": 0.4252,
"step": 18170
},
{
"epoch": 23.248081841432224,
"grad_norm": 0.30160781741142273,
"learning_rate": 1.1498257839721254e-05,
"loss": 0.2784,
"step": 18180
},
{
"epoch": 23.26086956521739,
"grad_norm": 6.1339898109436035,
"learning_rate": 1.1476480836236934e-05,
"loss": 0.1893,
"step": 18190
},
{
"epoch": 23.273657289002557,
"grad_norm": 2.0641069412231445,
"learning_rate": 1.1454703832752614e-05,
"loss": 0.2487,
"step": 18200
},
{
"epoch": 23.286445012787723,
"grad_norm": 0.19321462512016296,
"learning_rate": 1.1432926829268294e-05,
"loss": 0.2067,
"step": 18210
},
{
"epoch": 23.29923273657289,
"grad_norm": 0.0008184879552572966,
"learning_rate": 1.1411149825783973e-05,
"loss": 0.2678,
"step": 18220
},
{
"epoch": 23.312020460358056,
"grad_norm": 6.459168910980225,
"learning_rate": 1.1389372822299651e-05,
"loss": 0.1284,
"step": 18230
},
{
"epoch": 23.324808184143222,
"grad_norm": 5.253271579742432,
"learning_rate": 1.1367595818815331e-05,
"loss": 0.2221,
"step": 18240
},
{
"epoch": 23.33759590792839,
"grad_norm": 0.016655512154102325,
"learning_rate": 1.1345818815331011e-05,
"loss": 0.1715,
"step": 18250
},
{
"epoch": 23.350383631713555,
"grad_norm": 1.1747348308563232,
"learning_rate": 1.132404181184669e-05,
"loss": 0.2928,
"step": 18260
},
{
"epoch": 23.36317135549872,
"grad_norm": 10.306817054748535,
"learning_rate": 1.130226480836237e-05,
"loss": 0.2723,
"step": 18270
},
{
"epoch": 23.375959079283888,
"grad_norm": 2.6417109966278076,
"learning_rate": 1.128048780487805e-05,
"loss": 0.3317,
"step": 18280
},
{
"epoch": 23.388746803069054,
"grad_norm": 1.2946593761444092,
"learning_rate": 1.125871080139373e-05,
"loss": 0.3156,
"step": 18290
},
{
"epoch": 23.40153452685422,
"grad_norm": 1.994095802307129,
"learning_rate": 1.1236933797909408e-05,
"loss": 0.3111,
"step": 18300
},
{
"epoch": 23.414322250639387,
"grad_norm": 1.8308746814727783,
"learning_rate": 1.1215156794425086e-05,
"loss": 0.377,
"step": 18310
},
{
"epoch": 23.427109974424553,
"grad_norm": 0.07952219992876053,
"learning_rate": 1.1193379790940766e-05,
"loss": 0.3723,
"step": 18320
},
{
"epoch": 23.43989769820972,
"grad_norm": 0.6271858215332031,
"learning_rate": 1.1171602787456447e-05,
"loss": 0.2829,
"step": 18330
},
{
"epoch": 23.452685421994886,
"grad_norm": 5.1058573722839355,
"learning_rate": 1.1149825783972127e-05,
"loss": 0.4572,
"step": 18340
},
{
"epoch": 23.465473145780052,
"grad_norm": 1.6570324897766113,
"learning_rate": 1.1128048780487805e-05,
"loss": 0.2236,
"step": 18350
},
{
"epoch": 23.47826086956522,
"grad_norm": 0.3510279357433319,
"learning_rate": 1.1106271777003485e-05,
"loss": 0.2297,
"step": 18360
},
{
"epoch": 23.491048593350385,
"grad_norm": 0.014843414537608624,
"learning_rate": 1.1084494773519165e-05,
"loss": 0.3306,
"step": 18370
},
{
"epoch": 23.50383631713555,
"grad_norm": 1.8954602479934692,
"learning_rate": 1.1062717770034843e-05,
"loss": 0.1152,
"step": 18380
},
{
"epoch": 23.516624040920718,
"grad_norm": 0.40667295455932617,
"learning_rate": 1.1040940766550523e-05,
"loss": 0.1229,
"step": 18390
},
{
"epoch": 23.529411764705884,
"grad_norm": 3.786303758621216,
"learning_rate": 1.1019163763066202e-05,
"loss": 0.2242,
"step": 18400
},
{
"epoch": 23.54219948849105,
"grad_norm": 0.021371856331825256,
"learning_rate": 1.0997386759581882e-05,
"loss": 0.3529,
"step": 18410
},
{
"epoch": 23.554987212276213,
"grad_norm": 0.805793821811676,
"learning_rate": 1.0975609756097562e-05,
"loss": 0.1224,
"step": 18420
},
{
"epoch": 23.56777493606138,
"grad_norm": 3.2790889739990234,
"learning_rate": 1.0953832752613242e-05,
"loss": 0.1574,
"step": 18430
},
{
"epoch": 23.580562659846546,
"grad_norm": 3.754932165145874,
"learning_rate": 1.093205574912892e-05,
"loss": 0.1643,
"step": 18440
},
{
"epoch": 23.593350383631712,
"grad_norm": 3.4517104625701904,
"learning_rate": 1.09102787456446e-05,
"loss": 0.1885,
"step": 18450
},
{
"epoch": 23.60613810741688,
"grad_norm": 1.3036948442459106,
"learning_rate": 1.0888501742160279e-05,
"loss": 0.2806,
"step": 18460
},
{
"epoch": 23.618925831202045,
"grad_norm": 0.0014313430292531848,
"learning_rate": 1.0866724738675959e-05,
"loss": 0.2697,
"step": 18470
},
{
"epoch": 23.63171355498721,
"grad_norm": 2.177548408508301,
"learning_rate": 1.0844947735191637e-05,
"loss": 0.3779,
"step": 18480
},
{
"epoch": 23.644501278772378,
"grad_norm": 3.585585594177246,
"learning_rate": 1.0823170731707317e-05,
"loss": 0.3034,
"step": 18490
},
{
"epoch": 23.657289002557544,
"grad_norm": 0.02034856006503105,
"learning_rate": 1.0801393728222997e-05,
"loss": 0.2041,
"step": 18500
},
{
"epoch": 23.67007672634271,
"grad_norm": 1.1447325944900513,
"learning_rate": 1.0779616724738677e-05,
"loss": 0.4234,
"step": 18510
},
{
"epoch": 23.682864450127877,
"grad_norm": 1.8239119052886963,
"learning_rate": 1.0757839721254356e-05,
"loss": 0.5485,
"step": 18520
},
{
"epoch": 23.695652173913043,
"grad_norm": 4.72014856338501,
"learning_rate": 1.0736062717770036e-05,
"loss": 0.1865,
"step": 18530
},
{
"epoch": 23.70843989769821,
"grad_norm": 0.9234018325805664,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.2055,
"step": 18540
},
{
"epoch": 23.721227621483376,
"grad_norm": 0.0001534064649604261,
"learning_rate": 1.0692508710801394e-05,
"loss": 0.3775,
"step": 18550
},
{
"epoch": 23.734015345268542,
"grad_norm": 0.08756177872419357,
"learning_rate": 1.0670731707317074e-05,
"loss": 0.2456,
"step": 18560
},
{
"epoch": 23.74680306905371,
"grad_norm": 1.3506488800048828,
"learning_rate": 1.0648954703832753e-05,
"loss": 0.2995,
"step": 18570
},
{
"epoch": 23.759590792838875,
"grad_norm": 1.4564141035079956,
"learning_rate": 1.0627177700348433e-05,
"loss": 0.2127,
"step": 18580
},
{
"epoch": 23.77237851662404,
"grad_norm": 5.594372749328613,
"learning_rate": 1.0605400696864113e-05,
"loss": 0.2294,
"step": 18590
},
{
"epoch": 23.785166240409207,
"grad_norm": 0.6284129619598389,
"learning_rate": 1.0583623693379793e-05,
"loss": 0.239,
"step": 18600
},
{
"epoch": 23.797953964194374,
"grad_norm": 0.047873031347990036,
"learning_rate": 1.0561846689895471e-05,
"loss": 0.1433,
"step": 18610
},
{
"epoch": 23.81074168797954,
"grad_norm": 5.969231128692627,
"learning_rate": 1.054006968641115e-05,
"loss": 0.1204,
"step": 18620
},
{
"epoch": 23.823529411764707,
"grad_norm": 11.529876708984375,
"learning_rate": 1.051829268292683e-05,
"loss": 0.2963,
"step": 18630
},
{
"epoch": 23.836317135549873,
"grad_norm": 1.0211116075515747,
"learning_rate": 1.049651567944251e-05,
"loss": 0.1601,
"step": 18640
},
{
"epoch": 23.84910485933504,
"grad_norm": 4.957517623901367,
"learning_rate": 1.0474738675958188e-05,
"loss": 0.2966,
"step": 18650
},
{
"epoch": 23.861892583120206,
"grad_norm": 0.279568612575531,
"learning_rate": 1.0452961672473868e-05,
"loss": 0.3315,
"step": 18660
},
{
"epoch": 23.874680306905372,
"grad_norm": 6.54602575302124,
"learning_rate": 1.0431184668989548e-05,
"loss": 0.183,
"step": 18670
},
{
"epoch": 23.88746803069054,
"grad_norm": 9.698213577270508,
"learning_rate": 1.0409407665505228e-05,
"loss": 0.3088,
"step": 18680
},
{
"epoch": 23.900255754475705,
"grad_norm": 7.419322967529297,
"learning_rate": 1.0387630662020906e-05,
"loss": 0.2057,
"step": 18690
},
{
"epoch": 23.91304347826087,
"grad_norm": 0.023956669494509697,
"learning_rate": 1.0365853658536585e-05,
"loss": 0.2966,
"step": 18700
},
{
"epoch": 23.925831202046037,
"grad_norm": 4.011311054229736,
"learning_rate": 1.0344076655052265e-05,
"loss": 0.1879,
"step": 18710
},
{
"epoch": 23.938618925831204,
"grad_norm": 6.17871618270874,
"learning_rate": 1.0322299651567945e-05,
"loss": 0.4622,
"step": 18720
},
{
"epoch": 23.95140664961637,
"grad_norm": 3.367823600769043,
"learning_rate": 1.0300522648083625e-05,
"loss": 0.1969,
"step": 18730
},
{
"epoch": 23.964194373401533,
"grad_norm": 2.884002685546875,
"learning_rate": 1.0278745644599303e-05,
"loss": 0.1785,
"step": 18740
},
{
"epoch": 23.9769820971867,
"grad_norm": 0.010596761479973793,
"learning_rate": 1.0256968641114983e-05,
"loss": 0.1556,
"step": 18750
},
{
"epoch": 23.989769820971865,
"grad_norm": 3.776186531467829e-06,
"learning_rate": 1.0235191637630663e-05,
"loss": 0.1703,
"step": 18760
},
{
"epoch": 24.0,
"eval_loss": 0.18299047648906708,
"eval_runtime": 0.9773,
"eval_samples_per_second": 100.272,
"eval_steps_per_second": 13.301,
"step": 18768
},
{
"epoch": 24.002557544757032,
"grad_norm": 1.9648569822311401,
"learning_rate": 1.0213414634146342e-05,
"loss": 0.1341,
"step": 18770
},
{
"epoch": 24.015345268542198,
"grad_norm": 8.87406786205247e-05,
"learning_rate": 1.019163763066202e-05,
"loss": 0.072,
"step": 18780
},
{
"epoch": 24.028132992327365,
"grad_norm": 1.8087506294250488,
"learning_rate": 1.01698606271777e-05,
"loss": 0.2022,
"step": 18790
},
{
"epoch": 24.04092071611253,
"grad_norm": 0.00022683825227431953,
"learning_rate": 1.014808362369338e-05,
"loss": 0.219,
"step": 18800
},
{
"epoch": 24.053708439897697,
"grad_norm": 8.788445848040283e-05,
"learning_rate": 1.012630662020906e-05,
"loss": 0.3452,
"step": 18810
},
{
"epoch": 24.066496163682864,
"grad_norm": 1.596673846244812,
"learning_rate": 1.0104529616724739e-05,
"loss": 0.2268,
"step": 18820
},
{
"epoch": 24.07928388746803,
"grad_norm": 5.229015350341797,
"learning_rate": 1.0082752613240419e-05,
"loss": 0.2382,
"step": 18830
},
{
"epoch": 24.092071611253196,
"grad_norm": 0.011357109993696213,
"learning_rate": 1.0060975609756099e-05,
"loss": 0.2029,
"step": 18840
},
{
"epoch": 24.104859335038363,
"grad_norm": 2.772266387939453,
"learning_rate": 1.0039198606271777e-05,
"loss": 0.1519,
"step": 18850
},
{
"epoch": 24.11764705882353,
"grad_norm": 5.571649074554443,
"learning_rate": 1.0017421602787457e-05,
"loss": 0.299,
"step": 18860
},
{
"epoch": 24.130434782608695,
"grad_norm": 4.471550941467285,
"learning_rate": 9.995644599303135e-06,
"loss": 0.1408,
"step": 18870
},
{
"epoch": 24.14322250639386,
"grad_norm": 0.002381289144977927,
"learning_rate": 9.973867595818816e-06,
"loss": 0.2127,
"step": 18880
},
{
"epoch": 24.156010230179028,
"grad_norm": 0.3346289396286011,
"learning_rate": 9.952090592334496e-06,
"loss": 0.3573,
"step": 18890
},
{
"epoch": 24.168797953964194,
"grad_norm": 6.417741775512695,
"learning_rate": 9.930313588850176e-06,
"loss": 0.1684,
"step": 18900
},
{
"epoch": 24.18158567774936,
"grad_norm": 0.4619981348514557,
"learning_rate": 9.908536585365854e-06,
"loss": 0.3421,
"step": 18910
},
{
"epoch": 24.194373401534527,
"grad_norm": 2.542964458465576,
"learning_rate": 9.886759581881534e-06,
"loss": 0.1321,
"step": 18920
},
{
"epoch": 24.207161125319693,
"grad_norm": 3.8259899616241455,
"learning_rate": 9.864982578397212e-06,
"loss": 0.301,
"step": 18930
},
{
"epoch": 24.21994884910486,
"grad_norm": 3.5682637691497803,
"learning_rate": 9.843205574912892e-06,
"loss": 0.2302,
"step": 18940
},
{
"epoch": 24.232736572890026,
"grad_norm": 7.466248512268066,
"learning_rate": 9.821428571428573e-06,
"loss": 0.2322,
"step": 18950
},
{
"epoch": 24.245524296675192,
"grad_norm": 4.964677333831787,
"learning_rate": 9.799651567944251e-06,
"loss": 0.3205,
"step": 18960
},
{
"epoch": 24.25831202046036,
"grad_norm": 0.6600395441055298,
"learning_rate": 9.777874564459931e-06,
"loss": 0.3198,
"step": 18970
},
{
"epoch": 24.271099744245525,
"grad_norm": 0.0014609363861382008,
"learning_rate": 9.756097560975611e-06,
"loss": 0.2724,
"step": 18980
},
{
"epoch": 24.28388746803069,
"grad_norm": 1.3109701871871948,
"learning_rate": 9.734320557491291e-06,
"loss": 0.4884,
"step": 18990
},
{
"epoch": 24.296675191815858,
"grad_norm": 1.394028902053833,
"learning_rate": 9.712543554006968e-06,
"loss": 0.2897,
"step": 19000
},
{
"epoch": 24.309462915601024,
"grad_norm": 3.291472911834717,
"learning_rate": 9.690766550522648e-06,
"loss": 0.2281,
"step": 19010
},
{
"epoch": 24.32225063938619,
"grad_norm": 0.004317448474466801,
"learning_rate": 9.668989547038328e-06,
"loss": 0.1659,
"step": 19020
},
{
"epoch": 24.335038363171357,
"grad_norm": 0.20717021822929382,
"learning_rate": 9.647212543554008e-06,
"loss": 0.2681,
"step": 19030
},
{
"epoch": 24.347826086956523,
"grad_norm": 0.0002472202468197793,
"learning_rate": 9.625435540069686e-06,
"loss": 0.2402,
"step": 19040
},
{
"epoch": 24.36061381074169,
"grad_norm": 8.822566032409668,
"learning_rate": 9.603658536585366e-06,
"loss": 0.2797,
"step": 19050
},
{
"epoch": 24.373401534526856,
"grad_norm": 6.60060453414917,
"learning_rate": 9.581881533101046e-06,
"loss": 0.1299,
"step": 19060
},
{
"epoch": 24.38618925831202,
"grad_norm": 0.008101239800453186,
"learning_rate": 9.560104529616726e-06,
"loss": 0.2473,
"step": 19070
},
{
"epoch": 24.398976982097185,
"grad_norm": 1.5664035081863403,
"learning_rate": 9.538327526132405e-06,
"loss": 0.148,
"step": 19080
},
{
"epoch": 24.41176470588235,
"grad_norm": 1.6293203830718994,
"learning_rate": 9.516550522648083e-06,
"loss": 0.0677,
"step": 19090
},
{
"epoch": 24.424552429667518,
"grad_norm": 3.834322214126587,
"learning_rate": 9.494773519163763e-06,
"loss": 0.1932,
"step": 19100
},
{
"epoch": 24.437340153452684,
"grad_norm": 0.9427624940872192,
"learning_rate": 9.472996515679443e-06,
"loss": 0.3893,
"step": 19110
},
{
"epoch": 24.45012787723785,
"grad_norm": 5.037188529968262,
"learning_rate": 9.451219512195123e-06,
"loss": 0.2438,
"step": 19120
},
{
"epoch": 24.462915601023017,
"grad_norm": 3.3101885318756104,
"learning_rate": 9.429442508710802e-06,
"loss": 0.1255,
"step": 19130
},
{
"epoch": 24.475703324808183,
"grad_norm": 3.3141419887542725,
"learning_rate": 9.407665505226482e-06,
"loss": 0.2537,
"step": 19140
},
{
"epoch": 24.48849104859335,
"grad_norm": 10.10535717010498,
"learning_rate": 9.385888501742162e-06,
"loss": 0.5465,
"step": 19150
},
{
"epoch": 24.501278772378516,
"grad_norm": 2.9780263900756836,
"learning_rate": 9.36411149825784e-06,
"loss": 0.2291,
"step": 19160
},
{
"epoch": 24.514066496163682,
"grad_norm": 0.05312852934002876,
"learning_rate": 9.342334494773518e-06,
"loss": 0.1619,
"step": 19170
},
{
"epoch": 24.52685421994885,
"grad_norm": 0.6286771297454834,
"learning_rate": 9.320557491289198e-06,
"loss": 0.2531,
"step": 19180
},
{
"epoch": 24.539641943734015,
"grad_norm": 10.184651374816895,
"learning_rate": 9.298780487804879e-06,
"loss": 0.3081,
"step": 19190
},
{
"epoch": 24.55242966751918,
"grad_norm": 3.5842103958129883,
"learning_rate": 9.277003484320559e-06,
"loss": 0.1545,
"step": 19200
},
{
"epoch": 24.565217391304348,
"grad_norm": 0.006370662711560726,
"learning_rate": 9.255226480836237e-06,
"loss": 0.2799,
"step": 19210
},
{
"epoch": 24.578005115089514,
"grad_norm": 6.6300368309021,
"learning_rate": 9.233449477351917e-06,
"loss": 0.1704,
"step": 19220
},
{
"epoch": 24.59079283887468,
"grad_norm": 0.029947593808174133,
"learning_rate": 9.211672473867597e-06,
"loss": 0.1921,
"step": 19230
},
{
"epoch": 24.603580562659847,
"grad_norm": 6.361519813537598,
"learning_rate": 9.189895470383275e-06,
"loss": 0.386,
"step": 19240
},
{
"epoch": 24.616368286445013,
"grad_norm": 8.52017593383789,
"learning_rate": 9.168118466898955e-06,
"loss": 0.1906,
"step": 19250
},
{
"epoch": 24.62915601023018,
"grad_norm": 0.013848032802343369,
"learning_rate": 9.146341463414634e-06,
"loss": 0.1493,
"step": 19260
},
{
"epoch": 24.641943734015346,
"grad_norm": 0.8284606337547302,
"learning_rate": 9.124564459930314e-06,
"loss": 0.1113,
"step": 19270
},
{
"epoch": 24.654731457800512,
"grad_norm": 0.40283098816871643,
"learning_rate": 9.102787456445994e-06,
"loss": 0.211,
"step": 19280
},
{
"epoch": 24.66751918158568,
"grad_norm": 1.3809633255004883,
"learning_rate": 9.081010452961674e-06,
"loss": 0.1649,
"step": 19290
},
{
"epoch": 24.680306905370845,
"grad_norm": 0.8382818102836609,
"learning_rate": 9.059233449477352e-06,
"loss": 0.3372,
"step": 19300
},
{
"epoch": 24.69309462915601,
"grad_norm": 3.602656126022339,
"learning_rate": 9.037456445993032e-06,
"loss": 0.1013,
"step": 19310
},
{
"epoch": 24.705882352941178,
"grad_norm": 3.5405521392822266,
"learning_rate": 9.01567944250871e-06,
"loss": 0.203,
"step": 19320
},
{
"epoch": 24.718670076726344,
"grad_norm": 2.895538330078125,
"learning_rate": 8.99390243902439e-06,
"loss": 0.1782,
"step": 19330
},
{
"epoch": 24.73145780051151,
"grad_norm": 0.021150898188352585,
"learning_rate": 8.972125435540069e-06,
"loss": 0.2326,
"step": 19340
},
{
"epoch": 24.744245524296677,
"grad_norm": 6.41797399520874,
"learning_rate": 8.95034843205575e-06,
"loss": 0.3114,
"step": 19350
},
{
"epoch": 24.757033248081843,
"grad_norm": 4.61984395980835,
"learning_rate": 8.92857142857143e-06,
"loss": 0.2526,
"step": 19360
},
{
"epoch": 24.76982097186701,
"grad_norm": 0.7206798195838928,
"learning_rate": 8.90679442508711e-06,
"loss": 0.2294,
"step": 19370
},
{
"epoch": 24.782608695652176,
"grad_norm": 0.021268386393785477,
"learning_rate": 8.885017421602788e-06,
"loss": 0.2442,
"step": 19380
},
{
"epoch": 24.79539641943734,
"grad_norm": 1.2543253898620605,
"learning_rate": 8.863240418118466e-06,
"loss": 0.3759,
"step": 19390
},
{
"epoch": 24.808184143222505,
"grad_norm": 6.605262279510498,
"learning_rate": 8.841463414634146e-06,
"loss": 0.2954,
"step": 19400
},
{
"epoch": 24.82097186700767,
"grad_norm": 0.008203168399631977,
"learning_rate": 8.819686411149826e-06,
"loss": 0.3309,
"step": 19410
},
{
"epoch": 24.833759590792837,
"grad_norm": 0.682137131690979,
"learning_rate": 8.797909407665506e-06,
"loss": 0.1424,
"step": 19420
},
{
"epoch": 24.846547314578004,
"grad_norm": 0.4394625723361969,
"learning_rate": 8.776132404181185e-06,
"loss": 0.3464,
"step": 19430
},
{
"epoch": 24.85933503836317,
"grad_norm": 0.0014602456940338016,
"learning_rate": 8.754355400696865e-06,
"loss": 0.0397,
"step": 19440
},
{
"epoch": 24.872122762148337,
"grad_norm": 4.264637470245361,
"learning_rate": 8.732578397212545e-06,
"loss": 0.4722,
"step": 19450
},
{
"epoch": 24.884910485933503,
"grad_norm": 1.0657953023910522,
"learning_rate": 8.710801393728225e-06,
"loss": 0.0911,
"step": 19460
},
{
"epoch": 24.89769820971867,
"grad_norm": 1.0639190673828125,
"learning_rate": 8.689024390243903e-06,
"loss": 0.2563,
"step": 19470
},
{
"epoch": 24.910485933503836,
"grad_norm": 0.0038846738170832396,
"learning_rate": 8.667247386759581e-06,
"loss": 0.2149,
"step": 19480
},
{
"epoch": 24.923273657289002,
"grad_norm": 5.282316207885742,
"learning_rate": 8.645470383275261e-06,
"loss": 0.2069,
"step": 19490
},
{
"epoch": 24.93606138107417,
"grad_norm": 0.00032713127438910306,
"learning_rate": 8.623693379790942e-06,
"loss": 0.1029,
"step": 19500
},
{
"epoch": 24.948849104859335,
"grad_norm": 4.110812187194824,
"learning_rate": 8.601916376306622e-06,
"loss": 0.296,
"step": 19510
},
{
"epoch": 24.9616368286445,
"grad_norm": 0.4580451250076294,
"learning_rate": 8.5801393728223e-06,
"loss": 0.3531,
"step": 19520
},
{
"epoch": 24.974424552429667,
"grad_norm": 0.22135113179683685,
"learning_rate": 8.55836236933798e-06,
"loss": 0.2273,
"step": 19530
},
{
"epoch": 24.987212276214834,
"grad_norm": 5.510282039642334,
"learning_rate": 8.53658536585366e-06,
"loss": 0.1763,
"step": 19540
},
{
"epoch": 25.0,
"grad_norm": 1.8533436059951782,
"learning_rate": 8.514808362369338e-06,
"loss": 0.2936,
"step": 19550
},
{
"epoch": 25.0,
"eval_loss": 0.18129856884479523,
"eval_runtime": 0.8207,
"eval_samples_per_second": 119.413,
"eval_steps_per_second": 15.84,
"step": 19550
},
{
"epoch": 25.012787723785166,
"grad_norm": 0.0017755866283550858,
"learning_rate": 8.493031358885017e-06,
"loss": 0.1499,
"step": 19560
},
{
"epoch": 25.025575447570333,
"grad_norm": 0.0011428899597376585,
"learning_rate": 8.471254355400697e-06,
"loss": 0.2138,
"step": 19570
},
{
"epoch": 25.0383631713555,
"grad_norm": 0.0004178075469098985,
"learning_rate": 8.449477351916377e-06,
"loss": 0.1578,
"step": 19580
},
{
"epoch": 25.051150895140665,
"grad_norm": 0.8963024020195007,
"learning_rate": 8.427700348432057e-06,
"loss": 0.1294,
"step": 19590
},
{
"epoch": 25.06393861892583,
"grad_norm": 3.469683885574341,
"learning_rate": 8.405923344947735e-06,
"loss": 0.2863,
"step": 19600
},
{
"epoch": 25.076726342710998,
"grad_norm": 0.030634768307209015,
"learning_rate": 8.384146341463415e-06,
"loss": 0.1694,
"step": 19610
},
{
"epoch": 25.089514066496164,
"grad_norm": 0.008396928198635578,
"learning_rate": 8.362369337979095e-06,
"loss": 0.303,
"step": 19620
},
{
"epoch": 25.10230179028133,
"grad_norm": 0.4458426237106323,
"learning_rate": 8.340592334494774e-06,
"loss": 0.4059,
"step": 19630
},
{
"epoch": 25.115089514066497,
"grad_norm": 2.2187139987945557,
"learning_rate": 8.318815331010454e-06,
"loss": 0.2115,
"step": 19640
},
{
"epoch": 25.127877237851663,
"grad_norm": 4.11098051071167,
"learning_rate": 8.297038327526132e-06,
"loss": 0.3253,
"step": 19650
},
{
"epoch": 25.14066496163683,
"grad_norm": 3.1576087474823,
"learning_rate": 8.275261324041812e-06,
"loss": 0.3276,
"step": 19660
},
{
"epoch": 25.153452685421996,
"grad_norm": 2.0662500858306885,
"learning_rate": 8.253484320557492e-06,
"loss": 0.1996,
"step": 19670
},
{
"epoch": 25.166240409207163,
"grad_norm": 0.3466600775718689,
"learning_rate": 8.231707317073172e-06,
"loss": 0.1362,
"step": 19680
},
{
"epoch": 25.17902813299233,
"grad_norm": 6.13591194152832,
"learning_rate": 8.20993031358885e-06,
"loss": 0.2625,
"step": 19690
},
{
"epoch": 25.191815856777495,
"grad_norm": 1.5550795793533325,
"learning_rate": 8.188153310104529e-06,
"loss": 0.0495,
"step": 19700
},
{
"epoch": 25.20460358056266,
"grad_norm": 3.5859768390655518,
"learning_rate": 8.166376306620209e-06,
"loss": 0.3586,
"step": 19710
},
{
"epoch": 25.217391304347824,
"grad_norm": 0.09591033309698105,
"learning_rate": 8.144599303135889e-06,
"loss": 0.0664,
"step": 19720
},
{
"epoch": 25.23017902813299,
"grad_norm": 1.0985368490219116,
"learning_rate": 8.122822299651567e-06,
"loss": 0.2419,
"step": 19730
},
{
"epoch": 25.242966751918157,
"grad_norm": 7.924480438232422,
"learning_rate": 8.101045296167248e-06,
"loss": 0.2375,
"step": 19740
},
{
"epoch": 25.255754475703323,
"grad_norm": 5.2205634117126465,
"learning_rate": 8.079268292682928e-06,
"loss": 0.4546,
"step": 19750
},
{
"epoch": 25.26854219948849,
"grad_norm": 1.323668360710144,
"learning_rate": 8.057491289198608e-06,
"loss": 0.2154,
"step": 19760
},
{
"epoch": 25.281329923273656,
"grad_norm": 3.7929680347442627,
"learning_rate": 8.035714285714286e-06,
"loss": 0.178,
"step": 19770
},
{
"epoch": 25.294117647058822,
"grad_norm": 0.025306949391961098,
"learning_rate": 8.013937282229964e-06,
"loss": 0.1248,
"step": 19780
},
{
"epoch": 25.30690537084399,
"grad_norm": 0.06079568713903427,
"learning_rate": 7.992160278745644e-06,
"loss": 0.0619,
"step": 19790
},
{
"epoch": 25.319693094629155,
"grad_norm": 6.62827730178833,
"learning_rate": 7.970383275261324e-06,
"loss": 0.1882,
"step": 19800
},
{
"epoch": 25.33248081841432,
"grad_norm": 0.04379437118768692,
"learning_rate": 7.948606271777004e-06,
"loss": 0.2076,
"step": 19810
},
{
"epoch": 25.345268542199488,
"grad_norm": 3.640428304672241,
"learning_rate": 7.926829268292683e-06,
"loss": 0.2472,
"step": 19820
},
{
"epoch": 25.358056265984654,
"grad_norm": 0.32952073216438293,
"learning_rate": 7.905052264808363e-06,
"loss": 0.2234,
"step": 19830
},
{
"epoch": 25.37084398976982,
"grad_norm": 0.0016094455495476723,
"learning_rate": 7.883275261324043e-06,
"loss": 0.1618,
"step": 19840
},
{
"epoch": 25.383631713554987,
"grad_norm": 2.6373326778411865,
"learning_rate": 7.861498257839723e-06,
"loss": 0.2415,
"step": 19850
},
{
"epoch": 25.396419437340153,
"grad_norm": 0.0004819360328838229,
"learning_rate": 7.8397212543554e-06,
"loss": 0.2548,
"step": 19860
},
{
"epoch": 25.40920716112532,
"grad_norm": 2.0560288429260254,
"learning_rate": 7.81794425087108e-06,
"loss": 0.1283,
"step": 19870
},
{
"epoch": 25.421994884910486,
"grad_norm": 5.6980133056640625,
"learning_rate": 7.79616724738676e-06,
"loss": 0.2889,
"step": 19880
},
{
"epoch": 25.434782608695652,
"grad_norm": 3.6380951404571533,
"learning_rate": 7.77439024390244e-06,
"loss": 0.2188,
"step": 19890
},
{
"epoch": 25.44757033248082,
"grad_norm": 3.955148458480835,
"learning_rate": 7.752613240418118e-06,
"loss": 0.3087,
"step": 19900
},
{
"epoch": 25.460358056265985,
"grad_norm": 0.12818066775798798,
"learning_rate": 7.730836236933798e-06,
"loss": 0.2401,
"step": 19910
},
{
"epoch": 25.47314578005115,
"grad_norm": 1.8835790157318115,
"learning_rate": 7.709059233449478e-06,
"loss": 0.1892,
"step": 19920
},
{
"epoch": 25.485933503836318,
"grad_norm": 1.2346657514572144,
"learning_rate": 7.687282229965158e-06,
"loss": 0.152,
"step": 19930
},
{
"epoch": 25.498721227621484,
"grad_norm": 5.731377124786377,
"learning_rate": 7.665505226480837e-06,
"loss": 0.1053,
"step": 19940
},
{
"epoch": 25.51150895140665,
"grad_norm": 0.0032745820935815573,
"learning_rate": 7.643728222996515e-06,
"loss": 0.3134,
"step": 19950
},
{
"epoch": 25.524296675191817,
"grad_norm": 5.341978073120117,
"learning_rate": 7.621951219512195e-06,
"loss": 0.2932,
"step": 19960
},
{
"epoch": 25.537084398976983,
"grad_norm": 2.9654135704040527,
"learning_rate": 7.600174216027875e-06,
"loss": 0.3528,
"step": 19970
},
{
"epoch": 25.54987212276215,
"grad_norm": 0.0003606612444855273,
"learning_rate": 7.578397212543554e-06,
"loss": 0.1237,
"step": 19980
},
{
"epoch": 25.562659846547316,
"grad_norm": 0.044729363173246384,
"learning_rate": 7.5566202090592344e-06,
"loss": 0.3323,
"step": 19990
},
{
"epoch": 25.575447570332482,
"grad_norm": 0.0004105431435164064,
"learning_rate": 7.534843205574914e-06,
"loss": 0.1374,
"step": 20000
},
{
"epoch": 25.58823529411765,
"grad_norm": 3.488405227661133,
"learning_rate": 7.513066202090592e-06,
"loss": 0.2517,
"step": 20010
},
{
"epoch": 25.601023017902815,
"grad_norm": 0.0034214009065181017,
"learning_rate": 7.491289198606272e-06,
"loss": 0.4066,
"step": 20020
},
{
"epoch": 25.61381074168798,
"grad_norm": 3.4584133625030518,
"learning_rate": 7.469512195121951e-06,
"loss": 0.2964,
"step": 20030
},
{
"epoch": 25.626598465473144,
"grad_norm": 0.000964305188972503,
"learning_rate": 7.447735191637631e-06,
"loss": 0.2671,
"step": 20040
},
{
"epoch": 25.63938618925831,
"grad_norm": 2.860732078552246,
"learning_rate": 7.4259581881533105e-06,
"loss": 0.1305,
"step": 20050
},
{
"epoch": 25.652173913043477,
"grad_norm": 2.513005495071411,
"learning_rate": 7.4041811846689906e-06,
"loss": 0.2967,
"step": 20060
},
{
"epoch": 25.664961636828643,
"grad_norm": 3.4734206199645996,
"learning_rate": 7.38240418118467e-06,
"loss": 0.2739,
"step": 20070
},
{
"epoch": 25.67774936061381,
"grad_norm": 0.8101391792297363,
"learning_rate": 7.360627177700349e-06,
"loss": 0.2415,
"step": 20080
},
{
"epoch": 25.690537084398976,
"grad_norm": 0.00019734690431505442,
"learning_rate": 7.338850174216027e-06,
"loss": 0.2654,
"step": 20090
},
{
"epoch": 25.703324808184142,
"grad_norm": 1.5274137258529663,
"learning_rate": 7.317073170731707e-06,
"loss": 0.1765,
"step": 20100
},
{
"epoch": 25.71611253196931,
"grad_norm": 0.002199924550950527,
"learning_rate": 7.295296167247387e-06,
"loss": 0.4541,
"step": 20110
},
{
"epoch": 25.728900255754475,
"grad_norm": 0.11991043388843536,
"learning_rate": 7.273519163763067e-06,
"loss": 0.1748,
"step": 20120
},
{
"epoch": 25.74168797953964,
"grad_norm": 2.977264642715454,
"learning_rate": 7.251742160278746e-06,
"loss": 0.187,
"step": 20130
},
{
"epoch": 25.754475703324808,
"grad_norm": 6.003963947296143,
"learning_rate": 7.229965156794426e-06,
"loss": 0.3517,
"step": 20140
},
{
"epoch": 25.767263427109974,
"grad_norm": 0.1923568695783615,
"learning_rate": 7.208188153310105e-06,
"loss": 0.2537,
"step": 20150
},
{
"epoch": 25.78005115089514,
"grad_norm": 6.205975532531738,
"learning_rate": 7.186411149825785e-06,
"loss": 0.2591,
"step": 20160
},
{
"epoch": 25.792838874680307,
"grad_norm": 5.6605753898620605,
"learning_rate": 7.1646341463414635e-06,
"loss": 0.401,
"step": 20170
},
{
"epoch": 25.805626598465473,
"grad_norm": 1.7978140115737915,
"learning_rate": 7.142857142857143e-06,
"loss": 0.184,
"step": 20180
},
{
"epoch": 25.81841432225064,
"grad_norm": 1.2769854068756104,
"learning_rate": 7.121080139372823e-06,
"loss": 0.2435,
"step": 20190
},
{
"epoch": 25.831202046035806,
"grad_norm": 6.43965482711792,
"learning_rate": 7.099303135888502e-06,
"loss": 0.3531,
"step": 20200
},
{
"epoch": 25.843989769820972,
"grad_norm": 6.394824981689453,
"learning_rate": 7.077526132404182e-06,
"loss": 0.2703,
"step": 20210
},
{
"epoch": 25.85677749360614,
"grad_norm": 3.0788609981536865,
"learning_rate": 7.055749128919861e-06,
"loss": 0.3849,
"step": 20220
},
{
"epoch": 25.869565217391305,
"grad_norm": 2.7930312156677246,
"learning_rate": 7.033972125435541e-06,
"loss": 0.3325,
"step": 20230
},
{
"epoch": 25.88235294117647,
"grad_norm": 5.459298133850098,
"learning_rate": 7.0121951219512205e-06,
"loss": 0.2963,
"step": 20240
},
{
"epoch": 25.895140664961637,
"grad_norm": 3.3363773822784424,
"learning_rate": 6.990418118466899e-06,
"loss": 0.1328,
"step": 20250
},
{
"epoch": 25.907928388746804,
"grad_norm": 0.9299935102462769,
"learning_rate": 6.968641114982578e-06,
"loss": 0.1498,
"step": 20260
},
{
"epoch": 25.92071611253197,
"grad_norm": 0.6399292349815369,
"learning_rate": 6.946864111498258e-06,
"loss": 0.2621,
"step": 20270
},
{
"epoch": 25.933503836317136,
"grad_norm": 3.862414836883545,
"learning_rate": 6.925087108013937e-06,
"loss": 0.1487,
"step": 20280
},
{
"epoch": 25.946291560102303,
"grad_norm": 2.322935104370117,
"learning_rate": 6.903310104529617e-06,
"loss": 0.1076,
"step": 20290
},
{
"epoch": 25.95907928388747,
"grad_norm": 2.0821304321289062,
"learning_rate": 6.8815331010452966e-06,
"loss": 0.1143,
"step": 20300
},
{
"epoch": 25.971867007672635,
"grad_norm": 1.0014243125915527,
"learning_rate": 6.859756097560977e-06,
"loss": 0.2203,
"step": 20310
},
{
"epoch": 25.984654731457802,
"grad_norm": 5.317222595214844,
"learning_rate": 6.837979094076655e-06,
"loss": 0.2636,
"step": 20320
},
{
"epoch": 25.997442455242968,
"grad_norm": 2.082972526550293,
"learning_rate": 6.816202090592334e-06,
"loss": 0.2516,
"step": 20330
},
{
"epoch": 26.0,
"eval_loss": 0.18091708421707153,
"eval_runtime": 0.9859,
"eval_samples_per_second": 99.403,
"eval_steps_per_second": 13.186,
"step": 20332
},
{
"epoch": 26.010230179028135,
"grad_norm": 0.04056939110159874,
"learning_rate": 6.794425087108014e-06,
"loss": 0.1934,
"step": 20340
},
{
"epoch": 26.0230179028133,
"grad_norm": 0.04060182347893715,
"learning_rate": 6.7726480836236934e-06,
"loss": 0.1245,
"step": 20350
},
{
"epoch": 26.035805626598467,
"grad_norm": 0.0002620798477437347,
"learning_rate": 6.7508710801393735e-06,
"loss": 0.1996,
"step": 20360
},
{
"epoch": 26.04859335038363,
"grad_norm": 0.953183650970459,
"learning_rate": 6.729094076655053e-06,
"loss": 0.1799,
"step": 20370
},
{
"epoch": 26.061381074168796,
"grad_norm": 1.502600908279419,
"learning_rate": 6.707317073170733e-06,
"loss": 0.2447,
"step": 20380
},
{
"epoch": 26.074168797953963,
"grad_norm": 3.2614762783050537,
"learning_rate": 6.685540069686412e-06,
"loss": 0.1171,
"step": 20390
},
{
"epoch": 26.08695652173913,
"grad_norm": 6.770511627197266,
"learning_rate": 6.66376306620209e-06,
"loss": 0.2494,
"step": 20400
},
{
"epoch": 26.099744245524295,
"grad_norm": 7.8673834800720215,
"learning_rate": 6.6419860627177695e-06,
"loss": 0.2198,
"step": 20410
},
{
"epoch": 26.11253196930946,
"grad_norm": 0.6085989475250244,
"learning_rate": 6.6202090592334496e-06,
"loss": 0.0444,
"step": 20420
},
{
"epoch": 26.125319693094628,
"grad_norm": 9.129613876342773,
"learning_rate": 6.598432055749129e-06,
"loss": 0.218,
"step": 20430
},
{
"epoch": 26.138107416879794,
"grad_norm": 4.52681303024292,
"learning_rate": 6.576655052264809e-06,
"loss": 0.4481,
"step": 20440
},
{
"epoch": 26.15089514066496,
"grad_norm": 10.57491683959961,
"learning_rate": 6.554878048780488e-06,
"loss": 0.2445,
"step": 20450
},
{
"epoch": 26.163682864450127,
"grad_norm": 5.382693767547607,
"learning_rate": 6.533101045296168e-06,
"loss": 0.2412,
"step": 20460
},
{
"epoch": 26.176470588235293,
"grad_norm": 1.7417715787887573,
"learning_rate": 6.511324041811847e-06,
"loss": 0.0359,
"step": 20470
},
{
"epoch": 26.18925831202046,
"grad_norm": 0.5911231637001038,
"learning_rate": 6.489547038327526e-06,
"loss": 0.2865,
"step": 20480
},
{
"epoch": 26.202046035805626,
"grad_norm": 1.412896990776062,
"learning_rate": 6.467770034843206e-06,
"loss": 0.1666,
"step": 20490
},
{
"epoch": 26.214833759590793,
"grad_norm": 1.6051143407821655,
"learning_rate": 6.445993031358885e-06,
"loss": 0.2141,
"step": 20500
},
{
"epoch": 26.22762148337596,
"grad_norm": 2.5996508598327637,
"learning_rate": 6.424216027874565e-06,
"loss": 0.2444,
"step": 20510
},
{
"epoch": 26.240409207161125,
"grad_norm": 5.24091100692749,
"learning_rate": 6.402439024390244e-06,
"loss": 0.3557,
"step": 20520
},
{
"epoch": 26.25319693094629,
"grad_norm": 0.9117621183395386,
"learning_rate": 6.380662020905924e-06,
"loss": 0.1892,
"step": 20530
},
{
"epoch": 26.265984654731458,
"grad_norm": 8.387924194335938,
"learning_rate": 6.3588850174216034e-06,
"loss": 0.1204,
"step": 20540
},
{
"epoch": 26.278772378516624,
"grad_norm": 0.32349109649658203,
"learning_rate": 6.3371080139372835e-06,
"loss": 0.1098,
"step": 20550
},
{
"epoch": 26.29156010230179,
"grad_norm": 3.7538986206054688,
"learning_rate": 6.315331010452962e-06,
"loss": 0.3294,
"step": 20560
},
{
"epoch": 26.304347826086957,
"grad_norm": 0.8818411827087402,
"learning_rate": 6.293554006968641e-06,
"loss": 0.1728,
"step": 20570
},
{
"epoch": 26.317135549872123,
"grad_norm": 0.0003087719378527254,
"learning_rate": 6.271777003484321e-06,
"loss": 0.2044,
"step": 20580
},
{
"epoch": 26.32992327365729,
"grad_norm": 4.227612495422363,
"learning_rate": 6.25e-06,
"loss": 0.1996,
"step": 20590
},
{
"epoch": 26.342710997442456,
"grad_norm": 8.944961547851562,
"learning_rate": 6.2282229965156795e-06,
"loss": 0.308,
"step": 20600
},
{
"epoch": 26.355498721227622,
"grad_norm": 0.040556784719228745,
"learning_rate": 6.206445993031359e-06,
"loss": 0.1636,
"step": 20610
},
{
"epoch": 26.36828644501279,
"grad_norm": 0.9798309206962585,
"learning_rate": 6.184668989547039e-06,
"loss": 0.1374,
"step": 20620
},
{
"epoch": 26.381074168797955,
"grad_norm": 12.984233856201172,
"learning_rate": 6.162891986062718e-06,
"loss": 0.3852,
"step": 20630
},
{
"epoch": 26.39386189258312,
"grad_norm": 3.31282901763916,
"learning_rate": 6.141114982578398e-06,
"loss": 0.2366,
"step": 20640
},
{
"epoch": 26.406649616368288,
"grad_norm": 3.520981788635254,
"learning_rate": 6.119337979094076e-06,
"loss": 0.3183,
"step": 20650
},
{
"epoch": 26.419437340153454,
"grad_norm": 4.874128818511963,
"learning_rate": 6.0975609756097564e-06,
"loss": 0.3643,
"step": 20660
},
{
"epoch": 26.43222506393862,
"grad_norm": 2.41288685798645,
"learning_rate": 6.075783972125436e-06,
"loss": 0.2655,
"step": 20670
},
{
"epoch": 26.445012787723787,
"grad_norm": 0.03236719220876694,
"learning_rate": 6.054006968641116e-06,
"loss": 0.1718,
"step": 20680
},
{
"epoch": 26.45780051150895,
"grad_norm": 1.0233948230743408,
"learning_rate": 6.032229965156794e-06,
"loss": 0.2505,
"step": 20690
},
{
"epoch": 26.470588235294116,
"grad_norm": 7.505443096160889,
"learning_rate": 6.010452961672474e-06,
"loss": 0.3001,
"step": 20700
},
{
"epoch": 26.483375959079282,
"grad_norm": 0.565833330154419,
"learning_rate": 5.988675958188153e-06,
"loss": 0.2213,
"step": 20710
},
{
"epoch": 26.49616368286445,
"grad_norm": 2.7456254959106445,
"learning_rate": 5.966898954703833e-06,
"loss": 0.4087,
"step": 20720
},
{
"epoch": 26.508951406649615,
"grad_norm": 8.262672424316406,
"learning_rate": 5.9451219512195126e-06,
"loss": 0.2174,
"step": 20730
},
{
"epoch": 26.52173913043478,
"grad_norm": 1.9031331539154053,
"learning_rate": 5.923344947735192e-06,
"loss": 0.254,
"step": 20740
},
{
"epoch": 26.534526854219948,
"grad_norm": 8.823184967041016,
"learning_rate": 5.901567944250872e-06,
"loss": 0.5003,
"step": 20750
},
{
"epoch": 26.547314578005114,
"grad_norm": 0.31433001160621643,
"learning_rate": 5.879790940766551e-06,
"loss": 0.2981,
"step": 20760
},
{
"epoch": 26.56010230179028,
"grad_norm": 5.347137451171875,
"learning_rate": 5.85801393728223e-06,
"loss": 0.4345,
"step": 20770
},
{
"epoch": 26.572890025575447,
"grad_norm": 0.94664466381073,
"learning_rate": 5.8362369337979094e-06,
"loss": 0.2791,
"step": 20780
},
{
"epoch": 26.585677749360613,
"grad_norm": 1.536033272743225,
"learning_rate": 5.8144599303135895e-06,
"loss": 0.1489,
"step": 20790
},
{
"epoch": 26.59846547314578,
"grad_norm": 0.5675119757652283,
"learning_rate": 5.792682926829269e-06,
"loss": 0.2372,
"step": 20800
},
{
"epoch": 26.611253196930946,
"grad_norm": 0.9500277638435364,
"learning_rate": 5.770905923344948e-06,
"loss": 0.2659,
"step": 20810
},
{
"epoch": 26.624040920716112,
"grad_norm": 6.701987266540527,
"learning_rate": 5.749128919860627e-06,
"loss": 0.1731,
"step": 20820
},
{
"epoch": 26.63682864450128,
"grad_norm": 5.811104774475098,
"learning_rate": 5.727351916376307e-06,
"loss": 0.2963,
"step": 20830
},
{
"epoch": 26.649616368286445,
"grad_norm": 1.2386894226074219,
"learning_rate": 5.705574912891986e-06,
"loss": 0.1571,
"step": 20840
},
{
"epoch": 26.66240409207161,
"grad_norm": 3.480888843536377,
"learning_rate": 5.6837979094076656e-06,
"loss": 0.3523,
"step": 20850
},
{
"epoch": 26.675191815856778,
"grad_norm": 0.00016630532627459615,
"learning_rate": 5.662020905923345e-06,
"loss": 0.0731,
"step": 20860
},
{
"epoch": 26.687979539641944,
"grad_norm": 6.070654392242432,
"learning_rate": 5.640243902439025e-06,
"loss": 0.2237,
"step": 20870
},
{
"epoch": 26.70076726342711,
"grad_norm": 3.9732308387756348,
"learning_rate": 5.618466898954704e-06,
"loss": 0.2249,
"step": 20880
},
{
"epoch": 26.713554987212277,
"grad_norm": 1.2122111320495605,
"learning_rate": 5.596689895470383e-06,
"loss": 0.145,
"step": 20890
},
{
"epoch": 26.726342710997443,
"grad_norm": 4.583484649658203,
"learning_rate": 5.574912891986063e-06,
"loss": 0.2378,
"step": 20900
},
{
"epoch": 26.73913043478261,
"grad_norm": 3.717529535293579,
"learning_rate": 5.5531358885017425e-06,
"loss": 0.2853,
"step": 20910
},
{
"epoch": 26.751918158567776,
"grad_norm": 4.097237586975098,
"learning_rate": 5.531358885017422e-06,
"loss": 0.2265,
"step": 20920
},
{
"epoch": 26.764705882352942,
"grad_norm": 0.002175225643441081,
"learning_rate": 5.509581881533101e-06,
"loss": 0.0507,
"step": 20930
},
{
"epoch": 26.77749360613811,
"grad_norm": 0.701464831829071,
"learning_rate": 5.487804878048781e-06,
"loss": 0.265,
"step": 20940
},
{
"epoch": 26.790281329923275,
"grad_norm": 0.14686907827854156,
"learning_rate": 5.46602787456446e-06,
"loss": 0.0828,
"step": 20950
},
{
"epoch": 26.80306905370844,
"grad_norm": 2.6718368530273438,
"learning_rate": 5.444250871080139e-06,
"loss": 0.3893,
"step": 20960
},
{
"epoch": 26.815856777493607,
"grad_norm": 0.008024285547435284,
"learning_rate": 5.4224738675958186e-06,
"loss": 0.206,
"step": 20970
},
{
"epoch": 26.828644501278774,
"grad_norm": 0.00877557322382927,
"learning_rate": 5.400696864111499e-06,
"loss": 0.0554,
"step": 20980
},
{
"epoch": 26.84143222506394,
"grad_norm": 0.3502082824707031,
"learning_rate": 5.378919860627178e-06,
"loss": 0.2968,
"step": 20990
},
{
"epoch": 26.854219948849106,
"grad_norm": 2.3258226065081544e-05,
"learning_rate": 5.357142857142857e-06,
"loss": 0.1385,
"step": 21000
},
{
"epoch": 26.867007672634273,
"grad_norm": 0.03361507132649422,
"learning_rate": 5.335365853658537e-06,
"loss": 0.1328,
"step": 21010
},
{
"epoch": 26.87979539641944,
"grad_norm": 0.9654991030693054,
"learning_rate": 5.313588850174216e-06,
"loss": 0.2115,
"step": 21020
},
{
"epoch": 26.892583120204602,
"grad_norm": 0.005103932227939367,
"learning_rate": 5.291811846689896e-06,
"loss": 0.1815,
"step": 21030
},
{
"epoch": 26.90537084398977,
"grad_norm": 7.247010216815397e-05,
"learning_rate": 5.270034843205575e-06,
"loss": 0.1699,
"step": 21040
},
{
"epoch": 26.918158567774935,
"grad_norm": 0.0003093885607086122,
"learning_rate": 5.248257839721255e-06,
"loss": 0.1997,
"step": 21050
},
{
"epoch": 26.9309462915601,
"grad_norm": 4.525994777679443,
"learning_rate": 5.226480836236934e-06,
"loss": 0.1211,
"step": 21060
},
{
"epoch": 26.943734015345267,
"grad_norm": 2.7894208431243896,
"learning_rate": 5.204703832752614e-06,
"loss": 0.2392,
"step": 21070
},
{
"epoch": 26.956521739130434,
"grad_norm": 0.9026535153388977,
"learning_rate": 5.182926829268292e-06,
"loss": 0.3208,
"step": 21080
},
{
"epoch": 26.9693094629156,
"grad_norm": 1.8788063526153564,
"learning_rate": 5.1611498257839724e-06,
"loss": 0.2059,
"step": 21090
},
{
"epoch": 26.982097186700766,
"grad_norm": 2.9115092754364014,
"learning_rate": 5.139372822299652e-06,
"loss": 0.2544,
"step": 21100
},
{
"epoch": 26.994884910485933,
"grad_norm": 1.7984645366668701,
"learning_rate": 5.117595818815332e-06,
"loss": 0.0378,
"step": 21110
},
{
"epoch": 27.0,
"eval_loss": 0.1803685426712036,
"eval_runtime": 0.9925,
"eval_samples_per_second": 98.745,
"eval_steps_per_second": 13.099,
"step": 21114
},
{
"epoch": 27.0076726342711,
"grad_norm": 2.6370112895965576,
"learning_rate": 5.09581881533101e-06,
"loss": 0.3032,
"step": 21120
},
{
"epoch": 27.020460358056265,
"grad_norm": 3.5613858699798584,
"learning_rate": 5.07404181184669e-06,
"loss": 0.2893,
"step": 21130
},
{
"epoch": 27.033248081841432,
"grad_norm": 5.88556432723999,
"learning_rate": 5.052264808362369e-06,
"loss": 0.1514,
"step": 21140
},
{
"epoch": 27.046035805626598,
"grad_norm": 0.1507536619901657,
"learning_rate": 5.030487804878049e-06,
"loss": 0.1593,
"step": 21150
},
{
"epoch": 27.058823529411764,
"grad_norm": 5.69314432144165,
"learning_rate": 5.0087108013937286e-06,
"loss": 0.289,
"step": 21160
},
{
"epoch": 27.07161125319693,
"grad_norm": 1.0300965309143066,
"learning_rate": 4.986933797909408e-06,
"loss": 0.1587,
"step": 21170
},
{
"epoch": 27.084398976982097,
"grad_norm": 0.0008522561402060091,
"learning_rate": 4.965156794425088e-06,
"loss": 0.1631,
"step": 21180
},
{
"epoch": 27.097186700767264,
"grad_norm": 1.6342631578445435,
"learning_rate": 4.943379790940767e-06,
"loss": 0.1555,
"step": 21190
},
{
"epoch": 27.10997442455243,
"grad_norm": 1.1432753801345825,
"learning_rate": 4.921602787456446e-06,
"loss": 0.2425,
"step": 21200
},
{
"epoch": 27.122762148337596,
"grad_norm": 1.992614507675171,
"learning_rate": 4.8998257839721254e-06,
"loss": 0.3561,
"step": 21210
},
{
"epoch": 27.135549872122763,
"grad_norm": 2.2932474613189697,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.1691,
"step": 21220
},
{
"epoch": 27.14833759590793,
"grad_norm": 1.525471806526184,
"learning_rate": 4.856271777003484e-06,
"loss": 0.1616,
"step": 21230
},
{
"epoch": 27.161125319693095,
"grad_norm": 3.714881420135498,
"learning_rate": 4.834494773519164e-06,
"loss": 0.3203,
"step": 21240
},
{
"epoch": 27.17391304347826,
"grad_norm": 3.990633964538574,
"learning_rate": 4.812717770034843e-06,
"loss": 0.254,
"step": 21250
},
{
"epoch": 27.186700767263428,
"grad_norm": 1.0952008962631226,
"learning_rate": 4.790940766550523e-06,
"loss": 0.1943,
"step": 21260
},
{
"epoch": 27.199488491048594,
"grad_norm": 2.789057970046997,
"learning_rate": 4.769163763066202e-06,
"loss": 0.1023,
"step": 21270
},
{
"epoch": 27.21227621483376,
"grad_norm": 1.829593539237976,
"learning_rate": 4.7473867595818816e-06,
"loss": 0.183,
"step": 21280
},
{
"epoch": 27.225063938618927,
"grad_norm": 0.681667685508728,
"learning_rate": 4.725609756097562e-06,
"loss": 0.1145,
"step": 21290
},
{
"epoch": 27.237851662404093,
"grad_norm": 6.536792755126953,
"learning_rate": 4.703832752613241e-06,
"loss": 0.3635,
"step": 21300
},
{
"epoch": 27.25063938618926,
"grad_norm": 4.994762897491455,
"learning_rate": 4.68205574912892e-06,
"loss": 0.5282,
"step": 21310
},
{
"epoch": 27.263427109974426,
"grad_norm": 6.329887866973877,
"learning_rate": 4.660278745644599e-06,
"loss": 0.1123,
"step": 21320
},
{
"epoch": 27.276214833759592,
"grad_norm": 0.2964077591896057,
"learning_rate": 4.638501742160279e-06,
"loss": 0.3743,
"step": 21330
},
{
"epoch": 27.289002557544755,
"grad_norm": 5.211808204650879,
"learning_rate": 4.6167247386759585e-06,
"loss": 0.34,
"step": 21340
},
{
"epoch": 27.30179028132992,
"grad_norm": 2.559980869293213,
"learning_rate": 4.594947735191638e-06,
"loss": 0.2149,
"step": 21350
},
{
"epoch": 27.314578005115088,
"grad_norm": 6.942402362823486,
"learning_rate": 4.573170731707317e-06,
"loss": 0.2281,
"step": 21360
},
{
"epoch": 27.327365728900254,
"grad_norm": 0.011685797944664955,
"learning_rate": 4.551393728222997e-06,
"loss": 0.1857,
"step": 21370
},
{
"epoch": 27.34015345268542,
"grad_norm": 3.299914836883545,
"learning_rate": 4.529616724738676e-06,
"loss": 0.3707,
"step": 21380
},
{
"epoch": 27.352941176470587,
"grad_norm": 1.5382965803146362,
"learning_rate": 4.507839721254355e-06,
"loss": 0.1391,
"step": 21390
},
{
"epoch": 27.365728900255753,
"grad_norm": 4.048910617828369,
"learning_rate": 4.4860627177700346e-06,
"loss": 0.3187,
"step": 21400
},
{
"epoch": 27.37851662404092,
"grad_norm": 6.27728796005249,
"learning_rate": 4.464285714285715e-06,
"loss": 0.2896,
"step": 21410
},
{
"epoch": 27.391304347826086,
"grad_norm": 5.804953098297119,
"learning_rate": 4.442508710801394e-06,
"loss": 0.2246,
"step": 21420
},
{
"epoch": 27.404092071611252,
"grad_norm": 0.3168350160121918,
"learning_rate": 4.420731707317073e-06,
"loss": 0.2129,
"step": 21430
},
{
"epoch": 27.41687979539642,
"grad_norm": 2.710597038269043,
"learning_rate": 4.398954703832753e-06,
"loss": 0.2045,
"step": 21440
},
{
"epoch": 27.429667519181585,
"grad_norm": 0.05959421768784523,
"learning_rate": 4.377177700348432e-06,
"loss": 0.1987,
"step": 21450
},
{
"epoch": 27.44245524296675,
"grad_norm": 1.6882935762405396,
"learning_rate": 4.355400696864112e-06,
"loss": 0.2409,
"step": 21460
},
{
"epoch": 27.455242966751918,
"grad_norm": 1.9564663171768188,
"learning_rate": 4.333623693379791e-06,
"loss": 0.1971,
"step": 21470
},
{
"epoch": 27.468030690537084,
"grad_norm": 1.2921223640441895,
"learning_rate": 4.311846689895471e-06,
"loss": 0.2685,
"step": 21480
},
{
"epoch": 27.48081841432225,
"grad_norm": 2.278135061264038,
"learning_rate": 4.29006968641115e-06,
"loss": 0.166,
"step": 21490
},
{
"epoch": 27.493606138107417,
"grad_norm": 0.553162693977356,
"learning_rate": 4.26829268292683e-06,
"loss": 0.205,
"step": 21500
},
{
"epoch": 27.506393861892583,
"grad_norm": 8.630960464477539,
"learning_rate": 4.246515679442508e-06,
"loss": 0.3337,
"step": 21510
},
{
"epoch": 27.51918158567775,
"grad_norm": 5.457569599151611,
"learning_rate": 4.224738675958188e-06,
"loss": 0.4725,
"step": 21520
},
{
"epoch": 27.531969309462916,
"grad_norm": 2.227025032043457,
"learning_rate": 4.202961672473868e-06,
"loss": 0.1535,
"step": 21530
},
{
"epoch": 27.544757033248082,
"grad_norm": 2.7370967864990234,
"learning_rate": 4.181184668989548e-06,
"loss": 0.1932,
"step": 21540
},
{
"epoch": 27.55754475703325,
"grad_norm": 0.04351665824651718,
"learning_rate": 4.159407665505227e-06,
"loss": 0.0892,
"step": 21550
},
{
"epoch": 27.570332480818415,
"grad_norm": 0.006841658148914576,
"learning_rate": 4.137630662020906e-06,
"loss": 0.1867,
"step": 21560
},
{
"epoch": 27.58312020460358,
"grad_norm": 1.037326693534851,
"learning_rate": 4.115853658536586e-06,
"loss": 0.2649,
"step": 21570
},
{
"epoch": 27.595907928388748,
"grad_norm": 5.368692874908447,
"learning_rate": 4.0940766550522645e-06,
"loss": 0.2772,
"step": 21580
},
{
"epoch": 27.608695652173914,
"grad_norm": 6.071450233459473,
"learning_rate": 4.0722996515679446e-06,
"loss": 0.1821,
"step": 21590
},
{
"epoch": 27.62148337595908,
"grad_norm": 1.9267330169677734,
"learning_rate": 4.050522648083624e-06,
"loss": 0.3001,
"step": 21600
},
{
"epoch": 27.634271099744247,
"grad_norm": 6.9283881187438965,
"learning_rate": 4.028745644599304e-06,
"loss": 0.27,
"step": 21610
},
{
"epoch": 27.647058823529413,
"grad_norm": 11.072102546691895,
"learning_rate": 4.006968641114982e-06,
"loss": 0.2461,
"step": 21620
},
{
"epoch": 27.65984654731458,
"grad_norm": 6.862698554992676,
"learning_rate": 3.985191637630662e-06,
"loss": 0.317,
"step": 21630
},
{
"epoch": 27.672634271099746,
"grad_norm": 3.6825077533721924,
"learning_rate": 3.9634146341463414e-06,
"loss": 0.2672,
"step": 21640
},
{
"epoch": 27.685421994884912,
"grad_norm": 5.379913330078125,
"learning_rate": 3.9416376306620215e-06,
"loss": 0.1716,
"step": 21650
},
{
"epoch": 27.69820971867008,
"grad_norm": 0.01544896513223648,
"learning_rate": 3.9198606271777e-06,
"loss": 0.0986,
"step": 21660
},
{
"epoch": 27.710997442455245,
"grad_norm": 4.728484153747559,
"learning_rate": 3.89808362369338e-06,
"loss": 0.2229,
"step": 21670
},
{
"epoch": 27.723785166240408,
"grad_norm": 7.197249412536621,
"learning_rate": 3.876306620209059e-06,
"loss": 0.2093,
"step": 21680
},
{
"epoch": 27.736572890025574,
"grad_norm": 0.08864303678274155,
"learning_rate": 3.854529616724739e-06,
"loss": 0.0748,
"step": 21690
},
{
"epoch": 27.74936061381074,
"grad_norm": 3.961198091506958,
"learning_rate": 3.832752613240418e-06,
"loss": 0.1083,
"step": 21700
},
{
"epoch": 27.762148337595907,
"grad_norm": 1.672965407371521,
"learning_rate": 3.8109756097560976e-06,
"loss": 0.1421,
"step": 21710
},
{
"epoch": 27.774936061381073,
"grad_norm": 2.3427834510803223,
"learning_rate": 3.789198606271777e-06,
"loss": 0.2323,
"step": 21720
},
{
"epoch": 27.78772378516624,
"grad_norm": 0.023380041122436523,
"learning_rate": 3.767421602787457e-06,
"loss": 0.1407,
"step": 21730
},
{
"epoch": 27.800511508951406,
"grad_norm": 0.814188539981842,
"learning_rate": 3.745644599303136e-06,
"loss": 0.1607,
"step": 21740
},
{
"epoch": 27.813299232736572,
"grad_norm": 1.207890510559082,
"learning_rate": 3.7238675958188156e-06,
"loss": 0.2524,
"step": 21750
},
{
"epoch": 27.82608695652174,
"grad_norm": 1.3783173561096191,
"learning_rate": 3.7020905923344953e-06,
"loss": 0.2288,
"step": 21760
},
{
"epoch": 27.838874680306905,
"grad_norm": 4.3022236824035645,
"learning_rate": 3.6803135888501745e-06,
"loss": 0.4452,
"step": 21770
},
{
"epoch": 27.85166240409207,
"grad_norm": 1.6889233589172363,
"learning_rate": 3.6585365853658537e-06,
"loss": 0.2704,
"step": 21780
},
{
"epoch": 27.864450127877237,
"grad_norm": 1.1526528596878052,
"learning_rate": 3.6367595818815333e-06,
"loss": 0.1784,
"step": 21790
},
{
"epoch": 27.877237851662404,
"grad_norm": 0.12817203998565674,
"learning_rate": 3.614982578397213e-06,
"loss": 0.1778,
"step": 21800
},
{
"epoch": 27.89002557544757,
"grad_norm": 1.1597322225570679,
"learning_rate": 3.5932055749128926e-06,
"loss": 0.0466,
"step": 21810
},
{
"epoch": 27.902813299232736,
"grad_norm": 0.492725133895874,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.3498,
"step": 21820
},
{
"epoch": 27.915601023017903,
"grad_norm": 1.6839929819107056,
"learning_rate": 3.549651567944251e-06,
"loss": 0.1928,
"step": 21830
},
{
"epoch": 27.92838874680307,
"grad_norm": 1.2277092933654785,
"learning_rate": 3.5278745644599306e-06,
"loss": 0.1704,
"step": 21840
},
{
"epoch": 27.941176470588236,
"grad_norm": 0.32345348596572876,
"learning_rate": 3.5060975609756102e-06,
"loss": 0.2256,
"step": 21850
},
{
"epoch": 27.953964194373402,
"grad_norm": 3.3228261470794678,
"learning_rate": 3.484320557491289e-06,
"loss": 0.062,
"step": 21860
},
{
"epoch": 27.966751918158568,
"grad_norm": 2.418651819229126,
"learning_rate": 3.4625435540069687e-06,
"loss": 0.2153,
"step": 21870
},
{
"epoch": 27.979539641943735,
"grad_norm": 0.17314131557941437,
"learning_rate": 3.4407665505226483e-06,
"loss": 0.2711,
"step": 21880
},
{
"epoch": 27.9923273657289,
"grad_norm": 2.9722671508789062,
"learning_rate": 3.4189895470383275e-06,
"loss": 0.3478,
"step": 21890
},
{
"epoch": 28.0,
"eval_loss": 0.1801394373178482,
"eval_runtime": 0.9787,
"eval_samples_per_second": 100.135,
"eval_steps_per_second": 13.283,
"step": 21896
},
{
"epoch": 28.005115089514067,
"grad_norm": 0.005500065162777901,
"learning_rate": 3.397212543554007e-06,
"loss": 0.1711,
"step": 21900
},
{
"epoch": 28.017902813299234,
"grad_norm": 5.95561408996582,
"learning_rate": 3.3754355400696867e-06,
"loss": 0.104,
"step": 21910
},
{
"epoch": 28.0306905370844,
"grad_norm": 0.061613526195287704,
"learning_rate": 3.3536585365853664e-06,
"loss": 0.2721,
"step": 21920
},
{
"epoch": 28.043478260869566,
"grad_norm": 4.259181022644043,
"learning_rate": 3.331881533101045e-06,
"loss": 0.3519,
"step": 21930
},
{
"epoch": 28.056265984654733,
"grad_norm": 0.000134848480229266,
"learning_rate": 3.3101045296167248e-06,
"loss": 0.2261,
"step": 21940
},
{
"epoch": 28.0690537084399,
"grad_norm": 0.006025349255651236,
"learning_rate": 3.2883275261324044e-06,
"loss": 0.1773,
"step": 21950
},
{
"epoch": 28.081841432225065,
"grad_norm": 0.5203061103820801,
"learning_rate": 3.266550522648084e-06,
"loss": 0.3539,
"step": 21960
},
{
"epoch": 28.09462915601023,
"grad_norm": 5.767180919647217,
"learning_rate": 3.244773519163763e-06,
"loss": 0.3707,
"step": 21970
},
{
"epoch": 28.107416879795398,
"grad_norm": 0.980506956577301,
"learning_rate": 3.2229965156794425e-06,
"loss": 0.2138,
"step": 21980
},
{
"epoch": 28.120204603580564,
"grad_norm": 1.3818098306655884,
"learning_rate": 3.201219512195122e-06,
"loss": 0.2992,
"step": 21990
},
{
"epoch": 28.132992327365727,
"grad_norm": 1.2961748838424683,
"learning_rate": 3.1794425087108017e-06,
"loss": 0.1655,
"step": 22000
},
{
"epoch": 28.145780051150894,
"grad_norm": 1.3928310871124268,
"learning_rate": 3.157665505226481e-06,
"loss": 0.1668,
"step": 22010
},
{
"epoch": 28.15856777493606,
"grad_norm": 0.49518585205078125,
"learning_rate": 3.1358885017421605e-06,
"loss": 0.1517,
"step": 22020
},
{
"epoch": 28.171355498721226,
"grad_norm": 5.98352575302124,
"learning_rate": 3.1141114982578398e-06,
"loss": 0.1481,
"step": 22030
},
{
"epoch": 28.184143222506393,
"grad_norm": 0.7869769930839539,
"learning_rate": 3.0923344947735194e-06,
"loss": 0.2591,
"step": 22040
},
{
"epoch": 28.19693094629156,
"grad_norm": 4.9443828174844384e-05,
"learning_rate": 3.070557491289199e-06,
"loss": 0.1369,
"step": 22050
},
{
"epoch": 28.209718670076725,
"grad_norm": 5.648998260498047,
"learning_rate": 3.0487804878048782e-06,
"loss": 0.2763,
"step": 22060
},
{
"epoch": 28.22250639386189,
"grad_norm": 0.4481763243675232,
"learning_rate": 3.027003484320558e-06,
"loss": 0.268,
"step": 22070
},
{
"epoch": 28.235294117647058,
"grad_norm": 2.534666061401367,
"learning_rate": 3.005226480836237e-06,
"loss": 0.2043,
"step": 22080
},
{
"epoch": 28.248081841432224,
"grad_norm": 1.7655045986175537,
"learning_rate": 2.9834494773519167e-06,
"loss": 0.1321,
"step": 22090
},
{
"epoch": 28.26086956521739,
"grad_norm": 4.485157489776611,
"learning_rate": 2.961672473867596e-06,
"loss": 0.2079,
"step": 22100
},
{
"epoch": 28.273657289002557,
"grad_norm": 4.403810024261475,
"learning_rate": 2.9398954703832755e-06,
"loss": 0.3577,
"step": 22110
},
{
"epoch": 28.286445012787723,
"grad_norm": 3.932236671447754,
"learning_rate": 2.9181184668989547e-06,
"loss": 0.38,
"step": 22120
},
{
"epoch": 28.29923273657289,
"grad_norm": 2.8906774520874023,
"learning_rate": 2.8963414634146343e-06,
"loss": 0.1712,
"step": 22130
},
{
"epoch": 28.312020460358056,
"grad_norm": 3.725170850753784,
"learning_rate": 2.8745644599303136e-06,
"loss": 0.1457,
"step": 22140
},
{
"epoch": 28.324808184143222,
"grad_norm": 1.1601566076278687,
"learning_rate": 2.852787456445993e-06,
"loss": 0.3236,
"step": 22150
},
{
"epoch": 28.33759590792839,
"grad_norm": 0.42673754692077637,
"learning_rate": 2.8310104529616724e-06,
"loss": 0.1724,
"step": 22160
},
{
"epoch": 28.350383631713555,
"grad_norm": 5.1795573234558105,
"learning_rate": 2.809233449477352e-06,
"loss": 0.2666,
"step": 22170
},
{
"epoch": 28.36317135549872,
"grad_norm": 4.630896091461182,
"learning_rate": 2.7874564459930316e-06,
"loss": 0.3567,
"step": 22180
},
{
"epoch": 28.375959079283888,
"grad_norm": 0.001059512491337955,
"learning_rate": 2.765679442508711e-06,
"loss": 0.3346,
"step": 22190
},
{
"epoch": 28.388746803069054,
"grad_norm": 3.2247471809387207,
"learning_rate": 2.7439024390243905e-06,
"loss": 0.0682,
"step": 22200
},
{
"epoch": 28.40153452685422,
"grad_norm": 0.2996211349964142,
"learning_rate": 2.7221254355400697e-06,
"loss": 0.1286,
"step": 22210
},
{
"epoch": 28.414322250639387,
"grad_norm": 0.013440398499369621,
"learning_rate": 2.7003484320557493e-06,
"loss": 0.0815,
"step": 22220
},
{
"epoch": 28.427109974424553,
"grad_norm": 0.8571885228157043,
"learning_rate": 2.6785714285714285e-06,
"loss": 0.2212,
"step": 22230
},
{
"epoch": 28.43989769820972,
"grad_norm": 6.663759231567383,
"learning_rate": 2.656794425087108e-06,
"loss": 0.286,
"step": 22240
},
{
"epoch": 28.452685421994886,
"grad_norm": 1.592871069908142,
"learning_rate": 2.6350174216027874e-06,
"loss": 0.2274,
"step": 22250
},
{
"epoch": 28.465473145780052,
"grad_norm": 0.00015884192544035614,
"learning_rate": 2.613240418118467e-06,
"loss": 0.2724,
"step": 22260
},
{
"epoch": 28.47826086956522,
"grad_norm": 100.69029998779297,
"learning_rate": 2.591463414634146e-06,
"loss": 0.22,
"step": 22270
},
{
"epoch": 28.491048593350385,
"grad_norm": 6.356751441955566,
"learning_rate": 2.569686411149826e-06,
"loss": 0.1702,
"step": 22280
},
{
"epoch": 28.50383631713555,
"grad_norm": 2.7504799365997314,
"learning_rate": 2.547909407665505e-06,
"loss": 0.4634,
"step": 22290
},
{
"epoch": 28.516624040920718,
"grad_norm": 0.8726930618286133,
"learning_rate": 2.5261324041811846e-06,
"loss": 0.183,
"step": 22300
},
{
"epoch": 28.529411764705884,
"grad_norm": 0.07652537524700165,
"learning_rate": 2.5043554006968643e-06,
"loss": 0.2604,
"step": 22310
},
{
"epoch": 28.54219948849105,
"grad_norm": 0.4226292371749878,
"learning_rate": 2.482578397212544e-06,
"loss": 0.1493,
"step": 22320
},
{
"epoch": 28.554987212276213,
"grad_norm": 0.3523002862930298,
"learning_rate": 2.460801393728223e-06,
"loss": 0.1188,
"step": 22330
},
{
"epoch": 28.56777493606138,
"grad_norm": 0.6177569031715393,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.2698,
"step": 22340
},
{
"epoch": 28.580562659846546,
"grad_norm": 0.027569299563765526,
"learning_rate": 2.417247386759582e-06,
"loss": 0.2496,
"step": 22350
},
{
"epoch": 28.593350383631712,
"grad_norm": 0.00996365025639534,
"learning_rate": 2.3954703832752616e-06,
"loss": 0.187,
"step": 22360
},
{
"epoch": 28.60613810741688,
"grad_norm": 0.2898666560649872,
"learning_rate": 2.3736933797909408e-06,
"loss": 0.2004,
"step": 22370
},
{
"epoch": 28.618925831202045,
"grad_norm": 0.0006501222378574312,
"learning_rate": 2.3519163763066204e-06,
"loss": 0.2665,
"step": 22380
},
{
"epoch": 28.63171355498721,
"grad_norm": 0.003726001363247633,
"learning_rate": 2.3301393728222996e-06,
"loss": 0.2685,
"step": 22390
},
{
"epoch": 28.644501278772378,
"grad_norm": 0.0016021005576476455,
"learning_rate": 2.3083623693379792e-06,
"loss": 0.2553,
"step": 22400
},
{
"epoch": 28.657289002557544,
"grad_norm": 1.840811014175415,
"learning_rate": 2.2865853658536584e-06,
"loss": 0.1302,
"step": 22410
},
{
"epoch": 28.67007672634271,
"grad_norm": 2.189919948577881,
"learning_rate": 2.264808362369338e-06,
"loss": 0.2802,
"step": 22420
},
{
"epoch": 28.682864450127877,
"grad_norm": 0.8784971237182617,
"learning_rate": 2.2430313588850173e-06,
"loss": 0.2669,
"step": 22430
},
{
"epoch": 28.695652173913043,
"grad_norm": 5.116599082946777,
"learning_rate": 2.221254355400697e-06,
"loss": 0.1442,
"step": 22440
},
{
"epoch": 28.70843989769821,
"grad_norm": 0.6749101281166077,
"learning_rate": 2.1994773519163765e-06,
"loss": 0.2148,
"step": 22450
},
{
"epoch": 28.721227621483376,
"grad_norm": 4.480432987213135,
"learning_rate": 2.177700348432056e-06,
"loss": 0.1998,
"step": 22460
},
{
"epoch": 28.734015345268542,
"grad_norm": 0.05128619447350502,
"learning_rate": 2.1559233449477354e-06,
"loss": 0.2094,
"step": 22470
},
{
"epoch": 28.74680306905371,
"grad_norm": 6.135299205780029,
"learning_rate": 2.134146341463415e-06,
"loss": 0.3928,
"step": 22480
},
{
"epoch": 28.759590792838875,
"grad_norm": 3.8637638092041016,
"learning_rate": 2.112369337979094e-06,
"loss": 0.305,
"step": 22490
},
{
"epoch": 28.77237851662404,
"grad_norm": 0.015335003845393658,
"learning_rate": 2.090592334494774e-06,
"loss": 0.2201,
"step": 22500
},
{
"epoch": 28.785166240409207,
"grad_norm": 27.03233528137207,
"learning_rate": 2.068815331010453e-06,
"loss": 0.4113,
"step": 22510
},
{
"epoch": 28.797953964194374,
"grad_norm": 8.126382827758789,
"learning_rate": 2.0470383275261322e-06,
"loss": 0.396,
"step": 22520
},
{
"epoch": 28.81074168797954,
"grad_norm": 0.0010111212031915784,
"learning_rate": 2.025261324041812e-06,
"loss": 0.0858,
"step": 22530
},
{
"epoch": 28.823529411764707,
"grad_norm": 0.22579795122146606,
"learning_rate": 2.003484320557491e-06,
"loss": 0.1004,
"step": 22540
},
{
"epoch": 28.836317135549873,
"grad_norm": 7.518482685089111,
"learning_rate": 1.9817073170731707e-06,
"loss": 0.1294,
"step": 22550
},
{
"epoch": 28.84910485933504,
"grad_norm": 0.4615892469882965,
"learning_rate": 1.95993031358885e-06,
"loss": 0.0513,
"step": 22560
},
{
"epoch": 28.861892583120206,
"grad_norm": 0.9744994640350342,
"learning_rate": 1.9381533101045295e-06,
"loss": 0.0957,
"step": 22570
},
{
"epoch": 28.874680306905372,
"grad_norm": 8.404377937316895,
"learning_rate": 1.916376306620209e-06,
"loss": 0.2883,
"step": 22580
},
{
"epoch": 28.88746803069054,
"grad_norm": 0.06529644131660461,
"learning_rate": 1.8945993031358886e-06,
"loss": 0.175,
"step": 22590
},
{
"epoch": 28.900255754475705,
"grad_norm": 1.4713941812515259,
"learning_rate": 1.872822299651568e-06,
"loss": 0.1782,
"step": 22600
},
{
"epoch": 28.91304347826087,
"grad_norm": 0.0055590178817510605,
"learning_rate": 1.8510452961672476e-06,
"loss": 0.1801,
"step": 22610
},
{
"epoch": 28.925831202046037,
"grad_norm": 0.132467120885849,
"learning_rate": 1.8292682926829268e-06,
"loss": 0.1855,
"step": 22620
},
{
"epoch": 28.938618925831204,
"grad_norm": 4.285371780395508,
"learning_rate": 1.8074912891986065e-06,
"loss": 0.3301,
"step": 22630
},
{
"epoch": 28.95140664961637,
"grad_norm": 5.881651878356934,
"learning_rate": 1.7857142857142857e-06,
"loss": 0.0689,
"step": 22640
},
{
"epoch": 28.964194373401533,
"grad_norm": 4.706814765930176,
"learning_rate": 1.7639372822299653e-06,
"loss": 0.3224,
"step": 22650
},
{
"epoch": 28.9769820971867,
"grad_norm": 0.24153724312782288,
"learning_rate": 1.7421602787456445e-06,
"loss": 0.4702,
"step": 22660
},
{
"epoch": 28.989769820971865,
"grad_norm": 8.415337562561035,
"learning_rate": 1.7203832752613241e-06,
"loss": 0.2193,
"step": 22670
},
{
"epoch": 29.0,
"eval_loss": 0.1792634278535843,
"eval_runtime": 0.9762,
"eval_samples_per_second": 100.386,
"eval_steps_per_second": 13.317,
"step": 22678
},
{
"epoch": 29.002557544757032,
"grad_norm": 6.062386512756348,
"learning_rate": 1.6986062717770036e-06,
"loss": 0.1089,
"step": 22680
},
{
"epoch": 29.015345268542198,
"grad_norm": 3.9071085453033447,
"learning_rate": 1.6768292682926832e-06,
"loss": 0.2283,
"step": 22690
},
{
"epoch": 29.028132992327365,
"grad_norm": 0.00043772748904302716,
"learning_rate": 1.6550522648083624e-06,
"loss": 0.27,
"step": 22700
},
{
"epoch": 29.04092071611253,
"grad_norm": 0.5102390646934509,
"learning_rate": 1.633275261324042e-06,
"loss": 0.2082,
"step": 22710
},
{
"epoch": 29.053708439897697,
"grad_norm": 0.003764116670936346,
"learning_rate": 1.6114982578397212e-06,
"loss": 0.5365,
"step": 22720
},
{
"epoch": 29.066496163682864,
"grad_norm": 1.1932376623153687,
"learning_rate": 1.5897212543554009e-06,
"loss": 0.0399,
"step": 22730
},
{
"epoch": 29.07928388746803,
"grad_norm": 3.3606503009796143,
"learning_rate": 1.5679442508710803e-06,
"loss": 0.1718,
"step": 22740
},
{
"epoch": 29.092071611253196,
"grad_norm": 3.6388676166534424,
"learning_rate": 1.5461672473867597e-06,
"loss": 0.4028,
"step": 22750
},
{
"epoch": 29.104859335038363,
"grad_norm": 0.08391699939966202,
"learning_rate": 1.5243902439024391e-06,
"loss": 0.1473,
"step": 22760
},
{
"epoch": 29.11764705882353,
"grad_norm": 0.6788755655288696,
"learning_rate": 1.5026132404181185e-06,
"loss": 0.1682,
"step": 22770
},
{
"epoch": 29.130434782608695,
"grad_norm": 4.642868995666504,
"learning_rate": 1.480836236933798e-06,
"loss": 0.2218,
"step": 22780
},
{
"epoch": 29.14322250639386,
"grad_norm": 1.4224387407302856,
"learning_rate": 1.4590592334494774e-06,
"loss": 0.1681,
"step": 22790
},
{
"epoch": 29.156010230179028,
"grad_norm": 4.818307399749756,
"learning_rate": 1.4372822299651568e-06,
"loss": 0.2847,
"step": 22800
},
{
"epoch": 29.168797953964194,
"grad_norm": 1.9413695335388184,
"learning_rate": 1.4155052264808362e-06,
"loss": 0.3833,
"step": 22810
},
{
"epoch": 29.18158567774936,
"grad_norm": 0.2834830582141876,
"learning_rate": 1.3937282229965158e-06,
"loss": 0.0927,
"step": 22820
},
{
"epoch": 29.194373401534527,
"grad_norm": 2.186720132827759,
"learning_rate": 1.3719512195121952e-06,
"loss": 0.1265,
"step": 22830
},
{
"epoch": 29.207161125319693,
"grad_norm": 1.6864341497421265,
"learning_rate": 1.3501742160278747e-06,
"loss": 0.2385,
"step": 22840
},
{
"epoch": 29.21994884910486,
"grad_norm": 1.5684053897857666,
"learning_rate": 1.328397212543554e-06,
"loss": 0.4263,
"step": 22850
},
{
"epoch": 29.232736572890026,
"grad_norm": 1.0396729707717896,
"learning_rate": 1.3066202090592335e-06,
"loss": 0.124,
"step": 22860
},
{
"epoch": 29.245524296675192,
"grad_norm": 3.0199835300445557,
"learning_rate": 1.284843205574913e-06,
"loss": 0.1878,
"step": 22870
},
{
"epoch": 29.25831202046036,
"grad_norm": 5.518275737762451,
"learning_rate": 1.2630662020905923e-06,
"loss": 0.1865,
"step": 22880
},
{
"epoch": 29.271099744245525,
"grad_norm": 0.0033442946150898933,
"learning_rate": 1.241289198606272e-06,
"loss": 0.172,
"step": 22890
},
{
"epoch": 29.28388746803069,
"grad_norm": 1.8414983749389648,
"learning_rate": 1.2195121951219514e-06,
"loss": 0.2428,
"step": 22900
},
{
"epoch": 29.296675191815858,
"grad_norm": 2.1718709468841553,
"learning_rate": 1.1977351916376308e-06,
"loss": 0.1704,
"step": 22910
},
{
"epoch": 29.309462915601024,
"grad_norm": 0.0017286858055740595,
"learning_rate": 1.1759581881533102e-06,
"loss": 0.1766,
"step": 22920
},
{
"epoch": 29.32225063938619,
"grad_norm": 1.1897152662277222,
"learning_rate": 1.1541811846689896e-06,
"loss": 0.3789,
"step": 22930
},
{
"epoch": 29.335038363171357,
"grad_norm": 0.0005067705642431974,
"learning_rate": 1.132404181184669e-06,
"loss": 0.1308,
"step": 22940
},
{
"epoch": 29.347826086956523,
"grad_norm": 2.590867042541504,
"learning_rate": 1.1106271777003485e-06,
"loss": 0.1798,
"step": 22950
},
{
"epoch": 29.36061381074169,
"grad_norm": 1.66661536693573,
"learning_rate": 1.088850174216028e-06,
"loss": 0.3366,
"step": 22960
},
{
"epoch": 29.373401534526856,
"grad_norm": 6.159651756286621,
"learning_rate": 1.0670731707317075e-06,
"loss": 0.3356,
"step": 22970
},
{
"epoch": 29.38618925831202,
"grad_norm": 2.129429578781128,
"learning_rate": 1.045296167247387e-06,
"loss": 0.2774,
"step": 22980
},
{
"epoch": 29.398976982097185,
"grad_norm": 0.9112698435783386,
"learning_rate": 1.0235191637630661e-06,
"loss": 0.2685,
"step": 22990
},
{
"epoch": 29.41176470588235,
"grad_norm": 0.0026976391673088074,
"learning_rate": 1.0017421602787455e-06,
"loss": 0.2588,
"step": 23000
},
{
"epoch": 29.424552429667518,
"grad_norm": 0.9733322262763977,
"learning_rate": 9.79965156794425e-07,
"loss": 0.1532,
"step": 23010
},
{
"epoch": 29.437340153452684,
"grad_norm": 0.48181623220443726,
"learning_rate": 9.581881533101046e-07,
"loss": 0.2647,
"step": 23020
},
{
"epoch": 29.45012787723785,
"grad_norm": 0.4220898449420929,
"learning_rate": 9.36411149825784e-07,
"loss": 0.1638,
"step": 23030
},
{
"epoch": 29.462915601023017,
"grad_norm": 0.018712276592850685,
"learning_rate": 9.146341463414634e-07,
"loss": 0.0407,
"step": 23040
},
{
"epoch": 29.475703324808183,
"grad_norm": 0.0022459602914750576,
"learning_rate": 8.928571428571428e-07,
"loss": 0.1538,
"step": 23050
},
{
"epoch": 29.48849104859335,
"grad_norm": 1.2622050046920776,
"learning_rate": 8.710801393728223e-07,
"loss": 0.2032,
"step": 23060
},
{
"epoch": 29.501278772378516,
"grad_norm": 0.00908291433006525,
"learning_rate": 8.493031358885018e-07,
"loss": 0.1773,
"step": 23070
},
{
"epoch": 29.514066496163682,
"grad_norm": 0.028732916340231895,
"learning_rate": 8.275261324041812e-07,
"loss": 0.2818,
"step": 23080
},
{
"epoch": 29.52685421994885,
"grad_norm": 0.5516131520271301,
"learning_rate": 8.057491289198606e-07,
"loss": 0.0955,
"step": 23090
},
{
"epoch": 29.539641943734015,
"grad_norm": 1.139642357826233,
"learning_rate": 7.839721254355401e-07,
"loss": 0.2728,
"step": 23100
},
{
"epoch": 29.55242966751918,
"grad_norm": 3.2173802852630615,
"learning_rate": 7.621951219512196e-07,
"loss": 0.2744,
"step": 23110
},
{
"epoch": 29.565217391304348,
"grad_norm": 0.01653435453772545,
"learning_rate": 7.40418118466899e-07,
"loss": 0.0744,
"step": 23120
},
{
"epoch": 29.578005115089514,
"grad_norm": 6.4000325202941895,
"learning_rate": 7.186411149825784e-07,
"loss": 0.2451,
"step": 23130
},
{
"epoch": 29.59079283887468,
"grad_norm": 1.4968242645263672,
"learning_rate": 6.968641114982579e-07,
"loss": 0.2611,
"step": 23140
},
{
"epoch": 29.603580562659847,
"grad_norm": 0.000750532082747668,
"learning_rate": 6.750871080139373e-07,
"loss": 0.3625,
"step": 23150
},
{
"epoch": 29.616368286445013,
"grad_norm": 0.8928916454315186,
"learning_rate": 6.533101045296167e-07,
"loss": 0.0931,
"step": 23160
},
{
"epoch": 29.62915601023018,
"grad_norm": 0.03060201369225979,
"learning_rate": 6.315331010452962e-07,
"loss": 0.0568,
"step": 23170
},
{
"epoch": 29.641943734015346,
"grad_norm": 9.899931907653809,
"learning_rate": 6.097560975609757e-07,
"loss": 0.2723,
"step": 23180
},
{
"epoch": 29.654731457800512,
"grad_norm": 0.8692872524261475,
"learning_rate": 5.879790940766551e-07,
"loss": 0.1834,
"step": 23190
},
{
"epoch": 29.66751918158568,
"grad_norm": 0.00023471553868148476,
"learning_rate": 5.662020905923345e-07,
"loss": 0.2826,
"step": 23200
},
{
"epoch": 29.680306905370845,
"grad_norm": 4.050236701965332,
"learning_rate": 5.44425087108014e-07,
"loss": 0.1395,
"step": 23210
},
{
"epoch": 29.69309462915601,
"grad_norm": 2.7868478298187256,
"learning_rate": 5.226480836236935e-07,
"loss": 0.2827,
"step": 23220
},
{
"epoch": 29.705882352941178,
"grad_norm": 0.00015953659021761268,
"learning_rate": 5.008710801393728e-07,
"loss": 0.3188,
"step": 23230
},
{
"epoch": 29.718670076726344,
"grad_norm": 1.8475815057754517,
"learning_rate": 4.790940766550523e-07,
"loss": 0.1454,
"step": 23240
},
{
"epoch": 29.73145780051151,
"grad_norm": 1.6999574899673462,
"learning_rate": 4.573170731707317e-07,
"loss": 0.245,
"step": 23250
},
{
"epoch": 29.744245524296677,
"grad_norm": 0.3015870153903961,
"learning_rate": 4.3554006968641113e-07,
"loss": 0.1875,
"step": 23260
},
{
"epoch": 29.757033248081843,
"grad_norm": 0.06826595962047577,
"learning_rate": 4.137630662020906e-07,
"loss": 0.2096,
"step": 23270
},
{
"epoch": 29.76982097186701,
"grad_norm": 5.668938636779785,
"learning_rate": 3.9198606271777007e-07,
"loss": 0.1375,
"step": 23280
},
{
"epoch": 29.782608695652176,
"grad_norm": 5.67093563079834,
"learning_rate": 3.702090592334495e-07,
"loss": 0.2203,
"step": 23290
},
{
"epoch": 29.79539641943734,
"grad_norm": 3.95104718208313,
"learning_rate": 3.4843205574912896e-07,
"loss": 0.2321,
"step": 23300
},
{
"epoch": 29.808184143222505,
"grad_norm": 2.9969534873962402,
"learning_rate": 3.2665505226480837e-07,
"loss": 0.1346,
"step": 23310
},
{
"epoch": 29.82097186700767,
"grad_norm": 0.7820212841033936,
"learning_rate": 3.0487804878048784e-07,
"loss": 0.2656,
"step": 23320
},
{
"epoch": 29.833759590792837,
"grad_norm": 0.0010761891026049852,
"learning_rate": 2.8310104529616726e-07,
"loss": 0.4435,
"step": 23330
},
{
"epoch": 29.846547314578004,
"grad_norm": 4.919832229614258,
"learning_rate": 2.6132404181184673e-07,
"loss": 0.3309,
"step": 23340
},
{
"epoch": 29.85933503836317,
"grad_norm": 2.468470573425293,
"learning_rate": 2.3954703832752615e-07,
"loss": 0.1566,
"step": 23350
},
{
"epoch": 29.872122762148337,
"grad_norm": 5.76803731918335,
"learning_rate": 2.1777003484320556e-07,
"loss": 0.2243,
"step": 23360
},
{
"epoch": 29.884910485933503,
"grad_norm": 1.7636452913284302,
"learning_rate": 1.9599303135888503e-07,
"loss": 0.1639,
"step": 23370
},
{
"epoch": 29.89769820971867,
"grad_norm": 1.2893353700637817,
"learning_rate": 1.7421602787456448e-07,
"loss": 0.3609,
"step": 23380
},
{
"epoch": 29.910485933503836,
"grad_norm": 6.375534576363862e-05,
"learning_rate": 1.5243902439024392e-07,
"loss": 0.3925,
"step": 23390
},
{
"epoch": 29.923273657289002,
"grad_norm": 0.7869868278503418,
"learning_rate": 1.3066202090592336e-07,
"loss": 0.217,
"step": 23400
},
{
"epoch": 29.93606138107417,
"grad_norm": 7.046038627624512,
"learning_rate": 1.0888501742160278e-07,
"loss": 0.2354,
"step": 23410
},
{
"epoch": 29.948849104859335,
"grad_norm": 0.00026419543428346515,
"learning_rate": 8.710801393728224e-08,
"loss": 0.2192,
"step": 23420
},
{
"epoch": 29.9616368286445,
"grad_norm": 8.245471000671387,
"learning_rate": 6.533101045296168e-08,
"loss": 0.1844,
"step": 23430
},
{
"epoch": 29.974424552429667,
"grad_norm": 0.5220872759819031,
"learning_rate": 4.355400696864112e-08,
"loss": 0.2512,
"step": 23440
},
{
"epoch": 29.987212276214834,
"grad_norm": 5.6852521896362305,
"learning_rate": 2.177700348432056e-08,
"loss": 0.4313,
"step": 23450
},
{
"epoch": 30.0,
"grad_norm": 4.1284308433532715,
"learning_rate": 0.0,
"loss": 0.2054,
"step": 23460
},
{
"epoch": 30.0,
"eval_loss": 0.1794022172689438,
"eval_runtime": 0.835,
"eval_samples_per_second": 117.367,
"eval_steps_per_second": 15.569,
"step": 23460
}
],
"logging_steps": 10,
"max_steps": 23460,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3101116937011200.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}