{ "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 23460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01278772378516624, "grad_norm": 5291.65576171875, "learning_rate": 1.0000000000000002e-06, "loss": 46.8468, "step": 10 }, { "epoch": 0.02557544757033248, "grad_norm": 657.4127807617188, "learning_rate": 2.0000000000000003e-06, "loss": 47.4188, "step": 20 }, { "epoch": 0.03836317135549872, "grad_norm": 8509.5302734375, "learning_rate": 3e-06, "loss": 55.5624, "step": 30 }, { "epoch": 0.05115089514066496, "grad_norm": 427.65924072265625, "learning_rate": 4.000000000000001e-06, "loss": 54.0157, "step": 40 }, { "epoch": 0.0639386189258312, "grad_norm": 7448.37353515625, "learning_rate": 5e-06, "loss": 57.6548, "step": 50 }, { "epoch": 0.07672634271099744, "grad_norm": 14357.810546875, "learning_rate": 6e-06, "loss": 45.0872, "step": 60 }, { "epoch": 0.08951406649616368, "grad_norm": 4495.298828125, "learning_rate": 7.000000000000001e-06, "loss": 49.2105, "step": 70 }, { "epoch": 0.10230179028132992, "grad_norm": 3922.909912109375, "learning_rate": 8.000000000000001e-06, "loss": 45.772, "step": 80 }, { "epoch": 0.11508951406649616, "grad_norm": 47730.671875, "learning_rate": 9e-06, "loss": 54.9209, "step": 90 }, { "epoch": 0.1278772378516624, "grad_norm": 27943.875, "learning_rate": 1e-05, "loss": 47.032, "step": 100 }, { "epoch": 0.14066496163682865, "grad_norm": 185.7626953125, "learning_rate": 1.1000000000000001e-05, "loss": 55.6442, "step": 110 }, { "epoch": 0.1534526854219949, "grad_norm": 4819.99365234375, "learning_rate": 1.2e-05, "loss": 47.2024, "step": 120 }, { "epoch": 0.16624040920716113, "grad_norm": 16820.35546875, "learning_rate": 1.3000000000000001e-05, "loss": 47.1665, "step": 130 }, { "epoch": 0.17902813299232737, "grad_norm": 408.82489013671875, "learning_rate": 1.4000000000000001e-05, "loss": 45.6075, "step": 140 }, { "epoch": 0.1918158567774936, "grad_norm": 29451.880859375, "learning_rate": 1.5e-05, "loss": 50.9366, "step": 150 }, { "epoch": 0.20460358056265984, "grad_norm": 28413.0390625, "learning_rate": 1.6000000000000003e-05, "loss": 44.6847, "step": 160 }, { "epoch": 0.21739130434782608, "grad_norm": 799.9179077148438, "learning_rate": 1.7000000000000003e-05, "loss": 46.8802, "step": 170 }, { "epoch": 0.23017902813299232, "grad_norm": 2510.53515625, "learning_rate": 1.8e-05, "loss": 46.2193, "step": 180 }, { "epoch": 0.24296675191815856, "grad_norm": 2892.29248046875, "learning_rate": 1.9e-05, "loss": 41.2469, "step": 190 }, { "epoch": 0.2557544757033248, "grad_norm": 738.676513671875, "learning_rate": 2e-05, "loss": 41.8658, "step": 200 }, { "epoch": 0.26854219948849106, "grad_norm": 215.02032470703125, "learning_rate": 2.1e-05, "loss": 42.2854, "step": 210 }, { "epoch": 0.2813299232736573, "grad_norm": 1281.8134765625, "learning_rate": 2.2000000000000003e-05, "loss": 48.7005, "step": 220 }, { "epoch": 0.29411764705882354, "grad_norm": 677.4962158203125, "learning_rate": 2.3000000000000003e-05, "loss": 36.2834, "step": 230 }, { "epoch": 0.3069053708439898, "grad_norm": 2442.72900390625, "learning_rate": 2.4e-05, "loss": 44.3992, "step": 240 }, { "epoch": 0.319693094629156, "grad_norm": 135.88478088378906, "learning_rate": 2.5e-05, "loss": 42.5502, "step": 250 }, { "epoch": 0.33248081841432225, "grad_norm": 5432.8203125, "learning_rate": 2.6000000000000002e-05, "loss": 41.375, "step": 260 }, { "epoch": 0.3452685421994885, "grad_norm": 3573.05419921875, "learning_rate": 2.7000000000000002e-05, "loss": 40.6085, "step": 270 }, { "epoch": 0.35805626598465473, "grad_norm": 806.6569213867188, "learning_rate": 2.8000000000000003e-05, "loss": 40.9676, "step": 280 }, { "epoch": 0.37084398976982097, "grad_norm": 537.9384765625, "learning_rate": 2.9e-05, "loss": 36.0962, "step": 290 }, { "epoch": 0.3836317135549872, "grad_norm": 1317.54150390625, "learning_rate": 3e-05, "loss": 38.838, "step": 300 }, { "epoch": 0.39641943734015345, "grad_norm": 1044.3780517578125, "learning_rate": 3.1e-05, "loss": 35.0419, "step": 310 }, { "epoch": 0.4092071611253197, "grad_norm": 6332.0888671875, "learning_rate": 3.2000000000000005e-05, "loss": 35.0986, "step": 320 }, { "epoch": 0.4219948849104859, "grad_norm": 1020.596923828125, "learning_rate": 3.3e-05, "loss": 35.2819, "step": 330 }, { "epoch": 0.43478260869565216, "grad_norm": 4903.22119140625, "learning_rate": 3.4000000000000007e-05, "loss": 27.0823, "step": 340 }, { "epoch": 0.4475703324808184, "grad_norm": 705.4653930664062, "learning_rate": 3.5e-05, "loss": 35.8111, "step": 350 }, { "epoch": 0.46035805626598464, "grad_norm": 166.60675048828125, "learning_rate": 3.6e-05, "loss": 32.9624, "step": 360 }, { "epoch": 0.4731457800511509, "grad_norm": 1294.9737548828125, "learning_rate": 3.7e-05, "loss": 27.3774, "step": 370 }, { "epoch": 0.4859335038363171, "grad_norm": 143.36048889160156, "learning_rate": 3.8e-05, "loss": 35.3593, "step": 380 }, { "epoch": 0.49872122762148335, "grad_norm": 2351.956787109375, "learning_rate": 3.9000000000000006e-05, "loss": 31.6628, "step": 390 }, { "epoch": 0.5115089514066496, "grad_norm": 1412.3145751953125, "learning_rate": 4e-05, "loss": 24.9052, "step": 400 }, { "epoch": 0.5242966751918159, "grad_norm": 2179.52294921875, "learning_rate": 4.1e-05, "loss": 28.4615, "step": 410 }, { "epoch": 0.5370843989769821, "grad_norm": 1550.5777587890625, "learning_rate": 4.2e-05, "loss": 27.8655, "step": 420 }, { "epoch": 0.5498721227621484, "grad_norm": 14167.97265625, "learning_rate": 4.3e-05, "loss": 31.339, "step": 430 }, { "epoch": 0.5626598465473146, "grad_norm": 1213.757568359375, "learning_rate": 4.4000000000000006e-05, "loss": 29.2414, "step": 440 }, { "epoch": 0.5754475703324808, "grad_norm": 1910.325439453125, "learning_rate": 4.5e-05, "loss": 28.1563, "step": 450 }, { "epoch": 0.5882352941176471, "grad_norm": 2075.203369140625, "learning_rate": 4.600000000000001e-05, "loss": 30.5137, "step": 460 }, { "epoch": 0.6010230179028133, "grad_norm": 2741.365966796875, "learning_rate": 4.7e-05, "loss": 23.7269, "step": 470 }, { "epoch": 0.6138107416879796, "grad_norm": 6818.3935546875, "learning_rate": 4.8e-05, "loss": 20.7367, "step": 480 }, { "epoch": 0.6265984654731458, "grad_norm": 14128.44140625, "learning_rate": 4.9e-05, "loss": 23.4038, "step": 490 }, { "epoch": 0.639386189258312, "grad_norm": 268.3045654296875, "learning_rate": 5e-05, "loss": 25.6224, "step": 500 }, { "epoch": 0.6521739130434783, "grad_norm": 8016.40673828125, "learning_rate": 4.9978222996515684e-05, "loss": 24.9662, "step": 510 }, { "epoch": 0.6649616368286445, "grad_norm": 2014.55078125, "learning_rate": 4.995644599303136e-05, "loss": 27.3142, "step": 520 }, { "epoch": 0.6777493606138107, "grad_norm": 2465.661376953125, "learning_rate": 4.993466898954704e-05, "loss": 24.8702, "step": 530 }, { "epoch": 0.690537084398977, "grad_norm": 1037.0167236328125, "learning_rate": 4.991289198606272e-05, "loss": 22.7556, "step": 540 }, { "epoch": 0.7033248081841432, "grad_norm": 11746.3017578125, "learning_rate": 4.9891114982578404e-05, "loss": 20.6752, "step": 550 }, { "epoch": 0.7161125319693095, "grad_norm": 1548.5609130859375, "learning_rate": 4.986933797909408e-05, "loss": 21.2267, "step": 560 }, { "epoch": 0.7289002557544757, "grad_norm": 2361.02392578125, "learning_rate": 4.984756097560976e-05, "loss": 20.3482, "step": 570 }, { "epoch": 0.7416879795396419, "grad_norm": 11062.6640625, "learning_rate": 4.9825783972125436e-05, "loss": 19.0836, "step": 580 }, { "epoch": 0.7544757033248082, "grad_norm": 553.4859619140625, "learning_rate": 4.980400696864112e-05, "loss": 19.9769, "step": 590 }, { "epoch": 0.7672634271099744, "grad_norm": 497.746337890625, "learning_rate": 4.978222996515679e-05, "loss": 18.8568, "step": 600 }, { "epoch": 0.7800511508951407, "grad_norm": 930.5474243164062, "learning_rate": 4.9760452961672475e-05, "loss": 20.3492, "step": 610 }, { "epoch": 0.7928388746803069, "grad_norm": 585.6785278320312, "learning_rate": 4.9738675958188156e-05, "loss": 16.9816, "step": 620 }, { "epoch": 0.8056265984654731, "grad_norm": 1422.135009765625, "learning_rate": 4.971689895470383e-05, "loss": 17.72, "step": 630 }, { "epoch": 0.8184143222506394, "grad_norm": 375.1902770996094, "learning_rate": 4.969512195121951e-05, "loss": 17.7093, "step": 640 }, { "epoch": 0.8312020460358056, "grad_norm": 4669.2646484375, "learning_rate": 4.9673344947735195e-05, "loss": 16.5356, "step": 650 }, { "epoch": 0.8439897698209718, "grad_norm": 3034.7099609375, "learning_rate": 4.965156794425087e-05, "loss": 17.4828, "step": 660 }, { "epoch": 0.8567774936061381, "grad_norm": 2345.01708984375, "learning_rate": 4.962979094076655e-05, "loss": 15.835, "step": 670 }, { "epoch": 0.8695652173913043, "grad_norm": 363.0520935058594, "learning_rate": 4.960801393728223e-05, "loss": 15.788, "step": 680 }, { "epoch": 0.8823529411764706, "grad_norm": 491.0669250488281, "learning_rate": 4.9586236933797915e-05, "loss": 16.4292, "step": 690 }, { "epoch": 0.8951406649616368, "grad_norm": 415.5701904296875, "learning_rate": 4.956445993031359e-05, "loss": 15.2496, "step": 700 }, { "epoch": 0.907928388746803, "grad_norm": 361.4362487792969, "learning_rate": 4.954268292682927e-05, "loss": 14.1571, "step": 710 }, { "epoch": 0.9207161125319693, "grad_norm": 3015.107177734375, "learning_rate": 4.952090592334495e-05, "loss": 17.2341, "step": 720 }, { "epoch": 0.9335038363171355, "grad_norm": 86.11393737792969, "learning_rate": 4.9499128919860635e-05, "loss": 14.5296, "step": 730 }, { "epoch": 0.9462915601023018, "grad_norm": 47.42806625366211, "learning_rate": 4.947735191637631e-05, "loss": 14.4388, "step": 740 }, { "epoch": 0.959079283887468, "grad_norm": 150.82362365722656, "learning_rate": 4.945557491289199e-05, "loss": 13.377, "step": 750 }, { "epoch": 0.9718670076726342, "grad_norm": 136.4108428955078, "learning_rate": 4.943379790940767e-05, "loss": 13.7265, "step": 760 }, { "epoch": 0.9846547314578005, "grad_norm": 374.7745056152344, "learning_rate": 4.941202090592335e-05, "loss": 13.6729, "step": 770 }, { "epoch": 0.9974424552429667, "grad_norm": 809.5958862304688, "learning_rate": 4.9390243902439024e-05, "loss": 12.0761, "step": 780 }, { "epoch": 1.0, "eval_loss": 7.0809831619262695, "eval_runtime": 0.8598, "eval_samples_per_second": 113.984, "eval_steps_per_second": 15.12, "step": 782 }, { "epoch": 1.010230179028133, "grad_norm": 3061.148193359375, "learning_rate": 4.9368466898954705e-05, "loss": 11.7078, "step": 790 }, { "epoch": 1.0230179028132993, "grad_norm": 134.5844268798828, "learning_rate": 4.934668989547039e-05, "loss": 10.9549, "step": 800 }, { "epoch": 1.0358056265984654, "grad_norm": 300.20782470703125, "learning_rate": 4.932491289198606e-05, "loss": 12.3417, "step": 810 }, { "epoch": 1.0485933503836318, "grad_norm": 156.21311950683594, "learning_rate": 4.9303135888501744e-05, "loss": 11.6636, "step": 820 }, { "epoch": 1.061381074168798, "grad_norm": 164.83450317382812, "learning_rate": 4.9281358885017425e-05, "loss": 11.6409, "step": 830 }, { "epoch": 1.0741687979539642, "grad_norm": 137.92869567871094, "learning_rate": 4.92595818815331e-05, "loss": 11.9903, "step": 840 }, { "epoch": 1.0869565217391304, "grad_norm": 103.73356628417969, "learning_rate": 4.923780487804878e-05, "loss": 8.9094, "step": 850 }, { "epoch": 1.0997442455242967, "grad_norm": 64.21759033203125, "learning_rate": 4.9216027874564464e-05, "loss": 9.6173, "step": 860 }, { "epoch": 1.1125319693094629, "grad_norm": 695.460205078125, "learning_rate": 4.9194250871080146e-05, "loss": 9.6963, "step": 870 }, { "epoch": 1.1253196930946292, "grad_norm": 3314.6865234375, "learning_rate": 4.917247386759582e-05, "loss": 10.868, "step": 880 }, { "epoch": 1.1381074168797953, "grad_norm": 148.59613037109375, "learning_rate": 4.91506968641115e-05, "loss": 8.8247, "step": 890 }, { "epoch": 1.1508951406649617, "grad_norm": 330.5743103027344, "learning_rate": 4.9128919860627184e-05, "loss": 10.6854, "step": 900 }, { "epoch": 1.1636828644501278, "grad_norm": 40.26800537109375, "learning_rate": 4.910714285714286e-05, "loss": 9.2873, "step": 910 }, { "epoch": 1.1764705882352942, "grad_norm": 91.26012420654297, "learning_rate": 4.908536585365854e-05, "loss": 10.0678, "step": 920 }, { "epoch": 1.1892583120204603, "grad_norm": 516.1912841796875, "learning_rate": 4.9063588850174216e-05, "loss": 9.0135, "step": 930 }, { "epoch": 1.2020460358056266, "grad_norm": 70.36582946777344, "learning_rate": 4.90418118466899e-05, "loss": 8.7321, "step": 940 }, { "epoch": 1.2148337595907928, "grad_norm": 80.45369720458984, "learning_rate": 4.902003484320557e-05, "loss": 8.8967, "step": 950 }, { "epoch": 1.227621483375959, "grad_norm": 1352.676513671875, "learning_rate": 4.8998257839721254e-05, "loss": 8.731, "step": 960 }, { "epoch": 1.2404092071611252, "grad_norm": 572.7108764648438, "learning_rate": 4.8976480836236936e-05, "loss": 7.8549, "step": 970 }, { "epoch": 1.2531969309462916, "grad_norm": 34.12448501586914, "learning_rate": 4.895470383275261e-05, "loss": 6.734, "step": 980 }, { "epoch": 1.265984654731458, "grad_norm": 55.90411376953125, "learning_rate": 4.893292682926829e-05, "loss": 6.6569, "step": 990 }, { "epoch": 1.278772378516624, "grad_norm": 23.259788513183594, "learning_rate": 4.8911149825783975e-05, "loss": 6.8574, "step": 1000 }, { "epoch": 1.2915601023017902, "grad_norm": 112.81903839111328, "learning_rate": 4.8889372822299656e-05, "loss": 7.3217, "step": 1010 }, { "epoch": 1.3043478260869565, "grad_norm": 46.77792739868164, "learning_rate": 4.886759581881533e-05, "loss": 6.389, "step": 1020 }, { "epoch": 1.317135549872123, "grad_norm": 290.9834899902344, "learning_rate": 4.884581881533101e-05, "loss": 7.2083, "step": 1030 }, { "epoch": 1.329923273657289, "grad_norm": 683.7643432617188, "learning_rate": 4.8824041811846695e-05, "loss": 6.0531, "step": 1040 }, { "epoch": 1.3427109974424551, "grad_norm": 177.46051025390625, "learning_rate": 4.880226480836237e-05, "loss": 5.2371, "step": 1050 }, { "epoch": 1.3554987212276215, "grad_norm": 67.67430877685547, "learning_rate": 4.878048780487805e-05, "loss": 5.6214, "step": 1060 }, { "epoch": 1.3682864450127878, "grad_norm": 151.2257080078125, "learning_rate": 4.875871080139373e-05, "loss": 4.9522, "step": 1070 }, { "epoch": 1.381074168797954, "grad_norm": 89.38740539550781, "learning_rate": 4.8736933797909415e-05, "loss": 5.1789, "step": 1080 }, { "epoch": 1.39386189258312, "grad_norm": 150.41270446777344, "learning_rate": 4.871515679442509e-05, "loss": 4.3908, "step": 1090 }, { "epoch": 1.4066496163682864, "grad_norm": 18.98243522644043, "learning_rate": 4.869337979094077e-05, "loss": 4.7472, "step": 1100 }, { "epoch": 1.4194373401534528, "grad_norm": 21.242076873779297, "learning_rate": 4.867160278745645e-05, "loss": 4.2189, "step": 1110 }, { "epoch": 1.432225063938619, "grad_norm": 125.7092056274414, "learning_rate": 4.864982578397213e-05, "loss": 4.3067, "step": 1120 }, { "epoch": 1.445012787723785, "grad_norm": 33.442665100097656, "learning_rate": 4.86280487804878e-05, "loss": 4.1315, "step": 1130 }, { "epoch": 1.4578005115089514, "grad_norm": 192.08267211914062, "learning_rate": 4.8606271777003485e-05, "loss": 3.8742, "step": 1140 }, { "epoch": 1.4705882352941178, "grad_norm": 86.93074798583984, "learning_rate": 4.858449477351917e-05, "loss": 3.7819, "step": 1150 }, { "epoch": 1.4833759590792839, "grad_norm": 28.52533531188965, "learning_rate": 4.856271777003484e-05, "loss": 3.8282, "step": 1160 }, { "epoch": 1.49616368286445, "grad_norm": 179.64064025878906, "learning_rate": 4.8540940766550524e-05, "loss": 3.0471, "step": 1170 }, { "epoch": 1.5089514066496164, "grad_norm": 45.106510162353516, "learning_rate": 4.8519163763066205e-05, "loss": 3.5098, "step": 1180 }, { "epoch": 1.5217391304347827, "grad_norm": 97.1431884765625, "learning_rate": 4.849738675958189e-05, "loss": 2.8188, "step": 1190 }, { "epoch": 1.5345268542199488, "grad_norm": 385.2419128417969, "learning_rate": 4.847560975609756e-05, "loss": 3.1416, "step": 1200 }, { "epoch": 1.547314578005115, "grad_norm": 20.7208194732666, "learning_rate": 4.8453832752613244e-05, "loss": 2.9539, "step": 1210 }, { "epoch": 1.5601023017902813, "grad_norm": 30.659557342529297, "learning_rate": 4.8432055749128926e-05, "loss": 2.7002, "step": 1220 }, { "epoch": 1.5728900255754477, "grad_norm": 23.814847946166992, "learning_rate": 4.84102787456446e-05, "loss": 2.5541, "step": 1230 }, { "epoch": 1.5856777493606138, "grad_norm": 14.44255256652832, "learning_rate": 4.838850174216028e-05, "loss": 2.2534, "step": 1240 }, { "epoch": 1.59846547314578, "grad_norm": 41.14075469970703, "learning_rate": 4.8366724738675964e-05, "loss": 2.0536, "step": 1250 }, { "epoch": 1.6112531969309463, "grad_norm": 198.03524780273438, "learning_rate": 4.8344947735191646e-05, "loss": 2.8067, "step": 1260 }, { "epoch": 1.6240409207161126, "grad_norm": 35.26166915893555, "learning_rate": 4.832317073170732e-05, "loss": 2.0826, "step": 1270 }, { "epoch": 1.6368286445012787, "grad_norm": 11.258594512939453, "learning_rate": 4.8301393728222996e-05, "loss": 2.1802, "step": 1280 }, { "epoch": 1.6496163682864449, "grad_norm": 31.164817810058594, "learning_rate": 4.827961672473868e-05, "loss": 2.2419, "step": 1290 }, { "epoch": 1.6624040920716112, "grad_norm": 60.672122955322266, "learning_rate": 4.825783972125435e-05, "loss": 2.8254, "step": 1300 }, { "epoch": 1.6751918158567776, "grad_norm": 370.33953857421875, "learning_rate": 4.8236062717770034e-05, "loss": 2.7776, "step": 1310 }, { "epoch": 1.6879795396419437, "grad_norm": 758.9866333007812, "learning_rate": 4.8214285714285716e-05, "loss": 2.4551, "step": 1320 }, { "epoch": 1.7007672634271098, "grad_norm": 41.34950256347656, "learning_rate": 4.81925087108014e-05, "loss": 1.8235, "step": 1330 }, { "epoch": 1.7135549872122762, "grad_norm": 16.059200286865234, "learning_rate": 4.817073170731707e-05, "loss": 1.7094, "step": 1340 }, { "epoch": 1.7263427109974425, "grad_norm": 9.618565559387207, "learning_rate": 4.8148954703832754e-05, "loss": 1.8371, "step": 1350 }, { "epoch": 1.7391304347826086, "grad_norm": 50.72758102416992, "learning_rate": 4.8127177700348436e-05, "loss": 2.2624, "step": 1360 }, { "epoch": 1.7519181585677748, "grad_norm": 56.254966735839844, "learning_rate": 4.810540069686411e-05, "loss": 2.118, "step": 1370 }, { "epoch": 1.7647058823529411, "grad_norm": 14.840316772460938, "learning_rate": 4.808362369337979e-05, "loss": 1.4653, "step": 1380 }, { "epoch": 1.7774936061381075, "grad_norm": 33.320518493652344, "learning_rate": 4.8061846689895475e-05, "loss": 1.47, "step": 1390 }, { "epoch": 1.7902813299232738, "grad_norm": 38.10365676879883, "learning_rate": 4.8040069686411156e-05, "loss": 1.4439, "step": 1400 }, { "epoch": 1.80306905370844, "grad_norm": 27.954633712768555, "learning_rate": 4.801829268292683e-05, "loss": 1.7437, "step": 1410 }, { "epoch": 1.815856777493606, "grad_norm": 5.291255950927734, "learning_rate": 4.799651567944251e-05, "loss": 1.4063, "step": 1420 }, { "epoch": 1.8286445012787724, "grad_norm": 209.9783477783203, "learning_rate": 4.7974738675958195e-05, "loss": 1.7947, "step": 1430 }, { "epoch": 1.8414322250639388, "grad_norm": 9.803009986877441, "learning_rate": 4.795296167247387e-05, "loss": 1.9148, "step": 1440 }, { "epoch": 1.854219948849105, "grad_norm": 11.24997615814209, "learning_rate": 4.793118466898955e-05, "loss": 1.1532, "step": 1450 }, { "epoch": 1.867007672634271, "grad_norm": 45.51220703125, "learning_rate": 4.7909407665505226e-05, "loss": 1.5639, "step": 1460 }, { "epoch": 1.8797953964194374, "grad_norm": 16.30727767944336, "learning_rate": 4.788763066202091e-05, "loss": 1.1181, "step": 1470 }, { "epoch": 1.8925831202046037, "grad_norm": 136.94105529785156, "learning_rate": 4.786585365853658e-05, "loss": 1.3579, "step": 1480 }, { "epoch": 1.9053708439897699, "grad_norm": 11.275238037109375, "learning_rate": 4.7844076655052265e-05, "loss": 1.7074, "step": 1490 }, { "epoch": 1.918158567774936, "grad_norm": 25.395980834960938, "learning_rate": 4.782229965156795e-05, "loss": 1.0997, "step": 1500 }, { "epoch": 1.9309462915601023, "grad_norm": 23.93743324279785, "learning_rate": 4.780052264808362e-05, "loss": 1.3025, "step": 1510 }, { "epoch": 1.9437340153452687, "grad_norm": 2.8010175228118896, "learning_rate": 4.77787456445993e-05, "loss": 1.8831, "step": 1520 }, { "epoch": 1.9565217391304348, "grad_norm": 23.10536766052246, "learning_rate": 4.7756968641114985e-05, "loss": 0.8556, "step": 1530 }, { "epoch": 1.969309462915601, "grad_norm": 10.993846893310547, "learning_rate": 4.773519163763067e-05, "loss": 1.5948, "step": 1540 }, { "epoch": 1.9820971867007673, "grad_norm": 11.615630149841309, "learning_rate": 4.771341463414634e-05, "loss": 1.1638, "step": 1550 }, { "epoch": 1.9948849104859336, "grad_norm": 58.213191986083984, "learning_rate": 4.7691637630662024e-05, "loss": 1.6968, "step": 1560 }, { "epoch": 2.0, "eval_loss": 0.5625534653663635, "eval_runtime": 0.9781, "eval_samples_per_second": 100.193, "eval_steps_per_second": 13.291, "step": 1564 }, { "epoch": 2.0076726342710995, "grad_norm": 3.010547161102295, "learning_rate": 4.7669860627177705e-05, "loss": 1.8739, "step": 1570 }, { "epoch": 2.020460358056266, "grad_norm": 3.986027956008911, "learning_rate": 4.764808362369339e-05, "loss": 1.2554, "step": 1580 }, { "epoch": 2.0332480818414322, "grad_norm": 6.573552131652832, "learning_rate": 4.762630662020906e-05, "loss": 1.5013, "step": 1590 }, { "epoch": 2.0460358056265986, "grad_norm": 86.19767761230469, "learning_rate": 4.7604529616724744e-05, "loss": 1.7618, "step": 1600 }, { "epoch": 2.0588235294117645, "grad_norm": 9.659221649169922, "learning_rate": 4.7582752613240426e-05, "loss": 1.2333, "step": 1610 }, { "epoch": 2.071611253196931, "grad_norm": 11.018202781677246, "learning_rate": 4.75609756097561e-05, "loss": 1.3108, "step": 1620 }, { "epoch": 2.084398976982097, "grad_norm": 5.9316725730896, "learning_rate": 4.7539198606271775e-05, "loss": 1.4457, "step": 1630 }, { "epoch": 2.0971867007672635, "grad_norm": 6.019528865814209, "learning_rate": 4.751742160278746e-05, "loss": 1.3538, "step": 1640 }, { "epoch": 2.10997442455243, "grad_norm": 15.951410293579102, "learning_rate": 4.749564459930314e-05, "loss": 1.7894, "step": 1650 }, { "epoch": 2.122762148337596, "grad_norm": 17.659423828125, "learning_rate": 4.7473867595818814e-05, "loss": 1.6854, "step": 1660 }, { "epoch": 2.135549872122762, "grad_norm": 6.573909282684326, "learning_rate": 4.7452090592334496e-05, "loss": 0.7702, "step": 1670 }, { "epoch": 2.1483375959079285, "grad_norm": 19.226573944091797, "learning_rate": 4.743031358885018e-05, "loss": 1.5234, "step": 1680 }, { "epoch": 2.1611253196930944, "grad_norm": 7.366319179534912, "learning_rate": 4.740853658536585e-05, "loss": 1.167, "step": 1690 }, { "epoch": 2.1739130434782608, "grad_norm": 18.000076293945312, "learning_rate": 4.7386759581881534e-05, "loss": 1.3387, "step": 1700 }, { "epoch": 2.186700767263427, "grad_norm": 23.309993743896484, "learning_rate": 4.7364982578397216e-05, "loss": 0.9478, "step": 1710 }, { "epoch": 2.1994884910485935, "grad_norm": 11.471582412719727, "learning_rate": 4.73432055749129e-05, "loss": 1.8921, "step": 1720 }, { "epoch": 2.21227621483376, "grad_norm": 6.368335723876953, "learning_rate": 4.732142857142857e-05, "loss": 1.3705, "step": 1730 }, { "epoch": 2.2250639386189257, "grad_norm": 37.961761474609375, "learning_rate": 4.7299651567944254e-05, "loss": 1.0206, "step": 1740 }, { "epoch": 2.237851662404092, "grad_norm": 2.239867925643921, "learning_rate": 4.7277874564459936e-05, "loss": 0.4718, "step": 1750 }, { "epoch": 2.2506393861892584, "grad_norm": 3.7665998935699463, "learning_rate": 4.725609756097561e-05, "loss": 1.0531, "step": 1760 }, { "epoch": 2.2634271099744243, "grad_norm": 6.445348262786865, "learning_rate": 4.723432055749129e-05, "loss": 0.7531, "step": 1770 }, { "epoch": 2.2762148337595907, "grad_norm": 15.123578071594238, "learning_rate": 4.7212543554006975e-05, "loss": 1.8705, "step": 1780 }, { "epoch": 2.289002557544757, "grad_norm": 0.8845268487930298, "learning_rate": 4.719076655052265e-05, "loss": 1.3302, "step": 1790 }, { "epoch": 2.3017902813299234, "grad_norm": 4.527200698852539, "learning_rate": 4.716898954703833e-05, "loss": 0.6225, "step": 1800 }, { "epoch": 2.3145780051150897, "grad_norm": 1.2486788034439087, "learning_rate": 4.7147212543554006e-05, "loss": 2.2166, "step": 1810 }, { "epoch": 2.3273657289002556, "grad_norm": 5.734748363494873, "learning_rate": 4.712543554006969e-05, "loss": 1.3255, "step": 1820 }, { "epoch": 2.340153452685422, "grad_norm": 11.985346794128418, "learning_rate": 4.710365853658536e-05, "loss": 1.3059, "step": 1830 }, { "epoch": 2.3529411764705883, "grad_norm": 3.916868209838867, "learning_rate": 4.7081881533101045e-05, "loss": 1.5318, "step": 1840 }, { "epoch": 2.3657289002557547, "grad_norm": 7.840545654296875, "learning_rate": 4.7060104529616726e-05, "loss": 0.7891, "step": 1850 }, { "epoch": 2.3785166240409206, "grad_norm": 14.123540878295898, "learning_rate": 4.703832752613241e-05, "loss": 1.0555, "step": 1860 }, { "epoch": 2.391304347826087, "grad_norm": 7.954935550689697, "learning_rate": 4.701655052264808e-05, "loss": 1.4652, "step": 1870 }, { "epoch": 2.4040920716112533, "grad_norm": 14.094103813171387, "learning_rate": 4.6994773519163765e-05, "loss": 1.1833, "step": 1880 }, { "epoch": 2.4168797953964196, "grad_norm": 4.043068885803223, "learning_rate": 4.697299651567945e-05, "loss": 1.0861, "step": 1890 }, { "epoch": 2.4296675191815855, "grad_norm": 2.52896785736084, "learning_rate": 4.695121951219512e-05, "loss": 1.6684, "step": 1900 }, { "epoch": 2.442455242966752, "grad_norm": 1.6495180130004883, "learning_rate": 4.69294425087108e-05, "loss": 1.4837, "step": 1910 }, { "epoch": 2.455242966751918, "grad_norm": 12.262441635131836, "learning_rate": 4.6907665505226485e-05, "loss": 0.9211, "step": 1920 }, { "epoch": 2.4680306905370846, "grad_norm": 20.890920639038086, "learning_rate": 4.688588850174217e-05, "loss": 0.9938, "step": 1930 }, { "epoch": 2.4808184143222505, "grad_norm": 91.71107482910156, "learning_rate": 4.686411149825784e-05, "loss": 1.2399, "step": 1940 }, { "epoch": 2.493606138107417, "grad_norm": 15.037610054016113, "learning_rate": 4.6842334494773524e-05, "loss": 1.3903, "step": 1950 }, { "epoch": 2.506393861892583, "grad_norm": 13.86916732788086, "learning_rate": 4.6820557491289205e-05, "loss": 0.7887, "step": 1960 }, { "epoch": 2.5191815856777495, "grad_norm": 1.1754240989685059, "learning_rate": 4.679878048780488e-05, "loss": 0.8378, "step": 1970 }, { "epoch": 2.531969309462916, "grad_norm": 5.272491455078125, "learning_rate": 4.6777003484320555e-05, "loss": 0.8505, "step": 1980 }, { "epoch": 2.544757033248082, "grad_norm": 5.7034831047058105, "learning_rate": 4.675522648083624e-05, "loss": 0.7224, "step": 1990 }, { "epoch": 2.557544757033248, "grad_norm": 0.7890897393226624, "learning_rate": 4.673344947735192e-05, "loss": 0.6189, "step": 2000 }, { "epoch": 2.5703324808184145, "grad_norm": 12.357426643371582, "learning_rate": 4.6711672473867594e-05, "loss": 1.0881, "step": 2010 }, { "epoch": 2.5831202046035804, "grad_norm": 23.227792739868164, "learning_rate": 4.6689895470383275e-05, "loss": 0.8534, "step": 2020 }, { "epoch": 2.5959079283887467, "grad_norm": 10.504785537719727, "learning_rate": 4.666811846689896e-05, "loss": 0.8183, "step": 2030 }, { "epoch": 2.608695652173913, "grad_norm": 15.565208435058594, "learning_rate": 4.664634146341464e-05, "loss": 1.4254, "step": 2040 }, { "epoch": 2.6214833759590794, "grad_norm": 1.6157219409942627, "learning_rate": 4.6624564459930314e-05, "loss": 0.5524, "step": 2050 }, { "epoch": 2.634271099744246, "grad_norm": 19.441726684570312, "learning_rate": 4.6602787456445996e-05, "loss": 0.9524, "step": 2060 }, { "epoch": 2.6470588235294117, "grad_norm": 7.794475078582764, "learning_rate": 4.658101045296168e-05, "loss": 1.0205, "step": 2070 }, { "epoch": 2.659846547314578, "grad_norm": 15.814881324768066, "learning_rate": 4.655923344947735e-05, "loss": 0.9889, "step": 2080 }, { "epoch": 2.6726342710997444, "grad_norm": 0.5755274891853333, "learning_rate": 4.6537456445993034e-05, "loss": 1.1448, "step": 2090 }, { "epoch": 2.6854219948849103, "grad_norm": 17.273611068725586, "learning_rate": 4.6515679442508716e-05, "loss": 1.3059, "step": 2100 }, { "epoch": 2.6982097186700766, "grad_norm": 3.9910295009613037, "learning_rate": 4.64939024390244e-05, "loss": 0.683, "step": 2110 }, { "epoch": 2.710997442455243, "grad_norm": 14.555076599121094, "learning_rate": 4.647212543554007e-05, "loss": 1.1196, "step": 2120 }, { "epoch": 2.7237851662404093, "grad_norm": 23.410642623901367, "learning_rate": 4.6450348432055754e-05, "loss": 1.1257, "step": 2130 }, { "epoch": 2.7365728900255757, "grad_norm": 4.720057010650635, "learning_rate": 4.642857142857143e-05, "loss": 0.7206, "step": 2140 }, { "epoch": 2.7493606138107416, "grad_norm": 1.535508632659912, "learning_rate": 4.640679442508711e-05, "loss": 0.6184, "step": 2150 }, { "epoch": 2.762148337595908, "grad_norm": 14.630488395690918, "learning_rate": 4.6385017421602786e-05, "loss": 1.093, "step": 2160 }, { "epoch": 2.7749360613810743, "grad_norm": 5.701061248779297, "learning_rate": 4.636324041811847e-05, "loss": 1.0357, "step": 2170 }, { "epoch": 2.78772378516624, "grad_norm": 2.6190364360809326, "learning_rate": 4.634146341463415e-05, "loss": 1.3345, "step": 2180 }, { "epoch": 2.8005115089514065, "grad_norm": 14.732501029968262, "learning_rate": 4.6319686411149825e-05, "loss": 1.1697, "step": 2190 }, { "epoch": 2.813299232736573, "grad_norm": 54.13779830932617, "learning_rate": 4.6297909407665506e-05, "loss": 0.8014, "step": 2200 }, { "epoch": 2.8260869565217392, "grad_norm": 3.1863250732421875, "learning_rate": 4.627613240418119e-05, "loss": 0.6579, "step": 2210 }, { "epoch": 2.8388746803069056, "grad_norm": 0.7445744872093201, "learning_rate": 4.625435540069686e-05, "loss": 0.9363, "step": 2220 }, { "epoch": 2.8516624040920715, "grad_norm": 12.610774040222168, "learning_rate": 4.6232578397212545e-05, "loss": 1.0073, "step": 2230 }, { "epoch": 2.864450127877238, "grad_norm": 1.9731221199035645, "learning_rate": 4.6210801393728226e-05, "loss": 0.6766, "step": 2240 }, { "epoch": 2.877237851662404, "grad_norm": 18.617942810058594, "learning_rate": 4.618902439024391e-05, "loss": 1.124, "step": 2250 }, { "epoch": 2.89002557544757, "grad_norm": 15.15342903137207, "learning_rate": 4.616724738675958e-05, "loss": 1.0936, "step": 2260 }, { "epoch": 2.9028132992327365, "grad_norm": 6.208033561706543, "learning_rate": 4.6145470383275265e-05, "loss": 0.8369, "step": 2270 }, { "epoch": 2.915601023017903, "grad_norm": 5.944622993469238, "learning_rate": 4.612369337979095e-05, "loss": 0.8107, "step": 2280 }, { "epoch": 2.928388746803069, "grad_norm": 12.07383918762207, "learning_rate": 4.610191637630662e-05, "loss": 1.4551, "step": 2290 }, { "epoch": 2.9411764705882355, "grad_norm": 12.542791366577148, "learning_rate": 4.6080139372822303e-05, "loss": 0.9312, "step": 2300 }, { "epoch": 2.9539641943734014, "grad_norm": 6.652247905731201, "learning_rate": 4.6058362369337985e-05, "loss": 0.944, "step": 2310 }, { "epoch": 2.9667519181585678, "grad_norm": 0.610918402671814, "learning_rate": 4.603658536585366e-05, "loss": 0.9986, "step": 2320 }, { "epoch": 2.979539641943734, "grad_norm": 0.7279015183448792, "learning_rate": 4.6014808362369335e-05, "loss": 0.4389, "step": 2330 }, { "epoch": 2.9923273657289, "grad_norm": 25.247034072875977, "learning_rate": 4.599303135888502e-05, "loss": 1.0318, "step": 2340 }, { "epoch": 3.0, "eval_loss": 0.445333868265152, "eval_runtime": 0.967, "eval_samples_per_second": 101.343, "eval_steps_per_second": 13.443, "step": 2346 }, { "epoch": 3.0051150895140664, "grad_norm": 18.33305549621582, "learning_rate": 4.59712543554007e-05, "loss": 1.0371, "step": 2350 }, { "epoch": 3.0179028132992327, "grad_norm": 5.7453107833862305, "learning_rate": 4.594947735191638e-05, "loss": 1.2382, "step": 2360 }, { "epoch": 3.030690537084399, "grad_norm": 24.08013153076172, "learning_rate": 4.5927700348432055e-05, "loss": 1.5925, "step": 2370 }, { "epoch": 3.0434782608695654, "grad_norm": 16.559703826904297, "learning_rate": 4.590592334494774e-05, "loss": 0.7614, "step": 2380 }, { "epoch": 3.0562659846547313, "grad_norm": 0.3231087625026703, "learning_rate": 4.588414634146342e-05, "loss": 0.6892, "step": 2390 }, { "epoch": 3.0690537084398977, "grad_norm": 7.988368511199951, "learning_rate": 4.5862369337979094e-05, "loss": 0.8234, "step": 2400 }, { "epoch": 3.081841432225064, "grad_norm": 0.9878854155540466, "learning_rate": 4.5840592334494776e-05, "loss": 1.2006, "step": 2410 }, { "epoch": 3.0946291560102304, "grad_norm": 2.3492844104766846, "learning_rate": 4.581881533101046e-05, "loss": 0.4687, "step": 2420 }, { "epoch": 3.1074168797953963, "grad_norm": 10.556395530700684, "learning_rate": 4.579703832752614e-05, "loss": 0.5203, "step": 2430 }, { "epoch": 3.1202046035805626, "grad_norm": 2.9563450813293457, "learning_rate": 4.5775261324041814e-05, "loss": 1.575, "step": 2440 }, { "epoch": 3.132992327365729, "grad_norm": 238.0205078125, "learning_rate": 4.5753484320557496e-05, "loss": 0.9001, "step": 2450 }, { "epoch": 3.1457800511508953, "grad_norm": 5.97659158706665, "learning_rate": 4.573170731707318e-05, "loss": 1.0491, "step": 2460 }, { "epoch": 3.1585677749360612, "grad_norm": 2.6114704608917236, "learning_rate": 4.570993031358885e-05, "loss": 1.176, "step": 2470 }, { "epoch": 3.1713554987212276, "grad_norm": 11.491422653198242, "learning_rate": 4.5688153310104534e-05, "loss": 1.0819, "step": 2480 }, { "epoch": 3.184143222506394, "grad_norm": 15.428108215332031, "learning_rate": 4.566637630662021e-05, "loss": 0.7879, "step": 2490 }, { "epoch": 3.1969309462915603, "grad_norm": 1.30459725856781, "learning_rate": 4.564459930313589e-05, "loss": 0.432, "step": 2500 }, { "epoch": 3.209718670076726, "grad_norm": 10.579148292541504, "learning_rate": 4.5622822299651566e-05, "loss": 0.8243, "step": 2510 }, { "epoch": 3.2225063938618925, "grad_norm": 15.585744857788086, "learning_rate": 4.560104529616725e-05, "loss": 0.6711, "step": 2520 }, { "epoch": 3.235294117647059, "grad_norm": 0.4204480051994324, "learning_rate": 4.557926829268293e-05, "loss": 0.3794, "step": 2530 }, { "epoch": 3.2480818414322252, "grad_norm": 2.049827814102173, "learning_rate": 4.5557491289198604e-05, "loss": 0.8661, "step": 2540 }, { "epoch": 3.260869565217391, "grad_norm": 4.234211444854736, "learning_rate": 4.5535714285714286e-05, "loss": 0.6852, "step": 2550 }, { "epoch": 3.2736572890025575, "grad_norm": 6.04722785949707, "learning_rate": 4.551393728222997e-05, "loss": 0.6973, "step": 2560 }, { "epoch": 3.286445012787724, "grad_norm": 2.9452807903289795, "learning_rate": 4.549216027874565e-05, "loss": 1.2999, "step": 2570 }, { "epoch": 3.29923273657289, "grad_norm": 5.985997200012207, "learning_rate": 4.5470383275261325e-05, "loss": 0.6524, "step": 2580 }, { "epoch": 3.312020460358056, "grad_norm": 17.464967727661133, "learning_rate": 4.5448606271777006e-05, "loss": 0.414, "step": 2590 }, { "epoch": 3.3248081841432224, "grad_norm": 0.4771990180015564, "learning_rate": 4.542682926829269e-05, "loss": 0.9716, "step": 2600 }, { "epoch": 3.337595907928389, "grad_norm": 12.75200080871582, "learning_rate": 4.540505226480836e-05, "loss": 1.2216, "step": 2610 }, { "epoch": 3.350383631713555, "grad_norm": 137.2834014892578, "learning_rate": 4.5383275261324045e-05, "loss": 1.339, "step": 2620 }, { "epoch": 3.363171355498721, "grad_norm": 16.52651596069336, "learning_rate": 4.5361498257839727e-05, "loss": 1.3891, "step": 2630 }, { "epoch": 3.3759590792838874, "grad_norm": 3.5017669200897217, "learning_rate": 4.533972125435541e-05, "loss": 0.8694, "step": 2640 }, { "epoch": 3.3887468030690537, "grad_norm": 14.319966316223145, "learning_rate": 4.531794425087108e-05, "loss": 0.8546, "step": 2650 }, { "epoch": 3.40153452685422, "grad_norm": 13.259135246276855, "learning_rate": 4.529616724738676e-05, "loss": 0.706, "step": 2660 }, { "epoch": 3.414322250639386, "grad_norm": 12.226325035095215, "learning_rate": 4.527439024390244e-05, "loss": 0.6859, "step": 2670 }, { "epoch": 3.4271099744245523, "grad_norm": 3.982544422149658, "learning_rate": 4.5252613240418115e-05, "loss": 0.8816, "step": 2680 }, { "epoch": 3.4398976982097187, "grad_norm": 3.3434112071990967, "learning_rate": 4.52308362369338e-05, "loss": 0.2865, "step": 2690 }, { "epoch": 3.452685421994885, "grad_norm": 1.9963968992233276, "learning_rate": 4.520905923344948e-05, "loss": 0.7698, "step": 2700 }, { "epoch": 3.4654731457800514, "grad_norm": 11.71841812133789, "learning_rate": 4.518728222996516e-05, "loss": 1.1581, "step": 2710 }, { "epoch": 3.4782608695652173, "grad_norm": 7.631697654724121, "learning_rate": 4.5165505226480835e-05, "loss": 1.0637, "step": 2720 }, { "epoch": 3.4910485933503836, "grad_norm": 14.784839630126953, "learning_rate": 4.514372822299652e-05, "loss": 1.0658, "step": 2730 }, { "epoch": 3.50383631713555, "grad_norm": 5.46140193939209, "learning_rate": 4.51219512195122e-05, "loss": 0.6536, "step": 2740 }, { "epoch": 3.516624040920716, "grad_norm": 2.2502548694610596, "learning_rate": 4.5100174216027874e-05, "loss": 0.7903, "step": 2750 }, { "epoch": 3.5294117647058822, "grad_norm": 2.154597043991089, "learning_rate": 4.5078397212543555e-05, "loss": 0.9926, "step": 2760 }, { "epoch": 3.5421994884910486, "grad_norm": 13.152908325195312, "learning_rate": 4.505662020905924e-05, "loss": 0.8718, "step": 2770 }, { "epoch": 3.554987212276215, "grad_norm": 5.203449249267578, "learning_rate": 4.503484320557492e-05, "loss": 0.8298, "step": 2780 }, { "epoch": 3.5677749360613813, "grad_norm": 11.65567684173584, "learning_rate": 4.5013066202090594e-05, "loss": 1.1689, "step": 2790 }, { "epoch": 3.580562659846547, "grad_norm": 2.916771411895752, "learning_rate": 4.4991289198606276e-05, "loss": 0.817, "step": 2800 }, { "epoch": 3.5933503836317136, "grad_norm": 14.278409957885742, "learning_rate": 4.496951219512196e-05, "loss": 0.8092, "step": 2810 }, { "epoch": 3.60613810741688, "grad_norm": 0.2527465522289276, "learning_rate": 4.494773519163763e-05, "loss": 0.3738, "step": 2820 }, { "epoch": 3.618925831202046, "grad_norm": 0.8273968696594238, "learning_rate": 4.4925958188153314e-05, "loss": 0.4944, "step": 2830 }, { "epoch": 3.631713554987212, "grad_norm": 2.636253833770752, "learning_rate": 4.490418118466899e-05, "loss": 0.6911, "step": 2840 }, { "epoch": 3.6445012787723785, "grad_norm": 3.27070951461792, "learning_rate": 4.488240418118467e-05, "loss": 1.1412, "step": 2850 }, { "epoch": 3.657289002557545, "grad_norm": 7.859626770019531, "learning_rate": 4.4860627177700346e-05, "loss": 0.6205, "step": 2860 }, { "epoch": 3.670076726342711, "grad_norm": 4.144435882568359, "learning_rate": 4.483885017421603e-05, "loss": 1.1247, "step": 2870 }, { "epoch": 3.682864450127877, "grad_norm": 0.911280632019043, "learning_rate": 4.481707317073171e-05, "loss": 0.8172, "step": 2880 }, { "epoch": 3.6956521739130435, "grad_norm": 1.5579833984375, "learning_rate": 4.479529616724739e-05, "loss": 1.186, "step": 2890 }, { "epoch": 3.70843989769821, "grad_norm": 7.854421615600586, "learning_rate": 4.4773519163763066e-05, "loss": 0.5919, "step": 2900 }, { "epoch": 3.7212276214833757, "grad_norm": 3.1011366844177246, "learning_rate": 4.475174216027875e-05, "loss": 0.4518, "step": 2910 }, { "epoch": 3.734015345268542, "grad_norm": 5.562145709991455, "learning_rate": 4.472996515679443e-05, "loss": 1.296, "step": 2920 }, { "epoch": 3.7468030690537084, "grad_norm": 10.683934211730957, "learning_rate": 4.4708188153310104e-05, "loss": 0.5236, "step": 2930 }, { "epoch": 3.7595907928388748, "grad_norm": 6.5699076652526855, "learning_rate": 4.4686411149825786e-05, "loss": 0.9732, "step": 2940 }, { "epoch": 3.772378516624041, "grad_norm": 19.995325088500977, "learning_rate": 4.466463414634147e-05, "loss": 0.9127, "step": 2950 }, { "epoch": 3.785166240409207, "grad_norm": 4.819314479827881, "learning_rate": 4.464285714285715e-05, "loss": 0.8555, "step": 2960 }, { "epoch": 3.7979539641943734, "grad_norm": 7.258398532867432, "learning_rate": 4.4621080139372825e-05, "loss": 1.1759, "step": 2970 }, { "epoch": 3.8107416879795397, "grad_norm": 1.841517448425293, "learning_rate": 4.4599303135888506e-05, "loss": 1.1711, "step": 2980 }, { "epoch": 3.8235294117647056, "grad_norm": 2.7056972980499268, "learning_rate": 4.457752613240419e-05, "loss": 0.3712, "step": 2990 }, { "epoch": 3.836317135549872, "grad_norm": 12.133644104003906, "learning_rate": 4.455574912891986e-05, "loss": 0.5701, "step": 3000 }, { "epoch": 3.8491048593350383, "grad_norm": 5.757213115692139, "learning_rate": 4.453397212543554e-05, "loss": 0.3663, "step": 3010 }, { "epoch": 3.8618925831202047, "grad_norm": 6.096423149108887, "learning_rate": 4.451219512195122e-05, "loss": 0.5234, "step": 3020 }, { "epoch": 3.874680306905371, "grad_norm": 1.7655915021896362, "learning_rate": 4.44904181184669e-05, "loss": 0.7285, "step": 3030 }, { "epoch": 3.887468030690537, "grad_norm": 4.150766849517822, "learning_rate": 4.4468641114982576e-05, "loss": 1.2942, "step": 3040 }, { "epoch": 3.9002557544757033, "grad_norm": 0.9318215250968933, "learning_rate": 4.444686411149826e-05, "loss": 0.6762, "step": 3050 }, { "epoch": 3.9130434782608696, "grad_norm": 0.7870925068855286, "learning_rate": 4.442508710801394e-05, "loss": 0.2768, "step": 3060 }, { "epoch": 3.9258312020460355, "grad_norm": 0.7156265377998352, "learning_rate": 4.4403310104529615e-05, "loss": 0.7751, "step": 3070 }, { "epoch": 3.938618925831202, "grad_norm": 2.052534580230713, "learning_rate": 4.43815331010453e-05, "loss": 0.6196, "step": 3080 }, { "epoch": 3.9514066496163682, "grad_norm": 13.939319610595703, "learning_rate": 4.435975609756098e-05, "loss": 0.7721, "step": 3090 }, { "epoch": 3.9641943734015346, "grad_norm": 2.3193609714508057, "learning_rate": 4.433797909407666e-05, "loss": 0.8792, "step": 3100 }, { "epoch": 3.976982097186701, "grad_norm": 13.679859161376953, "learning_rate": 4.4316202090592335e-05, "loss": 0.8644, "step": 3110 }, { "epoch": 3.9897698209718673, "grad_norm": 1.552064299583435, "learning_rate": 4.429442508710802e-05, "loss": 0.711, "step": 3120 }, { "epoch": 4.0, "eval_loss": 0.3499666452407837, "eval_runtime": 0.9683, "eval_samples_per_second": 101.204, "eval_steps_per_second": 13.425, "step": 3128 }, { "epoch": 4.002557544757034, "grad_norm": 3.0008726119995117, "learning_rate": 4.42726480836237e-05, "loss": 0.9301, "step": 3130 }, { "epoch": 4.015345268542199, "grad_norm": 6.133752822875977, "learning_rate": 4.4250871080139374e-05, "loss": 0.7322, "step": 3140 }, { "epoch": 4.028132992327365, "grad_norm": 7.614703178405762, "learning_rate": 4.4229094076655055e-05, "loss": 1.0492, "step": 3150 }, { "epoch": 4.040920716112532, "grad_norm": 2.1636641025543213, "learning_rate": 4.420731707317074e-05, "loss": 1.1332, "step": 3160 }, { "epoch": 4.053708439897698, "grad_norm": 9.834653854370117, "learning_rate": 4.418554006968641e-05, "loss": 0.6502, "step": 3170 }, { "epoch": 4.0664961636828645, "grad_norm": 1.8648459911346436, "learning_rate": 4.4163763066202094e-05, "loss": 0.5078, "step": 3180 }, { "epoch": 4.079283887468031, "grad_norm": 8.35239315032959, "learning_rate": 4.414198606271777e-05, "loss": 0.7993, "step": 3190 }, { "epoch": 4.092071611253197, "grad_norm": 0.7734344601631165, "learning_rate": 4.412020905923345e-05, "loss": 0.8665, "step": 3200 }, { "epoch": 4.1048593350383635, "grad_norm": 1.2159333229064941, "learning_rate": 4.409843205574913e-05, "loss": 0.8147, "step": 3210 }, { "epoch": 4.117647058823529, "grad_norm": 14.401145935058594, "learning_rate": 4.407665505226481e-05, "loss": 0.6267, "step": 3220 }, { "epoch": 4.130434782608695, "grad_norm": 9.61027717590332, "learning_rate": 4.405487804878049e-05, "loss": 0.7327, "step": 3230 }, { "epoch": 4.143222506393862, "grad_norm": 15.293912887573242, "learning_rate": 4.403310104529617e-05, "loss": 1.3573, "step": 3240 }, { "epoch": 4.156010230179028, "grad_norm": 4.653273582458496, "learning_rate": 4.4011324041811846e-05, "loss": 0.5447, "step": 3250 }, { "epoch": 4.168797953964194, "grad_norm": 3.500906467437744, "learning_rate": 4.398954703832753e-05, "loss": 0.9046, "step": 3260 }, { "epoch": 4.181585677749361, "grad_norm": 13.659477233886719, "learning_rate": 4.396777003484321e-05, "loss": 0.6891, "step": 3270 }, { "epoch": 4.194373401534527, "grad_norm": 0.9972584247589111, "learning_rate": 4.394599303135889e-05, "loss": 0.7421, "step": 3280 }, { "epoch": 4.207161125319693, "grad_norm": 6.448363304138184, "learning_rate": 4.3924216027874566e-05, "loss": 0.6541, "step": 3290 }, { "epoch": 4.21994884910486, "grad_norm": 2.5608551502227783, "learning_rate": 4.390243902439025e-05, "loss": 0.3677, "step": 3300 }, { "epoch": 4.232736572890025, "grad_norm": 6.504319667816162, "learning_rate": 4.388066202090593e-05, "loss": 0.5709, "step": 3310 }, { "epoch": 4.245524296675192, "grad_norm": 9.659370422363281, "learning_rate": 4.3858885017421604e-05, "loss": 0.7813, "step": 3320 }, { "epoch": 4.258312020460358, "grad_norm": 3.0793986320495605, "learning_rate": 4.3837108013937286e-05, "loss": 0.5772, "step": 3330 }, { "epoch": 4.271099744245524, "grad_norm": 1.2446378469467163, "learning_rate": 4.381533101045297e-05, "loss": 0.6206, "step": 3340 }, { "epoch": 4.283887468030691, "grad_norm": 0.565462589263916, "learning_rate": 4.379355400696864e-05, "loss": 0.6753, "step": 3350 }, { "epoch": 4.296675191815857, "grad_norm": 1.6904747486114502, "learning_rate": 4.377177700348432e-05, "loss": 0.7721, "step": 3360 }, { "epoch": 4.309462915601023, "grad_norm": 2.9521095752716064, "learning_rate": 4.375e-05, "loss": 0.3611, "step": 3370 }, { "epoch": 4.322250639386189, "grad_norm": 6.3527703285217285, "learning_rate": 4.372822299651568e-05, "loss": 0.5197, "step": 3380 }, { "epoch": 4.335038363171355, "grad_norm": 3.786268949508667, "learning_rate": 4.3706445993031356e-05, "loss": 0.6294, "step": 3390 }, { "epoch": 4.3478260869565215, "grad_norm": 0.7279968857765198, "learning_rate": 4.368466898954704e-05, "loss": 1.0253, "step": 3400 }, { "epoch": 4.360613810741688, "grad_norm": 0.4943724274635315, "learning_rate": 4.366289198606272e-05, "loss": 0.668, "step": 3410 }, { "epoch": 4.373401534526854, "grad_norm": 5.150463581085205, "learning_rate": 4.36411149825784e-05, "loss": 0.7717, "step": 3420 }, { "epoch": 4.3861892583120206, "grad_norm": 8.849224090576172, "learning_rate": 4.3619337979094076e-05, "loss": 0.713, "step": 3430 }, { "epoch": 4.398976982097187, "grad_norm": 2.910844087600708, "learning_rate": 4.359756097560976e-05, "loss": 0.7032, "step": 3440 }, { "epoch": 4.411764705882353, "grad_norm": 3.0307321548461914, "learning_rate": 4.357578397212544e-05, "loss": 0.5685, "step": 3450 }, { "epoch": 4.42455242966752, "grad_norm": 11.50002670288086, "learning_rate": 4.3554006968641115e-05, "loss": 0.4806, "step": 3460 }, { "epoch": 4.437340153452685, "grad_norm": 9.223997116088867, "learning_rate": 4.35322299651568e-05, "loss": 0.4831, "step": 3470 }, { "epoch": 4.450127877237851, "grad_norm": 4.3660197257995605, "learning_rate": 4.351045296167248e-05, "loss": 1.1399, "step": 3480 }, { "epoch": 4.462915601023018, "grad_norm": 9.756316184997559, "learning_rate": 4.348867595818816e-05, "loss": 0.8305, "step": 3490 }, { "epoch": 4.475703324808184, "grad_norm": 4.104464530944824, "learning_rate": 4.3466898954703835e-05, "loss": 0.2647, "step": 3500 }, { "epoch": 4.4884910485933505, "grad_norm": 8.359086036682129, "learning_rate": 4.344512195121952e-05, "loss": 0.9752, "step": 3510 }, { "epoch": 4.501278772378517, "grad_norm": 2.668789863586426, "learning_rate": 4.342334494773519e-05, "loss": 0.5549, "step": 3520 }, { "epoch": 4.514066496163683, "grad_norm": 11.178609848022461, "learning_rate": 4.3401567944250874e-05, "loss": 0.8338, "step": 3530 }, { "epoch": 4.526854219948849, "grad_norm": 29.559751510620117, "learning_rate": 4.337979094076655e-05, "loss": 0.7025, "step": 3540 }, { "epoch": 4.539641943734015, "grad_norm": 13.457611083984375, "learning_rate": 4.335801393728223e-05, "loss": 0.3561, "step": 3550 }, { "epoch": 4.552429667519181, "grad_norm": 8.7972993850708, "learning_rate": 4.333623693379791e-05, "loss": 0.8934, "step": 3560 }, { "epoch": 4.565217391304348, "grad_norm": 4.532437324523926, "learning_rate": 4.331445993031359e-05, "loss": 0.6173, "step": 3570 }, { "epoch": 4.578005115089514, "grad_norm": 9.719355583190918, "learning_rate": 4.329268292682927e-05, "loss": 0.5608, "step": 3580 }, { "epoch": 4.59079283887468, "grad_norm": 10.024927139282227, "learning_rate": 4.327090592334495e-05, "loss": 1.2261, "step": 3590 }, { "epoch": 4.603580562659847, "grad_norm": 8.742612838745117, "learning_rate": 4.324912891986063e-05, "loss": 0.6297, "step": 3600 }, { "epoch": 4.616368286445013, "grad_norm": 4.9773077964782715, "learning_rate": 4.322735191637631e-05, "loss": 0.3228, "step": 3610 }, { "epoch": 4.629156010230179, "grad_norm": 18.090656280517578, "learning_rate": 4.320557491289199e-05, "loss": 0.6028, "step": 3620 }, { "epoch": 4.641943734015345, "grad_norm": 10.379446029663086, "learning_rate": 4.318379790940767e-05, "loss": 0.9047, "step": 3630 }, { "epoch": 4.654731457800511, "grad_norm": 10.179900169372559, "learning_rate": 4.3162020905923346e-05, "loss": 0.4727, "step": 3640 }, { "epoch": 4.667519181585678, "grad_norm": 9.0844144821167, "learning_rate": 4.314024390243903e-05, "loss": 0.8556, "step": 3650 }, { "epoch": 4.680306905370844, "grad_norm": 8.229222297668457, "learning_rate": 4.311846689895471e-05, "loss": 0.5654, "step": 3660 }, { "epoch": 4.69309462915601, "grad_norm": 2.8442490100860596, "learning_rate": 4.309668989547039e-05, "loss": 0.3812, "step": 3670 }, { "epoch": 4.705882352941177, "grad_norm": 14.046589851379395, "learning_rate": 4.3074912891986066e-05, "loss": 1.0255, "step": 3680 }, { "epoch": 4.718670076726343, "grad_norm": 1.3973981142044067, "learning_rate": 4.305313588850175e-05, "loss": 0.6829, "step": 3690 }, { "epoch": 4.731457800511509, "grad_norm": 4.366006374359131, "learning_rate": 4.303135888501742e-05, "loss": 0.7007, "step": 3700 }, { "epoch": 4.744245524296675, "grad_norm": 9.728074073791504, "learning_rate": 4.30095818815331e-05, "loss": 0.6583, "step": 3710 }, { "epoch": 4.757033248081841, "grad_norm": 7.475890159606934, "learning_rate": 4.298780487804878e-05, "loss": 0.7781, "step": 3720 }, { "epoch": 4.7698209718670075, "grad_norm": 0.4485682249069214, "learning_rate": 4.296602787456446e-05, "loss": 0.6201, "step": 3730 }, { "epoch": 4.782608695652174, "grad_norm": 13.75088882446289, "learning_rate": 4.294425087108014e-05, "loss": 0.8055, "step": 3740 }, { "epoch": 4.79539641943734, "grad_norm": 10.574711799621582, "learning_rate": 4.292247386759582e-05, "loss": 0.6285, "step": 3750 }, { "epoch": 4.8081841432225065, "grad_norm": 24.870988845825195, "learning_rate": 4.29006968641115e-05, "loss": 0.3683, "step": 3760 }, { "epoch": 4.820971867007673, "grad_norm": 2.3566386699676514, "learning_rate": 4.287891986062718e-05, "loss": 0.397, "step": 3770 }, { "epoch": 4.833759590792839, "grad_norm": 0.8658470511436462, "learning_rate": 4.2857142857142856e-05, "loss": 0.4488, "step": 3780 }, { "epoch": 4.846547314578006, "grad_norm": 10.955270767211914, "learning_rate": 4.283536585365854e-05, "loss": 0.8643, "step": 3790 }, { "epoch": 4.859335038363171, "grad_norm": 0.5680179595947266, "learning_rate": 4.281358885017422e-05, "loss": 0.6409, "step": 3800 }, { "epoch": 4.872122762148337, "grad_norm": 0.8693060874938965, "learning_rate": 4.27918118466899e-05, "loss": 0.6387, "step": 3810 }, { "epoch": 4.884910485933504, "grad_norm": 7.04971170425415, "learning_rate": 4.2770034843205577e-05, "loss": 0.8097, "step": 3820 }, { "epoch": 4.89769820971867, "grad_norm": 11.212493896484375, "learning_rate": 4.274825783972126e-05, "loss": 0.6476, "step": 3830 }, { "epoch": 4.910485933503836, "grad_norm": 3.368633270263672, "learning_rate": 4.272648083623694e-05, "loss": 0.7214, "step": 3840 }, { "epoch": 4.923273657289003, "grad_norm": 3.019455671310425, "learning_rate": 4.2704703832752615e-05, "loss": 0.3225, "step": 3850 }, { "epoch": 4.936061381074169, "grad_norm": 2.099149465560913, "learning_rate": 4.26829268292683e-05, "loss": 0.4397, "step": 3860 }, { "epoch": 4.948849104859335, "grad_norm": 3.1926653385162354, "learning_rate": 4.266114982578397e-05, "loss": 0.522, "step": 3870 }, { "epoch": 4.961636828644501, "grad_norm": 9.113776206970215, "learning_rate": 4.2639372822299653e-05, "loss": 0.6778, "step": 3880 }, { "epoch": 4.974424552429667, "grad_norm": 0.5909568071365356, "learning_rate": 4.261759581881533e-05, "loss": 0.7403, "step": 3890 }, { "epoch": 4.987212276214834, "grad_norm": 7.393810749053955, "learning_rate": 4.259581881533101e-05, "loss": 0.6318, "step": 3900 }, { "epoch": 5.0, "grad_norm": 8.818605422973633, "learning_rate": 4.257404181184669e-05, "loss": 0.5702, "step": 3910 }, { "epoch": 5.0, "eval_loss": 0.2965039610862732, "eval_runtime": 0.8112, "eval_samples_per_second": 120.812, "eval_steps_per_second": 16.026, "step": 3910 }, { "epoch": 5.012787723785166, "grad_norm": 6.940534591674805, "learning_rate": 4.255226480836237e-05, "loss": 0.6806, "step": 3920 }, { "epoch": 5.025575447570333, "grad_norm": 0.07264159619808197, "learning_rate": 4.253048780487805e-05, "loss": 0.7137, "step": 3930 }, { "epoch": 5.038363171355499, "grad_norm": 0.6328474283218384, "learning_rate": 4.250871080139373e-05, "loss": 0.5638, "step": 3940 }, { "epoch": 5.051150895140665, "grad_norm": 0.7308364510536194, "learning_rate": 4.248693379790941e-05, "loss": 0.7956, "step": 3950 }, { "epoch": 5.063938618925831, "grad_norm": 1.2225631475448608, "learning_rate": 4.246515679442509e-05, "loss": 0.5388, "step": 3960 }, { "epoch": 5.076726342710997, "grad_norm": 3.707005023956299, "learning_rate": 4.244337979094077e-05, "loss": 0.4415, "step": 3970 }, { "epoch": 5.089514066496164, "grad_norm": 5.035332679748535, "learning_rate": 4.242160278745645e-05, "loss": 0.3167, "step": 3980 }, { "epoch": 5.10230179028133, "grad_norm": 8.26025676727295, "learning_rate": 4.239982578397213e-05, "loss": 0.3634, "step": 3990 }, { "epoch": 5.115089514066496, "grad_norm": 0.4538026452064514, "learning_rate": 4.237804878048781e-05, "loss": 0.4083, "step": 4000 }, { "epoch": 5.127877237851663, "grad_norm": 2.740149736404419, "learning_rate": 4.235627177700349e-05, "loss": 0.1352, "step": 4010 }, { "epoch": 5.140664961636829, "grad_norm": 2.6353447437286377, "learning_rate": 4.233449477351917e-05, "loss": 0.5863, "step": 4020 }, { "epoch": 5.153452685421995, "grad_norm": 2.091527223587036, "learning_rate": 4.2312717770034846e-05, "loss": 0.398, "step": 4030 }, { "epoch": 5.166240409207161, "grad_norm": 2.8681185245513916, "learning_rate": 4.229094076655053e-05, "loss": 0.4215, "step": 4040 }, { "epoch": 5.179028132992327, "grad_norm": 8.741495132446289, "learning_rate": 4.22691637630662e-05, "loss": 0.6843, "step": 4050 }, { "epoch": 5.1918158567774935, "grad_norm": 9.90984058380127, "learning_rate": 4.2247386759581884e-05, "loss": 0.9233, "step": 4060 }, { "epoch": 5.20460358056266, "grad_norm": 2.038926601409912, "learning_rate": 4.222560975609756e-05, "loss": 0.5204, "step": 4070 }, { "epoch": 5.217391304347826, "grad_norm": 0.1147674098610878, "learning_rate": 4.220383275261324e-05, "loss": 0.342, "step": 4080 }, { "epoch": 5.2301790281329925, "grad_norm": 51.04707717895508, "learning_rate": 4.218205574912892e-05, "loss": 0.969, "step": 4090 }, { "epoch": 5.242966751918159, "grad_norm": 0.40241485834121704, "learning_rate": 4.21602787456446e-05, "loss": 0.3971, "step": 4100 }, { "epoch": 5.255754475703325, "grad_norm": 1.017622709274292, "learning_rate": 4.213850174216028e-05, "loss": 0.6763, "step": 4110 }, { "epoch": 5.268542199488491, "grad_norm": 11.975500106811523, "learning_rate": 4.211672473867596e-05, "loss": 0.9518, "step": 4120 }, { "epoch": 5.281329923273657, "grad_norm": 8.183752059936523, "learning_rate": 4.209494773519164e-05, "loss": 0.5027, "step": 4130 }, { "epoch": 5.294117647058823, "grad_norm": 2.0323333740234375, "learning_rate": 4.207317073170732e-05, "loss": 0.5327, "step": 4140 }, { "epoch": 5.30690537084399, "grad_norm": 9.58705997467041, "learning_rate": 4.2051393728223e-05, "loss": 0.7124, "step": 4150 }, { "epoch": 5.319693094629156, "grad_norm": 2.840604066848755, "learning_rate": 4.202961672473868e-05, "loss": 0.5643, "step": 4160 }, { "epoch": 5.332480818414322, "grad_norm": 4.099714756011963, "learning_rate": 4.2007839721254356e-05, "loss": 0.9582, "step": 4170 }, { "epoch": 5.345268542199489, "grad_norm": 10.31187629699707, "learning_rate": 4.198606271777004e-05, "loss": 0.494, "step": 4180 }, { "epoch": 5.358056265984655, "grad_norm": 3.8303794860839844, "learning_rate": 4.196428571428572e-05, "loss": 0.4447, "step": 4190 }, { "epoch": 5.370843989769821, "grad_norm": 7.313268661499023, "learning_rate": 4.1942508710801395e-05, "loss": 0.5681, "step": 4200 }, { "epoch": 5.383631713554987, "grad_norm": 9.197836875915527, "learning_rate": 4.1920731707317077e-05, "loss": 1.0159, "step": 4210 }, { "epoch": 5.396419437340153, "grad_norm": 3.0132012367248535, "learning_rate": 4.189895470383275e-05, "loss": 0.533, "step": 4220 }, { "epoch": 5.40920716112532, "grad_norm": 9.22429084777832, "learning_rate": 4.187717770034843e-05, "loss": 0.5966, "step": 4230 }, { "epoch": 5.421994884910486, "grad_norm": 13.826210975646973, "learning_rate": 4.185540069686411e-05, "loss": 0.4478, "step": 4240 }, { "epoch": 5.434782608695652, "grad_norm": 11.591147422790527, "learning_rate": 4.183362369337979e-05, "loss": 0.7064, "step": 4250 }, { "epoch": 5.447570332480819, "grad_norm": 12.315413475036621, "learning_rate": 4.181184668989547e-05, "loss": 1.0166, "step": 4260 }, { "epoch": 5.460358056265985, "grad_norm": 10.030841827392578, "learning_rate": 4.1790069686411153e-05, "loss": 0.537, "step": 4270 }, { "epoch": 5.4731457800511505, "grad_norm": 11.139789581298828, "learning_rate": 4.176829268292683e-05, "loss": 0.6873, "step": 4280 }, { "epoch": 5.485933503836317, "grad_norm": 0.15211737155914307, "learning_rate": 4.174651567944251e-05, "loss": 0.5206, "step": 4290 }, { "epoch": 5.498721227621483, "grad_norm": 12.38772964477539, "learning_rate": 4.172473867595819e-05, "loss": 0.4893, "step": 4300 }, { "epoch": 5.5115089514066495, "grad_norm": 8.11055850982666, "learning_rate": 4.170296167247387e-05, "loss": 0.424, "step": 4310 }, { "epoch": 5.524296675191816, "grad_norm": 18.673229217529297, "learning_rate": 4.168118466898955e-05, "loss": 0.8048, "step": 4320 }, { "epoch": 5.537084398976982, "grad_norm": 15.203030586242676, "learning_rate": 4.165940766550523e-05, "loss": 0.9721, "step": 4330 }, { "epoch": 5.549872122762149, "grad_norm": 5.294841766357422, "learning_rate": 4.163763066202091e-05, "loss": 0.7572, "step": 4340 }, { "epoch": 5.562659846547315, "grad_norm": 3.338132858276367, "learning_rate": 4.161585365853659e-05, "loss": 0.911, "step": 4350 }, { "epoch": 5.57544757033248, "grad_norm": 0.901577889919281, "learning_rate": 4.159407665505227e-05, "loss": 0.7884, "step": 4360 }, { "epoch": 5.588235294117647, "grad_norm": 3.517756223678589, "learning_rate": 4.157229965156795e-05, "loss": 0.324, "step": 4370 }, { "epoch": 5.601023017902813, "grad_norm": 0.7338356971740723, "learning_rate": 4.1550522648083626e-05, "loss": 0.4658, "step": 4380 }, { "epoch": 5.6138107416879794, "grad_norm": 3.6566998958587646, "learning_rate": 4.152874564459931e-05, "loss": 0.9993, "step": 4390 }, { "epoch": 5.626598465473146, "grad_norm": 2.2101399898529053, "learning_rate": 4.150696864111498e-05, "loss": 0.4704, "step": 4400 }, { "epoch": 5.639386189258312, "grad_norm": 4.851430892944336, "learning_rate": 4.1485191637630664e-05, "loss": 0.4984, "step": 4410 }, { "epoch": 5.6521739130434785, "grad_norm": 5.132842063903809, "learning_rate": 4.146341463414634e-05, "loss": 0.3548, "step": 4420 }, { "epoch": 5.664961636828645, "grad_norm": 4.944459438323975, "learning_rate": 4.144163763066202e-05, "loss": 0.556, "step": 4430 }, { "epoch": 5.677749360613811, "grad_norm": 2.2115225791931152, "learning_rate": 4.14198606271777e-05, "loss": 0.6975, "step": 4440 }, { "epoch": 5.690537084398977, "grad_norm": 2.7226309776306152, "learning_rate": 4.1398083623693384e-05, "loss": 0.4694, "step": 4450 }, { "epoch": 5.703324808184143, "grad_norm": 7.218973159790039, "learning_rate": 4.137630662020906e-05, "loss": 0.3918, "step": 4460 }, { "epoch": 5.716112531969309, "grad_norm": 0.470683753490448, "learning_rate": 4.135452961672474e-05, "loss": 0.719, "step": 4470 }, { "epoch": 5.728900255754476, "grad_norm": 0.4735340178012848, "learning_rate": 4.133275261324042e-05, "loss": 0.4324, "step": 4480 }, { "epoch": 5.741687979539642, "grad_norm": 2.4921889305114746, "learning_rate": 4.13109756097561e-05, "loss": 0.6334, "step": 4490 }, { "epoch": 5.754475703324808, "grad_norm": 7.904917240142822, "learning_rate": 4.128919860627178e-05, "loss": 0.7718, "step": 4500 }, { "epoch": 5.767263427109975, "grad_norm": 5.907204627990723, "learning_rate": 4.126742160278746e-05, "loss": 0.6752, "step": 4510 }, { "epoch": 5.78005115089514, "grad_norm": 22.04534912109375, "learning_rate": 4.124564459930314e-05, "loss": 0.5174, "step": 4520 }, { "epoch": 5.792838874680307, "grad_norm": 5.223736763000488, "learning_rate": 4.122386759581882e-05, "loss": 0.4966, "step": 4530 }, { "epoch": 5.805626598465473, "grad_norm": 1.3607535362243652, "learning_rate": 4.12020905923345e-05, "loss": 0.3876, "step": 4540 }, { "epoch": 5.818414322250639, "grad_norm": 3.3651671409606934, "learning_rate": 4.1180313588850175e-05, "loss": 0.4492, "step": 4550 }, { "epoch": 5.831202046035806, "grad_norm": 3.5978453159332275, "learning_rate": 4.1158536585365856e-05, "loss": 0.5008, "step": 4560 }, { "epoch": 5.843989769820972, "grad_norm": 15.397109031677246, "learning_rate": 4.113675958188153e-05, "loss": 0.6904, "step": 4570 }, { "epoch": 5.856777493606138, "grad_norm": 4.293745040893555, "learning_rate": 4.111498257839721e-05, "loss": 0.3919, "step": 4580 }, { "epoch": 5.869565217391305, "grad_norm": 0.40358975529670715, "learning_rate": 4.1093205574912895e-05, "loss": 0.372, "step": 4590 }, { "epoch": 5.882352941176471, "grad_norm": 6.08713960647583, "learning_rate": 4.107142857142857e-05, "loss": 0.4161, "step": 4600 }, { "epoch": 5.8951406649616365, "grad_norm": 14.7813138961792, "learning_rate": 4.104965156794425e-05, "loss": 0.7926, "step": 4610 }, { "epoch": 5.907928388746803, "grad_norm": 1.5015966892242432, "learning_rate": 4.102787456445993e-05, "loss": 0.564, "step": 4620 }, { "epoch": 5.920716112531969, "grad_norm": 5.556253910064697, "learning_rate": 4.100609756097561e-05, "loss": 0.4083, "step": 4630 }, { "epoch": 5.9335038363171355, "grad_norm": 4.575002670288086, "learning_rate": 4.098432055749129e-05, "loss": 0.7552, "step": 4640 }, { "epoch": 5.946291560102302, "grad_norm": 7.121840476989746, "learning_rate": 4.096254355400697e-05, "loss": 0.2914, "step": 4650 }, { "epoch": 5.959079283887468, "grad_norm": 8.14986801147461, "learning_rate": 4.0940766550522653e-05, "loss": 0.5904, "step": 4660 }, { "epoch": 5.971867007672635, "grad_norm": 0.0028777301777154207, "learning_rate": 4.091898954703833e-05, "loss": 0.6281, "step": 4670 }, { "epoch": 5.9846547314578, "grad_norm": 0.5826151371002197, "learning_rate": 4.089721254355401e-05, "loss": 0.6602, "step": 4680 }, { "epoch": 5.997442455242966, "grad_norm": 3.5384085178375244, "learning_rate": 4.087543554006969e-05, "loss": 0.6806, "step": 4690 }, { "epoch": 6.0, "eval_loss": 0.2815360426902771, "eval_runtime": 0.985, "eval_samples_per_second": 99.492, "eval_steps_per_second": 13.198, "step": 4692 }, { "epoch": 6.010230179028133, "grad_norm": 4.524169921875, "learning_rate": 4.085365853658537e-05, "loss": 0.4968, "step": 4700 }, { "epoch": 6.023017902813299, "grad_norm": 7.202708721160889, "learning_rate": 4.083188153310105e-05, "loss": 0.3843, "step": 4710 }, { "epoch": 6.035805626598465, "grad_norm": 0.15170826017856598, "learning_rate": 4.081010452961673e-05, "loss": 0.5316, "step": 4720 }, { "epoch": 6.048593350383632, "grad_norm": 0.06313107907772064, "learning_rate": 4.0788327526132405e-05, "loss": 0.5724, "step": 4730 }, { "epoch": 6.061381074168798, "grad_norm": 0.613458514213562, "learning_rate": 4.076655052264808e-05, "loss": 0.3703, "step": 4740 }, { "epoch": 6.0741687979539645, "grad_norm": 5.777957916259766, "learning_rate": 4.074477351916376e-05, "loss": 0.9007, "step": 4750 }, { "epoch": 6.086956521739131, "grad_norm": 2.215762138366699, "learning_rate": 4.0722996515679444e-05, "loss": 0.2629, "step": 4760 }, { "epoch": 6.099744245524296, "grad_norm": 66.70785522460938, "learning_rate": 4.070121951219512e-05, "loss": 0.682, "step": 4770 }, { "epoch": 6.112531969309463, "grad_norm": 0.04602469503879547, "learning_rate": 4.06794425087108e-05, "loss": 0.4461, "step": 4780 }, { "epoch": 6.125319693094629, "grad_norm": 4.179677486419678, "learning_rate": 4.065766550522648e-05, "loss": 0.7186, "step": 4790 }, { "epoch": 6.138107416879795, "grad_norm": 1.071020483970642, "learning_rate": 4.0635888501742164e-05, "loss": 0.1355, "step": 4800 }, { "epoch": 6.150895140664962, "grad_norm": 1.201188564300537, "learning_rate": 4.061411149825784e-05, "loss": 0.2015, "step": 4810 }, { "epoch": 6.163682864450128, "grad_norm": 3.307131290435791, "learning_rate": 4.059233449477352e-05, "loss": 0.3906, "step": 4820 }, { "epoch": 6.176470588235294, "grad_norm": 0.4368687868118286, "learning_rate": 4.05705574912892e-05, "loss": 0.8862, "step": 4830 }, { "epoch": 6.189258312020461, "grad_norm": 8.978657722473145, "learning_rate": 4.0548780487804884e-05, "loss": 0.5239, "step": 4840 }, { "epoch": 6.202046035805626, "grad_norm": 9.51962661743164, "learning_rate": 4.052700348432056e-05, "loss": 0.2829, "step": 4850 }, { "epoch": 6.2148337595907925, "grad_norm": 5.0570807456970215, "learning_rate": 4.050522648083624e-05, "loss": 0.743, "step": 4860 }, { "epoch": 6.227621483375959, "grad_norm": 10.152450561523438, "learning_rate": 4.048344947735192e-05, "loss": 1.0856, "step": 4870 }, { "epoch": 6.240409207161125, "grad_norm": 4.6442084312438965, "learning_rate": 4.04616724738676e-05, "loss": 0.3863, "step": 4880 }, { "epoch": 6.253196930946292, "grad_norm": 5.835144996643066, "learning_rate": 4.043989547038328e-05, "loss": 0.9037, "step": 4890 }, { "epoch": 6.265984654731458, "grad_norm": 5.151255130767822, "learning_rate": 4.0418118466898954e-05, "loss": 0.6878, "step": 4900 }, { "epoch": 6.278772378516624, "grad_norm": 0.5118243098258972, "learning_rate": 4.0396341463414636e-05, "loss": 0.8633, "step": 4910 }, { "epoch": 6.291560102301791, "grad_norm": 6.059098243713379, "learning_rate": 4.037456445993031e-05, "loss": 0.4825, "step": 4920 }, { "epoch": 6.304347826086957, "grad_norm": 0.20288658142089844, "learning_rate": 4.035278745644599e-05, "loss": 0.3513, "step": 4930 }, { "epoch": 6.3171355498721224, "grad_norm": 0.656362771987915, "learning_rate": 4.0331010452961675e-05, "loss": 0.5211, "step": 4940 }, { "epoch": 6.329923273657289, "grad_norm": 8.95826244354248, "learning_rate": 4.030923344947735e-05, "loss": 0.5175, "step": 4950 }, { "epoch": 6.342710997442455, "grad_norm": 0.8376319408416748, "learning_rate": 4.028745644599303e-05, "loss": 0.2661, "step": 4960 }, { "epoch": 6.3554987212276215, "grad_norm": 2.73271107673645, "learning_rate": 4.026567944250871e-05, "loss": 0.3016, "step": 4970 }, { "epoch": 6.368286445012788, "grad_norm": 8.35566520690918, "learning_rate": 4.0243902439024395e-05, "loss": 0.3719, "step": 4980 }, { "epoch": 6.381074168797954, "grad_norm": 5.748067378997803, "learning_rate": 4.022212543554007e-05, "loss": 0.5457, "step": 4990 }, { "epoch": 6.3938618925831205, "grad_norm": 3.471608877182007, "learning_rate": 4.020034843205575e-05, "loss": 0.6285, "step": 5000 }, { "epoch": 6.406649616368286, "grad_norm": 0.2684521973133087, "learning_rate": 4.017857142857143e-05, "loss": 0.2251, "step": 5010 }, { "epoch": 6.419437340153452, "grad_norm": 9.472249984741211, "learning_rate": 4.015679442508711e-05, "loss": 0.5518, "step": 5020 }, { "epoch": 6.432225063938619, "grad_norm": 0.46002939343452454, "learning_rate": 4.013501742160279e-05, "loss": 0.6917, "step": 5030 }, { "epoch": 6.445012787723785, "grad_norm": 6.503478527069092, "learning_rate": 4.011324041811847e-05, "loss": 0.8197, "step": 5040 }, { "epoch": 6.457800511508951, "grad_norm": 7.9507036209106445, "learning_rate": 4.0091463414634153e-05, "loss": 0.4216, "step": 5050 }, { "epoch": 6.470588235294118, "grad_norm": 7.5792412757873535, "learning_rate": 4.006968641114983e-05, "loss": 0.714, "step": 5060 }, { "epoch": 6.483375959079284, "grad_norm": 3.8649039268493652, "learning_rate": 4.004790940766551e-05, "loss": 0.9393, "step": 5070 }, { "epoch": 6.4961636828644505, "grad_norm": 10.020180702209473, "learning_rate": 4.0026132404181185e-05, "loss": 0.5577, "step": 5080 }, { "epoch": 6.508951406649617, "grad_norm": 8.545390129089355, "learning_rate": 4.000435540069686e-05, "loss": 0.7541, "step": 5090 }, { "epoch": 6.521739130434782, "grad_norm": 8.494593620300293, "learning_rate": 3.998257839721254e-05, "loss": 0.948, "step": 5100 }, { "epoch": 6.534526854219949, "grad_norm": 4.0168352127075195, "learning_rate": 3.9960801393728224e-05, "loss": 0.428, "step": 5110 }, { "epoch": 6.547314578005115, "grad_norm": 4.527425289154053, "learning_rate": 3.9939024390243905e-05, "loss": 0.598, "step": 5120 }, { "epoch": 6.560102301790281, "grad_norm": 6.2703537940979, "learning_rate": 3.991724738675958e-05, "loss": 0.5118, "step": 5130 }, { "epoch": 6.572890025575448, "grad_norm": 3.926626682281494, "learning_rate": 3.989547038327526e-05, "loss": 0.6562, "step": 5140 }, { "epoch": 6.585677749360614, "grad_norm": 1.1774628162384033, "learning_rate": 3.9873693379790944e-05, "loss": 0.5402, "step": 5150 }, { "epoch": 6.59846547314578, "grad_norm": 4.933875560760498, "learning_rate": 3.985191637630662e-05, "loss": 0.4469, "step": 5160 }, { "epoch": 6.611253196930946, "grad_norm": 4.4834184646606445, "learning_rate": 3.98301393728223e-05, "loss": 0.5847, "step": 5170 }, { "epoch": 6.624040920716112, "grad_norm": 0.006693384610116482, "learning_rate": 3.980836236933798e-05, "loss": 0.245, "step": 5180 }, { "epoch": 6.6368286445012785, "grad_norm": 5.899873733520508, "learning_rate": 3.9786585365853664e-05, "loss": 0.4535, "step": 5190 }, { "epoch": 6.649616368286445, "grad_norm": 8.414173126220703, "learning_rate": 3.976480836236934e-05, "loss": 0.8365, "step": 5200 }, { "epoch": 6.662404092071611, "grad_norm": 5.519989967346191, "learning_rate": 3.974303135888502e-05, "loss": 0.4639, "step": 5210 }, { "epoch": 6.675191815856778, "grad_norm": 3.984368085861206, "learning_rate": 3.97212543554007e-05, "loss": 0.4365, "step": 5220 }, { "epoch": 6.687979539641944, "grad_norm": 7.652017593383789, "learning_rate": 3.9699477351916384e-05, "loss": 0.3163, "step": 5230 }, { "epoch": 6.70076726342711, "grad_norm": 1.4251798391342163, "learning_rate": 3.967770034843206e-05, "loss": 0.4581, "step": 5240 }, { "epoch": 6.713554987212277, "grad_norm": 4.420569896697998, "learning_rate": 3.9655923344947734e-05, "loss": 0.6572, "step": 5250 }, { "epoch": 6.726342710997442, "grad_norm": 5.152069091796875, "learning_rate": 3.9634146341463416e-05, "loss": 0.1541, "step": 5260 }, { "epoch": 6.739130434782608, "grad_norm": 3.8654263019561768, "learning_rate": 3.961236933797909e-05, "loss": 0.4638, "step": 5270 }, { "epoch": 6.751918158567775, "grad_norm": 2.7325217723846436, "learning_rate": 3.959059233449477e-05, "loss": 0.3195, "step": 5280 }, { "epoch": 6.764705882352941, "grad_norm": 1.6292638778686523, "learning_rate": 3.9568815331010454e-05, "loss": 0.3065, "step": 5290 }, { "epoch": 6.7774936061381075, "grad_norm": 3.9360060691833496, "learning_rate": 3.9547038327526136e-05, "loss": 0.5371, "step": 5300 }, { "epoch": 6.790281329923274, "grad_norm": 8.34693431854248, "learning_rate": 3.952526132404181e-05, "loss": 0.6615, "step": 5310 }, { "epoch": 6.80306905370844, "grad_norm": 5.481771469116211, "learning_rate": 3.950348432055749e-05, "loss": 0.2692, "step": 5320 }, { "epoch": 6.8158567774936065, "grad_norm": 0.8621829152107239, "learning_rate": 3.9481707317073175e-05, "loss": 0.4334, "step": 5330 }, { "epoch": 6.828644501278772, "grad_norm": 2.479590654373169, "learning_rate": 3.945993031358885e-05, "loss": 0.5803, "step": 5340 }, { "epoch": 6.841432225063938, "grad_norm": 2.5847859382629395, "learning_rate": 3.943815331010453e-05, "loss": 0.2913, "step": 5350 }, { "epoch": 6.854219948849105, "grad_norm": 0.5854724645614624, "learning_rate": 3.941637630662021e-05, "loss": 0.494, "step": 5360 }, { "epoch": 6.867007672634271, "grad_norm": 0.038145843893289566, "learning_rate": 3.9394599303135895e-05, "loss": 0.5454, "step": 5370 }, { "epoch": 6.879795396419437, "grad_norm": 0.38679739832878113, "learning_rate": 3.937282229965157e-05, "loss": 0.2765, "step": 5380 }, { "epoch": 6.892583120204604, "grad_norm": 0.7759974598884583, "learning_rate": 3.935104529616725e-05, "loss": 0.5901, "step": 5390 }, { "epoch": 6.90537084398977, "grad_norm": 3.629659652709961, "learning_rate": 3.932926829268293e-05, "loss": 0.8057, "step": 5400 }, { "epoch": 6.918158567774936, "grad_norm": 0.9093683362007141, "learning_rate": 3.930749128919861e-05, "loss": 0.2888, "step": 5410 }, { "epoch": 6.930946291560103, "grad_norm": 1.096061110496521, "learning_rate": 3.928571428571429e-05, "loss": 0.6122, "step": 5420 }, { "epoch": 6.943734015345268, "grad_norm": 1.6472046375274658, "learning_rate": 3.9263937282229965e-05, "loss": 0.3792, "step": 5430 }, { "epoch": 6.956521739130435, "grad_norm": 12.249022483825684, "learning_rate": 3.924216027874565e-05, "loss": 0.8298, "step": 5440 }, { "epoch": 6.969309462915601, "grad_norm": 14.496586799621582, "learning_rate": 3.922038327526132e-05, "loss": 0.6759, "step": 5450 }, { "epoch": 6.982097186700767, "grad_norm": 7.142574310302734, "learning_rate": 3.9198606271777003e-05, "loss": 0.4921, "step": 5460 }, { "epoch": 6.994884910485934, "grad_norm": 1.1950016021728516, "learning_rate": 3.9176829268292685e-05, "loss": 0.3, "step": 5470 }, { "epoch": 7.0, "eval_loss": 0.2577835023403168, "eval_runtime": 0.976, "eval_samples_per_second": 100.408, "eval_steps_per_second": 13.319, "step": 5474 }, { "epoch": 7.0076726342711, "grad_norm": 1.785232424736023, "learning_rate": 3.915505226480836e-05, "loss": 0.2541, "step": 5480 }, { "epoch": 7.020460358056266, "grad_norm": 5.674078464508057, "learning_rate": 3.913327526132404e-05, "loss": 0.5632, "step": 5490 }, { "epoch": 7.033248081841432, "grad_norm": 4.966092109680176, "learning_rate": 3.9111498257839724e-05, "loss": 0.2459, "step": 5500 }, { "epoch": 7.046035805626598, "grad_norm": 5.083249092102051, "learning_rate": 3.9089721254355405e-05, "loss": 0.4098, "step": 5510 }, { "epoch": 7.0588235294117645, "grad_norm": 7.468497276306152, "learning_rate": 3.906794425087108e-05, "loss": 0.4966, "step": 5520 }, { "epoch": 7.071611253196931, "grad_norm": 2.9461817741394043, "learning_rate": 3.904616724738676e-05, "loss": 0.624, "step": 5530 }, { "epoch": 7.084398976982097, "grad_norm": 0.009090915322303772, "learning_rate": 3.9024390243902444e-05, "loss": 0.3501, "step": 5540 }, { "epoch": 7.0971867007672635, "grad_norm": 5.245519638061523, "learning_rate": 3.900261324041812e-05, "loss": 0.8218, "step": 5550 }, { "epoch": 7.10997442455243, "grad_norm": 5.124762535095215, "learning_rate": 3.89808362369338e-05, "loss": 0.2849, "step": 5560 }, { "epoch": 7.122762148337596, "grad_norm": 7.454417705535889, "learning_rate": 3.895905923344948e-05, "loss": 0.4737, "step": 5570 }, { "epoch": 7.135549872122763, "grad_norm": 5.743339538574219, "learning_rate": 3.8937282229965164e-05, "loss": 0.4892, "step": 5580 }, { "epoch": 7.148337595907928, "grad_norm": 6.936069488525391, "learning_rate": 3.891550522648084e-05, "loss": 0.2564, "step": 5590 }, { "epoch": 7.161125319693094, "grad_norm": 1.4387210607528687, "learning_rate": 3.8893728222996514e-05, "loss": 0.57, "step": 5600 }, { "epoch": 7.173913043478261, "grad_norm": 0.5554598569869995, "learning_rate": 3.8871951219512196e-05, "loss": 0.4544, "step": 5610 }, { "epoch": 7.186700767263427, "grad_norm": 0.05604790896177292, "learning_rate": 3.885017421602787e-05, "loss": 0.532, "step": 5620 }, { "epoch": 7.1994884910485935, "grad_norm": 1.4710054397583008, "learning_rate": 3.882839721254355e-05, "loss": 0.441, "step": 5630 }, { "epoch": 7.21227621483376, "grad_norm": 3.6691761016845703, "learning_rate": 3.8806620209059234e-05, "loss": 0.2938, "step": 5640 }, { "epoch": 7.225063938618926, "grad_norm": 0.6838319897651672, "learning_rate": 3.8784843205574916e-05, "loss": 0.4238, "step": 5650 }, { "epoch": 7.2378516624040925, "grad_norm": 1.6891676187515259, "learning_rate": 3.876306620209059e-05, "loss": 0.3278, "step": 5660 }, { "epoch": 7.250639386189258, "grad_norm": 1.7671685218811035, "learning_rate": 3.874128919860627e-05, "loss": 0.3804, "step": 5670 }, { "epoch": 7.263427109974424, "grad_norm": 0.5737038850784302, "learning_rate": 3.8719512195121954e-05, "loss": 0.6363, "step": 5680 }, { "epoch": 7.276214833759591, "grad_norm": 4.315406799316406, "learning_rate": 3.8697735191637636e-05, "loss": 0.4526, "step": 5690 }, { "epoch": 7.289002557544757, "grad_norm": 1.207388162612915, "learning_rate": 3.867595818815331e-05, "loss": 0.2925, "step": 5700 }, { "epoch": 7.301790281329923, "grad_norm": 4.993164539337158, "learning_rate": 3.865418118466899e-05, "loss": 0.4904, "step": 5710 }, { "epoch": 7.31457800511509, "grad_norm": 0.005563805811107159, "learning_rate": 3.8632404181184675e-05, "loss": 0.5732, "step": 5720 }, { "epoch": 7.327365728900256, "grad_norm": 5.448471546173096, "learning_rate": 3.861062717770035e-05, "loss": 0.281, "step": 5730 }, { "epoch": 7.340153452685422, "grad_norm": 1.2096017599105835, "learning_rate": 3.858885017421603e-05, "loss": 0.711, "step": 5740 }, { "epoch": 7.352941176470588, "grad_norm": 3.929307222366333, "learning_rate": 3.856707317073171e-05, "loss": 0.6578, "step": 5750 }, { "epoch": 7.365728900255754, "grad_norm": 3.536635160446167, "learning_rate": 3.854529616724739e-05, "loss": 0.3298, "step": 5760 }, { "epoch": 7.378516624040921, "grad_norm": 0.848988950252533, "learning_rate": 3.852351916376307e-05, "loss": 0.4895, "step": 5770 }, { "epoch": 7.391304347826087, "grad_norm": 8.26038646697998, "learning_rate": 3.8501742160278745e-05, "loss": 0.7074, "step": 5780 }, { "epoch": 7.404092071611253, "grad_norm": 0.8512731194496155, "learning_rate": 3.8479965156794427e-05, "loss": 0.6852, "step": 5790 }, { "epoch": 7.41687979539642, "grad_norm": 0.020886391401290894, "learning_rate": 3.84581881533101e-05, "loss": 0.5686, "step": 5800 }, { "epoch": 7.429667519181586, "grad_norm": 2.0098299980163574, "learning_rate": 3.843641114982578e-05, "loss": 0.4906, "step": 5810 }, { "epoch": 7.442455242966752, "grad_norm": 0.9866635799407959, "learning_rate": 3.8414634146341465e-05, "loss": 0.4961, "step": 5820 }, { "epoch": 7.455242966751918, "grad_norm": 0.8240039944648743, "learning_rate": 3.839285714285715e-05, "loss": 0.4916, "step": 5830 }, { "epoch": 7.468030690537084, "grad_norm": 4.04268217086792, "learning_rate": 3.837108013937282e-05, "loss": 0.7945, "step": 5840 }, { "epoch": 7.4808184143222505, "grad_norm": 5.761824131011963, "learning_rate": 3.8349303135888503e-05, "loss": 0.6028, "step": 5850 }, { "epoch": 7.493606138107417, "grad_norm": 14.809423446655273, "learning_rate": 3.8327526132404185e-05, "loss": 0.6021, "step": 5860 }, { "epoch": 7.506393861892583, "grad_norm": 4.42140007019043, "learning_rate": 3.830574912891986e-05, "loss": 0.5033, "step": 5870 }, { "epoch": 7.5191815856777495, "grad_norm": 5.502748966217041, "learning_rate": 3.828397212543554e-05, "loss": 0.2886, "step": 5880 }, { "epoch": 7.531969309462916, "grad_norm": 0.987762451171875, "learning_rate": 3.8262195121951224e-05, "loss": 0.4663, "step": 5890 }, { "epoch": 7.544757033248082, "grad_norm": 4.237513542175293, "learning_rate": 3.8240418118466905e-05, "loss": 0.5514, "step": 5900 }, { "epoch": 7.557544757033249, "grad_norm": 3.0318350791931152, "learning_rate": 3.821864111498258e-05, "loss": 0.5669, "step": 5910 }, { "epoch": 7.570332480818414, "grad_norm": 4.003817081451416, "learning_rate": 3.819686411149826e-05, "loss": 0.5131, "step": 5920 }, { "epoch": 7.58312020460358, "grad_norm": 9.188379287719727, "learning_rate": 3.8175087108013944e-05, "loss": 0.4796, "step": 5930 }, { "epoch": 7.595907928388747, "grad_norm": 0.7134082317352295, "learning_rate": 3.815331010452962e-05, "loss": 0.5296, "step": 5940 }, { "epoch": 7.608695652173913, "grad_norm": 12.00949764251709, "learning_rate": 3.8131533101045294e-05, "loss": 0.6011, "step": 5950 }, { "epoch": 7.621483375959079, "grad_norm": 4.412923336029053, "learning_rate": 3.8109756097560976e-05, "loss": 0.3967, "step": 5960 }, { "epoch": 7.634271099744246, "grad_norm": 0.03846118226647377, "learning_rate": 3.808797909407666e-05, "loss": 0.4239, "step": 5970 }, { "epoch": 7.647058823529412, "grad_norm": 6.899741172790527, "learning_rate": 3.806620209059233e-05, "loss": 0.5772, "step": 5980 }, { "epoch": 7.659846547314578, "grad_norm": 1.2849925756454468, "learning_rate": 3.8044425087108014e-05, "loss": 0.4367, "step": 5990 }, { "epoch": 7.672634271099744, "grad_norm": 0.005242053419351578, "learning_rate": 3.8022648083623696e-05, "loss": 0.1701, "step": 6000 }, { "epoch": 7.68542199488491, "grad_norm": 5.540919780731201, "learning_rate": 3.800087108013937e-05, "loss": 0.6158, "step": 6010 }, { "epoch": 7.698209718670077, "grad_norm": 0.8520113825798035, "learning_rate": 3.797909407665505e-05, "loss": 0.4181, "step": 6020 }, { "epoch": 7.710997442455243, "grad_norm": 3.9124886989593506, "learning_rate": 3.7957317073170734e-05, "loss": 0.1994, "step": 6030 }, { "epoch": 7.723785166240409, "grad_norm": 10.283561706542969, "learning_rate": 3.7935540069686416e-05, "loss": 0.4738, "step": 6040 }, { "epoch": 7.736572890025576, "grad_norm": 0.4710548222064972, "learning_rate": 3.791376306620209e-05, "loss": 0.3861, "step": 6050 }, { "epoch": 7.749360613810742, "grad_norm": 0.9403038024902344, "learning_rate": 3.789198606271777e-05, "loss": 0.2973, "step": 6060 }, { "epoch": 7.762148337595908, "grad_norm": 7.959447860717773, "learning_rate": 3.7870209059233454e-05, "loss": 0.3342, "step": 6070 }, { "epoch": 7.774936061381074, "grad_norm": 4.493699073791504, "learning_rate": 3.7848432055749136e-05, "loss": 0.645, "step": 6080 }, { "epoch": 7.78772378516624, "grad_norm": 0.9736508131027222, "learning_rate": 3.782665505226481e-05, "loss": 0.5379, "step": 6090 }, { "epoch": 7.8005115089514065, "grad_norm": 8.09215259552002, "learning_rate": 3.780487804878049e-05, "loss": 0.7049, "step": 6100 }, { "epoch": 7.813299232736573, "grad_norm": 4.407011032104492, "learning_rate": 3.778310104529617e-05, "loss": 0.4454, "step": 6110 }, { "epoch": 7.826086956521739, "grad_norm": 7.577841758728027, "learning_rate": 3.776132404181185e-05, "loss": 0.4755, "step": 6120 }, { "epoch": 7.838874680306906, "grad_norm": 5.885177135467529, "learning_rate": 3.7739547038327525e-05, "loss": 0.3473, "step": 6130 }, { "epoch": 7.851662404092072, "grad_norm": 3.9344217777252197, "learning_rate": 3.7717770034843206e-05, "loss": 0.2407, "step": 6140 }, { "epoch": 7.864450127877237, "grad_norm": 0.8838673830032349, "learning_rate": 3.769599303135889e-05, "loss": 0.1372, "step": 6150 }, { "epoch": 7.877237851662404, "grad_norm": 0.38971105217933655, "learning_rate": 3.767421602787456e-05, "loss": 0.3899, "step": 6160 }, { "epoch": 7.89002557544757, "grad_norm": 15.273054122924805, "learning_rate": 3.7652439024390245e-05, "loss": 0.5726, "step": 6170 }, { "epoch": 7.9028132992327365, "grad_norm": 0.9236469864845276, "learning_rate": 3.7630662020905927e-05, "loss": 0.3092, "step": 6180 }, { "epoch": 7.915601023017903, "grad_norm": 9.27222728729248, "learning_rate": 3.76088850174216e-05, "loss": 0.503, "step": 6190 }, { "epoch": 7.928388746803069, "grad_norm": 2.053985357284546, "learning_rate": 3.758710801393728e-05, "loss": 0.6894, "step": 6200 }, { "epoch": 7.9411764705882355, "grad_norm": 8.98082447052002, "learning_rate": 3.7565331010452965e-05, "loss": 0.7604, "step": 6210 }, { "epoch": 7.953964194373402, "grad_norm": 6.407993793487549, "learning_rate": 3.754355400696865e-05, "loss": 0.5022, "step": 6220 }, { "epoch": 7.966751918158568, "grad_norm": 3.8097591400146484, "learning_rate": 3.752177700348432e-05, "loss": 0.6324, "step": 6230 }, { "epoch": 7.979539641943734, "grad_norm": 3.067627429962158, "learning_rate": 3.7500000000000003e-05, "loss": 0.7819, "step": 6240 }, { "epoch": 7.9923273657289, "grad_norm": 3.3114781379699707, "learning_rate": 3.7478222996515685e-05, "loss": 0.3791, "step": 6250 }, { "epoch": 8.0, "eval_loss": 0.24141965806484222, "eval_runtime": 0.9886, "eval_samples_per_second": 99.127, "eval_steps_per_second": 13.15, "step": 6256 }, { "epoch": 8.005115089514067, "grad_norm": 10.103604316711426, "learning_rate": 3.745644599303136e-05, "loss": 0.2827, "step": 6260 }, { "epoch": 8.017902813299234, "grad_norm": 7.1643476486206055, "learning_rate": 3.743466898954704e-05, "loss": 0.6589, "step": 6270 }, { "epoch": 8.030690537084398, "grad_norm": 5.2952423095703125, "learning_rate": 3.741289198606272e-05, "loss": 0.3989, "step": 6280 }, { "epoch": 8.043478260869565, "grad_norm": 10.544265747070312, "learning_rate": 3.73911149825784e-05, "loss": 0.4586, "step": 6290 }, { "epoch": 8.05626598465473, "grad_norm": 7.280800819396973, "learning_rate": 3.7369337979094074e-05, "loss": 0.5087, "step": 6300 }, { "epoch": 8.069053708439897, "grad_norm": 4.117053031921387, "learning_rate": 3.7347560975609755e-05, "loss": 0.3276, "step": 6310 }, { "epoch": 8.081841432225064, "grad_norm": 8.452136993408203, "learning_rate": 3.732578397212544e-05, "loss": 0.2142, "step": 6320 }, { "epoch": 8.09462915601023, "grad_norm": 8.656983375549316, "learning_rate": 3.730400696864111e-05, "loss": 0.6328, "step": 6330 }, { "epoch": 8.107416879795396, "grad_norm": 15.760722160339355, "learning_rate": 3.7282229965156794e-05, "loss": 0.3794, "step": 6340 }, { "epoch": 8.120204603580563, "grad_norm": 0.8831185698509216, "learning_rate": 3.7260452961672476e-05, "loss": 0.2596, "step": 6350 }, { "epoch": 8.132992327365729, "grad_norm": 3.0778136253356934, "learning_rate": 3.723867595818816e-05, "loss": 0.2729, "step": 6360 }, { "epoch": 8.145780051150895, "grad_norm": 0.35585105419158936, "learning_rate": 3.721689895470383e-05, "loss": 0.2476, "step": 6370 }, { "epoch": 8.158567774936062, "grad_norm": 1.6337690353393555, "learning_rate": 3.7195121951219514e-05, "loss": 0.4862, "step": 6380 }, { "epoch": 8.171355498721228, "grad_norm": 2.012357711791992, "learning_rate": 3.7173344947735196e-05, "loss": 0.1733, "step": 6390 }, { "epoch": 8.184143222506394, "grad_norm": 1.2589486837387085, "learning_rate": 3.715156794425087e-05, "loss": 0.514, "step": 6400 }, { "epoch": 8.19693094629156, "grad_norm": 1.7067822217941284, "learning_rate": 3.712979094076655e-05, "loss": 0.4357, "step": 6410 }, { "epoch": 8.209718670076727, "grad_norm": 0.9603003263473511, "learning_rate": 3.7108013937282234e-05, "loss": 0.7706, "step": 6420 }, { "epoch": 8.222506393861893, "grad_norm": 0.010923515073955059, "learning_rate": 3.7086236933797916e-05, "loss": 0.4297, "step": 6430 }, { "epoch": 8.235294117647058, "grad_norm": 6.709939956665039, "learning_rate": 3.706445993031359e-05, "loss": 0.4281, "step": 6440 }, { "epoch": 8.248081841432224, "grad_norm": 0.016056306660175323, "learning_rate": 3.704268292682927e-05, "loss": 0.4342, "step": 6450 }, { "epoch": 8.26086956521739, "grad_norm": 7.876967430114746, "learning_rate": 3.702090592334495e-05, "loss": 0.439, "step": 6460 }, { "epoch": 8.273657289002557, "grad_norm": 0.3735317587852478, "learning_rate": 3.699912891986063e-05, "loss": 0.5231, "step": 6470 }, { "epoch": 8.286445012787723, "grad_norm": 4.260231018066406, "learning_rate": 3.6977351916376304e-05, "loss": 0.0909, "step": 6480 }, { "epoch": 8.29923273657289, "grad_norm": 0.04777367785573006, "learning_rate": 3.6955574912891986e-05, "loss": 0.2107, "step": 6490 }, { "epoch": 8.312020460358056, "grad_norm": 7.151200294494629, "learning_rate": 3.693379790940767e-05, "loss": 0.4714, "step": 6500 }, { "epoch": 8.324808184143222, "grad_norm": 10.462479591369629, "learning_rate": 3.691202090592334e-05, "loss": 0.6321, "step": 6510 }, { "epoch": 8.337595907928389, "grad_norm": 6.835601329803467, "learning_rate": 3.6890243902439025e-05, "loss": 0.4306, "step": 6520 }, { "epoch": 8.350383631713555, "grad_norm": 3.190450429916382, "learning_rate": 3.6868466898954706e-05, "loss": 0.3589, "step": 6530 }, { "epoch": 8.363171355498721, "grad_norm": 8.741924285888672, "learning_rate": 3.684668989547039e-05, "loss": 0.5098, "step": 6540 }, { "epoch": 8.375959079283888, "grad_norm": 1.7585572004318237, "learning_rate": 3.682491289198606e-05, "loss": 0.3946, "step": 6550 }, { "epoch": 8.388746803069054, "grad_norm": 1.7931690216064453, "learning_rate": 3.6803135888501745e-05, "loss": 0.3422, "step": 6560 }, { "epoch": 8.40153452685422, "grad_norm": 7.819911956787109, "learning_rate": 3.6781358885017427e-05, "loss": 0.4142, "step": 6570 }, { "epoch": 8.414322250639387, "grad_norm": 6.050047874450684, "learning_rate": 3.67595818815331e-05, "loss": 0.8177, "step": 6580 }, { "epoch": 8.427109974424553, "grad_norm": 12.437430381774902, "learning_rate": 3.673780487804878e-05, "loss": 0.3595, "step": 6590 }, { "epoch": 8.43989769820972, "grad_norm": 0.0015749474987387657, "learning_rate": 3.6716027874564465e-05, "loss": 0.4131, "step": 6600 }, { "epoch": 8.452685421994884, "grad_norm": 0.042624592781066895, "learning_rate": 3.669425087108015e-05, "loss": 0.5854, "step": 6610 }, { "epoch": 8.46547314578005, "grad_norm": 5.490470886230469, "learning_rate": 3.667247386759582e-05, "loss": 0.4621, "step": 6620 }, { "epoch": 8.478260869565217, "grad_norm": 5.620114326477051, "learning_rate": 3.66506968641115e-05, "loss": 0.6975, "step": 6630 }, { "epoch": 8.491048593350383, "grad_norm": 2.929370641708374, "learning_rate": 3.662891986062718e-05, "loss": 0.3325, "step": 6640 }, { "epoch": 8.50383631713555, "grad_norm": 0.27810731530189514, "learning_rate": 3.6607142857142853e-05, "loss": 0.4809, "step": 6650 }, { "epoch": 8.516624040920716, "grad_norm": 14.763570785522461, "learning_rate": 3.6585365853658535e-05, "loss": 0.5995, "step": 6660 }, { "epoch": 8.529411764705882, "grad_norm": 4.998637676239014, "learning_rate": 3.656358885017422e-05, "loss": 0.1925, "step": 6670 }, { "epoch": 8.542199488491049, "grad_norm": 11.907294273376465, "learning_rate": 3.65418118466899e-05, "loss": 0.5023, "step": 6680 }, { "epoch": 8.554987212276215, "grad_norm": 1.786744236946106, "learning_rate": 3.6520034843205574e-05, "loss": 0.4069, "step": 6690 }, { "epoch": 8.567774936061381, "grad_norm": 8.682933807373047, "learning_rate": 3.6498257839721255e-05, "loss": 0.5076, "step": 6700 }, { "epoch": 8.580562659846548, "grad_norm": 2.2297000885009766, "learning_rate": 3.647648083623694e-05, "loss": 0.4252, "step": 6710 }, { "epoch": 8.593350383631714, "grad_norm": 4.3645100593566895, "learning_rate": 3.645470383275261e-05, "loss": 0.3388, "step": 6720 }, { "epoch": 8.60613810741688, "grad_norm": 0.9659554958343506, "learning_rate": 3.6432926829268294e-05, "loss": 0.1987, "step": 6730 }, { "epoch": 8.618925831202047, "grad_norm": 4.049929141998291, "learning_rate": 3.6411149825783976e-05, "loss": 0.3661, "step": 6740 }, { "epoch": 8.631713554987213, "grad_norm": 1.467517614364624, "learning_rate": 3.638937282229966e-05, "loss": 0.3241, "step": 6750 }, { "epoch": 8.644501278772378, "grad_norm": 7.087103843688965, "learning_rate": 3.636759581881533e-05, "loss": 0.5732, "step": 6760 }, { "epoch": 8.657289002557544, "grad_norm": 2.423686981201172, "learning_rate": 3.6345818815331014e-05, "loss": 0.569, "step": 6770 }, { "epoch": 8.67007672634271, "grad_norm": 11.937353134155273, "learning_rate": 3.6324041811846696e-05, "loss": 0.5757, "step": 6780 }, { "epoch": 8.682864450127877, "grad_norm": 8.299654006958008, "learning_rate": 3.630226480836237e-05, "loss": 0.4712, "step": 6790 }, { "epoch": 8.695652173913043, "grad_norm": 0.01959991082549095, "learning_rate": 3.628048780487805e-05, "loss": 0.555, "step": 6800 }, { "epoch": 8.70843989769821, "grad_norm": 5.581100940704346, "learning_rate": 3.625871080139373e-05, "loss": 0.5037, "step": 6810 }, { "epoch": 8.721227621483376, "grad_norm": 2.0534441471099854, "learning_rate": 3.623693379790941e-05, "loss": 0.1514, "step": 6820 }, { "epoch": 8.734015345268542, "grad_norm": 5.216137409210205, "learning_rate": 3.6215156794425084e-05, "loss": 0.4657, "step": 6830 }, { "epoch": 8.746803069053708, "grad_norm": 5.7269487380981445, "learning_rate": 3.6193379790940766e-05, "loss": 0.4067, "step": 6840 }, { "epoch": 8.759590792838875, "grad_norm": 0.33077237010002136, "learning_rate": 3.617160278745645e-05, "loss": 0.6687, "step": 6850 }, { "epoch": 8.772378516624041, "grad_norm": 1.5920321941375732, "learning_rate": 3.614982578397213e-05, "loss": 0.3967, "step": 6860 }, { "epoch": 8.785166240409207, "grad_norm": 1.9924708604812622, "learning_rate": 3.6128048780487804e-05, "loss": 0.4591, "step": 6870 }, { "epoch": 8.797953964194374, "grad_norm": 2.579571008682251, "learning_rate": 3.6106271777003486e-05, "loss": 0.7944, "step": 6880 }, { "epoch": 8.81074168797954, "grad_norm": 4.239784240722656, "learning_rate": 3.608449477351917e-05, "loss": 0.3443, "step": 6890 }, { "epoch": 8.823529411764707, "grad_norm": 6.115857124328613, "learning_rate": 3.606271777003484e-05, "loss": 0.5023, "step": 6900 }, { "epoch": 8.836317135549873, "grad_norm": 0.575232744216919, "learning_rate": 3.6040940766550525e-05, "loss": 0.4865, "step": 6910 }, { "epoch": 8.84910485933504, "grad_norm": 1.7253854274749756, "learning_rate": 3.6019163763066206e-05, "loss": 0.3874, "step": 6920 }, { "epoch": 8.861892583120204, "grad_norm": 4.649746417999268, "learning_rate": 3.599738675958189e-05, "loss": 0.2017, "step": 6930 }, { "epoch": 8.87468030690537, "grad_norm": 3.3735687732696533, "learning_rate": 3.597560975609756e-05, "loss": 0.4938, "step": 6940 }, { "epoch": 8.887468030690536, "grad_norm": 0.6998658180236816, "learning_rate": 3.5953832752613245e-05, "loss": 0.2944, "step": 6950 }, { "epoch": 8.900255754475703, "grad_norm": 2.798309326171875, "learning_rate": 3.5932055749128927e-05, "loss": 0.4214, "step": 6960 }, { "epoch": 8.91304347826087, "grad_norm": 3.0269863605499268, "learning_rate": 3.59102787456446e-05, "loss": 0.4129, "step": 6970 }, { "epoch": 8.925831202046036, "grad_norm": 6.951076030731201, "learning_rate": 3.5888501742160277e-05, "loss": 0.356, "step": 6980 }, { "epoch": 8.938618925831202, "grad_norm": 0.4412369728088379, "learning_rate": 3.586672473867596e-05, "loss": 0.165, "step": 6990 }, { "epoch": 8.951406649616368, "grad_norm": 10.887850761413574, "learning_rate": 3.584494773519164e-05, "loss": 0.5921, "step": 7000 }, { "epoch": 8.964194373401535, "grad_norm": 0.00953485444188118, "learning_rate": 3.5823170731707315e-05, "loss": 0.4405, "step": 7010 }, { "epoch": 8.976982097186701, "grad_norm": 0.8474636077880859, "learning_rate": 3.5801393728223e-05, "loss": 0.3183, "step": 7020 }, { "epoch": 8.989769820971867, "grad_norm": 0.3193589746952057, "learning_rate": 3.577961672473868e-05, "loss": 0.6581, "step": 7030 }, { "epoch": 9.0, "eval_loss": 0.2372424602508545, "eval_runtime": 0.9906, "eval_samples_per_second": 98.934, "eval_steps_per_second": 13.124, "step": 7038 }, { "epoch": 9.002557544757034, "grad_norm": 1.6099143028259277, "learning_rate": 3.5757839721254353e-05, "loss": 0.4287, "step": 7040 }, { "epoch": 9.0153452685422, "grad_norm": 0.0028387894853949547, "learning_rate": 3.5736062717770035e-05, "loss": 0.3471, "step": 7050 }, { "epoch": 9.028132992327366, "grad_norm": 0.9083518385887146, "learning_rate": 3.571428571428572e-05, "loss": 0.4602, "step": 7060 }, { "epoch": 9.040920716112533, "grad_norm": 10.855413436889648, "learning_rate": 3.56925087108014e-05, "loss": 0.3018, "step": 7070 }, { "epoch": 9.053708439897699, "grad_norm": 5.899298667907715, "learning_rate": 3.5670731707317074e-05, "loss": 0.6242, "step": 7080 }, { "epoch": 9.066496163682864, "grad_norm": 0.3065991699695587, "learning_rate": 3.5648954703832755e-05, "loss": 0.4585, "step": 7090 }, { "epoch": 9.07928388746803, "grad_norm": 1.1200615167617798, "learning_rate": 3.562717770034844e-05, "loss": 0.384, "step": 7100 }, { "epoch": 9.092071611253196, "grad_norm": 0.787402331829071, "learning_rate": 3.560540069686411e-05, "loss": 0.6417, "step": 7110 }, { "epoch": 9.104859335038363, "grad_norm": 5.251519680023193, "learning_rate": 3.5583623693379794e-05, "loss": 0.3815, "step": 7120 }, { "epoch": 9.117647058823529, "grad_norm": 6.498902797698975, "learning_rate": 3.5561846689895476e-05, "loss": 0.348, "step": 7130 }, { "epoch": 9.130434782608695, "grad_norm": 7.617058753967285, "learning_rate": 3.554006968641115e-05, "loss": 0.5356, "step": 7140 }, { "epoch": 9.143222506393862, "grad_norm": 3.143923759460449, "learning_rate": 3.551829268292683e-05, "loss": 0.3764, "step": 7150 }, { "epoch": 9.156010230179028, "grad_norm": 0.016743259504437447, "learning_rate": 3.549651567944251e-05, "loss": 0.3662, "step": 7160 }, { "epoch": 9.168797953964194, "grad_norm": 6.293381214141846, "learning_rate": 3.547473867595819e-05, "loss": 0.5453, "step": 7170 }, { "epoch": 9.18158567774936, "grad_norm": 2.0475428104400635, "learning_rate": 3.5452961672473864e-05, "loss": 0.3636, "step": 7180 }, { "epoch": 9.194373401534527, "grad_norm": 0.79485684633255, "learning_rate": 3.5431184668989546e-05, "loss": 0.3049, "step": 7190 }, { "epoch": 9.207161125319693, "grad_norm": 1.2061353921890259, "learning_rate": 3.540940766550523e-05, "loss": 0.3841, "step": 7200 }, { "epoch": 9.21994884910486, "grad_norm": 0.933066189289093, "learning_rate": 3.538763066202091e-05, "loss": 0.3891, "step": 7210 }, { "epoch": 9.232736572890026, "grad_norm": 3.883514642715454, "learning_rate": 3.5365853658536584e-05, "loss": 0.6795, "step": 7220 }, { "epoch": 9.245524296675192, "grad_norm": 8.447504997253418, "learning_rate": 3.5344076655052266e-05, "loss": 0.5821, "step": 7230 }, { "epoch": 9.258312020460359, "grad_norm": 2.884052276611328, "learning_rate": 3.532229965156795e-05, "loss": 0.2316, "step": 7240 }, { "epoch": 9.271099744245525, "grad_norm": 9.32747745513916, "learning_rate": 3.530052264808362e-05, "loss": 0.5633, "step": 7250 }, { "epoch": 9.28388746803069, "grad_norm": 0.020528344437479973, "learning_rate": 3.5278745644599304e-05, "loss": 0.2843, "step": 7260 }, { "epoch": 9.296675191815856, "grad_norm": 2.6097218990325928, "learning_rate": 3.5256968641114986e-05, "loss": 0.3732, "step": 7270 }, { "epoch": 9.309462915601022, "grad_norm": 0.0003839631099253893, "learning_rate": 3.523519163763067e-05, "loss": 0.4346, "step": 7280 }, { "epoch": 9.322250639386189, "grad_norm": 2.2322895526885986, "learning_rate": 3.521341463414634e-05, "loss": 0.2317, "step": 7290 }, { "epoch": 9.335038363171355, "grad_norm": 3.4575886726379395, "learning_rate": 3.5191637630662025e-05, "loss": 0.3988, "step": 7300 }, { "epoch": 9.347826086956522, "grad_norm": 1.4904476404190063, "learning_rate": 3.5169860627177706e-05, "loss": 0.2786, "step": 7310 }, { "epoch": 9.360613810741688, "grad_norm": 0.7082493305206299, "learning_rate": 3.514808362369338e-05, "loss": 0.2075, "step": 7320 }, { "epoch": 9.373401534526854, "grad_norm": 0.00038726787897758186, "learning_rate": 3.5126306620209056e-05, "loss": 0.222, "step": 7330 }, { "epoch": 9.38618925831202, "grad_norm": 11.110411643981934, "learning_rate": 3.510452961672474e-05, "loss": 0.3972, "step": 7340 }, { "epoch": 9.398976982097187, "grad_norm": 5.460088729858398, "learning_rate": 3.508275261324042e-05, "loss": 0.3556, "step": 7350 }, { "epoch": 9.411764705882353, "grad_norm": 10.77302074432373, "learning_rate": 3.5060975609756095e-05, "loss": 0.4483, "step": 7360 }, { "epoch": 9.42455242966752, "grad_norm": 1.2308753728866577, "learning_rate": 3.5039198606271777e-05, "loss": 0.2582, "step": 7370 }, { "epoch": 9.437340153452686, "grad_norm": 0.862062394618988, "learning_rate": 3.501742160278746e-05, "loss": 0.4087, "step": 7380 }, { "epoch": 9.450127877237852, "grad_norm": 1.5446736812591553, "learning_rate": 3.499564459930314e-05, "loss": 0.332, "step": 7390 }, { "epoch": 9.462915601023019, "grad_norm": 1.6241346597671509, "learning_rate": 3.4973867595818815e-05, "loss": 0.474, "step": 7400 }, { "epoch": 9.475703324808185, "grad_norm": 3.712130546569824, "learning_rate": 3.49520905923345e-05, "loss": 0.3702, "step": 7410 }, { "epoch": 9.48849104859335, "grad_norm": 1.5162075757980347, "learning_rate": 3.493031358885018e-05, "loss": 0.1353, "step": 7420 }, { "epoch": 9.501278772378516, "grad_norm": 2.307133197784424, "learning_rate": 3.4908536585365853e-05, "loss": 0.2684, "step": 7430 }, { "epoch": 9.514066496163682, "grad_norm": 2.630873680114746, "learning_rate": 3.4886759581881535e-05, "loss": 0.4246, "step": 7440 }, { "epoch": 9.526854219948849, "grad_norm": 9.756599426269531, "learning_rate": 3.486498257839722e-05, "loss": 0.5116, "step": 7450 }, { "epoch": 9.539641943734015, "grad_norm": 1.998191237449646, "learning_rate": 3.48432055749129e-05, "loss": 0.3179, "step": 7460 }, { "epoch": 9.552429667519181, "grad_norm": 4.331905364990234, "learning_rate": 3.4821428571428574e-05, "loss": 0.4869, "step": 7470 }, { "epoch": 9.565217391304348, "grad_norm": 1.298435926437378, "learning_rate": 3.4799651567944255e-05, "loss": 0.4402, "step": 7480 }, { "epoch": 9.578005115089514, "grad_norm": 0.7783128619194031, "learning_rate": 3.477787456445993e-05, "loss": 0.3408, "step": 7490 }, { "epoch": 9.59079283887468, "grad_norm": 1.8130773305892944, "learning_rate": 3.475609756097561e-05, "loss": 0.6058, "step": 7500 }, { "epoch": 9.603580562659847, "grad_norm": 0.29691851139068604, "learning_rate": 3.473432055749129e-05, "loss": 0.3485, "step": 7510 }, { "epoch": 9.616368286445013, "grad_norm": 6.944264888763428, "learning_rate": 3.471254355400697e-05, "loss": 0.4315, "step": 7520 }, { "epoch": 9.62915601023018, "grad_norm": 0.29563087224960327, "learning_rate": 3.469076655052265e-05, "loss": 0.4645, "step": 7530 }, { "epoch": 9.641943734015346, "grad_norm": 6.2144951820373535, "learning_rate": 3.4668989547038326e-05, "loss": 0.4299, "step": 7540 }, { "epoch": 9.654731457800512, "grad_norm": 0.41491979360580444, "learning_rate": 3.464721254355401e-05, "loss": 0.4714, "step": 7550 }, { "epoch": 9.667519181585678, "grad_norm": 0.21521031856536865, "learning_rate": 3.462543554006969e-05, "loss": 0.4527, "step": 7560 }, { "epoch": 9.680306905370845, "grad_norm": 10.179862976074219, "learning_rate": 3.4603658536585364e-05, "loss": 0.4961, "step": 7570 }, { "epoch": 9.693094629156011, "grad_norm": 1.0263521671295166, "learning_rate": 3.4581881533101046e-05, "loss": 0.4905, "step": 7580 }, { "epoch": 9.705882352941176, "grad_norm": 5.564786911010742, "learning_rate": 3.456010452961673e-05, "loss": 0.5835, "step": 7590 }, { "epoch": 9.718670076726342, "grad_norm": 12.091937065124512, "learning_rate": 3.453832752613241e-05, "loss": 0.7117, "step": 7600 }, { "epoch": 9.731457800511508, "grad_norm": 10.530780792236328, "learning_rate": 3.4516550522648084e-05, "loss": 0.3978, "step": 7610 }, { "epoch": 9.744245524296675, "grad_norm": 1.0764374732971191, "learning_rate": 3.4494773519163766e-05, "loss": 0.2973, "step": 7620 }, { "epoch": 9.757033248081841, "grad_norm": 5.413555145263672, "learning_rate": 3.447299651567945e-05, "loss": 0.5868, "step": 7630 }, { "epoch": 9.769820971867007, "grad_norm": 0.3791005611419678, "learning_rate": 3.445121951219512e-05, "loss": 0.3802, "step": 7640 }, { "epoch": 9.782608695652174, "grad_norm": 2.3650548458099365, "learning_rate": 3.4429442508710804e-05, "loss": 0.4553, "step": 7650 }, { "epoch": 9.79539641943734, "grad_norm": 0.16670426726341248, "learning_rate": 3.4407665505226486e-05, "loss": 0.227, "step": 7660 }, { "epoch": 9.808184143222507, "grad_norm": 0.002888306276872754, "learning_rate": 3.438588850174216e-05, "loss": 0.7055, "step": 7670 }, { "epoch": 9.820971867007673, "grad_norm": 5.335592269897461, "learning_rate": 3.4364111498257836e-05, "loss": 0.3602, "step": 7680 }, { "epoch": 9.83375959079284, "grad_norm": 0.004552973434329033, "learning_rate": 3.434233449477352e-05, "loss": 0.3308, "step": 7690 }, { "epoch": 9.846547314578006, "grad_norm": 3.599313974380493, "learning_rate": 3.43205574912892e-05, "loss": 0.3185, "step": 7700 }, { "epoch": 9.859335038363172, "grad_norm": 9.506612777709961, "learning_rate": 3.429878048780488e-05, "loss": 0.4221, "step": 7710 }, { "epoch": 9.872122762148338, "grad_norm": 0.04120413213968277, "learning_rate": 3.4277003484320556e-05, "loss": 0.255, "step": 7720 }, { "epoch": 9.884910485933505, "grad_norm": 6.27550745010376, "learning_rate": 3.425522648083624e-05, "loss": 0.3397, "step": 7730 }, { "epoch": 9.89769820971867, "grad_norm": 6.480957508087158, "learning_rate": 3.423344947735192e-05, "loss": 0.5441, "step": 7740 }, { "epoch": 9.910485933503836, "grad_norm": 0.825150728225708, "learning_rate": 3.4211672473867595e-05, "loss": 0.3759, "step": 7750 }, { "epoch": 9.923273657289002, "grad_norm": 0.0003031272499356419, "learning_rate": 3.4189895470383277e-05, "loss": 0.564, "step": 7760 }, { "epoch": 9.936061381074168, "grad_norm": 2.838111639022827, "learning_rate": 3.416811846689896e-05, "loss": 0.453, "step": 7770 }, { "epoch": 9.948849104859335, "grad_norm": 1.2337100505828857, "learning_rate": 3.414634146341464e-05, "loss": 0.3176, "step": 7780 }, { "epoch": 9.961636828644501, "grad_norm": 3.4663679599761963, "learning_rate": 3.4124564459930315e-05, "loss": 0.4674, "step": 7790 }, { "epoch": 9.974424552429667, "grad_norm": 7.805672645568848, "learning_rate": 3.4102787456446e-05, "loss": 0.5197, "step": 7800 }, { "epoch": 9.987212276214834, "grad_norm": 6.521031379699707, "learning_rate": 3.408101045296168e-05, "loss": 0.4132, "step": 7810 }, { "epoch": 10.0, "grad_norm": 8.708837509155273, "learning_rate": 3.4059233449477354e-05, "loss": 0.4148, "step": 7820 }, { "epoch": 10.0, "eval_loss": 0.2310720980167389, "eval_runtime": 0.8084, "eval_samples_per_second": 121.233, "eval_steps_per_second": 16.082, "step": 7820 }, { "epoch": 10.012787723785166, "grad_norm": 7.328716278076172, "learning_rate": 3.4037456445993035e-05, "loss": 0.4179, "step": 7830 }, { "epoch": 10.025575447570333, "grad_norm": 0.4229029417037964, "learning_rate": 3.401567944250871e-05, "loss": 0.5226, "step": 7840 }, { "epoch": 10.038363171355499, "grad_norm": 0.0002528049808461219, "learning_rate": 3.399390243902439e-05, "loss": 0.5042, "step": 7850 }, { "epoch": 10.051150895140665, "grad_norm": 5.877325057983398, "learning_rate": 3.397212543554007e-05, "loss": 0.4143, "step": 7860 }, { "epoch": 10.063938618925832, "grad_norm": 3.365591049194336, "learning_rate": 3.395034843205575e-05, "loss": 0.2714, "step": 7870 }, { "epoch": 10.076726342710998, "grad_norm": 8.59089469909668, "learning_rate": 3.392857142857143e-05, "loss": 0.3792, "step": 7880 }, { "epoch": 10.089514066496164, "grad_norm": 0.040207698941230774, "learning_rate": 3.3906794425087105e-05, "loss": 0.2498, "step": 7890 }, { "epoch": 10.10230179028133, "grad_norm": 5.780106544494629, "learning_rate": 3.388501742160279e-05, "loss": 0.2073, "step": 7900 }, { "epoch": 10.115089514066495, "grad_norm": 0.04502753168344498, "learning_rate": 3.386324041811847e-05, "loss": 0.5198, "step": 7910 }, { "epoch": 10.127877237851662, "grad_norm": 0.007558781187981367, "learning_rate": 3.384146341463415e-05, "loss": 0.209, "step": 7920 }, { "epoch": 10.140664961636828, "grad_norm": 4.91458797454834, "learning_rate": 3.3819686411149826e-05, "loss": 0.4293, "step": 7930 }, { "epoch": 10.153452685421994, "grad_norm": 4.223980903625488, "learning_rate": 3.379790940766551e-05, "loss": 0.2377, "step": 7940 }, { "epoch": 10.16624040920716, "grad_norm": 0.48757821321487427, "learning_rate": 3.377613240418119e-05, "loss": 0.1887, "step": 7950 }, { "epoch": 10.179028132992327, "grad_norm": 2.029522180557251, "learning_rate": 3.3754355400696864e-05, "loss": 0.2748, "step": 7960 }, { "epoch": 10.191815856777493, "grad_norm": 0.03896741569042206, "learning_rate": 3.3732578397212546e-05, "loss": 0.1531, "step": 7970 }, { "epoch": 10.20460358056266, "grad_norm": 4.019787311553955, "learning_rate": 3.371080139372823e-05, "loss": 0.413, "step": 7980 }, { "epoch": 10.217391304347826, "grad_norm": 3.299919366836548, "learning_rate": 3.368902439024391e-05, "loss": 0.4001, "step": 7990 }, { "epoch": 10.230179028132993, "grad_norm": 1.2256722450256348, "learning_rate": 3.3667247386759584e-05, "loss": 0.3925, "step": 8000 }, { "epoch": 10.242966751918159, "grad_norm": 1.500207781791687, "learning_rate": 3.3645470383275266e-05, "loss": 0.5539, "step": 8010 }, { "epoch": 10.255754475703325, "grad_norm": 3.64996600151062, "learning_rate": 3.362369337979094e-05, "loss": 0.2435, "step": 8020 }, { "epoch": 10.268542199488492, "grad_norm": 1.5223026275634766, "learning_rate": 3.3601916376306616e-05, "loss": 0.4816, "step": 8030 }, { "epoch": 10.281329923273658, "grad_norm": 11.364663124084473, "learning_rate": 3.35801393728223e-05, "loss": 0.2599, "step": 8040 }, { "epoch": 10.294117647058824, "grad_norm": 6.048238277435303, "learning_rate": 3.355836236933798e-05, "loss": 0.3296, "step": 8050 }, { "epoch": 10.30690537084399, "grad_norm": 1.025272011756897, "learning_rate": 3.353658536585366e-05, "loss": 0.5319, "step": 8060 }, { "epoch": 10.319693094629155, "grad_norm": 5.667033672332764, "learning_rate": 3.3514808362369336e-05, "loss": 0.6212, "step": 8070 }, { "epoch": 10.332480818414322, "grad_norm": 8.041871070861816, "learning_rate": 3.349303135888502e-05, "loss": 0.6741, "step": 8080 }, { "epoch": 10.345268542199488, "grad_norm": 1.6329516172409058, "learning_rate": 3.34712543554007e-05, "loss": 0.5887, "step": 8090 }, { "epoch": 10.358056265984654, "grad_norm": 1.5292503833770752, "learning_rate": 3.344947735191638e-05, "loss": 0.1805, "step": 8100 }, { "epoch": 10.37084398976982, "grad_norm": 3.6555302143096924, "learning_rate": 3.3427700348432056e-05, "loss": 0.4751, "step": 8110 }, { "epoch": 10.383631713554987, "grad_norm": 0.032321881502866745, "learning_rate": 3.340592334494774e-05, "loss": 0.3751, "step": 8120 }, { "epoch": 10.396419437340153, "grad_norm": 0.11594846844673157, "learning_rate": 3.338414634146342e-05, "loss": 0.3831, "step": 8130 }, { "epoch": 10.40920716112532, "grad_norm": 0.2702908217906952, "learning_rate": 3.3362369337979095e-05, "loss": 0.2761, "step": 8140 }, { "epoch": 10.421994884910486, "grad_norm": 5.8537726402282715, "learning_rate": 3.3340592334494777e-05, "loss": 0.7554, "step": 8150 }, { "epoch": 10.434782608695652, "grad_norm": 1.9350101947784424, "learning_rate": 3.331881533101046e-05, "loss": 0.2913, "step": 8160 }, { "epoch": 10.447570332480819, "grad_norm": 0.1800796538591385, "learning_rate": 3.329703832752613e-05, "loss": 0.1576, "step": 8170 }, { "epoch": 10.460358056265985, "grad_norm": 2.1546852588653564, "learning_rate": 3.3275261324041815e-05, "loss": 0.2741, "step": 8180 }, { "epoch": 10.473145780051151, "grad_norm": 11.017895698547363, "learning_rate": 3.325348432055749e-05, "loss": 0.2629, "step": 8190 }, { "epoch": 10.485933503836318, "grad_norm": 3.735138416290283, "learning_rate": 3.323170731707317e-05, "loss": 0.4408, "step": 8200 }, { "epoch": 10.498721227621484, "grad_norm": 4.445374965667725, "learning_rate": 3.320993031358885e-05, "loss": 0.5969, "step": 8210 }, { "epoch": 10.51150895140665, "grad_norm": 1.9418877363204956, "learning_rate": 3.318815331010453e-05, "loss": 0.2827, "step": 8220 }, { "epoch": 10.524296675191817, "grad_norm": 6.815186977386475, "learning_rate": 3.316637630662021e-05, "loss": 0.3757, "step": 8230 }, { "epoch": 10.537084398976981, "grad_norm": 6.0457353591918945, "learning_rate": 3.314459930313589e-05, "loss": 0.3467, "step": 8240 }, { "epoch": 10.549872122762148, "grad_norm": 1.9271039962768555, "learning_rate": 3.312282229965157e-05, "loss": 0.3033, "step": 8250 }, { "epoch": 10.562659846547314, "grad_norm": 1.7230960130691528, "learning_rate": 3.310104529616725e-05, "loss": 0.3376, "step": 8260 }, { "epoch": 10.57544757033248, "grad_norm": 0.8747263550758362, "learning_rate": 3.307926829268293e-05, "loss": 0.2699, "step": 8270 }, { "epoch": 10.588235294117647, "grad_norm": 4.551005840301514, "learning_rate": 3.3057491289198605e-05, "loss": 0.4694, "step": 8280 }, { "epoch": 10.601023017902813, "grad_norm": 1.4506466388702393, "learning_rate": 3.303571428571429e-05, "loss": 0.4233, "step": 8290 }, { "epoch": 10.61381074168798, "grad_norm": 2.718492269515991, "learning_rate": 3.301393728222997e-05, "loss": 0.312, "step": 8300 }, { "epoch": 10.626598465473146, "grad_norm": 4.134568214416504, "learning_rate": 3.299216027874565e-05, "loss": 0.5228, "step": 8310 }, { "epoch": 10.639386189258312, "grad_norm": 1.0367114543914795, "learning_rate": 3.2970383275261326e-05, "loss": 0.2426, "step": 8320 }, { "epoch": 10.652173913043478, "grad_norm": 0.3512336313724518, "learning_rate": 3.294860627177701e-05, "loss": 0.5909, "step": 8330 }, { "epoch": 10.664961636828645, "grad_norm": 5.459338665008545, "learning_rate": 3.292682926829269e-05, "loss": 0.337, "step": 8340 }, { "epoch": 10.677749360613811, "grad_norm": 7.244568824768066, "learning_rate": 3.2905052264808364e-05, "loss": 0.5357, "step": 8350 }, { "epoch": 10.690537084398978, "grad_norm": 3.035902976989746, "learning_rate": 3.288327526132404e-05, "loss": 0.3594, "step": 8360 }, { "epoch": 10.703324808184144, "grad_norm": 11.743247985839844, "learning_rate": 3.286149825783972e-05, "loss": 0.2371, "step": 8370 }, { "epoch": 10.71611253196931, "grad_norm": 0.0029970197938382626, "learning_rate": 3.28397212543554e-05, "loss": 0.3303, "step": 8380 }, { "epoch": 10.728900255754475, "grad_norm": 0.00909477099776268, "learning_rate": 3.281794425087108e-05, "loss": 0.3742, "step": 8390 }, { "epoch": 10.741687979539641, "grad_norm": 0.9811226725578308, "learning_rate": 3.279616724738676e-05, "loss": 0.2562, "step": 8400 }, { "epoch": 10.754475703324808, "grad_norm": 5.737852096557617, "learning_rate": 3.277439024390244e-05, "loss": 0.7256, "step": 8410 }, { "epoch": 10.767263427109974, "grad_norm": 0.34903228282928467, "learning_rate": 3.2752613240418116e-05, "loss": 0.4239, "step": 8420 }, { "epoch": 10.78005115089514, "grad_norm": 8.985549926757812, "learning_rate": 3.27308362369338e-05, "loss": 0.6547, "step": 8430 }, { "epoch": 10.792838874680307, "grad_norm": 6.798750877380371, "learning_rate": 3.270905923344948e-05, "loss": 0.499, "step": 8440 }, { "epoch": 10.805626598465473, "grad_norm": 4.940242767333984, "learning_rate": 3.268728222996516e-05, "loss": 0.4255, "step": 8450 }, { "epoch": 10.81841432225064, "grad_norm": 1.2830392122268677, "learning_rate": 3.2665505226480836e-05, "loss": 0.7948, "step": 8460 }, { "epoch": 10.831202046035806, "grad_norm": 0.4314042031764984, "learning_rate": 3.264372822299652e-05, "loss": 0.0659, "step": 8470 }, { "epoch": 10.843989769820972, "grad_norm": 0.1115281879901886, "learning_rate": 3.26219512195122e-05, "loss": 0.405, "step": 8480 }, { "epoch": 10.856777493606138, "grad_norm": 0.07167865335941315, "learning_rate": 3.2600174216027875e-05, "loss": 0.2189, "step": 8490 }, { "epoch": 10.869565217391305, "grad_norm": 4.274066925048828, "learning_rate": 3.2578397212543556e-05, "loss": 0.6027, "step": 8500 }, { "epoch": 10.882352941176471, "grad_norm": 3.2567176818847656, "learning_rate": 3.255662020905924e-05, "loss": 0.2483, "step": 8510 }, { "epoch": 10.895140664961637, "grad_norm": 2.4175426959991455, "learning_rate": 3.253484320557491e-05, "loss": 0.3907, "step": 8520 }, { "epoch": 10.907928388746804, "grad_norm": 0.1821274608373642, "learning_rate": 3.2513066202090595e-05, "loss": 0.1762, "step": 8530 }, { "epoch": 10.92071611253197, "grad_norm": 0.4083157479763031, "learning_rate": 3.249128919860627e-05, "loss": 0.5483, "step": 8540 }, { "epoch": 10.933503836317136, "grad_norm": 0.8208944201469421, "learning_rate": 3.246951219512195e-05, "loss": 0.3091, "step": 8550 }, { "epoch": 10.946291560102301, "grad_norm": 1.555967926979065, "learning_rate": 3.244773519163763e-05, "loss": 0.4429, "step": 8560 }, { "epoch": 10.959079283887467, "grad_norm": 1.8339693546295166, "learning_rate": 3.242595818815331e-05, "loss": 0.3064, "step": 8570 }, { "epoch": 10.971867007672634, "grad_norm": 5.510306358337402, "learning_rate": 3.240418118466899e-05, "loss": 0.1709, "step": 8580 }, { "epoch": 10.9846547314578, "grad_norm": 0.10631673038005829, "learning_rate": 3.238240418118467e-05, "loss": 0.3792, "step": 8590 }, { "epoch": 10.997442455242966, "grad_norm": 6.853298664093018, "learning_rate": 3.236062717770035e-05, "loss": 0.565, "step": 8600 }, { "epoch": 11.0, "eval_loss": 0.21491144597530365, "eval_runtime": 0.9865, "eval_samples_per_second": 99.337, "eval_steps_per_second": 13.177, "step": 8602 }, { "epoch": 11.010230179028133, "grad_norm": 6.32133150100708, "learning_rate": 3.233885017421603e-05, "loss": 0.4006, "step": 8610 }, { "epoch": 11.023017902813299, "grad_norm": 0.444341242313385, "learning_rate": 3.231707317073171e-05, "loss": 0.3297, "step": 8620 }, { "epoch": 11.035805626598465, "grad_norm": 8.10943603515625, "learning_rate": 3.229529616724739e-05, "loss": 0.4144, "step": 8630 }, { "epoch": 11.048593350383632, "grad_norm": 0.13110394775867462, "learning_rate": 3.227351916376307e-05, "loss": 0.2261, "step": 8640 }, { "epoch": 11.061381074168798, "grad_norm": 4.035839080810547, "learning_rate": 3.225174216027875e-05, "loss": 0.3694, "step": 8650 }, { "epoch": 11.074168797953964, "grad_norm": 0.32012709975242615, "learning_rate": 3.222996515679443e-05, "loss": 0.4123, "step": 8660 }, { "epoch": 11.08695652173913, "grad_norm": 1.2858375310897827, "learning_rate": 3.2208188153310105e-05, "loss": 0.3132, "step": 8670 }, { "epoch": 11.099744245524297, "grad_norm": 5.57534122467041, "learning_rate": 3.218641114982579e-05, "loss": 0.3462, "step": 8680 }, { "epoch": 11.112531969309464, "grad_norm": 9.02668571472168, "learning_rate": 3.216463414634147e-05, "loss": 0.3644, "step": 8690 }, { "epoch": 11.12531969309463, "grad_norm": 0.16235436499118805, "learning_rate": 3.2142857142857144e-05, "loss": 0.563, "step": 8700 }, { "epoch": 11.138107416879796, "grad_norm": 0.20466744899749756, "learning_rate": 3.212108013937282e-05, "loss": 0.4855, "step": 8710 }, { "epoch": 11.15089514066496, "grad_norm": 7.841794967651367, "learning_rate": 3.20993031358885e-05, "loss": 0.4489, "step": 8720 }, { "epoch": 11.163682864450127, "grad_norm": 0.008980615995824337, "learning_rate": 3.207752613240418e-05, "loss": 0.3862, "step": 8730 }, { "epoch": 11.176470588235293, "grad_norm": 8.154345512390137, "learning_rate": 3.205574912891986e-05, "loss": 0.4611, "step": 8740 }, { "epoch": 11.18925831202046, "grad_norm": 8.8815336227417, "learning_rate": 3.203397212543554e-05, "loss": 0.4231, "step": 8750 }, { "epoch": 11.202046035805626, "grad_norm": 3.500678777694702, "learning_rate": 3.201219512195122e-05, "loss": 0.3599, "step": 8760 }, { "epoch": 11.214833759590793, "grad_norm": 6.8923845291137695, "learning_rate": 3.19904181184669e-05, "loss": 0.5404, "step": 8770 }, { "epoch": 11.227621483375959, "grad_norm": 0.22492839395999908, "learning_rate": 3.196864111498258e-05, "loss": 0.3662, "step": 8780 }, { "epoch": 11.240409207161125, "grad_norm": 1.0206634998321533, "learning_rate": 3.194686411149826e-05, "loss": 0.3528, "step": 8790 }, { "epoch": 11.253196930946292, "grad_norm": 8.273615837097168, "learning_rate": 3.192508710801394e-05, "loss": 0.507, "step": 8800 }, { "epoch": 11.265984654731458, "grad_norm": 0.680773138999939, "learning_rate": 3.1903310104529616e-05, "loss": 0.3614, "step": 8810 }, { "epoch": 11.278772378516624, "grad_norm": 8.943563461303711, "learning_rate": 3.18815331010453e-05, "loss": 0.502, "step": 8820 }, { "epoch": 11.29156010230179, "grad_norm": 10.697273254394531, "learning_rate": 3.185975609756098e-05, "loss": 0.5452, "step": 8830 }, { "epoch": 11.304347826086957, "grad_norm": 2.5585360527038574, "learning_rate": 3.183797909407666e-05, "loss": 0.2662, "step": 8840 }, { "epoch": 11.317135549872123, "grad_norm": 5.41624641418457, "learning_rate": 3.1816202090592336e-05, "loss": 0.3855, "step": 8850 }, { "epoch": 11.32992327365729, "grad_norm": 2.3647260665893555, "learning_rate": 3.179442508710802e-05, "loss": 0.6495, "step": 8860 }, { "epoch": 11.342710997442456, "grad_norm": 0.32295748591423035, "learning_rate": 3.177264808362369e-05, "loss": 0.0998, "step": 8870 }, { "epoch": 11.355498721227622, "grad_norm": 5.3191680908203125, "learning_rate": 3.1750871080139375e-05, "loss": 0.2025, "step": 8880 }, { "epoch": 11.368286445012787, "grad_norm": 4.418299674987793, "learning_rate": 3.172909407665505e-05, "loss": 0.2674, "step": 8890 }, { "epoch": 11.381074168797953, "grad_norm": 6.424844264984131, "learning_rate": 3.170731707317073e-05, "loss": 0.2655, "step": 8900 }, { "epoch": 11.39386189258312, "grad_norm": 2.5015451908111572, "learning_rate": 3.168554006968641e-05, "loss": 0.3653, "step": 8910 }, { "epoch": 11.406649616368286, "grad_norm": 0.09301682561635971, "learning_rate": 3.166376306620209e-05, "loss": 0.6217, "step": 8920 }, { "epoch": 11.419437340153452, "grad_norm": 0.0004050794232171029, "learning_rate": 3.164198606271777e-05, "loss": 0.2637, "step": 8930 }, { "epoch": 11.432225063938619, "grad_norm": 3.0390357971191406, "learning_rate": 3.162020905923345e-05, "loss": 0.4496, "step": 8940 }, { "epoch": 11.445012787723785, "grad_norm": 0.20196415483951569, "learning_rate": 3.159843205574913e-05, "loss": 0.6409, "step": 8950 }, { "epoch": 11.457800511508951, "grad_norm": 1.8359960317611694, "learning_rate": 3.157665505226481e-05, "loss": 0.147, "step": 8960 }, { "epoch": 11.470588235294118, "grad_norm": 1.8633346557617188, "learning_rate": 3.155487804878049e-05, "loss": 0.4681, "step": 8970 }, { "epoch": 11.483375959079284, "grad_norm": 0.3808380365371704, "learning_rate": 3.153310104529617e-05, "loss": 0.6801, "step": 8980 }, { "epoch": 11.49616368286445, "grad_norm": 10.161221504211426, "learning_rate": 3.151132404181185e-05, "loss": 0.2983, "step": 8990 }, { "epoch": 11.508951406649617, "grad_norm": 8.241488456726074, "learning_rate": 3.148954703832753e-05, "loss": 0.5304, "step": 9000 }, { "epoch": 11.521739130434783, "grad_norm": 0.01585804484784603, "learning_rate": 3.146777003484321e-05, "loss": 0.0876, "step": 9010 }, { "epoch": 11.53452685421995, "grad_norm": 1.2980036735534668, "learning_rate": 3.144599303135889e-05, "loss": 0.1861, "step": 9020 }, { "epoch": 11.547314578005116, "grad_norm": 0.14101967215538025, "learning_rate": 3.142421602787457e-05, "loss": 0.4377, "step": 9030 }, { "epoch": 11.56010230179028, "grad_norm": 15.285452842712402, "learning_rate": 3.140243902439025e-05, "loss": 0.2893, "step": 9040 }, { "epoch": 11.572890025575447, "grad_norm": 2.7861084938049316, "learning_rate": 3.1380662020905924e-05, "loss": 0.4202, "step": 9050 }, { "epoch": 11.585677749360613, "grad_norm": 0.6793758869171143, "learning_rate": 3.13588850174216e-05, "loss": 0.1564, "step": 9060 }, { "epoch": 11.59846547314578, "grad_norm": 1.9136697053909302, "learning_rate": 3.133710801393728e-05, "loss": 0.3935, "step": 9070 }, { "epoch": 11.611253196930946, "grad_norm": 11.733519554138184, "learning_rate": 3.131533101045296e-05, "loss": 0.4423, "step": 9080 }, { "epoch": 11.624040920716112, "grad_norm": 1.6007936000823975, "learning_rate": 3.1293554006968644e-05, "loss": 0.3038, "step": 9090 }, { "epoch": 11.636828644501279, "grad_norm": 0.06947501748800278, "learning_rate": 3.127177700348432e-05, "loss": 0.3009, "step": 9100 }, { "epoch": 11.649616368286445, "grad_norm": 1.7004108428955078, "learning_rate": 3.125e-05, "loss": 0.3394, "step": 9110 }, { "epoch": 11.662404092071611, "grad_norm": 0.46246737241744995, "learning_rate": 3.122822299651568e-05, "loss": 0.3468, "step": 9120 }, { "epoch": 11.675191815856778, "grad_norm": 2.2499334812164307, "learning_rate": 3.120644599303136e-05, "loss": 0.4545, "step": 9130 }, { "epoch": 11.687979539641944, "grad_norm": 0.09588778764009476, "learning_rate": 3.118466898954704e-05, "loss": 0.1164, "step": 9140 }, { "epoch": 11.70076726342711, "grad_norm": 2.123534679412842, "learning_rate": 3.116289198606272e-05, "loss": 0.3877, "step": 9150 }, { "epoch": 11.713554987212277, "grad_norm": 3.080892562866211, "learning_rate": 3.11411149825784e-05, "loss": 0.2416, "step": 9160 }, { "epoch": 11.726342710997443, "grad_norm": 3.8951287269592285, "learning_rate": 3.111933797909408e-05, "loss": 0.3623, "step": 9170 }, { "epoch": 11.73913043478261, "grad_norm": 9.134366035461426, "learning_rate": 3.109756097560976e-05, "loss": 0.3535, "step": 9180 }, { "epoch": 11.751918158567776, "grad_norm": 6.711736679077148, "learning_rate": 3.107578397212544e-05, "loss": 0.3882, "step": 9190 }, { "epoch": 11.764705882352942, "grad_norm": 0.008113594725728035, "learning_rate": 3.1054006968641116e-05, "loss": 0.6576, "step": 9200 }, { "epoch": 11.777493606138107, "grad_norm": 0.04739408195018768, "learning_rate": 3.10322299651568e-05, "loss": 0.2207, "step": 9210 }, { "epoch": 11.790281329923273, "grad_norm": 2.5379507541656494, "learning_rate": 3.101045296167247e-05, "loss": 0.2856, "step": 9220 }, { "epoch": 11.80306905370844, "grad_norm": 0.0009813779033720493, "learning_rate": 3.0988675958188155e-05, "loss": 0.4471, "step": 9230 }, { "epoch": 11.815856777493606, "grad_norm": 0.4562146067619324, "learning_rate": 3.096689895470383e-05, "loss": 0.1575, "step": 9240 }, { "epoch": 11.828644501278772, "grad_norm": 2.216144323348999, "learning_rate": 3.094512195121951e-05, "loss": 0.201, "step": 9250 }, { "epoch": 11.841432225063938, "grad_norm": 0.7896146178245544, "learning_rate": 3.092334494773519e-05, "loss": 0.3322, "step": 9260 }, { "epoch": 11.854219948849105, "grad_norm": 0.8330498933792114, "learning_rate": 3.090156794425087e-05, "loss": 0.2876, "step": 9270 }, { "epoch": 11.867007672634271, "grad_norm": 1.164717674255371, "learning_rate": 3.087979094076655e-05, "loss": 0.1853, "step": 9280 }, { "epoch": 11.879795396419437, "grad_norm": 0.029109498485922813, "learning_rate": 3.085801393728223e-05, "loss": 0.1326, "step": 9290 }, { "epoch": 11.892583120204604, "grad_norm": 1.593198537826538, "learning_rate": 3.083623693379791e-05, "loss": 0.4159, "step": 9300 }, { "epoch": 11.90537084398977, "grad_norm": 1.5863924026489258, "learning_rate": 3.081445993031359e-05, "loss": 0.2288, "step": 9310 }, { "epoch": 11.918158567774936, "grad_norm": 8.302711486816406, "learning_rate": 3.079268292682927e-05, "loss": 0.4357, "step": 9320 }, { "epoch": 11.930946291560103, "grad_norm": 0.07162554562091827, "learning_rate": 3.077090592334495e-05, "loss": 0.5203, "step": 9330 }, { "epoch": 11.94373401534527, "grad_norm": 5.237524509429932, "learning_rate": 3.074912891986063e-05, "loss": 0.2773, "step": 9340 }, { "epoch": 11.956521739130435, "grad_norm": 2.659153699874878, "learning_rate": 3.072735191637631e-05, "loss": 0.3073, "step": 9350 }, { "epoch": 11.969309462915602, "grad_norm": 0.9340159296989441, "learning_rate": 3.070557491289199e-05, "loss": 0.7157, "step": 9360 }, { "epoch": 11.982097186700766, "grad_norm": 1.4989861249923706, "learning_rate": 3.068379790940767e-05, "loss": 0.2925, "step": 9370 }, { "epoch": 11.994884910485933, "grad_norm": 0.7795642614364624, "learning_rate": 3.066202090592335e-05, "loss": 0.4561, "step": 9380 }, { "epoch": 12.0, "eval_loss": 0.21105726063251495, "eval_runtime": 0.9833, "eval_samples_per_second": 99.663, "eval_steps_per_second": 13.221, "step": 9384 }, { "epoch": 12.007672634271099, "grad_norm": 2.518749475479126, "learning_rate": 3.064024390243903e-05, "loss": 0.2376, "step": 9390 }, { "epoch": 12.020460358056265, "grad_norm": 0.2805574834346771, "learning_rate": 3.0618466898954704e-05, "loss": 0.457, "step": 9400 }, { "epoch": 12.033248081841432, "grad_norm": 0.059038929641246796, "learning_rate": 3.0596689895470385e-05, "loss": 0.3972, "step": 9410 }, { "epoch": 12.046035805626598, "grad_norm": 1.0303517580032349, "learning_rate": 3.057491289198606e-05, "loss": 0.1829, "step": 9420 }, { "epoch": 12.058823529411764, "grad_norm": 0.005017964635044336, "learning_rate": 3.055313588850174e-05, "loss": 0.3087, "step": 9430 }, { "epoch": 12.07161125319693, "grad_norm": 0.027756422758102417, "learning_rate": 3.0531358885017424e-05, "loss": 0.3635, "step": 9440 }, { "epoch": 12.084398976982097, "grad_norm": 1.1101689338684082, "learning_rate": 3.05095818815331e-05, "loss": 0.2655, "step": 9450 }, { "epoch": 12.097186700767264, "grad_norm": 0.0013217816594988108, "learning_rate": 3.048780487804878e-05, "loss": 0.5037, "step": 9460 }, { "epoch": 12.10997442455243, "grad_norm": 1.3896468877792358, "learning_rate": 3.0466027874564462e-05, "loss": 0.3393, "step": 9470 }, { "epoch": 12.122762148337596, "grad_norm": 9.710427284240723, "learning_rate": 3.0444250871080144e-05, "loss": 0.3086, "step": 9480 }, { "epoch": 12.135549872122763, "grad_norm": 0.02628585882484913, "learning_rate": 3.042247386759582e-05, "loss": 0.1551, "step": 9490 }, { "epoch": 12.148337595907929, "grad_norm": 4.432460784912109, "learning_rate": 3.04006968641115e-05, "loss": 0.3154, "step": 9500 }, { "epoch": 12.161125319693095, "grad_norm": 0.4521609842777252, "learning_rate": 3.0378919860627182e-05, "loss": 0.3278, "step": 9510 }, { "epoch": 12.173913043478262, "grad_norm": 2.970489978790283, "learning_rate": 3.0357142857142857e-05, "loss": 0.351, "step": 9520 }, { "epoch": 12.186700767263428, "grad_norm": 0.3053510785102844, "learning_rate": 3.0335365853658536e-05, "loss": 0.1863, "step": 9530 }, { "epoch": 12.199488491048593, "grad_norm": 8.950643539428711, "learning_rate": 3.0313588850174217e-05, "loss": 0.2964, "step": 9540 }, { "epoch": 12.212276214833759, "grad_norm": 0.10913731902837753, "learning_rate": 3.02918118466899e-05, "loss": 0.2332, "step": 9550 }, { "epoch": 12.225063938618925, "grad_norm": 9.739930152893066, "learning_rate": 3.0270034843205574e-05, "loss": 0.4687, "step": 9560 }, { "epoch": 12.237851662404092, "grad_norm": 6.348082542419434, "learning_rate": 3.0248257839721256e-05, "loss": 0.4921, "step": 9570 }, { "epoch": 12.250639386189258, "grad_norm": 0.6640710830688477, "learning_rate": 3.0226480836236938e-05, "loss": 0.5136, "step": 9580 }, { "epoch": 12.263427109974424, "grad_norm": 0.0002884014684241265, "learning_rate": 3.0204703832752613e-05, "loss": 0.3254, "step": 9590 }, { "epoch": 12.27621483375959, "grad_norm": 8.660523414611816, "learning_rate": 3.0182926829268294e-05, "loss": 0.3539, "step": 9600 }, { "epoch": 12.289002557544757, "grad_norm": 25.596059799194336, "learning_rate": 3.0161149825783973e-05, "loss": 0.2523, "step": 9610 }, { "epoch": 12.301790281329923, "grad_norm": 0.1933545619249344, "learning_rate": 3.0139372822299655e-05, "loss": 0.2112, "step": 9620 }, { "epoch": 12.31457800511509, "grad_norm": 0.2891666293144226, "learning_rate": 3.011759581881533e-05, "loss": 0.1593, "step": 9630 }, { "epoch": 12.327365728900256, "grad_norm": 2.7486932277679443, "learning_rate": 3.009581881533101e-05, "loss": 0.4398, "step": 9640 }, { "epoch": 12.340153452685422, "grad_norm": 0.8518115282058716, "learning_rate": 3.0074041811846693e-05, "loss": 0.2917, "step": 9650 }, { "epoch": 12.352941176470589, "grad_norm": 9.21524429321289, "learning_rate": 3.0052264808362368e-05, "loss": 0.573, "step": 9660 }, { "epoch": 12.365728900255755, "grad_norm": 0.5691689848899841, "learning_rate": 3.003048780487805e-05, "loss": 0.4763, "step": 9670 }, { "epoch": 12.378516624040921, "grad_norm": 9.370003700256348, "learning_rate": 3.000871080139373e-05, "loss": 0.2633, "step": 9680 }, { "epoch": 12.391304347826088, "grad_norm": 17.42327880859375, "learning_rate": 2.998693379790941e-05, "loss": 0.2829, "step": 9690 }, { "epoch": 12.404092071611252, "grad_norm": 3.5533621311187744, "learning_rate": 2.9965156794425088e-05, "loss": 0.386, "step": 9700 }, { "epoch": 12.416879795396419, "grad_norm": 6.838221073150635, "learning_rate": 2.9943379790940767e-05, "loss": 0.3875, "step": 9710 }, { "epoch": 12.429667519181585, "grad_norm": 4.223023414611816, "learning_rate": 2.9921602787456448e-05, "loss": 0.3517, "step": 9720 }, { "epoch": 12.442455242966751, "grad_norm": 1.94353449344635, "learning_rate": 2.989982578397213e-05, "loss": 0.3197, "step": 9730 }, { "epoch": 12.455242966751918, "grad_norm": 2.8062820434570312, "learning_rate": 2.9878048780487805e-05, "loss": 0.1984, "step": 9740 }, { "epoch": 12.468030690537084, "grad_norm": 1.3602440357208252, "learning_rate": 2.9856271777003487e-05, "loss": 0.3751, "step": 9750 }, { "epoch": 12.48081841432225, "grad_norm": 1.7058247327804565, "learning_rate": 2.983449477351917e-05, "loss": 0.2914, "step": 9760 }, { "epoch": 12.493606138107417, "grad_norm": 6.558533191680908, "learning_rate": 2.9812717770034843e-05, "loss": 0.4901, "step": 9770 }, { "epoch": 12.506393861892583, "grad_norm": 16.329387664794922, "learning_rate": 2.9790940766550525e-05, "loss": 0.2571, "step": 9780 }, { "epoch": 12.51918158567775, "grad_norm": 1.404144287109375, "learning_rate": 2.9769163763066204e-05, "loss": 0.315, "step": 9790 }, { "epoch": 12.531969309462916, "grad_norm": 1.7159531116485596, "learning_rate": 2.9747386759581885e-05, "loss": 0.3895, "step": 9800 }, { "epoch": 12.544757033248082, "grad_norm": 1.3657723665237427, "learning_rate": 2.972560975609756e-05, "loss": 0.1817, "step": 9810 }, { "epoch": 12.557544757033249, "grad_norm": 1.2785223722457886, "learning_rate": 2.9703832752613242e-05, "loss": 0.3728, "step": 9820 }, { "epoch": 12.570332480818415, "grad_norm": 4.575535774230957, "learning_rate": 2.9682055749128924e-05, "loss": 0.3806, "step": 9830 }, { "epoch": 12.583120204603581, "grad_norm": 5.597801208496094, "learning_rate": 2.96602787456446e-05, "loss": 0.2533, "step": 9840 }, { "epoch": 12.595907928388748, "grad_norm": 0.038497261703014374, "learning_rate": 2.963850174216028e-05, "loss": 0.3971, "step": 9850 }, { "epoch": 12.608695652173914, "grad_norm": 1.231428861618042, "learning_rate": 2.9616724738675962e-05, "loss": 0.4486, "step": 9860 }, { "epoch": 12.621483375959079, "grad_norm": 3.3973329067230225, "learning_rate": 2.959494773519164e-05, "loss": 0.2941, "step": 9870 }, { "epoch": 12.634271099744245, "grad_norm": 0.8516256213188171, "learning_rate": 2.9573170731707316e-05, "loss": 0.449, "step": 9880 }, { "epoch": 12.647058823529411, "grad_norm": 10.69143295288086, "learning_rate": 2.9551393728222997e-05, "loss": 0.5526, "step": 9890 }, { "epoch": 12.659846547314578, "grad_norm": 9.207474708557129, "learning_rate": 2.952961672473868e-05, "loss": 0.3616, "step": 9900 }, { "epoch": 12.672634271099744, "grad_norm": 0.3940543532371521, "learning_rate": 2.9507839721254354e-05, "loss": 0.3483, "step": 9910 }, { "epoch": 12.68542199488491, "grad_norm": 1.1689401865005493, "learning_rate": 2.9486062717770036e-05, "loss": 0.3215, "step": 9920 }, { "epoch": 12.698209718670077, "grad_norm": 3.243196487426758, "learning_rate": 2.9464285714285718e-05, "loss": 0.335, "step": 9930 }, { "epoch": 12.710997442455243, "grad_norm": 0.01855871081352234, "learning_rate": 2.9442508710801396e-05, "loss": 0.2783, "step": 9940 }, { "epoch": 12.72378516624041, "grad_norm": 0.066720150411129, "learning_rate": 2.9420731707317074e-05, "loss": 0.3905, "step": 9950 }, { "epoch": 12.736572890025576, "grad_norm": 3.417693853378296, "learning_rate": 2.9398954703832753e-05, "loss": 0.2852, "step": 9960 }, { "epoch": 12.749360613810742, "grad_norm": 1.7551138401031494, "learning_rate": 2.9377177700348434e-05, "loss": 0.2011, "step": 9970 }, { "epoch": 12.762148337595908, "grad_norm": 6.556581497192383, "learning_rate": 2.935540069686411e-05, "loss": 0.3613, "step": 9980 }, { "epoch": 12.774936061381075, "grad_norm": 0.9520956873893738, "learning_rate": 2.933362369337979e-05, "loss": 0.2462, "step": 9990 }, { "epoch": 12.787723785166241, "grad_norm": 2.980120734719094e-05, "learning_rate": 2.9311846689895473e-05, "loss": 0.2549, "step": 10000 }, { "epoch": 12.800511508951407, "grad_norm": 0.20872947573661804, "learning_rate": 2.9290069686411155e-05, "loss": 0.2909, "step": 10010 }, { "epoch": 12.813299232736572, "grad_norm": 0.3826689124107361, "learning_rate": 2.926829268292683e-05, "loss": 0.4378, "step": 10020 }, { "epoch": 12.826086956521738, "grad_norm": 8.614890098571777, "learning_rate": 2.924651567944251e-05, "loss": 0.4704, "step": 10030 }, { "epoch": 12.838874680306905, "grad_norm": 3.4223175048828125, "learning_rate": 2.922473867595819e-05, "loss": 0.526, "step": 10040 }, { "epoch": 12.851662404092071, "grad_norm": 1.2012652158737183, "learning_rate": 2.9202961672473868e-05, "loss": 0.2264, "step": 10050 }, { "epoch": 12.864450127877237, "grad_norm": 2.6389079093933105, "learning_rate": 2.9181184668989546e-05, "loss": 0.3546, "step": 10060 }, { "epoch": 12.877237851662404, "grad_norm": 1.9273570775985718, "learning_rate": 2.9159407665505228e-05, "loss": 0.1862, "step": 10070 }, { "epoch": 12.89002557544757, "grad_norm": 3.80769419670105, "learning_rate": 2.913763066202091e-05, "loss": 0.2989, "step": 10080 }, { "epoch": 12.902813299232736, "grad_norm": 1.0025523900985718, "learning_rate": 2.9115853658536585e-05, "loss": 0.4393, "step": 10090 }, { "epoch": 12.915601023017903, "grad_norm": 0.254962682723999, "learning_rate": 2.9094076655052267e-05, "loss": 0.5615, "step": 10100 }, { "epoch": 12.92838874680307, "grad_norm": 0.8748624324798584, "learning_rate": 2.9072299651567948e-05, "loss": 0.3632, "step": 10110 }, { "epoch": 12.941176470588236, "grad_norm": 2.6783287525177, "learning_rate": 2.9050522648083623e-05, "loss": 0.5758, "step": 10120 }, { "epoch": 12.953964194373402, "grad_norm": 0.5378952622413635, "learning_rate": 2.9028745644599305e-05, "loss": 0.2729, "step": 10130 }, { "epoch": 12.966751918158568, "grad_norm": 0.02223265916109085, "learning_rate": 2.9006968641114983e-05, "loss": 0.7055, "step": 10140 }, { "epoch": 12.979539641943735, "grad_norm": 6.515031814575195, "learning_rate": 2.8985191637630665e-05, "loss": 0.3679, "step": 10150 }, { "epoch": 12.992327365728901, "grad_norm": 1.4198124408721924, "learning_rate": 2.896341463414634e-05, "loss": 0.1918, "step": 10160 }, { "epoch": 13.0, "eval_loss": 0.21145901083946228, "eval_runtime": 0.9726, "eval_samples_per_second": 100.758, "eval_steps_per_second": 13.366, "step": 10166 }, { "epoch": 13.005115089514067, "grad_norm": 7.412026882171631, "learning_rate": 2.8941637630662022e-05, "loss": 0.3834, "step": 10170 }, { "epoch": 13.017902813299234, "grad_norm": 5.850493907928467, "learning_rate": 2.8919860627177704e-05, "loss": 0.1763, "step": 10180 }, { "epoch": 13.030690537084398, "grad_norm": 2.427426338195801, "learning_rate": 2.8898083623693385e-05, "loss": 0.4808, "step": 10190 }, { "epoch": 13.043478260869565, "grad_norm": 0.0027793091721832752, "learning_rate": 2.887630662020906e-05, "loss": 0.3402, "step": 10200 }, { "epoch": 13.05626598465473, "grad_norm": 4.905974388122559, "learning_rate": 2.885452961672474e-05, "loss": 0.2913, "step": 10210 }, { "epoch": 13.069053708439897, "grad_norm": 0.002724498976022005, "learning_rate": 2.883275261324042e-05, "loss": 0.1021, "step": 10220 }, { "epoch": 13.081841432225064, "grad_norm": 1.5594589710235596, "learning_rate": 2.8810975609756095e-05, "loss": 0.0938, "step": 10230 }, { "epoch": 13.09462915601023, "grad_norm": 8.709442138671875, "learning_rate": 2.8789198606271777e-05, "loss": 0.3254, "step": 10240 }, { "epoch": 13.107416879795396, "grad_norm": 9.628579139709473, "learning_rate": 2.876742160278746e-05, "loss": 0.4707, "step": 10250 }, { "epoch": 13.120204603580563, "grad_norm": 7.752285003662109, "learning_rate": 2.874564459930314e-05, "loss": 0.5434, "step": 10260 }, { "epoch": 13.132992327365729, "grad_norm": 1.0182121992111206, "learning_rate": 2.8723867595818816e-05, "loss": 0.4221, "step": 10270 }, { "epoch": 13.145780051150895, "grad_norm": 6.204538345336914, "learning_rate": 2.8702090592334497e-05, "loss": 0.25, "step": 10280 }, { "epoch": 13.158567774936062, "grad_norm": 4.693891525268555, "learning_rate": 2.8680313588850176e-05, "loss": 0.5462, "step": 10290 }, { "epoch": 13.171355498721228, "grad_norm": 0.026867728680372238, "learning_rate": 2.8658536585365854e-05, "loss": 0.0342, "step": 10300 }, { "epoch": 13.184143222506394, "grad_norm": 0.9486016035079956, "learning_rate": 2.8636759581881532e-05, "loss": 0.4285, "step": 10310 }, { "epoch": 13.19693094629156, "grad_norm": 0.1023603156208992, "learning_rate": 2.8614982578397214e-05, "loss": 0.0734, "step": 10320 }, { "epoch": 13.209718670076727, "grad_norm": 2.3830373287200928, "learning_rate": 2.8593205574912896e-05, "loss": 0.219, "step": 10330 }, { "epoch": 13.222506393861893, "grad_norm": 0.04898781701922417, "learning_rate": 2.857142857142857e-05, "loss": 0.0627, "step": 10340 }, { "epoch": 13.235294117647058, "grad_norm": 0.0003403863520361483, "learning_rate": 2.8549651567944253e-05, "loss": 0.5191, "step": 10350 }, { "epoch": 13.248081841432224, "grad_norm": 6.243635654449463, "learning_rate": 2.8527874564459934e-05, "loss": 0.5085, "step": 10360 }, { "epoch": 13.26086956521739, "grad_norm": 3.0353028774261475, "learning_rate": 2.850609756097561e-05, "loss": 0.3381, "step": 10370 }, { "epoch": 13.273657289002557, "grad_norm": 3.330402374267578, "learning_rate": 2.848432055749129e-05, "loss": 0.7394, "step": 10380 }, { "epoch": 13.286445012787723, "grad_norm": 8.248026847839355, "learning_rate": 2.846254355400697e-05, "loss": 0.3344, "step": 10390 }, { "epoch": 13.29923273657289, "grad_norm": 9.930607795715332, "learning_rate": 2.844076655052265e-05, "loss": 0.3615, "step": 10400 }, { "epoch": 13.312020460358056, "grad_norm": 3.8150908946990967, "learning_rate": 2.8418989547038326e-05, "loss": 0.4423, "step": 10410 }, { "epoch": 13.324808184143222, "grad_norm": 0.0005007564323022962, "learning_rate": 2.8397212543554008e-05, "loss": 0.4379, "step": 10420 }, { "epoch": 13.337595907928389, "grad_norm": 10.748580932617188, "learning_rate": 2.837543554006969e-05, "loss": 0.7275, "step": 10430 }, { "epoch": 13.350383631713555, "grad_norm": 2.6858019828796387, "learning_rate": 2.8353658536585365e-05, "loss": 0.3561, "step": 10440 }, { "epoch": 13.363171355498721, "grad_norm": 0.2765854299068451, "learning_rate": 2.8331881533101046e-05, "loss": 0.3783, "step": 10450 }, { "epoch": 13.375959079283888, "grad_norm": 7.886977672576904, "learning_rate": 2.8310104529616728e-05, "loss": 0.3486, "step": 10460 }, { "epoch": 13.388746803069054, "grad_norm": 0.49375006556510925, "learning_rate": 2.8288327526132406e-05, "loss": 0.348, "step": 10470 }, { "epoch": 13.40153452685422, "grad_norm": 1.5718512535095215, "learning_rate": 2.826655052264808e-05, "loss": 0.3416, "step": 10480 }, { "epoch": 13.414322250639387, "grad_norm": 0.06271866708993912, "learning_rate": 2.8244773519163763e-05, "loss": 0.1687, "step": 10490 }, { "epoch": 13.427109974424553, "grad_norm": 3.52473521232605, "learning_rate": 2.8222996515679445e-05, "loss": 0.5941, "step": 10500 }, { "epoch": 13.43989769820972, "grad_norm": 0.5428152084350586, "learning_rate": 2.820121951219512e-05, "loss": 0.4202, "step": 10510 }, { "epoch": 13.452685421994884, "grad_norm": 0.05799586698412895, "learning_rate": 2.81794425087108e-05, "loss": 0.2371, "step": 10520 }, { "epoch": 13.46547314578005, "grad_norm": 3.599447250366211, "learning_rate": 2.8157665505226483e-05, "loss": 0.5036, "step": 10530 }, { "epoch": 13.478260869565217, "grad_norm": 2.957773208618164, "learning_rate": 2.8135888501742165e-05, "loss": 0.5503, "step": 10540 }, { "epoch": 13.491048593350383, "grad_norm": 3.7019054889678955, "learning_rate": 2.811411149825784e-05, "loss": 0.175, "step": 10550 }, { "epoch": 13.50383631713555, "grad_norm": 0.025149650871753693, "learning_rate": 2.809233449477352e-05, "loss": 0.3618, "step": 10560 }, { "epoch": 13.516624040920716, "grad_norm": 5.196008205413818, "learning_rate": 2.80705574912892e-05, "loss": 0.1966, "step": 10570 }, { "epoch": 13.529411764705882, "grad_norm": 0.31828802824020386, "learning_rate": 2.8048780487804882e-05, "loss": 0.2693, "step": 10580 }, { "epoch": 13.542199488491049, "grad_norm": 0.01727711223065853, "learning_rate": 2.8027003484320557e-05, "loss": 0.1533, "step": 10590 }, { "epoch": 13.554987212276215, "grad_norm": 67.53722381591797, "learning_rate": 2.800522648083624e-05, "loss": 0.2984, "step": 10600 }, { "epoch": 13.567774936061381, "grad_norm": 5.63265323638916, "learning_rate": 2.798344947735192e-05, "loss": 0.3842, "step": 10610 }, { "epoch": 13.580562659846548, "grad_norm": 0.00870454777032137, "learning_rate": 2.7961672473867595e-05, "loss": 0.2308, "step": 10620 }, { "epoch": 13.593350383631714, "grad_norm": 5.576174259185791, "learning_rate": 2.7939895470383277e-05, "loss": 0.4208, "step": 10630 }, { "epoch": 13.60613810741688, "grad_norm": 6.635630130767822, "learning_rate": 2.7918118466898955e-05, "loss": 0.2455, "step": 10640 }, { "epoch": 13.618925831202047, "grad_norm": 3.7037808895111084, "learning_rate": 2.7896341463414637e-05, "loss": 0.3545, "step": 10650 }, { "epoch": 13.631713554987213, "grad_norm": 8.49842357635498, "learning_rate": 2.7874564459930312e-05, "loss": 0.291, "step": 10660 }, { "epoch": 13.644501278772378, "grad_norm": 2.637025833129883, "learning_rate": 2.7852787456445994e-05, "loss": 0.3177, "step": 10670 }, { "epoch": 13.657289002557544, "grad_norm": 1.063649296760559, "learning_rate": 2.7831010452961676e-05, "loss": 0.3922, "step": 10680 }, { "epoch": 13.67007672634271, "grad_norm": 2.770068883895874, "learning_rate": 2.780923344947735e-05, "loss": 0.4757, "step": 10690 }, { "epoch": 13.682864450127877, "grad_norm": 6.572999000549316, "learning_rate": 2.7787456445993032e-05, "loss": 0.3792, "step": 10700 }, { "epoch": 13.695652173913043, "grad_norm": 0.05444970726966858, "learning_rate": 2.7765679442508714e-05, "loss": 0.179, "step": 10710 }, { "epoch": 13.70843989769821, "grad_norm": 0.025916630402207375, "learning_rate": 2.7743902439024393e-05, "loss": 0.2356, "step": 10720 }, { "epoch": 13.721227621483376, "grad_norm": 5.852172374725342, "learning_rate": 2.772212543554007e-05, "loss": 0.4052, "step": 10730 }, { "epoch": 13.734015345268542, "grad_norm": 1.6785579919815063, "learning_rate": 2.770034843205575e-05, "loss": 0.2606, "step": 10740 }, { "epoch": 13.746803069053708, "grad_norm": 0.3501584231853485, "learning_rate": 2.767857142857143e-05, "loss": 0.1516, "step": 10750 }, { "epoch": 13.759590792838875, "grad_norm": 0.0033246350940316916, "learning_rate": 2.7656794425087106e-05, "loss": 0.2981, "step": 10760 }, { "epoch": 13.772378516624041, "grad_norm": 10.814009666442871, "learning_rate": 2.7635017421602788e-05, "loss": 0.5273, "step": 10770 }, { "epoch": 13.785166240409207, "grad_norm": 0.16116586327552795, "learning_rate": 2.761324041811847e-05, "loss": 0.1441, "step": 10780 }, { "epoch": 13.797953964194374, "grad_norm": 0.0063223871402442455, "learning_rate": 2.759146341463415e-05, "loss": 0.1038, "step": 10790 }, { "epoch": 13.81074168797954, "grad_norm": 3.8742804527282715, "learning_rate": 2.7569686411149826e-05, "loss": 0.2702, "step": 10800 }, { "epoch": 13.823529411764707, "grad_norm": 0.9244747757911682, "learning_rate": 2.7547909407665508e-05, "loss": 0.1768, "step": 10810 }, { "epoch": 13.836317135549873, "grad_norm": 0.29645875096321106, "learning_rate": 2.7526132404181186e-05, "loss": 0.1818, "step": 10820 }, { "epoch": 13.84910485933504, "grad_norm": 1.829161286354065, "learning_rate": 2.750435540069686e-05, "loss": 0.2051, "step": 10830 }, { "epoch": 13.861892583120204, "grad_norm": 2.0264694690704346, "learning_rate": 2.7482578397212543e-05, "loss": 0.328, "step": 10840 }, { "epoch": 13.87468030690537, "grad_norm": 1.9954897165298462, "learning_rate": 2.7460801393728225e-05, "loss": 0.2223, "step": 10850 }, { "epoch": 13.887468030690536, "grad_norm": 7.2884521484375, "learning_rate": 2.7439024390243906e-05, "loss": 0.3175, "step": 10860 }, { "epoch": 13.900255754475703, "grad_norm": 3.5259621143341064, "learning_rate": 2.741724738675958e-05, "loss": 0.5335, "step": 10870 }, { "epoch": 13.91304347826087, "grad_norm": 1.8546830415725708, "learning_rate": 2.7395470383275263e-05, "loss": 0.3467, "step": 10880 }, { "epoch": 13.925831202046036, "grad_norm": 0.04471950605511665, "learning_rate": 2.7373693379790945e-05, "loss": 0.5066, "step": 10890 }, { "epoch": 13.938618925831202, "grad_norm": 0.1593080461025238, "learning_rate": 2.735191637630662e-05, "loss": 0.2206, "step": 10900 }, { "epoch": 13.951406649616368, "grad_norm": 0.12783415615558624, "learning_rate": 2.7330139372822298e-05, "loss": 0.2484, "step": 10910 }, { "epoch": 13.964194373401535, "grad_norm": 6.850285530090332, "learning_rate": 2.730836236933798e-05, "loss": 0.2997, "step": 10920 }, { "epoch": 13.976982097186701, "grad_norm": 0.002649192698299885, "learning_rate": 2.7286585365853662e-05, "loss": 0.323, "step": 10930 }, { "epoch": 13.989769820971867, "grad_norm": 6.734117031097412, "learning_rate": 2.7264808362369337e-05, "loss": 0.4467, "step": 10940 }, { "epoch": 14.0, "eval_loss": 0.20195920765399933, "eval_runtime": 0.9722, "eval_samples_per_second": 100.807, "eval_steps_per_second": 13.372, "step": 10948 }, { "epoch": 14.002557544757034, "grad_norm": 9.695572853088379, "learning_rate": 2.724303135888502e-05, "loss": 0.4267, "step": 10950 }, { "epoch": 14.0153452685422, "grad_norm": 1.4536675214767456, "learning_rate": 2.72212543554007e-05, "loss": 0.0807, "step": 10960 }, { "epoch": 14.028132992327366, "grad_norm": 0.2081240564584732, "learning_rate": 2.7199477351916382e-05, "loss": 0.5784, "step": 10970 }, { "epoch": 14.040920716112533, "grad_norm": 0.38465332984924316, "learning_rate": 2.7177700348432057e-05, "loss": 0.4252, "step": 10980 }, { "epoch": 14.053708439897699, "grad_norm": 0.16156940162181854, "learning_rate": 2.7155923344947735e-05, "loss": 0.2058, "step": 10990 }, { "epoch": 14.066496163682864, "grad_norm": 4.0370917320251465, "learning_rate": 2.7134146341463417e-05, "loss": 0.5383, "step": 11000 }, { "epoch": 14.07928388746803, "grad_norm": 12.344548225402832, "learning_rate": 2.7112369337979092e-05, "loss": 0.3364, "step": 11010 }, { "epoch": 14.092071611253196, "grad_norm": 4.156929969787598, "learning_rate": 2.7090592334494774e-05, "loss": 0.3422, "step": 11020 }, { "epoch": 14.104859335038363, "grad_norm": 1.435106873512268, "learning_rate": 2.7068815331010456e-05, "loss": 0.3734, "step": 11030 }, { "epoch": 14.117647058823529, "grad_norm": 0.010196288116276264, "learning_rate": 2.7047038327526137e-05, "loss": 0.4882, "step": 11040 }, { "epoch": 14.130434782608695, "grad_norm": 3.2925307750701904, "learning_rate": 2.7025261324041812e-05, "loss": 0.2681, "step": 11050 }, { "epoch": 14.143222506393862, "grad_norm": 0.34978732466697693, "learning_rate": 2.7003484320557494e-05, "loss": 0.3192, "step": 11060 }, { "epoch": 14.156010230179028, "grad_norm": 8.82948112487793, "learning_rate": 2.6981707317073172e-05, "loss": 0.2327, "step": 11070 }, { "epoch": 14.168797953964194, "grad_norm": 7.798665523529053, "learning_rate": 2.695993031358885e-05, "loss": 0.3433, "step": 11080 }, { "epoch": 14.18158567774936, "grad_norm": 5.831617832183838, "learning_rate": 2.693815331010453e-05, "loss": 0.4675, "step": 11090 }, { "epoch": 14.194373401534527, "grad_norm": 7.787278652191162, "learning_rate": 2.691637630662021e-05, "loss": 0.3635, "step": 11100 }, { "epoch": 14.207161125319693, "grad_norm": 2.973318576812744, "learning_rate": 2.6894599303135893e-05, "loss": 0.2639, "step": 11110 }, { "epoch": 14.21994884910486, "grad_norm": 1.2989455461502075, "learning_rate": 2.6872822299651568e-05, "loss": 0.1029, "step": 11120 }, { "epoch": 14.232736572890026, "grad_norm": 4.172037124633789, "learning_rate": 2.685104529616725e-05, "loss": 0.3039, "step": 11130 }, { "epoch": 14.245524296675192, "grad_norm": 5.493216037750244, "learning_rate": 2.682926829268293e-05, "loss": 0.3115, "step": 11140 }, { "epoch": 14.258312020460359, "grad_norm": 2.573470115661621, "learning_rate": 2.6807491289198606e-05, "loss": 0.1641, "step": 11150 }, { "epoch": 14.271099744245525, "grad_norm": 1.0581598281860352, "learning_rate": 2.6785714285714288e-05, "loss": 0.2258, "step": 11160 }, { "epoch": 14.28388746803069, "grad_norm": 0.6617485880851746, "learning_rate": 2.6763937282229966e-05, "loss": 0.1989, "step": 11170 }, { "epoch": 14.296675191815856, "grad_norm": 3.649031162261963, "learning_rate": 2.6742160278745648e-05, "loss": 0.1438, "step": 11180 }, { "epoch": 14.309462915601022, "grad_norm": 5.712010383605957, "learning_rate": 2.6720383275261323e-05, "loss": 0.404, "step": 11190 }, { "epoch": 14.322250639386189, "grad_norm": 4.470798492431641, "learning_rate": 2.6698606271777005e-05, "loss": 0.6005, "step": 11200 }, { "epoch": 14.335038363171355, "grad_norm": 5.101674556732178, "learning_rate": 2.6676829268292686e-05, "loss": 0.3332, "step": 11210 }, { "epoch": 14.347826086956522, "grad_norm": 5.864815711975098, "learning_rate": 2.665505226480836e-05, "loss": 0.25, "step": 11220 }, { "epoch": 14.360613810741688, "grad_norm": 4.044086933135986, "learning_rate": 2.6633275261324043e-05, "loss": 0.284, "step": 11230 }, { "epoch": 14.373401534526854, "grad_norm": 1.0305874347686768, "learning_rate": 2.6611498257839725e-05, "loss": 0.3079, "step": 11240 }, { "epoch": 14.38618925831202, "grad_norm": 2.8594095706939697, "learning_rate": 2.6589721254355403e-05, "loss": 0.194, "step": 11250 }, { "epoch": 14.398976982097187, "grad_norm": 0.24175876379013062, "learning_rate": 2.6567944250871078e-05, "loss": 0.2451, "step": 11260 }, { "epoch": 14.411764705882353, "grad_norm": 1.0513581037521362, "learning_rate": 2.654616724738676e-05, "loss": 0.1766, "step": 11270 }, { "epoch": 14.42455242966752, "grad_norm": 2.0483906269073486, "learning_rate": 2.652439024390244e-05, "loss": 0.2997, "step": 11280 }, { "epoch": 14.437340153452686, "grad_norm": 6.397331714630127, "learning_rate": 2.6502613240418117e-05, "loss": 0.2371, "step": 11290 }, { "epoch": 14.450127877237852, "grad_norm": 0.0007806714274920523, "learning_rate": 2.6480836236933798e-05, "loss": 0.4092, "step": 11300 }, { "epoch": 14.462915601023019, "grad_norm": 9.310996055603027, "learning_rate": 2.645905923344948e-05, "loss": 0.1948, "step": 11310 }, { "epoch": 14.475703324808185, "grad_norm": 0.010096575133502483, "learning_rate": 2.6437282229965162e-05, "loss": 0.4244, "step": 11320 }, { "epoch": 14.48849104859335, "grad_norm": 0.21134832501411438, "learning_rate": 2.6415505226480837e-05, "loss": 0.1581, "step": 11330 }, { "epoch": 14.501278772378516, "grad_norm": 4.014981746673584, "learning_rate": 2.6393728222996515e-05, "loss": 0.3811, "step": 11340 }, { "epoch": 14.514066496163682, "grad_norm": 2.516115427017212, "learning_rate": 2.6371951219512197e-05, "loss": 0.358, "step": 11350 }, { "epoch": 14.526854219948849, "grad_norm": 1.9139074087142944, "learning_rate": 2.6350174216027872e-05, "loss": 0.1782, "step": 11360 }, { "epoch": 14.539641943734015, "grad_norm": 0.18447408080101013, "learning_rate": 2.6328397212543554e-05, "loss": 0.2078, "step": 11370 }, { "epoch": 14.552429667519181, "grad_norm": 2.9752326011657715, "learning_rate": 2.6306620209059235e-05, "loss": 0.4245, "step": 11380 }, { "epoch": 14.565217391304348, "grad_norm": 6.934181213378906, "learning_rate": 2.6284843205574917e-05, "loss": 0.4025, "step": 11390 }, { "epoch": 14.578005115089514, "grad_norm": 0.07944456487894058, "learning_rate": 2.6263066202090592e-05, "loss": 0.1398, "step": 11400 }, { "epoch": 14.59079283887468, "grad_norm": 2.6303796768188477, "learning_rate": 2.6241289198606274e-05, "loss": 0.1872, "step": 11410 }, { "epoch": 14.603580562659847, "grad_norm": 9.203404426574707, "learning_rate": 2.6219512195121952e-05, "loss": 0.7493, "step": 11420 }, { "epoch": 14.616368286445013, "grad_norm": 5.315173149108887, "learning_rate": 2.6197735191637634e-05, "loss": 0.3889, "step": 11430 }, { "epoch": 14.62915601023018, "grad_norm": 1.0642848014831543, "learning_rate": 2.617595818815331e-05, "loss": 0.2307, "step": 11440 }, { "epoch": 14.641943734015346, "grad_norm": 1.01585853099823, "learning_rate": 2.615418118466899e-05, "loss": 0.1184, "step": 11450 }, { "epoch": 14.654731457800512, "grad_norm": 0.000667865970171988, "learning_rate": 2.6132404181184672e-05, "loss": 0.3112, "step": 11460 }, { "epoch": 14.667519181585678, "grad_norm": 0.2515814006328583, "learning_rate": 2.6110627177700347e-05, "loss": 0.1608, "step": 11470 }, { "epoch": 14.680306905370845, "grad_norm": 1.5224660634994507, "learning_rate": 2.608885017421603e-05, "loss": 0.2974, "step": 11480 }, { "epoch": 14.693094629156011, "grad_norm": 0.15977902710437775, "learning_rate": 2.606707317073171e-05, "loss": 0.6651, "step": 11490 }, { "epoch": 14.705882352941176, "grad_norm": 4.893415451049805, "learning_rate": 2.604529616724739e-05, "loss": 0.4917, "step": 11500 }, { "epoch": 14.718670076726342, "grad_norm": 1.119374394416809, "learning_rate": 2.6023519163763068e-05, "loss": 0.0827, "step": 11510 }, { "epoch": 14.731457800511508, "grad_norm": 1.0671442747116089, "learning_rate": 2.6001742160278746e-05, "loss": 0.2147, "step": 11520 }, { "epoch": 14.744245524296675, "grad_norm": 9.146306037902832, "learning_rate": 2.5979965156794428e-05, "loss": 0.5441, "step": 11530 }, { "epoch": 14.757033248081841, "grad_norm": 3.6414523124694824, "learning_rate": 2.5958188153310103e-05, "loss": 0.2364, "step": 11540 }, { "epoch": 14.769820971867007, "grad_norm": 4.342695236206055, "learning_rate": 2.5936411149825784e-05, "loss": 0.596, "step": 11550 }, { "epoch": 14.782608695652174, "grad_norm": 0.005332822445780039, "learning_rate": 2.5914634146341466e-05, "loss": 0.1528, "step": 11560 }, { "epoch": 14.79539641943734, "grad_norm": 2.429649591445923, "learning_rate": 2.5892857142857148e-05, "loss": 0.2345, "step": 11570 }, { "epoch": 14.808184143222507, "grad_norm": 1.8359285593032837, "learning_rate": 2.5871080139372823e-05, "loss": 0.3112, "step": 11580 }, { "epoch": 14.820971867007673, "grad_norm": 0.00027430267073214054, "learning_rate": 2.5849303135888505e-05, "loss": 0.1532, "step": 11590 }, { "epoch": 14.83375959079284, "grad_norm": 10.834996223449707, "learning_rate": 2.5827526132404183e-05, "loss": 0.2621, "step": 11600 }, { "epoch": 14.846547314578006, "grad_norm": 0.0005706704687327147, "learning_rate": 2.5805749128919858e-05, "loss": 0.543, "step": 11610 }, { "epoch": 14.859335038363172, "grad_norm": 10.665911674499512, "learning_rate": 2.578397212543554e-05, "loss": 0.3869, "step": 11620 }, { "epoch": 14.872122762148338, "grad_norm": 8.830192565917969, "learning_rate": 2.576219512195122e-05, "loss": 0.4839, "step": 11630 }, { "epoch": 14.884910485933505, "grad_norm": 4.080071926116943, "learning_rate": 2.5740418118466903e-05, "loss": 0.1488, "step": 11640 }, { "epoch": 14.89769820971867, "grad_norm": 0.40564408898353577, "learning_rate": 2.5718641114982578e-05, "loss": 0.1852, "step": 11650 }, { "epoch": 14.910485933503836, "grad_norm": 2.581697940826416, "learning_rate": 2.569686411149826e-05, "loss": 0.4561, "step": 11660 }, { "epoch": 14.923273657289002, "grad_norm": 0.11887872964143753, "learning_rate": 2.567508710801394e-05, "loss": 0.2166, "step": 11670 }, { "epoch": 14.936061381074168, "grad_norm": 1.9810340404510498, "learning_rate": 2.5653310104529617e-05, "loss": 0.2498, "step": 11680 }, { "epoch": 14.948849104859335, "grad_norm": 0.0242207869887352, "learning_rate": 2.5631533101045295e-05, "loss": 0.3475, "step": 11690 }, { "epoch": 14.961636828644501, "grad_norm": 3.3264272212982178, "learning_rate": 2.5609756097560977e-05, "loss": 0.513, "step": 11700 }, { "epoch": 14.974424552429667, "grad_norm": 2.6883902549743652, "learning_rate": 2.558797909407666e-05, "loss": 0.2488, "step": 11710 }, { "epoch": 14.987212276214834, "grad_norm": 1.0811492204666138, "learning_rate": 2.5566202090592333e-05, "loss": 0.2837, "step": 11720 }, { "epoch": 15.0, "grad_norm": 3.0630617141723633, "learning_rate": 2.5544425087108015e-05, "loss": 0.4542, "step": 11730 }, { "epoch": 15.0, "eval_loss": 0.1965622454881668, "eval_runtime": 0.8126, "eval_samples_per_second": 120.604, "eval_steps_per_second": 15.998, "step": 11730 }, { "epoch": 15.012787723785166, "grad_norm": 2.454094171524048, "learning_rate": 2.5522648083623697e-05, "loss": 0.226, "step": 11740 }, { "epoch": 15.025575447570333, "grad_norm": 3.766000986099243, "learning_rate": 2.5500871080139372e-05, "loss": 0.2517, "step": 11750 }, { "epoch": 15.038363171355499, "grad_norm": 7.74129056930542, "learning_rate": 2.5479094076655054e-05, "loss": 0.2052, "step": 11760 }, { "epoch": 15.051150895140665, "grad_norm": 2.924163341522217, "learning_rate": 2.5457317073170732e-05, "loss": 0.361, "step": 11770 }, { "epoch": 15.063938618925832, "grad_norm": 0.38849493861198425, "learning_rate": 2.5435540069686414e-05, "loss": 0.2116, "step": 11780 }, { "epoch": 15.076726342710998, "grad_norm": 0.18012334406375885, "learning_rate": 2.541376306620209e-05, "loss": 0.3311, "step": 11790 }, { "epoch": 15.089514066496164, "grad_norm": 5.127551078796387, "learning_rate": 2.539198606271777e-05, "loss": 0.3043, "step": 11800 }, { "epoch": 15.10230179028133, "grad_norm": 2.3291471004486084, "learning_rate": 2.5370209059233452e-05, "loss": 0.457, "step": 11810 }, { "epoch": 15.115089514066495, "grad_norm": 3.040384531021118, "learning_rate": 2.5348432055749134e-05, "loss": 0.3165, "step": 11820 }, { "epoch": 15.127877237851662, "grad_norm": 0.054389629513025284, "learning_rate": 2.532665505226481e-05, "loss": 0.0757, "step": 11830 }, { "epoch": 15.140664961636828, "grad_norm": 8.579995155334473, "learning_rate": 2.530487804878049e-05, "loss": 0.3138, "step": 11840 }, { "epoch": 15.153452685421994, "grad_norm": 0.020280305296182632, "learning_rate": 2.528310104529617e-05, "loss": 0.1894, "step": 11850 }, { "epoch": 15.16624040920716, "grad_norm": 6.280925750732422, "learning_rate": 2.5261324041811847e-05, "loss": 0.4913, "step": 11860 }, { "epoch": 15.179028132992327, "grad_norm": 3.4848694801330566, "learning_rate": 2.5239547038327526e-05, "loss": 0.2397, "step": 11870 }, { "epoch": 15.191815856777493, "grad_norm": 5.22654390335083, "learning_rate": 2.5217770034843207e-05, "loss": 0.3758, "step": 11880 }, { "epoch": 15.20460358056266, "grad_norm": 3.6274683475494385, "learning_rate": 2.519599303135889e-05, "loss": 0.2949, "step": 11890 }, { "epoch": 15.217391304347826, "grad_norm": 0.8676568269729614, "learning_rate": 2.5174216027874564e-05, "loss": 0.1737, "step": 11900 }, { "epoch": 15.230179028132993, "grad_norm": 0.00011830198491225019, "learning_rate": 2.5152439024390246e-05, "loss": 0.0692, "step": 11910 }, { "epoch": 15.242966751918159, "grad_norm": 0.6105664372444153, "learning_rate": 2.5130662020905928e-05, "loss": 0.2105, "step": 11920 }, { "epoch": 15.255754475703325, "grad_norm": 2.1029629707336426, "learning_rate": 2.5108885017421603e-05, "loss": 0.3843, "step": 11930 }, { "epoch": 15.268542199488492, "grad_norm": 0.1929955929517746, "learning_rate": 2.5087108013937284e-05, "loss": 0.2277, "step": 11940 }, { "epoch": 15.281329923273658, "grad_norm": 0.02147739939391613, "learning_rate": 2.5065331010452963e-05, "loss": 0.1369, "step": 11950 }, { "epoch": 15.294117647058824, "grad_norm": 3.043717861175537, "learning_rate": 2.5043554006968644e-05, "loss": 0.4935, "step": 11960 }, { "epoch": 15.30690537084399, "grad_norm": 0.0011144510935992002, "learning_rate": 2.502177700348432e-05, "loss": 0.3493, "step": 11970 }, { "epoch": 15.319693094629155, "grad_norm": 5.858348846435547, "learning_rate": 2.5e-05, "loss": 0.2531, "step": 11980 }, { "epoch": 15.332480818414322, "grad_norm": 0.5781923532485962, "learning_rate": 2.497822299651568e-05, "loss": 0.2783, "step": 11990 }, { "epoch": 15.345268542199488, "grad_norm": 0.5104002356529236, "learning_rate": 2.495644599303136e-05, "loss": 0.5572, "step": 12000 }, { "epoch": 15.358056265984654, "grad_norm": 0.00011129306221846491, "learning_rate": 2.493466898954704e-05, "loss": 0.2961, "step": 12010 }, { "epoch": 15.37084398976982, "grad_norm": 14.113851547241211, "learning_rate": 2.4912891986062718e-05, "loss": 0.4369, "step": 12020 }, { "epoch": 15.383631713554987, "grad_norm": 1.8240095376968384, "learning_rate": 2.4891114982578396e-05, "loss": 0.2643, "step": 12030 }, { "epoch": 15.396419437340153, "grad_norm": 0.23419003188610077, "learning_rate": 2.4869337979094078e-05, "loss": 0.2411, "step": 12040 }, { "epoch": 15.40920716112532, "grad_norm": 10.807489395141602, "learning_rate": 2.4847560975609756e-05, "loss": 0.1637, "step": 12050 }, { "epoch": 15.421994884910486, "grad_norm": 18.70671844482422, "learning_rate": 2.4825783972125435e-05, "loss": 0.2905, "step": 12060 }, { "epoch": 15.434782608695652, "grad_norm": 0.11549190431833267, "learning_rate": 2.4804006968641117e-05, "loss": 0.3249, "step": 12070 }, { "epoch": 15.447570332480819, "grad_norm": 0.6824242472648621, "learning_rate": 2.4782229965156795e-05, "loss": 0.3821, "step": 12080 }, { "epoch": 15.460358056265985, "grad_norm": 2.258577823638916, "learning_rate": 2.4760452961672477e-05, "loss": 0.2022, "step": 12090 }, { "epoch": 15.473145780051151, "grad_norm": 4.128283500671387, "learning_rate": 2.4738675958188155e-05, "loss": 0.4868, "step": 12100 }, { "epoch": 15.485933503836318, "grad_norm": 2.9728426933288574, "learning_rate": 2.4716898954703833e-05, "loss": 0.2863, "step": 12110 }, { "epoch": 15.498721227621484, "grad_norm": 2.3259143829345703, "learning_rate": 2.4695121951219512e-05, "loss": 0.4513, "step": 12120 }, { "epoch": 15.51150895140665, "grad_norm": 0.002854045946151018, "learning_rate": 2.4673344947735194e-05, "loss": 0.3729, "step": 12130 }, { "epoch": 15.524296675191817, "grad_norm": 6.9137091636657715, "learning_rate": 2.4651567944250872e-05, "loss": 0.4284, "step": 12140 }, { "epoch": 15.537084398976981, "grad_norm": 0.3184433579444885, "learning_rate": 2.462979094076655e-05, "loss": 0.347, "step": 12150 }, { "epoch": 15.549872122762148, "grad_norm": 0.01655314862728119, "learning_rate": 2.4608013937282232e-05, "loss": 0.2761, "step": 12160 }, { "epoch": 15.562659846547314, "grad_norm": 0.7789549231529236, "learning_rate": 2.458623693379791e-05, "loss": 0.2362, "step": 12170 }, { "epoch": 15.57544757033248, "grad_norm": 0.0008993714000098407, "learning_rate": 2.4564459930313592e-05, "loss": 0.1348, "step": 12180 }, { "epoch": 15.588235294117647, "grad_norm": 6.437202453613281, "learning_rate": 2.454268292682927e-05, "loss": 0.3716, "step": 12190 }, { "epoch": 15.601023017902813, "grad_norm": 2.607696294784546, "learning_rate": 2.452090592334495e-05, "loss": 0.2855, "step": 12200 }, { "epoch": 15.61381074168798, "grad_norm": 3.7107765674591064, "learning_rate": 2.4499128919860627e-05, "loss": 0.1729, "step": 12210 }, { "epoch": 15.626598465473146, "grad_norm": 10.343840599060059, "learning_rate": 2.4477351916376306e-05, "loss": 0.2955, "step": 12220 }, { "epoch": 15.639386189258312, "grad_norm": 2.9948103427886963, "learning_rate": 2.4455574912891987e-05, "loss": 0.1801, "step": 12230 }, { "epoch": 15.652173913043478, "grad_norm": 7.791348457336426, "learning_rate": 2.4433797909407666e-05, "loss": 0.1949, "step": 12240 }, { "epoch": 15.664961636828645, "grad_norm": 4.23246955871582, "learning_rate": 2.4412020905923347e-05, "loss": 0.4701, "step": 12250 }, { "epoch": 15.677749360613811, "grad_norm": 0.00041086444980464876, "learning_rate": 2.4390243902439026e-05, "loss": 0.2601, "step": 12260 }, { "epoch": 15.690537084398978, "grad_norm": 0.3954007625579834, "learning_rate": 2.4368466898954707e-05, "loss": 0.4062, "step": 12270 }, { "epoch": 15.703324808184144, "grad_norm": 0.017659878358244896, "learning_rate": 2.4346689895470386e-05, "loss": 0.283, "step": 12280 }, { "epoch": 15.71611253196931, "grad_norm": 2.2281081676483154, "learning_rate": 2.4324912891986064e-05, "loss": 0.1874, "step": 12290 }, { "epoch": 15.728900255754475, "grad_norm": 0.07254109531641006, "learning_rate": 2.4303135888501743e-05, "loss": 0.1974, "step": 12300 }, { "epoch": 15.741687979539641, "grad_norm": 5.276591777801514, "learning_rate": 2.428135888501742e-05, "loss": 0.3396, "step": 12310 }, { "epoch": 15.754475703324808, "grad_norm": 3.9964542388916016, "learning_rate": 2.4259581881533103e-05, "loss": 0.166, "step": 12320 }, { "epoch": 15.767263427109974, "grad_norm": 10.958535194396973, "learning_rate": 2.423780487804878e-05, "loss": 0.3589, "step": 12330 }, { "epoch": 15.78005115089514, "grad_norm": 1.8940563201904297, "learning_rate": 2.4216027874564463e-05, "loss": 0.5555, "step": 12340 }, { "epoch": 15.792838874680307, "grad_norm": 7.3639421463012695, "learning_rate": 2.419425087108014e-05, "loss": 0.5132, "step": 12350 }, { "epoch": 15.805626598465473, "grad_norm": 1.4476559162139893, "learning_rate": 2.4172473867595823e-05, "loss": 0.1616, "step": 12360 }, { "epoch": 15.81841432225064, "grad_norm": 0.613338828086853, "learning_rate": 2.4150696864111498e-05, "loss": 0.1627, "step": 12370 }, { "epoch": 15.831202046035806, "grad_norm": 1.008769154548645, "learning_rate": 2.4128919860627176e-05, "loss": 0.4227, "step": 12380 }, { "epoch": 15.843989769820972, "grad_norm": 0.0030397542286664248, "learning_rate": 2.4107142857142858e-05, "loss": 0.4402, "step": 12390 }, { "epoch": 15.856777493606138, "grad_norm": 22.37423324584961, "learning_rate": 2.4085365853658536e-05, "loss": 0.4533, "step": 12400 }, { "epoch": 15.869565217391305, "grad_norm": 5.366839408874512, "learning_rate": 2.4063588850174218e-05, "loss": 0.2505, "step": 12410 }, { "epoch": 15.882352941176471, "grad_norm": 0.02170138992369175, "learning_rate": 2.4041811846689896e-05, "loss": 0.3758, "step": 12420 }, { "epoch": 15.895140664961637, "grad_norm": 5.612401962280273, "learning_rate": 2.4020034843205578e-05, "loss": 0.2679, "step": 12430 }, { "epoch": 15.907928388746804, "grad_norm": 1.2804334163665771, "learning_rate": 2.3998257839721257e-05, "loss": 0.1707, "step": 12440 }, { "epoch": 15.92071611253197, "grad_norm": 0.618193507194519, "learning_rate": 2.3976480836236935e-05, "loss": 0.4114, "step": 12450 }, { "epoch": 15.933503836317136, "grad_norm": 0.7701372504234314, "learning_rate": 2.3954703832752613e-05, "loss": 0.157, "step": 12460 }, { "epoch": 15.946291560102301, "grad_norm": 4.087001323699951, "learning_rate": 2.393292682926829e-05, "loss": 0.3672, "step": 12470 }, { "epoch": 15.959079283887467, "grad_norm": 1.3045196533203125, "learning_rate": 2.3911149825783973e-05, "loss": 0.3114, "step": 12480 }, { "epoch": 15.971867007672634, "grad_norm": 0.04855654016137123, "learning_rate": 2.388937282229965e-05, "loss": 0.4491, "step": 12490 }, { "epoch": 15.9846547314578, "grad_norm": 3.1147804260253906, "learning_rate": 2.3867595818815333e-05, "loss": 0.274, "step": 12500 }, { "epoch": 15.997442455242966, "grad_norm": 0.048398375511169434, "learning_rate": 2.3845818815331012e-05, "loss": 0.4065, "step": 12510 }, { "epoch": 16.0, "eval_loss": 0.1960146725177765, "eval_runtime": 0.9892, "eval_samples_per_second": 99.075, "eval_steps_per_second": 13.143, "step": 12512 }, { "epoch": 16.010230179028135, "grad_norm": 4.201949596405029, "learning_rate": 2.3824041811846694e-05, "loss": 0.2956, "step": 12520 }, { "epoch": 16.0230179028133, "grad_norm": 0.5891295671463013, "learning_rate": 2.3802264808362372e-05, "loss": 0.3316, "step": 12530 }, { "epoch": 16.035805626598467, "grad_norm": 7.009029388427734, "learning_rate": 2.378048780487805e-05, "loss": 0.3448, "step": 12540 }, { "epoch": 16.04859335038363, "grad_norm": 2.449805498123169, "learning_rate": 2.375871080139373e-05, "loss": 0.1357, "step": 12550 }, { "epoch": 16.061381074168796, "grad_norm": 1.8380799293518066, "learning_rate": 2.3736933797909407e-05, "loss": 0.2085, "step": 12560 }, { "epoch": 16.074168797953963, "grad_norm": 5.865932941436768, "learning_rate": 2.371515679442509e-05, "loss": 0.3385, "step": 12570 }, { "epoch": 16.08695652173913, "grad_norm": 1.848507285118103, "learning_rate": 2.3693379790940767e-05, "loss": 0.2499, "step": 12580 }, { "epoch": 16.099744245524295, "grad_norm": 1.9168906211853027, "learning_rate": 2.367160278745645e-05, "loss": 0.241, "step": 12590 }, { "epoch": 16.11253196930946, "grad_norm": 0.5782299041748047, "learning_rate": 2.3649825783972127e-05, "loss": 0.2668, "step": 12600 }, { "epoch": 16.125319693094628, "grad_norm": 0.9813456535339355, "learning_rate": 2.3628048780487806e-05, "loss": 0.1909, "step": 12610 }, { "epoch": 16.138107416879794, "grad_norm": 0.9238201975822449, "learning_rate": 2.3606271777003487e-05, "loss": 0.3034, "step": 12620 }, { "epoch": 16.15089514066496, "grad_norm": 0.00906333327293396, "learning_rate": 2.3584494773519166e-05, "loss": 0.5375, "step": 12630 }, { "epoch": 16.163682864450127, "grad_norm": 1.1421220302581787, "learning_rate": 2.3562717770034844e-05, "loss": 0.2129, "step": 12640 }, { "epoch": 16.176470588235293, "grad_norm": 0.0030761337839066982, "learning_rate": 2.3540940766550522e-05, "loss": 0.2951, "step": 12650 }, { "epoch": 16.18925831202046, "grad_norm": 0.29623809456825256, "learning_rate": 2.3519163763066204e-05, "loss": 0.3342, "step": 12660 }, { "epoch": 16.202046035805626, "grad_norm": 0.129877507686615, "learning_rate": 2.3497386759581882e-05, "loss": 0.2952, "step": 12670 }, { "epoch": 16.214833759590793, "grad_norm": 0.6856746077537537, "learning_rate": 2.347560975609756e-05, "loss": 0.3252, "step": 12680 }, { "epoch": 16.22762148337596, "grad_norm": 3.098801374435425, "learning_rate": 2.3453832752613243e-05, "loss": 0.1322, "step": 12690 }, { "epoch": 16.240409207161125, "grad_norm": 0.560373067855835, "learning_rate": 2.343205574912892e-05, "loss": 0.2272, "step": 12700 }, { "epoch": 16.25319693094629, "grad_norm": 0.7054154276847839, "learning_rate": 2.3410278745644603e-05, "loss": 0.4074, "step": 12710 }, { "epoch": 16.265984654731458, "grad_norm": 5.791378498077393, "learning_rate": 2.3388501742160278e-05, "loss": 0.4496, "step": 12720 }, { "epoch": 16.278772378516624, "grad_norm": 0.18869848549365997, "learning_rate": 2.336672473867596e-05, "loss": 0.1884, "step": 12730 }, { "epoch": 16.29156010230179, "grad_norm": 5.4613776206970215, "learning_rate": 2.3344947735191638e-05, "loss": 0.4337, "step": 12740 }, { "epoch": 16.304347826086957, "grad_norm": 6.673255920410156, "learning_rate": 2.332317073170732e-05, "loss": 0.3041, "step": 12750 }, { "epoch": 16.317135549872123, "grad_norm": 15.871744155883789, "learning_rate": 2.3301393728222998e-05, "loss": 0.4633, "step": 12760 }, { "epoch": 16.32992327365729, "grad_norm": 0.38828790187835693, "learning_rate": 2.3279616724738676e-05, "loss": 0.2658, "step": 12770 }, { "epoch": 16.342710997442456, "grad_norm": 1.8368539810180664, "learning_rate": 2.3257839721254358e-05, "loss": 0.0951, "step": 12780 }, { "epoch": 16.355498721227622, "grad_norm": 6.898545265197754, "learning_rate": 2.3236062717770036e-05, "loss": 0.3854, "step": 12790 }, { "epoch": 16.36828644501279, "grad_norm": 0.2560237646102905, "learning_rate": 2.3214285714285715e-05, "loss": 0.3077, "step": 12800 }, { "epoch": 16.381074168797955, "grad_norm": 0.23725102841854095, "learning_rate": 2.3192508710801393e-05, "loss": 0.2606, "step": 12810 }, { "epoch": 16.39386189258312, "grad_norm": 0.25493350625038147, "learning_rate": 2.3170731707317075e-05, "loss": 0.2807, "step": 12820 }, { "epoch": 16.406649616368288, "grad_norm": 2.778068780899048, "learning_rate": 2.3148954703832753e-05, "loss": 0.2054, "step": 12830 }, { "epoch": 16.419437340153454, "grad_norm": 0.4015319049358368, "learning_rate": 2.312717770034843e-05, "loss": 0.5631, "step": 12840 }, { "epoch": 16.43222506393862, "grad_norm": 8.717150688171387, "learning_rate": 2.3105400696864113e-05, "loss": 0.3589, "step": 12850 }, { "epoch": 16.445012787723787, "grad_norm": 0.0001172411284642294, "learning_rate": 2.308362369337979e-05, "loss": 0.2292, "step": 12860 }, { "epoch": 16.45780051150895, "grad_norm": 4.406980514526367, "learning_rate": 2.3061846689895473e-05, "loss": 0.18, "step": 12870 }, { "epoch": 16.470588235294116, "grad_norm": 2.2067298889160156, "learning_rate": 2.3040069686411152e-05, "loss": 0.4195, "step": 12880 }, { "epoch": 16.483375959079282, "grad_norm": 0.9050547480583191, "learning_rate": 2.301829268292683e-05, "loss": 0.3378, "step": 12890 }, { "epoch": 16.49616368286445, "grad_norm": 2.681492567062378, "learning_rate": 2.299651567944251e-05, "loss": 0.2948, "step": 12900 }, { "epoch": 16.508951406649615, "grad_norm": 6.462623596191406, "learning_rate": 2.297473867595819e-05, "loss": 0.4256, "step": 12910 }, { "epoch": 16.52173913043478, "grad_norm": 3.2028021812438965, "learning_rate": 2.295296167247387e-05, "loss": 0.2487, "step": 12920 }, { "epoch": 16.534526854219948, "grad_norm": 0.28855952620506287, "learning_rate": 2.2931184668989547e-05, "loss": 0.4793, "step": 12930 }, { "epoch": 16.547314578005114, "grad_norm": 0.00012032359518343583, "learning_rate": 2.290940766550523e-05, "loss": 0.355, "step": 12940 }, { "epoch": 16.56010230179028, "grad_norm": 0.13559375703334808, "learning_rate": 2.2887630662020907e-05, "loss": 0.2055, "step": 12950 }, { "epoch": 16.572890025575447, "grad_norm": 6.483852386474609, "learning_rate": 2.286585365853659e-05, "loss": 0.2742, "step": 12960 }, { "epoch": 16.585677749360613, "grad_norm": 4.124710559844971, "learning_rate": 2.2844076655052267e-05, "loss": 0.3295, "step": 12970 }, { "epoch": 16.59846547314578, "grad_norm": 1.22804856300354, "learning_rate": 2.2822299651567945e-05, "loss": 0.2733, "step": 12980 }, { "epoch": 16.611253196930946, "grad_norm": 0.1256943941116333, "learning_rate": 2.2800522648083624e-05, "loss": 0.2037, "step": 12990 }, { "epoch": 16.624040920716112, "grad_norm": 0.0005564725724980235, "learning_rate": 2.2778745644599302e-05, "loss": 0.2921, "step": 13000 }, { "epoch": 16.63682864450128, "grad_norm": 0.0033420585095882416, "learning_rate": 2.2756968641114984e-05, "loss": 0.3267, "step": 13010 }, { "epoch": 16.649616368286445, "grad_norm": 1.830318808555603, "learning_rate": 2.2735191637630662e-05, "loss": 0.3211, "step": 13020 }, { "epoch": 16.66240409207161, "grad_norm": 3.8310177326202393, "learning_rate": 2.2713414634146344e-05, "loss": 0.1711, "step": 13030 }, { "epoch": 16.675191815856778, "grad_norm": 1.3652631044387817, "learning_rate": 2.2691637630662022e-05, "loss": 0.3631, "step": 13040 }, { "epoch": 16.687979539641944, "grad_norm": 0.04906712844967842, "learning_rate": 2.2669860627177704e-05, "loss": 0.0488, "step": 13050 }, { "epoch": 16.70076726342711, "grad_norm": 7.600613594055176, "learning_rate": 2.264808362369338e-05, "loss": 0.4641, "step": 13060 }, { "epoch": 16.713554987212277, "grad_norm": 0.00042603735346347094, "learning_rate": 2.2626306620209057e-05, "loss": 0.2161, "step": 13070 }, { "epoch": 16.726342710997443, "grad_norm": 0.19117648899555206, "learning_rate": 2.260452961672474e-05, "loss": 0.2927, "step": 13080 }, { "epoch": 16.73913043478261, "grad_norm": 2.7685117721557617, "learning_rate": 2.2582752613240418e-05, "loss": 0.1958, "step": 13090 }, { "epoch": 16.751918158567776, "grad_norm": 2.0074069499969482, "learning_rate": 2.25609756097561e-05, "loss": 0.3821, "step": 13100 }, { "epoch": 16.764705882352942, "grad_norm": 0.046160824596881866, "learning_rate": 2.2539198606271778e-05, "loss": 0.1671, "step": 13110 }, { "epoch": 16.77749360613811, "grad_norm": 0.10031644254922867, "learning_rate": 2.251742160278746e-05, "loss": 0.0973, "step": 13120 }, { "epoch": 16.790281329923275, "grad_norm": 2.997203826904297, "learning_rate": 2.2495644599303138e-05, "loss": 0.3202, "step": 13130 }, { "epoch": 16.80306905370844, "grad_norm": 0.17301268875598907, "learning_rate": 2.2473867595818816e-05, "loss": 0.2823, "step": 13140 }, { "epoch": 16.815856777493607, "grad_norm": 0.2966526448726654, "learning_rate": 2.2452090592334494e-05, "loss": 0.2609, "step": 13150 }, { "epoch": 16.828644501278774, "grad_norm": 3.0557382106781006, "learning_rate": 2.2430313588850173e-05, "loss": 0.3246, "step": 13160 }, { "epoch": 16.84143222506394, "grad_norm": 1.249798059463501, "learning_rate": 2.2408536585365855e-05, "loss": 0.3694, "step": 13170 }, { "epoch": 16.854219948849106, "grad_norm": 0.13164706528186798, "learning_rate": 2.2386759581881533e-05, "loss": 0.2964, "step": 13180 }, { "epoch": 16.867007672634273, "grad_norm": 0.051667943596839905, "learning_rate": 2.2364982578397215e-05, "loss": 0.1921, "step": 13190 }, { "epoch": 16.87979539641944, "grad_norm": 0.7596039772033691, "learning_rate": 2.2343205574912893e-05, "loss": 0.3181, "step": 13200 }, { "epoch": 16.892583120204602, "grad_norm": 11.275361061096191, "learning_rate": 2.2321428571428575e-05, "loss": 0.3087, "step": 13210 }, { "epoch": 16.90537084398977, "grad_norm": 1.354041337966919, "learning_rate": 2.2299651567944253e-05, "loss": 0.073, "step": 13220 }, { "epoch": 16.918158567774935, "grad_norm": 2.936005115509033, "learning_rate": 2.227787456445993e-05, "loss": 0.4707, "step": 13230 }, { "epoch": 16.9309462915601, "grad_norm": 7.00629186630249, "learning_rate": 2.225609756097561e-05, "loss": 0.3539, "step": 13240 }, { "epoch": 16.943734015345267, "grad_norm": 0.3057442307472229, "learning_rate": 2.2234320557491288e-05, "loss": 0.2228, "step": 13250 }, { "epoch": 16.956521739130434, "grad_norm": 1.9461525678634644, "learning_rate": 2.221254355400697e-05, "loss": 0.2704, "step": 13260 }, { "epoch": 16.9693094629156, "grad_norm": 0.01109783910214901, "learning_rate": 2.219076655052265e-05, "loss": 0.1429, "step": 13270 }, { "epoch": 16.982097186700766, "grad_norm": 0.3377877175807953, "learning_rate": 2.216898954703833e-05, "loss": 0.206, "step": 13280 }, { "epoch": 16.994884910485933, "grad_norm": 1.623797059059143, "learning_rate": 2.214721254355401e-05, "loss": 0.3466, "step": 13290 }, { "epoch": 17.0, "eval_loss": 0.194735586643219, "eval_runtime": 0.984, "eval_samples_per_second": 99.592, "eval_steps_per_second": 13.211, "step": 13294 }, { "epoch": 17.0076726342711, "grad_norm": 0.4254322350025177, "learning_rate": 2.2125435540069687e-05, "loss": 0.3234, "step": 13300 }, { "epoch": 17.020460358056265, "grad_norm": 0.6410069465637207, "learning_rate": 2.210365853658537e-05, "loss": 0.4415, "step": 13310 }, { "epoch": 17.033248081841432, "grad_norm": 7.966799736022949, "learning_rate": 2.2081881533101047e-05, "loss": 0.1735, "step": 13320 }, { "epoch": 17.046035805626598, "grad_norm": 1.0562230348587036, "learning_rate": 2.2060104529616725e-05, "loss": 0.217, "step": 13330 }, { "epoch": 17.058823529411764, "grad_norm": 3.787612199783325, "learning_rate": 2.2038327526132404e-05, "loss": 0.2052, "step": 13340 }, { "epoch": 17.07161125319693, "grad_norm": 4.378226280212402, "learning_rate": 2.2016550522648085e-05, "loss": 0.1748, "step": 13350 }, { "epoch": 17.084398976982097, "grad_norm": 2.6178719997406006, "learning_rate": 2.1994773519163764e-05, "loss": 0.3225, "step": 13360 }, { "epoch": 17.097186700767264, "grad_norm": 4.570335865020752, "learning_rate": 2.1972996515679445e-05, "loss": 0.342, "step": 13370 }, { "epoch": 17.10997442455243, "grad_norm": 2.0175881385803223, "learning_rate": 2.1951219512195124e-05, "loss": 0.2828, "step": 13380 }, { "epoch": 17.122762148337596, "grad_norm": 4.092813014984131, "learning_rate": 2.1929442508710802e-05, "loss": 0.4931, "step": 13390 }, { "epoch": 17.135549872122763, "grad_norm": 2.1676254272460938, "learning_rate": 2.1907665505226484e-05, "loss": 0.0911, "step": 13400 }, { "epoch": 17.14833759590793, "grad_norm": 2.225700855255127, "learning_rate": 2.188588850174216e-05, "loss": 0.3082, "step": 13410 }, { "epoch": 17.161125319693095, "grad_norm": 1.6372703313827515, "learning_rate": 2.186411149825784e-05, "loss": 0.1567, "step": 13420 }, { "epoch": 17.17391304347826, "grad_norm": 0.0013261646963655949, "learning_rate": 2.184233449477352e-05, "loss": 0.0525, "step": 13430 }, { "epoch": 17.186700767263428, "grad_norm": 7.990118980407715, "learning_rate": 2.18205574912892e-05, "loss": 0.4464, "step": 13440 }, { "epoch": 17.199488491048594, "grad_norm": 3.268831968307495, "learning_rate": 2.179878048780488e-05, "loss": 0.1397, "step": 13450 }, { "epoch": 17.21227621483376, "grad_norm": 0.16767235100269318, "learning_rate": 2.1777003484320557e-05, "loss": 0.2333, "step": 13460 }, { "epoch": 17.225063938618927, "grad_norm": 3.0834221839904785, "learning_rate": 2.175522648083624e-05, "loss": 0.2366, "step": 13470 }, { "epoch": 17.237851662404093, "grad_norm": 13.640128135681152, "learning_rate": 2.1733449477351918e-05, "loss": 0.163, "step": 13480 }, { "epoch": 17.25063938618926, "grad_norm": 0.00588564807549119, "learning_rate": 2.1711672473867596e-05, "loss": 0.1748, "step": 13490 }, { "epoch": 17.263427109974426, "grad_norm": 6.2895660400390625, "learning_rate": 2.1689895470383274e-05, "loss": 0.4067, "step": 13500 }, { "epoch": 17.276214833759592, "grad_norm": 3.481398105621338, "learning_rate": 2.1668118466898956e-05, "loss": 0.538, "step": 13510 }, { "epoch": 17.289002557544755, "grad_norm": 0.00020840627257712185, "learning_rate": 2.1646341463414634e-05, "loss": 0.2197, "step": 13520 }, { "epoch": 17.30179028132992, "grad_norm": 12.076574325561523, "learning_rate": 2.1624564459930316e-05, "loss": 0.3942, "step": 13530 }, { "epoch": 17.314578005115088, "grad_norm": 8.081069946289062, "learning_rate": 2.1602787456445995e-05, "loss": 0.3668, "step": 13540 }, { "epoch": 17.327365728900254, "grad_norm": 4.490804195404053, "learning_rate": 2.1581010452961673e-05, "loss": 0.1649, "step": 13550 }, { "epoch": 17.34015345268542, "grad_norm": 2.62864351272583, "learning_rate": 2.1559233449477355e-05, "loss": 0.2097, "step": 13560 }, { "epoch": 17.352941176470587, "grad_norm": 0.35655027627944946, "learning_rate": 2.1537456445993033e-05, "loss": 0.2966, "step": 13570 }, { "epoch": 17.365728900255753, "grad_norm": 0.05890129879117012, "learning_rate": 2.151567944250871e-05, "loss": 0.1949, "step": 13580 }, { "epoch": 17.37851662404092, "grad_norm": 0.0837198868393898, "learning_rate": 2.149390243902439e-05, "loss": 0.2269, "step": 13590 }, { "epoch": 17.391304347826086, "grad_norm": 10.314323425292969, "learning_rate": 2.147212543554007e-05, "loss": 0.4396, "step": 13600 }, { "epoch": 17.404092071611252, "grad_norm": 3.623154878616333, "learning_rate": 2.145034843205575e-05, "loss": 0.1614, "step": 13610 }, { "epoch": 17.41687979539642, "grad_norm": 4.475399971008301, "learning_rate": 2.1428571428571428e-05, "loss": 0.1741, "step": 13620 }, { "epoch": 17.429667519181585, "grad_norm": 0.008211358450353146, "learning_rate": 2.140679442508711e-05, "loss": 0.0813, "step": 13630 }, { "epoch": 17.44245524296675, "grad_norm": 8.496784210205078, "learning_rate": 2.1385017421602788e-05, "loss": 0.3875, "step": 13640 }, { "epoch": 17.455242966751918, "grad_norm": 5.246673583984375, "learning_rate": 2.136324041811847e-05, "loss": 0.3415, "step": 13650 }, { "epoch": 17.468030690537084, "grad_norm": 0.3365817368030548, "learning_rate": 2.134146341463415e-05, "loss": 0.1968, "step": 13660 }, { "epoch": 17.48081841432225, "grad_norm": 0.7856032848358154, "learning_rate": 2.1319686411149827e-05, "loss": 0.3339, "step": 13670 }, { "epoch": 17.493606138107417, "grad_norm": 6.534681797027588, "learning_rate": 2.1297909407665505e-05, "loss": 0.3993, "step": 13680 }, { "epoch": 17.506393861892583, "grad_norm": 1.3871843814849854, "learning_rate": 2.1276132404181183e-05, "loss": 0.1953, "step": 13690 }, { "epoch": 17.51918158567775, "grad_norm": 3.838477849960327, "learning_rate": 2.1254355400696865e-05, "loss": 0.3942, "step": 13700 }, { "epoch": 17.531969309462916, "grad_norm": 0.18229646980762482, "learning_rate": 2.1232578397212544e-05, "loss": 0.1537, "step": 13710 }, { "epoch": 17.544757033248082, "grad_norm": 3.601231575012207, "learning_rate": 2.1210801393728225e-05, "loss": 0.2614, "step": 13720 }, { "epoch": 17.55754475703325, "grad_norm": 0.9129027724266052, "learning_rate": 2.1189024390243904e-05, "loss": 0.6262, "step": 13730 }, { "epoch": 17.570332480818415, "grad_norm": 1.753222942352295, "learning_rate": 2.1167247386759585e-05, "loss": 0.231, "step": 13740 }, { "epoch": 17.58312020460358, "grad_norm": 0.5044955611228943, "learning_rate": 2.1145470383275264e-05, "loss": 0.1774, "step": 13750 }, { "epoch": 17.595907928388748, "grad_norm": 0.09469438344240189, "learning_rate": 2.1123693379790942e-05, "loss": 0.2984, "step": 13760 }, { "epoch": 17.608695652173914, "grad_norm": 5.122965335845947, "learning_rate": 2.110191637630662e-05, "loss": 0.4116, "step": 13770 }, { "epoch": 17.62148337595908, "grad_norm": 0.004069724585860968, "learning_rate": 2.10801393728223e-05, "loss": 0.1216, "step": 13780 }, { "epoch": 17.634271099744247, "grad_norm": 0.0013519871281459928, "learning_rate": 2.105836236933798e-05, "loss": 0.2065, "step": 13790 }, { "epoch": 17.647058823529413, "grad_norm": 2.187586545944214, "learning_rate": 2.103658536585366e-05, "loss": 0.2592, "step": 13800 }, { "epoch": 17.65984654731458, "grad_norm": 9.104143142700195, "learning_rate": 2.101480836236934e-05, "loss": 0.3444, "step": 13810 }, { "epoch": 17.672634271099746, "grad_norm": 5.176393985748291, "learning_rate": 2.099303135888502e-05, "loss": 0.2548, "step": 13820 }, { "epoch": 17.685421994884912, "grad_norm": 7.437559127807617, "learning_rate": 2.0971254355400697e-05, "loss": 0.2961, "step": 13830 }, { "epoch": 17.69820971867008, "grad_norm": 0.008558416739106178, "learning_rate": 2.0949477351916376e-05, "loss": 0.2467, "step": 13840 }, { "epoch": 17.710997442455245, "grad_norm": 1.2789453268051147, "learning_rate": 2.0927700348432054e-05, "loss": 0.1578, "step": 13850 }, { "epoch": 17.723785166240408, "grad_norm": 2.9721248149871826, "learning_rate": 2.0905923344947736e-05, "loss": 0.268, "step": 13860 }, { "epoch": 17.736572890025574, "grad_norm": 0.1331997960805893, "learning_rate": 2.0884146341463414e-05, "loss": 0.2809, "step": 13870 }, { "epoch": 17.74936061381074, "grad_norm": 7.518838405609131, "learning_rate": 2.0862369337979096e-05, "loss": 0.3814, "step": 13880 }, { "epoch": 17.762148337595907, "grad_norm": 2.332446813583374, "learning_rate": 2.0840592334494774e-05, "loss": 0.2716, "step": 13890 }, { "epoch": 17.774936061381073, "grad_norm": 8.556915283203125, "learning_rate": 2.0818815331010456e-05, "loss": 0.1651, "step": 13900 }, { "epoch": 17.78772378516624, "grad_norm": 1.377729058265686, "learning_rate": 2.0797038327526134e-05, "loss": 0.3073, "step": 13910 }, { "epoch": 17.800511508951406, "grad_norm": 0.5971033573150635, "learning_rate": 2.0775261324041813e-05, "loss": 0.2833, "step": 13920 }, { "epoch": 17.813299232736572, "grad_norm": 7.453729152679443, "learning_rate": 2.075348432055749e-05, "loss": 0.4556, "step": 13930 }, { "epoch": 17.82608695652174, "grad_norm": 0.14529769122600555, "learning_rate": 2.073170731707317e-05, "loss": 0.1734, "step": 13940 }, { "epoch": 17.838874680306905, "grad_norm": 11.693350791931152, "learning_rate": 2.070993031358885e-05, "loss": 0.2128, "step": 13950 }, { "epoch": 17.85166240409207, "grad_norm": 0.5937496423721313, "learning_rate": 2.068815331010453e-05, "loss": 0.2481, "step": 13960 }, { "epoch": 17.864450127877237, "grad_norm": 5.276167869567871, "learning_rate": 2.066637630662021e-05, "loss": 0.2194, "step": 13970 }, { "epoch": 17.877237851662404, "grad_norm": 0.024612775072455406, "learning_rate": 2.064459930313589e-05, "loss": 0.2629, "step": 13980 }, { "epoch": 17.89002557544757, "grad_norm": 0.25338679552078247, "learning_rate": 2.062282229965157e-05, "loss": 0.2496, "step": 13990 }, { "epoch": 17.902813299232736, "grad_norm": 9.272095680236816, "learning_rate": 2.060104529616725e-05, "loss": 0.3914, "step": 14000 }, { "epoch": 17.915601023017903, "grad_norm": 2.9164888858795166, "learning_rate": 2.0579268292682928e-05, "loss": 0.4612, "step": 14010 }, { "epoch": 17.92838874680307, "grad_norm": 5.0970282554626465, "learning_rate": 2.0557491289198607e-05, "loss": 0.534, "step": 14020 }, { "epoch": 17.941176470588236, "grad_norm": 0.003662322647869587, "learning_rate": 2.0535714285714285e-05, "loss": 0.3885, "step": 14030 }, { "epoch": 17.953964194373402, "grad_norm": 3.319575071334839, "learning_rate": 2.0513937282229967e-05, "loss": 0.3294, "step": 14040 }, { "epoch": 17.966751918158568, "grad_norm": 0.11396234482526779, "learning_rate": 2.0492160278745645e-05, "loss": 0.3142, "step": 14050 }, { "epoch": 17.979539641943735, "grad_norm": 1.887491226196289, "learning_rate": 2.0470383275261327e-05, "loss": 0.2062, "step": 14060 }, { "epoch": 17.9923273657289, "grad_norm": 0.5869520902633667, "learning_rate": 2.0448606271777005e-05, "loss": 0.3147, "step": 14070 }, { "epoch": 18.0, "eval_loss": 0.1915951371192932, "eval_runtime": 0.9799, "eval_samples_per_second": 100.008, "eval_steps_per_second": 13.266, "step": 14076 }, { "epoch": 18.005115089514067, "grad_norm": 0.9173471331596375, "learning_rate": 2.0426829268292683e-05, "loss": 0.2309, "step": 14080 }, { "epoch": 18.017902813299234, "grad_norm": 10.534737586975098, "learning_rate": 2.0405052264808365e-05, "loss": 0.2463, "step": 14090 }, { "epoch": 18.0306905370844, "grad_norm": 0.6548416614532471, "learning_rate": 2.038327526132404e-05, "loss": 0.2417, "step": 14100 }, { "epoch": 18.043478260869566, "grad_norm": 1.446814775466919, "learning_rate": 2.0361498257839722e-05, "loss": 0.1992, "step": 14110 }, { "epoch": 18.056265984654733, "grad_norm": 5.1453633308410645, "learning_rate": 2.03397212543554e-05, "loss": 0.187, "step": 14120 }, { "epoch": 18.0690537084399, "grad_norm": 0.3860657215118408, "learning_rate": 2.0317944250871082e-05, "loss": 0.2015, "step": 14130 }, { "epoch": 18.081841432225065, "grad_norm": 7.620376110076904, "learning_rate": 2.029616724738676e-05, "loss": 0.3419, "step": 14140 }, { "epoch": 18.09462915601023, "grad_norm": 3.2555360794067383, "learning_rate": 2.0274390243902442e-05, "loss": 0.2185, "step": 14150 }, { "epoch": 18.107416879795398, "grad_norm": 6.394047260284424, "learning_rate": 2.025261324041812e-05, "loss": 0.2663, "step": 14160 }, { "epoch": 18.120204603580564, "grad_norm": 1.8313113451004028, "learning_rate": 2.02308362369338e-05, "loss": 0.0787, "step": 14170 }, { "epoch": 18.132992327365727, "grad_norm": 2.6229982376098633, "learning_rate": 2.0209059233449477e-05, "loss": 0.219, "step": 14180 }, { "epoch": 18.145780051150894, "grad_norm": 0.548910915851593, "learning_rate": 2.0187282229965156e-05, "loss": 0.1215, "step": 14190 }, { "epoch": 18.15856777493606, "grad_norm": 0.660483717918396, "learning_rate": 2.0165505226480837e-05, "loss": 0.3008, "step": 14200 }, { "epoch": 18.171355498721226, "grad_norm": 2.018397092819214, "learning_rate": 2.0143728222996516e-05, "loss": 0.2721, "step": 14210 }, { "epoch": 18.184143222506393, "grad_norm": 0.0916806161403656, "learning_rate": 2.0121951219512197e-05, "loss": 0.4191, "step": 14220 }, { "epoch": 18.19693094629156, "grad_norm": 0.08776136487722397, "learning_rate": 2.0100174216027876e-05, "loss": 0.2072, "step": 14230 }, { "epoch": 18.209718670076725, "grad_norm": 5.849903583526611, "learning_rate": 2.0078397212543554e-05, "loss": 0.4458, "step": 14240 }, { "epoch": 18.22250639386189, "grad_norm": 4.089534759521484, "learning_rate": 2.0056620209059236e-05, "loss": 0.1859, "step": 14250 }, { "epoch": 18.235294117647058, "grad_norm": 11.935927391052246, "learning_rate": 2.0034843205574914e-05, "loss": 0.4495, "step": 14260 }, { "epoch": 18.248081841432224, "grad_norm": 7.223516464233398, "learning_rate": 2.0013066202090593e-05, "loss": 0.2481, "step": 14270 }, { "epoch": 18.26086956521739, "grad_norm": 4.914455890655518, "learning_rate": 1.999128919860627e-05, "loss": 0.3013, "step": 14280 }, { "epoch": 18.273657289002557, "grad_norm": 0.18564479053020477, "learning_rate": 1.9969512195121953e-05, "loss": 0.2039, "step": 14290 }, { "epoch": 18.286445012787723, "grad_norm": 0.6266302466392517, "learning_rate": 1.994773519163763e-05, "loss": 0.3876, "step": 14300 }, { "epoch": 18.29923273657289, "grad_norm": 5.617627143859863, "learning_rate": 1.992595818815331e-05, "loss": 0.4202, "step": 14310 }, { "epoch": 18.312020460358056, "grad_norm": 7.912176609039307, "learning_rate": 1.990418118466899e-05, "loss": 0.4303, "step": 14320 }, { "epoch": 18.324808184143222, "grad_norm": 0.9023106694221497, "learning_rate": 1.988240418118467e-05, "loss": 0.1635, "step": 14330 }, { "epoch": 18.33759590792839, "grad_norm": 6.940431594848633, "learning_rate": 1.986062717770035e-05, "loss": 0.3362, "step": 14340 }, { "epoch": 18.350383631713555, "grad_norm": 0.8240020871162415, "learning_rate": 1.983885017421603e-05, "loss": 0.3445, "step": 14350 }, { "epoch": 18.36317135549872, "grad_norm": 9.887919425964355, "learning_rate": 1.9817073170731708e-05, "loss": 0.3103, "step": 14360 }, { "epoch": 18.375959079283888, "grad_norm": 0.005527616012841463, "learning_rate": 1.9795296167247386e-05, "loss": 0.645, "step": 14370 }, { "epoch": 18.388746803069054, "grad_norm": 2.5189778804779053, "learning_rate": 1.9773519163763068e-05, "loss": 0.3151, "step": 14380 }, { "epoch": 18.40153452685422, "grad_norm": 0.030117256566882133, "learning_rate": 1.9751742160278746e-05, "loss": 0.2635, "step": 14390 }, { "epoch": 18.414322250639387, "grad_norm": 0.8564471006393433, "learning_rate": 1.9729965156794425e-05, "loss": 0.3881, "step": 14400 }, { "epoch": 18.427109974424553, "grad_norm": 0.00839280616492033, "learning_rate": 1.9708188153310107e-05, "loss": 0.2388, "step": 14410 }, { "epoch": 18.43989769820972, "grad_norm": 1.13720703125, "learning_rate": 1.9686411149825785e-05, "loss": 0.4222, "step": 14420 }, { "epoch": 18.452685421994886, "grad_norm": 1.0654269456863403, "learning_rate": 1.9664634146341467e-05, "loss": 0.5875, "step": 14430 }, { "epoch": 18.465473145780052, "grad_norm": 0.056377239525318146, "learning_rate": 1.9642857142857145e-05, "loss": 0.2721, "step": 14440 }, { "epoch": 18.47826086956522, "grad_norm": 3.4963326454162598, "learning_rate": 1.9621080139372823e-05, "loss": 0.3812, "step": 14450 }, { "epoch": 18.491048593350385, "grad_norm": 2.1274666786193848, "learning_rate": 1.9599303135888502e-05, "loss": 0.2163, "step": 14460 }, { "epoch": 18.50383631713555, "grad_norm": 2.24591064453125, "learning_rate": 1.957752613240418e-05, "loss": 0.1823, "step": 14470 }, { "epoch": 18.516624040920718, "grad_norm": 0.928048849105835, "learning_rate": 1.9555749128919862e-05, "loss": 0.3446, "step": 14480 }, { "epoch": 18.529411764705884, "grad_norm": 5.6801225582603365e-05, "learning_rate": 1.953397212543554e-05, "loss": 0.3851, "step": 14490 }, { "epoch": 18.54219948849105, "grad_norm": 0.3094902038574219, "learning_rate": 1.9512195121951222e-05, "loss": 0.2043, "step": 14500 }, { "epoch": 18.554987212276213, "grad_norm": 1.9753358364105225, "learning_rate": 1.94904181184669e-05, "loss": 0.1487, "step": 14510 }, { "epoch": 18.56777493606138, "grad_norm": 8.858641624450684, "learning_rate": 1.9468641114982582e-05, "loss": 0.1643, "step": 14520 }, { "epoch": 18.580562659846546, "grad_norm": 1.9973317384719849, "learning_rate": 1.9446864111498257e-05, "loss": 0.2102, "step": 14530 }, { "epoch": 18.593350383631712, "grad_norm": 0.3067806363105774, "learning_rate": 1.9425087108013935e-05, "loss": 0.271, "step": 14540 }, { "epoch": 18.60613810741688, "grad_norm": 2.4207940101623535, "learning_rate": 1.9403310104529617e-05, "loss": 0.2622, "step": 14550 }, { "epoch": 18.618925831202045, "grad_norm": 2.6167995929718018, "learning_rate": 1.9381533101045295e-05, "loss": 0.1524, "step": 14560 }, { "epoch": 18.63171355498721, "grad_norm": 0.04882597178220749, "learning_rate": 1.9359756097560977e-05, "loss": 0.1838, "step": 14570 }, { "epoch": 18.644501278772378, "grad_norm": 0.4400959610939026, "learning_rate": 1.9337979094076656e-05, "loss": 0.1957, "step": 14580 }, { "epoch": 18.657289002557544, "grad_norm": 0.2390824854373932, "learning_rate": 1.9316202090592337e-05, "loss": 0.3095, "step": 14590 }, { "epoch": 18.67007672634271, "grad_norm": 1.1464831829071045, "learning_rate": 1.9294425087108016e-05, "loss": 0.2338, "step": 14600 }, { "epoch": 18.682864450127877, "grad_norm": 3.7461390495300293, "learning_rate": 1.9272648083623694e-05, "loss": 0.1344, "step": 14610 }, { "epoch": 18.695652173913043, "grad_norm": 6.968299865722656, "learning_rate": 1.9250871080139372e-05, "loss": 0.3831, "step": 14620 }, { "epoch": 18.70843989769821, "grad_norm": 2.9434783458709717, "learning_rate": 1.922909407665505e-05, "loss": 0.0752, "step": 14630 }, { "epoch": 18.721227621483376, "grad_norm": 2.873483419418335, "learning_rate": 1.9207317073170733e-05, "loss": 0.1213, "step": 14640 }, { "epoch": 18.734015345268542, "grad_norm": 5.209991455078125, "learning_rate": 1.918554006968641e-05, "loss": 0.5809, "step": 14650 }, { "epoch": 18.74680306905371, "grad_norm": 4.765650749206543, "learning_rate": 1.9163763066202093e-05, "loss": 0.3331, "step": 14660 }, { "epoch": 18.759590792838875, "grad_norm": 1.3975070714950562, "learning_rate": 1.914198606271777e-05, "loss": 0.3194, "step": 14670 }, { "epoch": 18.77237851662404, "grad_norm": 0.0020695773418992758, "learning_rate": 1.9120209059233453e-05, "loss": 0.2082, "step": 14680 }, { "epoch": 18.785166240409207, "grad_norm": 5.4186787605285645, "learning_rate": 1.909843205574913e-05, "loss": 0.0826, "step": 14690 }, { "epoch": 18.797953964194374, "grad_norm": 0.49431663751602173, "learning_rate": 1.907665505226481e-05, "loss": 0.2201, "step": 14700 }, { "epoch": 18.81074168797954, "grad_norm": 3.665900468826294, "learning_rate": 1.9054878048780488e-05, "loss": 0.2726, "step": 14710 }, { "epoch": 18.823529411764707, "grad_norm": 9.916034698486328, "learning_rate": 1.9033101045296166e-05, "loss": 0.3124, "step": 14720 }, { "epoch": 18.836317135549873, "grad_norm": 8.723508834838867, "learning_rate": 1.9011324041811848e-05, "loss": 0.4384, "step": 14730 }, { "epoch": 18.84910485933504, "grad_norm": 4.647964000701904, "learning_rate": 1.8989547038327526e-05, "loss": 0.4025, "step": 14740 }, { "epoch": 18.861892583120206, "grad_norm": 0.7404060363769531, "learning_rate": 1.8967770034843208e-05, "loss": 0.1928, "step": 14750 }, { "epoch": 18.874680306905372, "grad_norm": 2.1434133052825928, "learning_rate": 1.8945993031358886e-05, "loss": 0.219, "step": 14760 }, { "epoch": 18.88746803069054, "grad_norm": 4.017099857330322, "learning_rate": 1.8924216027874568e-05, "loss": 0.1563, "step": 14770 }, { "epoch": 18.900255754475705, "grad_norm": 1.0902212858200073, "learning_rate": 1.8902439024390246e-05, "loss": 0.2891, "step": 14780 }, { "epoch": 18.91304347826087, "grad_norm": 2.023188352584839, "learning_rate": 1.8880662020905925e-05, "loss": 0.0701, "step": 14790 }, { "epoch": 18.925831202046037, "grad_norm": 0.004225098993629217, "learning_rate": 1.8858885017421603e-05, "loss": 0.1588, "step": 14800 }, { "epoch": 18.938618925831204, "grad_norm": 4.455468654632568, "learning_rate": 1.883710801393728e-05, "loss": 0.1735, "step": 14810 }, { "epoch": 18.95140664961637, "grad_norm": 1.2001502513885498, "learning_rate": 1.8815331010452963e-05, "loss": 0.1153, "step": 14820 }, { "epoch": 18.964194373401533, "grad_norm": 4.822994709014893, "learning_rate": 1.879355400696864e-05, "loss": 0.3452, "step": 14830 }, { "epoch": 18.9769820971867, "grad_norm": 1.7732210159301758, "learning_rate": 1.8771777003484323e-05, "loss": 0.2798, "step": 14840 }, { "epoch": 18.989769820971865, "grad_norm": 1.7861824035644531, "learning_rate": 1.8750000000000002e-05, "loss": 0.2973, "step": 14850 }, { "epoch": 19.0, "eval_loss": 0.18999117612838745, "eval_runtime": 0.9813, "eval_samples_per_second": 99.865, "eval_steps_per_second": 13.247, "step": 14858 }, { "epoch": 19.002557544757032, "grad_norm": 0.8770705461502075, "learning_rate": 1.872822299651568e-05, "loss": 0.1288, "step": 14860 }, { "epoch": 19.015345268542198, "grad_norm": 6.593090057373047, "learning_rate": 1.870644599303136e-05, "loss": 0.2963, "step": 14870 }, { "epoch": 19.028132992327365, "grad_norm": 0.974862813949585, "learning_rate": 1.8684668989547037e-05, "loss": 0.3377, "step": 14880 }, { "epoch": 19.04092071611253, "grad_norm": 2.2275137901306152, "learning_rate": 1.866289198606272e-05, "loss": 0.2541, "step": 14890 }, { "epoch": 19.053708439897697, "grad_norm": 1.2926586866378784, "learning_rate": 1.8641114982578397e-05, "loss": 0.1452, "step": 14900 }, { "epoch": 19.066496163682864, "grad_norm": 3.027857780456543, "learning_rate": 1.861933797909408e-05, "loss": 0.155, "step": 14910 }, { "epoch": 19.07928388746803, "grad_norm": 1.4662635326385498, "learning_rate": 1.8597560975609757e-05, "loss": 0.136, "step": 14920 }, { "epoch": 19.092071611253196, "grad_norm": 1.220117211341858, "learning_rate": 1.8575783972125435e-05, "loss": 0.4804, "step": 14930 }, { "epoch": 19.104859335038363, "grad_norm": 2.4428813457489014, "learning_rate": 1.8554006968641117e-05, "loss": 0.2189, "step": 14940 }, { "epoch": 19.11764705882353, "grad_norm": 5.772051811218262, "learning_rate": 1.8532229965156795e-05, "loss": 0.2831, "step": 14950 }, { "epoch": 19.130434782608695, "grad_norm": 0.0032378036994487047, "learning_rate": 1.8510452961672474e-05, "loss": 0.3129, "step": 14960 }, { "epoch": 19.14322250639386, "grad_norm": 2.335304021835327, "learning_rate": 1.8488675958188152e-05, "loss": 0.3056, "step": 14970 }, { "epoch": 19.156010230179028, "grad_norm": 3.2122292518615723, "learning_rate": 1.8466898954703834e-05, "loss": 0.347, "step": 14980 }, { "epoch": 19.168797953964194, "grad_norm": 4.042379379272461, "learning_rate": 1.8445121951219512e-05, "loss": 0.1863, "step": 14990 }, { "epoch": 19.18158567774936, "grad_norm": 2.36295747756958, "learning_rate": 1.8423344947735194e-05, "loss": 0.4143, "step": 15000 }, { "epoch": 19.194373401534527, "grad_norm": 2.999772787094116, "learning_rate": 1.8401567944250872e-05, "loss": 0.2193, "step": 15010 }, { "epoch": 19.207161125319693, "grad_norm": 1.0328400135040283, "learning_rate": 1.837979094076655e-05, "loss": 0.3591, "step": 15020 }, { "epoch": 19.21994884910486, "grad_norm": 0.0005838721990585327, "learning_rate": 1.8358013937282233e-05, "loss": 0.351, "step": 15030 }, { "epoch": 19.232736572890026, "grad_norm": 10.181427955627441, "learning_rate": 1.833623693379791e-05, "loss": 0.0954, "step": 15040 }, { "epoch": 19.245524296675192, "grad_norm": 6.923588080098853e-05, "learning_rate": 1.831445993031359e-05, "loss": 0.2054, "step": 15050 }, { "epoch": 19.25831202046036, "grad_norm": 3.050046920776367, "learning_rate": 1.8292682926829268e-05, "loss": 0.3645, "step": 15060 }, { "epoch": 19.271099744245525, "grad_norm": 0.18810555338859558, "learning_rate": 1.827090592334495e-05, "loss": 0.0994, "step": 15070 }, { "epoch": 19.28388746803069, "grad_norm": 4.1756439208984375, "learning_rate": 1.8249128919860628e-05, "loss": 0.2741, "step": 15080 }, { "epoch": 19.296675191815858, "grad_norm": 0.8578528165817261, "learning_rate": 1.8227351916376306e-05, "loss": 0.1935, "step": 15090 }, { "epoch": 19.309462915601024, "grad_norm": 1.3327559232711792, "learning_rate": 1.8205574912891988e-05, "loss": 0.241, "step": 15100 }, { "epoch": 19.32225063938619, "grad_norm": 5.43163537979126, "learning_rate": 1.8183797909407666e-05, "loss": 0.3708, "step": 15110 }, { "epoch": 19.335038363171357, "grad_norm": 0.055509597063064575, "learning_rate": 1.8162020905923348e-05, "loss": 0.4344, "step": 15120 }, { "epoch": 19.347826086956523, "grad_norm": 5.035632610321045, "learning_rate": 1.8140243902439026e-05, "loss": 0.2767, "step": 15130 }, { "epoch": 19.36061381074169, "grad_norm": 5.456521511077881, "learning_rate": 1.8118466898954705e-05, "loss": 0.3031, "step": 15140 }, { "epoch": 19.373401534526856, "grad_norm": 0.0010591910686343908, "learning_rate": 1.8096689895470383e-05, "loss": 0.1793, "step": 15150 }, { "epoch": 19.38618925831202, "grad_norm": 3.1625192165374756, "learning_rate": 1.8074912891986065e-05, "loss": 0.1645, "step": 15160 }, { "epoch": 19.398976982097185, "grad_norm": 4.394965648651123, "learning_rate": 1.8053135888501743e-05, "loss": 0.2339, "step": 15170 }, { "epoch": 19.41176470588235, "grad_norm": 6.195003032684326, "learning_rate": 1.803135888501742e-05, "loss": 0.3896, "step": 15180 }, { "epoch": 19.424552429667518, "grad_norm": 0.0053399535827338696, "learning_rate": 1.8009581881533103e-05, "loss": 0.339, "step": 15190 }, { "epoch": 19.437340153452684, "grad_norm": 0.00421350309625268, "learning_rate": 1.798780487804878e-05, "loss": 0.1229, "step": 15200 }, { "epoch": 19.45012787723785, "grad_norm": 0.14334776997566223, "learning_rate": 1.7966027874564463e-05, "loss": 0.1785, "step": 15210 }, { "epoch": 19.462915601023017, "grad_norm": 2.6690614223480225, "learning_rate": 1.7944250871080138e-05, "loss": 0.3094, "step": 15220 }, { "epoch": 19.475703324808183, "grad_norm": 0.42609289288520813, "learning_rate": 1.792247386759582e-05, "loss": 0.323, "step": 15230 }, { "epoch": 19.48849104859335, "grad_norm": 0.5637320876121521, "learning_rate": 1.79006968641115e-05, "loss": 0.3952, "step": 15240 }, { "epoch": 19.501278772378516, "grad_norm": 5.911632537841797, "learning_rate": 1.7878919860627177e-05, "loss": 0.2469, "step": 15250 }, { "epoch": 19.514066496163682, "grad_norm": 4.086767673492432, "learning_rate": 1.785714285714286e-05, "loss": 0.3565, "step": 15260 }, { "epoch": 19.52685421994885, "grad_norm": 1.9716821908950806, "learning_rate": 1.7835365853658537e-05, "loss": 0.095, "step": 15270 }, { "epoch": 19.539641943734015, "grad_norm": 0.5544086694717407, "learning_rate": 1.781358885017422e-05, "loss": 0.2009, "step": 15280 }, { "epoch": 19.55242966751918, "grad_norm": 0.000488213641801849, "learning_rate": 1.7791811846689897e-05, "loss": 0.4178, "step": 15290 }, { "epoch": 19.565217391304348, "grad_norm": 1.7613449096679688, "learning_rate": 1.7770034843205575e-05, "loss": 0.1983, "step": 15300 }, { "epoch": 19.578005115089514, "grad_norm": 0.23536629974842072, "learning_rate": 1.7748257839721254e-05, "loss": 0.1895, "step": 15310 }, { "epoch": 19.59079283887468, "grad_norm": 0.29210370779037476, "learning_rate": 1.7726480836236932e-05, "loss": 0.2842, "step": 15320 }, { "epoch": 19.603580562659847, "grad_norm": 0.4455021023750305, "learning_rate": 1.7704703832752614e-05, "loss": 0.176, "step": 15330 }, { "epoch": 19.616368286445013, "grad_norm": 2.8317060470581055, "learning_rate": 1.7682926829268292e-05, "loss": 0.3407, "step": 15340 }, { "epoch": 19.62915601023018, "grad_norm": 3.654068946838379, "learning_rate": 1.7661149825783974e-05, "loss": 0.2485, "step": 15350 }, { "epoch": 19.641943734015346, "grad_norm": 4.583719730377197, "learning_rate": 1.7639372822299652e-05, "loss": 0.2926, "step": 15360 }, { "epoch": 19.654731457800512, "grad_norm": 0.38903215527534485, "learning_rate": 1.7617595818815334e-05, "loss": 0.1887, "step": 15370 }, { "epoch": 19.66751918158568, "grad_norm": 7.089865684509277, "learning_rate": 1.7595818815331012e-05, "loss": 0.2881, "step": 15380 }, { "epoch": 19.680306905370845, "grad_norm": 5.161704063415527, "learning_rate": 1.757404181184669e-05, "loss": 0.2531, "step": 15390 }, { "epoch": 19.69309462915601, "grad_norm": 2.1623754501342773, "learning_rate": 1.755226480836237e-05, "loss": 0.2547, "step": 15400 }, { "epoch": 19.705882352941178, "grad_norm": 2.177671432495117, "learning_rate": 1.7530487804878047e-05, "loss": 0.301, "step": 15410 }, { "epoch": 19.718670076726344, "grad_norm": 11.073813438415527, "learning_rate": 1.750871080139373e-05, "loss": 0.4999, "step": 15420 }, { "epoch": 19.73145780051151, "grad_norm": 5.834652423858643, "learning_rate": 1.7486933797909408e-05, "loss": 0.3861, "step": 15430 }, { "epoch": 19.744245524296677, "grad_norm": 4.418552398681641, "learning_rate": 1.746515679442509e-05, "loss": 0.1229, "step": 15440 }, { "epoch": 19.757033248081843, "grad_norm": 2.298755168914795, "learning_rate": 1.7443379790940768e-05, "loss": 0.1937, "step": 15450 }, { "epoch": 19.76982097186701, "grad_norm": 0.003441237611696124, "learning_rate": 1.742160278745645e-05, "loss": 0.1667, "step": 15460 }, { "epoch": 19.782608695652176, "grad_norm": 0.5872055888175964, "learning_rate": 1.7399825783972128e-05, "loss": 0.1502, "step": 15470 }, { "epoch": 19.79539641943734, "grad_norm": 7.624274253845215, "learning_rate": 1.7378048780487806e-05, "loss": 0.363, "step": 15480 }, { "epoch": 19.808184143222505, "grad_norm": 5.002297401428223, "learning_rate": 1.7356271777003484e-05, "loss": 0.2403, "step": 15490 }, { "epoch": 19.82097186700767, "grad_norm": 2.922607898712158, "learning_rate": 1.7334494773519163e-05, "loss": 0.0844, "step": 15500 }, { "epoch": 19.833759590792837, "grad_norm": 5.784374713897705, "learning_rate": 1.7312717770034845e-05, "loss": 0.5496, "step": 15510 }, { "epoch": 19.846547314578004, "grad_norm": 0.09805575013160706, "learning_rate": 1.7290940766550523e-05, "loss": 0.0923, "step": 15520 }, { "epoch": 19.85933503836317, "grad_norm": 0.28828638792037964, "learning_rate": 1.7269163763066205e-05, "loss": 0.2232, "step": 15530 }, { "epoch": 19.872122762148337, "grad_norm": 1.204892635345459, "learning_rate": 1.7247386759581883e-05, "loss": 0.2714, "step": 15540 }, { "epoch": 19.884910485933503, "grad_norm": 3.157284736633301, "learning_rate": 1.722560975609756e-05, "loss": 0.1934, "step": 15550 }, { "epoch": 19.89769820971867, "grad_norm": 0.35447245836257935, "learning_rate": 1.7203832752613243e-05, "loss": 0.1453, "step": 15560 }, { "epoch": 19.910485933503836, "grad_norm": 5.422603130340576, "learning_rate": 1.7182055749128918e-05, "loss": 0.1164, "step": 15570 }, { "epoch": 19.923273657289002, "grad_norm": 0.6058647036552429, "learning_rate": 1.71602787456446e-05, "loss": 0.3696, "step": 15580 }, { "epoch": 19.93606138107417, "grad_norm": 2.1776912212371826, "learning_rate": 1.7138501742160278e-05, "loss": 0.3658, "step": 15590 }, { "epoch": 19.948849104859335, "grad_norm": 0.62024986743927, "learning_rate": 1.711672473867596e-05, "loss": 0.2492, "step": 15600 }, { "epoch": 19.9616368286445, "grad_norm": 6.89608907699585, "learning_rate": 1.7094947735191638e-05, "loss": 0.4042, "step": 15610 }, { "epoch": 19.974424552429667, "grad_norm": 3.269071340560913, "learning_rate": 1.707317073170732e-05, "loss": 0.1557, "step": 15620 }, { "epoch": 19.987212276214834, "grad_norm": 0.1415141373872757, "learning_rate": 1.7051393728223e-05, "loss": 0.0946, "step": 15630 }, { "epoch": 20.0, "grad_norm": 0.5312731862068176, "learning_rate": 1.7029616724738677e-05, "loss": 0.207, "step": 15640 }, { "epoch": 20.0, "eval_loss": 0.18845418095588684, "eval_runtime": 0.8176, "eval_samples_per_second": 119.862, "eval_steps_per_second": 15.9, "step": 15640 }, { "epoch": 20.012787723785166, "grad_norm": 5.064207553863525, "learning_rate": 1.7007839721254355e-05, "loss": 0.3085, "step": 15650 }, { "epoch": 20.025575447570333, "grad_norm": 0.05143703147768974, "learning_rate": 1.6986062717770033e-05, "loss": 0.2168, "step": 15660 }, { "epoch": 20.0383631713555, "grad_norm": 5.019478797912598, "learning_rate": 1.6964285714285715e-05, "loss": 0.3274, "step": 15670 }, { "epoch": 20.051150895140665, "grad_norm": 0.4289814531803131, "learning_rate": 1.6942508710801394e-05, "loss": 0.2389, "step": 15680 }, { "epoch": 20.06393861892583, "grad_norm": 0.20377962291240692, "learning_rate": 1.6920731707317075e-05, "loss": 0.2123, "step": 15690 }, { "epoch": 20.076726342710998, "grad_norm": 2.430692672729492, "learning_rate": 1.6898954703832754e-05, "loss": 0.226, "step": 15700 }, { "epoch": 20.089514066496164, "grad_norm": 0.026158737018704414, "learning_rate": 1.6877177700348432e-05, "loss": 0.2226, "step": 15710 }, { "epoch": 20.10230179028133, "grad_norm": 0.08608794212341309, "learning_rate": 1.6855400696864114e-05, "loss": 0.2112, "step": 15720 }, { "epoch": 20.115089514066497, "grad_norm": 0.00048582549788989127, "learning_rate": 1.6833623693379792e-05, "loss": 0.1476, "step": 15730 }, { "epoch": 20.127877237851663, "grad_norm": 0.0019287772011011839, "learning_rate": 1.681184668989547e-05, "loss": 0.0918, "step": 15740 }, { "epoch": 20.14066496163683, "grad_norm": 4.9623442464508116e-05, "learning_rate": 1.679006968641115e-05, "loss": 0.2505, "step": 15750 }, { "epoch": 20.153452685421996, "grad_norm": 5.41326904296875, "learning_rate": 1.676829268292683e-05, "loss": 0.3093, "step": 15760 }, { "epoch": 20.166240409207163, "grad_norm": 5.913987159729004, "learning_rate": 1.674651567944251e-05, "loss": 0.2159, "step": 15770 }, { "epoch": 20.17902813299233, "grad_norm": 0.5496914386749268, "learning_rate": 1.672473867595819e-05, "loss": 0.3173, "step": 15780 }, { "epoch": 20.191815856777495, "grad_norm": 0.007048303727060556, "learning_rate": 1.670296167247387e-05, "loss": 0.2433, "step": 15790 }, { "epoch": 20.20460358056266, "grad_norm": 4.826021671295166, "learning_rate": 1.6681184668989547e-05, "loss": 0.4035, "step": 15800 }, { "epoch": 20.217391304347824, "grad_norm": 0.10413353145122528, "learning_rate": 1.665940766550523e-05, "loss": 0.2547, "step": 15810 }, { "epoch": 20.23017902813299, "grad_norm": 0.14804640412330627, "learning_rate": 1.6637630662020908e-05, "loss": 0.5118, "step": 15820 }, { "epoch": 20.242966751918157, "grad_norm": 3.711113691329956, "learning_rate": 1.6615853658536586e-05, "loss": 0.291, "step": 15830 }, { "epoch": 20.255754475703323, "grad_norm": 5.286841869354248, "learning_rate": 1.6594076655052264e-05, "loss": 0.3255, "step": 15840 }, { "epoch": 20.26854219948849, "grad_norm": 1.9062235355377197, "learning_rate": 1.6572299651567946e-05, "loss": 0.4051, "step": 15850 }, { "epoch": 20.281329923273656, "grad_norm": 0.0009508104994893074, "learning_rate": 1.6550522648083624e-05, "loss": 0.2144, "step": 15860 }, { "epoch": 20.294117647058822, "grad_norm": 0.877557098865509, "learning_rate": 1.6528745644599303e-05, "loss": 0.1162, "step": 15870 }, { "epoch": 20.30690537084399, "grad_norm": 5.116530895233154, "learning_rate": 1.6506968641114984e-05, "loss": 0.3343, "step": 15880 }, { "epoch": 20.319693094629155, "grad_norm": 2.592881679534912, "learning_rate": 1.6485191637630663e-05, "loss": 0.164, "step": 15890 }, { "epoch": 20.33248081841432, "grad_norm": 1.5526633262634277, "learning_rate": 1.6463414634146345e-05, "loss": 0.2699, "step": 15900 }, { "epoch": 20.345268542199488, "grad_norm": 3.142625093460083, "learning_rate": 1.644163763066202e-05, "loss": 0.1924, "step": 15910 }, { "epoch": 20.358056265984654, "grad_norm": 0.4076800048351288, "learning_rate": 1.64198606271777e-05, "loss": 0.1584, "step": 15920 }, { "epoch": 20.37084398976982, "grad_norm": 5.538966655731201, "learning_rate": 1.639808362369338e-05, "loss": 0.2156, "step": 15930 }, { "epoch": 20.383631713554987, "grad_norm": 3.252673625946045, "learning_rate": 1.6376306620209058e-05, "loss": 0.1807, "step": 15940 }, { "epoch": 20.396419437340153, "grad_norm": 0.481976181268692, "learning_rate": 1.635452961672474e-05, "loss": 0.1951, "step": 15950 }, { "epoch": 20.40920716112532, "grad_norm": 0.04137945547699928, "learning_rate": 1.6332752613240418e-05, "loss": 0.2324, "step": 15960 }, { "epoch": 20.421994884910486, "grad_norm": 6.7527360916137695, "learning_rate": 1.63109756097561e-05, "loss": 0.2467, "step": 15970 }, { "epoch": 20.434782608695652, "grad_norm": 5.791774272918701, "learning_rate": 1.6289198606271778e-05, "loss": 0.2561, "step": 15980 }, { "epoch": 20.44757033248082, "grad_norm": 0.04831215366721153, "learning_rate": 1.6267421602787457e-05, "loss": 0.4145, "step": 15990 }, { "epoch": 20.460358056265985, "grad_norm": 0.03270556032657623, "learning_rate": 1.6245644599303135e-05, "loss": 0.2934, "step": 16000 }, { "epoch": 20.47314578005115, "grad_norm": 4.968059539794922, "learning_rate": 1.6223867595818817e-05, "loss": 0.458, "step": 16010 }, { "epoch": 20.485933503836318, "grad_norm": 0.7192120552062988, "learning_rate": 1.6202090592334495e-05, "loss": 0.3466, "step": 16020 }, { "epoch": 20.498721227621484, "grad_norm": 7.932967662811279, "learning_rate": 1.6180313588850173e-05, "loss": 0.3495, "step": 16030 }, { "epoch": 20.51150895140665, "grad_norm": 4.001988410949707, "learning_rate": 1.6158536585365855e-05, "loss": 0.2856, "step": 16040 }, { "epoch": 20.524296675191817, "grad_norm": 0.0006555592408403754, "learning_rate": 1.6136759581881533e-05, "loss": 0.3303, "step": 16050 }, { "epoch": 20.537084398976983, "grad_norm": 8.240303039550781, "learning_rate": 1.6114982578397215e-05, "loss": 0.1341, "step": 16060 }, { "epoch": 20.54987212276215, "grad_norm": 2.4732863903045654, "learning_rate": 1.6093205574912894e-05, "loss": 0.2962, "step": 16070 }, { "epoch": 20.562659846547316, "grad_norm": 4.178097248077393, "learning_rate": 1.6071428571428572e-05, "loss": 0.1694, "step": 16080 }, { "epoch": 20.575447570332482, "grad_norm": 0.32051607966423035, "learning_rate": 1.604965156794425e-05, "loss": 0.1181, "step": 16090 }, { "epoch": 20.58823529411765, "grad_norm": 8.6662015914917, "learning_rate": 1.602787456445993e-05, "loss": 0.3845, "step": 16100 }, { "epoch": 20.601023017902815, "grad_norm": 2.2716774940490723, "learning_rate": 1.600609756097561e-05, "loss": 0.2206, "step": 16110 }, { "epoch": 20.61381074168798, "grad_norm": 8.84775161743164, "learning_rate": 1.598432055749129e-05, "loss": 0.4821, "step": 16120 }, { "epoch": 20.626598465473144, "grad_norm": 0.0991571843624115, "learning_rate": 1.596254355400697e-05, "loss": 0.391, "step": 16130 }, { "epoch": 20.63938618925831, "grad_norm": 0.002819400280714035, "learning_rate": 1.594076655052265e-05, "loss": 0.3196, "step": 16140 }, { "epoch": 20.652173913043477, "grad_norm": 11.331056594848633, "learning_rate": 1.591898954703833e-05, "loss": 0.3294, "step": 16150 }, { "epoch": 20.664961636828643, "grad_norm": 0.011265520937740803, "learning_rate": 1.589721254355401e-05, "loss": 0.2331, "step": 16160 }, { "epoch": 20.67774936061381, "grad_norm": 1.6835376024246216, "learning_rate": 1.5875435540069687e-05, "loss": 0.1845, "step": 16170 }, { "epoch": 20.690537084398976, "grad_norm": 0.2547198235988617, "learning_rate": 1.5853658536585366e-05, "loss": 0.1338, "step": 16180 }, { "epoch": 20.703324808184142, "grad_norm": 0.7425469756126404, "learning_rate": 1.5831881533101044e-05, "loss": 0.1229, "step": 16190 }, { "epoch": 20.71611253196931, "grad_norm": 6.955141067504883, "learning_rate": 1.5810104529616726e-05, "loss": 0.4242, "step": 16200 }, { "epoch": 20.728900255754475, "grad_norm": 6.0939741134643555, "learning_rate": 1.5788327526132404e-05, "loss": 0.3109, "step": 16210 }, { "epoch": 20.74168797953964, "grad_norm": 0.008783875964581966, "learning_rate": 1.5766550522648086e-05, "loss": 0.3216, "step": 16220 }, { "epoch": 20.754475703324808, "grad_norm": 0.00038275119732134044, "learning_rate": 1.5744773519163764e-05, "loss": 0.1876, "step": 16230 }, { "epoch": 20.767263427109974, "grad_norm": 0.7452578544616699, "learning_rate": 1.5722996515679446e-05, "loss": 0.1195, "step": 16240 }, { "epoch": 20.78005115089514, "grad_norm": 0.0018406964372843504, "learning_rate": 1.5701219512195124e-05, "loss": 0.2781, "step": 16250 }, { "epoch": 20.792838874680307, "grad_norm": 2.2064616680145264, "learning_rate": 1.56794425087108e-05, "loss": 0.148, "step": 16260 }, { "epoch": 20.805626598465473, "grad_norm": 0.5615038871765137, "learning_rate": 1.565766550522648e-05, "loss": 0.2922, "step": 16270 }, { "epoch": 20.81841432225064, "grad_norm": 0.5706607103347778, "learning_rate": 1.563588850174216e-05, "loss": 0.2429, "step": 16280 }, { "epoch": 20.831202046035806, "grad_norm": 9.093474388122559, "learning_rate": 1.561411149825784e-05, "loss": 0.2946, "step": 16290 }, { "epoch": 20.843989769820972, "grad_norm": 0.25896137952804565, "learning_rate": 1.559233449477352e-05, "loss": 0.1847, "step": 16300 }, { "epoch": 20.85677749360614, "grad_norm": 1.9687992334365845, "learning_rate": 1.55705574912892e-05, "loss": 0.3932, "step": 16310 }, { "epoch": 20.869565217391305, "grad_norm": 7.081894397735596, "learning_rate": 1.554878048780488e-05, "loss": 0.3664, "step": 16320 }, { "epoch": 20.88235294117647, "grad_norm": 0.6577122211456299, "learning_rate": 1.5527003484320558e-05, "loss": 0.3076, "step": 16330 }, { "epoch": 20.895140664961637, "grad_norm": 2.853107452392578, "learning_rate": 1.5505226480836236e-05, "loss": 0.2537, "step": 16340 }, { "epoch": 20.907928388746804, "grad_norm": 0.6096291542053223, "learning_rate": 1.5483449477351915e-05, "loss": 0.1874, "step": 16350 }, { "epoch": 20.92071611253197, "grad_norm": 3.1739935874938965, "learning_rate": 1.5461672473867596e-05, "loss": 0.2585, "step": 16360 }, { "epoch": 20.933503836317136, "grad_norm": 0.1571437120437622, "learning_rate": 1.5439895470383275e-05, "loss": 0.1351, "step": 16370 }, { "epoch": 20.946291560102303, "grad_norm": 0.07612061500549316, "learning_rate": 1.5418118466898957e-05, "loss": 0.1971, "step": 16380 }, { "epoch": 20.95907928388747, "grad_norm": 0.008349299430847168, "learning_rate": 1.5396341463414635e-05, "loss": 0.241, "step": 16390 }, { "epoch": 20.971867007672635, "grad_norm": 0.2986217737197876, "learning_rate": 1.5374564459930317e-05, "loss": 0.209, "step": 16400 }, { "epoch": 20.984654731457802, "grad_norm": 0.2554277777671814, "learning_rate": 1.5352787456445995e-05, "loss": 0.2472, "step": 16410 }, { "epoch": 20.997442455242968, "grad_norm": 0.0008612891542725265, "learning_rate": 1.5331010452961673e-05, "loss": 0.1471, "step": 16420 }, { "epoch": 21.0, "eval_loss": 0.1870102435350418, "eval_runtime": 0.9883, "eval_samples_per_second": 99.165, "eval_steps_per_second": 13.155, "step": 16422 }, { "epoch": 21.010230179028135, "grad_norm": 3.3676488399505615, "learning_rate": 1.5309233449477352e-05, "loss": 0.2031, "step": 16430 }, { "epoch": 21.0230179028133, "grad_norm": 0.06823177635669708, "learning_rate": 1.528745644599303e-05, "loss": 0.2932, "step": 16440 }, { "epoch": 21.035805626598467, "grad_norm": 2.5173239707946777, "learning_rate": 1.5265679442508712e-05, "loss": 0.0969, "step": 16450 }, { "epoch": 21.04859335038363, "grad_norm": 0.048735931515693665, "learning_rate": 1.524390243902439e-05, "loss": 0.2956, "step": 16460 }, { "epoch": 21.061381074168796, "grad_norm": 7.9867262840271, "learning_rate": 1.5222125435540072e-05, "loss": 0.2633, "step": 16470 }, { "epoch": 21.074168797953963, "grad_norm": 0.06308836489915848, "learning_rate": 1.520034843205575e-05, "loss": 0.2263, "step": 16480 }, { "epoch": 21.08695652173913, "grad_norm": 8.272692680358887, "learning_rate": 1.5178571428571429e-05, "loss": 0.4894, "step": 16490 }, { "epoch": 21.099744245524295, "grad_norm": 7.9979376792907715, "learning_rate": 1.5156794425087109e-05, "loss": 0.3538, "step": 16500 }, { "epoch": 21.11253196930946, "grad_norm": 10.409687995910645, "learning_rate": 1.5135017421602787e-05, "loss": 0.4088, "step": 16510 }, { "epoch": 21.125319693094628, "grad_norm": 0.15129724144935608, "learning_rate": 1.5113240418118469e-05, "loss": 0.1055, "step": 16520 }, { "epoch": 21.138107416879794, "grad_norm": 8.348666191101074, "learning_rate": 1.5091463414634147e-05, "loss": 0.5288, "step": 16530 }, { "epoch": 21.15089514066496, "grad_norm": 0.0020471445750445127, "learning_rate": 1.5069686411149827e-05, "loss": 0.1927, "step": 16540 }, { "epoch": 21.163682864450127, "grad_norm": 0.013132164254784584, "learning_rate": 1.5047909407665506e-05, "loss": 0.0957, "step": 16550 }, { "epoch": 21.176470588235293, "grad_norm": 0.1892154961824417, "learning_rate": 1.5026132404181184e-05, "loss": 0.1987, "step": 16560 }, { "epoch": 21.18925831202046, "grad_norm": 2.367501974105835, "learning_rate": 1.5004355400696866e-05, "loss": 0.147, "step": 16570 }, { "epoch": 21.202046035805626, "grad_norm": 0.15295153856277466, "learning_rate": 1.4982578397212544e-05, "loss": 0.1955, "step": 16580 }, { "epoch": 21.214833759590793, "grad_norm": 0.3686113655567169, "learning_rate": 1.4960801393728224e-05, "loss": 0.3367, "step": 16590 }, { "epoch": 21.22762148337596, "grad_norm": 7.953524589538574, "learning_rate": 1.4939024390243902e-05, "loss": 0.0922, "step": 16600 }, { "epoch": 21.240409207161125, "grad_norm": 9.946479797363281, "learning_rate": 1.4917247386759584e-05, "loss": 0.298, "step": 16610 }, { "epoch": 21.25319693094629, "grad_norm": 9.536918640136719, "learning_rate": 1.4895470383275263e-05, "loss": 0.2268, "step": 16620 }, { "epoch": 21.265984654731458, "grad_norm": 7.6481194496154785, "learning_rate": 1.4873693379790943e-05, "loss": 0.5011, "step": 16630 }, { "epoch": 21.278772378516624, "grad_norm": 0.00011884182458743453, "learning_rate": 1.4851916376306621e-05, "loss": 0.2538, "step": 16640 }, { "epoch": 21.29156010230179, "grad_norm": 1.6625375747680664, "learning_rate": 1.48301393728223e-05, "loss": 0.2028, "step": 16650 }, { "epoch": 21.304347826086957, "grad_norm": 0.04297441244125366, "learning_rate": 1.4808362369337981e-05, "loss": 0.1176, "step": 16660 }, { "epoch": 21.317135549872123, "grad_norm": 2.1983182430267334, "learning_rate": 1.4786585365853658e-05, "loss": 0.2772, "step": 16670 }, { "epoch": 21.32992327365729, "grad_norm": 0.23334167897701263, "learning_rate": 1.476480836236934e-05, "loss": 0.3034, "step": 16680 }, { "epoch": 21.342710997442456, "grad_norm": 6.521374702453613, "learning_rate": 1.4743031358885018e-05, "loss": 0.4314, "step": 16690 }, { "epoch": 21.355498721227622, "grad_norm": 3.0311429500579834, "learning_rate": 1.4721254355400698e-05, "loss": 0.1632, "step": 16700 }, { "epoch": 21.36828644501279, "grad_norm": 1.5404332876205444, "learning_rate": 1.4699477351916376e-05, "loss": 0.1377, "step": 16710 }, { "epoch": 21.381074168797955, "grad_norm": 3.868997573852539, "learning_rate": 1.4677700348432055e-05, "loss": 0.1567, "step": 16720 }, { "epoch": 21.39386189258312, "grad_norm": 0.3404499888420105, "learning_rate": 1.4655923344947736e-05, "loss": 0.2714, "step": 16730 }, { "epoch": 21.406649616368288, "grad_norm": 2.7388010025024414, "learning_rate": 1.4634146341463415e-05, "loss": 0.1416, "step": 16740 }, { "epoch": 21.419437340153454, "grad_norm": 0.022615350782871246, "learning_rate": 1.4612369337979095e-05, "loss": 0.1607, "step": 16750 }, { "epoch": 21.43222506393862, "grad_norm": 7.353601932525635, "learning_rate": 1.4590592334494773e-05, "loss": 0.4509, "step": 16760 }, { "epoch": 21.445012787723787, "grad_norm": 5.572024345397949, "learning_rate": 1.4568815331010455e-05, "loss": 0.2129, "step": 16770 }, { "epoch": 21.45780051150895, "grad_norm": 2.0802969932556152, "learning_rate": 1.4547038327526133e-05, "loss": 0.3515, "step": 16780 }, { "epoch": 21.470588235294116, "grad_norm": 10.334312438964844, "learning_rate": 1.4525261324041812e-05, "loss": 0.4857, "step": 16790 }, { "epoch": 21.483375959079282, "grad_norm": 32.95657730102539, "learning_rate": 1.4503484320557492e-05, "loss": 0.2048, "step": 16800 }, { "epoch": 21.49616368286445, "grad_norm": 2.74238657951355, "learning_rate": 1.448170731707317e-05, "loss": 0.1129, "step": 16810 }, { "epoch": 21.508951406649615, "grad_norm": 0.7832550406455994, "learning_rate": 1.4459930313588852e-05, "loss": 0.165, "step": 16820 }, { "epoch": 21.52173913043478, "grad_norm": 0.30068516731262207, "learning_rate": 1.443815331010453e-05, "loss": 0.442, "step": 16830 }, { "epoch": 21.534526854219948, "grad_norm": 2.967395782470703, "learning_rate": 1.441637630662021e-05, "loss": 0.205, "step": 16840 }, { "epoch": 21.547314578005114, "grad_norm": 0.000712763809133321, "learning_rate": 1.4394599303135889e-05, "loss": 0.299, "step": 16850 }, { "epoch": 21.56010230179028, "grad_norm": 3.7180092334747314, "learning_rate": 1.437282229965157e-05, "loss": 0.3639, "step": 16860 }, { "epoch": 21.572890025575447, "grad_norm": 1.8158949613571167, "learning_rate": 1.4351045296167249e-05, "loss": 0.4136, "step": 16870 }, { "epoch": 21.585677749360613, "grad_norm": 7.350913047790527, "learning_rate": 1.4329268292682927e-05, "loss": 0.5053, "step": 16880 }, { "epoch": 21.59846547314578, "grad_norm": 1.9012134075164795, "learning_rate": 1.4307491289198607e-05, "loss": 0.2196, "step": 16890 }, { "epoch": 21.611253196930946, "grad_norm": 1.2609905004501343, "learning_rate": 1.4285714285714285e-05, "loss": 0.1712, "step": 16900 }, { "epoch": 21.624040920716112, "grad_norm": 9.658625602722168, "learning_rate": 1.4263937282229967e-05, "loss": 0.258, "step": 16910 }, { "epoch": 21.63682864450128, "grad_norm": 1.7282674312591553, "learning_rate": 1.4242160278745646e-05, "loss": 0.3437, "step": 16920 }, { "epoch": 21.649616368286445, "grad_norm": 0.8424760699272156, "learning_rate": 1.4220383275261326e-05, "loss": 0.2512, "step": 16930 }, { "epoch": 21.66240409207161, "grad_norm": 0.018984561786055565, "learning_rate": 1.4198606271777004e-05, "loss": 0.1251, "step": 16940 }, { "epoch": 21.675191815856778, "grad_norm": 6.375213146209717, "learning_rate": 1.4176829268292682e-05, "loss": 0.2275, "step": 16950 }, { "epoch": 21.687979539641944, "grad_norm": 2.3540890216827393, "learning_rate": 1.4155052264808364e-05, "loss": 0.2892, "step": 16960 }, { "epoch": 21.70076726342711, "grad_norm": 2.8791966438293457, "learning_rate": 1.413327526132404e-05, "loss": 0.3235, "step": 16970 }, { "epoch": 21.713554987212277, "grad_norm": 4.614317893981934, "learning_rate": 1.4111498257839722e-05, "loss": 0.2368, "step": 16980 }, { "epoch": 21.726342710997443, "grad_norm": 0.14197991788387299, "learning_rate": 1.40897212543554e-05, "loss": 0.077, "step": 16990 }, { "epoch": 21.73913043478261, "grad_norm": 13.631002426147461, "learning_rate": 1.4067944250871083e-05, "loss": 0.1937, "step": 17000 }, { "epoch": 21.751918158567776, "grad_norm": 5.711042404174805, "learning_rate": 1.404616724738676e-05, "loss": 0.2572, "step": 17010 }, { "epoch": 21.764705882352942, "grad_norm": 3.289583206176758, "learning_rate": 1.4024390243902441e-05, "loss": 0.279, "step": 17020 }, { "epoch": 21.77749360613811, "grad_norm": 1.9635261297225952, "learning_rate": 1.400261324041812e-05, "loss": 0.4068, "step": 17030 }, { "epoch": 21.790281329923275, "grad_norm": 3.4247848987579346, "learning_rate": 1.3980836236933798e-05, "loss": 0.2091, "step": 17040 }, { "epoch": 21.80306905370844, "grad_norm": 0.00046931157703511417, "learning_rate": 1.3959059233449478e-05, "loss": 0.1642, "step": 17050 }, { "epoch": 21.815856777493607, "grad_norm": 2.2263681888580322, "learning_rate": 1.3937282229965156e-05, "loss": 0.2707, "step": 17060 }, { "epoch": 21.828644501278774, "grad_norm": 4.848337173461914, "learning_rate": 1.3915505226480838e-05, "loss": 0.2175, "step": 17070 }, { "epoch": 21.84143222506394, "grad_norm": 0.009802093729376793, "learning_rate": 1.3893728222996516e-05, "loss": 0.191, "step": 17080 }, { "epoch": 21.854219948849106, "grad_norm": 0.4041551649570465, "learning_rate": 1.3871951219512196e-05, "loss": 0.1496, "step": 17090 }, { "epoch": 21.867007672634273, "grad_norm": 2.5238401889801025, "learning_rate": 1.3850174216027875e-05, "loss": 0.1683, "step": 17100 }, { "epoch": 21.87979539641944, "grad_norm": 1.646201491355896, "learning_rate": 1.3828397212543553e-05, "loss": 0.3284, "step": 17110 }, { "epoch": 21.892583120204602, "grad_norm": 12.850646018981934, "learning_rate": 1.3806620209059235e-05, "loss": 0.2655, "step": 17120 }, { "epoch": 21.90537084398977, "grad_norm": 0.1326649785041809, "learning_rate": 1.3784843205574913e-05, "loss": 0.2353, "step": 17130 }, { "epoch": 21.918158567774935, "grad_norm": 0.05697041004896164, "learning_rate": 1.3763066202090593e-05, "loss": 0.2626, "step": 17140 }, { "epoch": 21.9309462915601, "grad_norm": 2.5305371284484863, "learning_rate": 1.3741289198606271e-05, "loss": 0.2074, "step": 17150 }, { "epoch": 21.943734015345267, "grad_norm": 1.367523193359375, "learning_rate": 1.3719512195121953e-05, "loss": 0.0912, "step": 17160 }, { "epoch": 21.956521739130434, "grad_norm": 4.560670375823975, "learning_rate": 1.3697735191637632e-05, "loss": 0.2482, "step": 17170 }, { "epoch": 21.9693094629156, "grad_norm": 0.6231527924537659, "learning_rate": 1.367595818815331e-05, "loss": 0.2624, "step": 17180 }, { "epoch": 21.982097186700766, "grad_norm": 6.82766056060791, "learning_rate": 1.365418118466899e-05, "loss": 0.2555, "step": 17190 }, { "epoch": 21.994884910485933, "grad_norm": 0.7137308120727539, "learning_rate": 1.3632404181184668e-05, "loss": 0.1362, "step": 17200 }, { "epoch": 22.0, "eval_loss": 0.18604347109794617, "eval_runtime": 0.9758, "eval_samples_per_second": 100.43, "eval_steps_per_second": 13.322, "step": 17204 }, { "epoch": 22.0076726342711, "grad_norm": 3.093580961227417, "learning_rate": 1.361062717770035e-05, "loss": 0.2791, "step": 17210 }, { "epoch": 22.020460358056265, "grad_norm": 0.4014206528663635, "learning_rate": 1.3588850174216028e-05, "loss": 0.3631, "step": 17220 }, { "epoch": 22.033248081841432, "grad_norm": 8.501093543600291e-05, "learning_rate": 1.3567073170731709e-05, "loss": 0.281, "step": 17230 }, { "epoch": 22.046035805626598, "grad_norm": 8.508047103881836, "learning_rate": 1.3545296167247387e-05, "loss": 0.2143, "step": 17240 }, { "epoch": 22.058823529411764, "grad_norm": 6.361041069030762, "learning_rate": 1.3523519163763069e-05, "loss": 0.3231, "step": 17250 }, { "epoch": 22.07161125319693, "grad_norm": 6.605595111846924, "learning_rate": 1.3501742160278747e-05, "loss": 0.5286, "step": 17260 }, { "epoch": 22.084398976982097, "grad_norm": 0.00041541698738001287, "learning_rate": 1.3479965156794425e-05, "loss": 0.3769, "step": 17270 }, { "epoch": 22.097186700767264, "grad_norm": 0.15628832578659058, "learning_rate": 1.3458188153310105e-05, "loss": 0.1301, "step": 17280 }, { "epoch": 22.10997442455243, "grad_norm": 2.120054244995117, "learning_rate": 1.3436411149825784e-05, "loss": 0.1104, "step": 17290 }, { "epoch": 22.122762148337596, "grad_norm": 3.07108473777771, "learning_rate": 1.3414634146341466e-05, "loss": 0.1603, "step": 17300 }, { "epoch": 22.135549872122763, "grad_norm": 5.679004669189453, "learning_rate": 1.3392857142857144e-05, "loss": 0.2435, "step": 17310 }, { "epoch": 22.14833759590793, "grad_norm": 5.24109411239624, "learning_rate": 1.3371080139372824e-05, "loss": 0.234, "step": 17320 }, { "epoch": 22.161125319693095, "grad_norm": 4.5727949142456055, "learning_rate": 1.3349303135888502e-05, "loss": 0.1631, "step": 17330 }, { "epoch": 22.17391304347826, "grad_norm": 3.552060127258301, "learning_rate": 1.332752613240418e-05, "loss": 0.148, "step": 17340 }, { "epoch": 22.186700767263428, "grad_norm": 0.00046669490984641016, "learning_rate": 1.3305749128919862e-05, "loss": 0.2155, "step": 17350 }, { "epoch": 22.199488491048594, "grad_norm": 0.26570141315460205, "learning_rate": 1.3283972125435539e-05, "loss": 0.2783, "step": 17360 }, { "epoch": 22.21227621483376, "grad_norm": 8.340713500976562, "learning_rate": 1.326219512195122e-05, "loss": 0.2665, "step": 17370 }, { "epoch": 22.225063938618927, "grad_norm": 0.003582471050322056, "learning_rate": 1.3240418118466899e-05, "loss": 0.2328, "step": 17380 }, { "epoch": 22.237851662404093, "grad_norm": 0.14131903648376465, "learning_rate": 1.3218641114982581e-05, "loss": 0.2196, "step": 17390 }, { "epoch": 22.25063938618926, "grad_norm": 0.5135777592658997, "learning_rate": 1.3196864111498258e-05, "loss": 0.2071, "step": 17400 }, { "epoch": 22.263427109974426, "grad_norm": 0.34391576051712036, "learning_rate": 1.3175087108013936e-05, "loss": 0.0953, "step": 17410 }, { "epoch": 22.276214833759592, "grad_norm": 2.3617889881134033, "learning_rate": 1.3153310104529618e-05, "loss": 0.1522, "step": 17420 }, { "epoch": 22.289002557544755, "grad_norm": 0.0006938801379874349, "learning_rate": 1.3131533101045296e-05, "loss": 0.1555, "step": 17430 }, { "epoch": 22.30179028132992, "grad_norm": 2.4944310188293457, "learning_rate": 1.3109756097560976e-05, "loss": 0.278, "step": 17440 }, { "epoch": 22.314578005115088, "grad_norm": 1.586595892906189, "learning_rate": 1.3087979094076654e-05, "loss": 0.2938, "step": 17450 }, { "epoch": 22.327365728900254, "grad_norm": 0.518889307975769, "learning_rate": 1.3066202090592336e-05, "loss": 0.2189, "step": 17460 }, { "epoch": 22.34015345268542, "grad_norm": 3.496354111121036e-05, "learning_rate": 1.3044425087108015e-05, "loss": 0.3248, "step": 17470 }, { "epoch": 22.352941176470587, "grad_norm": 0.38596707582473755, "learning_rate": 1.3022648083623695e-05, "loss": 0.3765, "step": 17480 }, { "epoch": 22.365728900255753, "grad_norm": 0.0012932750396430492, "learning_rate": 1.3000871080139373e-05, "loss": 0.2076, "step": 17490 }, { "epoch": 22.37851662404092, "grad_norm": 0.022681573405861855, "learning_rate": 1.2979094076655051e-05, "loss": 0.0535, "step": 17500 }, { "epoch": 22.391304347826086, "grad_norm": 0.041287824511528015, "learning_rate": 1.2957317073170733e-05, "loss": 0.2799, "step": 17510 }, { "epoch": 22.404092071611252, "grad_norm": 0.015387671068310738, "learning_rate": 1.2935540069686411e-05, "loss": 0.4086, "step": 17520 }, { "epoch": 22.41687979539642, "grad_norm": 0.161788210272789, "learning_rate": 1.2913763066202091e-05, "loss": 0.0525, "step": 17530 }, { "epoch": 22.429667519181585, "grad_norm": 0.8831532001495361, "learning_rate": 1.289198606271777e-05, "loss": 0.3695, "step": 17540 }, { "epoch": 22.44245524296675, "grad_norm": 0.14793658256530762, "learning_rate": 1.2870209059233452e-05, "loss": 0.1574, "step": 17550 }, { "epoch": 22.455242966751918, "grad_norm": 1.977602243423462, "learning_rate": 1.284843205574913e-05, "loss": 0.3135, "step": 17560 }, { "epoch": 22.468030690537084, "grad_norm": 1.72086763381958, "learning_rate": 1.2826655052264808e-05, "loss": 0.2518, "step": 17570 }, { "epoch": 22.48081841432225, "grad_norm": 1.3766134977340698, "learning_rate": 1.2804878048780488e-05, "loss": 0.2637, "step": 17580 }, { "epoch": 22.493606138107417, "grad_norm": 1.531388282775879, "learning_rate": 1.2783101045296167e-05, "loss": 0.2058, "step": 17590 }, { "epoch": 22.506393861892583, "grad_norm": 0.3944302499294281, "learning_rate": 1.2761324041811848e-05, "loss": 0.0879, "step": 17600 }, { "epoch": 22.51918158567775, "grad_norm": 2.6415324211120605, "learning_rate": 1.2739547038327527e-05, "loss": 0.3183, "step": 17610 }, { "epoch": 22.531969309462916, "grad_norm": 0.9195997714996338, "learning_rate": 1.2717770034843207e-05, "loss": 0.1717, "step": 17620 }, { "epoch": 22.544757033248082, "grad_norm": 0.0030467098113149405, "learning_rate": 1.2695993031358885e-05, "loss": 0.2261, "step": 17630 }, { "epoch": 22.55754475703325, "grad_norm": 9.557846069335938, "learning_rate": 1.2674216027874567e-05, "loss": 0.3103, "step": 17640 }, { "epoch": 22.570332480818415, "grad_norm": 4.649169921875, "learning_rate": 1.2652439024390245e-05, "loss": 0.4754, "step": 17650 }, { "epoch": 22.58312020460358, "grad_norm": 0.0008290003752335906, "learning_rate": 1.2630662020905924e-05, "loss": 0.3525, "step": 17660 }, { "epoch": 22.595907928388748, "grad_norm": 4.795851230621338, "learning_rate": 1.2608885017421604e-05, "loss": 0.2181, "step": 17670 }, { "epoch": 22.608695652173914, "grad_norm": 5.425566673278809, "learning_rate": 1.2587108013937282e-05, "loss": 0.4152, "step": 17680 }, { "epoch": 22.62148337595908, "grad_norm": 10.171377182006836, "learning_rate": 1.2565331010452964e-05, "loss": 0.1547, "step": 17690 }, { "epoch": 22.634271099744247, "grad_norm": 16.753875732421875, "learning_rate": 1.2543554006968642e-05, "loss": 0.3034, "step": 17700 }, { "epoch": 22.647058823529413, "grad_norm": 4.119002819061279, "learning_rate": 1.2521777003484322e-05, "loss": 0.2585, "step": 17710 }, { "epoch": 22.65984654731458, "grad_norm": 0.46646326780319214, "learning_rate": 1.25e-05, "loss": 0.1874, "step": 17720 }, { "epoch": 22.672634271099746, "grad_norm": 4.02834939956665, "learning_rate": 1.247822299651568e-05, "loss": 0.1695, "step": 17730 }, { "epoch": 22.685421994884912, "grad_norm": 0.11288446187973022, "learning_rate": 1.2456445993031359e-05, "loss": 0.2845, "step": 17740 }, { "epoch": 22.69820971867008, "grad_norm": 0.019553814083337784, "learning_rate": 1.2434668989547039e-05, "loss": 0.2784, "step": 17750 }, { "epoch": 22.710997442455245, "grad_norm": 0.01391393318772316, "learning_rate": 1.2412891986062717e-05, "loss": 0.12, "step": 17760 }, { "epoch": 22.723785166240408, "grad_norm": 2.1759300231933594, "learning_rate": 1.2391114982578397e-05, "loss": 0.0794, "step": 17770 }, { "epoch": 22.736572890025574, "grad_norm": 0.0018889792263507843, "learning_rate": 1.2369337979094078e-05, "loss": 0.1777, "step": 17780 }, { "epoch": 22.74936061381074, "grad_norm": 0.005631202831864357, "learning_rate": 1.2347560975609756e-05, "loss": 0.3136, "step": 17790 }, { "epoch": 22.762148337595907, "grad_norm": 9.254101753234863, "learning_rate": 1.2325783972125436e-05, "loss": 0.313, "step": 17800 }, { "epoch": 22.774936061381073, "grad_norm": 1.4763479232788086, "learning_rate": 1.2304006968641116e-05, "loss": 0.1645, "step": 17810 }, { "epoch": 22.78772378516624, "grad_norm": 0.2448328286409378, "learning_rate": 1.2282229965156796e-05, "loss": 0.1259, "step": 17820 }, { "epoch": 22.800511508951406, "grad_norm": 0.14641886949539185, "learning_rate": 1.2260452961672474e-05, "loss": 0.261, "step": 17830 }, { "epoch": 22.813299232736572, "grad_norm": 0.4757079482078552, "learning_rate": 1.2238675958188153e-05, "loss": 0.2618, "step": 17840 }, { "epoch": 22.82608695652174, "grad_norm": 0.00018090580124408007, "learning_rate": 1.2216898954703833e-05, "loss": 0.2549, "step": 17850 }, { "epoch": 22.838874680306905, "grad_norm": 0.9151415228843689, "learning_rate": 1.2195121951219513e-05, "loss": 0.2213, "step": 17860 }, { "epoch": 22.85166240409207, "grad_norm": 6.071374893188477, "learning_rate": 1.2173344947735193e-05, "loss": 0.38, "step": 17870 }, { "epoch": 22.864450127877237, "grad_norm": 8.787729263305664, "learning_rate": 1.2151567944250871e-05, "loss": 0.1553, "step": 17880 }, { "epoch": 22.877237851662404, "grad_norm": 0.00015920742589514703, "learning_rate": 1.2129790940766551e-05, "loss": 0.3884, "step": 17890 }, { "epoch": 22.89002557544757, "grad_norm": 1.9935282468795776, "learning_rate": 1.2108013937282231e-05, "loss": 0.3219, "step": 17900 }, { "epoch": 22.902813299232736, "grad_norm": 1.669886589050293, "learning_rate": 1.2086236933797911e-05, "loss": 0.1544, "step": 17910 }, { "epoch": 22.915601023017903, "grad_norm": 5.934450149536133, "learning_rate": 1.2064459930313588e-05, "loss": 0.1828, "step": 17920 }, { "epoch": 22.92838874680307, "grad_norm": 0.0014017786597833037, "learning_rate": 1.2042682926829268e-05, "loss": 0.2292, "step": 17930 }, { "epoch": 22.941176470588236, "grad_norm": 0.057678937911987305, "learning_rate": 1.2020905923344948e-05, "loss": 0.1398, "step": 17940 }, { "epoch": 22.953964194373402, "grad_norm": 5.233877658843994, "learning_rate": 1.1999128919860628e-05, "loss": 0.2214, "step": 17950 }, { "epoch": 22.966751918158568, "grad_norm": 5.719663619995117, "learning_rate": 1.1977351916376307e-05, "loss": 0.4731, "step": 17960 }, { "epoch": 22.979539641943735, "grad_norm": 3.918996572494507, "learning_rate": 1.1955574912891987e-05, "loss": 0.1611, "step": 17970 }, { "epoch": 22.9923273657289, "grad_norm": 4.605996608734131, "learning_rate": 1.1933797909407667e-05, "loss": 0.1637, "step": 17980 }, { "epoch": 23.0, "eval_loss": 0.18160267174243927, "eval_runtime": 0.9793, "eval_samples_per_second": 100.07, "eval_steps_per_second": 13.275, "step": 17986 }, { "epoch": 23.005115089514067, "grad_norm": 0.10118851810693741, "learning_rate": 1.1912020905923347e-05, "loss": 0.1527, "step": 17990 }, { "epoch": 23.017902813299234, "grad_norm": 2.708587646484375, "learning_rate": 1.1890243902439025e-05, "loss": 0.13, "step": 18000 }, { "epoch": 23.0306905370844, "grad_norm": 6.671478748321533, "learning_rate": 1.1868466898954703e-05, "loss": 0.3244, "step": 18010 }, { "epoch": 23.043478260869566, "grad_norm": 5.663084983825684, "learning_rate": 1.1846689895470384e-05, "loss": 0.2156, "step": 18020 }, { "epoch": 23.056265984654733, "grad_norm": 3.7614872455596924, "learning_rate": 1.1824912891986064e-05, "loss": 0.1401, "step": 18030 }, { "epoch": 23.0690537084399, "grad_norm": 0.7713243365287781, "learning_rate": 1.1803135888501744e-05, "loss": 0.1942, "step": 18040 }, { "epoch": 23.081841432225065, "grad_norm": 4.182374686934054e-05, "learning_rate": 1.1781358885017422e-05, "loss": 0.1929, "step": 18050 }, { "epoch": 23.09462915601023, "grad_norm": 4.289588451385498, "learning_rate": 1.1759581881533102e-05, "loss": 0.1914, "step": 18060 }, { "epoch": 23.107416879795398, "grad_norm": 2.5513131618499756, "learning_rate": 1.173780487804878e-05, "loss": 0.1723, "step": 18070 }, { "epoch": 23.120204603580564, "grad_norm": 1.6158396005630493, "learning_rate": 1.171602787456446e-05, "loss": 0.2443, "step": 18080 }, { "epoch": 23.132992327365727, "grad_norm": 6.644562244415283, "learning_rate": 1.1694250871080139e-05, "loss": 0.2504, "step": 18090 }, { "epoch": 23.145780051150894, "grad_norm": 3.37807035446167, "learning_rate": 1.1672473867595819e-05, "loss": 0.2057, "step": 18100 }, { "epoch": 23.15856777493606, "grad_norm": 1.8405261039733887, "learning_rate": 1.1650696864111499e-05, "loss": 0.2303, "step": 18110 }, { "epoch": 23.171355498721226, "grad_norm": 0.35012125968933105, "learning_rate": 1.1628919860627179e-05, "loss": 0.1204, "step": 18120 }, { "epoch": 23.184143222506393, "grad_norm": 7.988998889923096, "learning_rate": 1.1607142857142857e-05, "loss": 0.2576, "step": 18130 }, { "epoch": 23.19693094629156, "grad_norm": 0.010078749619424343, "learning_rate": 1.1585365853658537e-05, "loss": 0.1435, "step": 18140 }, { "epoch": 23.209718670076725, "grad_norm": 0.5293736457824707, "learning_rate": 1.1563588850174216e-05, "loss": 0.1967, "step": 18150 }, { "epoch": 23.22250639386189, "grad_norm": 0.9761539697647095, "learning_rate": 1.1541811846689896e-05, "loss": 0.1991, "step": 18160 }, { "epoch": 23.235294117647058, "grad_norm": 1.45791494846344, "learning_rate": 1.1520034843205576e-05, "loss": 0.4252, "step": 18170 }, { "epoch": 23.248081841432224, "grad_norm": 0.30160781741142273, "learning_rate": 1.1498257839721254e-05, "loss": 0.2784, "step": 18180 }, { "epoch": 23.26086956521739, "grad_norm": 6.1339898109436035, "learning_rate": 1.1476480836236934e-05, "loss": 0.1893, "step": 18190 }, { "epoch": 23.273657289002557, "grad_norm": 2.0641069412231445, "learning_rate": 1.1454703832752614e-05, "loss": 0.2487, "step": 18200 }, { "epoch": 23.286445012787723, "grad_norm": 0.19321462512016296, "learning_rate": 1.1432926829268294e-05, "loss": 0.2067, "step": 18210 }, { "epoch": 23.29923273657289, "grad_norm": 0.0008184879552572966, "learning_rate": 1.1411149825783973e-05, "loss": 0.2678, "step": 18220 }, { "epoch": 23.312020460358056, "grad_norm": 6.459168910980225, "learning_rate": 1.1389372822299651e-05, "loss": 0.1284, "step": 18230 }, { "epoch": 23.324808184143222, "grad_norm": 5.253271579742432, "learning_rate": 1.1367595818815331e-05, "loss": 0.2221, "step": 18240 }, { "epoch": 23.33759590792839, "grad_norm": 0.016655512154102325, "learning_rate": 1.1345818815331011e-05, "loss": 0.1715, "step": 18250 }, { "epoch": 23.350383631713555, "grad_norm": 1.1747348308563232, "learning_rate": 1.132404181184669e-05, "loss": 0.2928, "step": 18260 }, { "epoch": 23.36317135549872, "grad_norm": 10.306817054748535, "learning_rate": 1.130226480836237e-05, "loss": 0.2723, "step": 18270 }, { "epoch": 23.375959079283888, "grad_norm": 2.6417109966278076, "learning_rate": 1.128048780487805e-05, "loss": 0.3317, "step": 18280 }, { "epoch": 23.388746803069054, "grad_norm": 1.2946593761444092, "learning_rate": 1.125871080139373e-05, "loss": 0.3156, "step": 18290 }, { "epoch": 23.40153452685422, "grad_norm": 1.994095802307129, "learning_rate": 1.1236933797909408e-05, "loss": 0.3111, "step": 18300 }, { "epoch": 23.414322250639387, "grad_norm": 1.8308746814727783, "learning_rate": 1.1215156794425086e-05, "loss": 0.377, "step": 18310 }, { "epoch": 23.427109974424553, "grad_norm": 0.07952219992876053, "learning_rate": 1.1193379790940766e-05, "loss": 0.3723, "step": 18320 }, { "epoch": 23.43989769820972, "grad_norm": 0.6271858215332031, "learning_rate": 1.1171602787456447e-05, "loss": 0.2829, "step": 18330 }, { "epoch": 23.452685421994886, "grad_norm": 5.1058573722839355, "learning_rate": 1.1149825783972127e-05, "loss": 0.4572, "step": 18340 }, { "epoch": 23.465473145780052, "grad_norm": 1.6570324897766113, "learning_rate": 1.1128048780487805e-05, "loss": 0.2236, "step": 18350 }, { "epoch": 23.47826086956522, "grad_norm": 0.3510279357433319, "learning_rate": 1.1106271777003485e-05, "loss": 0.2297, "step": 18360 }, { "epoch": 23.491048593350385, "grad_norm": 0.014843414537608624, "learning_rate": 1.1084494773519165e-05, "loss": 0.3306, "step": 18370 }, { "epoch": 23.50383631713555, "grad_norm": 1.8954602479934692, "learning_rate": 1.1062717770034843e-05, "loss": 0.1152, "step": 18380 }, { "epoch": 23.516624040920718, "grad_norm": 0.40667295455932617, "learning_rate": 1.1040940766550523e-05, "loss": 0.1229, "step": 18390 }, { "epoch": 23.529411764705884, "grad_norm": 3.786303758621216, "learning_rate": 1.1019163763066202e-05, "loss": 0.2242, "step": 18400 }, { "epoch": 23.54219948849105, "grad_norm": 0.021371856331825256, "learning_rate": 1.0997386759581882e-05, "loss": 0.3529, "step": 18410 }, { "epoch": 23.554987212276213, "grad_norm": 0.805793821811676, "learning_rate": 1.0975609756097562e-05, "loss": 0.1224, "step": 18420 }, { "epoch": 23.56777493606138, "grad_norm": 3.2790889739990234, "learning_rate": 1.0953832752613242e-05, "loss": 0.1574, "step": 18430 }, { "epoch": 23.580562659846546, "grad_norm": 3.754932165145874, "learning_rate": 1.093205574912892e-05, "loss": 0.1643, "step": 18440 }, { "epoch": 23.593350383631712, "grad_norm": 3.4517104625701904, "learning_rate": 1.09102787456446e-05, "loss": 0.1885, "step": 18450 }, { "epoch": 23.60613810741688, "grad_norm": 1.3036948442459106, "learning_rate": 1.0888501742160279e-05, "loss": 0.2806, "step": 18460 }, { "epoch": 23.618925831202045, "grad_norm": 0.0014313430292531848, "learning_rate": 1.0866724738675959e-05, "loss": 0.2697, "step": 18470 }, { "epoch": 23.63171355498721, "grad_norm": 2.177548408508301, "learning_rate": 1.0844947735191637e-05, "loss": 0.3779, "step": 18480 }, { "epoch": 23.644501278772378, "grad_norm": 3.585585594177246, "learning_rate": 1.0823170731707317e-05, "loss": 0.3034, "step": 18490 }, { "epoch": 23.657289002557544, "grad_norm": 0.02034856006503105, "learning_rate": 1.0801393728222997e-05, "loss": 0.2041, "step": 18500 }, { "epoch": 23.67007672634271, "grad_norm": 1.1447325944900513, "learning_rate": 1.0779616724738677e-05, "loss": 0.4234, "step": 18510 }, { "epoch": 23.682864450127877, "grad_norm": 1.8239119052886963, "learning_rate": 1.0757839721254356e-05, "loss": 0.5485, "step": 18520 }, { "epoch": 23.695652173913043, "grad_norm": 4.72014856338501, "learning_rate": 1.0736062717770036e-05, "loss": 0.1865, "step": 18530 }, { "epoch": 23.70843989769821, "grad_norm": 0.9234018325805664, "learning_rate": 1.0714285714285714e-05, "loss": 0.2055, "step": 18540 }, { "epoch": 23.721227621483376, "grad_norm": 0.0001534064649604261, "learning_rate": 1.0692508710801394e-05, "loss": 0.3775, "step": 18550 }, { "epoch": 23.734015345268542, "grad_norm": 0.08756177872419357, "learning_rate": 1.0670731707317074e-05, "loss": 0.2456, "step": 18560 }, { "epoch": 23.74680306905371, "grad_norm": 1.3506488800048828, "learning_rate": 1.0648954703832753e-05, "loss": 0.2995, "step": 18570 }, { "epoch": 23.759590792838875, "grad_norm": 1.4564141035079956, "learning_rate": 1.0627177700348433e-05, "loss": 0.2127, "step": 18580 }, { "epoch": 23.77237851662404, "grad_norm": 5.594372749328613, "learning_rate": 1.0605400696864113e-05, "loss": 0.2294, "step": 18590 }, { "epoch": 23.785166240409207, "grad_norm": 0.6284129619598389, "learning_rate": 1.0583623693379793e-05, "loss": 0.239, "step": 18600 }, { "epoch": 23.797953964194374, "grad_norm": 0.047873031347990036, "learning_rate": 1.0561846689895471e-05, "loss": 0.1433, "step": 18610 }, { "epoch": 23.81074168797954, "grad_norm": 5.969231128692627, "learning_rate": 1.054006968641115e-05, "loss": 0.1204, "step": 18620 }, { "epoch": 23.823529411764707, "grad_norm": 11.529876708984375, "learning_rate": 1.051829268292683e-05, "loss": 0.2963, "step": 18630 }, { "epoch": 23.836317135549873, "grad_norm": 1.0211116075515747, "learning_rate": 1.049651567944251e-05, "loss": 0.1601, "step": 18640 }, { "epoch": 23.84910485933504, "grad_norm": 4.957517623901367, "learning_rate": 1.0474738675958188e-05, "loss": 0.2966, "step": 18650 }, { "epoch": 23.861892583120206, "grad_norm": 0.279568612575531, "learning_rate": 1.0452961672473868e-05, "loss": 0.3315, "step": 18660 }, { "epoch": 23.874680306905372, "grad_norm": 6.54602575302124, "learning_rate": 1.0431184668989548e-05, "loss": 0.183, "step": 18670 }, { "epoch": 23.88746803069054, "grad_norm": 9.698213577270508, "learning_rate": 1.0409407665505228e-05, "loss": 0.3088, "step": 18680 }, { "epoch": 23.900255754475705, "grad_norm": 7.419322967529297, "learning_rate": 1.0387630662020906e-05, "loss": 0.2057, "step": 18690 }, { "epoch": 23.91304347826087, "grad_norm": 0.023956669494509697, "learning_rate": 1.0365853658536585e-05, "loss": 0.2966, "step": 18700 }, { "epoch": 23.925831202046037, "grad_norm": 4.011311054229736, "learning_rate": 1.0344076655052265e-05, "loss": 0.1879, "step": 18710 }, { "epoch": 23.938618925831204, "grad_norm": 6.17871618270874, "learning_rate": 1.0322299651567945e-05, "loss": 0.4622, "step": 18720 }, { "epoch": 23.95140664961637, "grad_norm": 3.367823600769043, "learning_rate": 1.0300522648083625e-05, "loss": 0.1969, "step": 18730 }, { "epoch": 23.964194373401533, "grad_norm": 2.884002685546875, "learning_rate": 1.0278745644599303e-05, "loss": 0.1785, "step": 18740 }, { "epoch": 23.9769820971867, "grad_norm": 0.010596761479973793, "learning_rate": 1.0256968641114983e-05, "loss": 0.1556, "step": 18750 }, { "epoch": 23.989769820971865, "grad_norm": 3.776186531467829e-06, "learning_rate": 1.0235191637630663e-05, "loss": 0.1703, "step": 18760 }, { "epoch": 24.0, "eval_loss": 0.18299047648906708, "eval_runtime": 0.9773, "eval_samples_per_second": 100.272, "eval_steps_per_second": 13.301, "step": 18768 }, { "epoch": 24.002557544757032, "grad_norm": 1.9648569822311401, "learning_rate": 1.0213414634146342e-05, "loss": 0.1341, "step": 18770 }, { "epoch": 24.015345268542198, "grad_norm": 8.87406786205247e-05, "learning_rate": 1.019163763066202e-05, "loss": 0.072, "step": 18780 }, { "epoch": 24.028132992327365, "grad_norm": 1.8087506294250488, "learning_rate": 1.01698606271777e-05, "loss": 0.2022, "step": 18790 }, { "epoch": 24.04092071611253, "grad_norm": 0.00022683825227431953, "learning_rate": 1.014808362369338e-05, "loss": 0.219, "step": 18800 }, { "epoch": 24.053708439897697, "grad_norm": 8.788445848040283e-05, "learning_rate": 1.012630662020906e-05, "loss": 0.3452, "step": 18810 }, { "epoch": 24.066496163682864, "grad_norm": 1.596673846244812, "learning_rate": 1.0104529616724739e-05, "loss": 0.2268, "step": 18820 }, { "epoch": 24.07928388746803, "grad_norm": 5.229015350341797, "learning_rate": 1.0082752613240419e-05, "loss": 0.2382, "step": 18830 }, { "epoch": 24.092071611253196, "grad_norm": 0.011357109993696213, "learning_rate": 1.0060975609756099e-05, "loss": 0.2029, "step": 18840 }, { "epoch": 24.104859335038363, "grad_norm": 2.772266387939453, "learning_rate": 1.0039198606271777e-05, "loss": 0.1519, "step": 18850 }, { "epoch": 24.11764705882353, "grad_norm": 5.571649074554443, "learning_rate": 1.0017421602787457e-05, "loss": 0.299, "step": 18860 }, { "epoch": 24.130434782608695, "grad_norm": 4.471550941467285, "learning_rate": 9.995644599303135e-06, "loss": 0.1408, "step": 18870 }, { "epoch": 24.14322250639386, "grad_norm": 0.002381289144977927, "learning_rate": 9.973867595818816e-06, "loss": 0.2127, "step": 18880 }, { "epoch": 24.156010230179028, "grad_norm": 0.3346289396286011, "learning_rate": 9.952090592334496e-06, "loss": 0.3573, "step": 18890 }, { "epoch": 24.168797953964194, "grad_norm": 6.417741775512695, "learning_rate": 9.930313588850176e-06, "loss": 0.1684, "step": 18900 }, { "epoch": 24.18158567774936, "grad_norm": 0.4619981348514557, "learning_rate": 9.908536585365854e-06, "loss": 0.3421, "step": 18910 }, { "epoch": 24.194373401534527, "grad_norm": 2.542964458465576, "learning_rate": 9.886759581881534e-06, "loss": 0.1321, "step": 18920 }, { "epoch": 24.207161125319693, "grad_norm": 3.8259899616241455, "learning_rate": 9.864982578397212e-06, "loss": 0.301, "step": 18930 }, { "epoch": 24.21994884910486, "grad_norm": 3.5682637691497803, "learning_rate": 9.843205574912892e-06, "loss": 0.2302, "step": 18940 }, { "epoch": 24.232736572890026, "grad_norm": 7.466248512268066, "learning_rate": 9.821428571428573e-06, "loss": 0.2322, "step": 18950 }, { "epoch": 24.245524296675192, "grad_norm": 4.964677333831787, "learning_rate": 9.799651567944251e-06, "loss": 0.3205, "step": 18960 }, { "epoch": 24.25831202046036, "grad_norm": 0.6600395441055298, "learning_rate": 9.777874564459931e-06, "loss": 0.3198, "step": 18970 }, { "epoch": 24.271099744245525, "grad_norm": 0.0014609363861382008, "learning_rate": 9.756097560975611e-06, "loss": 0.2724, "step": 18980 }, { "epoch": 24.28388746803069, "grad_norm": 1.3109701871871948, "learning_rate": 9.734320557491291e-06, "loss": 0.4884, "step": 18990 }, { "epoch": 24.296675191815858, "grad_norm": 1.394028902053833, "learning_rate": 9.712543554006968e-06, "loss": 0.2897, "step": 19000 }, { "epoch": 24.309462915601024, "grad_norm": 3.291472911834717, "learning_rate": 9.690766550522648e-06, "loss": 0.2281, "step": 19010 }, { "epoch": 24.32225063938619, "grad_norm": 0.004317448474466801, "learning_rate": 9.668989547038328e-06, "loss": 0.1659, "step": 19020 }, { "epoch": 24.335038363171357, "grad_norm": 0.20717021822929382, "learning_rate": 9.647212543554008e-06, "loss": 0.2681, "step": 19030 }, { "epoch": 24.347826086956523, "grad_norm": 0.0002472202468197793, "learning_rate": 9.625435540069686e-06, "loss": 0.2402, "step": 19040 }, { "epoch": 24.36061381074169, "grad_norm": 8.822566032409668, "learning_rate": 9.603658536585366e-06, "loss": 0.2797, "step": 19050 }, { "epoch": 24.373401534526856, "grad_norm": 6.60060453414917, "learning_rate": 9.581881533101046e-06, "loss": 0.1299, "step": 19060 }, { "epoch": 24.38618925831202, "grad_norm": 0.008101239800453186, "learning_rate": 9.560104529616726e-06, "loss": 0.2473, "step": 19070 }, { "epoch": 24.398976982097185, "grad_norm": 1.5664035081863403, "learning_rate": 9.538327526132405e-06, "loss": 0.148, "step": 19080 }, { "epoch": 24.41176470588235, "grad_norm": 1.6293203830718994, "learning_rate": 9.516550522648083e-06, "loss": 0.0677, "step": 19090 }, { "epoch": 24.424552429667518, "grad_norm": 3.834322214126587, "learning_rate": 9.494773519163763e-06, "loss": 0.1932, "step": 19100 }, { "epoch": 24.437340153452684, "grad_norm": 0.9427624940872192, "learning_rate": 9.472996515679443e-06, "loss": 0.3893, "step": 19110 }, { "epoch": 24.45012787723785, "grad_norm": 5.037188529968262, "learning_rate": 9.451219512195123e-06, "loss": 0.2438, "step": 19120 }, { "epoch": 24.462915601023017, "grad_norm": 3.3101885318756104, "learning_rate": 9.429442508710802e-06, "loss": 0.1255, "step": 19130 }, { "epoch": 24.475703324808183, "grad_norm": 3.3141419887542725, "learning_rate": 9.407665505226482e-06, "loss": 0.2537, "step": 19140 }, { "epoch": 24.48849104859335, "grad_norm": 10.10535717010498, "learning_rate": 9.385888501742162e-06, "loss": 0.5465, "step": 19150 }, { "epoch": 24.501278772378516, "grad_norm": 2.9780263900756836, "learning_rate": 9.36411149825784e-06, "loss": 0.2291, "step": 19160 }, { "epoch": 24.514066496163682, "grad_norm": 0.05312852934002876, "learning_rate": 9.342334494773518e-06, "loss": 0.1619, "step": 19170 }, { "epoch": 24.52685421994885, "grad_norm": 0.6286771297454834, "learning_rate": 9.320557491289198e-06, "loss": 0.2531, "step": 19180 }, { "epoch": 24.539641943734015, "grad_norm": 10.184651374816895, "learning_rate": 9.298780487804879e-06, "loss": 0.3081, "step": 19190 }, { "epoch": 24.55242966751918, "grad_norm": 3.5842103958129883, "learning_rate": 9.277003484320559e-06, "loss": 0.1545, "step": 19200 }, { "epoch": 24.565217391304348, "grad_norm": 0.006370662711560726, "learning_rate": 9.255226480836237e-06, "loss": 0.2799, "step": 19210 }, { "epoch": 24.578005115089514, "grad_norm": 6.6300368309021, "learning_rate": 9.233449477351917e-06, "loss": 0.1704, "step": 19220 }, { "epoch": 24.59079283887468, "grad_norm": 0.029947593808174133, "learning_rate": 9.211672473867597e-06, "loss": 0.1921, "step": 19230 }, { "epoch": 24.603580562659847, "grad_norm": 6.361519813537598, "learning_rate": 9.189895470383275e-06, "loss": 0.386, "step": 19240 }, { "epoch": 24.616368286445013, "grad_norm": 8.52017593383789, "learning_rate": 9.168118466898955e-06, "loss": 0.1906, "step": 19250 }, { "epoch": 24.62915601023018, "grad_norm": 0.013848032802343369, "learning_rate": 9.146341463414634e-06, "loss": 0.1493, "step": 19260 }, { "epoch": 24.641943734015346, "grad_norm": 0.8284606337547302, "learning_rate": 9.124564459930314e-06, "loss": 0.1113, "step": 19270 }, { "epoch": 24.654731457800512, "grad_norm": 0.40283098816871643, "learning_rate": 9.102787456445994e-06, "loss": 0.211, "step": 19280 }, { "epoch": 24.66751918158568, "grad_norm": 1.3809633255004883, "learning_rate": 9.081010452961674e-06, "loss": 0.1649, "step": 19290 }, { "epoch": 24.680306905370845, "grad_norm": 0.8382818102836609, "learning_rate": 9.059233449477352e-06, "loss": 0.3372, "step": 19300 }, { "epoch": 24.69309462915601, "grad_norm": 3.602656126022339, "learning_rate": 9.037456445993032e-06, "loss": 0.1013, "step": 19310 }, { "epoch": 24.705882352941178, "grad_norm": 3.5405521392822266, "learning_rate": 9.01567944250871e-06, "loss": 0.203, "step": 19320 }, { "epoch": 24.718670076726344, "grad_norm": 2.895538330078125, "learning_rate": 8.99390243902439e-06, "loss": 0.1782, "step": 19330 }, { "epoch": 24.73145780051151, "grad_norm": 0.021150898188352585, "learning_rate": 8.972125435540069e-06, "loss": 0.2326, "step": 19340 }, { "epoch": 24.744245524296677, "grad_norm": 6.41797399520874, "learning_rate": 8.95034843205575e-06, "loss": 0.3114, "step": 19350 }, { "epoch": 24.757033248081843, "grad_norm": 4.61984395980835, "learning_rate": 8.92857142857143e-06, "loss": 0.2526, "step": 19360 }, { "epoch": 24.76982097186701, "grad_norm": 0.7206798195838928, "learning_rate": 8.90679442508711e-06, "loss": 0.2294, "step": 19370 }, { "epoch": 24.782608695652176, "grad_norm": 0.021268386393785477, "learning_rate": 8.885017421602788e-06, "loss": 0.2442, "step": 19380 }, { "epoch": 24.79539641943734, "grad_norm": 1.2543253898620605, "learning_rate": 8.863240418118466e-06, "loss": 0.3759, "step": 19390 }, { "epoch": 24.808184143222505, "grad_norm": 6.605262279510498, "learning_rate": 8.841463414634146e-06, "loss": 0.2954, "step": 19400 }, { "epoch": 24.82097186700767, "grad_norm": 0.008203168399631977, "learning_rate": 8.819686411149826e-06, "loss": 0.3309, "step": 19410 }, { "epoch": 24.833759590792837, "grad_norm": 0.682137131690979, "learning_rate": 8.797909407665506e-06, "loss": 0.1424, "step": 19420 }, { "epoch": 24.846547314578004, "grad_norm": 0.4394625723361969, "learning_rate": 8.776132404181185e-06, "loss": 0.3464, "step": 19430 }, { "epoch": 24.85933503836317, "grad_norm": 0.0014602456940338016, "learning_rate": 8.754355400696865e-06, "loss": 0.0397, "step": 19440 }, { "epoch": 24.872122762148337, "grad_norm": 4.264637470245361, "learning_rate": 8.732578397212545e-06, "loss": 0.4722, "step": 19450 }, { "epoch": 24.884910485933503, "grad_norm": 1.0657953023910522, "learning_rate": 8.710801393728225e-06, "loss": 0.0911, "step": 19460 }, { "epoch": 24.89769820971867, "grad_norm": 1.0639190673828125, "learning_rate": 8.689024390243903e-06, "loss": 0.2563, "step": 19470 }, { "epoch": 24.910485933503836, "grad_norm": 0.0038846738170832396, "learning_rate": 8.667247386759581e-06, "loss": 0.2149, "step": 19480 }, { "epoch": 24.923273657289002, "grad_norm": 5.282316207885742, "learning_rate": 8.645470383275261e-06, "loss": 0.2069, "step": 19490 }, { "epoch": 24.93606138107417, "grad_norm": 0.00032713127438910306, "learning_rate": 8.623693379790942e-06, "loss": 0.1029, "step": 19500 }, { "epoch": 24.948849104859335, "grad_norm": 4.110812187194824, "learning_rate": 8.601916376306622e-06, "loss": 0.296, "step": 19510 }, { "epoch": 24.9616368286445, "grad_norm": 0.4580451250076294, "learning_rate": 8.5801393728223e-06, "loss": 0.3531, "step": 19520 }, { "epoch": 24.974424552429667, "grad_norm": 0.22135113179683685, "learning_rate": 8.55836236933798e-06, "loss": 0.2273, "step": 19530 }, { "epoch": 24.987212276214834, "grad_norm": 5.510282039642334, "learning_rate": 8.53658536585366e-06, "loss": 0.1763, "step": 19540 }, { "epoch": 25.0, "grad_norm": 1.8533436059951782, "learning_rate": 8.514808362369338e-06, "loss": 0.2936, "step": 19550 }, { "epoch": 25.0, "eval_loss": 0.18129856884479523, "eval_runtime": 0.8207, "eval_samples_per_second": 119.413, "eval_steps_per_second": 15.84, "step": 19550 }, { "epoch": 25.012787723785166, "grad_norm": 0.0017755866283550858, "learning_rate": 8.493031358885017e-06, "loss": 0.1499, "step": 19560 }, { "epoch": 25.025575447570333, "grad_norm": 0.0011428899597376585, "learning_rate": 8.471254355400697e-06, "loss": 0.2138, "step": 19570 }, { "epoch": 25.0383631713555, "grad_norm": 0.0004178075469098985, "learning_rate": 8.449477351916377e-06, "loss": 0.1578, "step": 19580 }, { "epoch": 25.051150895140665, "grad_norm": 0.8963024020195007, "learning_rate": 8.427700348432057e-06, "loss": 0.1294, "step": 19590 }, { "epoch": 25.06393861892583, "grad_norm": 3.469683885574341, "learning_rate": 8.405923344947735e-06, "loss": 0.2863, "step": 19600 }, { "epoch": 25.076726342710998, "grad_norm": 0.030634768307209015, "learning_rate": 8.384146341463415e-06, "loss": 0.1694, "step": 19610 }, { "epoch": 25.089514066496164, "grad_norm": 0.008396928198635578, "learning_rate": 8.362369337979095e-06, "loss": 0.303, "step": 19620 }, { "epoch": 25.10230179028133, "grad_norm": 0.4458426237106323, "learning_rate": 8.340592334494774e-06, "loss": 0.4059, "step": 19630 }, { "epoch": 25.115089514066497, "grad_norm": 2.2187139987945557, "learning_rate": 8.318815331010454e-06, "loss": 0.2115, "step": 19640 }, { "epoch": 25.127877237851663, "grad_norm": 4.11098051071167, "learning_rate": 8.297038327526132e-06, "loss": 0.3253, "step": 19650 }, { "epoch": 25.14066496163683, "grad_norm": 3.1576087474823, "learning_rate": 8.275261324041812e-06, "loss": 0.3276, "step": 19660 }, { "epoch": 25.153452685421996, "grad_norm": 2.0662500858306885, "learning_rate": 8.253484320557492e-06, "loss": 0.1996, "step": 19670 }, { "epoch": 25.166240409207163, "grad_norm": 0.3466600775718689, "learning_rate": 8.231707317073172e-06, "loss": 0.1362, "step": 19680 }, { "epoch": 25.17902813299233, "grad_norm": 6.13591194152832, "learning_rate": 8.20993031358885e-06, "loss": 0.2625, "step": 19690 }, { "epoch": 25.191815856777495, "grad_norm": 1.5550795793533325, "learning_rate": 8.188153310104529e-06, "loss": 0.0495, "step": 19700 }, { "epoch": 25.20460358056266, "grad_norm": 3.5859768390655518, "learning_rate": 8.166376306620209e-06, "loss": 0.3586, "step": 19710 }, { "epoch": 25.217391304347824, "grad_norm": 0.09591033309698105, "learning_rate": 8.144599303135889e-06, "loss": 0.0664, "step": 19720 }, { "epoch": 25.23017902813299, "grad_norm": 1.0985368490219116, "learning_rate": 8.122822299651567e-06, "loss": 0.2419, "step": 19730 }, { "epoch": 25.242966751918157, "grad_norm": 7.924480438232422, "learning_rate": 8.101045296167248e-06, "loss": 0.2375, "step": 19740 }, { "epoch": 25.255754475703323, "grad_norm": 5.2205634117126465, "learning_rate": 8.079268292682928e-06, "loss": 0.4546, "step": 19750 }, { "epoch": 25.26854219948849, "grad_norm": 1.323668360710144, "learning_rate": 8.057491289198608e-06, "loss": 0.2154, "step": 19760 }, { "epoch": 25.281329923273656, "grad_norm": 3.7929680347442627, "learning_rate": 8.035714285714286e-06, "loss": 0.178, "step": 19770 }, { "epoch": 25.294117647058822, "grad_norm": 0.025306949391961098, "learning_rate": 8.013937282229964e-06, "loss": 0.1248, "step": 19780 }, { "epoch": 25.30690537084399, "grad_norm": 0.06079568713903427, "learning_rate": 7.992160278745644e-06, "loss": 0.0619, "step": 19790 }, { "epoch": 25.319693094629155, "grad_norm": 6.62827730178833, "learning_rate": 7.970383275261324e-06, "loss": 0.1882, "step": 19800 }, { "epoch": 25.33248081841432, "grad_norm": 0.04379437118768692, "learning_rate": 7.948606271777004e-06, "loss": 0.2076, "step": 19810 }, { "epoch": 25.345268542199488, "grad_norm": 3.640428304672241, "learning_rate": 7.926829268292683e-06, "loss": 0.2472, "step": 19820 }, { "epoch": 25.358056265984654, "grad_norm": 0.32952073216438293, "learning_rate": 7.905052264808363e-06, "loss": 0.2234, "step": 19830 }, { "epoch": 25.37084398976982, "grad_norm": 0.0016094455495476723, "learning_rate": 7.883275261324043e-06, "loss": 0.1618, "step": 19840 }, { "epoch": 25.383631713554987, "grad_norm": 2.6373326778411865, "learning_rate": 7.861498257839723e-06, "loss": 0.2415, "step": 19850 }, { "epoch": 25.396419437340153, "grad_norm": 0.0004819360328838229, "learning_rate": 7.8397212543554e-06, "loss": 0.2548, "step": 19860 }, { "epoch": 25.40920716112532, "grad_norm": 2.0560288429260254, "learning_rate": 7.81794425087108e-06, "loss": 0.1283, "step": 19870 }, { "epoch": 25.421994884910486, "grad_norm": 5.6980133056640625, "learning_rate": 7.79616724738676e-06, "loss": 0.2889, "step": 19880 }, { "epoch": 25.434782608695652, "grad_norm": 3.6380951404571533, "learning_rate": 7.77439024390244e-06, "loss": 0.2188, "step": 19890 }, { "epoch": 25.44757033248082, "grad_norm": 3.955148458480835, "learning_rate": 7.752613240418118e-06, "loss": 0.3087, "step": 19900 }, { "epoch": 25.460358056265985, "grad_norm": 0.12818066775798798, "learning_rate": 7.730836236933798e-06, "loss": 0.2401, "step": 19910 }, { "epoch": 25.47314578005115, "grad_norm": 1.8835790157318115, "learning_rate": 7.709059233449478e-06, "loss": 0.1892, "step": 19920 }, { "epoch": 25.485933503836318, "grad_norm": 1.2346657514572144, "learning_rate": 7.687282229965158e-06, "loss": 0.152, "step": 19930 }, { "epoch": 25.498721227621484, "grad_norm": 5.731377124786377, "learning_rate": 7.665505226480837e-06, "loss": 0.1053, "step": 19940 }, { "epoch": 25.51150895140665, "grad_norm": 0.0032745820935815573, "learning_rate": 7.643728222996515e-06, "loss": 0.3134, "step": 19950 }, { "epoch": 25.524296675191817, "grad_norm": 5.341978073120117, "learning_rate": 7.621951219512195e-06, "loss": 0.2932, "step": 19960 }, { "epoch": 25.537084398976983, "grad_norm": 2.9654135704040527, "learning_rate": 7.600174216027875e-06, "loss": 0.3528, "step": 19970 }, { "epoch": 25.54987212276215, "grad_norm": 0.0003606612444855273, "learning_rate": 7.578397212543554e-06, "loss": 0.1237, "step": 19980 }, { "epoch": 25.562659846547316, "grad_norm": 0.044729363173246384, "learning_rate": 7.5566202090592344e-06, "loss": 0.3323, "step": 19990 }, { "epoch": 25.575447570332482, "grad_norm": 0.0004105431435164064, "learning_rate": 7.534843205574914e-06, "loss": 0.1374, "step": 20000 }, { "epoch": 25.58823529411765, "grad_norm": 3.488405227661133, "learning_rate": 7.513066202090592e-06, "loss": 0.2517, "step": 20010 }, { "epoch": 25.601023017902815, "grad_norm": 0.0034214009065181017, "learning_rate": 7.491289198606272e-06, "loss": 0.4066, "step": 20020 }, { "epoch": 25.61381074168798, "grad_norm": 3.4584133625030518, "learning_rate": 7.469512195121951e-06, "loss": 0.2964, "step": 20030 }, { "epoch": 25.626598465473144, "grad_norm": 0.000964305188972503, "learning_rate": 7.447735191637631e-06, "loss": 0.2671, "step": 20040 }, { "epoch": 25.63938618925831, "grad_norm": 2.860732078552246, "learning_rate": 7.4259581881533105e-06, "loss": 0.1305, "step": 20050 }, { "epoch": 25.652173913043477, "grad_norm": 2.513005495071411, "learning_rate": 7.4041811846689906e-06, "loss": 0.2967, "step": 20060 }, { "epoch": 25.664961636828643, "grad_norm": 3.4734206199645996, "learning_rate": 7.38240418118467e-06, "loss": 0.2739, "step": 20070 }, { "epoch": 25.67774936061381, "grad_norm": 0.8101391792297363, "learning_rate": 7.360627177700349e-06, "loss": 0.2415, "step": 20080 }, { "epoch": 25.690537084398976, "grad_norm": 0.00019734690431505442, "learning_rate": 7.338850174216027e-06, "loss": 0.2654, "step": 20090 }, { "epoch": 25.703324808184142, "grad_norm": 1.5274137258529663, "learning_rate": 7.317073170731707e-06, "loss": 0.1765, "step": 20100 }, { "epoch": 25.71611253196931, "grad_norm": 0.002199924550950527, "learning_rate": 7.295296167247387e-06, "loss": 0.4541, "step": 20110 }, { "epoch": 25.728900255754475, "grad_norm": 0.11991043388843536, "learning_rate": 7.273519163763067e-06, "loss": 0.1748, "step": 20120 }, { "epoch": 25.74168797953964, "grad_norm": 2.977264642715454, "learning_rate": 7.251742160278746e-06, "loss": 0.187, "step": 20130 }, { "epoch": 25.754475703324808, "grad_norm": 6.003963947296143, "learning_rate": 7.229965156794426e-06, "loss": 0.3517, "step": 20140 }, { "epoch": 25.767263427109974, "grad_norm": 0.1923568695783615, "learning_rate": 7.208188153310105e-06, "loss": 0.2537, "step": 20150 }, { "epoch": 25.78005115089514, "grad_norm": 6.205975532531738, "learning_rate": 7.186411149825785e-06, "loss": 0.2591, "step": 20160 }, { "epoch": 25.792838874680307, "grad_norm": 5.6605753898620605, "learning_rate": 7.1646341463414635e-06, "loss": 0.401, "step": 20170 }, { "epoch": 25.805626598465473, "grad_norm": 1.7978140115737915, "learning_rate": 7.142857142857143e-06, "loss": 0.184, "step": 20180 }, { "epoch": 25.81841432225064, "grad_norm": 1.2769854068756104, "learning_rate": 7.121080139372823e-06, "loss": 0.2435, "step": 20190 }, { "epoch": 25.831202046035806, "grad_norm": 6.43965482711792, "learning_rate": 7.099303135888502e-06, "loss": 0.3531, "step": 20200 }, { "epoch": 25.843989769820972, "grad_norm": 6.394824981689453, "learning_rate": 7.077526132404182e-06, "loss": 0.2703, "step": 20210 }, { "epoch": 25.85677749360614, "grad_norm": 3.0788609981536865, "learning_rate": 7.055749128919861e-06, "loss": 0.3849, "step": 20220 }, { "epoch": 25.869565217391305, "grad_norm": 2.7930312156677246, "learning_rate": 7.033972125435541e-06, "loss": 0.3325, "step": 20230 }, { "epoch": 25.88235294117647, "grad_norm": 5.459298133850098, "learning_rate": 7.0121951219512205e-06, "loss": 0.2963, "step": 20240 }, { "epoch": 25.895140664961637, "grad_norm": 3.3363773822784424, "learning_rate": 6.990418118466899e-06, "loss": 0.1328, "step": 20250 }, { "epoch": 25.907928388746804, "grad_norm": 0.9299935102462769, "learning_rate": 6.968641114982578e-06, "loss": 0.1498, "step": 20260 }, { "epoch": 25.92071611253197, "grad_norm": 0.6399292349815369, "learning_rate": 6.946864111498258e-06, "loss": 0.2621, "step": 20270 }, { "epoch": 25.933503836317136, "grad_norm": 3.862414836883545, "learning_rate": 6.925087108013937e-06, "loss": 0.1487, "step": 20280 }, { "epoch": 25.946291560102303, "grad_norm": 2.322935104370117, "learning_rate": 6.903310104529617e-06, "loss": 0.1076, "step": 20290 }, { "epoch": 25.95907928388747, "grad_norm": 2.0821304321289062, "learning_rate": 6.8815331010452966e-06, "loss": 0.1143, "step": 20300 }, { "epoch": 25.971867007672635, "grad_norm": 1.0014243125915527, "learning_rate": 6.859756097560977e-06, "loss": 0.2203, "step": 20310 }, { "epoch": 25.984654731457802, "grad_norm": 5.317222595214844, "learning_rate": 6.837979094076655e-06, "loss": 0.2636, "step": 20320 }, { "epoch": 25.997442455242968, "grad_norm": 2.082972526550293, "learning_rate": 6.816202090592334e-06, "loss": 0.2516, "step": 20330 }, { "epoch": 26.0, "eval_loss": 0.18091708421707153, "eval_runtime": 0.9859, "eval_samples_per_second": 99.403, "eval_steps_per_second": 13.186, "step": 20332 }, { "epoch": 26.010230179028135, "grad_norm": 0.04056939110159874, "learning_rate": 6.794425087108014e-06, "loss": 0.1934, "step": 20340 }, { "epoch": 26.0230179028133, "grad_norm": 0.04060182347893715, "learning_rate": 6.7726480836236934e-06, "loss": 0.1245, "step": 20350 }, { "epoch": 26.035805626598467, "grad_norm": 0.0002620798477437347, "learning_rate": 6.7508710801393735e-06, "loss": 0.1996, "step": 20360 }, { "epoch": 26.04859335038363, "grad_norm": 0.953183650970459, "learning_rate": 6.729094076655053e-06, "loss": 0.1799, "step": 20370 }, { "epoch": 26.061381074168796, "grad_norm": 1.502600908279419, "learning_rate": 6.707317073170733e-06, "loss": 0.2447, "step": 20380 }, { "epoch": 26.074168797953963, "grad_norm": 3.2614762783050537, "learning_rate": 6.685540069686412e-06, "loss": 0.1171, "step": 20390 }, { "epoch": 26.08695652173913, "grad_norm": 6.770511627197266, "learning_rate": 6.66376306620209e-06, "loss": 0.2494, "step": 20400 }, { "epoch": 26.099744245524295, "grad_norm": 7.8673834800720215, "learning_rate": 6.6419860627177695e-06, "loss": 0.2198, "step": 20410 }, { "epoch": 26.11253196930946, "grad_norm": 0.6085989475250244, "learning_rate": 6.6202090592334496e-06, "loss": 0.0444, "step": 20420 }, { "epoch": 26.125319693094628, "grad_norm": 9.129613876342773, "learning_rate": 6.598432055749129e-06, "loss": 0.218, "step": 20430 }, { "epoch": 26.138107416879794, "grad_norm": 4.52681303024292, "learning_rate": 6.576655052264809e-06, "loss": 0.4481, "step": 20440 }, { "epoch": 26.15089514066496, "grad_norm": 10.57491683959961, "learning_rate": 6.554878048780488e-06, "loss": 0.2445, "step": 20450 }, { "epoch": 26.163682864450127, "grad_norm": 5.382693767547607, "learning_rate": 6.533101045296168e-06, "loss": 0.2412, "step": 20460 }, { "epoch": 26.176470588235293, "grad_norm": 1.7417715787887573, "learning_rate": 6.511324041811847e-06, "loss": 0.0359, "step": 20470 }, { "epoch": 26.18925831202046, "grad_norm": 0.5911231637001038, "learning_rate": 6.489547038327526e-06, "loss": 0.2865, "step": 20480 }, { "epoch": 26.202046035805626, "grad_norm": 1.412896990776062, "learning_rate": 6.467770034843206e-06, "loss": 0.1666, "step": 20490 }, { "epoch": 26.214833759590793, "grad_norm": 1.6051143407821655, "learning_rate": 6.445993031358885e-06, "loss": 0.2141, "step": 20500 }, { "epoch": 26.22762148337596, "grad_norm": 2.5996508598327637, "learning_rate": 6.424216027874565e-06, "loss": 0.2444, "step": 20510 }, { "epoch": 26.240409207161125, "grad_norm": 5.24091100692749, "learning_rate": 6.402439024390244e-06, "loss": 0.3557, "step": 20520 }, { "epoch": 26.25319693094629, "grad_norm": 0.9117621183395386, "learning_rate": 6.380662020905924e-06, "loss": 0.1892, "step": 20530 }, { "epoch": 26.265984654731458, "grad_norm": 8.387924194335938, "learning_rate": 6.3588850174216034e-06, "loss": 0.1204, "step": 20540 }, { "epoch": 26.278772378516624, "grad_norm": 0.32349109649658203, "learning_rate": 6.3371080139372835e-06, "loss": 0.1098, "step": 20550 }, { "epoch": 26.29156010230179, "grad_norm": 3.7538986206054688, "learning_rate": 6.315331010452962e-06, "loss": 0.3294, "step": 20560 }, { "epoch": 26.304347826086957, "grad_norm": 0.8818411827087402, "learning_rate": 6.293554006968641e-06, "loss": 0.1728, "step": 20570 }, { "epoch": 26.317135549872123, "grad_norm": 0.0003087719378527254, "learning_rate": 6.271777003484321e-06, "loss": 0.2044, "step": 20580 }, { "epoch": 26.32992327365729, "grad_norm": 4.227612495422363, "learning_rate": 6.25e-06, "loss": 0.1996, "step": 20590 }, { "epoch": 26.342710997442456, "grad_norm": 8.944961547851562, "learning_rate": 6.2282229965156795e-06, "loss": 0.308, "step": 20600 }, { "epoch": 26.355498721227622, "grad_norm": 0.040556784719228745, "learning_rate": 6.206445993031359e-06, "loss": 0.1636, "step": 20610 }, { "epoch": 26.36828644501279, "grad_norm": 0.9798309206962585, "learning_rate": 6.184668989547039e-06, "loss": 0.1374, "step": 20620 }, { "epoch": 26.381074168797955, "grad_norm": 12.984233856201172, "learning_rate": 6.162891986062718e-06, "loss": 0.3852, "step": 20630 }, { "epoch": 26.39386189258312, "grad_norm": 3.31282901763916, "learning_rate": 6.141114982578398e-06, "loss": 0.2366, "step": 20640 }, { "epoch": 26.406649616368288, "grad_norm": 3.520981788635254, "learning_rate": 6.119337979094076e-06, "loss": 0.3183, "step": 20650 }, { "epoch": 26.419437340153454, "grad_norm": 4.874128818511963, "learning_rate": 6.0975609756097564e-06, "loss": 0.3643, "step": 20660 }, { "epoch": 26.43222506393862, "grad_norm": 2.41288685798645, "learning_rate": 6.075783972125436e-06, "loss": 0.2655, "step": 20670 }, { "epoch": 26.445012787723787, "grad_norm": 0.03236719220876694, "learning_rate": 6.054006968641116e-06, "loss": 0.1718, "step": 20680 }, { "epoch": 26.45780051150895, "grad_norm": 1.0233948230743408, "learning_rate": 6.032229965156794e-06, "loss": 0.2505, "step": 20690 }, { "epoch": 26.470588235294116, "grad_norm": 7.505443096160889, "learning_rate": 6.010452961672474e-06, "loss": 0.3001, "step": 20700 }, { "epoch": 26.483375959079282, "grad_norm": 0.565833330154419, "learning_rate": 5.988675958188153e-06, "loss": 0.2213, "step": 20710 }, { "epoch": 26.49616368286445, "grad_norm": 2.7456254959106445, "learning_rate": 5.966898954703833e-06, "loss": 0.4087, "step": 20720 }, { "epoch": 26.508951406649615, "grad_norm": 8.262672424316406, "learning_rate": 5.9451219512195126e-06, "loss": 0.2174, "step": 20730 }, { "epoch": 26.52173913043478, "grad_norm": 1.9031331539154053, "learning_rate": 5.923344947735192e-06, "loss": 0.254, "step": 20740 }, { "epoch": 26.534526854219948, "grad_norm": 8.823184967041016, "learning_rate": 5.901567944250872e-06, "loss": 0.5003, "step": 20750 }, { "epoch": 26.547314578005114, "grad_norm": 0.31433001160621643, "learning_rate": 5.879790940766551e-06, "loss": 0.2981, "step": 20760 }, { "epoch": 26.56010230179028, "grad_norm": 5.347137451171875, "learning_rate": 5.85801393728223e-06, "loss": 0.4345, "step": 20770 }, { "epoch": 26.572890025575447, "grad_norm": 0.94664466381073, "learning_rate": 5.8362369337979094e-06, "loss": 0.2791, "step": 20780 }, { "epoch": 26.585677749360613, "grad_norm": 1.536033272743225, "learning_rate": 5.8144599303135895e-06, "loss": 0.1489, "step": 20790 }, { "epoch": 26.59846547314578, "grad_norm": 0.5675119757652283, "learning_rate": 5.792682926829269e-06, "loss": 0.2372, "step": 20800 }, { "epoch": 26.611253196930946, "grad_norm": 0.9500277638435364, "learning_rate": 5.770905923344948e-06, "loss": 0.2659, "step": 20810 }, { "epoch": 26.624040920716112, "grad_norm": 6.701987266540527, "learning_rate": 5.749128919860627e-06, "loss": 0.1731, "step": 20820 }, { "epoch": 26.63682864450128, "grad_norm": 5.811104774475098, "learning_rate": 5.727351916376307e-06, "loss": 0.2963, "step": 20830 }, { "epoch": 26.649616368286445, "grad_norm": 1.2386894226074219, "learning_rate": 5.705574912891986e-06, "loss": 0.1571, "step": 20840 }, { "epoch": 26.66240409207161, "grad_norm": 3.480888843536377, "learning_rate": 5.6837979094076656e-06, "loss": 0.3523, "step": 20850 }, { "epoch": 26.675191815856778, "grad_norm": 0.00016630532627459615, "learning_rate": 5.662020905923345e-06, "loss": 0.0731, "step": 20860 }, { "epoch": 26.687979539641944, "grad_norm": 6.070654392242432, "learning_rate": 5.640243902439025e-06, "loss": 0.2237, "step": 20870 }, { "epoch": 26.70076726342711, "grad_norm": 3.9732308387756348, "learning_rate": 5.618466898954704e-06, "loss": 0.2249, "step": 20880 }, { "epoch": 26.713554987212277, "grad_norm": 1.2122111320495605, "learning_rate": 5.596689895470383e-06, "loss": 0.145, "step": 20890 }, { "epoch": 26.726342710997443, "grad_norm": 4.583484649658203, "learning_rate": 5.574912891986063e-06, "loss": 0.2378, "step": 20900 }, { "epoch": 26.73913043478261, "grad_norm": 3.717529535293579, "learning_rate": 5.5531358885017425e-06, "loss": 0.2853, "step": 20910 }, { "epoch": 26.751918158567776, "grad_norm": 4.097237586975098, "learning_rate": 5.531358885017422e-06, "loss": 0.2265, "step": 20920 }, { "epoch": 26.764705882352942, "grad_norm": 0.002175225643441081, "learning_rate": 5.509581881533101e-06, "loss": 0.0507, "step": 20930 }, { "epoch": 26.77749360613811, "grad_norm": 0.701464831829071, "learning_rate": 5.487804878048781e-06, "loss": 0.265, "step": 20940 }, { "epoch": 26.790281329923275, "grad_norm": 0.14686907827854156, "learning_rate": 5.46602787456446e-06, "loss": 0.0828, "step": 20950 }, { "epoch": 26.80306905370844, "grad_norm": 2.6718368530273438, "learning_rate": 5.444250871080139e-06, "loss": 0.3893, "step": 20960 }, { "epoch": 26.815856777493607, "grad_norm": 0.008024285547435284, "learning_rate": 5.4224738675958186e-06, "loss": 0.206, "step": 20970 }, { "epoch": 26.828644501278774, "grad_norm": 0.00877557322382927, "learning_rate": 5.400696864111499e-06, "loss": 0.0554, "step": 20980 }, { "epoch": 26.84143222506394, "grad_norm": 0.3502082824707031, "learning_rate": 5.378919860627178e-06, "loss": 0.2968, "step": 20990 }, { "epoch": 26.854219948849106, "grad_norm": 2.3258226065081544e-05, "learning_rate": 5.357142857142857e-06, "loss": 0.1385, "step": 21000 }, { "epoch": 26.867007672634273, "grad_norm": 0.03361507132649422, "learning_rate": 5.335365853658537e-06, "loss": 0.1328, "step": 21010 }, { "epoch": 26.87979539641944, "grad_norm": 0.9654991030693054, "learning_rate": 5.313588850174216e-06, "loss": 0.2115, "step": 21020 }, { "epoch": 26.892583120204602, "grad_norm": 0.005103932227939367, "learning_rate": 5.291811846689896e-06, "loss": 0.1815, "step": 21030 }, { "epoch": 26.90537084398977, "grad_norm": 7.247010216815397e-05, "learning_rate": 5.270034843205575e-06, "loss": 0.1699, "step": 21040 }, { "epoch": 26.918158567774935, "grad_norm": 0.0003093885607086122, "learning_rate": 5.248257839721255e-06, "loss": 0.1997, "step": 21050 }, { "epoch": 26.9309462915601, "grad_norm": 4.525994777679443, "learning_rate": 5.226480836236934e-06, "loss": 0.1211, "step": 21060 }, { "epoch": 26.943734015345267, "grad_norm": 2.7894208431243896, "learning_rate": 5.204703832752614e-06, "loss": 0.2392, "step": 21070 }, { "epoch": 26.956521739130434, "grad_norm": 0.9026535153388977, "learning_rate": 5.182926829268292e-06, "loss": 0.3208, "step": 21080 }, { "epoch": 26.9693094629156, "grad_norm": 1.8788063526153564, "learning_rate": 5.1611498257839724e-06, "loss": 0.2059, "step": 21090 }, { "epoch": 26.982097186700766, "grad_norm": 2.9115092754364014, "learning_rate": 5.139372822299652e-06, "loss": 0.2544, "step": 21100 }, { "epoch": 26.994884910485933, "grad_norm": 1.7984645366668701, "learning_rate": 5.117595818815332e-06, "loss": 0.0378, "step": 21110 }, { "epoch": 27.0, "eval_loss": 0.1803685426712036, "eval_runtime": 0.9925, "eval_samples_per_second": 98.745, "eval_steps_per_second": 13.099, "step": 21114 }, { "epoch": 27.0076726342711, "grad_norm": 2.6370112895965576, "learning_rate": 5.09581881533101e-06, "loss": 0.3032, "step": 21120 }, { "epoch": 27.020460358056265, "grad_norm": 3.5613858699798584, "learning_rate": 5.07404181184669e-06, "loss": 0.2893, "step": 21130 }, { "epoch": 27.033248081841432, "grad_norm": 5.88556432723999, "learning_rate": 5.052264808362369e-06, "loss": 0.1514, "step": 21140 }, { "epoch": 27.046035805626598, "grad_norm": 0.1507536619901657, "learning_rate": 5.030487804878049e-06, "loss": 0.1593, "step": 21150 }, { "epoch": 27.058823529411764, "grad_norm": 5.69314432144165, "learning_rate": 5.0087108013937286e-06, "loss": 0.289, "step": 21160 }, { "epoch": 27.07161125319693, "grad_norm": 1.0300965309143066, "learning_rate": 4.986933797909408e-06, "loss": 0.1587, "step": 21170 }, { "epoch": 27.084398976982097, "grad_norm": 0.0008522561402060091, "learning_rate": 4.965156794425088e-06, "loss": 0.1631, "step": 21180 }, { "epoch": 27.097186700767264, "grad_norm": 1.6342631578445435, "learning_rate": 4.943379790940767e-06, "loss": 0.1555, "step": 21190 }, { "epoch": 27.10997442455243, "grad_norm": 1.1432753801345825, "learning_rate": 4.921602787456446e-06, "loss": 0.2425, "step": 21200 }, { "epoch": 27.122762148337596, "grad_norm": 1.992614507675171, "learning_rate": 4.8998257839721254e-06, "loss": 0.3561, "step": 21210 }, { "epoch": 27.135549872122763, "grad_norm": 2.2932474613189697, "learning_rate": 4.8780487804878055e-06, "loss": 0.1691, "step": 21220 }, { "epoch": 27.14833759590793, "grad_norm": 1.525471806526184, "learning_rate": 4.856271777003484e-06, "loss": 0.1616, "step": 21230 }, { "epoch": 27.161125319693095, "grad_norm": 3.714881420135498, "learning_rate": 4.834494773519164e-06, "loss": 0.3203, "step": 21240 }, { "epoch": 27.17391304347826, "grad_norm": 3.990633964538574, "learning_rate": 4.812717770034843e-06, "loss": 0.254, "step": 21250 }, { "epoch": 27.186700767263428, "grad_norm": 1.0952008962631226, "learning_rate": 4.790940766550523e-06, "loss": 0.1943, "step": 21260 }, { "epoch": 27.199488491048594, "grad_norm": 2.789057970046997, "learning_rate": 4.769163763066202e-06, "loss": 0.1023, "step": 21270 }, { "epoch": 27.21227621483376, "grad_norm": 1.829593539237976, "learning_rate": 4.7473867595818816e-06, "loss": 0.183, "step": 21280 }, { "epoch": 27.225063938618927, "grad_norm": 0.681667685508728, "learning_rate": 4.725609756097562e-06, "loss": 0.1145, "step": 21290 }, { "epoch": 27.237851662404093, "grad_norm": 6.536792755126953, "learning_rate": 4.703832752613241e-06, "loss": 0.3635, "step": 21300 }, { "epoch": 27.25063938618926, "grad_norm": 4.994762897491455, "learning_rate": 4.68205574912892e-06, "loss": 0.5282, "step": 21310 }, { "epoch": 27.263427109974426, "grad_norm": 6.329887866973877, "learning_rate": 4.660278745644599e-06, "loss": 0.1123, "step": 21320 }, { "epoch": 27.276214833759592, "grad_norm": 0.2964077591896057, "learning_rate": 4.638501742160279e-06, "loss": 0.3743, "step": 21330 }, { "epoch": 27.289002557544755, "grad_norm": 5.211808204650879, "learning_rate": 4.6167247386759585e-06, "loss": 0.34, "step": 21340 }, { "epoch": 27.30179028132992, "grad_norm": 2.559980869293213, "learning_rate": 4.594947735191638e-06, "loss": 0.2149, "step": 21350 }, { "epoch": 27.314578005115088, "grad_norm": 6.942402362823486, "learning_rate": 4.573170731707317e-06, "loss": 0.2281, "step": 21360 }, { "epoch": 27.327365728900254, "grad_norm": 0.011685797944664955, "learning_rate": 4.551393728222997e-06, "loss": 0.1857, "step": 21370 }, { "epoch": 27.34015345268542, "grad_norm": 3.299914836883545, "learning_rate": 4.529616724738676e-06, "loss": 0.3707, "step": 21380 }, { "epoch": 27.352941176470587, "grad_norm": 1.5382965803146362, "learning_rate": 4.507839721254355e-06, "loss": 0.1391, "step": 21390 }, { "epoch": 27.365728900255753, "grad_norm": 4.048910617828369, "learning_rate": 4.4860627177700346e-06, "loss": 0.3187, "step": 21400 }, { "epoch": 27.37851662404092, "grad_norm": 6.27728796005249, "learning_rate": 4.464285714285715e-06, "loss": 0.2896, "step": 21410 }, { "epoch": 27.391304347826086, "grad_norm": 5.804953098297119, "learning_rate": 4.442508710801394e-06, "loss": 0.2246, "step": 21420 }, { "epoch": 27.404092071611252, "grad_norm": 0.3168350160121918, "learning_rate": 4.420731707317073e-06, "loss": 0.2129, "step": 21430 }, { "epoch": 27.41687979539642, "grad_norm": 2.710597038269043, "learning_rate": 4.398954703832753e-06, "loss": 0.2045, "step": 21440 }, { "epoch": 27.429667519181585, "grad_norm": 0.05959421768784523, "learning_rate": 4.377177700348432e-06, "loss": 0.1987, "step": 21450 }, { "epoch": 27.44245524296675, "grad_norm": 1.6882935762405396, "learning_rate": 4.355400696864112e-06, "loss": 0.2409, "step": 21460 }, { "epoch": 27.455242966751918, "grad_norm": 1.9564663171768188, "learning_rate": 4.333623693379791e-06, "loss": 0.1971, "step": 21470 }, { "epoch": 27.468030690537084, "grad_norm": 1.2921223640441895, "learning_rate": 4.311846689895471e-06, "loss": 0.2685, "step": 21480 }, { "epoch": 27.48081841432225, "grad_norm": 2.278135061264038, "learning_rate": 4.29006968641115e-06, "loss": 0.166, "step": 21490 }, { "epoch": 27.493606138107417, "grad_norm": 0.553162693977356, "learning_rate": 4.26829268292683e-06, "loss": 0.205, "step": 21500 }, { "epoch": 27.506393861892583, "grad_norm": 8.630960464477539, "learning_rate": 4.246515679442508e-06, "loss": 0.3337, "step": 21510 }, { "epoch": 27.51918158567775, "grad_norm": 5.457569599151611, "learning_rate": 4.224738675958188e-06, "loss": 0.4725, "step": 21520 }, { "epoch": 27.531969309462916, "grad_norm": 2.227025032043457, "learning_rate": 4.202961672473868e-06, "loss": 0.1535, "step": 21530 }, { "epoch": 27.544757033248082, "grad_norm": 2.7370967864990234, "learning_rate": 4.181184668989548e-06, "loss": 0.1932, "step": 21540 }, { "epoch": 27.55754475703325, "grad_norm": 0.04351665824651718, "learning_rate": 4.159407665505227e-06, "loss": 0.0892, "step": 21550 }, { "epoch": 27.570332480818415, "grad_norm": 0.006841658148914576, "learning_rate": 4.137630662020906e-06, "loss": 0.1867, "step": 21560 }, { "epoch": 27.58312020460358, "grad_norm": 1.037326693534851, "learning_rate": 4.115853658536586e-06, "loss": 0.2649, "step": 21570 }, { "epoch": 27.595907928388748, "grad_norm": 5.368692874908447, "learning_rate": 4.0940766550522645e-06, "loss": 0.2772, "step": 21580 }, { "epoch": 27.608695652173914, "grad_norm": 6.071450233459473, "learning_rate": 4.0722996515679446e-06, "loss": 0.1821, "step": 21590 }, { "epoch": 27.62148337595908, "grad_norm": 1.9267330169677734, "learning_rate": 4.050522648083624e-06, "loss": 0.3001, "step": 21600 }, { "epoch": 27.634271099744247, "grad_norm": 6.9283881187438965, "learning_rate": 4.028745644599304e-06, "loss": 0.27, "step": 21610 }, { "epoch": 27.647058823529413, "grad_norm": 11.072102546691895, "learning_rate": 4.006968641114982e-06, "loss": 0.2461, "step": 21620 }, { "epoch": 27.65984654731458, "grad_norm": 6.862698554992676, "learning_rate": 3.985191637630662e-06, "loss": 0.317, "step": 21630 }, { "epoch": 27.672634271099746, "grad_norm": 3.6825077533721924, "learning_rate": 3.9634146341463414e-06, "loss": 0.2672, "step": 21640 }, { "epoch": 27.685421994884912, "grad_norm": 5.379913330078125, "learning_rate": 3.9416376306620215e-06, "loss": 0.1716, "step": 21650 }, { "epoch": 27.69820971867008, "grad_norm": 0.01544896513223648, "learning_rate": 3.9198606271777e-06, "loss": 0.0986, "step": 21660 }, { "epoch": 27.710997442455245, "grad_norm": 4.728484153747559, "learning_rate": 3.89808362369338e-06, "loss": 0.2229, "step": 21670 }, { "epoch": 27.723785166240408, "grad_norm": 7.197249412536621, "learning_rate": 3.876306620209059e-06, "loss": 0.2093, "step": 21680 }, { "epoch": 27.736572890025574, "grad_norm": 0.08864303678274155, "learning_rate": 3.854529616724739e-06, "loss": 0.0748, "step": 21690 }, { "epoch": 27.74936061381074, "grad_norm": 3.961198091506958, "learning_rate": 3.832752613240418e-06, "loss": 0.1083, "step": 21700 }, { "epoch": 27.762148337595907, "grad_norm": 1.672965407371521, "learning_rate": 3.8109756097560976e-06, "loss": 0.1421, "step": 21710 }, { "epoch": 27.774936061381073, "grad_norm": 2.3427834510803223, "learning_rate": 3.789198606271777e-06, "loss": 0.2323, "step": 21720 }, { "epoch": 27.78772378516624, "grad_norm": 0.023380041122436523, "learning_rate": 3.767421602787457e-06, "loss": 0.1407, "step": 21730 }, { "epoch": 27.800511508951406, "grad_norm": 0.814188539981842, "learning_rate": 3.745644599303136e-06, "loss": 0.1607, "step": 21740 }, { "epoch": 27.813299232736572, "grad_norm": 1.207890510559082, "learning_rate": 3.7238675958188156e-06, "loss": 0.2524, "step": 21750 }, { "epoch": 27.82608695652174, "grad_norm": 1.3783173561096191, "learning_rate": 3.7020905923344953e-06, "loss": 0.2288, "step": 21760 }, { "epoch": 27.838874680306905, "grad_norm": 4.3022236824035645, "learning_rate": 3.6803135888501745e-06, "loss": 0.4452, "step": 21770 }, { "epoch": 27.85166240409207, "grad_norm": 1.6889233589172363, "learning_rate": 3.6585365853658537e-06, "loss": 0.2704, "step": 21780 }, { "epoch": 27.864450127877237, "grad_norm": 1.1526528596878052, "learning_rate": 3.6367595818815333e-06, "loss": 0.1784, "step": 21790 }, { "epoch": 27.877237851662404, "grad_norm": 0.12817203998565674, "learning_rate": 3.614982578397213e-06, "loss": 0.1778, "step": 21800 }, { "epoch": 27.89002557544757, "grad_norm": 1.1597322225570679, "learning_rate": 3.5932055749128926e-06, "loss": 0.0466, "step": 21810 }, { "epoch": 27.902813299232736, "grad_norm": 0.492725133895874, "learning_rate": 3.5714285714285714e-06, "loss": 0.3498, "step": 21820 }, { "epoch": 27.915601023017903, "grad_norm": 1.6839929819107056, "learning_rate": 3.549651567944251e-06, "loss": 0.1928, "step": 21830 }, { "epoch": 27.92838874680307, "grad_norm": 1.2277092933654785, "learning_rate": 3.5278745644599306e-06, "loss": 0.1704, "step": 21840 }, { "epoch": 27.941176470588236, "grad_norm": 0.32345348596572876, "learning_rate": 3.5060975609756102e-06, "loss": 0.2256, "step": 21850 }, { "epoch": 27.953964194373402, "grad_norm": 3.3228261470794678, "learning_rate": 3.484320557491289e-06, "loss": 0.062, "step": 21860 }, { "epoch": 27.966751918158568, "grad_norm": 2.418651819229126, "learning_rate": 3.4625435540069687e-06, "loss": 0.2153, "step": 21870 }, { "epoch": 27.979539641943735, "grad_norm": 0.17314131557941437, "learning_rate": 3.4407665505226483e-06, "loss": 0.2711, "step": 21880 }, { "epoch": 27.9923273657289, "grad_norm": 2.9722671508789062, "learning_rate": 3.4189895470383275e-06, "loss": 0.3478, "step": 21890 }, { "epoch": 28.0, "eval_loss": 0.1801394373178482, "eval_runtime": 0.9787, "eval_samples_per_second": 100.135, "eval_steps_per_second": 13.283, "step": 21896 }, { "epoch": 28.005115089514067, "grad_norm": 0.005500065162777901, "learning_rate": 3.397212543554007e-06, "loss": 0.1711, "step": 21900 }, { "epoch": 28.017902813299234, "grad_norm": 5.95561408996582, "learning_rate": 3.3754355400696867e-06, "loss": 0.104, "step": 21910 }, { "epoch": 28.0306905370844, "grad_norm": 0.061613526195287704, "learning_rate": 3.3536585365853664e-06, "loss": 0.2721, "step": 21920 }, { "epoch": 28.043478260869566, "grad_norm": 4.259181022644043, "learning_rate": 3.331881533101045e-06, "loss": 0.3519, "step": 21930 }, { "epoch": 28.056265984654733, "grad_norm": 0.000134848480229266, "learning_rate": 3.3101045296167248e-06, "loss": 0.2261, "step": 21940 }, { "epoch": 28.0690537084399, "grad_norm": 0.006025349255651236, "learning_rate": 3.2883275261324044e-06, "loss": 0.1773, "step": 21950 }, { "epoch": 28.081841432225065, "grad_norm": 0.5203061103820801, "learning_rate": 3.266550522648084e-06, "loss": 0.3539, "step": 21960 }, { "epoch": 28.09462915601023, "grad_norm": 5.767180919647217, "learning_rate": 3.244773519163763e-06, "loss": 0.3707, "step": 21970 }, { "epoch": 28.107416879795398, "grad_norm": 0.980506956577301, "learning_rate": 3.2229965156794425e-06, "loss": 0.2138, "step": 21980 }, { "epoch": 28.120204603580564, "grad_norm": 1.3818098306655884, "learning_rate": 3.201219512195122e-06, "loss": 0.2992, "step": 21990 }, { "epoch": 28.132992327365727, "grad_norm": 1.2961748838424683, "learning_rate": 3.1794425087108017e-06, "loss": 0.1655, "step": 22000 }, { "epoch": 28.145780051150894, "grad_norm": 1.3928310871124268, "learning_rate": 3.157665505226481e-06, "loss": 0.1668, "step": 22010 }, { "epoch": 28.15856777493606, "grad_norm": 0.49518585205078125, "learning_rate": 3.1358885017421605e-06, "loss": 0.1517, "step": 22020 }, { "epoch": 28.171355498721226, "grad_norm": 5.98352575302124, "learning_rate": 3.1141114982578398e-06, "loss": 0.1481, "step": 22030 }, { "epoch": 28.184143222506393, "grad_norm": 0.7869769930839539, "learning_rate": 3.0923344947735194e-06, "loss": 0.2591, "step": 22040 }, { "epoch": 28.19693094629156, "grad_norm": 4.9443828174844384e-05, "learning_rate": 3.070557491289199e-06, "loss": 0.1369, "step": 22050 }, { "epoch": 28.209718670076725, "grad_norm": 5.648998260498047, "learning_rate": 3.0487804878048782e-06, "loss": 0.2763, "step": 22060 }, { "epoch": 28.22250639386189, "grad_norm": 0.4481763243675232, "learning_rate": 3.027003484320558e-06, "loss": 0.268, "step": 22070 }, { "epoch": 28.235294117647058, "grad_norm": 2.534666061401367, "learning_rate": 3.005226480836237e-06, "loss": 0.2043, "step": 22080 }, { "epoch": 28.248081841432224, "grad_norm": 1.7655045986175537, "learning_rate": 2.9834494773519167e-06, "loss": 0.1321, "step": 22090 }, { "epoch": 28.26086956521739, "grad_norm": 4.485157489776611, "learning_rate": 2.961672473867596e-06, "loss": 0.2079, "step": 22100 }, { "epoch": 28.273657289002557, "grad_norm": 4.403810024261475, "learning_rate": 2.9398954703832755e-06, "loss": 0.3577, "step": 22110 }, { "epoch": 28.286445012787723, "grad_norm": 3.932236671447754, "learning_rate": 2.9181184668989547e-06, "loss": 0.38, "step": 22120 }, { "epoch": 28.29923273657289, "grad_norm": 2.8906774520874023, "learning_rate": 2.8963414634146343e-06, "loss": 0.1712, "step": 22130 }, { "epoch": 28.312020460358056, "grad_norm": 3.725170850753784, "learning_rate": 2.8745644599303136e-06, "loss": 0.1457, "step": 22140 }, { "epoch": 28.324808184143222, "grad_norm": 1.1601566076278687, "learning_rate": 2.852787456445993e-06, "loss": 0.3236, "step": 22150 }, { "epoch": 28.33759590792839, "grad_norm": 0.42673754692077637, "learning_rate": 2.8310104529616724e-06, "loss": 0.1724, "step": 22160 }, { "epoch": 28.350383631713555, "grad_norm": 5.1795573234558105, "learning_rate": 2.809233449477352e-06, "loss": 0.2666, "step": 22170 }, { "epoch": 28.36317135549872, "grad_norm": 4.630896091461182, "learning_rate": 2.7874564459930316e-06, "loss": 0.3567, "step": 22180 }, { "epoch": 28.375959079283888, "grad_norm": 0.001059512491337955, "learning_rate": 2.765679442508711e-06, "loss": 0.3346, "step": 22190 }, { "epoch": 28.388746803069054, "grad_norm": 3.2247471809387207, "learning_rate": 2.7439024390243905e-06, "loss": 0.0682, "step": 22200 }, { "epoch": 28.40153452685422, "grad_norm": 0.2996211349964142, "learning_rate": 2.7221254355400697e-06, "loss": 0.1286, "step": 22210 }, { "epoch": 28.414322250639387, "grad_norm": 0.013440398499369621, "learning_rate": 2.7003484320557493e-06, "loss": 0.0815, "step": 22220 }, { "epoch": 28.427109974424553, "grad_norm": 0.8571885228157043, "learning_rate": 2.6785714285714285e-06, "loss": 0.2212, "step": 22230 }, { "epoch": 28.43989769820972, "grad_norm": 6.663759231567383, "learning_rate": 2.656794425087108e-06, "loss": 0.286, "step": 22240 }, { "epoch": 28.452685421994886, "grad_norm": 1.592871069908142, "learning_rate": 2.6350174216027874e-06, "loss": 0.2274, "step": 22250 }, { "epoch": 28.465473145780052, "grad_norm": 0.00015884192544035614, "learning_rate": 2.613240418118467e-06, "loss": 0.2724, "step": 22260 }, { "epoch": 28.47826086956522, "grad_norm": 100.69029998779297, "learning_rate": 2.591463414634146e-06, "loss": 0.22, "step": 22270 }, { "epoch": 28.491048593350385, "grad_norm": 6.356751441955566, "learning_rate": 2.569686411149826e-06, "loss": 0.1702, "step": 22280 }, { "epoch": 28.50383631713555, "grad_norm": 2.7504799365997314, "learning_rate": 2.547909407665505e-06, "loss": 0.4634, "step": 22290 }, { "epoch": 28.516624040920718, "grad_norm": 0.8726930618286133, "learning_rate": 2.5261324041811846e-06, "loss": 0.183, "step": 22300 }, { "epoch": 28.529411764705884, "grad_norm": 0.07652537524700165, "learning_rate": 2.5043554006968643e-06, "loss": 0.2604, "step": 22310 }, { "epoch": 28.54219948849105, "grad_norm": 0.4226292371749878, "learning_rate": 2.482578397212544e-06, "loss": 0.1493, "step": 22320 }, { "epoch": 28.554987212276213, "grad_norm": 0.3523002862930298, "learning_rate": 2.460801393728223e-06, "loss": 0.1188, "step": 22330 }, { "epoch": 28.56777493606138, "grad_norm": 0.6177569031715393, "learning_rate": 2.4390243902439027e-06, "loss": 0.2698, "step": 22340 }, { "epoch": 28.580562659846546, "grad_norm": 0.027569299563765526, "learning_rate": 2.417247386759582e-06, "loss": 0.2496, "step": 22350 }, { "epoch": 28.593350383631712, "grad_norm": 0.00996365025639534, "learning_rate": 2.3954703832752616e-06, "loss": 0.187, "step": 22360 }, { "epoch": 28.60613810741688, "grad_norm": 0.2898666560649872, "learning_rate": 2.3736933797909408e-06, "loss": 0.2004, "step": 22370 }, { "epoch": 28.618925831202045, "grad_norm": 0.0006501222378574312, "learning_rate": 2.3519163763066204e-06, "loss": 0.2665, "step": 22380 }, { "epoch": 28.63171355498721, "grad_norm": 0.003726001363247633, "learning_rate": 2.3301393728222996e-06, "loss": 0.2685, "step": 22390 }, { "epoch": 28.644501278772378, "grad_norm": 0.0016021005576476455, "learning_rate": 2.3083623693379792e-06, "loss": 0.2553, "step": 22400 }, { "epoch": 28.657289002557544, "grad_norm": 1.840811014175415, "learning_rate": 2.2865853658536584e-06, "loss": 0.1302, "step": 22410 }, { "epoch": 28.67007672634271, "grad_norm": 2.189919948577881, "learning_rate": 2.264808362369338e-06, "loss": 0.2802, "step": 22420 }, { "epoch": 28.682864450127877, "grad_norm": 0.8784971237182617, "learning_rate": 2.2430313588850173e-06, "loss": 0.2669, "step": 22430 }, { "epoch": 28.695652173913043, "grad_norm": 5.116599082946777, "learning_rate": 2.221254355400697e-06, "loss": 0.1442, "step": 22440 }, { "epoch": 28.70843989769821, "grad_norm": 0.6749101281166077, "learning_rate": 2.1994773519163765e-06, "loss": 0.2148, "step": 22450 }, { "epoch": 28.721227621483376, "grad_norm": 4.480432987213135, "learning_rate": 2.177700348432056e-06, "loss": 0.1998, "step": 22460 }, { "epoch": 28.734015345268542, "grad_norm": 0.05128619447350502, "learning_rate": 2.1559233449477354e-06, "loss": 0.2094, "step": 22470 }, { "epoch": 28.74680306905371, "grad_norm": 6.135299205780029, "learning_rate": 2.134146341463415e-06, "loss": 0.3928, "step": 22480 }, { "epoch": 28.759590792838875, "grad_norm": 3.8637638092041016, "learning_rate": 2.112369337979094e-06, "loss": 0.305, "step": 22490 }, { "epoch": 28.77237851662404, "grad_norm": 0.015335003845393658, "learning_rate": 2.090592334494774e-06, "loss": 0.2201, "step": 22500 }, { "epoch": 28.785166240409207, "grad_norm": 27.03233528137207, "learning_rate": 2.068815331010453e-06, "loss": 0.4113, "step": 22510 }, { "epoch": 28.797953964194374, "grad_norm": 8.126382827758789, "learning_rate": 2.0470383275261322e-06, "loss": 0.396, "step": 22520 }, { "epoch": 28.81074168797954, "grad_norm": 0.0010111212031915784, "learning_rate": 2.025261324041812e-06, "loss": 0.0858, "step": 22530 }, { "epoch": 28.823529411764707, "grad_norm": 0.22579795122146606, "learning_rate": 2.003484320557491e-06, "loss": 0.1004, "step": 22540 }, { "epoch": 28.836317135549873, "grad_norm": 7.518482685089111, "learning_rate": 1.9817073170731707e-06, "loss": 0.1294, "step": 22550 }, { "epoch": 28.84910485933504, "grad_norm": 0.4615892469882965, "learning_rate": 1.95993031358885e-06, "loss": 0.0513, "step": 22560 }, { "epoch": 28.861892583120206, "grad_norm": 0.9744994640350342, "learning_rate": 1.9381533101045295e-06, "loss": 0.0957, "step": 22570 }, { "epoch": 28.874680306905372, "grad_norm": 8.404377937316895, "learning_rate": 1.916376306620209e-06, "loss": 0.2883, "step": 22580 }, { "epoch": 28.88746803069054, "grad_norm": 0.06529644131660461, "learning_rate": 1.8945993031358886e-06, "loss": 0.175, "step": 22590 }, { "epoch": 28.900255754475705, "grad_norm": 1.4713941812515259, "learning_rate": 1.872822299651568e-06, "loss": 0.1782, "step": 22600 }, { "epoch": 28.91304347826087, "grad_norm": 0.0055590178817510605, "learning_rate": 1.8510452961672476e-06, "loss": 0.1801, "step": 22610 }, { "epoch": 28.925831202046037, "grad_norm": 0.132467120885849, "learning_rate": 1.8292682926829268e-06, "loss": 0.1855, "step": 22620 }, { "epoch": 28.938618925831204, "grad_norm": 4.285371780395508, "learning_rate": 1.8074912891986065e-06, "loss": 0.3301, "step": 22630 }, { "epoch": 28.95140664961637, "grad_norm": 5.881651878356934, "learning_rate": 1.7857142857142857e-06, "loss": 0.0689, "step": 22640 }, { "epoch": 28.964194373401533, "grad_norm": 4.706814765930176, "learning_rate": 1.7639372822299653e-06, "loss": 0.3224, "step": 22650 }, { "epoch": 28.9769820971867, "grad_norm": 0.24153724312782288, "learning_rate": 1.7421602787456445e-06, "loss": 0.4702, "step": 22660 }, { "epoch": 28.989769820971865, "grad_norm": 8.415337562561035, "learning_rate": 1.7203832752613241e-06, "loss": 0.2193, "step": 22670 }, { "epoch": 29.0, "eval_loss": 0.1792634278535843, "eval_runtime": 0.9762, "eval_samples_per_second": 100.386, "eval_steps_per_second": 13.317, "step": 22678 }, { "epoch": 29.002557544757032, "grad_norm": 6.062386512756348, "learning_rate": 1.6986062717770036e-06, "loss": 0.1089, "step": 22680 }, { "epoch": 29.015345268542198, "grad_norm": 3.9071085453033447, "learning_rate": 1.6768292682926832e-06, "loss": 0.2283, "step": 22690 }, { "epoch": 29.028132992327365, "grad_norm": 0.00043772748904302716, "learning_rate": 1.6550522648083624e-06, "loss": 0.27, "step": 22700 }, { "epoch": 29.04092071611253, "grad_norm": 0.5102390646934509, "learning_rate": 1.633275261324042e-06, "loss": 0.2082, "step": 22710 }, { "epoch": 29.053708439897697, "grad_norm": 0.003764116670936346, "learning_rate": 1.6114982578397212e-06, "loss": 0.5365, "step": 22720 }, { "epoch": 29.066496163682864, "grad_norm": 1.1932376623153687, "learning_rate": 1.5897212543554009e-06, "loss": 0.0399, "step": 22730 }, { "epoch": 29.07928388746803, "grad_norm": 3.3606503009796143, "learning_rate": 1.5679442508710803e-06, "loss": 0.1718, "step": 22740 }, { "epoch": 29.092071611253196, "grad_norm": 3.6388676166534424, "learning_rate": 1.5461672473867597e-06, "loss": 0.4028, "step": 22750 }, { "epoch": 29.104859335038363, "grad_norm": 0.08391699939966202, "learning_rate": 1.5243902439024391e-06, "loss": 0.1473, "step": 22760 }, { "epoch": 29.11764705882353, "grad_norm": 0.6788755655288696, "learning_rate": 1.5026132404181185e-06, "loss": 0.1682, "step": 22770 }, { "epoch": 29.130434782608695, "grad_norm": 4.642868995666504, "learning_rate": 1.480836236933798e-06, "loss": 0.2218, "step": 22780 }, { "epoch": 29.14322250639386, "grad_norm": 1.4224387407302856, "learning_rate": 1.4590592334494774e-06, "loss": 0.1681, "step": 22790 }, { "epoch": 29.156010230179028, "grad_norm": 4.818307399749756, "learning_rate": 1.4372822299651568e-06, "loss": 0.2847, "step": 22800 }, { "epoch": 29.168797953964194, "grad_norm": 1.9413695335388184, "learning_rate": 1.4155052264808362e-06, "loss": 0.3833, "step": 22810 }, { "epoch": 29.18158567774936, "grad_norm": 0.2834830582141876, "learning_rate": 1.3937282229965158e-06, "loss": 0.0927, "step": 22820 }, { "epoch": 29.194373401534527, "grad_norm": 2.186720132827759, "learning_rate": 1.3719512195121952e-06, "loss": 0.1265, "step": 22830 }, { "epoch": 29.207161125319693, "grad_norm": 1.6864341497421265, "learning_rate": 1.3501742160278747e-06, "loss": 0.2385, "step": 22840 }, { "epoch": 29.21994884910486, "grad_norm": 1.5684053897857666, "learning_rate": 1.328397212543554e-06, "loss": 0.4263, "step": 22850 }, { "epoch": 29.232736572890026, "grad_norm": 1.0396729707717896, "learning_rate": 1.3066202090592335e-06, "loss": 0.124, "step": 22860 }, { "epoch": 29.245524296675192, "grad_norm": 3.0199835300445557, "learning_rate": 1.284843205574913e-06, "loss": 0.1878, "step": 22870 }, { "epoch": 29.25831202046036, "grad_norm": 5.518275737762451, "learning_rate": 1.2630662020905923e-06, "loss": 0.1865, "step": 22880 }, { "epoch": 29.271099744245525, "grad_norm": 0.0033442946150898933, "learning_rate": 1.241289198606272e-06, "loss": 0.172, "step": 22890 }, { "epoch": 29.28388746803069, "grad_norm": 1.8414983749389648, "learning_rate": 1.2195121951219514e-06, "loss": 0.2428, "step": 22900 }, { "epoch": 29.296675191815858, "grad_norm": 2.1718709468841553, "learning_rate": 1.1977351916376308e-06, "loss": 0.1704, "step": 22910 }, { "epoch": 29.309462915601024, "grad_norm": 0.0017286858055740595, "learning_rate": 1.1759581881533102e-06, "loss": 0.1766, "step": 22920 }, { "epoch": 29.32225063938619, "grad_norm": 1.1897152662277222, "learning_rate": 1.1541811846689896e-06, "loss": 0.3789, "step": 22930 }, { "epoch": 29.335038363171357, "grad_norm": 0.0005067705642431974, "learning_rate": 1.132404181184669e-06, "loss": 0.1308, "step": 22940 }, { "epoch": 29.347826086956523, "grad_norm": 2.590867042541504, "learning_rate": 1.1106271777003485e-06, "loss": 0.1798, "step": 22950 }, { "epoch": 29.36061381074169, "grad_norm": 1.66661536693573, "learning_rate": 1.088850174216028e-06, "loss": 0.3366, "step": 22960 }, { "epoch": 29.373401534526856, "grad_norm": 6.159651756286621, "learning_rate": 1.0670731707317075e-06, "loss": 0.3356, "step": 22970 }, { "epoch": 29.38618925831202, "grad_norm": 2.129429578781128, "learning_rate": 1.045296167247387e-06, "loss": 0.2774, "step": 22980 }, { "epoch": 29.398976982097185, "grad_norm": 0.9112698435783386, "learning_rate": 1.0235191637630661e-06, "loss": 0.2685, "step": 22990 }, { "epoch": 29.41176470588235, "grad_norm": 0.0026976391673088074, "learning_rate": 1.0017421602787455e-06, "loss": 0.2588, "step": 23000 }, { "epoch": 29.424552429667518, "grad_norm": 0.9733322262763977, "learning_rate": 9.79965156794425e-07, "loss": 0.1532, "step": 23010 }, { "epoch": 29.437340153452684, "grad_norm": 0.48181623220443726, "learning_rate": 9.581881533101046e-07, "loss": 0.2647, "step": 23020 }, { "epoch": 29.45012787723785, "grad_norm": 0.4220898449420929, "learning_rate": 9.36411149825784e-07, "loss": 0.1638, "step": 23030 }, { "epoch": 29.462915601023017, "grad_norm": 0.018712276592850685, "learning_rate": 9.146341463414634e-07, "loss": 0.0407, "step": 23040 }, { "epoch": 29.475703324808183, "grad_norm": 0.0022459602914750576, "learning_rate": 8.928571428571428e-07, "loss": 0.1538, "step": 23050 }, { "epoch": 29.48849104859335, "grad_norm": 1.2622050046920776, "learning_rate": 8.710801393728223e-07, "loss": 0.2032, "step": 23060 }, { "epoch": 29.501278772378516, "grad_norm": 0.00908291433006525, "learning_rate": 8.493031358885018e-07, "loss": 0.1773, "step": 23070 }, { "epoch": 29.514066496163682, "grad_norm": 0.028732916340231895, "learning_rate": 8.275261324041812e-07, "loss": 0.2818, "step": 23080 }, { "epoch": 29.52685421994885, "grad_norm": 0.5516131520271301, "learning_rate": 8.057491289198606e-07, "loss": 0.0955, "step": 23090 }, { "epoch": 29.539641943734015, "grad_norm": 1.139642357826233, "learning_rate": 7.839721254355401e-07, "loss": 0.2728, "step": 23100 }, { "epoch": 29.55242966751918, "grad_norm": 3.2173802852630615, "learning_rate": 7.621951219512196e-07, "loss": 0.2744, "step": 23110 }, { "epoch": 29.565217391304348, "grad_norm": 0.01653435453772545, "learning_rate": 7.40418118466899e-07, "loss": 0.0744, "step": 23120 }, { "epoch": 29.578005115089514, "grad_norm": 6.4000325202941895, "learning_rate": 7.186411149825784e-07, "loss": 0.2451, "step": 23130 }, { "epoch": 29.59079283887468, "grad_norm": 1.4968242645263672, "learning_rate": 6.968641114982579e-07, "loss": 0.2611, "step": 23140 }, { "epoch": 29.603580562659847, "grad_norm": 0.000750532082747668, "learning_rate": 6.750871080139373e-07, "loss": 0.3625, "step": 23150 }, { "epoch": 29.616368286445013, "grad_norm": 0.8928916454315186, "learning_rate": 6.533101045296167e-07, "loss": 0.0931, "step": 23160 }, { "epoch": 29.62915601023018, "grad_norm": 0.03060201369225979, "learning_rate": 6.315331010452962e-07, "loss": 0.0568, "step": 23170 }, { "epoch": 29.641943734015346, "grad_norm": 9.899931907653809, "learning_rate": 6.097560975609757e-07, "loss": 0.2723, "step": 23180 }, { "epoch": 29.654731457800512, "grad_norm": 0.8692872524261475, "learning_rate": 5.879790940766551e-07, "loss": 0.1834, "step": 23190 }, { "epoch": 29.66751918158568, "grad_norm": 0.00023471553868148476, "learning_rate": 5.662020905923345e-07, "loss": 0.2826, "step": 23200 }, { "epoch": 29.680306905370845, "grad_norm": 4.050236701965332, "learning_rate": 5.44425087108014e-07, "loss": 0.1395, "step": 23210 }, { "epoch": 29.69309462915601, "grad_norm": 2.7868478298187256, "learning_rate": 5.226480836236935e-07, "loss": 0.2827, "step": 23220 }, { "epoch": 29.705882352941178, "grad_norm": 0.00015953659021761268, "learning_rate": 5.008710801393728e-07, "loss": 0.3188, "step": 23230 }, { "epoch": 29.718670076726344, "grad_norm": 1.8475815057754517, "learning_rate": 4.790940766550523e-07, "loss": 0.1454, "step": 23240 }, { "epoch": 29.73145780051151, "grad_norm": 1.6999574899673462, "learning_rate": 4.573170731707317e-07, "loss": 0.245, "step": 23250 }, { "epoch": 29.744245524296677, "grad_norm": 0.3015870153903961, "learning_rate": 4.3554006968641113e-07, "loss": 0.1875, "step": 23260 }, { "epoch": 29.757033248081843, "grad_norm": 0.06826595962047577, "learning_rate": 4.137630662020906e-07, "loss": 0.2096, "step": 23270 }, { "epoch": 29.76982097186701, "grad_norm": 5.668938636779785, "learning_rate": 3.9198606271777007e-07, "loss": 0.1375, "step": 23280 }, { "epoch": 29.782608695652176, "grad_norm": 5.67093563079834, "learning_rate": 3.702090592334495e-07, "loss": 0.2203, "step": 23290 }, { "epoch": 29.79539641943734, "grad_norm": 3.95104718208313, "learning_rate": 3.4843205574912896e-07, "loss": 0.2321, "step": 23300 }, { "epoch": 29.808184143222505, "grad_norm": 2.9969534873962402, "learning_rate": 3.2665505226480837e-07, "loss": 0.1346, "step": 23310 }, { "epoch": 29.82097186700767, "grad_norm": 0.7820212841033936, "learning_rate": 3.0487804878048784e-07, "loss": 0.2656, "step": 23320 }, { "epoch": 29.833759590792837, "grad_norm": 0.0010761891026049852, "learning_rate": 2.8310104529616726e-07, "loss": 0.4435, "step": 23330 }, { "epoch": 29.846547314578004, "grad_norm": 4.919832229614258, "learning_rate": 2.6132404181184673e-07, "loss": 0.3309, "step": 23340 }, { "epoch": 29.85933503836317, "grad_norm": 2.468470573425293, "learning_rate": 2.3954703832752615e-07, "loss": 0.1566, "step": 23350 }, { "epoch": 29.872122762148337, "grad_norm": 5.76803731918335, "learning_rate": 2.1777003484320556e-07, "loss": 0.2243, "step": 23360 }, { "epoch": 29.884910485933503, "grad_norm": 1.7636452913284302, "learning_rate": 1.9599303135888503e-07, "loss": 0.1639, "step": 23370 }, { "epoch": 29.89769820971867, "grad_norm": 1.2893353700637817, "learning_rate": 1.7421602787456448e-07, "loss": 0.3609, "step": 23380 }, { "epoch": 29.910485933503836, "grad_norm": 6.375534576363862e-05, "learning_rate": 1.5243902439024392e-07, "loss": 0.3925, "step": 23390 }, { "epoch": 29.923273657289002, "grad_norm": 0.7869868278503418, "learning_rate": 1.3066202090592336e-07, "loss": 0.217, "step": 23400 }, { "epoch": 29.93606138107417, "grad_norm": 7.046038627624512, "learning_rate": 1.0888501742160278e-07, "loss": 0.2354, "step": 23410 }, { "epoch": 29.948849104859335, "grad_norm": 0.00026419543428346515, "learning_rate": 8.710801393728224e-08, "loss": 0.2192, "step": 23420 }, { "epoch": 29.9616368286445, "grad_norm": 8.245471000671387, "learning_rate": 6.533101045296168e-08, "loss": 0.1844, "step": 23430 }, { "epoch": 29.974424552429667, "grad_norm": 0.5220872759819031, "learning_rate": 4.355400696864112e-08, "loss": 0.2512, "step": 23440 }, { "epoch": 29.987212276214834, "grad_norm": 5.6852521896362305, "learning_rate": 2.177700348432056e-08, "loss": 0.4313, "step": 23450 }, { "epoch": 30.0, "grad_norm": 4.1284308433532715, "learning_rate": 0.0, "loss": 0.2054, "step": 23460 }, { "epoch": 30.0, "eval_loss": 0.1794022172689438, "eval_runtime": 0.835, "eval_samples_per_second": 117.367, "eval_steps_per_second": 15.569, "step": 23460 } ], "logging_steps": 10, "max_steps": 23460, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3101116937011200.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }