{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.0,
  "eval_steps": 500,
  "global_step": 11730,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01278772378516624,
      "grad_norm": 5291.65576171875,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 46.8468,
      "step": 10
    },
    {
      "epoch": 0.02557544757033248,
      "grad_norm": 657.4127807617188,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 47.4188,
      "step": 20
    },
    {
      "epoch": 0.03836317135549872,
      "grad_norm": 8509.5302734375,
      "learning_rate": 3e-06,
      "loss": 55.5624,
      "step": 30
    },
    {
      "epoch": 0.05115089514066496,
      "grad_norm": 427.65924072265625,
      "learning_rate": 4.000000000000001e-06,
      "loss": 54.0157,
      "step": 40
    },
    {
      "epoch": 0.0639386189258312,
      "grad_norm": 7448.37353515625,
      "learning_rate": 5e-06,
      "loss": 57.6548,
      "step": 50
    },
    {
      "epoch": 0.07672634271099744,
      "grad_norm": 14357.810546875,
      "learning_rate": 6e-06,
      "loss": 45.0872,
      "step": 60
    },
    {
      "epoch": 0.08951406649616368,
      "grad_norm": 4495.298828125,
      "learning_rate": 7.000000000000001e-06,
      "loss": 49.2105,
      "step": 70
    },
    {
      "epoch": 0.10230179028132992,
      "grad_norm": 3922.909912109375,
      "learning_rate": 8.000000000000001e-06,
      "loss": 45.772,
      "step": 80
    },
    {
      "epoch": 0.11508951406649616,
      "grad_norm": 47730.671875,
      "learning_rate": 9e-06,
      "loss": 54.9209,
      "step": 90
    },
    {
      "epoch": 0.1278772378516624,
      "grad_norm": 27943.875,
      "learning_rate": 1e-05,
      "loss": 47.032,
      "step": 100
    },
    {
      "epoch": 0.14066496163682865,
      "grad_norm": 185.7626953125,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 55.6442,
      "step": 110
    },
    {
      "epoch": 0.1534526854219949,
      "grad_norm": 4819.99365234375,
      "learning_rate": 1.2e-05,
      "loss": 47.2024,
      "step": 120
    },
    {
      "epoch": 0.16624040920716113,
      "grad_norm": 16820.35546875,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 47.1665,
      "step": 130
    },
    {
      "epoch": 0.17902813299232737,
      "grad_norm": 408.82489013671875,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 45.6075,
      "step": 140
    },
    {
      "epoch": 0.1918158567774936,
      "grad_norm": 29451.880859375,
      "learning_rate": 1.5e-05,
      "loss": 50.9366,
      "step": 150
    },
    {
      "epoch": 0.20460358056265984,
      "grad_norm": 28413.0390625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 44.6847,
      "step": 160
    },
    {
      "epoch": 0.21739130434782608,
      "grad_norm": 799.9179077148438,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 46.8802,
      "step": 170
    },
    {
      "epoch": 0.23017902813299232,
      "grad_norm": 2510.53515625,
      "learning_rate": 1.8e-05,
      "loss": 46.2193,
      "step": 180
    },
    {
      "epoch": 0.24296675191815856,
      "grad_norm": 2892.29248046875,
      "learning_rate": 1.9e-05,
      "loss": 41.2469,
      "step": 190
    },
    {
      "epoch": 0.2557544757033248,
      "grad_norm": 738.676513671875,
      "learning_rate": 2e-05,
      "loss": 41.8658,
      "step": 200
    },
    {
      "epoch": 0.26854219948849106,
      "grad_norm": 215.02032470703125,
      "learning_rate": 2.1e-05,
      "loss": 42.2854,
      "step": 210
    },
    {
      "epoch": 0.2813299232736573,
      "grad_norm": 1281.8134765625,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 48.7005,
      "step": 220
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 677.4962158203125,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 36.2834,
      "step": 230
    },
    {
      "epoch": 0.3069053708439898,
      "grad_norm": 2442.72900390625,
      "learning_rate": 2.4e-05,
      "loss": 44.3992,
      "step": 240
    },
    {
      "epoch": 0.319693094629156,
      "grad_norm": 135.88478088378906,
      "learning_rate": 2.5e-05,
      "loss": 42.5502,
      "step": 250
    },
    {
      "epoch": 0.33248081841432225,
      "grad_norm": 5432.8203125,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 41.375,
      "step": 260
    },
    {
      "epoch": 0.3452685421994885,
      "grad_norm": 3573.05419921875,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 40.6085,
      "step": 270
    },
    {
      "epoch": 0.35805626598465473,
      "grad_norm": 806.6569213867188,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 40.9676,
      "step": 280
    },
    {
      "epoch": 0.37084398976982097,
      "grad_norm": 537.9384765625,
      "learning_rate": 2.9e-05,
      "loss": 36.0962,
      "step": 290
    },
    {
      "epoch": 0.3836317135549872,
      "grad_norm": 1317.54150390625,
      "learning_rate": 3e-05,
      "loss": 38.838,
      "step": 300
    },
    {
      "epoch": 0.39641943734015345,
      "grad_norm": 1044.3780517578125,
      "learning_rate": 3.1e-05,
      "loss": 35.0419,
      "step": 310
    },
    {
      "epoch": 0.4092071611253197,
      "grad_norm": 6332.0888671875,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 35.0986,
      "step": 320
    },
    {
      "epoch": 0.4219948849104859,
      "grad_norm": 1020.596923828125,
      "learning_rate": 3.3e-05,
      "loss": 35.2819,
      "step": 330
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 4903.22119140625,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 27.0823,
      "step": 340
    },
    {
      "epoch": 0.4475703324808184,
      "grad_norm": 705.4653930664062,
      "learning_rate": 3.5e-05,
      "loss": 35.8111,
      "step": 350
    },
    {
      "epoch": 0.46035805626598464,
      "grad_norm": 166.60675048828125,
      "learning_rate": 3.6e-05,
      "loss": 32.9624,
      "step": 360
    },
    {
      "epoch": 0.4731457800511509,
      "grad_norm": 1294.9737548828125,
      "learning_rate": 3.7e-05,
      "loss": 27.3774,
      "step": 370
    },
    {
      "epoch": 0.4859335038363171,
      "grad_norm": 143.36048889160156,
      "learning_rate": 3.8e-05,
      "loss": 35.3593,
      "step": 380
    },
    {
      "epoch": 0.49872122762148335,
      "grad_norm": 2351.956787109375,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 31.6628,
      "step": 390
    },
    {
      "epoch": 0.5115089514066496,
      "grad_norm": 1412.3145751953125,
      "learning_rate": 4e-05,
      "loss": 24.9052,
      "step": 400
    },
    {
      "epoch": 0.5242966751918159,
      "grad_norm": 2179.52294921875,
      "learning_rate": 4.1e-05,
      "loss": 28.4615,
      "step": 410
    },
    {
      "epoch": 0.5370843989769821,
      "grad_norm": 1550.5777587890625,
      "learning_rate": 4.2e-05,
      "loss": 27.8655,
      "step": 420
    },
    {
      "epoch": 0.5498721227621484,
      "grad_norm": 14167.97265625,
      "learning_rate": 4.3e-05,
      "loss": 31.339,
      "step": 430
    },
    {
      "epoch": 0.5626598465473146,
      "grad_norm": 1213.757568359375,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 29.2414,
      "step": 440
    },
    {
      "epoch": 0.5754475703324808,
      "grad_norm": 1910.325439453125,
      "learning_rate": 4.5e-05,
      "loss": 28.1563,
      "step": 450
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 2075.203369140625,
      "learning_rate": 4.600000000000001e-05,
      "loss": 30.5137,
      "step": 460
    },
    {
      "epoch": 0.6010230179028133,
      "grad_norm": 2741.365966796875,
      "learning_rate": 4.7e-05,
      "loss": 23.7269,
      "step": 470
    },
    {
      "epoch": 0.6138107416879796,
      "grad_norm": 6818.3935546875,
      "learning_rate": 4.8e-05,
      "loss": 20.7367,
      "step": 480
    },
    {
      "epoch": 0.6265984654731458,
      "grad_norm": 14128.44140625,
      "learning_rate": 4.9e-05,
      "loss": 23.4038,
      "step": 490
    },
    {
      "epoch": 0.639386189258312,
      "grad_norm": 268.3045654296875,
      "learning_rate": 5e-05,
      "loss": 25.6224,
      "step": 500
    },
    {
      "epoch": 0.6521739130434783,
      "grad_norm": 7697.00341796875,
      "learning_rate": 4.995547640249332e-05,
      "loss": 24.966,
      "step": 510
    },
    {
      "epoch": 0.6649616368286445,
      "grad_norm": 1550.7242431640625,
      "learning_rate": 4.9910952804986644e-05,
      "loss": 27.2747,
      "step": 520
    },
    {
      "epoch": 0.6777493606138107,
      "grad_norm": 5949.79150390625,
      "learning_rate": 4.986642920747996e-05,
      "loss": 25.6169,
      "step": 530
    },
    {
      "epoch": 0.690537084398977,
      "grad_norm": 966.1629638671875,
      "learning_rate": 4.982190560997329e-05,
      "loss": 22.0359,
      "step": 540
    },
    {
      "epoch": 0.7033248081841432,
      "grad_norm": 14809.646484375,
      "learning_rate": 4.977738201246661e-05,
      "loss": 20.4764,
      "step": 550
    },
    {
      "epoch": 0.7161125319693095,
      "grad_norm": 5175.0810546875,
      "learning_rate": 4.9732858414959934e-05,
      "loss": 21.0584,
      "step": 560
    },
    {
      "epoch": 0.7289002557544757,
      "grad_norm": 620.6530151367188,
      "learning_rate": 4.968833481745325e-05,
      "loss": 20.8162,
      "step": 570
    },
    {
      "epoch": 0.7416879795396419,
      "grad_norm": 545.9930419921875,
      "learning_rate": 4.9643811219946576e-05,
      "loss": 19.4457,
      "step": 580
    },
    {
      "epoch": 0.7544757033248082,
      "grad_norm": 394.00115966796875,
      "learning_rate": 4.9599287622439894e-05,
      "loss": 20.3071,
      "step": 590
    },
    {
      "epoch": 0.7672634271099744,
      "grad_norm": 642.2852783203125,
      "learning_rate": 4.955476402493322e-05,
      "loss": 19.3772,
      "step": 600
    },
    {
      "epoch": 0.7800511508951407,
      "grad_norm": 775.719970703125,
      "learning_rate": 4.951024042742654e-05,
      "loss": 21.5308,
      "step": 610
    },
    {
      "epoch": 0.7928388746803069,
      "grad_norm": 3649.2021484375,
      "learning_rate": 4.946571682991986e-05,
      "loss": 18.0963,
      "step": 620
    },
    {
      "epoch": 0.8056265984654731,
      "grad_norm": 3447.32421875,
      "learning_rate": 4.9421193232413184e-05,
      "loss": 18.1472,
      "step": 630
    },
    {
      "epoch": 0.8184143222506394,
      "grad_norm": 250.08575439453125,
      "learning_rate": 4.93766696349065e-05,
      "loss": 17.4447,
      "step": 640
    },
    {
      "epoch": 0.8312020460358056,
      "grad_norm": 371.5052490234375,
      "learning_rate": 4.9332146037399826e-05,
      "loss": 18.3444,
      "step": 650
    },
    {
      "epoch": 0.8439897698209718,
      "grad_norm": 878.1161499023438,
      "learning_rate": 4.928762243989314e-05,
      "loss": 17.2934,
      "step": 660
    },
    {
      "epoch": 0.8567774936061381,
      "grad_norm": 1265.5709228515625,
      "learning_rate": 4.924309884238647e-05,
      "loss": 16.3026,
      "step": 670
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 591.113037109375,
      "learning_rate": 4.919857524487979e-05,
      "loss": 16.274,
      "step": 680
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 219.59530639648438,
      "learning_rate": 4.915405164737311e-05,
      "loss": 17.4999,
      "step": 690
    },
    {
      "epoch": 0.8951406649616368,
      "grad_norm": 708.5332641601562,
      "learning_rate": 4.9109528049866433e-05,
      "loss": 15.9393,
      "step": 700
    },
    {
      "epoch": 0.907928388746803,
      "grad_norm": 430.86468505859375,
      "learning_rate": 4.906500445235975e-05,
      "loss": 16.0164,
      "step": 710
    },
    {
      "epoch": 0.9207161125319693,
      "grad_norm": 1154.94873046875,
      "learning_rate": 4.9020480854853075e-05,
      "loss": 18.7462,
      "step": 720
    },
    {
      "epoch": 0.9335038363171355,
      "grad_norm": 405.43511962890625,
      "learning_rate": 4.897595725734639e-05,
      "loss": 15.9433,
      "step": 730
    },
    {
      "epoch": 0.9462915601023018,
      "grad_norm": 85.01339721679688,
      "learning_rate": 4.893143365983972e-05,
      "loss": 15.8037,
      "step": 740
    },
    {
      "epoch": 0.959079283887468,
      "grad_norm": 309.97119140625,
      "learning_rate": 4.888691006233304e-05,
      "loss": 14.8662,
      "step": 750
    },
    {
      "epoch": 0.9718670076726342,
      "grad_norm": 554.9390869140625,
      "learning_rate": 4.884238646482636e-05,
      "loss": 14.4255,
      "step": 760
    },
    {
      "epoch": 0.9846547314578005,
      "grad_norm": 830.1895141601562,
      "learning_rate": 4.879786286731968e-05,
      "loss": 14.9826,
      "step": 770
    },
    {
      "epoch": 0.9974424552429667,
      "grad_norm": 133.27218627929688,
      "learning_rate": 4.8753339269813e-05,
      "loss": 13.5141,
      "step": 780
    },
    {
      "epoch": 1.0,
      "eval_loss": 8.323198318481445,
      "eval_runtime": 0.8605,
      "eval_samples_per_second": 113.894,
      "eval_steps_per_second": 15.108,
      "step": 782
    },
    {
      "epoch": 1.010230179028133,
      "grad_norm": 2045.128173828125,
      "learning_rate": 4.8708815672306325e-05,
      "loss": 12.1253,
      "step": 790
    },
    {
      "epoch": 1.0230179028132993,
      "grad_norm": 288.3453369140625,
      "learning_rate": 4.866429207479964e-05,
      "loss": 12.204,
      "step": 800
    },
    {
      "epoch": 1.0358056265984654,
      "grad_norm": 507.2064208984375,
      "learning_rate": 4.8619768477292966e-05,
      "loss": 13.964,
      "step": 810
    },
    {
      "epoch": 1.0485933503836318,
      "grad_norm": 359.73388671875,
      "learning_rate": 4.857524487978629e-05,
      "loss": 13.5956,
      "step": 820
    },
    {
      "epoch": 1.061381074168798,
      "grad_norm": 113.6794662475586,
      "learning_rate": 4.8530721282279615e-05,
      "loss": 12.8579,
      "step": 830
    },
    {
      "epoch": 1.0741687979539642,
      "grad_norm": 754.2025756835938,
      "learning_rate": 4.848619768477293e-05,
      "loss": 13.0858,
      "step": 840
    },
    {
      "epoch": 1.0869565217391304,
      "grad_norm": 191.79119873046875,
      "learning_rate": 4.844167408726625e-05,
      "loss": 10.2861,
      "step": 850
    },
    {
      "epoch": 1.0997442455242967,
      "grad_norm": 138.1201629638672,
      "learning_rate": 4.8397150489759574e-05,
      "loss": 9.7727,
      "step": 860
    },
    {
      "epoch": 1.1125319693094629,
      "grad_norm": 201.81227111816406,
      "learning_rate": 4.835262689225289e-05,
      "loss": 11.2379,
      "step": 870
    },
    {
      "epoch": 1.1253196930946292,
      "grad_norm": 2496.734619140625,
      "learning_rate": 4.8308103294746216e-05,
      "loss": 12.2517,
      "step": 880
    },
    {
      "epoch": 1.1381074168797953,
      "grad_norm": 126.88189697265625,
      "learning_rate": 4.826357969723954e-05,
      "loss": 10.6216,
      "step": 890
    },
    {
      "epoch": 1.1508951406649617,
      "grad_norm": 189.23846435546875,
      "learning_rate": 4.8219056099732865e-05,
      "loss": 11.7789,
      "step": 900
    },
    {
      "epoch": 1.1636828644501278,
      "grad_norm": 137.6693115234375,
      "learning_rate": 4.817453250222618e-05,
      "loss": 10.3337,
      "step": 910
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 233.86508178710938,
      "learning_rate": 4.8130008904719506e-05,
      "loss": 10.3479,
      "step": 920
    },
    {
      "epoch": 1.1892583120204603,
      "grad_norm": 1433.0694580078125,
      "learning_rate": 4.8085485307212824e-05,
      "loss": 10.1257,
      "step": 930
    },
    {
      "epoch": 1.2020460358056266,
      "grad_norm": 241.59225463867188,
      "learning_rate": 4.804096170970615e-05,
      "loss": 10.8335,
      "step": 940
    },
    {
      "epoch": 1.2148337595907928,
      "grad_norm": 751.7616577148438,
      "learning_rate": 4.7996438112199466e-05,
      "loss": 9.9136,
      "step": 950
    },
    {
      "epoch": 1.227621483375959,
      "grad_norm": 137.66688537597656,
      "learning_rate": 4.795191451469279e-05,
      "loss": 9.6519,
      "step": 960
    },
    {
      "epoch": 1.2404092071611252,
      "grad_norm": 576.8855590820312,
      "learning_rate": 4.7907390917186114e-05,
      "loss": 8.2642,
      "step": 970
    },
    {
      "epoch": 1.2531969309462916,
      "grad_norm": 59.55119705200195,
      "learning_rate": 4.786286731967943e-05,
      "loss": 7.5509,
      "step": 980
    },
    {
      "epoch": 1.265984654731458,
      "grad_norm": 364.87939453125,
      "learning_rate": 4.7818343722172756e-05,
      "loss": 8.2691,
      "step": 990
    },
    {
      "epoch": 1.278772378516624,
      "grad_norm": 80.77793884277344,
      "learning_rate": 4.777382012466607e-05,
      "loss": 8.1777,
      "step": 1000
    },
    {
      "epoch": 1.2915601023017902,
      "grad_norm": 166.04991149902344,
      "learning_rate": 4.77292965271594e-05,
      "loss": 8.2977,
      "step": 1010
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 201.66940307617188,
      "learning_rate": 4.7684772929652715e-05,
      "loss": 7.6507,
      "step": 1020
    },
    {
      "epoch": 1.317135549872123,
      "grad_norm": 115.5215835571289,
      "learning_rate": 4.764024933214604e-05,
      "loss": 8.4317,
      "step": 1030
    },
    {
      "epoch": 1.329923273657289,
      "grad_norm": 856.268310546875,
      "learning_rate": 4.7595725734639364e-05,
      "loss": 7.4182,
      "step": 1040
    },
    {
      "epoch": 1.3427109974424551,
      "grad_norm": 164.53457641601562,
      "learning_rate": 4.755120213713268e-05,
      "loss": 7.0188,
      "step": 1050
    },
    {
      "epoch": 1.3554987212276215,
      "grad_norm": 284.8175354003906,
      "learning_rate": 4.7506678539626005e-05,
      "loss": 7.4934,
      "step": 1060
    },
    {
      "epoch": 1.3682864450127878,
      "grad_norm": 914.1421508789062,
      "learning_rate": 4.746215494211932e-05,
      "loss": 6.3777,
      "step": 1070
    },
    {
      "epoch": 1.381074168797954,
      "grad_norm": 364.2792663574219,
      "learning_rate": 4.741763134461265e-05,
      "loss": 7.0009,
      "step": 1080
    },
    {
      "epoch": 1.39386189258312,
      "grad_norm": 40.856849670410156,
      "learning_rate": 4.7373107747105965e-05,
      "loss": 6.5175,
      "step": 1090
    },
    {
      "epoch": 1.4066496163682864,
      "grad_norm": 20.656641006469727,
      "learning_rate": 4.732858414959929e-05,
      "loss": 6.5319,
      "step": 1100
    },
    {
      "epoch": 1.4194373401534528,
      "grad_norm": 102.03244018554688,
      "learning_rate": 4.728406055209261e-05,
      "loss": 5.6601,
      "step": 1110
    },
    {
      "epoch": 1.432225063938619,
      "grad_norm": 166.99356079101562,
      "learning_rate": 4.723953695458593e-05,
      "loss": 5.4954,
      "step": 1120
    },
    {
      "epoch": 1.445012787723785,
      "grad_norm": 35.748558044433594,
      "learning_rate": 4.7195013357079255e-05,
      "loss": 5.5329,
      "step": 1130
    },
    {
      "epoch": 1.4578005115089514,
      "grad_norm": 152.98487854003906,
      "learning_rate": 4.715048975957257e-05,
      "loss": 5.2992,
      "step": 1140
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 35.46538162231445,
      "learning_rate": 4.7105966162065897e-05,
      "loss": 4.7582,
      "step": 1150
    },
    {
      "epoch": 1.4833759590792839,
      "grad_norm": 667.7198486328125,
      "learning_rate": 4.7061442564559214e-05,
      "loss": 5.2354,
      "step": 1160
    },
    {
      "epoch": 1.49616368286445,
      "grad_norm": 36.09316635131836,
      "learning_rate": 4.7016918967052545e-05,
      "loss": 4.4023,
      "step": 1170
    },
    {
      "epoch": 1.5089514066496164,
      "grad_norm": 24.274316787719727,
      "learning_rate": 4.697239536954586e-05,
      "loss": 4.3065,
      "step": 1180
    },
    {
      "epoch": 1.5217391304347827,
      "grad_norm": 76.46308898925781,
      "learning_rate": 4.692787177203919e-05,
      "loss": 3.7555,
      "step": 1190
    },
    {
      "epoch": 1.5345268542199488,
      "grad_norm": 49.68375015258789,
      "learning_rate": 4.6883348174532504e-05,
      "loss": 4.5423,
      "step": 1200
    },
    {
      "epoch": 1.547314578005115,
      "grad_norm": 41.64923858642578,
      "learning_rate": 4.683882457702582e-05,
      "loss": 4.1702,
      "step": 1210
    },
    {
      "epoch": 1.5601023017902813,
      "grad_norm": 296.3228759765625,
      "learning_rate": 4.6794300979519146e-05,
      "loss": 3.4573,
      "step": 1220
    },
    {
      "epoch": 1.5728900255754477,
      "grad_norm": 338.4577331542969,
      "learning_rate": 4.6749777382012464e-05,
      "loss": 3.5353,
      "step": 1230
    },
    {
      "epoch": 1.5856777493606138,
      "grad_norm": 28.94730567932129,
      "learning_rate": 4.6705253784505795e-05,
      "loss": 3.1779,
      "step": 1240
    },
    {
      "epoch": 1.59846547314578,
      "grad_norm": 132.22483825683594,
      "learning_rate": 4.666073018699911e-05,
      "loss": 2.745,
      "step": 1250
    },
    {
      "epoch": 1.6112531969309463,
      "grad_norm": 63.76390838623047,
      "learning_rate": 4.6616206589492436e-05,
      "loss": 3.2324,
      "step": 1260
    },
    {
      "epoch": 1.6240409207161126,
      "grad_norm": 24.974475860595703,
      "learning_rate": 4.6571682991985754e-05,
      "loss": 2.8275,
      "step": 1270
    },
    {
      "epoch": 1.6368286445012787,
      "grad_norm": 42.1992301940918,
      "learning_rate": 4.652715939447908e-05,
      "loss": 2.9965,
      "step": 1280
    },
    {
      "epoch": 1.6496163682864449,
      "grad_norm": 55.832916259765625,
      "learning_rate": 4.6482635796972396e-05,
      "loss": 2.7633,
      "step": 1290
    },
    {
      "epoch": 1.6624040920716112,
      "grad_norm": 21.454418182373047,
      "learning_rate": 4.643811219946571e-05,
      "loss": 3.3381,
      "step": 1300
    },
    {
      "epoch": 1.6751918158567776,
      "grad_norm": 186.13711547851562,
      "learning_rate": 4.6393588601959044e-05,
      "loss": 3.3175,
      "step": 1310
    },
    {
      "epoch": 1.6879795396419437,
      "grad_norm": 43.50181579589844,
      "learning_rate": 4.634906500445236e-05,
      "loss": 2.947,
      "step": 1320
    },
    {
      "epoch": 1.7007672634271098,
      "grad_norm": 280.3487854003906,
      "learning_rate": 4.6304541406945686e-05,
      "loss": 2.417,
      "step": 1330
    },
    {
      "epoch": 1.7135549872122762,
      "grad_norm": 13.333784103393555,
      "learning_rate": 4.6260017809439003e-05,
      "loss": 2.0575,
      "step": 1340
    },
    {
      "epoch": 1.7263427109974425,
      "grad_norm": 20.713420867919922,
      "learning_rate": 4.621549421193233e-05,
      "loss": 2.3133,
      "step": 1350
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 200.07363891601562,
      "learning_rate": 4.6170970614425645e-05,
      "loss": 2.5783,
      "step": 1360
    },
    {
      "epoch": 1.7519181585677748,
      "grad_norm": 27.835031509399414,
      "learning_rate": 4.612644701691897e-05,
      "loss": 2.5367,
      "step": 1370
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 25.559825897216797,
      "learning_rate": 4.6081923419412294e-05,
      "loss": 1.879,
      "step": 1380
    },
    {
      "epoch": 1.7774936061381075,
      "grad_norm": 133.1987762451172,
      "learning_rate": 4.603739982190561e-05,
      "loss": 1.8325,
      "step": 1390
    },
    {
      "epoch": 1.7902813299232738,
      "grad_norm": 25.337984085083008,
      "learning_rate": 4.5992876224398935e-05,
      "loss": 1.7651,
      "step": 1400
    },
    {
      "epoch": 1.80306905370844,
      "grad_norm": 23.645824432373047,
      "learning_rate": 4.594835262689225e-05,
      "loss": 2.1557,
      "step": 1410
    },
    {
      "epoch": 1.815856777493606,
      "grad_norm": 6.187915325164795,
      "learning_rate": 4.590382902938558e-05,
      "loss": 1.8122,
      "step": 1420
    },
    {
      "epoch": 1.8286445012787724,
      "grad_norm": 32.88364791870117,
      "learning_rate": 4.5859305431878895e-05,
      "loss": 2.1533,
      "step": 1430
    },
    {
      "epoch": 1.8414322250639388,
      "grad_norm": 12.379995346069336,
      "learning_rate": 4.581478183437222e-05,
      "loss": 2.1713,
      "step": 1440
    },
    {
      "epoch": 1.854219948849105,
      "grad_norm": 13.056475639343262,
      "learning_rate": 4.577025823686554e-05,
      "loss": 1.5537,
      "step": 1450
    },
    {
      "epoch": 1.867007672634271,
      "grad_norm": 25.982215881347656,
      "learning_rate": 4.572573463935886e-05,
      "loss": 1.92,
      "step": 1460
    },
    {
      "epoch": 1.8797953964194374,
      "grad_norm": 17.912216186523438,
      "learning_rate": 4.5681211041852185e-05,
      "loss": 1.4743,
      "step": 1470
    },
    {
      "epoch": 1.8925831202046037,
      "grad_norm": 141.81936645507812,
      "learning_rate": 4.56366874443455e-05,
      "loss": 1.7076,
      "step": 1480
    },
    {
      "epoch": 1.9053708439897699,
      "grad_norm": 56.60566711425781,
      "learning_rate": 4.559216384683883e-05,
      "loss": 1.9273,
      "step": 1490
    },
    {
      "epoch": 1.918158567774936,
      "grad_norm": 21.652421951293945,
      "learning_rate": 4.5547640249332144e-05,
      "loss": 1.4124,
      "step": 1500
    },
    {
      "epoch": 1.9309462915601023,
      "grad_norm": 8.854305267333984,
      "learning_rate": 4.550311665182547e-05,
      "loss": 1.4905,
      "step": 1510
    },
    {
      "epoch": 1.9437340153452687,
      "grad_norm": 4.608613014221191,
      "learning_rate": 4.545859305431879e-05,
      "loss": 2.2376,
      "step": 1520
    },
    {
      "epoch": 1.9565217391304348,
      "grad_norm": 16.537403106689453,
      "learning_rate": 4.541406945681212e-05,
      "loss": 1.0597,
      "step": 1530
    },
    {
      "epoch": 1.969309462915601,
      "grad_norm": 6.274345874786377,
      "learning_rate": 4.5369545859305434e-05,
      "loss": 1.8588,
      "step": 1540
    },
    {
      "epoch": 1.9820971867007673,
      "grad_norm": 38.86388397216797,
      "learning_rate": 4.532502226179875e-05,
      "loss": 1.4454,
      "step": 1550
    },
    {
      "epoch": 1.9948849104859336,
      "grad_norm": 56.21317672729492,
      "learning_rate": 4.5280498664292076e-05,
      "loss": 1.9605,
      "step": 1560
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.8149464726448059,
      "eval_runtime": 0.972,
      "eval_samples_per_second": 100.82,
      "eval_steps_per_second": 13.374,
      "step": 1564
    },
    {
      "epoch": 2.0076726342710995,
      "grad_norm": 5.580387592315674,
      "learning_rate": 4.5235975066785394e-05,
      "loss": 2.1328,
      "step": 1570
    },
    {
      "epoch": 2.020460358056266,
      "grad_norm": 7.484273433685303,
      "learning_rate": 4.519145146927872e-05,
      "loss": 1.5055,
      "step": 1580
    },
    {
      "epoch": 2.0332480818414322,
      "grad_norm": 39.16282653808594,
      "learning_rate": 4.514692787177204e-05,
      "loss": 1.8466,
      "step": 1590
    },
    {
      "epoch": 2.0460358056265986,
      "grad_norm": 45.915321350097656,
      "learning_rate": 4.5102404274265367e-05,
      "loss": 2.0529,
      "step": 1600
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 12.101923942565918,
      "learning_rate": 4.5057880676758684e-05,
      "loss": 1.6502,
      "step": 1610
    },
    {
      "epoch": 2.071611253196931,
      "grad_norm": 10.51504898071289,
      "learning_rate": 4.501335707925201e-05,
      "loss": 1.5942,
      "step": 1620
    },
    {
      "epoch": 2.084398976982097,
      "grad_norm": 2.8996851444244385,
      "learning_rate": 4.4968833481745326e-05,
      "loss": 1.6974,
      "step": 1630
    },
    {
      "epoch": 2.0971867007672635,
      "grad_norm": 10.019527435302734,
      "learning_rate": 4.492430988423865e-05,
      "loss": 1.6031,
      "step": 1640
    },
    {
      "epoch": 2.10997442455243,
      "grad_norm": 18.028640747070312,
      "learning_rate": 4.487978628673197e-05,
      "loss": 2.0213,
      "step": 1650
    },
    {
      "epoch": 2.122762148337596,
      "grad_norm": 19.63820457458496,
      "learning_rate": 4.483526268922529e-05,
      "loss": 1.9554,
      "step": 1660
    },
    {
      "epoch": 2.135549872122762,
      "grad_norm": 97.62370300292969,
      "learning_rate": 4.4790739091718616e-05,
      "loss": 1.0171,
      "step": 1670
    },
    {
      "epoch": 2.1483375959079285,
      "grad_norm": 26.990764617919922,
      "learning_rate": 4.4746215494211934e-05,
      "loss": 1.763,
      "step": 1680
    },
    {
      "epoch": 2.1611253196930944,
      "grad_norm": 8.061385154724121,
      "learning_rate": 4.470169189670526e-05,
      "loss": 1.4569,
      "step": 1690
    },
    {
      "epoch": 2.1739130434782608,
      "grad_norm": 8.058719635009766,
      "learning_rate": 4.4657168299198575e-05,
      "loss": 1.6015,
      "step": 1700
    },
    {
      "epoch": 2.186700767263427,
      "grad_norm": 16.712255477905273,
      "learning_rate": 4.46126447016919e-05,
      "loss": 1.2064,
      "step": 1710
    },
    {
      "epoch": 2.1994884910485935,
      "grad_norm": 9.311502456665039,
      "learning_rate": 4.456812110418522e-05,
      "loss": 2.1118,
      "step": 1720
    },
    {
      "epoch": 2.21227621483376,
      "grad_norm": 8.408000946044922,
      "learning_rate": 4.452359750667854e-05,
      "loss": 1.6535,
      "step": 1730
    },
    {
      "epoch": 2.2250639386189257,
      "grad_norm": 18.537572860717773,
      "learning_rate": 4.4479073909171866e-05,
      "loss": 1.2816,
      "step": 1740
    },
    {
      "epoch": 2.237851662404092,
      "grad_norm": 17.14375114440918,
      "learning_rate": 4.443455031166518e-05,
      "loss": 0.7858,
      "step": 1750
    },
    {
      "epoch": 2.2506393861892584,
      "grad_norm": 6.235766410827637,
      "learning_rate": 4.439002671415851e-05,
      "loss": 1.2958,
      "step": 1760
    },
    {
      "epoch": 2.2634271099744243,
      "grad_norm": 16.74968719482422,
      "learning_rate": 4.4345503116651825e-05,
      "loss": 0.9718,
      "step": 1770
    },
    {
      "epoch": 2.2762148337595907,
      "grad_norm": 17.026458740234375,
      "learning_rate": 4.430097951914515e-05,
      "loss": 2.1236,
      "step": 1780
    },
    {
      "epoch": 2.289002557544757,
      "grad_norm": 4.0173516273498535,
      "learning_rate": 4.4256455921638467e-05,
      "loss": 1.5366,
      "step": 1790
    },
    {
      "epoch": 2.3017902813299234,
      "grad_norm": 10.223082542419434,
      "learning_rate": 4.421193232413179e-05,
      "loss": 0.8786,
      "step": 1800
    },
    {
      "epoch": 2.3145780051150897,
      "grad_norm": 2.731196880340576,
      "learning_rate": 4.4167408726625115e-05,
      "loss": 2.3721,
      "step": 1810
    },
    {
      "epoch": 2.3273657289002556,
      "grad_norm": 10.35998821258545,
      "learning_rate": 4.412288512911843e-05,
      "loss": 1.5431,
      "step": 1820
    },
    {
      "epoch": 2.340153452685422,
      "grad_norm": 12.369463920593262,
      "learning_rate": 4.407836153161176e-05,
      "loss": 1.5767,
      "step": 1830
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 4.78343391418457,
      "learning_rate": 4.4033837934105074e-05,
      "loss": 1.7491,
      "step": 1840
    },
    {
      "epoch": 2.3657289002557547,
      "grad_norm": 6.811643123626709,
      "learning_rate": 4.39893143365984e-05,
      "loss": 0.9933,
      "step": 1850
    },
    {
      "epoch": 2.3785166240409206,
      "grad_norm": 16.536911010742188,
      "learning_rate": 4.3944790739091716e-05,
      "loss": 1.2678,
      "step": 1860
    },
    {
      "epoch": 2.391304347826087,
      "grad_norm": 7.726188659667969,
      "learning_rate": 4.390026714158505e-05,
      "loss": 1.6206,
      "step": 1870
    },
    {
      "epoch": 2.4040920716112533,
      "grad_norm": 22.88146209716797,
      "learning_rate": 4.3855743544078365e-05,
      "loss": 1.3968,
      "step": 1880
    },
    {
      "epoch": 2.4168797953964196,
      "grad_norm": 4.256078243255615,
      "learning_rate": 4.381121994657169e-05,
      "loss": 1.2646,
      "step": 1890
    },
    {
      "epoch": 2.4296675191815855,
      "grad_norm": 39.807491302490234,
      "learning_rate": 4.3766696349065006e-05,
      "loss": 1.8832,
      "step": 1900
    },
    {
      "epoch": 2.442455242966752,
      "grad_norm": 4.80452299118042,
      "learning_rate": 4.3722172751558324e-05,
      "loss": 1.6629,
      "step": 1910
    },
    {
      "epoch": 2.455242966751918,
      "grad_norm": 13.139750480651855,
      "learning_rate": 4.367764915405165e-05,
      "loss": 1.125,
      "step": 1920
    },
    {
      "epoch": 2.4680306905370846,
      "grad_norm": 14.896947860717773,
      "learning_rate": 4.363312555654497e-05,
      "loss": 1.202,
      "step": 1930
    },
    {
      "epoch": 2.4808184143222505,
      "grad_norm": 6.931974411010742,
      "learning_rate": 4.35886019590383e-05,
      "loss": 1.3819,
      "step": 1940
    },
    {
      "epoch": 2.493606138107417,
      "grad_norm": 17.391084671020508,
      "learning_rate": 4.3544078361531614e-05,
      "loss": 1.5623,
      "step": 1950
    },
    {
      "epoch": 2.506393861892583,
      "grad_norm": 7.759119033813477,
      "learning_rate": 4.349955476402494e-05,
      "loss": 0.8815,
      "step": 1960
    },
    {
      "epoch": 2.5191815856777495,
      "grad_norm": 2.0047388076782227,
      "learning_rate": 4.3455031166518256e-05,
      "loss": 1.0101,
      "step": 1970
    },
    {
      "epoch": 2.531969309462916,
      "grad_norm": 8.238865852355957,
      "learning_rate": 4.341050756901158e-05,
      "loss": 1.0075,
      "step": 1980
    },
    {
      "epoch": 2.544757033248082,
      "grad_norm": 7.979090213775635,
      "learning_rate": 4.33659839715049e-05,
      "loss": 0.8694,
      "step": 1990
    },
    {
      "epoch": 2.557544757033248,
      "grad_norm": 3.4456515312194824,
      "learning_rate": 4.332146037399822e-05,
      "loss": 0.7287,
      "step": 2000
    },
    {
      "epoch": 2.5703324808184145,
      "grad_norm": 8.840888977050781,
      "learning_rate": 4.3276936776491546e-05,
      "loss": 1.2007,
      "step": 2010
    },
    {
      "epoch": 2.5831202046035804,
      "grad_norm": 17.507421493530273,
      "learning_rate": 4.3232413178984864e-05,
      "loss": 0.9524,
      "step": 2020
    },
    {
      "epoch": 2.5959079283887467,
      "grad_norm": 4.879913806915283,
      "learning_rate": 4.318788958147819e-05,
      "loss": 0.9294,
      "step": 2030
    },
    {
      "epoch": 2.608695652173913,
      "grad_norm": 15.94214153289795,
      "learning_rate": 4.3143365983971505e-05,
      "loss": 1.5526,
      "step": 2040
    },
    {
      "epoch": 2.6214833759590794,
      "grad_norm": 10.489286422729492,
      "learning_rate": 4.309884238646483e-05,
      "loss": 0.6439,
      "step": 2050
    },
    {
      "epoch": 2.634271099744246,
      "grad_norm": 15.586228370666504,
      "learning_rate": 4.305431878895815e-05,
      "loss": 1.0779,
      "step": 2060
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 2.084414005279541,
      "learning_rate": 4.300979519145147e-05,
      "loss": 1.169,
      "step": 2070
    },
    {
      "epoch": 2.659846547314578,
      "grad_norm": 15.87989330291748,
      "learning_rate": 4.2965271593944796e-05,
      "loss": 1.0841,
      "step": 2080
    },
    {
      "epoch": 2.6726342710997444,
      "grad_norm": 9.806713104248047,
      "learning_rate": 4.292074799643811e-05,
      "loss": 1.2079,
      "step": 2090
    },
    {
      "epoch": 2.6854219948849103,
      "grad_norm": 22.467456817626953,
      "learning_rate": 4.287622439893144e-05,
      "loss": 1.3484,
      "step": 2100
    },
    {
      "epoch": 2.6982097186700766,
      "grad_norm": 10.053953170776367,
      "learning_rate": 4.2831700801424755e-05,
      "loss": 0.7977,
      "step": 2110
    },
    {
      "epoch": 2.710997442455243,
      "grad_norm": 23.656936645507812,
      "learning_rate": 4.278717720391808e-05,
      "loss": 1.2581,
      "step": 2120
    },
    {
      "epoch": 2.7237851662404093,
      "grad_norm": 10.244109153747559,
      "learning_rate": 4.27426536064114e-05,
      "loss": 1.218,
      "step": 2130
    },
    {
      "epoch": 2.7365728900255757,
      "grad_norm": 6.62468957901001,
      "learning_rate": 4.269813000890472e-05,
      "loss": 0.7606,
      "step": 2140
    },
    {
      "epoch": 2.7493606138107416,
      "grad_norm": 2.3720364570617676,
      "learning_rate": 4.2653606411398045e-05,
      "loss": 0.582,
      "step": 2150
    },
    {
      "epoch": 2.762148337595908,
      "grad_norm": 18.96427345275879,
      "learning_rate": 4.260908281389136e-05,
      "loss": 1.1205,
      "step": 2160
    },
    {
      "epoch": 2.7749360613810743,
      "grad_norm": 5.456460475921631,
      "learning_rate": 4.256455921638469e-05,
      "loss": 1.0613,
      "step": 2170
    },
    {
      "epoch": 2.78772378516624,
      "grad_norm": 6.44590950012207,
      "learning_rate": 4.2520035618878004e-05,
      "loss": 1.4278,
      "step": 2180
    },
    {
      "epoch": 2.8005115089514065,
      "grad_norm": 13.97479248046875,
      "learning_rate": 4.247551202137133e-05,
      "loss": 1.2871,
      "step": 2190
    },
    {
      "epoch": 2.813299232736573,
      "grad_norm": 7.399073123931885,
      "learning_rate": 4.2430988423864646e-05,
      "loss": 0.867,
      "step": 2200
    },
    {
      "epoch": 2.8260869565217392,
      "grad_norm": 3.542203187942505,
      "learning_rate": 4.238646482635798e-05,
      "loss": 0.7119,
      "step": 2210
    },
    {
      "epoch": 2.8388746803069056,
      "grad_norm": 1.8208072185516357,
      "learning_rate": 4.2341941228851295e-05,
      "loss": 1.0378,
      "step": 2220
    },
    {
      "epoch": 2.8516624040920715,
      "grad_norm": 11.78288745880127,
      "learning_rate": 4.229741763134462e-05,
      "loss": 1.0857,
      "step": 2230
    },
    {
      "epoch": 2.864450127877238,
      "grad_norm": 2.7360832691192627,
      "learning_rate": 4.2252894033837936e-05,
      "loss": 0.7267,
      "step": 2240
    },
    {
      "epoch": 2.877237851662404,
      "grad_norm": 17.203723907470703,
      "learning_rate": 4.2208370436331254e-05,
      "loss": 1.222,
      "step": 2250
    },
    {
      "epoch": 2.89002557544757,
      "grad_norm": 11.256026268005371,
      "learning_rate": 4.216384683882458e-05,
      "loss": 1.175,
      "step": 2260
    },
    {
      "epoch": 2.9028132992327365,
      "grad_norm": 8.434975624084473,
      "learning_rate": 4.2119323241317896e-05,
      "loss": 0.8133,
      "step": 2270
    },
    {
      "epoch": 2.915601023017903,
      "grad_norm": 8.774354934692383,
      "learning_rate": 4.207479964381123e-05,
      "loss": 0.9086,
      "step": 2280
    },
    {
      "epoch": 2.928388746803069,
      "grad_norm": 6.846632480621338,
      "learning_rate": 4.2030276046304544e-05,
      "loss": 1.4552,
      "step": 2290
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 11.470861434936523,
      "learning_rate": 4.198575244879787e-05,
      "loss": 1.0146,
      "step": 2300
    },
    {
      "epoch": 2.9539641943734014,
      "grad_norm": 14.26189136505127,
      "learning_rate": 4.1941228851291186e-05,
      "loss": 0.9995,
      "step": 2310
    },
    {
      "epoch": 2.9667519181585678,
      "grad_norm": 6.346394062042236,
      "learning_rate": 4.189670525378451e-05,
      "loss": 1.0182,
      "step": 2320
    },
    {
      "epoch": 2.979539641943734,
      "grad_norm": 3.880919933319092,
      "learning_rate": 4.185218165627783e-05,
      "loss": 0.4778,
      "step": 2330
    },
    {
      "epoch": 2.9923273657289,
      "grad_norm": 19.28879737854004,
      "learning_rate": 4.180765805877115e-05,
      "loss": 1.0621,
      "step": 2340
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.46207094192504883,
      "eval_runtime": 0.9749,
      "eval_samples_per_second": 100.52,
      "eval_steps_per_second": 13.334,
      "step": 2346
    },
    {
      "epoch": 3.0051150895140664,
      "grad_norm": 12.311324119567871,
      "learning_rate": 4.1763134461264476e-05,
      "loss": 1.1394,
      "step": 2350
    },
    {
      "epoch": 3.0179028132992327,
      "grad_norm": 6.373651027679443,
      "learning_rate": 4.1718610863757794e-05,
      "loss": 1.3079,
      "step": 2360
    },
    {
      "epoch": 3.030690537084399,
      "grad_norm": 20.532047271728516,
      "learning_rate": 4.167408726625112e-05,
      "loss": 1.5355,
      "step": 2370
    },
    {
      "epoch": 3.0434782608695654,
      "grad_norm": 21.725034713745117,
      "learning_rate": 4.1629563668744435e-05,
      "loss": 0.7584,
      "step": 2380
    },
    {
      "epoch": 3.0562659846547313,
      "grad_norm": 0.31800711154937744,
      "learning_rate": 4.158504007123776e-05,
      "loss": 0.7678,
      "step": 2390
    },
    {
      "epoch": 3.0690537084398977,
      "grad_norm": 7.7792744636535645,
      "learning_rate": 4.154051647373108e-05,
      "loss": 0.876,
      "step": 2400
    },
    {
      "epoch": 3.081841432225064,
      "grad_norm": 0.5140416026115417,
      "learning_rate": 4.14959928762244e-05,
      "loss": 1.2692,
      "step": 2410
    },
    {
      "epoch": 3.0946291560102304,
      "grad_norm": 2.118436813354492,
      "learning_rate": 4.1451469278717726e-05,
      "loss": 0.463,
      "step": 2420
    },
    {
      "epoch": 3.1074168797953963,
      "grad_norm": 10.993577003479004,
      "learning_rate": 4.140694568121104e-05,
      "loss": 0.5364,
      "step": 2430
    },
    {
      "epoch": 3.1202046035805626,
      "grad_norm": 3.5870437622070312,
      "learning_rate": 4.136242208370437e-05,
      "loss": 1.6276,
      "step": 2440
    },
    {
      "epoch": 3.132992327365729,
      "grad_norm": 83.34903717041016,
      "learning_rate": 4.1317898486197685e-05,
      "loss": 0.8779,
      "step": 2450
    },
    {
      "epoch": 3.1457800511508953,
      "grad_norm": 7.2245941162109375,
      "learning_rate": 4.127337488869101e-05,
      "loss": 1.0543,
      "step": 2460
    },
    {
      "epoch": 3.1585677749360612,
      "grad_norm": 2.145303249359131,
      "learning_rate": 4.122885129118433e-05,
      "loss": 1.2492,
      "step": 2470
    },
    {
      "epoch": 3.1713554987212276,
      "grad_norm": 10.122068405151367,
      "learning_rate": 4.118432769367765e-05,
      "loss": 1.0897,
      "step": 2480
    },
    {
      "epoch": 3.184143222506394,
      "grad_norm": 10.550411224365234,
      "learning_rate": 4.1139804096170975e-05,
      "loss": 0.8078,
      "step": 2490
    },
    {
      "epoch": 3.1969309462915603,
      "grad_norm": 1.527519941329956,
      "learning_rate": 4.109528049866429e-05,
      "loss": 0.4696,
      "step": 2500
    },
    {
      "epoch": 3.209718670076726,
      "grad_norm": 11.756329536437988,
      "learning_rate": 4.105075690115762e-05,
      "loss": 0.7907,
      "step": 2510
    },
    {
      "epoch": 3.2225063938618925,
      "grad_norm": 14.163141250610352,
      "learning_rate": 4.1006233303650935e-05,
      "loss": 0.6561,
      "step": 2520
    },
    {
      "epoch": 3.235294117647059,
      "grad_norm": 1.5355645418167114,
      "learning_rate": 4.096170970614426e-05,
      "loss": 0.3889,
      "step": 2530
    },
    {
      "epoch": 3.2480818414322252,
      "grad_norm": 2.9927313327789307,
      "learning_rate": 4.0917186108637576e-05,
      "loss": 0.9113,
      "step": 2540
    },
    {
      "epoch": 3.260869565217391,
      "grad_norm": 3.701603889465332,
      "learning_rate": 4.08726625111309e-05,
      "loss": 0.7163,
      "step": 2550
    },
    {
      "epoch": 3.2736572890025575,
      "grad_norm": 2.665637731552124,
      "learning_rate": 4.0828138913624225e-05,
      "loss": 0.7217,
      "step": 2560
    },
    {
      "epoch": 3.286445012787724,
      "grad_norm": 4.589602947235107,
      "learning_rate": 4.078361531611755e-05,
      "loss": 1.2966,
      "step": 2570
    },
    {
      "epoch": 3.29923273657289,
      "grad_norm": 129.0029754638672,
      "learning_rate": 4.0739091718610867e-05,
      "loss": 0.6644,
      "step": 2580
    },
    {
      "epoch": 3.312020460358056,
      "grad_norm": 5.486814498901367,
      "learning_rate": 4.0694568121104184e-05,
      "loss": 0.4016,
      "step": 2590
    },
    {
      "epoch": 3.3248081841432224,
      "grad_norm": 0.4740515351295471,
      "learning_rate": 4.065004452359751e-05,
      "loss": 0.9797,
      "step": 2600
    },
    {
      "epoch": 3.337595907928389,
      "grad_norm": 11.610332489013672,
      "learning_rate": 4.0605520926090826e-05,
      "loss": 1.2847,
      "step": 2610
    },
    {
      "epoch": 3.350383631713555,
      "grad_norm": 99.53129577636719,
      "learning_rate": 4.056099732858415e-05,
      "loss": 1.3681,
      "step": 2620
    },
    {
      "epoch": 3.363171355498721,
      "grad_norm": 15.345809936523438,
      "learning_rate": 4.0516473731077474e-05,
      "loss": 1.4627,
      "step": 2630
    },
    {
      "epoch": 3.3759590792838874,
      "grad_norm": 3.9375596046447754,
      "learning_rate": 4.04719501335708e-05,
      "loss": 0.8383,
      "step": 2640
    },
    {
      "epoch": 3.3887468030690537,
      "grad_norm": 14.221712112426758,
      "learning_rate": 4.0427426536064116e-05,
      "loss": 0.841,
      "step": 2650
    },
    {
      "epoch": 3.40153452685422,
      "grad_norm": 9.657092094421387,
      "learning_rate": 4.038290293855744e-05,
      "loss": 0.7076,
      "step": 2660
    },
    {
      "epoch": 3.414322250639386,
      "grad_norm": 15.229437828063965,
      "learning_rate": 4.033837934105076e-05,
      "loss": 0.7533,
      "step": 2670
    },
    {
      "epoch": 3.4271099744245523,
      "grad_norm": 3.810375928878784,
      "learning_rate": 4.029385574354408e-05,
      "loss": 0.9037,
      "step": 2680
    },
    {
      "epoch": 3.4398976982097187,
      "grad_norm": 3.3026020526885986,
      "learning_rate": 4.02493321460374e-05,
      "loss": 0.2881,
      "step": 2690
    },
    {
      "epoch": 3.452685421994885,
      "grad_norm": 4.372268199920654,
      "learning_rate": 4.0204808548530724e-05,
      "loss": 0.7749,
      "step": 2700
    },
    {
      "epoch": 3.4654731457800514,
      "grad_norm": 11.79289722442627,
      "learning_rate": 4.016028495102405e-05,
      "loss": 1.1843,
      "step": 2710
    },
    {
      "epoch": 3.4782608695652173,
      "grad_norm": 6.681139945983887,
      "learning_rate": 4.0115761353517366e-05,
      "loss": 1.1241,
      "step": 2720
    },
    {
      "epoch": 3.4910485933503836,
      "grad_norm": 13.276257514953613,
      "learning_rate": 4.007123775601069e-05,
      "loss": 1.1697,
      "step": 2730
    },
    {
      "epoch": 3.50383631713555,
      "grad_norm": 4.372821807861328,
      "learning_rate": 4.002671415850401e-05,
      "loss": 0.7218,
      "step": 2740
    },
    {
      "epoch": 3.516624040920716,
      "grad_norm": 2.8409574031829834,
      "learning_rate": 3.998219056099733e-05,
      "loss": 0.813,
      "step": 2750
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 1.350940465927124,
      "learning_rate": 3.993766696349065e-05,
      "loss": 1.0097,
      "step": 2760
    },
    {
      "epoch": 3.5421994884910486,
      "grad_norm": 12.746123313903809,
      "learning_rate": 3.989314336598397e-05,
      "loss": 0.8977,
      "step": 2770
    },
    {
      "epoch": 3.554987212276215,
      "grad_norm": 5.898983001708984,
      "learning_rate": 3.98486197684773e-05,
      "loss": 0.8431,
      "step": 2780
    },
    {
      "epoch": 3.5677749360613813,
      "grad_norm": 11.514519691467285,
      "learning_rate": 3.9804096170970615e-05,
      "loss": 1.2064,
      "step": 2790
    },
    {
      "epoch": 3.580562659846547,
      "grad_norm": 2.989314556121826,
      "learning_rate": 3.975957257346394e-05,
      "loss": 0.8261,
      "step": 2800
    },
    {
      "epoch": 3.5933503836317136,
      "grad_norm": 16.665599822998047,
      "learning_rate": 3.971504897595726e-05,
      "loss": 0.832,
      "step": 2810
    },
    {
      "epoch": 3.60613810741688,
      "grad_norm": 0.406387597322464,
      "learning_rate": 3.967052537845058e-05,
      "loss": 0.3909,
      "step": 2820
    },
    {
      "epoch": 3.618925831202046,
      "grad_norm": 2.7753970623016357,
      "learning_rate": 3.96260017809439e-05,
      "loss": 0.5041,
      "step": 2830
    },
    {
      "epoch": 3.631713554987212,
      "grad_norm": 1.3813972473144531,
      "learning_rate": 3.958147818343722e-05,
      "loss": 0.7192,
      "step": 2840
    },
    {
      "epoch": 3.6445012787723785,
      "grad_norm": 2.6115665435791016,
      "learning_rate": 3.953695458593055e-05,
      "loss": 1.1741,
      "step": 2850
    },
    {
      "epoch": 3.657289002557545,
      "grad_norm": 9.115361213684082,
      "learning_rate": 3.9492430988423865e-05,
      "loss": 0.6617,
      "step": 2860
    },
    {
      "epoch": 3.670076726342711,
      "grad_norm": 6.27280330657959,
      "learning_rate": 3.944790739091719e-05,
      "loss": 1.1405,
      "step": 2870
    },
    {
      "epoch": 3.682864450127877,
      "grad_norm": 0.9005927443504333,
      "learning_rate": 3.9403383793410506e-05,
      "loss": 0.8749,
      "step": 2880
    },
    {
      "epoch": 3.6956521739130435,
      "grad_norm": 1.6038532257080078,
      "learning_rate": 3.935886019590383e-05,
      "loss": 1.1794,
      "step": 2890
    },
    {
      "epoch": 3.70843989769821,
      "grad_norm": 7.583934307098389,
      "learning_rate": 3.931433659839715e-05,
      "loss": 0.6017,
      "step": 2900
    },
    {
      "epoch": 3.7212276214833757,
      "grad_norm": 0.9503026008605957,
      "learning_rate": 3.926981300089048e-05,
      "loss": 0.5053,
      "step": 2910
    },
    {
      "epoch": 3.734015345268542,
      "grad_norm": 7.907811164855957,
      "learning_rate": 3.92252894033838e-05,
      "loss": 1.3511,
      "step": 2920
    },
    {
      "epoch": 3.7468030690537084,
      "grad_norm": 10.489314079284668,
      "learning_rate": 3.918076580587712e-05,
      "loss": 0.508,
      "step": 2930
    },
    {
      "epoch": 3.7595907928388748,
      "grad_norm": 7.821562767028809,
      "learning_rate": 3.913624220837044e-05,
      "loss": 1.0142,
      "step": 2940
    },
    {
      "epoch": 3.772378516624041,
      "grad_norm": 12.84817123413086,
      "learning_rate": 3.9091718610863756e-05,
      "loss": 0.9375,
      "step": 2950
    },
    {
      "epoch": 3.785166240409207,
      "grad_norm": 2.9846205711364746,
      "learning_rate": 3.904719501335708e-05,
      "loss": 0.8996,
      "step": 2960
    },
    {
      "epoch": 3.7979539641943734,
      "grad_norm": 9.234708786010742,
      "learning_rate": 3.90026714158504e-05,
      "loss": 1.1906,
      "step": 2970
    },
    {
      "epoch": 3.8107416879795397,
      "grad_norm": 1.4158365726470947,
      "learning_rate": 3.895814781834373e-05,
      "loss": 1.1976,
      "step": 2980
    },
    {
      "epoch": 3.8235294117647056,
      "grad_norm": 8.958301544189453,
      "learning_rate": 3.8913624220837046e-05,
      "loss": 0.4074,
      "step": 2990
    },
    {
      "epoch": 3.836317135549872,
      "grad_norm": 8.71174144744873,
      "learning_rate": 3.886910062333037e-05,
      "loss": 0.5862,
      "step": 3000
    },
    {
      "epoch": 3.8491048593350383,
      "grad_norm": 0.366887629032135,
      "learning_rate": 3.882457702582369e-05,
      "loss": 0.3793,
      "step": 3010
    },
    {
      "epoch": 3.8618925831202047,
      "grad_norm": 6.718595504760742,
      "learning_rate": 3.878005342831701e-05,
      "loss": 0.5965,
      "step": 3020
    },
    {
      "epoch": 3.874680306905371,
      "grad_norm": 1.7599328756332397,
      "learning_rate": 3.873552983081033e-05,
      "loss": 0.7439,
      "step": 3030
    },
    {
      "epoch": 3.887468030690537,
      "grad_norm": 4.962011337280273,
      "learning_rate": 3.869100623330365e-05,
      "loss": 1.2969,
      "step": 3040
    },
    {
      "epoch": 3.9002557544757033,
      "grad_norm": 6.866610527038574,
      "learning_rate": 3.864648263579698e-05,
      "loss": 0.7168,
      "step": 3050
    },
    {
      "epoch": 3.9130434782608696,
      "grad_norm": 0.8201662302017212,
      "learning_rate": 3.8601959038290296e-05,
      "loss": 0.2705,
      "step": 3060
    },
    {
      "epoch": 3.9258312020460355,
      "grad_norm": 0.8354922533035278,
      "learning_rate": 3.855743544078362e-05,
      "loss": 0.7996,
      "step": 3070
    },
    {
      "epoch": 3.938618925831202,
      "grad_norm": 2.216919422149658,
      "learning_rate": 3.851291184327694e-05,
      "loss": 0.6187,
      "step": 3080
    },
    {
      "epoch": 3.9514066496163682,
      "grad_norm": 9.284915924072266,
      "learning_rate": 3.846838824577026e-05,
      "loss": 0.7844,
      "step": 3090
    },
    {
      "epoch": 3.9641943734015346,
      "grad_norm": 3.3350045680999756,
      "learning_rate": 3.842386464826358e-05,
      "loss": 0.9315,
      "step": 3100
    },
    {
      "epoch": 3.976982097186701,
      "grad_norm": 12.905816078186035,
      "learning_rate": 3.8379341050756903e-05,
      "loss": 0.8972,
      "step": 3110
    },
    {
      "epoch": 3.9897698209718673,
      "grad_norm": 2.008113384246826,
      "learning_rate": 3.833481745325023e-05,
      "loss": 0.6918,
      "step": 3120
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.356393963098526,
      "eval_runtime": 0.9935,
      "eval_samples_per_second": 98.639,
      "eval_steps_per_second": 13.085,
      "step": 3128
    },
    {
      "epoch": 4.002557544757034,
      "grad_norm": 5.132030487060547,
      "learning_rate": 3.8290293855743545e-05,
      "loss": 0.8732,
      "step": 3130
    },
    {
      "epoch": 4.015345268542199,
      "grad_norm": 7.4329400062561035,
      "learning_rate": 3.824577025823687e-05,
      "loss": 0.7742,
      "step": 3140
    },
    {
      "epoch": 4.028132992327365,
      "grad_norm": 7.301215648651123,
      "learning_rate": 3.820124666073019e-05,
      "loss": 1.0238,
      "step": 3150
    },
    {
      "epoch": 4.040920716112532,
      "grad_norm": 2.81072735786438,
      "learning_rate": 3.815672306322351e-05,
      "loss": 1.1583,
      "step": 3160
    },
    {
      "epoch": 4.053708439897698,
      "grad_norm": 11.35189437866211,
      "learning_rate": 3.811219946571683e-05,
      "loss": 0.6364,
      "step": 3170
    },
    {
      "epoch": 4.0664961636828645,
      "grad_norm": 1.3151694536209106,
      "learning_rate": 3.806767586821015e-05,
      "loss": 0.4789,
      "step": 3180
    },
    {
      "epoch": 4.079283887468031,
      "grad_norm": 7.979901313781738,
      "learning_rate": 3.802315227070348e-05,
      "loss": 0.7974,
      "step": 3190
    },
    {
      "epoch": 4.092071611253197,
      "grad_norm": 1.1267353296279907,
      "learning_rate": 3.7978628673196795e-05,
      "loss": 0.883,
      "step": 3200
    },
    {
      "epoch": 4.1048593350383635,
      "grad_norm": 1.3425774574279785,
      "learning_rate": 3.793410507569012e-05,
      "loss": 0.8678,
      "step": 3210
    },
    {
      "epoch": 4.117647058823529,
      "grad_norm": 13.647799491882324,
      "learning_rate": 3.7889581478183437e-05,
      "loss": 0.6365,
      "step": 3220
    },
    {
      "epoch": 4.130434782608695,
      "grad_norm": 8.646533012390137,
      "learning_rate": 3.784505788067676e-05,
      "loss": 0.742,
      "step": 3230
    },
    {
      "epoch": 4.143222506393862,
      "grad_norm": 14.215703964233398,
      "learning_rate": 3.780053428317008e-05,
      "loss": 1.4076,
      "step": 3240
    },
    {
      "epoch": 4.156010230179028,
      "grad_norm": 5.101785659790039,
|
||
|
"learning_rate": 3.77560106856634e-05,
|
||
|
"loss": 0.5596,
|
||
|
"step": 3250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.168797953964194,
|
||
|
"grad_norm": 3.0319337844848633,
|
||
|
"learning_rate": 3.771148708815673e-05,
|
||
|
"loss": 0.9117,
|
||
|
"step": 3260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.181585677749361,
|
||
|
"grad_norm": 19.691225051879883,
|
||
|
"learning_rate": 3.766696349065005e-05,
|
||
|
"loss": 0.6898,
|
||
|
"step": 3270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.194373401534527,
|
||
|
"grad_norm": 0.6862250566482544,
|
||
|
"learning_rate": 3.762243989314337e-05,
|
||
|
"loss": 0.755,
|
||
|
"step": 3280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.207161125319693,
|
||
|
"grad_norm": 4.289923191070557,
|
||
|
"learning_rate": 3.7577916295636686e-05,
|
||
|
"loss": 0.6671,
|
||
|
"step": 3290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.21994884910486,
|
||
|
"grad_norm": 1.4601982831954956,
|
||
|
"learning_rate": 3.753339269813001e-05,
|
||
|
"loss": 0.3689,
|
||
|
"step": 3300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.232736572890025,
|
||
|
"grad_norm": 9.796490669250488,
|
||
|
"learning_rate": 3.748886910062333e-05,
|
||
|
"loss": 0.5699,
|
||
|
"step": 3310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.245524296675192,
|
||
|
"grad_norm": 11.114338874816895,
|
||
|
"learning_rate": 3.744434550311665e-05,
|
||
|
"loss": 0.8346,
|
||
|
"step": 3320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.258312020460358,
|
||
|
"grad_norm": 3.114365339279175,
|
||
|
"learning_rate": 3.7399821905609976e-05,
|
||
|
"loss": 0.6354,
|
||
|
"step": 3330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.271099744245524,
|
||
|
"grad_norm": 2.011147975921631,
|
||
|
"learning_rate": 3.73552983081033e-05,
|
||
|
"loss": 0.6387,
|
||
|
"step": 3340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.283887468030691,
|
||
|
"grad_norm": 0.5666208267211914,
|
||
|
"learning_rate": 3.731077471059662e-05,
|
||
|
"loss": 0.6872,
|
||
|
"step": 3350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.296675191815857,
|
||
|
"grad_norm": 2.4012691974639893,
|
||
|
"learning_rate": 3.726625111308994e-05,
|
||
|
"loss": 0.7966,
|
||
|
"step": 3360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.309462915601023,
|
||
|
"grad_norm": 2.9042274951934814,
|
||
|
"learning_rate": 3.722172751558326e-05,
|
||
|
"loss": 0.4021,
|
||
|
"step": 3370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.322250639386189,
|
||
|
"grad_norm": 15.42292308807373,
|
||
|
"learning_rate": 3.7177203918076584e-05,
|
||
|
"loss": 0.5537,
|
||
|
"step": 3380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.335038363171355,
|
||
|
"grad_norm": 5.009647369384766,
|
||
|
"learning_rate": 3.71326803205699e-05,
|
||
|
"loss": 0.6362,
|
||
|
"step": 3390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.3478260869565215,
|
||
|
"grad_norm": 1.7608520984649658,
|
||
|
"learning_rate": 3.7088156723063226e-05,
|
||
|
"loss": 1.0878,
|
||
|
"step": 3400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.360613810741688,
|
||
|
"grad_norm": 0.519935667514801,
|
||
|
"learning_rate": 3.704363312555655e-05,
|
||
|
"loss": 0.6975,
|
||
|
"step": 3410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.373401534526854,
|
||
|
"grad_norm": 4.793030738830566,
|
||
|
"learning_rate": 3.699910952804987e-05,
|
||
|
"loss": 0.7543,
|
||
|
"step": 3420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.3861892583120206,
|
||
|
"grad_norm": 7.277567386627197,
|
||
|
"learning_rate": 3.695458593054319e-05,
|
||
|
"loss": 0.7289,
|
||
|
"step": 3430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.398976982097187,
|
||
|
"grad_norm": 2.9830596446990967,
|
||
|
"learning_rate": 3.691006233303651e-05,
|
||
|
"loss": 0.7286,
|
||
|
"step": 3440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.411764705882353,
|
||
|
"grad_norm": 3.031907320022583,
|
||
|
"learning_rate": 3.6865538735529834e-05,
|
||
|
"loss": 0.5704,
|
||
|
"step": 3450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.42455242966752,
|
||
|
"grad_norm": 11.57882308959961,
|
||
|
"learning_rate": 3.682101513802315e-05,
|
||
|
"loss": 0.4617,
|
||
|
"step": 3460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.437340153452685,
|
||
|
"grad_norm": 10.076362609863281,
|
||
|
"learning_rate": 3.6776491540516475e-05,
|
||
|
"loss": 0.5074,
|
||
|
"step": 3470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.450127877237851,
|
||
|
"grad_norm": 3.9779062271118164,
|
||
|
"learning_rate": 3.67319679430098e-05,
|
||
|
"loss": 1.1924,
|
||
|
"step": 3480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.462915601023018,
|
||
|
"grad_norm": 2.4633734226226807,
|
||
|
"learning_rate": 3.668744434550312e-05,
|
||
|
"loss": 0.8225,
|
||
|
"step": 3490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.475703324808184,
|
||
|
"grad_norm": 4.295853137969971,
|
||
|
"learning_rate": 3.664292074799644e-05,
|
||
|
"loss": 0.2683,
|
||
|
"step": 3500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.4884910485933505,
|
||
|
"grad_norm": 11.388009071350098,
|
||
|
"learning_rate": 3.659839715048976e-05,
|
||
|
"loss": 1.0016,
|
||
|
"step": 3510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.501278772378517,
|
||
|
"grad_norm": 1.5823692083358765,
|
||
|
"learning_rate": 3.655387355298308e-05,
|
||
|
"loss": 0.5582,
|
||
|
"step": 3520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.514066496163683,
|
||
|
"grad_norm": 14.604119300842285,
|
||
|
"learning_rate": 3.65093499554764e-05,
|
||
|
"loss": 0.8775,
|
||
|
"step": 3530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.526854219948849,
|
||
|
"grad_norm": 2.4483489990234375,
|
||
|
"learning_rate": 3.6464826357969725e-05,
|
||
|
"loss": 0.7407,
|
||
|
"step": 3540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.539641943734015,
|
||
|
"grad_norm": 12.382391929626465,
|
||
|
"learning_rate": 3.642030276046305e-05,
|
||
|
"loss": 0.3615,
|
||
|
"step": 3550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.552429667519181,
|
||
|
"grad_norm": 11.958683013916016,
|
||
|
"learning_rate": 3.637577916295637e-05,
|
||
|
"loss": 0.9151,
|
||
|
"step": 3560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.565217391304348,
|
||
|
"grad_norm": 4.6142897605896,
|
||
|
"learning_rate": 3.633125556544969e-05,
|
||
|
"loss": 0.6341,
|
||
|
"step": 3570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.578005115089514,
|
||
|
"grad_norm": 10.04975700378418,
|
||
|
"learning_rate": 3.628673196794301e-05,
|
||
|
"loss": 0.5591,
|
||
|
"step": 3580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.59079283887468,
|
||
|
"grad_norm": 12.153048515319824,
|
||
|
"learning_rate": 3.624220837043633e-05,
|
||
|
"loss": 1.2612,
|
||
|
"step": 3590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.603580562659847,
|
||
|
"grad_norm": 8.2946138381958,
|
||
|
"learning_rate": 3.619768477292965e-05,
|
||
|
"loss": 0.667,
|
||
|
"step": 3600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.616368286445013,
|
||
|
"grad_norm": 8.6572265625,
|
||
|
"learning_rate": 3.615316117542298e-05,
|
||
|
"loss": 0.3353,
|
||
|
"step": 3610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.629156010230179,
|
||
|
"grad_norm": 18.10332679748535,
|
||
|
"learning_rate": 3.61086375779163e-05,
|
||
|
"loss": 0.6227,
|
||
|
"step": 3620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.641943734015345,
|
||
|
"grad_norm": 10.11817455291748,
|
||
|
"learning_rate": 3.6064113980409616e-05,
|
||
|
"loss": 0.9023,
|
||
|
"step": 3630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.654731457800511,
|
||
|
"grad_norm": 9.292647361755371,
|
||
|
"learning_rate": 3.601959038290294e-05,
|
||
|
"loss": 0.5075,
|
||
|
"step": 3640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.667519181585678,
|
||
|
"grad_norm": 14.415812492370605,
|
||
|
"learning_rate": 3.597506678539626e-05,
|
||
|
"loss": 0.8366,
|
||
|
"step": 3650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.680306905370844,
|
||
|
"grad_norm": 10.451414108276367,
|
||
|
"learning_rate": 3.593054318788958e-05,
|
||
|
"loss": 0.599,
|
||
|
"step": 3660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.69309462915601,
|
||
|
"grad_norm": 3.4356963634490967,
|
||
|
"learning_rate": 3.58860195903829e-05,
|
||
|
"loss": 0.4089,
|
||
|
"step": 3670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.705882352941177,
|
||
|
"grad_norm": 13.768664360046387,
|
||
|
"learning_rate": 3.584149599287623e-05,
|
||
|
"loss": 1.0491,
|
||
|
"step": 3680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.718670076726343,
|
||
|
"grad_norm": 1.4625777006149292,
|
||
|
"learning_rate": 3.579697239536955e-05,
|
||
|
"loss": 0.6929,
|
||
|
"step": 3690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.731457800511509,
|
||
|
"grad_norm": 3.810811758041382,
|
||
|
"learning_rate": 3.575244879786287e-05,
|
||
|
"loss": 0.7565,
|
||
|
"step": 3700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.744245524296675,
|
||
|
"grad_norm": 10.068679809570312,
|
||
|
"learning_rate": 3.570792520035619e-05,
|
||
|
"loss": 0.6768,
|
||
|
"step": 3710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.757033248081841,
|
||
|
"grad_norm": 7.575318813323975,
|
||
|
"learning_rate": 3.5663401602849514e-05,
|
||
|
"loss": 0.8091,
|
||
|
"step": 3720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.7698209718670075,
|
||
|
"grad_norm": 0.5645831823348999,
|
||
|
"learning_rate": 3.561887800534283e-05,
|
||
|
"loss": 0.6504,
|
||
|
"step": 3730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.782608695652174,
|
||
|
"grad_norm": 11.733582496643066,
|
||
|
"learning_rate": 3.557435440783615e-05,
|
||
|
"loss": 0.76,
|
||
|
"step": 3740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.79539641943734,
|
||
|
"grad_norm": 11.055776596069336,
|
||
|
"learning_rate": 3.552983081032948e-05,
|
||
|
"loss": 0.6466,
|
||
|
"step": 3750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.8081841432225065,
|
||
|
"grad_norm": 2.825686454772949,
|
||
|
"learning_rate": 3.54853072128228e-05,
|
||
|
"loss": 0.4032,
|
||
|
"step": 3760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.820971867007673,
|
||
|
"grad_norm": 1.2281103134155273,
|
||
|
"learning_rate": 3.544078361531612e-05,
|
||
|
"loss": 0.3983,
|
||
|
"step": 3770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.833759590792839,
|
||
|
"grad_norm": 0.7035624384880066,
|
||
|
"learning_rate": 3.539626001780944e-05,
|
||
|
"loss": 0.4677,
|
||
|
"step": 3780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.846547314578006,
|
||
|
"grad_norm": 12.086211204528809,
|
||
|
"learning_rate": 3.5351736420302764e-05,
|
||
|
"loss": 0.8847,
|
||
|
"step": 3790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.859335038363171,
|
||
|
"grad_norm": 1.0651588439941406,
|
||
|
"learning_rate": 3.530721282279608e-05,
|
||
|
"loss": 0.6754,
|
||
|
"step": 3800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.872122762148337,
|
||
|
"grad_norm": 1.574463129043579,
|
||
|
"learning_rate": 3.5262689225289405e-05,
|
||
|
"loss": 0.6701,
|
||
|
"step": 3810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.884910485933504,
|
||
|
"grad_norm": 12.231953620910645,
|
||
|
"learning_rate": 3.521816562778273e-05,
|
||
|
"loss": 0.8218,
|
||
|
"step": 3820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.89769820971867,
|
||
|
"grad_norm": 4.2090535163879395,
|
||
|
"learning_rate": 3.517364203027605e-05,
|
||
|
"loss": 0.6251,
|
||
|
"step": 3830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.910485933503836,
|
||
|
"grad_norm": 4.578638076782227,
|
||
|
"learning_rate": 3.512911843276937e-05,
|
||
|
"loss": 0.7583,
|
||
|
"step": 3840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.923273657289003,
|
||
|
"grad_norm": 3.189852237701416,
|
||
|
"learning_rate": 3.508459483526269e-05,
|
||
|
"loss": 0.2964,
|
||
|
"step": 3850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.936061381074169,
|
||
|
"grad_norm": 2.299405097961426,
|
||
|
"learning_rate": 3.504007123775601e-05,
|
||
|
"loss": 0.4669,
|
||
|
"step": 3860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.948849104859335,
|
||
|
"grad_norm": 2.1393966674804688,
|
||
|
"learning_rate": 3.499554764024933e-05,
|
||
|
"loss": 0.5147,
|
||
|
"step": 3870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.961636828644501,
|
||
|
"grad_norm": 10.183451652526855,
|
||
|
"learning_rate": 3.4951024042742655e-05,
|
||
|
"loss": 0.6936,
|
||
|
"step": 3880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.974424552429667,
|
||
|
"grad_norm": 0.556858479976654,
|
||
|
"learning_rate": 3.490650044523598e-05,
|
||
|
"loss": 0.7699,
|
||
|
"step": 3890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 4.987212276214834,
|
||
|
"grad_norm": 7.906371116638184,
|
||
|
"learning_rate": 3.48619768477293e-05,
|
||
|
"loss": 0.5674,
|
||
|
"step": 3900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.0,
|
||
|
"grad_norm": 9.858052253723145,
|
||
|
"learning_rate": 3.481745325022262e-05,
|
||
|
"loss": 0.5803,
|
||
|
"step": 3910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.0,
|
||
|
"eval_loss": 0.308910071849823,
|
||
|
"eval_runtime": 0.8202,
|
||
|
"eval_samples_per_second": 119.488,
|
||
|
"eval_steps_per_second": 15.85,
|
||
|
"step": 3910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.012787723785166,
|
||
|
"grad_norm": 4.255712985992432,
|
||
|
"learning_rate": 3.477292965271594e-05,
|
||
|
"loss": 0.7186,
|
||
|
"step": 3920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.025575447570333,
|
||
|
"grad_norm": 0.03182278946042061,
|
||
|
"learning_rate": 3.472840605520926e-05,
|
||
|
"loss": 0.764,
|
||
|
"step": 3930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.038363171355499,
|
||
|
"grad_norm": 1.7552779912948608,
|
||
|
"learning_rate": 3.468388245770258e-05,
|
||
|
"loss": 0.5904,
|
||
|
"step": 3940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.051150895140665,
|
||
|
"grad_norm": 0.7406250238418579,
|
||
|
"learning_rate": 3.4639358860195904e-05,
|
||
|
"loss": 0.8398,
|
||
|
"step": 3950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.063938618925831,
|
||
|
"grad_norm": 1.4716426134109497,
|
||
|
"learning_rate": 3.459483526268923e-05,
|
||
|
"loss": 0.5454,
|
||
|
"step": 3960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.076726342710997,
|
||
|
"grad_norm": 2.5447280406951904,
|
||
|
"learning_rate": 3.455031166518255e-05,
|
||
|
"loss": 0.4558,
|
||
|
"step": 3970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.089514066496164,
|
||
|
"grad_norm": 4.706602096557617,
|
||
|
"learning_rate": 3.450578806767587e-05,
|
||
|
"loss": 0.3279,
|
||
|
"step": 3980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.10230179028133,
|
||
|
"grad_norm": 3.110121250152588,
|
||
|
"learning_rate": 3.446126447016919e-05,
|
||
|
"loss": 0.3657,
|
||
|
"step": 3990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.115089514066496,
|
||
|
"grad_norm": 0.3601504862308502,
|
||
|
"learning_rate": 3.441674087266251e-05,
|
||
|
"loss": 0.4001,
|
||
|
"step": 4000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.127877237851663,
|
||
|
"grad_norm": 1.9790247678756714,
|
||
|
"learning_rate": 3.437221727515583e-05,
|
||
|
"loss": 0.1499,
|
||
|
"step": 4010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.140664961636829,
|
||
|
"grad_norm": 5.311811447143555,
|
||
|
"learning_rate": 3.4327693677649154e-05,
|
||
|
"loss": 0.5795,
|
||
|
"step": 4020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.153452685421995,
|
||
|
"grad_norm": 6.410403728485107,
|
||
|
"learning_rate": 3.428317008014248e-05,
|
||
|
"loss": 0.4079,
|
||
|
"step": 4030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.166240409207161,
|
||
|
"grad_norm": 3.0055534839630127,
|
||
|
"learning_rate": 3.42386464826358e-05,
|
||
|
"loss": 0.4303,
|
||
|
"step": 4040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.179028132992327,
|
||
|
"grad_norm": 9.70368766784668,
|
||
|
"learning_rate": 3.419412288512912e-05,
|
||
|
"loss": 0.7073,
|
||
|
"step": 4050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.1918158567774935,
|
||
|
"grad_norm": 14.46500301361084,
|
||
|
"learning_rate": 3.4149599287622444e-05,
|
||
|
"loss": 0.9534,
|
||
|
"step": 4060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.20460358056266,
|
||
|
"grad_norm": 1.8181160688400269,
|
||
|
"learning_rate": 3.410507569011576e-05,
|
||
|
"loss": 0.5392,
|
||
|
"step": 4070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.217391304347826,
|
||
|
"grad_norm": 0.3840659558773041,
|
||
|
"learning_rate": 3.406055209260908e-05,
|
||
|
"loss": 0.3438,
|
||
|
"step": 4080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.2301790281329925,
|
||
|
"grad_norm": 4.869695663452148,
|
||
|
"learning_rate": 3.4016028495102404e-05,
|
||
|
"loss": 0.9978,
|
||
|
"step": 4090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.242966751918159,
|
||
|
"grad_norm": 7.061380386352539,
|
||
|
"learning_rate": 3.397150489759573e-05,
|
||
|
"loss": 0.4002,
|
||
|
"step": 4100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.255754475703325,
|
||
|
"grad_norm": 5.828289031982422,
|
||
|
"learning_rate": 3.392698130008905e-05,
|
||
|
"loss": 0.7073,
|
||
|
"step": 4110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.268542199488491,
|
||
|
"grad_norm": 10.163734436035156,
|
||
|
"learning_rate": 3.388245770258237e-05,
|
||
|
"loss": 0.9711,
|
||
|
"step": 4120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.281329923273657,
|
||
|
"grad_norm": 9.201620101928711,
|
||
|
"learning_rate": 3.3837934105075694e-05,
|
||
|
"loss": 0.5375,
|
||
|
"step": 4130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.294117647058823,
|
||
|
"grad_norm": 1.4918196201324463,
|
||
|
"learning_rate": 3.379341050756901e-05,
|
||
|
"loss": 0.5364,
|
||
|
"step": 4140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.30690537084399,
|
||
|
"grad_norm": 12.544402122497559,
|
||
|
"learning_rate": 3.3748886910062336e-05,
|
||
|
"loss": 0.7528,
|
||
|
"step": 4150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.319693094629156,
|
||
|
"grad_norm": 3.5984480381011963,
|
||
|
"learning_rate": 3.370436331255565e-05,
|
||
|
"loss": 0.591,
|
||
|
"step": 4160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.332480818414322,
|
||
|
"grad_norm": 3.8022067546844482,
|
||
|
"learning_rate": 3.365983971504898e-05,
|
||
|
"loss": 0.9976,
|
||
|
"step": 4170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.345268542199489,
|
||
|
"grad_norm": 8.02660846710205,
|
||
|
"learning_rate": 3.36153161175423e-05,
|
||
|
"loss": 0.4874,
|
||
|
"step": 4180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.358056265984655,
|
||
|
"grad_norm": 4.031845569610596,
|
||
|
"learning_rate": 3.357079252003562e-05,
|
||
|
"loss": 0.4577,
|
||
|
"step": 4190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.370843989769821,
|
||
|
"grad_norm": 9.381792068481445,
|
||
|
"learning_rate": 3.352626892252894e-05,
|
||
|
"loss": 0.5708,
|
||
|
"step": 4200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.383631713554987,
|
||
|
"grad_norm": 10.603078842163086,
|
||
|
"learning_rate": 3.348174532502226e-05,
|
||
|
"loss": 1.0511,
|
||
|
"step": 4210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.396419437340153,
|
||
|
"grad_norm": 3.2949740886688232,
|
||
|
"learning_rate": 3.3437221727515585e-05,
|
||
|
"loss": 0.5616,
|
||
|
"step": 4220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.40920716112532,
|
||
|
"grad_norm": 12.314652442932129,
|
||
|
"learning_rate": 3.33926981300089e-05,
|
||
|
"loss": 0.6216,
|
||
|
"step": 4230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.421994884910486,
|
||
|
"grad_norm": 15.642436981201172,
|
||
|
"learning_rate": 3.334817453250223e-05,
|
||
|
"loss": 0.478,
|
||
|
"step": 4240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.434782608695652,
|
||
|
"grad_norm": 11.80217456817627,
|
||
|
"learning_rate": 3.330365093499555e-05,
|
||
|
"loss": 0.7626,
|
||
|
"step": 4250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.447570332480819,
|
||
|
"grad_norm": 32.96918487548828,
|
||
|
"learning_rate": 3.325912733748887e-05,
|
||
|
"loss": 1.0794,
|
||
|
"step": 4260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.460358056265985,
|
||
|
"grad_norm": 12.918391227722168,
|
||
|
"learning_rate": 3.321460373998219e-05,
|
||
|
"loss": 0.5586,
|
||
|
"step": 4270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.4731457800511505,
|
||
|
"grad_norm": 11.27306079864502,
|
||
|
"learning_rate": 3.317008014247551e-05,
|
||
|
"loss": 0.7194,
|
||
|
"step": 4280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.485933503836317,
|
||
|
"grad_norm": 0.12751297652721405,
|
||
|
"learning_rate": 3.3125556544968835e-05,
|
||
|
"loss": 0.5606,
|
||
|
"step": 4290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.498721227621483,
|
||
|
"grad_norm": 10.48075008392334,
|
||
|
"learning_rate": 3.308103294746215e-05,
|
||
|
"loss": 0.493,
|
||
|
"step": 4300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.5115089514066495,
|
||
|
"grad_norm": 4.982821941375732,
|
||
|
"learning_rate": 3.303650934995548e-05,
|
||
|
"loss": 0.4245,
|
||
|
"step": 4310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.524296675191816,
|
||
|
"grad_norm": 6.689045429229736,
|
||
|
"learning_rate": 3.29919857524488e-05,
|
||
|
"loss": 0.8354,
|
||
|
"step": 4320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.537084398976982,
|
||
|
"grad_norm": 15.300660133361816,
|
||
|
"learning_rate": 3.294746215494212e-05,
|
||
|
"loss": 0.9657,
|
||
|
"step": 4330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.549872122762149,
|
||
|
"grad_norm": 5.424358367919922,
|
||
|
"learning_rate": 3.290293855743544e-05,
|
||
|
"loss": 0.7693,
|
||
|
"step": 4340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.562659846547315,
|
||
|
"grad_norm": 2.7610678672790527,
|
||
|
"learning_rate": 3.285841495992876e-05,
|
||
|
"loss": 0.9893,
|
||
|
"step": 4350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.57544757033248,
|
||
|
"grad_norm": 4.664830684661865,
|
||
|
"learning_rate": 3.2813891362422084e-05,
|
||
|
"loss": 0.84,
|
||
|
"step": 4360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.588235294117647,
|
||
|
"grad_norm": 2.3301279544830322,
|
||
|
"learning_rate": 3.27693677649154e-05,
|
||
|
"loss": 0.3313,
|
||
|
"step": 4370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.601023017902813,
|
||
|
"grad_norm": 1.1856235265731812,
|
||
|
"learning_rate": 3.272484416740873e-05,
|
||
|
"loss": 0.4543,
|
||
|
"step": 4380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.6138107416879794,
|
||
|
"grad_norm": 3.703605890274048,
|
||
|
"learning_rate": 3.268032056990205e-05,
|
||
|
"loss": 1.0115,
|
||
|
"step": 4390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.626598465473146,
|
||
|
"grad_norm": 2.387458562850952,
|
||
|
"learning_rate": 3.2635796972395374e-05,
|
||
|
"loss": 0.4687,
|
||
|
"step": 4400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.639386189258312,
|
||
|
"grad_norm": 7.303229808807373,
|
||
|
"learning_rate": 3.259127337488869e-05,
|
||
|
"loss": 0.5117,
|
||
|
"step": 4410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.6521739130434785,
|
||
|
"grad_norm": 7.670161724090576,
|
||
|
"learning_rate": 3.2546749777382016e-05,
|
||
|
"loss": 0.3712,
|
||
|
"step": 4420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.664961636828645,
|
||
|
"grad_norm": 6.873749256134033,
|
||
|
"learning_rate": 3.2502226179875334e-05,
|
||
|
"loss": 0.5735,
|
||
|
"step": 4430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.677749360613811,
|
||
|
"grad_norm": 4.9012956619262695,
|
||
|
"learning_rate": 3.245770258236865e-05,
|
||
|
"loss": 0.7233,
|
||
|
"step": 4440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.690537084398977,
|
||
|
"grad_norm": 3.3855135440826416,
|
||
|
"learning_rate": 3.241317898486198e-05,
|
||
|
"loss": 0.5361,
|
||
|
"step": 4450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.703324808184143,
|
||
|
"grad_norm": 3.381133794784546,
|
||
|
"learning_rate": 3.23686553873553e-05,
|
||
|
"loss": 0.4158,
|
||
|
"step": 4460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.716112531969309,
|
||
|
"grad_norm": 1.0660206079483032,
|
||
|
"learning_rate": 3.2324131789848624e-05,
|
||
|
"loss": 0.7874,
|
||
|
"step": 4470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.728900255754476,
|
||
|
"grad_norm": 0.3694034516811371,
|
||
|
"learning_rate": 3.227960819234194e-05,
|
||
|
"loss": 0.4555,
|
||
|
"step": 4480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.741687979539642,
|
||
|
"grad_norm": 5.135284900665283,
|
||
|
"learning_rate": 3.2235084594835266e-05,
|
||
|
"loss": 0.661,
|
||
|
"step": 4490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.754475703324808,
|
||
|
"grad_norm": 8.120854377746582,
|
||
|
"learning_rate": 3.219056099732858e-05,
|
||
|
"loss": 0.7841,
|
||
|
"step": 4500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.767263427109975,
|
||
|
"grad_norm": 4.406229019165039,
|
||
|
"learning_rate": 3.214603739982191e-05,
|
||
|
"loss": 0.7135,
|
||
|
"step": 4510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.78005115089514,
|
||
|
"grad_norm": 16.42761993408203,
|
||
|
"learning_rate": 3.210151380231523e-05,
|
||
|
"loss": 0.5225,
|
||
|
"step": 4520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.792838874680307,
|
||
|
"grad_norm": 4.247344970703125,
|
||
|
"learning_rate": 3.205699020480855e-05,
|
||
|
"loss": 0.509,
|
||
|
"step": 4530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.805626598465473,
|
||
|
"grad_norm": 0.8684327006340027,
|
||
|
"learning_rate": 3.2012466607301873e-05,
|
||
|
"loss": 0.3948,
|
||
|
"step": 4540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.818414322250639,
|
||
|
"grad_norm": 3.883173942565918,
|
||
|
"learning_rate": 3.196794300979519e-05,
|
||
|
"loss": 0.4688,
|
||
|
"step": 4550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.831202046035806,
|
||
|
"grad_norm": 2.6903090476989746,
|
||
|
"learning_rate": 3.1923419412288515e-05,
|
||
|
"loss": 0.5118,
|
||
|
"step": 4560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.843989769820972,
|
||
|
"grad_norm": 13.025918960571289,
|
||
|
"learning_rate": 3.187889581478183e-05,
|
||
|
"loss": 0.7126,
|
||
|
"step": 4570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.856777493606138,
|
||
|
"grad_norm": 5.017139911651611,
|
||
|
"learning_rate": 3.183437221727516e-05,
|
||
|
"loss": 0.4241,
|
||
|
"step": 4580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.869565217391305,
|
||
|
"grad_norm": 0.0795026496052742,
|
||
|
"learning_rate": 3.178984861976848e-05,
|
||
|
"loss": 0.367,
|
||
|
"step": 4590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.882352941176471,
|
||
|
"grad_norm": 6.228437423706055,
|
||
|
"learning_rate": 3.17453250222618e-05,
|
||
|
"loss": 0.4223,
|
||
|
"step": 4600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.8951406649616365,
|
||
|
"grad_norm": 13.155862808227539,
|
||
|
"learning_rate": 3.170080142475512e-05,
|
||
|
"loss": 0.8122,
|
||
|
"step": 4610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.907928388746803,
|
||
|
"grad_norm": 1.7071353197097778,
|
||
|
"learning_rate": 3.165627782724844e-05,
|
||
|
"loss": 0.5693,
|
||
|
"step": 4620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.920716112531969,
|
||
|
"grad_norm": 29.22410011291504,
|
||
|
"learning_rate": 3.1611754229741765e-05,
|
||
|
"loss": 0.5403,
|
||
|
"step": 4630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.9335038363171355,
|
||
|
"grad_norm": 5.154636383056641,
|
||
|
"learning_rate": 3.156723063223508e-05,
|
||
|
"loss": 0.7832,
|
||
|
"step": 4640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.946291560102302,
|
||
|
"grad_norm": 4.094592571258545,
|
||
|
"learning_rate": 3.1522707034728406e-05,
|
||
|
"loss": 0.2902,
|
||
|
"step": 4650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.959079283887468,
|
||
|
"grad_norm": 8.081905364990234,
|
||
|
"learning_rate": 3.147818343722173e-05,
|
||
|
"loss": 0.6333,
|
||
|
"step": 4660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.971867007672635,
|
||
|
"grad_norm": 0.0020114060025662184,
|
||
|
"learning_rate": 3.1433659839715055e-05,
|
||
|
"loss": 0.6819,
|
||
|
"step": 4670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.9846547314578,
|
||
|
"grad_norm": 1.4741204977035522,
|
||
|
"learning_rate": 3.138913624220837e-05,
|
||
|
"loss": 0.6854,
|
||
|
"step": 4680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 5.997442455242966,
|
||
|
"grad_norm": 2.4684624671936035,
|
||
|
"learning_rate": 3.134461264470169e-05,
|
||
|
"loss": 0.7138,
|
||
|
"step": 4690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.0,
|
||
|
"eval_loss": 0.2926405668258667,
|
||
|
"eval_runtime": 0.9804,
|
||
|
"eval_samples_per_second": 99.964,
|
||
|
"eval_steps_per_second": 13.261,
|
||
|
"step": 4692
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.010230179028133,
|
||
|
"grad_norm": 5.0837931632995605,
|
||
|
"learning_rate": 3.1300089047195014e-05,
|
||
|
"loss": 0.4942,
|
||
|
"step": 4700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.023017902813299,
|
||
|
"grad_norm": 8.149367332458496,
|
||
|
"learning_rate": 3.125556544968833e-05,
|
||
|
"loss": 0.4184,
|
||
|
"step": 4710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.035805626598465,
|
||
|
"grad_norm": 0.08973731100559235,
|
||
|
"learning_rate": 3.121104185218166e-05,
|
||
|
"loss": 0.5757,
|
||
|
"step": 4720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.048593350383632,
|
||
|
"grad_norm": 0.09891729801893234,
|
||
|
"learning_rate": 3.116651825467498e-05,
|
||
|
"loss": 0.5934,
|
||
|
"step": 4730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.061381074168798,
|
||
|
"grad_norm": 0.7006903886795044,
|
||
|
"learning_rate": 3.1121994657168305e-05,
|
||
|
"loss": 0.3696,
|
||
|
"step": 4740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.0741687979539645,
|
||
|
"grad_norm": 7.207807540893555,
|
||
|
"learning_rate": 3.107747105966162e-05,
|
||
|
"loss": 0.9691,
|
||
|
"step": 4750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.086956521739131,
|
||
|
"grad_norm": 2.0564119815826416,
|
||
|
"learning_rate": 3.1032947462154946e-05,
|
||
|
"loss": 0.2924,
|
||
|
"step": 4760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.099744245524296,
|
||
|
"grad_norm": 4.980827808380127,
|
||
|
"learning_rate": 3.0988423864648264e-05,
|
||
|
"loss": 0.6964,
|
||
|
"step": 4770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.112531969309463,
|
||
|
"grad_norm": 0.09228426963090897,
|
||
|
"learning_rate": 3.094390026714158e-05,
|
||
|
"loss": 0.4555,
|
||
|
"step": 4780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.125319693094629,
|
||
|
"grad_norm": 5.13065242767334,
|
||
|
"learning_rate": 3.089937666963491e-05,
|
||
|
"loss": 0.7797,
|
||
|
"step": 4790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.138107416879795,
|
||
|
"grad_norm": 2.9278452396392822,
|
||
|
"learning_rate": 3.085485307212823e-05,
|
||
|
"loss": 0.1351,
|
||
|
"step": 4800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.150895140664962,
|
||
|
"grad_norm": 1.4354749917984009,
|
||
|
"learning_rate": 3.0810329474621554e-05,
|
||
|
"loss": 0.2317,
|
||
|
"step": 4810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.163682864450128,
|
||
|
"grad_norm": 3.3318662643432617,
|
||
|
"learning_rate": 3.076580587711487e-05,
|
||
|
"loss": 0.4199,
|
||
|
"step": 4820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.176470588235294,
|
||
|
"grad_norm": 0.6563153266906738,
|
||
|
"learning_rate": 3.0721282279608196e-05,
|
||
|
"loss": 0.9203,
|
||
|
"step": 4830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.189258312020461,
|
||
|
"grad_norm": 8.87299633026123,
|
||
|
"learning_rate": 3.067675868210151e-05,
|
||
|
"loss": 0.5379,
|
||
|
"step": 4840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.202046035805626,
|
||
|
"grad_norm": 7.299469470977783,
|
||
|
"learning_rate": 3.063223508459484e-05,
|
||
|
"loss": 0.2947,
|
||
|
"step": 4850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.2148337595907925,
|
||
|
"grad_norm": 5.325803279876709,
|
||
|
"learning_rate": 3.058771148708816e-05,
|
||
|
"loss": 0.7735,
|
||
|
"step": 4860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.227621483375959,
|
||
|
"grad_norm": 9.854231834411621,
|
||
|
"learning_rate": 3.054318788958148e-05,
|
||
|
"loss": 1.1524,
|
||
|
"step": 4870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.240409207161125,
|
||
|
"grad_norm": 4.8076372146606445,
|
||
|
"learning_rate": 3.0498664292074804e-05,
|
||
|
"loss": 0.3987,
|
||
|
"step": 4880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.253196930946292,
|
||
|
"grad_norm": 5.543219566345215,
|
||
|
"learning_rate": 3.0454140694568124e-05,
|
||
|
"loss": 0.9449,
|
||
|
"step": 4890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.265984654731458,
|
||
|
"grad_norm": 4.439374923706055,
|
||
|
"learning_rate": 3.0409617097061442e-05,
|
||
|
"loss": 0.7293,
|
||
|
"step": 4900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.278772378516624,
|
||
|
"grad_norm": 0.6048958897590637,
|
||
|
"learning_rate": 3.0365093499554763e-05,
|
||
|
"loss": 0.9003,
|
||
|
"step": 4910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.291560102301791,
|
||
|
"grad_norm": 7.064619541168213,
|
||
|
"learning_rate": 3.0320569902048084e-05,
|
||
|
"loss": 0.4962,
|
||
|
"step": 4920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.304347826086957,
|
||
|
"grad_norm": 0.19604283571243286,
|
||
|
"learning_rate": 3.027604630454141e-05,
|
||
|
"loss": 0.3643,
|
||
|
"step": 4930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.3171355498721224,
|
||
|
"grad_norm": 0.40240225195884705,
|
||
|
"learning_rate": 3.0231522707034732e-05,
|
||
|
"loss": 0.5883,
|
||
|
"step": 4940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.329923273657289,
|
||
|
"grad_norm": 7.794588565826416,
|
||
|
"learning_rate": 3.0186999109528053e-05,
|
||
|
"loss": 0.5402,
|
||
|
"step": 4950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.342710997442455,
|
||
|
"grad_norm": 1.4699820280075073,
|
||
|
"learning_rate": 3.0142475512021374e-05,
|
||
|
"loss": 0.2695,
|
||
|
"step": 4960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.3554987212276215,
|
||
|
"grad_norm": 2.457961082458496,
|
||
|
"learning_rate": 3.0097951914514695e-05,
|
||
|
"loss": 0.3105,
|
||
|
"step": 4970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.368286445012788,
|
||
|
"grad_norm": 6.704987525939941,
|
||
|
"learning_rate": 3.0053428317008016e-05,
|
||
|
"loss": 0.3334,
|
||
|
"step": 4980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.381074168797954,
|
||
|
"grad_norm": 3.568899154663086,
|
||
|
"learning_rate": 3.0008904719501337e-05,
|
||
|
"loss": 0.5752,
|
||
|
"step": 4990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.3938618925831205,
|
||
|
"grad_norm": 6.0692853927612305,
|
||
|
"learning_rate": 2.996438112199466e-05,
|
||
|
"loss": 0.6972,
|
||
|
"step": 5000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.406649616368286,
|
||
|
"grad_norm": 0.15085622668266296,
|
||
|
"learning_rate": 2.9919857524487982e-05,
|
||
|
"loss": 0.2433,
|
||
|
"step": 5010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.419437340153452,
|
||
|
"grad_norm": 9.642597198486328,
|
||
|
"learning_rate": 2.9875333926981303e-05,
|
||
|
"loss": 0.6038,
|
||
|
"step": 5020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.432225063938619,
|
||
|
"grad_norm": 0.5172861218452454,
|
||
|
"learning_rate": 2.9830810329474623e-05,
|
||
|
"loss": 0.7294,
|
||
|
"step": 5030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.445012787723785,
|
||
|
"grad_norm": 6.880235195159912,
|
||
|
"learning_rate": 2.9786286731967944e-05,
|
||
|
"loss": 0.8727,
|
||
|
"step": 5040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.457800511508951,
|
||
|
"grad_norm": 9.20561695098877,
|
||
|
"learning_rate": 2.9741763134461265e-05,
|
||
|
"loss": 0.4577,
|
||
|
"step": 5050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.470588235294118,
|
||
|
"grad_norm": 8.940627098083496,
|
||
|
"learning_rate": 2.9697239536954586e-05,
|
||
|
"loss": 0.7648,
|
||
|
"step": 5060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.483375959079284,
|
||
|
"grad_norm": 2.744727611541748,
|
||
|
"learning_rate": 2.965271593944791e-05,
|
||
|
"loss": 0.9997,
|
||
|
"step": 5070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.4961636828644505,
|
||
|
"grad_norm": 10.972113609313965,
|
||
|
"learning_rate": 2.960819234194123e-05,
|
||
|
"loss": 0.5985,
|
||
|
"step": 5080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.508951406649617,
|
||
|
"grad_norm": 7.300151348114014,
|
||
|
"learning_rate": 2.9563668744434552e-05,
|
||
|
"loss": 0.7946,
|
||
|
"step": 5090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.521739130434782,
|
||
|
"grad_norm": 9.26161003112793,
|
||
|
"learning_rate": 2.9519145146927873e-05,
|
||
|
"loss": 0.9692,
|
||
|
"step": 5100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.534526854219949,
|
||
|
"grad_norm": 0.12795042991638184,
|
||
|
"learning_rate": 2.9474621549421194e-05,
|
||
|
"loss": 0.472,
|
||
|
"step": 5110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.547314578005115,
|
||
|
"grad_norm": 5.266800880432129,
|
||
|
"learning_rate": 2.9430097951914515e-05,
|
||
|
"loss": 0.5951,
|
||
|
"step": 5120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.560102301790281,
|
||
|
"grad_norm": 5.73391056060791,
|
||
|
"learning_rate": 2.9385574354407836e-05,
|
||
|
"loss": 0.5553,
|
||
|
"step": 5130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.572890025575448,
|
||
|
"grad_norm": 4.122284889221191,
|
||
|
"learning_rate": 2.9341050756901163e-05,
|
||
|
"loss": 0.6291,
|
||
|
"step": 5140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.585677749360614,
|
||
|
"grad_norm": 1.5334197282791138,
|
||
|
"learning_rate": 2.929652715939448e-05,
|
||
|
"loss": 0.5834,
|
||
|
"step": 5150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.59846547314578,
|
||
|
"grad_norm": 2.3212168216705322,
|
||
|
"learning_rate": 2.92520035618878e-05,
|
||
|
"loss": 0.471,
|
||
|
"step": 5160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.611253196930946,
|
||
|
"grad_norm": 7.480304718017578,
|
||
|
"learning_rate": 2.9207479964381123e-05,
|
||
|
"loss": 0.6053,
|
||
|
"step": 5170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.624040920716112,
|
||
|
"grad_norm": 0.005698173772543669,
|
||
|
"learning_rate": 2.9162956366874443e-05,
|
||
|
"loss": 0.2767,
|
||
|
"step": 5180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.6368286445012785,
|
||
|
"grad_norm": 5.176445960998535,
|
||
|
"learning_rate": 2.9118432769367764e-05,
|
||
|
"loss": 0.4619,
|
||
|
"step": 5190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.649616368286445,
|
||
|
"grad_norm": 8.181547164916992,
|
||
|
"learning_rate": 2.9073909171861085e-05,
|
||
|
"loss": 0.8741,
|
||
|
"step": 5200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.662404092071611,
|
||
|
"grad_norm": 6.4060211181640625,
|
||
|
"learning_rate": 2.9029385574354413e-05,
|
||
|
"loss": 0.4839,
|
||
|
"step": 5210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.675191815856778,
|
||
|
"grad_norm": 3.9297218322753906,
|
||
|
"learning_rate": 2.8984861976847734e-05,
|
||
|
"loss": 0.4571,
|
||
|
"step": 5220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.687979539641944,
|
||
|
"grad_norm": 9.544256210327148,
|
||
|
"learning_rate": 2.8940338379341055e-05,
|
||
|
"loss": 0.3365,
|
||
|
"step": 5230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.70076726342711,
|
||
|
"grad_norm": 3.5246851444244385,
|
||
|
"learning_rate": 2.8895814781834375e-05,
|
||
|
"loss": 0.4995,
|
||
|
"step": 5240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.713554987212277,
|
||
|
"grad_norm": 4.475048542022705,
|
||
|
"learning_rate": 2.8851291184327693e-05,
|
||
|
"loss": 0.6951,
|
||
|
"step": 5250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.726342710997442,
|
||
|
"grad_norm": 7.613154888153076,
|
||
|
"learning_rate": 2.8806767586821014e-05,
|
||
|
"loss": 0.1763,
|
||
|
"step": 5260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.739130434782608,
|
||
|
"grad_norm": 5.281979084014893,
|
||
|
"learning_rate": 2.8762243989314335e-05,
|
||
|
"loss": 0.4828,
|
||
|
"step": 5270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.751918158567775,
|
||
|
"grad_norm": 3.480308771133423,
|
||
|
"learning_rate": 2.8717720391807662e-05,
|
||
|
"loss": 0.3431,
|
||
|
"step": 5280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.764705882352941,
|
||
|
"grad_norm": 1.8283220529556274,
|
||
|
"learning_rate": 2.8673196794300983e-05,
|
||
|
"loss": 0.3137,
|
||
|
"step": 5290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.7774936061381075,
|
||
|
"grad_norm": 0.4503525197505951,
|
||
|
"learning_rate": 2.8628673196794304e-05,
|
||
|
"loss": 0.582,
|
||
|
"step": 5300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.790281329923274,
|
||
|
"grad_norm": 8.414701461791992,
|
||
|
"learning_rate": 2.8584149599287625e-05,
|
||
|
"loss": 0.6779,
|
||
|
"step": 5310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.80306905370844,
|
||
|
"grad_norm": 9.155427932739258,
|
||
|
"learning_rate": 2.8539626001780946e-05,
|
||
|
"loss": 0.2913,
|
||
|
"step": 5320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.8158567774936065,
|
||
|
"grad_norm": 0.7775105237960815,
|
||
|
"learning_rate": 2.8495102404274267e-05,
|
||
|
"loss": 0.5001,
|
||
|
"step": 5330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.828644501278772,
|
||
|
"grad_norm": 2.658113956451416,
|
||
|
"learning_rate": 2.8450578806767588e-05,
|
||
|
"loss": 0.6299,
|
||
|
"step": 5340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.841432225063938,
|
||
|
"grad_norm": 0.30859649181365967,
|
||
|
"learning_rate": 2.8406055209260912e-05,
|
||
|
"loss": 0.314,
|
||
|
"step": 5350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.854219948849105,
|
||
|
"grad_norm": 1.178870677947998,
|
||
|
"learning_rate": 2.8361531611754233e-05,
|
||
|
"loss": 0.5978,
|
||
|
"step": 5360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.867007672634271,
|
||
|
"grad_norm": 0.010601550340652466,
|
||
|
"learning_rate": 2.8317008014247554e-05,
|
||
|
"loss": 0.5799,
|
||
|
"step": 5370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.879795396419437,
|
||
|
"grad_norm": 0.43929383158683777,
|
||
|
"learning_rate": 2.8272484416740874e-05,
|
||
|
"loss": 0.3263,
|
||
|
"step": 5380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.892583120204604,
|
||
|
"grad_norm": 0.5252712965011597,
|
||
|
"learning_rate": 2.8227960819234195e-05,
|
||
|
"loss": 0.6099,
|
||
|
"step": 5390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.90537084398977,
|
||
|
"grad_norm": 5.2658233642578125,
|
||
|
"learning_rate": 2.8183437221727516e-05,
|
||
|
"loss": 0.8632,
|
||
|
"step": 5400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.918158567774936,
|
||
|
"grad_norm": 1.1721999645233154,
|
||
|
"learning_rate": 2.8138913624220837e-05,
|
||
|
"loss": 0.3408,
|
||
|
"step": 5410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.930946291560103,
|
||
|
"grad_norm": 1.8380037546157837,
|
||
|
"learning_rate": 2.809439002671416e-05,
|
||
|
"loss": 0.6238,
|
||
|
"step": 5420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.943734015345268,
|
||
|
"grad_norm": 2.2290737628936768,
|
||
|
"learning_rate": 2.8049866429207482e-05,
|
||
|
"loss": 0.4239,
|
||
|
"step": 5430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.956521739130435,
|
||
|
"grad_norm": 14.996088027954102,
|
||
|
"learning_rate": 2.8005342831700803e-05,
|
||
|
"loss": 0.8926,
|
||
|
"step": 5440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.969309462915601,
|
||
|
"grad_norm": 18.02151107788086,
|
||
|
"learning_rate": 2.7960819234194124e-05,
|
||
|
"loss": 0.7365,
|
||
|
"step": 5450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.982097186700767,
|
||
|
"grad_norm": 9.642180442810059,
|
||
|
"learning_rate": 2.7916295636687445e-05,
|
||
|
"loss": 0.4924,
|
||
|
"step": 5460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 6.994884910485934,
|
||
|
"grad_norm": 1.6962766647338867,
|
||
|
"learning_rate": 2.7871772039180766e-05,
|
||
|
"loss": 0.3256,
|
||
|
"step": 5470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.0,
|
||
|
"eval_loss": 0.2672092914581299,
|
||
|
"eval_runtime": 0.9877,
|
||
|
"eval_samples_per_second": 99.225,
|
||
|
"eval_steps_per_second": 13.162,
|
||
|
"step": 5474
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.0076726342711,
|
||
|
"grad_norm": 2.8432042598724365,
|
||
|
"learning_rate": 2.7827248441674087e-05,
|
||
|
"loss": 0.2708,
|
||
|
"step": 5480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.020460358056266,
|
||
|
"grad_norm": 6.073406219482422,
|
||
|
"learning_rate": 2.7782724844167414e-05,
|
||
|
"loss": 0.5792,
|
||
|
"step": 5490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.033248081841432,
|
||
|
"grad_norm": 4.646320819854736,
|
||
|
"learning_rate": 2.7738201246660732e-05,
|
||
|
"loss": 0.2623,
|
||
|
"step": 5500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.046035805626598,
|
||
|
"grad_norm": 6.342520236968994,
|
||
|
"learning_rate": 2.7693677649154053e-05,
|
||
|
"loss": 0.4399,
|
||
|
"step": 5510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.0588235294117645,
|
||
|
"grad_norm": 8.213132858276367,
|
||
|
"learning_rate": 2.7649154051647374e-05,
|
||
|
"loss": 0.5036,
|
||
|
"step": 5520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.071611253196931,
|
||
|
"grad_norm": 4.454671382904053,
|
||
|
"learning_rate": 2.7604630454140694e-05,
|
||
|
"loss": 0.6741,
|
||
|
"step": 5530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.084398976982097,
|
||
|
"grad_norm": 0.019239643588662148,
|
||
|
"learning_rate": 2.7560106856634015e-05,
|
||
|
"loss": 0.3737,
|
||
|
"step": 5540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.0971867007672635,
|
||
|
"grad_norm": 5.077779293060303,
|
||
|
"learning_rate": 2.7515583259127336e-05,
|
||
|
"loss": 0.8875,
|
||
|
"step": 5550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.10997442455243,
|
||
|
"grad_norm": 3.9947173595428467,
|
||
|
"learning_rate": 2.7471059661620664e-05,
|
||
|
"loss": 0.3211,
|
||
|
"step": 5560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.122762148337596,
|
||
|
"grad_norm": 8.417941093444824,
|
||
|
"learning_rate": 2.7426536064113985e-05,
|
||
|
"loss": 0.5168,
|
||
|
"step": 5570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.135549872122763,
|
||
|
"grad_norm": 6.5370774269104,
|
||
|
"learning_rate": 2.7382012466607306e-05,
|
||
|
"loss": 0.5258,
|
||
|
"step": 5580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.148337595907928,
|
||
|
"grad_norm": 5.088172912597656,
|
||
|
"learning_rate": 2.7337488869100626e-05,
|
||
|
"loss": 0.2866,
|
||
|
"step": 5590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.161125319693094,
|
||
|
"grad_norm": 1.4238532781600952,
|
||
|
"learning_rate": 2.7292965271593944e-05,
|
||
|
"loss": 0.6187,
|
||
|
"step": 5600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.173913043478261,
|
||
|
"grad_norm": 0.9053813219070435,
|
||
|
"learning_rate": 2.7248441674087265e-05,
|
||
|
"loss": 0.4809,
|
||
|
"step": 5610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.186700767263427,
|
||
|
"grad_norm": 0.27804991602897644,
|
||
|
"learning_rate": 2.7203918076580586e-05,
|
||
|
"loss": 0.5625,
|
||
|
"step": 5620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.1994884910485935,
|
||
|
"grad_norm": 1.3559807538986206,
|
||
|
"learning_rate": 2.7159394479073913e-05,
|
||
|
"loss": 0.4713,
|
||
|
"step": 5630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.21227621483376,
|
||
|
"grad_norm": 3.623849868774414,
|
||
|
"learning_rate": 2.7114870881567234e-05,
|
||
|
"loss": 0.3054,
|
||
|
"step": 5640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.225063938618926,
|
||
|
"grad_norm": 0.9959923624992371,
|
||
|
"learning_rate": 2.7070347284060555e-05,
|
||
|
"loss": 0.4528,
|
||
|
"step": 5650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.2378516624040925,
|
||
|
"grad_norm": 1.5169447660446167,
|
||
|
"learning_rate": 2.7025823686553876e-05,
|
||
|
"loss": 0.2952,
|
||
|
"step": 5660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.250639386189258,
|
||
|
"grad_norm": 2.1620376110076904,
|
||
|
"learning_rate": 2.6981300089047197e-05,
|
||
|
"loss": 0.3941,
|
||
|
"step": 5670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.263427109974424,
|
||
|
"grad_norm": 0.8747214674949646,
|
||
|
"learning_rate": 2.6936776491540518e-05,
|
||
|
"loss": 0.6892,
|
||
|
"step": 5680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.276214833759591,
|
||
|
"grad_norm": 5.04610013961792,
|
||
|
"learning_rate": 2.689225289403384e-05,
|
||
|
"loss": 0.4836,
|
||
|
"step": 5690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.289002557544757,
|
||
|
"grad_norm": 1.117069959640503,
|
||
|
"learning_rate": 2.6847729296527163e-05,
|
||
|
"loss": 0.309,
|
||
|
"step": 5700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.301790281329923,
|
||
|
"grad_norm": 5.628134727478027,
|
||
|
"learning_rate": 2.6803205699020484e-05,
|
||
|
"loss": 0.5118,
|
||
|
"step": 5710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.31457800511509,
|
||
|
"grad_norm": 0.10263155400753021,
|
||
|
"learning_rate": 2.6758682101513805e-05,
|
||
|
"loss": 0.6117,
|
||
|
"step": 5720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.327365728900256,
|
||
|
"grad_norm": 4.31287145614624,
|
||
|
"learning_rate": 2.6714158504007125e-05,
|
||
|
"loss": 0.3162,
|
||
|
"step": 5730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.340153452685422,
|
||
|
"grad_norm": 1.4107064008712769,
|
||
|
"learning_rate": 2.6669634906500446e-05,
|
||
|
"loss": 0.8187,
|
||
|
"step": 5740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.352941176470588,
|
||
|
"grad_norm": 4.531869411468506,
|
||
|
"learning_rate": 2.6625111308993767e-05,
|
||
|
"loss": 0.7014,
|
||
|
"step": 5750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.365728900255754,
|
||
|
"grad_norm": 3.742617130279541,
|
||
|
"learning_rate": 2.6580587711487088e-05,
|
||
|
"loss": 0.4181,
|
||
|
"step": 5760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.378516624040921,
|
||
|
"grad_norm": 0.5917960405349731,
|
||
|
"learning_rate": 2.6536064113980412e-05,
|
||
|
"loss": 0.5423,
|
||
|
"step": 5770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.391304347826087,
|
||
|
"grad_norm": 10.565349578857422,
|
||
|
"learning_rate": 2.6491540516473733e-05,
|
||
|
"loss": 0.767,
|
||
|
"step": 5780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.404092071611253,
|
||
|
"grad_norm": 1.371715784072876,
|
||
|
"learning_rate": 2.6447016918967054e-05,
|
||
|
"loss": 0.7285,
|
||
|
"step": 5790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.41687979539642,
|
||
|
"grad_norm": 0.1052427589893341,
|
||
|
"learning_rate": 2.6402493321460375e-05,
|
||
|
"loss": 0.6098,
|
||
|
"step": 5800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.429667519181586,
|
||
|
"grad_norm": 2.0800373554229736,
|
||
|
"learning_rate": 2.6357969723953696e-05,
|
||
|
"loss": 0.5199,
|
||
|
"step": 5810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.442455242966752,
|
||
|
"grad_norm": 0.35600629448890686,
|
||
|
"learning_rate": 2.6313446126447017e-05,
|
||
|
"loss": 0.4764,
|
||
|
"step": 5820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.455242966751918,
|
||
|
"grad_norm": 0.7949437499046326,
|
||
|
"learning_rate": 2.6268922528940338e-05,
|
||
|
"loss": 0.5367,
|
||
|
"step": 5830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.468030690537084,
|
||
|
"grad_norm": 10.288331031799316,
|
||
|
"learning_rate": 2.6224398931433662e-05,
|
||
|
"loss": 0.8891,
|
||
|
"step": 5840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.4808184143222505,
|
||
|
"grad_norm": 6.744941711425781,
|
||
|
"learning_rate": 2.6179875333926983e-05,
|
||
|
"loss": 0.6405,
|
||
|
"step": 5850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.493606138107417,
|
||
|
"grad_norm": 15.893798828125,
|
||
|
"learning_rate": 2.6135351736420304e-05,
|
||
|
"loss": 0.6464,
|
||
|
"step": 5860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.506393861892583,
|
||
|
"grad_norm": 0.04780289903283119,
|
||
|
"learning_rate": 2.6090828138913624e-05,
|
||
|
"loss": 0.5096,
|
||
|
"step": 5870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.5191815856777495,
|
||
|
"grad_norm": 7.183560848236084,
|
||
|
"learning_rate": 2.6046304541406945e-05,
|
||
|
"loss": 0.402,
|
||
|
"step": 5880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.531969309462916,
|
||
|
"grad_norm": 0.7552804350852966,
|
||
|
"learning_rate": 2.6001780943900266e-05,
|
||
|
"loss": 0.4915,
|
||
|
"step": 5890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.544757033248082,
|
||
|
"grad_norm": 4.60145902633667,
|
||
|
"learning_rate": 2.5957257346393587e-05,
|
||
|
"loss": 0.5564,
|
||
|
"step": 5900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.557544757033249,
|
||
|
"grad_norm": 3.2626659870147705,
|
||
|
"learning_rate": 2.5912733748886915e-05,
|
||
|
"loss": 0.6087,
|
||
|
"step": 5910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.570332480818414,
|
||
|
"grad_norm": 3.3970375061035156,
|
||
|
"learning_rate": 2.5868210151380236e-05,
|
||
|
"loss": 0.5361,
|
||
|
"step": 5920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.58312020460358,
|
||
|
"grad_norm": 1.0720630884170532,
|
||
|
"learning_rate": 2.5823686553873557e-05,
|
||
|
"loss": 0.5002,
|
||
|
"step": 5930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.595907928388747,
|
||
|
"grad_norm": 0.7739387154579163,
|
||
|
"learning_rate": 2.5779162956366877e-05,
|
||
|
"loss": 0.5824,
|
||
|
"step": 5940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.608695652173913,
|
||
|
"grad_norm": 12.361614227294922,
|
||
|
"learning_rate": 2.5734639358860195e-05,
|
||
|
"loss": 0.6861,
|
||
|
"step": 5950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.621483375959079,
|
||
|
"grad_norm": 5.803438186645508,
|
||
|
"learning_rate": 2.5690115761353516e-05,
|
||
|
"loss": 0.4415,
|
||
|
"step": 5960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.634271099744246,
|
||
|
"grad_norm": 0.18312691152095795,
|
||
|
"learning_rate": 2.5645592163846837e-05,
|
||
|
"loss": 0.4464,
|
||
|
"step": 5970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.647058823529412,
|
||
|
"grad_norm": 6.178823947906494,
|
||
|
"learning_rate": 2.5601068566340164e-05,
|
||
|
"loss": 0.6077,
|
||
|
"step": 5980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.659846547314578,
|
||
|
"grad_norm": 1.4608851671218872,
|
||
|
"learning_rate": 2.5556544968833485e-05,
|
||
|
"loss": 0.4675,
|
||
|
"step": 5990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.672634271099744,
|
||
|
"grad_norm": 0.0074475789442658424,
|
||
|
"learning_rate": 2.5512021371326806e-05,
|
||
|
"loss": 0.1926,
|
||
|
"step": 6000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.68542199488491,
|
||
|
"grad_norm": 4.761837959289551,
|
||
|
"learning_rate": 2.5467497773820127e-05,
|
||
|
"loss": 0.7276,
|
||
|
"step": 6010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.698209718670077,
|
||
|
"grad_norm": 1.3654205799102783,
|
||
|
"learning_rate": 2.5422974176313448e-05,
|
||
|
"loss": 0.4568,
|
||
|
"step": 6020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.710997442455243,
|
||
|
"grad_norm": 0.9348598122596741,
|
||
|
"learning_rate": 2.537845057880677e-05,
|
||
|
"loss": 0.2002,
|
||
|
"step": 6030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.723785166240409,
|
||
|
"grad_norm": 9.504705429077148,
|
||
|
"learning_rate": 2.533392698130009e-05,
|
||
|
"loss": 0.492,
|
||
|
"step": 6040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.736572890025576,
|
||
|
"grad_norm": 2.3037102222442627,
|
||
|
"learning_rate": 2.5289403383793414e-05,
|
||
|
"loss": 0.4039,
|
||
|
"step": 6050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.749360613810742,
|
||
|
"grad_norm": 0.9193987250328064,
|
||
|
"learning_rate": 2.5244879786286735e-05,
|
||
|
"loss": 0.3467,
|
||
|
"step": 6060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.762148337595908,
|
||
|
"grad_norm": 7.834420680999756,
|
||
|
"learning_rate": 2.5200356188780056e-05,
|
||
|
"loss": 0.3421,
|
||
|
"step": 6070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.774936061381074,
|
||
|
"grad_norm": 6.3842878341674805,
|
||
|
"learning_rate": 2.5155832591273376e-05,
|
||
|
"loss": 0.6775,
|
||
|
"step": 6080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.78772378516624,
|
||
|
"grad_norm": 0.7432451844215393,
|
||
|
"learning_rate": 2.5111308993766697e-05,
|
||
|
"loss": 0.5759,
|
||
|
"step": 6090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.8005115089514065,
|
||
|
"grad_norm": 9.213702201843262,
|
||
|
"learning_rate": 2.5066785396260018e-05,
|
||
|
"loss": 0.737,
|
||
|
"step": 6100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.813299232736573,
|
||
|
"grad_norm": 4.210599422454834,
|
||
|
"learning_rate": 2.502226179875334e-05,
|
||
|
"loss": 0.472,
|
||
|
"step": 6110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.826086956521739,
|
||
|
"grad_norm": 5.298123836517334,
|
||
|
"learning_rate": 2.497773820124666e-05,
|
||
|
"loss": 0.5179,
|
||
|
"step": 6120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.838874680306906,
|
||
|
"grad_norm": 6.451060771942139,
|
||
|
"learning_rate": 2.493321460373998e-05,
|
||
|
"loss": 0.3755,
|
||
|
"step": 6130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.851662404092072,
|
||
|
"grad_norm": 10.77486801147461,
|
||
|
"learning_rate": 2.4888691006233305e-05,
|
||
|
"loss": 0.2594,
|
||
|
"step": 6140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.864450127877237,
|
||
|
"grad_norm": 1.9036270380020142,
|
||
|
"learning_rate": 2.4844167408726626e-05,
|
||
|
"loss": 0.1492,
|
||
|
"step": 6150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.877237851662404,
|
||
|
"grad_norm": 0.526852011680603,
|
||
|
"learning_rate": 2.4799643811219947e-05,
|
||
|
"loss": 0.4159,
|
||
|
"step": 6160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.89002557544757,
|
||
|
"grad_norm": 9.612473487854004,
|
||
|
"learning_rate": 2.475512021371327e-05,
|
||
|
"loss": 0.5882,
|
||
|
"step": 6170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.9028132992327365,
|
||
|
"grad_norm": 2.0128557682037354,
|
||
|
"learning_rate": 2.4710596616206592e-05,
|
||
|
"loss": 0.3371,
|
||
|
"step": 6180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.915601023017903,
|
||
|
"grad_norm": 12.005789756774902,
|
||
|
"learning_rate": 2.4666073018699913e-05,
|
||
|
"loss": 0.5436,
|
||
|
"step": 6190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.928388746803069,
|
||
|
"grad_norm": 2.172546625137329,
|
||
|
"learning_rate": 2.4621549421193234e-05,
|
||
|
"loss": 0.7374,
|
||
|
"step": 6200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.9411764705882355,
|
||
|
"grad_norm": 9.670553207397461,
|
||
|
"learning_rate": 2.4577025823686555e-05,
|
||
|
"loss": 0.8065,
|
||
|
"step": 6210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.953964194373402,
|
||
|
"grad_norm": 5.221970081329346,
|
||
|
"learning_rate": 2.4532502226179875e-05,
|
||
|
"loss": 0.535,
|
||
|
"step": 6220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.966751918158568,
|
||
|
"grad_norm": 7.669173240661621,
|
||
|
"learning_rate": 2.4487978628673196e-05,
|
||
|
"loss": 0.6655,
|
||
|
"step": 6230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.979539641943734,
|
||
|
"grad_norm": 2.3195643424987793,
|
||
|
"learning_rate": 2.444345503116652e-05,
|
||
|
"loss": 0.8417,
|
||
|
"step": 6240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 7.9923273657289,
|
||
|
"grad_norm": 3.1944046020507812,
|
||
|
"learning_rate": 2.439893143365984e-05,
|
||
|
"loss": 0.4028,
|
||
|
"step": 6250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.0,
|
||
|
"eval_loss": 0.25757405161857605,
|
||
|
"eval_runtime": 0.9949,
|
||
|
"eval_samples_per_second": 98.507,
|
||
|
"eval_steps_per_second": 13.067,
|
||
|
"step": 6256
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.005115089514067,
|
||
|
"grad_norm": 13.27065658569336,
|
||
|
"learning_rate": 2.4354407836153162e-05,
|
||
|
"loss": 0.3224,
|
||
|
"step": 6260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.017902813299234,
|
||
|
"grad_norm": 7.944550037384033,
|
||
|
"learning_rate": 2.4309884238646483e-05,
|
||
|
"loss": 0.7011,
|
||
|
"step": 6270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.030690537084398,
|
||
|
"grad_norm": 6.031693458557129,
|
||
|
"learning_rate": 2.4265360641139808e-05,
|
||
|
"loss": 0.4088,
|
||
|
"step": 6280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.043478260869565,
|
||
|
"grad_norm": 13.39229679107666,
|
||
|
"learning_rate": 2.4220837043633125e-05,
|
||
|
"loss": 0.5026,
|
||
|
"step": 6290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.05626598465473,
|
||
|
"grad_norm": 11.215279579162598,
|
||
|
"learning_rate": 2.4176313446126446e-05,
|
||
|
"loss": 0.5442,
|
||
|
"step": 6300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.069053708439897,
|
||
|
"grad_norm": 6.160999298095703,
|
||
|
"learning_rate": 2.413178984861977e-05,
|
||
|
"loss": 0.3378,
|
||
|
"step": 6310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.081841432225064,
|
||
|
"grad_norm": 4.923915863037109,
|
||
|
"learning_rate": 2.408726625111309e-05,
|
||
|
"loss": 0.2429,
|
||
|
"step": 6320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.09462915601023,
|
||
|
"grad_norm": 7.593574523925781,
|
||
|
"learning_rate": 2.4042742653606412e-05,
|
||
|
"loss": 0.6619,
|
||
|
"step": 6330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.107416879795396,
|
||
|
"grad_norm": 16.820772171020508,
|
||
|
"learning_rate": 2.3998219056099733e-05,
|
||
|
"loss": 0.4277,
|
||
|
"step": 6340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.120204603580563,
|
||
|
"grad_norm": 0.1135517805814743,
|
||
|
"learning_rate": 2.3953695458593057e-05,
|
||
|
"loss": 0.2982,
|
||
|
"step": 6350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.132992327365729,
|
||
|
"grad_norm": 2.25067138671875,
|
||
|
"learning_rate": 2.3909171861086378e-05,
|
||
|
"loss": 0.3036,
|
||
|
"step": 6360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.145780051150895,
|
||
|
"grad_norm": 1.080831527709961,
|
||
|
"learning_rate": 2.38646482635797e-05,
|
||
|
"loss": 0.2754,
|
||
|
"step": 6370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.158567774936062,
|
||
|
"grad_norm": 2.591698169708252,
|
||
|
"learning_rate": 2.382012466607302e-05,
|
||
|
"loss": 0.4765,
|
||
|
"step": 6380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.171355498721228,
|
||
|
"grad_norm": 2.097321033477783,
|
||
|
"learning_rate": 2.377560106856634e-05,
|
||
|
"loss": 0.167,
|
||
|
"step": 6390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.184143222506394,
|
||
|
"grad_norm": 0.8653718829154968,
|
||
|
"learning_rate": 2.373107747105966e-05,
|
||
|
"loss": 0.5256,
|
||
|
"step": 6400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.19693094629156,
|
||
|
"grad_norm": 1.544346809387207,
|
||
|
"learning_rate": 2.3686553873552982e-05,
|
||
|
"loss": 0.4664,
|
||
|
"step": 6410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.209718670076727,
|
||
|
"grad_norm": 1.176542043685913,
|
||
|
"learning_rate": 2.3642030276046307e-05,
|
||
|
"loss": 0.8735,
|
||
|
"step": 6420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.222506393861893,
|
||
|
"grad_norm": 0.06903265416622162,
|
||
|
"learning_rate": 2.3597506678539627e-05,
|
||
|
"loss": 0.4592,
|
||
|
"step": 6430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.235294117647058,
|
||
|
"grad_norm": 7.001086711883545,
|
||
|
"learning_rate": 2.3552983081032948e-05,
|
||
|
"loss": 0.4594,
|
||
|
"step": 6440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.248081841432224,
|
||
|
"grad_norm": 0.028170613572001457,
|
||
|
"learning_rate": 2.3508459483526273e-05,
|
||
|
"loss": 0.4996,
|
||
|
"step": 6450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.26086956521739,
|
||
|
"grad_norm": 8.073884010314941,
|
||
|
"learning_rate": 2.3463935886019593e-05,
|
||
|
"loss": 0.5063,
|
||
|
"step": 6460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.273657289002557,
|
||
|
"grad_norm": 34.870094299316406,
|
||
|
"learning_rate": 2.341941228851291e-05,
|
||
|
"loss": 0.5557,
|
||
|
"step": 6470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.286445012787723,
|
||
|
"grad_norm": 4.562534809112549,
|
||
|
"learning_rate": 2.3374888691006232e-05,
|
||
|
"loss": 0.0968,
|
||
|
"step": 6480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.29923273657289,
|
||
|
"grad_norm": 0.04304574057459831,
|
||
|
"learning_rate": 2.3330365093499556e-05,
|
||
|
"loss": 0.2486,
|
||
|
"step": 6490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.312020460358056,
|
||
|
"grad_norm": 7.053394794464111,
|
||
|
"learning_rate": 2.3285841495992877e-05,
|
||
|
"loss": 0.505,
|
||
|
"step": 6500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.324808184143222,
|
||
|
"grad_norm": 9.683124542236328,
|
||
|
"learning_rate": 2.3241317898486198e-05,
|
||
|
"loss": 0.6914,
|
||
|
"step": 6510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.337595907928389,
|
||
|
"grad_norm": 6.789582252502441,
|
||
|
"learning_rate": 2.3196794300979522e-05,
|
||
|
"loss": 0.4666,
|
||
|
"step": 6520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.350383631713555,
|
||
|
"grad_norm": 4.472419261932373,
|
||
|
"learning_rate": 2.3152270703472843e-05,
|
||
|
"loss": 0.381,
|
||
|
"step": 6530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.363171355498721,
|
||
|
"grad_norm": 10.337482452392578,
|
||
|
"learning_rate": 2.3107747105966164e-05,
|
||
|
"loss": 0.5651,
|
||
|
"step": 6540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.375959079283888,
|
||
|
"grad_norm": 1.6700971126556396,
|
||
|
"learning_rate": 2.3063223508459485e-05,
|
||
|
"loss": 0.4265,
|
||
|
"step": 6550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.388746803069054,
|
||
|
"grad_norm": 1.6565567255020142,
|
||
|
"learning_rate": 2.3018699910952806e-05,
|
||
|
"loss": 0.3659,
|
||
|
"step": 6560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.40153452685422,
|
||
|
"grad_norm": 8.731335639953613,
|
||
|
"learning_rate": 2.2974176313446126e-05,
|
||
|
"loss": 0.4689,
|
||
|
"step": 6570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.414322250639387,
|
||
|
"grad_norm": 8.600381851196289,
|
||
|
"learning_rate": 2.2929652715939447e-05,
|
||
|
"loss": 0.8944,
|
||
|
"step": 6580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.427109974424553,
|
||
|
"grad_norm": 10.421477317810059,
|
||
|
"learning_rate": 2.288512911843277e-05,
|
||
|
"loss": 0.4084,
|
||
|
"step": 6590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.43989769820972,
|
||
|
"grad_norm": 0.0770091563463211,
|
||
|
"learning_rate": 2.2840605520926092e-05,
|
||
|
"loss": 0.4492,
|
||
|
"step": 6600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.452685421994884,
|
||
|
"grad_norm": 0.04360765591263771,
|
||
|
"learning_rate": 2.2796081923419413e-05,
|
||
|
"loss": 0.6236,
|
||
|
"step": 6610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.46547314578005,
|
||
|
"grad_norm": 5.0289626121521,
|
||
|
"learning_rate": 2.2751558325912734e-05,
|
||
|
"loss": 0.5115,
|
||
|
"step": 6620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.478260869565217,
|
||
|
"grad_norm": 5.965487957000732,
|
||
|
"learning_rate": 2.270703472840606e-05,
|
||
|
"loss": 0.7749,
|
||
|
"step": 6630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.491048593350383,
|
||
|
"grad_norm": 2.7336697578430176,
|
||
|
"learning_rate": 2.2662511130899376e-05,
|
||
|
"loss": 0.3559,
|
||
|
"step": 6640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.50383631713555,
|
||
|
"grad_norm": 0.37265846133232117,
|
||
|
"learning_rate": 2.2617987533392697e-05,
|
||
|
"loss": 0.5246,
|
||
|
"step": 6650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.516624040920716,
|
||
|
"grad_norm": 14.674330711364746,
|
||
|
"learning_rate": 2.257346393588602e-05,
|
||
|
"loss": 0.6532,
|
||
|
"step": 6660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.529411764705882,
|
||
|
"grad_norm": 6.816307544708252,
|
||
|
"learning_rate": 2.2528940338379342e-05,
|
||
|
"loss": 0.2127,
|
||
|
"step": 6670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.542199488491049,
|
||
|
"grad_norm": 9.356270790100098,
|
||
|
"learning_rate": 2.2484416740872663e-05,
|
||
|
"loss": 0.5478,
|
||
|
"step": 6680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.554987212276215,
|
||
|
"grad_norm": 1.5472646951675415,
|
||
|
"learning_rate": 2.2439893143365984e-05,
|
||
|
"loss": 0.4139,
|
||
|
"step": 6690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.567774936061381,
|
||
|
"grad_norm": 8.75796127319336,
|
||
|
"learning_rate": 2.2395369545859308e-05,
|
||
|
"loss": 0.5247,
|
||
|
"step": 6700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.580562659846548,
|
||
|
"grad_norm": 2.2100446224212646,
|
||
|
"learning_rate": 2.235084594835263e-05,
|
||
|
"loss": 0.4889,
|
||
|
"step": 6710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.593350383631714,
|
||
|
"grad_norm": 4.241175174713135,
|
||
|
"learning_rate": 2.230632235084595e-05,
|
||
|
"loss": 0.3766,
|
||
|
"step": 6720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.60613810741688,
|
||
|
"grad_norm": 0.9913263916969299,
|
||
|
"learning_rate": 2.226179875333927e-05,
|
||
|
"loss": 0.2065,
|
||
|
"step": 6730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.618925831202047,
|
||
|
"grad_norm": 4.9824442863464355,
|
||
|
"learning_rate": 2.221727515583259e-05,
|
||
|
"loss": 0.3993,
|
||
|
"step": 6740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.631713554987213,
|
||
|
"grad_norm": 4.201189041137695,
|
||
|
"learning_rate": 2.2172751558325912e-05,
|
||
|
"loss": 0.3504,
|
||
|
"step": 6750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.644501278772378,
|
||
|
"grad_norm": 6.985659599304199,
|
||
|
"learning_rate": 2.2128227960819233e-05,
|
||
|
"loss": 0.6149,
|
||
|
"step": 6760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.657289002557544,
|
||
|
"grad_norm": 2.4724535942077637,
|
||
|
"learning_rate": 2.2083704363312558e-05,
|
||
|
"loss": 0.6447,
|
||
|
"step": 6770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.67007672634271,
|
||
|
"grad_norm": 10.667593002319336,
|
||
|
"learning_rate": 2.203918076580588e-05,
|
||
|
"loss": 0.6225,
|
||
|
"step": 6780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.682864450127877,
|
||
|
"grad_norm": 8.419607162475586,
|
||
|
"learning_rate": 2.19946571682992e-05,
|
||
|
"loss": 0.5348,
|
||
|
"step": 6790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.695652173913043,
|
||
|
"grad_norm": 0.02665606141090393,
|
||
|
"learning_rate": 2.1950133570792524e-05,
|
||
|
"loss": 0.607,
|
||
|
"step": 6800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.70843989769821,
|
||
|
"grad_norm": 6.91499662399292,
|
||
|
"learning_rate": 2.1905609973285844e-05,
|
||
|
"loss": 0.5484,
|
||
|
"step": 6810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.721227621483376,
|
||
|
"grad_norm": 3.57250714302063,
|
||
|
"learning_rate": 2.1861086375779162e-05,
|
||
|
"loss": 0.1793,
|
||
|
"step": 6820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.734015345268542,
|
||
|
"grad_norm": 2.3197195529937744,
|
||
|
"learning_rate": 2.1816562778272486e-05,
|
||
|
"loss": 0.4837,
|
||
|
"step": 6830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.746803069053708,
|
||
|
"grad_norm": 6.936196327209473,
|
||
|
"learning_rate": 2.1772039180765807e-05,
|
||
|
"loss": 0.4397,
|
||
|
"step": 6840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.759590792838875,
|
||
|
"grad_norm": 1.3397008180618286,
|
||
|
"learning_rate": 2.1727515583259128e-05,
|
||
|
"loss": 0.7489,
|
||
|
"step": 6850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.772378516624041,
|
||
|
"grad_norm": 1.0735710859298706,
|
||
|
"learning_rate": 2.168299198575245e-05,
|
||
|
"loss": 0.4262,
|
||
|
"step": 6860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.785166240409207,
|
||
|
"grad_norm": 2.5606706142425537,
|
||
|
"learning_rate": 2.1638468388245773e-05,
|
||
|
"loss": 0.4965,
|
||
|
"step": 6870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.797953964194374,
|
||
|
"grad_norm": 3.640894889831543,
|
||
|
"learning_rate": 2.1593944790739094e-05,
|
||
|
"loss": 0.8421,
|
||
|
"step": 6880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.81074168797954,
|
||
|
"grad_norm": 4.286122798919678,
|
||
|
"learning_rate": 2.1549421193232415e-05,
|
||
|
"loss": 0.35,
|
||
|
"step": 6890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.823529411764707,
|
||
|
"grad_norm": 5.966396331787109,
|
||
|
"learning_rate": 2.1504897595725736e-05,
|
||
|
"loss": 0.5545,
|
||
|
"step": 6900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.836317135549873,
|
||
|
"grad_norm": 0.748144268989563,
|
||
|
"learning_rate": 2.1460373998219057e-05,
|
||
|
"loss": 0.5225,
|
||
|
"step": 6910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.84910485933504,
|
||
|
"grad_norm": 1.9871532917022705,
|
||
|
"learning_rate": 2.1415850400712377e-05,
|
||
|
"loss": 0.4146,
|
||
|
"step": 6920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.861892583120204,
|
||
|
"grad_norm": 6.072329521179199,
|
||
|
"learning_rate": 2.13713268032057e-05,
|
||
|
"loss": 0.2274,
|
||
|
"step": 6930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.87468030690537,
|
||
|
"grad_norm": 3.948272466659546,
|
||
|
"learning_rate": 2.1326803205699023e-05,
|
||
|
"loss": 0.5427,
|
||
|
"step": 6940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.887468030690536,
|
||
|
"grad_norm": 4.448310375213623,
|
||
|
"learning_rate": 2.1282279608192343e-05,
|
||
|
"loss": 0.3465,
|
||
|
"step": 6950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.900255754475703,
|
||
|
"grad_norm": 2.879702568054199,
|
||
|
"learning_rate": 2.1237756010685664e-05,
|
||
|
"loss": 0.4667,
|
||
|
"step": 6960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.91304347826087,
|
||
|
"grad_norm": 5.268418788909912,
|
||
|
"learning_rate": 2.119323241317899e-05,
|
||
|
"loss": 0.4483,
|
||
|
"step": 6970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.925831202046036,
|
||
|
"grad_norm": 10.505093574523926,
|
||
|
"learning_rate": 2.114870881567231e-05,
|
||
|
"loss": 0.3888,
|
||
|
"step": 6980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.938618925831202,
|
||
|
"grad_norm": 0.7593215107917786,
|
||
|
"learning_rate": 2.1104185218165627e-05,
|
||
|
"loss": 0.1886,
|
||
|
"step": 6990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.951406649616368,
|
||
|
"grad_norm": 9.993316650390625,
|
||
|
"learning_rate": 2.1059661620658948e-05,
|
||
|
"loss": 0.6512,
|
||
|
"step": 7000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.964194373401535,
|
||
|
"grad_norm": 0.009588400833308697,
|
||
|
"learning_rate": 2.1015138023152272e-05,
|
||
|
"loss": 0.4605,
|
||
|
"step": 7010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.976982097186701,
|
||
|
"grad_norm": 0.6530927419662476,
|
||
|
"learning_rate": 2.0970614425645593e-05,
|
||
|
"loss": 0.35,
|
||
|
"step": 7020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 8.989769820971867,
|
||
|
"grad_norm": 0.30554449558258057,
|
||
|
"learning_rate": 2.0926090828138914e-05,
|
||
|
"loss": 0.728,
|
||
|
"step": 7030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.0,
|
||
|
"eval_loss": 0.25451144576072693,
|
||
|
"eval_runtime": 0.9753,
|
||
|
"eval_samples_per_second": 100.482,
|
||
|
"eval_steps_per_second": 13.329,
|
||
|
"step": 7038
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.002557544757034,
|
||
|
"grad_norm": 1.9777408838272095,
|
||
|
"learning_rate": 2.0881567230632238e-05,
|
||
|
"loss": 0.4878,
|
||
|
"step": 7040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.0153452685422,
|
||
|
"grad_norm": 0.005459375213831663,
|
||
|
"learning_rate": 2.083704363312556e-05,
|
||
|
"loss": 0.3858,
|
||
|
"step": 7050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.028132992327366,
|
||
|
"grad_norm": 1.3175240755081177,
|
||
|
"learning_rate": 2.079252003561888e-05,
|
||
|
"loss": 0.5197,
|
||
|
"step": 7060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.040920716112533,
|
||
|
"grad_norm": 12.455684661865234,
|
||
|
"learning_rate": 2.07479964381122e-05,
|
||
|
"loss": 0.3106,
|
||
|
"step": 7070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.053708439897699,
|
||
|
"grad_norm": 1.8953137397766113,
|
||
|
"learning_rate": 2.070347284060552e-05,
|
||
|
"loss": 0.675,
|
||
|
"step": 7080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.066496163682864,
|
||
|
"grad_norm": 0.25667712092399597,
|
||
|
"learning_rate": 2.0658949243098843e-05,
|
||
|
"loss": 0.5439,
|
||
|
"step": 7090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.07928388746803,
|
||
|
"grad_norm": 1.5751233100891113,
|
||
|
"learning_rate": 2.0614425645592163e-05,
|
||
|
"loss": 0.4046,
|
||
|
"step": 7100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.092071611253196,
|
||
|
"grad_norm": 0.4334716200828552,
|
||
|
"learning_rate": 2.0569902048085488e-05,
|
||
|
"loss": 0.7046,
|
||
|
"step": 7110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.104859335038363,
|
||
|
"grad_norm": 6.322332859039307,
|
||
|
"learning_rate": 2.052537845057881e-05,
|
||
|
"loss": 0.421,
|
||
|
"step": 7120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.117647058823529,
|
||
|
"grad_norm": 7.568769454956055,
|
||
|
"learning_rate": 2.048085485307213e-05,
|
||
|
"loss": 0.3811,
|
||
|
"step": 7130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.130434782608695,
|
||
|
"grad_norm": 4.8575334548950195,
|
||
|
"learning_rate": 2.043633125556545e-05,
|
||
|
"loss": 0.5919,
|
||
|
"step": 7140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.143222506393862,
|
||
|
"grad_norm": 2.816992998123169,
|
||
|
"learning_rate": 2.0391807658058775e-05,
|
||
|
"loss": 0.4486,
|
||
|
"step": 7150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.156010230179028,
|
||
|
"grad_norm": 0.024615757167339325,
|
||
|
"learning_rate": 2.0347284060552092e-05,
|
||
|
"loss": 0.4216,
|
||
|
"step": 7160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.168797953964194,
|
||
|
"grad_norm": 13.121672630310059,
|
||
|
"learning_rate": 2.0302760463045413e-05,
|
||
|
"loss": 0.5703,
|
||
|
"step": 7170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.18158567774936,
|
||
|
"grad_norm": 2.4997825622558594,
|
||
|
"learning_rate": 2.0258236865538737e-05,
|
||
|
"loss": 0.3972,
|
||
|
"step": 7180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.194373401534527,
|
||
|
"grad_norm": 0.672987163066864,
|
||
|
"learning_rate": 2.0213713268032058e-05,
|
||
|
"loss": 0.3447,
|
||
|
"step": 7190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.207161125319693,
|
||
|
"grad_norm": 0.3629339337348938,
|
||
|
"learning_rate": 2.016918967052538e-05,
|
||
|
"loss": 0.4216,
|
||
|
"step": 7200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.21994884910486,
|
||
|
"grad_norm": 0.4874151647090912,
|
||
|
"learning_rate": 2.01246660730187e-05,
|
||
|
"loss": 0.4348,
|
||
|
"step": 7210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.232736572890026,
|
||
|
"grad_norm": 3.5700504779815674,
|
||
|
"learning_rate": 2.0080142475512024e-05,
|
||
|
"loss": 0.7219,
|
||
|
"step": 7220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.245524296675192,
|
||
|
"grad_norm": 7.892800331115723,
|
||
|
"learning_rate": 2.0035618878005345e-05,
|
||
|
"loss": 0.6184,
|
||
|
"step": 7230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.258312020460359,
|
||
|
"grad_norm": 6.028756618499756,
|
||
|
"learning_rate": 1.9991095280498666e-05,
|
||
|
"loss": 0.2687,
|
||
|
"step": 7240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.271099744245525,
|
||
|
"grad_norm": 7.790216445922852,
|
||
|
"learning_rate": 1.9946571682991987e-05,
|
||
|
"loss": 0.6128,
|
||
|
"step": 7250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.28388746803069,
|
||
|
"grad_norm": 0.28933781385421753,
|
||
|
"learning_rate": 1.9902048085485308e-05,
|
||
|
"loss": 0.3225,
|
||
|
"step": 7260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.296675191815856,
|
||
|
"grad_norm": 3.2421770095825195,
|
||
|
"learning_rate": 1.985752448797863e-05,
|
||
|
"loss": 0.4387,
|
||
|
"step": 7270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.309462915601022,
|
||
|
"grad_norm": 0.5195609331130981,
|
||
|
"learning_rate": 1.981300089047195e-05,
|
||
|
"loss": 0.4904,
|
||
|
"step": 7280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.322250639386189,
|
||
|
"grad_norm": 1.5599663257598877,
|
||
|
"learning_rate": 1.9768477292965274e-05,
|
||
|
"loss": 0.2514,
|
||
|
"step": 7290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.335038363171355,
|
||
|
"grad_norm": 3.887960195541382,
|
||
|
"learning_rate": 1.9723953695458594e-05,
|
||
|
"loss": 0.4621,
|
||
|
"step": 7300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.347826086956522,
|
||
|
"grad_norm": 2.202698230743408,
|
||
|
"learning_rate": 1.9679430097951915e-05,
|
||
|
"loss": 0.2866,
|
||
|
"step": 7310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.360613810741688,
|
||
|
"grad_norm": 0.9991092085838318,
|
||
|
"learning_rate": 1.963490650044524e-05,
|
||
|
"loss": 0.2594,
|
||
|
"step": 7320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.373401534526854,
|
||
|
"grad_norm": 0.00048051172052510083,
|
||
|
"learning_rate": 1.959038290293856e-05,
|
||
|
"loss": 0.2282,
|
||
|
"step": 7330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.38618925831202,
|
||
|
"grad_norm": 10.449288368225098,
|
||
|
"learning_rate": 1.9545859305431878e-05,
|
||
|
"loss": 0.4363,
|
||
|
"step": 7340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.398976982097187,
|
||
|
"grad_norm": 6.0978569984436035,
|
||
|
"learning_rate": 1.95013357079252e-05,
|
||
|
"loss": 0.4363,
|
||
|
"step": 7350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.411764705882353,
|
||
|
"grad_norm": 13.751110076904297,
|
||
|
"learning_rate": 1.9456812110418523e-05,
|
||
|
"loss": 0.5147,
|
||
|
"step": 7360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.42455242966752,
|
||
|
"grad_norm": 1.5191411972045898,
|
||
|
"learning_rate": 1.9412288512911844e-05,
|
||
|
"loss": 0.2839,
|
||
|
"step": 7370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.437340153452686,
|
||
|
"grad_norm": 0.6311479806900024,
|
||
|
"learning_rate": 1.9367764915405165e-05,
|
||
|
"loss": 0.4466,
|
||
|
"step": 7380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.450127877237852,
|
||
|
"grad_norm": 2.6968650817871094,
|
||
|
"learning_rate": 1.932324131789849e-05,
|
||
|
"loss": 0.3821,
|
||
|
"step": 7390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.462915601023019,
|
||
|
"grad_norm": 2.450756549835205,
|
||
|
"learning_rate": 1.927871772039181e-05,
|
||
|
"loss": 0.4983,
|
||
|
"step": 7400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.475703324808185,
|
||
|
"grad_norm": 2.733262538909912,
|
||
|
"learning_rate": 1.923419412288513e-05,
|
||
|
"loss": 0.3849,
|
||
|
"step": 7410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.48849104859335,
|
||
|
"grad_norm": 1.4597264528274536,
|
||
|
"learning_rate": 1.9189670525378452e-05,
|
||
|
"loss": 0.1508,
|
||
|
"step": 7420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.501278772378516,
|
||
|
"grad_norm": 2.9642603397369385,
|
||
|
"learning_rate": 1.9145146927871773e-05,
|
||
|
"loss": 0.093,
|
||
|
"step": 7430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.514066496163682,
|
||
|
"grad_norm": 2.610978603363037,
|
||
|
"learning_rate": 1.9100623330365093e-05,
|
||
|
"loss": 0.4723,
|
||
|
"step": 7440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.526854219948849,
|
||
|
"grad_norm": 4.64446496963501,
|
||
|
"learning_rate": 1.9056099732858414e-05,
|
||
|
"loss": 0.5699,
|
||
|
"step": 7450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.539641943734015,
|
||
|
"grad_norm": 2.086815357208252,
|
||
|
"learning_rate": 1.901157613535174e-05,
|
||
|
"loss": 0.3695,
|
||
|
"step": 7460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.552429667519181,
|
||
|
"grad_norm": 5.845988750457764,
|
||
|
"learning_rate": 1.896705253784506e-05,
|
||
|
"loss": 0.5565,
|
||
|
"step": 7470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.565217391304348,
|
||
|
"grad_norm": 1.279890537261963,
|
||
|
"learning_rate": 1.892252894033838e-05,
|
||
|
"loss": 0.4849,
|
||
|
"step": 7480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.578005115089514,
|
||
|
"grad_norm": 0.4533943235874176,
|
||
|
"learning_rate": 1.88780053428317e-05,
|
||
|
"loss": 0.3737,
|
||
|
"step": 7490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.59079283887468,
|
||
|
"grad_norm": 2.034996271133423,
|
||
|
"learning_rate": 1.8833481745325026e-05,
|
||
|
"loss": 0.6928,
|
||
|
"step": 7500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.603580562659847,
|
||
|
"grad_norm": 0.6041597723960876,
|
||
|
"learning_rate": 1.8788958147818343e-05,
|
||
|
"loss": 0.382,
|
||
|
"step": 7510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.616368286445013,
|
||
|
"grad_norm": 8.170845031738281,
|
||
|
"learning_rate": 1.8744434550311664e-05,
|
||
|
"loss": 0.4653,
|
||
|
"step": 7520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.62915601023018,
|
||
|
"grad_norm": 0.5266035199165344,
|
||
|
"learning_rate": 1.8699910952804988e-05,
|
||
|
"loss": 0.5368,
|
||
|
"step": 7530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.641943734015346,
|
||
|
"grad_norm": 9.099715232849121,
|
||
|
"learning_rate": 1.865538735529831e-05,
|
||
|
"loss": 0.5995,
|
||
|
"step": 7540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.654731457800512,
|
||
|
"grad_norm": 0.364877313375473,
|
||
|
"learning_rate": 1.861086375779163e-05,
|
||
|
"loss": 0.5252,
|
||
|
"step": 7550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.667519181585678,
|
||
|
"grad_norm": 0.291299045085907,
|
||
|
"learning_rate": 1.856634016028495e-05,
|
||
|
"loss": 0.5014,
|
||
|
"step": 7560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.680306905370845,
|
||
|
"grad_norm": 10.438728332519531,
|
||
|
"learning_rate": 1.8521816562778275e-05,
|
||
|
"loss": 0.5446,
|
||
|
"step": 7570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.693094629156011,
|
||
|
"grad_norm": 1.1329154968261719,
|
||
|
"learning_rate": 1.8477292965271596e-05,
|
||
|
"loss": 0.5574,
|
||
|
"step": 7580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.705882352941176,
|
||
|
"grad_norm": 0.9410423040390015,
|
||
|
"learning_rate": 1.8432769367764917e-05,
|
||
|
"loss": 0.6805,
|
||
|
"step": 7590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.718670076726342,
|
||
|
"grad_norm": 13.469749450683594,
|
||
|
"learning_rate": 1.8388245770258238e-05,
|
||
|
"loss": 0.7721,
|
||
|
"step": 7600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.731457800511508,
|
||
|
"grad_norm": 12.993340492248535,
|
||
|
"learning_rate": 1.834372217275156e-05,
|
||
|
"loss": 0.4185,
|
||
|
"step": 7610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.744245524296675,
|
||
|
"grad_norm": 0.5286237001419067,
|
||
|
"learning_rate": 1.829919857524488e-05,
|
||
|
"loss": 0.2877,
|
||
|
"step": 7620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.757033248081841,
|
||
|
"grad_norm": 7.435681343078613,
|
||
|
"learning_rate": 1.82546749777382e-05,
|
||
|
"loss": 0.6757,
|
||
|
"step": 7630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.769820971867007,
|
||
|
"grad_norm": 2.0388455390930176,
|
||
|
"learning_rate": 1.8210151380231525e-05,
|
||
|
"loss": 0.4057,
|
||
|
"step": 7640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.782608695652174,
|
||
|
"grad_norm": 2.632791757583618,
|
||
|
"learning_rate": 1.8165627782724845e-05,
|
||
|
"loss": 0.5251,
|
||
|
"step": 7650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.79539641943734,
|
||
|
"grad_norm": 0.16664129495620728,
|
||
|
"learning_rate": 1.8121104185218166e-05,
|
||
|
"loss": 0.252,
|
||
|
"step": 7660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.808184143222507,
|
||
|
"grad_norm": 0.003070043632760644,
|
||
|
"learning_rate": 1.807658058771149e-05,
|
||
|
"loss": 0.7866,
|
||
|
"step": 7670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.820971867007673,
|
||
|
"grad_norm": 5.778616428375244,
|
||
|
"learning_rate": 1.8032056990204808e-05,
|
||
|
"loss": 0.4152,
|
||
|
"step": 7680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.83375959079284,
|
||
|
"grad_norm": 0.07909457385540009,
|
||
|
"learning_rate": 1.798753339269813e-05,
|
||
|
"loss": 0.3552,
|
||
|
"step": 7690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.846547314578006,
|
||
|
"grad_norm": 8.888358116149902,
|
||
|
"learning_rate": 1.794300979519145e-05,
|
||
|
"loss": 0.3736,
|
||
|
"step": 7700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.859335038363172,
|
||
|
"grad_norm": 11.318596839904785,
|
||
|
"learning_rate": 1.7898486197684774e-05,
|
||
|
"loss": 0.4869,
|
||
|
"step": 7710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.872122762148338,
|
||
|
"grad_norm": 0.5175634026527405,
|
||
|
"learning_rate": 1.7853962600178095e-05,
|
||
|
"loss": 0.2856,
|
||
|
"step": 7720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.884910485933505,
|
||
|
"grad_norm": 7.7081379890441895,
|
||
|
"learning_rate": 1.7809439002671416e-05,
|
||
|
"loss": 0.3757,
|
||
|
"step": 7730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.89769820971867,
|
||
|
"grad_norm": 7.398036956787109,
|
||
|
"learning_rate": 1.776491540516474e-05,
|
||
|
"loss": 0.6392,
|
||
|
"step": 7740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.910485933503836,
|
||
|
"grad_norm": 0.41945111751556396,
|
||
|
"learning_rate": 1.772039180765806e-05,
|
||
|
"loss": 0.3977,
|
||
|
"step": 7750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.923273657289002,
|
||
|
"grad_norm": 0.057822152972221375,
|
||
|
"learning_rate": 1.7675868210151382e-05,
|
||
|
"loss": 0.6515,
|
||
|
"step": 7760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.936061381074168,
|
||
|
"grad_norm": 2.6882266998291016,
|
||
|
"learning_rate": 1.7631344612644703e-05,
|
||
|
"loss": 0.4997,
|
||
|
"step": 7770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.948849104859335,
|
||
|
"grad_norm": 1.6198879480361938,
|
||
|
"learning_rate": 1.7586821015138024e-05,
|
||
|
"loss": 0.364,
|
||
|
"step": 7780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.961636828644501,
|
||
|
"grad_norm": 4.458327293395996,
|
||
|
"learning_rate": 1.7542297417631344e-05,
|
||
|
"loss": 0.5551,
|
||
|
"step": 7790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.974424552429667,
|
||
|
"grad_norm": 9.222285270690918,
|
||
|
"learning_rate": 1.7497773820124665e-05,
|
||
|
"loss": 0.5828,
|
||
|
"step": 7800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 9.987212276214834,
|
||
|
"grad_norm": 8.898882865905762,
|
||
|
"learning_rate": 1.745325022261799e-05,
|
||
|
"loss": 0.4641,
|
||
|
"step": 7810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.0,
|
||
|
"grad_norm": 8.259880065917969,
|
||
|
"learning_rate": 1.740872662511131e-05,
|
||
|
"loss": 0.4681,
|
||
|
"step": 7820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.0,
|
||
|
"eval_loss": 0.24825578927993774,
|
||
|
"eval_runtime": 0.8161,
|
||
|
"eval_samples_per_second": 120.081,
|
||
|
"eval_steps_per_second": 15.929,
|
||
|
"step": 7820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.012787723785166,
|
||
|
"grad_norm": 11.80496597290039,
|
||
|
"learning_rate": 1.736420302760463e-05,
|
||
|
"loss": 0.4748,
|
||
|
"step": 7830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.025575447570333,
|
||
|
"grad_norm": 0.5205655694007874,
|
||
|
"learning_rate": 1.7319679430097952e-05,
|
||
|
"loss": 0.593,
|
||
|
"step": 7840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.038363171355499,
|
||
|
"grad_norm": 1.392087459564209,
|
||
|
"learning_rate": 1.7275155832591277e-05,
|
||
|
"loss": 0.5828,
|
||
|
"step": 7850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.051150895140665,
|
||
|
"grad_norm": 6.822452545166016,
|
||
|
"learning_rate": 1.7230632235084594e-05,
|
||
|
"loss": 0.4847,
|
||
|
"step": 7860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.063938618925832,
|
||
|
"grad_norm": 4.766864776611328,
|
||
|
"learning_rate": 1.7186108637577915e-05,
|
||
|
"loss": 0.3282,
|
||
|
"step": 7870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.076726342710998,
|
||
|
"grad_norm": 9.132004737854004,
|
||
|
"learning_rate": 1.714158504007124e-05,
|
||
|
"loss": 0.4287,
|
||
|
"step": 7880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.089514066496164,
|
||
|
"grad_norm": 0.027325913310050964,
|
||
|
"learning_rate": 1.709706144256456e-05,
|
||
|
"loss": 0.2849,
|
||
|
"step": 7890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.10230179028133,
|
||
|
"grad_norm": 3.668032169342041,
|
||
|
"learning_rate": 1.705253784505788e-05,
|
||
|
"loss": 0.2403,
|
||
|
"step": 7900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.115089514066495,
|
||
|
"grad_norm": 0.08681362867355347,
|
||
|
"learning_rate": 1.7008014247551202e-05,
|
||
|
"loss": 0.6101,
|
||
|
"step": 7910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.127877237851662,
|
||
|
"grad_norm": 0.0037536800373345613,
|
||
|
"learning_rate": 1.6963490650044526e-05,
|
||
|
"loss": 0.2353,
|
||
|
"step": 7920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.140664961636828,
|
||
|
"grad_norm": 4.245620250701904,
|
||
|
"learning_rate": 1.6918967052537847e-05,
|
||
|
"loss": 0.4882,
|
||
|
"step": 7930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.153452685421994,
|
||
|
"grad_norm": 3.511157751083374,
|
||
|
"learning_rate": 1.6874443455031168e-05,
|
||
|
"loss": 0.2636,
|
||
|
"step": 7940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.16624040920716,
|
||
|
"grad_norm": 0.33653515577316284,
|
||
|
"learning_rate": 1.682991985752449e-05,
|
||
|
"loss": 0.2167,
|
||
|
"step": 7950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.179028132992327,
|
||
|
"grad_norm": 2.5212366580963135,
|
||
|
"learning_rate": 1.678539626001781e-05,
|
||
|
"loss": 0.3128,
|
||
|
"step": 7960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.191815856777493,
|
||
|
"grad_norm": 0.11155296862125397,
|
||
|
"learning_rate": 1.674087266251113e-05,
|
||
|
"loss": 0.1596,
|
||
|
"step": 7970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.20460358056266,
|
||
|
"grad_norm": 4.983904838562012,
|
||
|
"learning_rate": 1.669634906500445e-05,
|
||
|
"loss": 0.4555,
|
||
|
"step": 7980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.217391304347826,
|
||
|
"grad_norm": 2.878169536590576,
|
||
|
"learning_rate": 1.6651825467497776e-05,
|
||
|
"loss": 0.4383,
|
||
|
"step": 7990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.230179028132993,
|
||
|
"grad_norm": 2.0731053352355957,
|
||
|
"learning_rate": 1.6607301869991096e-05,
|
||
|
"loss": 0.4476,
|
||
|
"step": 8000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.242966751918159,
|
||
|
"grad_norm": 1.8638560771942139,
|
||
|
"learning_rate": 1.6562778272484417e-05,
|
||
|
"loss": 0.6536,
|
||
|
"step": 8010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.255754475703325,
|
||
|
"grad_norm": 3.486363172531128,
|
||
|
"learning_rate": 1.651825467497774e-05,
|
||
|
"loss": 0.2838,
|
||
|
"step": 8020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.268542199488492,
|
||
|
"grad_norm": 1.8177838325500488,
|
||
|
"learning_rate": 1.647373107747106e-05,
|
||
|
"loss": 0.5319,
|
||
|
"step": 8030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.281329923273658,
|
||
|
"grad_norm": 2.788775682449341,
|
||
|
"learning_rate": 1.642920747996438e-05,
|
||
|
"loss": 0.2796,
|
||
|
"step": 8040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.294117647058824,
|
||
|
"grad_norm": 3.992607355117798,
|
||
|
"learning_rate": 1.63846838824577e-05,
|
||
|
"loss": 0.3864,
|
||
|
"step": 8050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.30690537084399,
|
||
|
"grad_norm": 1.2651846408843994,
|
||
|
"learning_rate": 1.6340160284951025e-05,
|
||
|
"loss": 0.5798,
|
||
|
"step": 8060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.319693094629155,
|
||
|
"grad_norm": 5.219095706939697,
|
||
|
"learning_rate": 1.6295636687444346e-05,
|
||
|
"loss": 0.6797,
|
||
|
"step": 8070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.332480818414322,
|
||
|
"grad_norm": 9.029091835021973,
|
||
|
"learning_rate": 1.6251113089937667e-05,
|
||
|
"loss": 0.7711,
|
||
|
"step": 8080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.345268542199488,
|
||
|
"grad_norm": 1.4664132595062256,
|
||
|
"learning_rate": 1.620658949243099e-05,
|
||
|
"loss": 0.6837,
|
||
|
"step": 8090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.358056265984654,
|
||
|
"grad_norm": 1.5009568929672241,
|
||
|
"learning_rate": 1.6162065894924312e-05,
|
||
|
"loss": 0.2394,
|
||
|
"step": 8100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.37084398976982,
|
||
|
"grad_norm": 3.754551410675049,
|
||
|
"learning_rate": 1.6117542297417633e-05,
|
||
|
"loss": 0.5243,
|
||
|
"step": 8110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.383631713554987,
|
||
|
"grad_norm": 0.4475407302379608,
|
||
|
"learning_rate": 1.6073018699910954e-05,
|
||
|
"loss": 0.4114,
|
||
|
"step": 8120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.396419437340153,
|
||
|
"grad_norm": 0.2138330191373825,
|
||
|
"learning_rate": 1.6028495102404275e-05,
|
||
|
"loss": 0.4244,
|
||
|
"step": 8130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.40920716112532,
|
||
|
"grad_norm": 0.3168162703514099,
|
||
|
"learning_rate": 1.5983971504897595e-05,
|
||
|
"loss": 0.3199,
|
||
|
"step": 8140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.421994884910486,
|
||
|
"grad_norm": 7.511226654052734,
|
||
|
"learning_rate": 1.5939447907390916e-05,
|
||
|
"loss": 0.8374,
|
||
|
"step": 8150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.434782608695652,
|
||
|
"grad_norm": 1.9676591157913208,
|
||
|
"learning_rate": 1.589492430988424e-05,
|
||
|
"loss": 0.3264,
|
||
|
"step": 8160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.447570332480819,
|
||
|
"grad_norm": 0.2713398039340973,
|
||
|
"learning_rate": 1.585040071237756e-05,
|
||
|
"loss": 0.1938,
|
||
|
"step": 8170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.460358056265985,
|
||
|
"grad_norm": 3.062129020690918,
|
||
|
"learning_rate": 1.5805877114870882e-05,
|
||
|
"loss": 0.3254,
|
||
|
"step": 8180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.473145780051151,
|
||
|
"grad_norm": 6.017972946166992,
|
||
|
"learning_rate": 1.5761353517364203e-05,
|
||
|
"loss": 0.298,
|
||
|
"step": 8190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.485933503836318,
|
||
|
"grad_norm": 4.598347187042236,
|
||
|
"learning_rate": 1.5716829919857528e-05,
|
||
|
"loss": 0.5199,
|
||
|
"step": 8200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.498721227621484,
|
||
|
"grad_norm": 4.863534927368164,
|
||
|
"learning_rate": 1.5672306322350845e-05,
|
||
|
"loss": 0.6863,
|
||
|
"step": 8210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.51150895140665,
|
||
|
"grad_norm": 2.691654682159424,
|
||
|
"learning_rate": 1.5627782724844166e-05,
|
||
|
"loss": 0.3303,
|
||
|
"step": 8220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.524296675191817,
|
||
|
"grad_norm": 4.457635879516602,
|
||
|
"learning_rate": 1.558325912733749e-05,
|
||
|
"loss": 0.4365,
|
||
|
"step": 8230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.537084398976981,
|
||
|
"grad_norm": 5.06966495513916,
|
||
|
"learning_rate": 1.553873552983081e-05,
|
||
|
"loss": 0.3855,
|
||
|
"step": 8240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.549872122762148,
|
||
|
"grad_norm": 1.14505934715271,
|
||
|
"learning_rate": 1.5494211932324132e-05,
|
||
|
"loss": 0.3378,
|
||
|
"step": 8250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.562659846547314,
|
||
|
"grad_norm": 2.8286354541778564,
|
||
|
"learning_rate": 1.5449688334817456e-05,
|
||
|
"loss": 0.3858,
|
||
|
"step": 8260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.57544757033248,
|
||
|
"grad_norm": 0.6444804668426514,
|
||
|
"learning_rate": 1.5405164737310777e-05,
|
||
|
"loss": 0.3181,
|
||
|
"step": 8270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.588235294117647,
|
||
|
"grad_norm": 1.543545126914978,
|
||
|
"learning_rate": 1.5360641139804098e-05,
|
||
|
"loss": 0.5232,
|
||
|
"step": 8280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.601023017902813,
|
||
|
"grad_norm": 1.701751708984375,
|
||
|
"learning_rate": 1.531611754229742e-05,
|
||
|
"loss": 0.4632,
|
||
|
"step": 8290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.61381074168798,
|
||
|
"grad_norm": 3.7274434566497803,
|
||
|
"learning_rate": 1.527159394479074e-05,
|
||
|
"loss": 0.3775,
|
||
|
"step": 8300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.626598465473146,
|
||
|
"grad_norm": 3.264287233352661,
|
||
|
"learning_rate": 1.5227070347284062e-05,
|
||
|
"loss": 0.5725,
|
||
|
"step": 8310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.639386189258312,
|
||
|
"grad_norm": 1.2630383968353271,
|
||
|
"learning_rate": 1.5182546749777381e-05,
|
||
|
"loss": 0.2777,
|
||
|
"step": 8320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.652173913043478,
|
||
|
"grad_norm": 1.6009305715560913,
|
||
|
"learning_rate": 1.5138023152270706e-05,
|
||
|
"loss": 0.6674,
|
||
|
"step": 8330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.664961636828645,
|
||
|
"grad_norm": 3.4451444149017334,
|
||
|
"learning_rate": 1.5093499554764027e-05,
|
||
|
"loss": 0.3947,
|
||
|
"step": 8340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.677749360613811,
|
||
|
"grad_norm": 11.27043342590332,
|
||
|
"learning_rate": 1.5048975957257347e-05,
|
||
|
"loss": 0.6128,
|
||
|
"step": 8350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.690537084398978,
|
||
|
"grad_norm": 5.958801746368408,
|
||
|
"learning_rate": 1.5004452359750668e-05,
|
||
|
"loss": 0.4243,
|
||
|
"step": 8360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.703324808184144,
|
||
|
"grad_norm": 1.0724457502365112,
|
||
|
"learning_rate": 1.4959928762243991e-05,
|
||
|
"loss": 0.2594,
|
||
|
"step": 8370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.71611253196931,
|
||
|
"grad_norm": 0.009859677404165268,
|
||
|
"learning_rate": 1.4915405164737312e-05,
|
||
|
"loss": 0.3739,
|
||
|
"step": 8380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.728900255754475,
|
||
|
"grad_norm": 0.07025722414255142,
|
||
|
"learning_rate": 1.4870881567230633e-05,
|
||
|
"loss": 0.4447,
|
||
|
"step": 8390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.741687979539641,
|
||
|
"grad_norm": 0.9080604314804077,
|
||
|
"learning_rate": 1.4826357969723955e-05,
|
||
|
"loss": 0.3147,
|
||
|
"step": 8400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.754475703324808,
|
||
|
"grad_norm": 4.071467399597168,
|
||
|
"learning_rate": 1.4781834372217276e-05,
|
||
|
"loss": 0.808,
|
||
|
"step": 8410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.767263427109974,
|
||
|
"grad_norm": 1.357560634613037,
|
||
|
"learning_rate": 1.4737310774710597e-05,
|
||
|
"loss": 0.4693,
|
||
|
"step": 8420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.78005115089514,
|
||
|
"grad_norm": 9.695436477661133,
|
||
|
"learning_rate": 1.4692787177203918e-05,
|
||
|
"loss": 0.7551,
|
||
|
"step": 8430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.792838874680307,
|
||
|
"grad_norm": 7.561251163482666,
|
||
|
"learning_rate": 1.464826357969724e-05,
|
||
|
"loss": 0.5718,
|
||
|
"step": 8440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.805626598465473,
|
||
|
"grad_norm": 5.493609428405762,
|
||
|
"learning_rate": 1.4603739982190561e-05,
|
||
|
"loss": 0.5034,
|
||
|
"step": 8450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.81841432225064,
|
||
|
"grad_norm": 1.3392680883407593,
|
||
|
"learning_rate": 1.4559216384683882e-05,
|
||
|
"loss": 0.9159,
|
||
|
"step": 8460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.831202046035806,
|
||
|
"grad_norm": 0.01922355219721794,
|
||
|
"learning_rate": 1.4514692787177206e-05,
|
||
|
"loss": 0.0897,
|
||
|
"step": 8470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.843989769820972,
|
||
|
"grad_norm": 4.552350044250488,
|
||
|
"learning_rate": 1.4470169189670527e-05,
|
||
|
"loss": 0.4221,
|
||
|
"step": 8480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.856777493606138,
|
||
|
"grad_norm": 0.30915215611457825,
|
||
|
"learning_rate": 1.4425645592163846e-05,
|
||
|
"loss": 0.2513,
|
||
|
"step": 8490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.869565217391305,
|
||
|
"grad_norm": 7.572796821594238,
|
||
|
"learning_rate": 1.4381121994657167e-05,
|
||
|
"loss": 0.6666,
|
||
|
"step": 8500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.882352941176471,
|
||
|
"grad_norm": 3.443830966949463,
|
||
|
"learning_rate": 1.4336598397150492e-05,
|
||
|
"loss": 0.2706,
|
||
|
"step": 8510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.895140664961637,
|
||
|
"grad_norm": 2.4022560119628906,
|
||
|
"learning_rate": 1.4292074799643812e-05,
|
||
|
"loss": 0.4621,
|
||
|
"step": 8520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.907928388746804,
|
||
|
"grad_norm": 1.282814860343933,
|
||
|
"learning_rate": 1.4247551202137133e-05,
|
||
|
"loss": 0.2189,
|
||
|
"step": 8530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.92071611253197,
|
||
|
"grad_norm": 0.48741990327835083,
|
||
|
"learning_rate": 1.4203027604630456e-05,
|
||
|
"loss": 0.6214,
|
||
|
"step": 8540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.933503836317136,
|
||
|
"grad_norm": 5.9614434242248535,
|
||
|
"learning_rate": 1.4158504007123777e-05,
|
||
|
"loss": 0.3349,
|
||
|
"step": 8550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.946291560102301,
|
||
|
"grad_norm": 1.9144011735916138,
|
||
|
"learning_rate": 1.4113980409617098e-05,
|
||
|
"loss": 0.4819,
|
||
|
"step": 8560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.959079283887467,
|
||
|
"grad_norm": 1.8512147665023804,
|
||
|
"learning_rate": 1.4069456812110419e-05,
|
||
|
"loss": 0.3379,
|
||
|
"step": 8570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.971867007672634,
|
||
|
"grad_norm": 5.208499908447266,
|
||
|
"learning_rate": 1.4024933214603741e-05,
|
||
|
"loss": 0.1968,
|
||
|
"step": 8580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.9846547314578,
|
||
|
"grad_norm": 0.16035664081573486,
|
||
|
"learning_rate": 1.3980409617097062e-05,
|
||
|
"loss": 0.4305,
|
||
|
"step": 8590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 10.997442455242966,
|
||
|
"grad_norm": 7.619056224822998,
|
||
|
"learning_rate": 1.3935886019590383e-05,
|
||
|
"loss": 0.6934,
|
||
|
"step": 8600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.0,
|
||
|
"eval_loss": 0.23721352219581604,
|
||
|
"eval_runtime": 0.9685,
|
||
|
"eval_samples_per_second": 101.184,
|
||
|
"eval_steps_per_second": 13.422,
|
||
|
"step": 8602
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.010230179028133,
|
||
|
"grad_norm": 6.740601062774658,
|
||
|
"learning_rate": 1.3891362422083707e-05,
|
||
|
"loss": 0.459,
|
||
|
"step": 8610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.023017902813299,
|
||
|
"grad_norm": 0.46416130661964417,
|
||
|
"learning_rate": 1.3846838824577026e-05,
|
||
|
"loss": 0.3525,
|
||
|
"step": 8620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.035805626598465,
|
||
|
"grad_norm": 7.859157562255859,
|
||
|
"learning_rate": 1.3802315227070347e-05,
|
||
|
"loss": 0.4976,
|
||
|
"step": 8630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.048593350383632,
|
||
|
"grad_norm": 0.018627116456627846,
|
||
|
"learning_rate": 1.3757791629563668e-05,
|
||
|
"loss": 0.2593,
|
||
|
"step": 8640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.061381074168798,
|
||
|
"grad_norm": 0.6259503960609436,
|
||
|
"learning_rate": 1.3713268032056992e-05,
|
||
|
"loss": 0.4457,
|
||
|
"step": 8650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.074168797953964,
|
||
|
"grad_norm": 0.4090126156806946,
|
||
|
"learning_rate": 1.3668744434550313e-05,
|
||
|
"loss": 0.4761,
|
||
|
"step": 8660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.08695652173913,
|
||
|
"grad_norm": 0.9712822437286377,
|
||
|
"learning_rate": 1.3624220837043632e-05,
|
||
|
"loss": 0.3631,
|
||
|
"step": 8670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.099744245524297,
|
||
|
"grad_norm": 5.4518046379089355,
|
||
|
"learning_rate": 1.3579697239536957e-05,
|
||
|
"loss": 0.3915,
|
||
|
"step": 8680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.112531969309464,
|
||
|
"grad_norm": 11.45752239227295,
|
||
|
"learning_rate": 1.3535173642030278e-05,
|
||
|
"loss": 0.4239,
|
||
|
"step": 8690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.12531969309463,
|
||
|
"grad_norm": 0.3467662036418915,
|
||
|
"learning_rate": 1.3490650044523598e-05,
|
||
|
"loss": 0.6385,
|
||
|
"step": 8700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.138107416879796,
|
||
|
"grad_norm": 0.231705442070961,
|
||
|
"learning_rate": 1.344612644701692e-05,
|
||
|
"loss": 0.3373,
|
||
|
"step": 8710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.15089514066496,
|
||
|
"grad_norm": 7.90257453918457,
|
||
|
"learning_rate": 1.3401602849510242e-05,
|
||
|
"loss": 0.5158,
|
||
|
"step": 8720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.163682864450127,
|
||
|
"grad_norm": 0.020741138607263565,
|
||
|
"learning_rate": 1.3357079252003563e-05,
|
||
|
"loss": 0.4563,
|
||
|
"step": 8730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.176470588235293,
|
||
|
"grad_norm": 9.415164947509766,
|
||
|
"learning_rate": 1.3312555654496884e-05,
|
||
|
"loss": 0.5224,
|
||
|
"step": 8740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.18925831202046,
|
||
|
"grad_norm": 11.169380187988281,
|
||
|
"learning_rate": 1.3268032056990206e-05,
|
||
|
"loss": 0.477,
|
||
|
"step": 8750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.202046035805626,
|
||
|
"grad_norm": 6.20239782333374,
|
||
|
"learning_rate": 1.3223508459483527e-05,
|
||
|
"loss": 0.41,
|
||
|
"step": 8760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.214833759590793,
|
||
|
"grad_norm": 11.079065322875977,
|
||
|
"learning_rate": 1.3178984861976848e-05,
|
||
|
"loss": 0.5932,
|
||
|
"step": 8770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.227621483375959,
|
||
|
"grad_norm": 0.30395638942718506,
|
||
|
"learning_rate": 1.3134461264470169e-05,
|
||
|
"loss": 0.4147,
|
||
|
"step": 8780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.240409207161125,
|
||
|
"grad_norm": 1.8934444189071655,
|
||
|
"learning_rate": 1.3089937666963491e-05,
|
||
|
"loss": 0.4328,
|
||
|
"step": 8790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.253196930946292,
|
||
|
"grad_norm": 8.08961009979248,
|
||
|
"learning_rate": 1.3045414069456812e-05,
|
||
|
"loss": 0.5754,
|
||
|
"step": 8800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.265984654731458,
|
||
|
"grad_norm": 0.6705631017684937,
|
||
|
"learning_rate": 1.3000890471950133e-05,
|
||
|
"loss": 0.4323,
|
||
|
"step": 8810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.278772378516624,
|
||
|
"grad_norm": 11.220414161682129,
|
||
|
"learning_rate": 1.2956366874443457e-05,
|
||
|
"loss": 0.5894,
|
||
|
"step": 8820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.29156010230179,
|
||
|
"grad_norm": 8.43408489227295,
|
||
|
"learning_rate": 1.2911843276936778e-05,
|
||
|
"loss": 0.6214,
|
||
|
"step": 8830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.304347826086957,
|
||
|
"grad_norm": 1.6605695486068726,
|
||
|
"learning_rate": 1.2867319679430097e-05,
|
||
|
"loss": 0.2958,
|
||
|
"step": 8840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.317135549872123,
|
||
|
"grad_norm": 5.640063762664795,
|
||
|
"learning_rate": 1.2822796081923418e-05,
|
||
|
"loss": 0.433,
|
||
|
"step": 8850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.32992327365729,
|
||
|
"grad_norm": 2.0308477878570557,
|
||
|
"learning_rate": 1.2778272484416743e-05,
|
||
|
"loss": 0.7216,
|
||
|
"step": 8860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.342710997442456,
|
||
|
"grad_norm": 0.7579576373100281,
|
||
|
"learning_rate": 1.2733748886910063e-05,
|
||
|
"loss": 0.1198,
|
||
|
"step": 8870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.355498721227622,
|
||
|
"grad_norm": 5.299111366271973,
|
||
|
"learning_rate": 1.2689225289403384e-05,
|
||
|
"loss": 0.2332,
|
||
|
"step": 8880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.368286445012787,
|
||
|
"grad_norm": 4.141612529754639,
|
||
|
"learning_rate": 1.2644701691896707e-05,
|
||
|
"loss": 0.3213,
|
||
|
"step": 8890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.381074168797953,
|
||
|
"grad_norm": 5.730710983276367,
|
||
|
"learning_rate": 1.2600178094390028e-05,
|
||
|
"loss": 0.3246,
|
||
|
"step": 8900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.39386189258312,
|
||
|
"grad_norm": 2.543301582336426,
|
||
|
"learning_rate": 1.2555654496883349e-05,
|
||
|
"loss": 0.4036,
|
||
|
"step": 8910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.406649616368286,
|
||
|
"grad_norm": 0.17073774337768555,
|
||
|
"learning_rate": 1.251113089937667e-05,
|
||
|
"loss": 0.6859,
|
||
|
"step": 8920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.419437340153452,
|
||
|
"grad_norm": 0.9711840152740479,
|
||
|
"learning_rate": 1.246660730186999e-05,
|
||
|
"loss": 0.3302,
|
||
|
"step": 8930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.432225063938619,
|
||
|
"grad_norm": 11.181052207946777,
|
||
|
"learning_rate": 1.2422083704363313e-05,
|
||
|
"loss": 0.5149,
|
||
|
"step": 8940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.445012787723785,
|
||
|
"grad_norm": 0.5489789247512817,
|
||
|
"learning_rate": 1.2377560106856636e-05,
|
||
|
"loss": 0.767,
|
||
|
"step": 8950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.457800511508951,
|
||
|
"grad_norm": 2.264814615249634,
|
||
|
"learning_rate": 1.2333036509349956e-05,
|
||
|
"loss": 0.1734,
|
||
|
"step": 8960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.470588235294118,
|
||
|
"grad_norm": 2.2042288780212402,
|
||
|
"learning_rate": 1.2288512911843277e-05,
|
||
|
"loss": 0.5622,
|
||
|
"step": 8970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.483375959079284,
|
||
|
"grad_norm": 0.05039510875940323,
|
||
|
"learning_rate": 1.2243989314336598e-05,
|
||
|
"loss": 0.801,
|
||
|
"step": 8980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.49616368286445,
|
||
|
"grad_norm": 8.120935440063477,
|
||
|
"learning_rate": 1.219946571682992e-05,
|
||
|
"loss": 0.3447,
|
||
|
"step": 8990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.508951406649617,
|
||
|
"grad_norm": 6.028167724609375,
|
||
|
"learning_rate": 1.2154942119323242e-05,
|
||
|
"loss": 0.6221,
|
||
|
"step": 9000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.521739130434783,
|
||
|
"grad_norm": 0.06073115020990372,
|
||
|
"learning_rate": 1.2110418521816562e-05,
|
||
|
"loss": 0.1343,
|
||
|
"step": 9010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.53452685421995,
|
||
|
"grad_norm": 1.3959262371063232,
|
||
|
"learning_rate": 1.2065894924309885e-05,
|
||
|
"loss": 0.213,
|
||
|
"step": 9020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.547314578005116,
|
||
|
"grad_norm": 0.5288462042808533,
|
||
|
"learning_rate": 1.2021371326803206e-05,
|
||
|
"loss": 0.5034,
|
||
|
"step": 9030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.56010230179028,
|
||
|
"grad_norm": 5.911684989929199,
|
||
|
"learning_rate": 1.1976847729296529e-05,
|
||
|
"loss": 0.3314,
|
||
|
"step": 9040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.572890025575447,
|
||
|
"grad_norm": 2.7740604877471924,
|
||
|
"learning_rate": 1.193232413178985e-05,
|
||
|
"loss": 0.4807,
|
||
|
"step": 9050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.585677749360613,
|
||
|
"grad_norm": 0.6244329810142517,
|
||
|
"learning_rate": 1.188780053428317e-05,
|
||
|
"loss": 0.1838,
|
||
|
"step": 9060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.59846547314578,
|
||
|
"grad_norm": 2.633812189102173,
|
||
|
"learning_rate": 1.1843276936776491e-05,
|
||
|
"loss": 0.4862,
|
||
|
"step": 9070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.611253196930946,
|
||
|
"grad_norm": 10.810276985168457,
|
||
|
"learning_rate": 1.1798753339269814e-05,
|
||
|
"loss": 0.4788,
|
||
|
"step": 9080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.624040920716112,
|
||
|
"grad_norm": 2.0004940032958984,
|
||
|
"learning_rate": 1.1754229741763136e-05,
|
||
|
"loss": 0.3771,
|
||
|
"step": 9090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.636828644501279,
|
||
|
"grad_norm": 0.30808359384536743,
|
||
|
"learning_rate": 1.1709706144256455e-05,
|
||
|
"loss": 0.335,
|
||
|
"step": 9100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.649616368286445,
|
||
|
"grad_norm": 5.277163028717041,
|
||
|
"learning_rate": 1.1665182546749778e-05,
|
||
|
"loss": 0.4029,
|
||
|
"step": 9110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.662404092071611,
|
||
|
"grad_norm": 0.022072020918130875,
|
||
|
"learning_rate": 1.1620658949243099e-05,
|
||
|
"loss": 0.4137,
|
||
|
"step": 9120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.675191815856778,
|
||
|
"grad_norm": 3.9940779209136963,
|
||
|
"learning_rate": 1.1576135351736421e-05,
|
||
|
"loss": 0.514,
|
||
|
"step": 9130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.687979539641944,
|
||
|
"grad_norm": 0.14408734440803528,
|
||
|
"learning_rate": 1.1531611754229742e-05,
|
||
|
"loss": 0.1492,
|
||
|
"step": 9140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.70076726342711,
|
||
|
"grad_norm": 2.5065701007843018,
|
||
|
"learning_rate": 1.1487088156723063e-05,
|
||
|
"loss": 0.4501,
|
||
|
"step": 9150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.713554987212277,
|
||
|
"grad_norm": 3.592348098754883,
|
||
|
"learning_rate": 1.1442564559216386e-05,
|
||
|
"loss": 0.2521,
|
||
|
"step": 9160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.726342710997443,
|
||
|
"grad_norm": 4.004711151123047,
|
||
|
"learning_rate": 1.1398040961709707e-05,
|
||
|
"loss": 0.3974,
|
||
|
"step": 9170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.73913043478261,
|
||
|
"grad_norm": 8.429434776306152,
|
||
|
"learning_rate": 1.135351736420303e-05,
|
||
|
"loss": 0.4025,
|
||
|
"step": 9180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.751918158567776,
|
||
|
"grad_norm": 5.526464462280273,
|
||
|
"learning_rate": 1.1308993766696348e-05,
|
||
|
"loss": 0.479,
|
||
|
"step": 9190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.764705882352942,
|
||
|
"grad_norm": 0.15864869952201843,
|
||
|
"learning_rate": 1.1264470169189671e-05,
|
||
|
"loss": 0.7387,
|
||
|
"step": 9200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.777493606138107,
|
||
|
"grad_norm": 0.2955300509929657,
|
||
|
"learning_rate": 1.1219946571682992e-05,
|
||
|
"loss": 0.2657,
|
||
|
"step": 9210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.790281329923273,
|
||
|
"grad_norm": 1.9746955633163452,
|
||
|
"learning_rate": 1.1175422974176314e-05,
|
||
|
"loss": 0.3473,
|
||
|
"step": 9220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.80306905370844,
|
||
|
"grad_norm": 0.009776926599442959,
|
||
|
"learning_rate": 1.1130899376669635e-05,
|
||
|
"loss": 0.4985,
|
||
|
"step": 9230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.815856777493606,
|
||
|
"grad_norm": 0.8222724199295044,
|
||
|
"learning_rate": 1.1086375779162956e-05,
|
||
|
"loss": 0.1595,
|
||
|
"step": 9240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.828644501278772,
|
||
|
"grad_norm": 2.545729875564575,
|
||
|
"learning_rate": 1.1041852181656279e-05,
|
||
|
"loss": 0.2437,
|
||
|
"step": 9250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.841432225063938,
|
||
|
"grad_norm": 1.1305873394012451,
|
||
|
"learning_rate": 1.09973285841496e-05,
|
||
|
"loss": 0.4101,
|
||
|
"step": 9260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.854219948849105,
|
||
|
"grad_norm": 1.6995846033096313,
|
||
|
"learning_rate": 1.0952804986642922e-05,
|
||
|
"loss": 0.3361,
|
||
|
"step": 9270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.867007672634271,
|
||
|
"grad_norm": 1.532027244567871,
|
||
|
"learning_rate": 1.0908281389136243e-05,
|
||
|
"loss": 0.2246,
|
||
|
"step": 9280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.879795396419437,
|
||
|
"grad_norm": 0.10980970412492752,
|
||
|
"learning_rate": 1.0863757791629564e-05,
|
||
|
"loss": 0.1659,
|
||
|
"step": 9290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.892583120204604,
|
||
|
"grad_norm": 1.9785058498382568,
|
||
|
"learning_rate": 1.0819234194122887e-05,
|
||
|
"loss": 0.4823,
|
||
|
"step": 9300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.90537084398977,
|
||
|
"grad_norm": 2.5999562740325928,
|
||
|
"learning_rate": 1.0774710596616207e-05,
|
||
|
"loss": 0.2816,
|
||
|
"step": 9310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.918158567774936,
|
||
|
"grad_norm": 7.072868824005127,
|
||
|
"learning_rate": 1.0730186999109528e-05,
|
||
|
"loss": 0.4803,
|
||
|
"step": 9320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.930946291560103,
|
||
|
"grad_norm": 0.15491001307964325,
|
||
|
"learning_rate": 1.068566340160285e-05,
|
||
|
"loss": 0.6164,
|
||
|
"step": 9330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.94373401534527,
|
||
|
"grad_norm": 5.728983402252197,
|
||
|
"learning_rate": 1.0641139804096172e-05,
|
||
|
"loss": 0.323,
|
||
|
"step": 9340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.956521739130435,
|
||
|
"grad_norm": 2.930337905883789,
|
||
|
"learning_rate": 1.0596616206589494e-05,
|
||
|
"loss": 0.3664,
|
||
|
"step": 9350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.969309462915602,
|
||
|
"grad_norm": 1.9165003299713135,
|
||
|
"learning_rate": 1.0552092609082813e-05,
|
||
|
"loss": 0.8193,
|
||
|
"step": 9360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.982097186700766,
|
||
|
"grad_norm": 1.9771157503128052,
|
||
|
"learning_rate": 1.0507569011576136e-05,
|
||
|
"loss": 0.3378,
|
||
|
"step": 9370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 11.994884910485933,
|
||
|
"grad_norm": 0.9281581044197083,
|
||
|
"learning_rate": 1.0463045414069457e-05,
|
||
|
"loss": 0.5581,
|
||
|
"step": 9380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.0,
|
||
|
"eval_loss": 0.23496317863464355,
|
||
|
"eval_runtime": 0.9841,
|
||
|
"eval_samples_per_second": 99.579,
|
||
|
"eval_steps_per_second": 13.21,
|
||
|
"step": 9384
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.007672634271099,
|
||
|
"grad_norm": 2.9337079524993896,
|
||
|
"learning_rate": 1.041852181656278e-05,
|
||
|
"loss": 0.3057,
|
||
|
"step": 9390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.020460358056265,
|
||
|
"grad_norm": 0.2717020809650421,
|
||
|
"learning_rate": 1.03739982190561e-05,
|
||
|
"loss": 0.5384,
|
||
|
"step": 9400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.033248081841432,
|
||
|
"grad_norm": 0.10977739095687866,
|
||
|
"learning_rate": 1.0329474621549421e-05,
|
||
|
"loss": 0.461,
|
||
|
"step": 9410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.046035805626598,
|
||
|
"grad_norm": 1.6865909099578857,
|
||
|
"learning_rate": 1.0284951024042744e-05,
|
||
|
"loss": 0.2365,
|
||
|
"step": 9420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.058823529411764,
|
||
|
"grad_norm": 0.12684215605258942,
|
||
|
"learning_rate": 1.0240427426536065e-05,
|
||
|
"loss": 0.3649,
|
||
|
"step": 9430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.07161125319693,
|
||
|
"grad_norm": 0.053901903331279755,
|
||
|
"learning_rate": 1.0195903829029387e-05,
|
||
|
"loss": 0.4141,
|
||
|
"step": 9440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.084398976982097,
|
||
|
"grad_norm": 0.8139101266860962,
|
||
|
"learning_rate": 1.0151380231522706e-05,
|
||
|
"loss": 0.3258,
|
||
|
"step": 9450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.097186700767264,
|
||
|
"grad_norm": 7.989099685801193e-05,
|
||
|
"learning_rate": 1.0106856634016029e-05,
|
||
|
"loss": 0.5458,
|
||
|
"step": 9460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.10997442455243,
|
||
|
"grad_norm": 1.4419445991516113,
|
||
|
"learning_rate": 1.006233303650935e-05,
|
||
|
"loss": 0.4101,
|
||
|
"step": 9470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.122762148337596,
|
||
|
"grad_norm": 8.941499710083008,
|
||
|
"learning_rate": 1.0017809439002672e-05,
|
||
|
"loss": 0.3453,
|
||
|
"step": 9480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.135549872122763,
|
||
|
"grad_norm": 0.2980097532272339,
|
||
|
"learning_rate": 9.973285841495993e-06,
|
||
|
"loss": 0.1952,
|
||
|
"step": 9490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.148337595907929,
|
||
|
"grad_norm": 7.380556583404541,
|
||
|
"learning_rate": 9.928762243989314e-06,
|
||
|
"loss": 0.3819,
|
||
|
"step": 9500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.161125319693095,
|
||
|
"grad_norm": 0.3187771439552307,
|
||
|
"learning_rate": 9.884238646482637e-06,
|
||
|
"loss": 0.3993,
|
||
|
"step": 9510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.173913043478262,
|
||
|
"grad_norm": 3.5564401149749756,
|
||
|
"learning_rate": 9.839715048975958e-06,
|
||
|
"loss": 0.4138,
|
||
|
"step": 9520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.186700767263428,
|
||
|
"grad_norm": 0.04383537545800209,
|
||
|
"learning_rate": 9.79519145146928e-06,
|
||
|
"loss": 0.2085,
|
||
|
"step": 9530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.199488491048593,
|
||
|
"grad_norm": 8.45487117767334,
|
||
|
"learning_rate": 9.7506678539626e-06,
|
||
|
"loss": 0.3879,
|
||
|
"step": 9540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.212276214833759,
|
||
|
"grad_norm": 0.07102257758378983,
|
||
|
"learning_rate": 9.706144256455922e-06,
|
||
|
"loss": 0.2747,
|
||
|
"step": 9550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.225063938618925,
|
||
|
"grad_norm": 8.977646827697754,
|
||
|
"learning_rate": 9.661620658949245e-06,
|
||
|
"loss": 0.5841,
|
||
|
"step": 9560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.237851662404092,
|
||
|
"grad_norm": 6.482713222503662,
|
||
|
"learning_rate": 9.617097061442565e-06,
|
||
|
"loss": 0.5037,
|
||
|
"step": 9570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.250639386189258,
|
||
|
"grad_norm": 0.6768947243690491,
|
||
|
"learning_rate": 9.572573463935886e-06,
|
||
|
"loss": 0.629,
|
||
|
"step": 9580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.263427109974424,
|
||
|
"grad_norm": 2.5135774194495752e-05,
|
||
|
"learning_rate": 9.528049866429207e-06,
|
||
|
"loss": 0.3964,
|
||
|
"step": 9590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.27621483375959,
|
||
|
"grad_norm": 7.924706935882568,
|
||
|
"learning_rate": 9.48352626892253e-06,
|
||
|
"loss": 0.4249,
|
||
|
"step": 9600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.289002557544757,
|
||
|
"grad_norm": 0.7605132460594177,
|
||
|
"learning_rate": 9.43900267141585e-06,
|
||
|
"loss": 0.2857,
|
||
|
"step": 9610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.301790281329923,
|
||
|
"grad_norm": 0.39697137475013733,
|
||
|
"learning_rate": 9.394479073909172e-06,
|
||
|
"loss": 0.2537,
|
||
|
"step": 9620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.31457800511509,
|
||
|
"grad_norm": 0.43480339646339417,
|
||
|
"learning_rate": 9.349955476402494e-06,
|
||
|
"loss": 0.2062,
|
||
|
"step": 9630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.327365728900256,
|
||
|
"grad_norm": 2.8665430545806885,
|
||
|
"learning_rate": 9.305431878895815e-06,
|
||
|
"loss": 0.5541,
|
||
|
"step": 9640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.340153452685422,
|
||
|
"grad_norm": 1.6264675855636597,
|
||
|
"learning_rate": 9.260908281389138e-06,
|
||
|
"loss": 0.366,
|
||
|
"step": 9650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.352941176470589,
|
||
|
"grad_norm": 9.299280166625977,
|
||
|
"learning_rate": 9.216384683882458e-06,
|
||
|
"loss": 0.6633,
|
||
|
"step": 9660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.365728900255755,
|
||
|
"grad_norm": 0.8438981175422668,
|
||
|
"learning_rate": 9.17186108637578e-06,
|
||
|
"loss": 0.5287,
|
||
|
"step": 9670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.378516624040921,
|
||
|
"grad_norm": 13.061861038208008,
|
||
|
"learning_rate": 9.1273374888691e-06,
|
||
|
"loss": 0.3324,
|
||
|
"step": 9680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.391304347826088,
|
||
|
"grad_norm": 3.5456817150115967,
|
||
|
"learning_rate": 9.082813891362423e-06,
|
||
|
"loss": 0.3801,
|
||
|
"step": 9690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.404092071611252,
|
||
|
"grad_norm": 5.760250091552734,
|
||
|
"learning_rate": 9.038290293855745e-06,
|
||
|
"loss": 0.4844,
|
||
|
"step": 9700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.416879795396419,
|
||
|
"grad_norm": 6.475959777832031,
|
||
|
"learning_rate": 8.993766696349064e-06,
|
||
|
"loss": 0.4455,
|
||
|
"step": 9710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.429667519181585,
|
||
|
"grad_norm": 3.8550329208374023,
|
||
|
"learning_rate": 8.949243098842387e-06,
|
||
|
"loss": 0.4175,
|
||
|
"step": 9720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.442455242966751,
|
||
|
"grad_norm": 2.0850658416748047,
|
||
|
"learning_rate": 8.904719501335708e-06,
|
||
|
"loss": 0.361,
|
||
|
"step": 9730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.455242966751918,
|
||
|
"grad_norm": 4.074941158294678,
|
||
|
"learning_rate": 8.86019590382903e-06,
|
||
|
"loss": 0.2447,
|
||
|
"step": 9740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.468030690537084,
|
||
|
"grad_norm": 1.6458179950714111,
|
||
|
"learning_rate": 8.815672306322351e-06,
|
||
|
"loss": 0.4367,
|
||
|
"step": 9750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.48081841432225,
|
||
|
"grad_norm": 1.9982742071151733,
|
||
|
"learning_rate": 8.771148708815672e-06,
|
||
|
"loss": 0.3444,
|
||
|
"step": 9760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.493606138107417,
|
||
|
"grad_norm": 6.526026725769043,
|
||
|
"learning_rate": 8.726625111308995e-06,
|
||
|
"loss": 0.5819,
|
||
|
"step": 9770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.506393861892583,
|
||
|
"grad_norm": 9.310763359069824,
|
||
|
"learning_rate": 8.682101513802316e-06,
|
||
|
"loss": 0.3059,
|
||
|
"step": 9780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.51918158567775,
|
||
|
"grad_norm": 1.0071550607681274,
|
||
|
"learning_rate": 8.637577916295638e-06,
|
||
|
"loss": 0.3726,
|
||
|
"step": 9790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.531969309462916,
|
||
|
"grad_norm": 0.882957935333252,
|
||
|
"learning_rate": 8.593054318788957e-06,
|
||
|
"loss": 0.4725,
|
||
|
"step": 9800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.544757033248082,
|
||
|
"grad_norm": 1.2012654542922974,
|
||
|
"learning_rate": 8.54853072128228e-06,
|
||
|
"loss": 0.2205,
|
||
|
"step": 9810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.557544757033249,
|
||
|
"grad_norm": 1.7305279970169067,
|
||
|
"learning_rate": 8.504007123775601e-06,
|
||
|
"loss": 0.4537,
|
||
|
"step": 9820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.570332480818415,
|
||
|
"grad_norm": 4.674372673034668,
|
||
|
"learning_rate": 8.459483526268923e-06,
|
||
|
"loss": 0.4568,
|
||
|
"step": 9830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.583120204603581,
|
||
|
"grad_norm": 6.6475138664245605,
|
||
|
"learning_rate": 8.414959928762244e-06,
|
||
|
"loss": 0.3144,
|
||
|
"step": 9840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.595907928388748,
|
||
|
"grad_norm": 0.38528770208358765,
|
||
|
"learning_rate": 8.370436331255565e-06,
|
||
|
"loss": 0.4792,
|
||
|
"step": 9850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.608695652173914,
|
||
|
"grad_norm": 0.9754725694656372,
|
||
|
"learning_rate": 8.325912733748888e-06,
|
||
|
"loss": 0.5235,
|
||
|
"step": 9860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.621483375959079,
|
||
|
"grad_norm": 4.076246738433838,
|
||
|
"learning_rate": 8.281389136242209e-06,
|
||
|
"loss": 0.348,
|
||
|
"step": 9870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.634271099744245,
|
||
|
"grad_norm": 1.0100876092910767,
|
||
|
"learning_rate": 8.23686553873553e-06,
|
||
|
"loss": 0.5218,
|
||
|
"step": 9880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.647058823529411,
|
||
|
"grad_norm": 17.8681640625,
|
||
|
"learning_rate": 8.19234194122885e-06,
|
||
|
"loss": 0.6763,
|
||
|
"step": 9890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.659846547314578,
|
||
|
"grad_norm": 6.97352933883667,
|
||
|
"learning_rate": 8.147818343722173e-06,
|
||
|
"loss": 0.4291,
|
||
|
"step": 9900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.672634271099744,
|
||
|
"grad_norm": 0.3931565582752228,
|
||
|
"learning_rate": 8.103294746215496e-06,
|
||
|
"loss": 0.4243,
|
||
|
"step": 9910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.68542199488491,
|
||
|
"grad_norm": 2.1343562602996826,
|
||
|
"learning_rate": 8.058771148708816e-06,
|
||
|
"loss": 0.3933,
|
||
|
"step": 9920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.698209718670077,
|
||
|
"grad_norm": 5.404961109161377,
|
||
|
"learning_rate": 8.014247551202137e-06,
|
||
|
"loss": 0.4113,
|
||
|
"step": 9930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.710997442455243,
|
||
|
"grad_norm": 0.09293472766876221,
|
||
|
"learning_rate": 7.969723953695458e-06,
|
||
|
"loss": 0.351,
|
||
|
"step": 9940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.72378516624041,
|
||
|
"grad_norm": 0.13212403655052185,
|
||
|
"learning_rate": 7.92520035618878e-06,
|
||
|
"loss": 0.462,
|
||
|
"step": 9950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.736572890025576,
|
||
|
"grad_norm": 5.489344120025635,
|
||
|
"learning_rate": 7.880676758682102e-06,
|
||
|
"loss": 0.3703,
|
||
|
"step": 9960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.749360613810742,
|
||
|
"grad_norm": 1.962679386138916,
|
||
|
"learning_rate": 7.836153161175422e-06,
|
||
|
"loss": 0.2338,
|
||
|
"step": 9970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.762148337595908,
|
||
|
"grad_norm": 9.600525856018066,
|
||
|
"learning_rate": 7.791629563668745e-06,
|
||
|
"loss": 0.43,
|
||
|
"step": 9980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.774936061381075,
|
||
|
"grad_norm": 1.3438434600830078,
|
||
|
"learning_rate": 7.747105966162066e-06,
|
||
|
"loss": 0.2854,
|
||
|
"step": 9990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.787723785166241,
|
||
|
"grad_norm": 0.0004410437832120806,
|
||
|
"learning_rate": 7.702582368655389e-06,
|
||
|
"loss": 0.3249,
|
||
|
"step": 10000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.800511508951407,
|
||
|
"grad_norm": 0.4983418881893158,
|
||
|
"learning_rate": 7.65805877114871e-06,
|
||
|
"loss": 0.3296,
|
||
|
"step": 10010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.813299232736572,
|
||
|
"grad_norm": 0.41612160205841064,
|
||
|
"learning_rate": 7.613535173642031e-06,
|
||
|
"loss": 0.5248,
|
||
|
"step": 10020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.826086956521738,
|
||
|
"grad_norm": 13.50173568725586,
|
||
|
"learning_rate": 7.569011576135353e-06,
|
||
|
"loss": 0.5579,
|
||
|
"step": 10030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.838874680306905,
|
||
|
"grad_norm": 3.2554118633270264,
|
||
|
"learning_rate": 7.524487978628674e-06,
|
||
|
"loss": 0.6241,
|
||
|
"step": 10040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.851662404092071,
|
||
|
"grad_norm": 1.226417064666748,
|
||
|
"learning_rate": 7.4799643811219954e-06,
|
||
|
"loss": 0.2834,
|
||
|
"step": 10050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.864450127877237,
|
||
|
"grad_norm": 2.9790737628936768,
|
||
|
"learning_rate": 7.435440783615316e-06,
|
||
|
"loss": 0.4032,
|
||
|
"step": 10060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.877237851662404,
|
||
|
"grad_norm": 13.057442665100098,
|
||
|
"learning_rate": 7.390917186108638e-06,
|
||
|
"loss": 0.247,
|
||
|
"step": 10070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.89002557544757,
|
||
|
"grad_norm": 5.512662410736084,
|
||
|
"learning_rate": 7.346393588601959e-06,
|
||
|
"loss": 0.3662,
|
||
|
"step": 10080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.902813299232736,
|
||
|
"grad_norm": 1.990576148033142,
|
||
|
"learning_rate": 7.301869991095281e-06,
|
||
|
"loss": 0.5448,
|
||
|
"step": 10090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.915601023017903,
|
||
|
"grad_norm": 0.43409115076065063,
|
||
|
"learning_rate": 7.257346393588603e-06,
|
||
|
"loss": 0.6568,
|
||
|
"step": 10100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.92838874680307,
|
||
|
"grad_norm": 1.7592841386795044,
|
||
|
"learning_rate": 7.212822796081923e-06,
|
||
|
"loss": 0.4341,
|
||
|
"step": 10110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.941176470588236,
|
||
|
"grad_norm": 5.928600788116455,
|
||
|
"learning_rate": 7.168299198575246e-06,
|
||
|
"loss": 0.6926,
|
||
|
"step": 10120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.953964194373402,
|
||
|
"grad_norm": 0.49512559175491333,
|
||
|
"learning_rate": 7.123775601068567e-06,
|
||
|
"loss": 0.3134,
|
||
|
"step": 10130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.966751918158568,
|
||
|
"grad_norm": 0.061214692890644073,
|
||
|
"learning_rate": 7.079252003561888e-06,
|
||
|
"loss": 0.8422,
|
||
|
"step": 10140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.979539641943735,
|
||
|
"grad_norm": 10.013786315917969,
|
||
|
"learning_rate": 7.034728406055209e-06,
|
||
|
"loss": 0.4474,
|
||
|
"step": 10150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 12.992327365728901,
|
||
|
"grad_norm": 2.202415943145752,
|
||
|
"learning_rate": 6.990204808548531e-06,
|
||
|
"loss": 0.23,
|
||
|
"step": 10160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.0,
|
||
|
"eval_loss": 0.23341350257396698,
|
||
|
"eval_runtime": 0.9774,
|
||
|
"eval_samples_per_second": 100.265,
|
||
|
"eval_steps_per_second": 13.301,
|
||
|
"step": 10166
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.005115089514067,
|
||
|
"grad_norm": 8.305723190307617,
|
||
|
"learning_rate": 6.9456812110418536e-06,
|
||
|
"loss": 0.5032,
|
||
|
"step": 10170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.017902813299234,
|
||
|
"grad_norm": 9.274243354797363,
|
||
|
"learning_rate": 6.901157613535174e-06,
|
||
|
"loss": 0.2192,
|
||
|
"step": 10180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.030690537084398,
|
||
|
"grad_norm": 2.3518593311309814,
|
||
|
"learning_rate": 6.856634016028496e-06,
|
||
|
"loss": 0.5402,
|
||
|
"step": 10190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.043478260869565,
|
||
|
"grad_norm": 0.012316963635385036,
|
||
|
"learning_rate": 6.812110418521816e-06,
|
||
|
"loss": 0.4061,
|
||
|
"step": 10200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.05626598465473,
|
||
|
"grad_norm": 6.4899582862854,
|
||
|
"learning_rate": 6.767586821015139e-06,
|
||
|
"loss": 0.3366,
|
||
|
"step": 10210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.069053708439897,
|
||
|
"grad_norm": 0.12233175337314606,
|
||
|
"learning_rate": 6.72306322350846e-06,
|
||
|
"loss": 0.1374,
|
||
|
"step": 10220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.081841432225064,
|
||
|
"grad_norm": 0.4392085075378418,
|
||
|
"learning_rate": 6.678539626001781e-06,
|
||
|
"loss": 0.1306,
|
||
|
"step": 10230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.09462915601023,
|
||
|
"grad_norm": 13.07235050201416,
|
||
|
"learning_rate": 6.634016028495103e-06,
|
||
|
"loss": 0.4273,
|
||
|
"step": 10240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.107416879795396,
|
||
|
"grad_norm": 7.395537376403809,
|
||
|
"learning_rate": 6.589492430988424e-06,
|
||
|
"loss": 0.5841,
|
||
|
"step": 10250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.120204603580563,
|
||
|
"grad_norm": 10.473769187927246,
|
||
|
"learning_rate": 6.544968833481746e-06,
|
||
|
"loss": 0.6586,
|
||
|
"step": 10260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.132992327365729,
|
||
|
"grad_norm": 1.897230863571167,
|
||
|
"learning_rate": 6.5004452359750666e-06,
|
||
|
"loss": 0.5209,
|
||
|
"step": 10270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.145780051150895,
|
||
|
"grad_norm": 4.206538200378418,
|
||
|
"learning_rate": 6.455921638468389e-06,
|
||
|
"loss": 0.3086,
|
||
|
"step": 10280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.158567774936062,
|
||
|
"grad_norm": 5.3881001472473145,
|
||
|
"learning_rate": 6.411398040961709e-06,
|
||
|
"loss": 0.614,
|
||
|
"step": 10290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.171355498721228,
|
||
|
"grad_norm": 0.24799497425556183,
|
||
|
"learning_rate": 6.366874443455032e-06,
|
||
|
"loss": 0.0553,
|
||
|
"step": 10300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.184143222506394,
|
||
|
"grad_norm": 7.40368127822876,
|
||
|
"learning_rate": 6.3223508459483535e-06,
|
||
|
"loss": 0.5088,
|
||
|
"step": 10310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.19693094629156,
|
||
|
"grad_norm": 0.08739714324474335,
|
||
|
"learning_rate": 6.277827248441674e-06,
|
||
|
"loss": 0.092,
|
||
|
"step": 10320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.209718670076727,
|
||
|
"grad_norm": 1.746079921722412,
|
||
|
"learning_rate": 6.233303650934995e-06,
|
||
|
"loss": 0.2567,
|
||
|
"step": 10330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.222506393861893,
|
||
|
"grad_norm": 0.45384278893470764,
|
||
|
"learning_rate": 6.188780053428318e-06,
|
||
|
"loss": 0.0995,
|
||
|
"step": 10340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.235294117647058,
|
||
|
"grad_norm": 1.0237295627593994,
|
||
|
"learning_rate": 6.144256455921639e-06,
|
||
|
"loss": 0.6154,
|
||
|
"step": 10350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.248081841432224,
|
||
|
"grad_norm": 6.016015529632568,
|
||
|
"learning_rate": 6.09973285841496e-06,
|
||
|
"loss": 0.623,
|
||
|
"step": 10360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.26086956521739,
|
||
|
"grad_norm": 3.6509177684783936,
|
||
|
"learning_rate": 6.055209260908281e-06,
|
||
|
"loss": 0.3978,
|
||
|
"step": 10370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.273657289002557,
|
||
|
"grad_norm": 3.9235923290252686,
|
||
|
"learning_rate": 6.010685663401603e-06,
|
||
|
"loss": 0.8585,
|
||
|
"step": 10380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.286445012787723,
|
||
|
"grad_norm": 4.775753974914551,
|
||
|
"learning_rate": 5.966162065894925e-06,
|
||
|
"loss": 0.3978,
|
||
|
"step": 10390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.29923273657289,
|
||
|
"grad_norm": 11.553483009338379,
|
||
|
"learning_rate": 5.9216384683882456e-06,
|
||
|
"loss": 0.424,
|
||
|
"step": 10400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.312020460358056,
|
||
|
"grad_norm": 3.354985237121582,
|
||
|
"learning_rate": 5.877114870881568e-06,
|
||
|
"loss": 0.539,
|
||
|
"step": 10410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.324808184143222,
|
||
|
"grad_norm": 0.004566879011690617,
|
||
|
"learning_rate": 5.832591273374889e-06,
|
||
|
"loss": 0.5421,
|
||
|
"step": 10420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.337595907928389,
|
||
|
"grad_norm": 13.376380920410156,
|
||
|
"learning_rate": 5.788067675868211e-06,
|
||
|
"loss": 0.8866,
|
||
|
"step": 10430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.350383631713555,
|
||
|
"grad_norm": 5.068173408508301,
|
||
|
"learning_rate": 5.743544078361532e-06,
|
||
|
"loss": 0.4386,
|
||
|
"step": 10440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.363171355498721,
|
||
|
"grad_norm": 0.2643067538738251,
|
||
|
"learning_rate": 5.699020480854853e-06,
|
||
|
"loss": 0.4168,
|
||
|
"step": 10450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.375959079283888,
|
||
|
"grad_norm": 6.765013694763184,
|
||
|
"learning_rate": 5.654496883348174e-06,
|
||
|
"loss": 0.4382,
|
||
|
"step": 10460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.388746803069054,
|
||
|
"grad_norm": 0.811938464641571,
|
||
|
"learning_rate": 5.609973285841496e-06,
|
||
|
"loss": 0.4302,
|
||
|
"step": 10470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.40153452685422,
|
||
|
"grad_norm": 2.1787633895874023,
|
||
|
"learning_rate": 5.565449688334818e-06,
|
||
|
"loss": 0.4329,
|
||
|
"step": 10480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.414322250639387,
|
||
|
"grad_norm": 0.008785980753600597,
|
||
|
"learning_rate": 5.520926090828139e-06,
|
||
|
"loss": 0.2213,
|
||
|
"step": 10490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.427109974424553,
|
||
|
"grad_norm": 3.6294312477111816,
|
||
|
"learning_rate": 5.476402493321461e-06,
|
||
|
"loss": 0.7337,
|
||
|
"step": 10500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.43989769820972,
|
||
|
"grad_norm": 1.247524619102478,
|
||
|
"learning_rate": 5.431878895814782e-06,
|
||
|
"loss": 0.4815,
|
||
|
"step": 10510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.452685421994884,
|
||
|
"grad_norm": 0.05368124693632126,
|
||
|
"learning_rate": 5.387355298308104e-06,
|
||
|
"loss": 0.3219,
|
||
|
"step": 10520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.46547314578005,
|
||
|
"grad_norm": 3.9428138732910156,
|
||
|
"learning_rate": 5.342831700801425e-06,
|
||
|
"loss": 0.6114,
|
||
|
"step": 10530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.478260869565217,
|
||
|
"grad_norm": 8.967109680175781,
|
||
|
"learning_rate": 5.298308103294747e-06,
|
||
|
"loss": 0.6697,
|
||
|
"step": 10540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.491048593350383,
|
||
|
"grad_norm": 4.612414836883545,
|
||
|
"learning_rate": 5.253784505788068e-06,
|
||
|
"loss": 0.2481,
|
||
|
"step": 10550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.50383631713555,
|
||
|
"grad_norm": 0.3696252107620239,
|
||
|
"learning_rate": 5.20926090828139e-06,
|
||
|
"loss": 0.4671,
|
||
|
"step": 10560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.516624040920716,
|
||
|
"grad_norm": 2.8999485969543457,
|
||
|
"learning_rate": 5.164737310774711e-06,
|
||
|
"loss": 0.2148,
|
||
|
"step": 10570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.529411764705882,
|
||
|
"grad_norm": 0.0017122033750638366,
|
||
|
"learning_rate": 5.120213713268032e-06,
|
||
|
"loss": 0.328,
|
||
|
"step": 10580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.542199488491049,
|
||
|
"grad_norm": 0.07302047312259674,
|
||
|
"learning_rate": 5.075690115761353e-06,
|
||
|
"loss": 0.2222,
|
||
|
"step": 10590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.554987212276215,
|
||
|
"grad_norm": 77.11892700195312,
|
||
|
"learning_rate": 5.031166518254675e-06,
|
||
|
"loss": 0.3489,
|
||
|
"step": 10600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.567774936061381,
|
||
|
"grad_norm": 13.24903678894043,
|
||
|
"learning_rate": 4.986642920747997e-06,
|
||
|
"loss": 0.3993,
|
||
|
"step": 10610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.580562659846548,
|
||
|
"grad_norm": 0.006769936066120863,
|
||
|
"learning_rate": 4.942119323241318e-06,
|
||
|
"loss": 0.2839,
|
||
|
"step": 10620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.593350383631714,
|
||
|
"grad_norm": 6.966930389404297,
|
||
|
"learning_rate": 4.89759572573464e-06,
|
||
|
"loss": 0.5219,
|
||
|
"step": 10630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.60613810741688,
|
||
|
"grad_norm": 5.570155620574951,
|
||
|
"learning_rate": 4.853072128227961e-06,
|
||
|
"loss": 0.2924,
|
||
|
"step": 10640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.618925831202047,
|
||
|
"grad_norm": 8.221465110778809,
|
||
|
"learning_rate": 4.808548530721283e-06,
|
||
|
"loss": 0.4148,
|
||
|
"step": 10650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.631713554987213,
|
||
|
"grad_norm": 6.763041019439697,
|
||
|
"learning_rate": 4.764024933214604e-06,
|
||
|
"loss": 0.3678,
|
||
|
"step": 10660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.644501278772378,
|
||
|
"grad_norm": 5.139638423919678,
|
||
|
"learning_rate": 4.719501335707925e-06,
|
||
|
"loss": 0.3983,
|
||
|
"step": 10670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.657289002557544,
|
||
|
"grad_norm": 0.2467830628156662,
|
||
|
"learning_rate": 4.674977738201247e-06,
|
||
|
"loss": 0.4656,
|
||
|
"step": 10680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.67007672634271,
|
||
|
"grad_norm": 2.647254705429077,
|
||
|
"learning_rate": 4.630454140694569e-06,
|
||
|
"loss": 0.5215,
|
||
|
"step": 10690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.682864450127877,
|
||
|
"grad_norm": 8.770064353942871,
|
||
|
"learning_rate": 4.58593054318789e-06,
|
||
|
"loss": 0.4681,
|
||
|
"step": 10700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.695652173913043,
|
||
|
"grad_norm": 0.30153679847717285,
|
||
|
"learning_rate": 4.541406945681211e-06,
|
||
|
"loss": 0.2378,
|
||
|
"step": 10710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.70843989769821,
|
||
|
"grad_norm": 0.015129966661334038,
|
||
|
"learning_rate": 4.496883348174532e-06,
|
||
|
"loss": 0.2995,
|
||
|
"step": 10720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.721227621483376,
|
||
|
"grad_norm": 8.25349235534668,
|
||
|
"learning_rate": 4.452359750667854e-06,
|
||
|
"loss": 0.5158,
|
||
|
"step": 10730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.734015345268542,
|
||
|
"grad_norm": 2.6685609817504883,
|
||
|
"learning_rate": 4.407836153161176e-06,
|
||
|
"loss": 0.3549,
|
||
|
"step": 10740
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.746803069053708,
|
||
|
"grad_norm": 0.4903467297554016,
|
||
|
"learning_rate": 4.363312555654497e-06,
|
||
|
"loss": 0.1934,
|
||
|
"step": 10750
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.759590792838875,
|
||
|
"grad_norm": 0.016465764492750168,
|
||
|
"learning_rate": 4.318788958147819e-06,
|
||
|
"loss": 0.3642,
|
||
|
"step": 10760
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.772378516624041,
|
||
|
"grad_norm": 11.288249015808105,
|
||
|
"learning_rate": 4.27426536064114e-06,
|
||
|
"loss": 0.6398,
|
||
|
"step": 10770
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.785166240409207,
|
||
|
"grad_norm": 0.20837096869945526,
|
||
|
"learning_rate": 4.229741763134462e-06,
|
||
|
"loss": 0.1693,
|
||
|
"step": 10780
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.797953964194374,
|
||
|
"grad_norm": 0.0036407741717994213,
|
||
|
"learning_rate": 4.185218165627783e-06,
|
||
|
"loss": 0.1372,
|
||
|
"step": 10790
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.81074168797954,
|
||
|
"grad_norm": 3.989978790283203,
|
||
|
"learning_rate": 4.140694568121104e-06,
|
||
|
"loss": 0.3317,
|
||
|
"step": 10800
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.823529411764707,
|
||
|
"grad_norm": 0.8736965656280518,
|
||
|
"learning_rate": 4.096170970614425e-06,
|
||
|
"loss": 0.2362,
|
||
|
"step": 10810
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.836317135549873,
|
||
|
"grad_norm": 0.37958985567092896,
|
||
|
"learning_rate": 4.051647373107748e-06,
|
||
|
"loss": 0.2248,
|
||
|
"step": 10820
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.84910485933504,
|
||
|
"grad_norm": 0.4776633381843567,
|
||
|
"learning_rate": 4.007123775601069e-06,
|
||
|
"loss": 0.2542,
|
||
|
"step": 10830
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.861892583120204,
|
||
|
"grad_norm": 2.976607084274292,
|
||
|
"learning_rate": 3.96260017809439e-06,
|
||
|
"loss": 0.3775,
|
||
|
"step": 10840
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.87468030690537,
|
||
|
"grad_norm": 2.79518723487854,
|
||
|
"learning_rate": 3.918076580587711e-06,
|
||
|
"loss": 0.283,
|
||
|
"step": 10850
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.887468030690536,
|
||
|
"grad_norm": 7.698398590087891,
|
||
|
"learning_rate": 3.873552983081033e-06,
|
||
|
"loss": 0.3615,
|
||
|
"step": 10860
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.900255754475703,
|
||
|
"grad_norm": 5.496623992919922,
|
||
|
"learning_rate": 3.829029385574355e-06,
|
||
|
"loss": 0.6363,
|
||
|
"step": 10870
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.91304347826087,
|
||
|
"grad_norm": 2.927433490753174,
|
||
|
"learning_rate": 3.7845057880676764e-06,
|
||
|
"loss": 0.45,
|
||
|
"step": 10880
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.925831202046036,
|
||
|
"grad_norm": 0.43909209966659546,
|
||
|
"learning_rate": 3.7399821905609977e-06,
|
||
|
"loss": 0.5708,
|
||
|
"step": 10890
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.938618925831202,
|
||
|
"grad_norm": 0.40781036019325256,
|
||
|
"learning_rate": 3.695458593054319e-06,
|
||
|
"loss": 0.2988,
|
||
|
"step": 10900
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.951406649616368,
|
||
|
"grad_norm": 0.11614171415567398,
|
||
|
"learning_rate": 3.6509349955476403e-06,
|
||
|
"loss": 0.3176,
|
||
|
"step": 10910
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.964194373401535,
|
||
|
"grad_norm": 7.913348197937012,
|
||
|
"learning_rate": 3.6064113980409616e-06,
|
||
|
"loss": 0.3468,
|
||
|
"step": 10920
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.976982097186701,
|
||
|
"grad_norm": 0.017280923202633858,
|
||
|
"learning_rate": 3.5618878005342833e-06,
|
||
|
"loss": 0.4185,
|
||
|
"step": 10930
|
||
|
},
|
||
|
{
|
||
|
"epoch": 13.989769820971867,
|
||
|
"grad_norm": 9.15585994720459,
|
||
|
"learning_rate": 3.5173642030276046e-06,
|
||
|
"loss": 0.5216,
|
||
|
"step": 10940
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.0,
|
||
|
"eval_loss": 0.23057223856449127,
|
||
|
"eval_runtime": 0.9717,
|
||
|
"eval_samples_per_second": 100.856,
|
||
|
"eval_steps_per_second": 13.379,
|
||
|
"step": 10948
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.002557544757034,
|
||
|
"grad_norm": 9.044001579284668,
|
||
|
"learning_rate": 3.4728406055209268e-06,
|
||
|
"loss": 0.5606,
|
||
|
"step": 10950
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.0153452685422,
|
||
|
"grad_norm": 1.5585741996765137,
|
||
|
"learning_rate": 3.428317008014248e-06,
|
||
|
"loss": 0.1146,
|
||
|
"step": 10960
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.028132992327366,
|
||
|
"grad_norm": 0.24437369406223297,
|
||
|
"learning_rate": 3.3837934105075694e-06,
|
||
|
"loss": 0.7053,
|
||
|
"step": 10970
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.040920716112533,
|
||
|
"grad_norm": 0.48745203018188477,
|
||
|
"learning_rate": 3.3392698130008907e-06,
|
||
|
"loss": 0.5357,
|
||
|
"step": 10980
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.053708439897699,
|
||
|
"grad_norm": 0.3812559247016907,
|
||
|
"learning_rate": 3.294746215494212e-06,
|
||
|
"loss": 0.2654,
|
||
|
"step": 10990
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.066496163682864,
|
||
|
"grad_norm": 4.564992904663086,
|
||
|
"learning_rate": 3.2502226179875333e-06,
|
||
|
"loss": 0.6689,
|
||
|
"step": 11000
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.07928388746803,
|
||
|
"grad_norm": 7.109955310821533,
|
||
|
"learning_rate": 3.2056990204808546e-06,
|
||
|
"loss": 0.4216,
|
||
|
"step": 11010
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.092071611253196,
|
||
|
"grad_norm": 3.396707773208618,
|
||
|
"learning_rate": 3.1611754229741767e-06,
|
||
|
"loss": 0.4376,
|
||
|
"step": 11020
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.104859335038363,
|
||
|
"grad_norm": 2.241431951522827,
|
||
|
"learning_rate": 3.1166518254674976e-06,
|
||
|
"loss": 0.4878,
|
||
|
"step": 11030
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.117647058823529,
|
||
|
"grad_norm": 0.006932465359568596,
|
||
|
"learning_rate": 3.0721282279608193e-06,
|
||
|
"loss": 0.6128,
|
||
|
"step": 11040
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.130434782608695,
|
||
|
"grad_norm": 3.4660990238189697,
|
||
|
"learning_rate": 3.0276046304541406e-06,
|
||
|
"loss": 0.3435,
|
||
|
"step": 11050
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.143222506393862,
|
||
|
"grad_norm": 2.24495792388916,
|
||
|
"learning_rate": 2.9830810329474623e-06,
|
||
|
"loss": 0.3927,
|
||
|
"step": 11060
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.156010230179028,
|
||
|
"grad_norm": 11.082945823669434,
|
||
|
"learning_rate": 2.938557435440784e-06,
|
||
|
"loss": 0.2957,
|
||
|
"step": 11070
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.168797953964194,
|
||
|
"grad_norm": 7.921354293823242,
|
||
|
"learning_rate": 2.8940338379341054e-06,
|
||
|
"loss": 0.4532,
|
||
|
"step": 11080
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.18158567774936,
|
||
|
"grad_norm": 7.9056806564331055,
|
||
|
"learning_rate": 2.8495102404274267e-06,
|
||
|
"loss": 0.5782,
|
||
|
"step": 11090
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.194373401534527,
|
||
|
"grad_norm": 9.842604637145996,
|
||
|
"learning_rate": 2.804986642920748e-06,
|
||
|
"loss": 0.4558,
|
||
|
"step": 11100
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.207161125319693,
|
||
|
"grad_norm": 9.401638984680176,
|
||
|
"learning_rate": 2.7604630454140697e-06,
|
||
|
"loss": 0.3192,
|
||
|
"step": 11110
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.21994884910486,
|
||
|
"grad_norm": 1.4201183319091797,
|
||
|
"learning_rate": 2.715939447907391e-06,
|
||
|
"loss": 0.1412,
|
||
|
"step": 11120
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.232736572890026,
|
||
|
"grad_norm": 3.4593756198883057,
|
||
|
"learning_rate": 2.6714158504007123e-06,
|
||
|
"loss": 0.3824,
|
||
|
"step": 11130
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.245524296675192,
|
||
|
"grad_norm": 7.897853851318359,
|
||
|
"learning_rate": 2.626892252894034e-06,
|
||
|
"loss": 0.3943,
|
||
|
"step": 11140
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.258312020460359,
|
||
|
"grad_norm": 3.6695263385772705,
|
||
|
"learning_rate": 2.5823686553873553e-06,
|
||
|
"loss": 0.2146,
|
||
|
"step": 11150
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.271099744245525,
|
||
|
"grad_norm": 1.173981785774231,
|
||
|
"learning_rate": 2.5378450578806766e-06,
|
||
|
"loss": 0.3024,
|
||
|
"step": 11160
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.28388746803069,
|
||
|
"grad_norm": 0.7691462635993958,
|
||
|
"learning_rate": 2.4933214603739983e-06,
|
||
|
"loss": 0.2728,
|
||
|
"step": 11170
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.296675191815856,
|
||
|
"grad_norm": 6.5430169105529785,
|
||
|
"learning_rate": 2.44879786286732e-06,
|
||
|
"loss": 0.2054,
|
||
|
"step": 11180
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.309462915601022,
|
||
|
"grad_norm": 6.1036505699157715,
|
||
|
"learning_rate": 2.4042742653606414e-06,
|
||
|
"loss": 0.5137,
|
||
|
"step": 11190
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.322250639386189,
|
||
|
"grad_norm": 6.128425598144531,
|
||
|
"learning_rate": 2.3597506678539627e-06,
|
||
|
"loss": 0.727,
|
||
|
"step": 11200
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.335038363171355,
|
||
|
"grad_norm": 5.091713905334473,
|
||
|
"learning_rate": 2.3152270703472844e-06,
|
||
|
"loss": 0.405,
|
||
|
"step": 11210
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.347826086956522,
|
||
|
"grad_norm": 5.891820430755615,
|
||
|
"learning_rate": 2.2707034728406057e-06,
|
||
|
"loss": 0.3249,
|
||
|
"step": 11220
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.360613810741688,
|
||
|
"grad_norm": 7.751905918121338,
|
||
|
"learning_rate": 2.226179875333927e-06,
|
||
|
"loss": 0.375,
|
||
|
"step": 11230
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.373401534526854,
|
||
|
"grad_norm": 2.656019687652588,
|
||
|
"learning_rate": 2.1816562778272487e-06,
|
||
|
"loss": 0.3961,
|
||
|
"step": 11240
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.38618925831202,
|
||
|
"grad_norm": 4.47562313079834,
|
||
|
"learning_rate": 2.13713268032057e-06,
|
||
|
"loss": 0.2487,
|
||
|
"step": 11250
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.398976982097187,
|
||
|
"grad_norm": 0.4639877676963806,
|
||
|
"learning_rate": 2.0926090828138913e-06,
|
||
|
"loss": 0.3217,
|
||
|
"step": 11260
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.411764705882353,
|
||
|
"grad_norm": 3.0707197189331055,
|
||
|
"learning_rate": 2.0480854853072126e-06,
|
||
|
"loss": 0.2258,
|
||
|
"step": 11270
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.42455242966752,
|
||
|
"grad_norm": 1.7559298276901245,
|
||
|
"learning_rate": 2.0035618878005343e-06,
|
||
|
"loss": 0.3735,
|
||
|
"step": 11280
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.437340153452686,
|
||
|
"grad_norm": 3.869406223297119,
|
||
|
"learning_rate": 1.9590382902938556e-06,
|
||
|
"loss": 0.3116,
|
||
|
"step": 11290
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.450127877237852,
|
||
|
"grad_norm": 0.0035378236789256334,
|
||
|
"learning_rate": 1.9145146927871773e-06,
|
||
|
"loss": 0.5503,
|
||
|
"step": 11300
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.462915601023019,
|
||
|
"grad_norm": 10.881844520568848,
|
||
|
"learning_rate": 1.8699910952804989e-06,
|
||
|
"loss": 0.2782,
|
||
|
"step": 11310
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.475703324808185,
|
||
|
"grad_norm": 0.003400342771783471,
|
||
|
"learning_rate": 1.8254674977738202e-06,
|
||
|
"loss": 0.5413,
|
||
|
"step": 11320
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.48849104859335,
|
||
|
"grad_norm": 0.07425220310688019,
|
||
|
"learning_rate": 1.7809439002671417e-06,
|
||
|
"loss": 0.2179,
|
||
|
"step": 11330
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.501278772378516,
|
||
|
"grad_norm": 8.549428939819336,
|
||
|
"learning_rate": 1.7364203027604634e-06,
|
||
|
"loss": 0.4874,
|
||
|
"step": 11340
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.514066496163682,
|
||
|
"grad_norm": 2.6542978286743164,
|
||
|
"learning_rate": 1.6918967052537847e-06,
|
||
|
"loss": 0.4715,
|
||
|
"step": 11350
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.526854219948849,
|
||
|
"grad_norm": 2.429234266281128,
|
||
|
"learning_rate": 1.647373107747106e-06,
|
||
|
"loss": 0.2447,
|
||
|
"step": 11360
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.539641943734015,
|
||
|
"grad_norm": 0.5469067096710205,
|
||
|
"learning_rate": 1.6028495102404273e-06,
|
||
|
"loss": 0.2469,
|
||
|
"step": 11370
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.552429667519181,
|
||
|
"grad_norm": 3.950800657272339,
|
||
|
"learning_rate": 1.5583259127337488e-06,
|
||
|
"loss": 0.5352,
|
||
|
"step": 11380
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.565217391304348,
|
||
|
"grad_norm": 7.118297576904297,
|
||
|
"learning_rate": 1.5138023152270703e-06,
|
||
|
"loss": 0.5058,
|
||
|
"step": 11390
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.578005115089514,
|
||
|
"grad_norm": 0.0335397832095623,
|
||
|
"learning_rate": 1.469278717720392e-06,
|
||
|
"loss": 0.1879,
|
||
|
"step": 11400
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.59079283887468,
|
||
|
"grad_norm": 3.543941020965576,
|
||
|
"learning_rate": 1.4247551202137133e-06,
|
||
|
"loss": 0.2616,
|
||
|
"step": 11410
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.603580562659847,
|
||
|
"grad_norm": 8.556031227111816,
|
||
|
"learning_rate": 1.3802315227070348e-06,
|
||
|
"loss": 0.9001,
|
||
|
"step": 11420
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.616368286445013,
|
||
|
"grad_norm": 6.954402446746826,
|
||
|
"learning_rate": 1.3357079252003561e-06,
|
||
|
"loss": 0.5127,
|
||
|
"step": 11430
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.62915601023018,
|
||
|
"grad_norm": 1.0035245418548584,
|
||
|
"learning_rate": 1.2911843276936777e-06,
|
||
|
"loss": 0.3251,
|
||
|
"step": 11440
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.641943734015346,
|
||
|
"grad_norm": 1.676684021949768,
|
||
|
"learning_rate": 1.2466607301869992e-06,
|
||
|
"loss": 0.1386,
|
||
|
"step": 11450
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.654731457800512,
|
||
|
"grad_norm": 0.003588082268834114,
|
||
|
"learning_rate": 1.2021371326803207e-06,
|
||
|
"loss": 0.4051,
|
||
|
"step": 11460
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.667519181585678,
|
||
|
"grad_norm": 0.2613386809825897,
|
||
|
"learning_rate": 1.1576135351736422e-06,
|
||
|
"loss": 0.2099,
|
||
|
"step": 11470
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.680306905370845,
|
||
|
"grad_norm": 2.826719045639038,
|
||
|
"learning_rate": 1.1130899376669635e-06,
|
||
|
"loss": 0.4122,
|
||
|
"step": 11480
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.693094629156011,
|
||
|
"grad_norm": 0.5583800673484802,
|
||
|
"learning_rate": 1.068566340160285e-06,
|
||
|
"loss": 0.8122,
|
||
|
"step": 11490
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.705882352941176,
|
||
|
"grad_norm": 4.03103494644165,
|
||
|
"learning_rate": 1.0240427426536063e-06,
|
||
|
"loss": 0.6181,
|
||
|
"step": 11500
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.718670076726342,
|
||
|
"grad_norm": 1.6675188541412354,
|
||
|
"learning_rate": 9.795191451469278e-07,
|
||
|
"loss": 0.123,
|
||
|
"step": 11510
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.731457800511508,
|
||
|
"grad_norm": 4.71064567565918,
|
||
|
"learning_rate": 9.349955476402494e-07,
|
||
|
"loss": 0.2775,
|
||
|
"step": 11520
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.744245524296675,
|
||
|
"grad_norm": 12.104289054870605,
|
||
|
"learning_rate": 8.904719501335708e-07,
|
||
|
"loss": 0.6795,
|
||
|
"step": 11530
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.757033248081841,
|
||
|
"grad_norm": 5.273158550262451,
|
||
|
"learning_rate": 8.459483526268923e-07,
|
||
|
"loss": 0.3161,
|
||
|
"step": 11540
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.769820971867007,
|
||
|
"grad_norm": 4.798052787780762,
|
||
|
"learning_rate": 8.014247551202136e-07,
|
||
|
"loss": 0.7235,
|
||
|
"step": 11550
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.782608695652174,
|
||
|
"grad_norm": 0.33628830313682556,
|
||
|
"learning_rate": 7.569011576135352e-07,
|
||
|
"loss": 0.2189,
|
||
|
"step": 11560
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.79539641943734,
|
||
|
"grad_norm": 3.923007011413574,
|
||
|
"learning_rate": 7.123775601068567e-07,
|
||
|
"loss": 0.3109,
|
||
|
"step": 11570
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.808184143222507,
|
||
|
"grad_norm": 1.1927415132522583,
|
||
|
"learning_rate": 6.678539626001781e-07,
|
||
|
"loss": 0.3873,
|
||
|
"step": 11580
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.820971867007673,
|
||
|
"grad_norm": 0.0009544580243527889,
|
||
|
"learning_rate": 6.233303650934996e-07,
|
||
|
"loss": 0.203,
|
||
|
"step": 11590
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.83375959079284,
|
||
|
"grad_norm": 5.8102641105651855,
|
||
|
"learning_rate": 5.788067675868211e-07,
|
||
|
"loss": 0.3458,
|
||
|
"step": 11600
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.846547314578006,
|
||
|
"grad_norm": 0.0030290207359939814,
|
||
|
"learning_rate": 5.342831700801425e-07,
|
||
|
"loss": 0.6794,
|
||
|
"step": 11610
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.859335038363172,
|
||
|
"grad_norm": 9.012167930603027,
|
||
|
"learning_rate": 4.897595725734639e-07,
|
||
|
"loss": 0.4832,
|
||
|
"step": 11620
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.872122762148338,
|
||
|
"grad_norm": 18.302406311035156,
|
||
|
"learning_rate": 4.452359750667854e-07,
|
||
|
"loss": 0.6115,
|
||
|
"step": 11630
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.884910485933505,
|
||
|
"grad_norm": 3.3673505783081055,
|
||
|
"learning_rate": 4.007123775601068e-07,
|
||
|
"loss": 0.1894,
|
||
|
"step": 11640
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.89769820971867,
|
||
|
"grad_norm": 0.5946460366249084,
|
||
|
"learning_rate": 3.5618878005342833e-07,
|
||
|
"loss": 0.2352,
|
||
|
"step": 11650
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.910485933503836,
|
||
|
"grad_norm": 5.374065399169922,
|
||
|
"learning_rate": 3.116651825467498e-07,
|
||
|
"loss": 0.5661,
|
||
|
"step": 11660
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.923273657289002,
|
||
|
"grad_norm": 0.6203376054763794,
|
||
|
"learning_rate": 2.6714158504007125e-07,
|
||
|
"loss": 0.2715,
|
||
|
"step": 11670
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.936061381074168,
|
||
|
"grad_norm": 0.9134934544563293,
|
||
|
"learning_rate": 2.226179875333927e-07,
|
||
|
"loss": 0.3172,
|
||
|
"step": 11680
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.948849104859335,
|
||
|
"grad_norm": 0.07863820344209671,
|
||
|
"learning_rate": 1.7809439002671417e-07,
|
||
|
"loss": 0.4238,
|
||
|
"step": 11690
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.961636828644501,
|
||
|
"grad_norm": 3.0305428504943848,
|
||
|
"learning_rate": 1.3357079252003563e-07,
|
||
|
"loss": 0.5941,
|
||
|
"step": 11700
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.974424552429667,
|
||
|
"grad_norm": 3.0656425952911377,
|
||
|
"learning_rate": 8.904719501335708e-08,
|
||
|
"loss": 0.3601,
|
||
|
"step": 11710
|
||
|
},
|
||
|
{
|
||
|
"epoch": 14.987212276214834,
|
||
|
"grad_norm": 1.1194722652435303,
|
||
|
"learning_rate": 4.452359750667854e-08,
|
||
|
"loss": 0.3946,
|
||
|
"step": 11720
|
||
|
},
|
||
|
{
|
||
|
"epoch": 15.0,
|
||
|
"grad_norm": 0.5027629137039185,
|
||
|
"learning_rate": 0.0,
|
||
|
"loss": 0.5649,
|
||
|
"step": 11730
|
||
|
},
|
||
|
{
|
||
|
"epoch": 15.0,
|
||
|
"eval_loss": 0.22965534031391144,
|
||
|
"eval_runtime": 0.8342,
|
||
|
"eval_samples_per_second": 117.472,
|
||
|
"eval_steps_per_second": 15.583,
|
||
|
"step": 11730
|
||
|
}
|
||
|
],
|
||
|
"logging_steps": 10,
|
||
|
"max_steps": 11730,
|
||
|
"num_input_tokens_seen": 0,
|
||
|
"num_train_epochs": 15,
|
||
|
"save_steps": 500,
|
||
|
"stateful_callbacks": {
|
||
|
"TrainerControl": {
|
||
|
"args": {
|
||
|
"should_epoch_stop": false,
|
||
|
"should_evaluate": false,
|
||
|
"should_log": false,
|
||
|
"should_save": true,
|
||
|
"should_training_stop": true
|
||
|
},
|
||
|
"attributes": {}
|
||
|
}
|
||
|
},
|
||
|
"total_flos": 1550558468505600.0,
|
||
|
"train_batch_size": 1,
|
||
|
"trial_name": null,
|
||
|
"trial_params": null
|
||
|
}
|