-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.json
1 lines (1 loc) · 157 KB
/
data.json
1
[{"val": {"loss": 4.594279885292053, "accuracy": 0.01025390625, "attn_entropy": 0.7219804227352142, "param_norm": 121.8750153082079}, "ground_truth": {"loss": 4.583467066287994, "accuracy": 0.0126953125, "attn_entropy": 0.7223016582429409, "param_norm": 121.8750153082079}, "train": {"loss": 4.591872692108154, "accuracy": 0.017578125, "attn_entropy": 0.63545361161232, "param_norm": 121.87102287766795}, "step": 10, "lr": 0.001}, {"val": {"loss": 4.612835049629211, "accuracy": 0.006103515625, "attn_entropy": 0.6922217793762684, "param_norm": 121.90414824226515}, "ground_truth": {"loss": 4.621441841125488, "accuracy": 0.010009765625, "attn_entropy": 0.692286528646946, "param_norm": 121.90414824226515}, "train": {"loss": 4.5894269943237305, "accuracy": 0.01171875, "attn_entropy": 0.5921379029750824, "param_norm": 121.90267073420799}, "step": 20, "lr": 0.001}, {"val": {"loss": 4.608464360237122, "accuracy": 0.010009765625, "attn_entropy": 0.6572220139205456, "param_norm": 121.92316599750691}, "ground_truth": {"loss": 4.59269255399704, "accuracy": 0.002685546875, "attn_entropy": 0.6569989882409573, "param_norm": 121.92316599750691}, "train": {"loss": 4.579757213592529, "accuracy": 0.0078125, "attn_entropy": 0.5680878460407257, "param_norm": 121.92026019945273}, "step": 30, "lr": 0.001}, {"val": {"loss": 4.5934640765190125, "accuracy": 0.009765625, "attn_entropy": 0.5535182133316994, "param_norm": 121.96388332793904}, "ground_truth": {"loss": 4.604132413864136, "accuracy": 0.01025390625, "attn_entropy": 0.5535905957221985, "param_norm": 121.96388332793904}, "train": {"loss": 4.573510646820068, "accuracy": 0.015625, "attn_entropy": 0.46864162385463715, "param_norm": 121.95917833805231}, "step": 40, "lr": 0.001}, {"val": {"loss": 4.595536947250366, "accuracy": 0.005126953125, "attn_entropy": 0.4887684788554907, "param_norm": 122.01535194809568}, "ground_truth": {"loss": 4.61325204372406, "accuracy": 0.010009765625, "attn_entropy": 0.48680464923381805, "param_norm": 122.01535194809568}, "train": {"loss": 4.564382076263428, "accuracy": 0.017578125, "attn_entropy": 0.39846986532211304, "param_norm": 122.01009598813155}, "step": 50, "lr": 0.001}, {"val": {"loss": 4.60455048084259, "accuracy": 0.009521484375, "attn_entropy": 0.44680112786591053, "param_norm": 122.07718169871342}, "ground_truth": {"loss": 4.59404993057251, "accuracy": 0.007080078125, "attn_entropy": 0.4444937203079462, "param_norm": 122.07718169871342}, "train": {"loss": 4.557654857635498, "accuracy": 0.01171875, "attn_entropy": 0.3558581471443176, "param_norm": 122.07022027478548}, "step": 60, "lr": 0.001}, {"val": {"loss": 4.60322380065918, "accuracy": 0.010009765625, "attn_entropy": 0.41366141848266125, "param_norm": 122.15449190020897}, "ground_truth": {"loss": 4.603328227996826, "accuracy": 0.021484375, "attn_entropy": 0.4079647846519947, "param_norm": 122.15449190020897}, "train": {"loss": 4.531663417816162, "accuracy": 0.015625, "attn_entropy": 0.32787103950977325, "param_norm": 122.14612006389892}, "step": 70, "lr": 0.001}, {"val": {"loss": 4.613166332244873, "accuracy": 0.006103515625, "attn_entropy": 0.3791139926761389, "param_norm": 122.2460939395959}, "ground_truth": {"loss": 4.616857349872589, "accuracy": 0.004150390625, "attn_entropy": 0.37391154281795025, "param_norm": 122.2460939395959}, "train": {"loss": 4.555311679840088, "accuracy": 0.01953125, "attn_entropy": 0.2874717712402344, "param_norm": 122.23631877710564}, "step": 80, "lr": 0.001}, {"val": {"loss": 4.616944968700409, "accuracy": 0.0068359375, "attn_entropy": 0.34596178121864796, "param_norm": 122.35201188370704}, "ground_truth": {"loss": 4.608799040317535, "accuracy": 0.01416015625, "attn_entropy": 0.3418096285313368, "param_norm": 122.35201188370704}, "train": {"loss": 4.522709846496582, "accuracy": 0.03125, "attn_entropy": 0.25555458664894104, "param_norm": 122.34100346223171}, "step": 90, "lr": 0.001}, {"val": {"loss": 4.614219069480896, "accuracy": 0.007080078125, "attn_entropy": 0.31912753358483315, "param_norm": 122.47521185963623}, "ground_truth": {"loss": 4.625028192996979, "accuracy": 0.0126953125, "attn_entropy": 0.31391786225140095, "param_norm": 122.47521185963623}, "train": {"loss": 4.495171070098877, "accuracy": 0.03125, "attn_entropy": 0.22975172102451324, "param_norm": 122.46182818613622}, "step": 100, "lr": 0.001}, {"val": {"loss": 4.633833169937134, "accuracy": 0.006103515625, "attn_entropy": 0.2921015676110983, "param_norm": 122.62044566733981}, "ground_truth": {"loss": 4.6396554708480835, "accuracy": 0.003173828125, "attn_entropy": 0.290496526286006, "param_norm": 122.62044566733981}, "train": {"loss": 4.4921369552612305, "accuracy": 0.015625, "attn_entropy": 0.204623281955719, "param_norm": 122.60530089343683}, "step": 110, "lr": 0.001}, {"val": {"loss": 4.6553356647491455, "accuracy": 0.007080078125, "attn_entropy": 0.279239384457469, "param_norm": 122.78119846008495}, "ground_truth": {"loss": 4.64872419834137, "accuracy": 0.019775390625, "attn_entropy": 0.277567395940423, "param_norm": 122.78119846008495}, "train": {"loss": 4.466106414794922, "accuracy": 0.03515625, "attn_entropy": 0.1908455640077591, "param_norm": 122.76436873499082}, "step": 120, "lr": 0.001}, {"val": {"loss": 4.679832756519318, "accuracy": 0.0068359375, "attn_entropy": 0.2685586977750063, "param_norm": 122.95754144001592}, "ground_truth": {"loss": 4.681183695793152, "accuracy": 0.01953125, "attn_entropy": 0.264633234590292, "param_norm": 122.95754144001592}, "train": {"loss": 4.440983295440674, "accuracy": 0.04296875, "attn_entropy": 0.17589475959539413, "param_norm": 122.93858826133292}, "step": 130, "lr": 0.001}, {"val": {"loss": 4.697492718696594, "accuracy": 0.009765625, "attn_entropy": 0.2525237826630473, "param_norm": 123.15674658019942}, "ground_truth": {"loss": 4.742733716964722, "accuracy": 0.009521484375, "attn_entropy": 0.24648677743971348, "param_norm": 123.15674658019942}, "train": {"loss": 4.410593032836914, "accuracy": 0.052734375, "attn_entropy": 0.15716084092855453, "param_norm": 123.1357779753278}, "step": 140, "lr": 0.001}, {"val": {"loss": 4.79804527759552, "accuracy": 0.007568359375, "attn_entropy": 0.24334660731256008, "param_norm": 123.37984392730131}, "ground_truth": {"loss": 4.877881586551666, "accuracy": 0.011474609375, "attn_entropy": 0.2389513934031129, "param_norm": 123.37984392730131}, "train": {"loss": 4.346066951751709, "accuracy": 0.05078125, "attn_entropy": 0.15637605637311935, "param_norm": 123.35770496635561}, "step": 150, "lr": 0.001}, {"val": {"loss": 4.999540567398071, "accuracy": 0.006103515625, "attn_entropy": 0.23914594110101461, "param_norm": 123.58698544443642}, "ground_truth": {"loss": 5.09455543756485, "accuracy": 0.014892578125, "attn_entropy": 0.23737667221575975, "param_norm": 123.58698544443642}, "train": {"loss": 4.227836608886719, "accuracy": 0.05859375, "attn_entropy": 0.14904087781906128, "param_norm": 123.56675927395555}, "step": 160, "lr": 0.001}, {"val": {"loss": 5.1724772453308105, "accuracy": 0.00830078125, "attn_entropy": 0.22935693059116602, "param_norm": 123.75458516783303}, "ground_truth": {"loss": 5.262948095798492, "accuracy": 0.01611328125, "attn_entropy": 0.22651180904358625, "param_norm": 123.75458516783303}, "train": {"loss": 4.2290802001953125, "accuracy": 0.05078125, "attn_entropy": 0.1386854648590088, "param_norm": 123.74011288662435}, "step": 170, "lr": 0.001}, {"val": {"loss": 5.205318212509155, "accuracy": 0.01025390625, "attn_entropy": 0.21842783316969872, "param_norm": 123.89468922645423}, "ground_truth": {"loss": 5.358752369880676, "accuracy": 0.016357421875, "attn_entropy": 0.2159428671002388, "param_norm": 123.89468922645423}, "train": {"loss": 4.230528354644775, "accuracy": 0.078125, "attn_entropy": 0.13152937591075897, "param_norm": 123.8815206380558}, "step": 180, "lr": 0.001}, {"val": {"loss": 5.2420647740364075, "accuracy": 0.01025390625, "attn_entropy": 0.21698432881385088, "param_norm": 124.01892488161455}, "ground_truth": {"loss": 5.3566694259643555, "accuracy": 0.009033203125, "attn_entropy": 0.21436184272170067, "param_norm": 124.01892488161455}, "train": {"loss": 4.187121868133545, "accuracy": 0.0625, "attn_entropy": 0.12646032869815826, "param_norm": 124.00600595026678}, "step": 190, "lr": 0.001}, {"val": {"loss": 5.300320267677307, "accuracy": 0.014404296875, "attn_entropy": 0.20954707264900208, "param_norm": 124.14621856597022}, "ground_truth": {"loss": 5.422053813934326, "accuracy": 0.02685546875, "attn_entropy": 0.21170819364488125, "param_norm": 124.14621856597022}, "train": {"loss": 4.0718584060668945, "accuracy": 0.107421875, "attn_entropy": 0.125257708132267, "param_norm": 124.13359884624876}, "step": 200, "lr": 0.001}, {"val": {"loss": 5.332078695297241, "accuracy": 0.009033203125, "attn_entropy": 0.20488916523754597, "param_norm": 124.26550424132073}, "ground_truth": {"loss": 5.526282429695129, "accuracy": 0.019287109375, "attn_entropy": 0.20144387800246477, "param_norm": 124.26550424132073}, "train": {"loss": 4.165070056915283, "accuracy": 0.078125, "attn_entropy": 0.11679795756936073, "param_norm": 124.25391702921162}, "step": 210, "lr": 0.001}, {"val": {"loss": 5.4314292669296265, "accuracy": 0.00927734375, "attn_entropy": 0.19535310193896294, "param_norm": 124.3918041534656}, "ground_truth": {"loss": 5.574137568473816, "accuracy": 0.014892578125, "attn_entropy": 0.1921254489570856, "param_norm": 124.3918041534656}, "train": {"loss": 4.119430065155029, "accuracy": 0.06640625, "attn_entropy": 0.10811083018779755, "param_norm": 124.37934836327958}, "step": 220, "lr": 0.001}, {"val": {"loss": 5.478249967098236, "accuracy": 0.012451171875, "attn_entropy": 0.19237651117146015, "param_norm": 124.51624274311531}, "ground_truth": {"loss": 5.60814768075943, "accuracy": 0.025146484375, "attn_entropy": 0.19094852078706026, "param_norm": 124.51624274311531}, "train": {"loss": 4.096652030944824, "accuracy": 0.083984375, "attn_entropy": 0.09405866637825966, "param_norm": 124.50420744785926}, "step": 230, "lr": 0.001}, {"val": {"loss": 5.490205109119415, "accuracy": 0.011962890625, "attn_entropy": 0.18686640448868275, "param_norm": 124.63161011085195}, "ground_truth": {"loss": 5.656213462352753, "accuracy": 0.0185546875, "attn_entropy": 0.1847080048173666, "param_norm": 124.63161011085195}, "train": {"loss": 4.049617290496826, "accuracy": 0.08984375, "attn_entropy": 0.09832999855279922, "param_norm": 124.62047322123277}, "step": 240, "lr": 0.001}, {"val": {"loss": 5.490980327129364, "accuracy": 0.0126953125, "attn_entropy": 0.18563394527882338, "param_norm": 124.74140716086652}, "ground_truth": {"loss": 5.656631290912628, "accuracy": 0.024169921875, "attn_entropy": 0.18238491844385862, "param_norm": 124.74140716086652}, "train": {"loss": 3.940467596054077, "accuracy": 0.11328125, "attn_entropy": 0.09324643015861511, "param_norm": 124.73065210860379}, "step": 250, "lr": 0.001}, {"val": {"loss": 5.522727370262146, "accuracy": 0.015869140625, "attn_entropy": 0.18604940548539162, "param_norm": 124.85173111791654}, "ground_truth": {"loss": 5.688613474369049, "accuracy": 0.025390625, "attn_entropy": 0.18168814852833748, "param_norm": 124.85173111791654}, "train": {"loss": 3.9136292934417725, "accuracy": 0.125, "attn_entropy": 0.0937701165676117, "param_norm": 124.84038619159007}, "step": 260, "lr": 0.001}, {"val": {"loss": 5.6055604219436646, "accuracy": 0.010986328125, "attn_entropy": 0.1807390544563532, "param_norm": 124.96664486874887}, "ground_truth": {"loss": 5.745224893093109, "accuracy": 0.024658203125, "attn_entropy": 0.1787789911031723, "param_norm": 124.96664486874887}, "train": {"loss": 3.9155962467193604, "accuracy": 0.142578125, "attn_entropy": 0.0881526805460453, "param_norm": 124.9557828766457}, "step": 270, "lr": 0.001}, {"val": {"loss": 5.591505229473114, "accuracy": 0.0126953125, "attn_entropy": 0.17615624703466892, "param_norm": 125.07764404430463}, "ground_truth": {"loss": 5.757704198360443, "accuracy": 0.0166015625, "attn_entropy": 0.170874685049057, "param_norm": 125.07764404430463}, "train": {"loss": 3.834301233291626, "accuracy": 0.123046875, "attn_entropy": 0.08768071979284286, "param_norm": 125.06642141296498}, "step": 280, "lr": 0.001}, {"val": {"loss": 5.6218907833099365, "accuracy": 0.013427734375, "attn_entropy": 0.17369584180414677, "param_norm": 125.1935119339787}, "ground_truth": {"loss": 5.818986475467682, "accuracy": 0.0166015625, "attn_entropy": 0.16977530531585217, "param_norm": 125.1935119339787}, "train": {"loss": 3.8253519535064697, "accuracy": 0.140625, "attn_entropy": 0.07744232937693596, "param_norm": 125.1823714445514}, "step": 290, "lr": 0.001}, {"val": {"loss": 5.586330771446228, "accuracy": 0.01416015625, "attn_entropy": 0.17143205553293228, "param_norm": 125.30666526058566}, "ground_truth": {"loss": 5.8503366112709045, "accuracy": 0.026123046875, "attn_entropy": 0.16857696138322353, "param_norm": 125.30666526058566}, "train": {"loss": 3.8378429412841797, "accuracy": 0.12890625, "attn_entropy": 0.07949161902070045, "param_norm": 125.29525044973413}, "step": 300, "lr": 0.001}, {"val": {"loss": 5.629417359828949, "accuracy": 0.020263671875, "attn_entropy": 0.16847451403737068, "param_norm": 125.42308665640532}, "ground_truth": {"loss": 5.877093493938446, "accuracy": 0.016357421875, "attn_entropy": 0.16570120491087437, "param_norm": 125.42308665640532}, "train": {"loss": 3.8175840377807617, "accuracy": 0.150390625, "attn_entropy": 0.07483382523059845, "param_norm": 125.41173215599854}, "step": 310, "lr": 0.001}, {"val": {"loss": 5.700646877288818, "accuracy": 0.015869140625, "attn_entropy": 0.16390046291053295, "param_norm": 125.53640155815253}, "ground_truth": {"loss": 5.935691595077515, "accuracy": 0.018798828125, "attn_entropy": 0.1621920671314001, "param_norm": 125.53640155815253}, "train": {"loss": 3.68567156791687, "accuracy": 0.169921875, "attn_entropy": 0.07148800790309906, "param_norm": 125.52499697236416}, "step": 320, "lr": 0.001}, {"val": {"loss": 5.701316595077515, "accuracy": 0.016357421875, "attn_entropy": 0.1609683372080326, "param_norm": 125.64878664557922}, "ground_truth": {"loss": 5.986230492591858, "accuracy": 0.01611328125, "attn_entropy": 0.1572690475732088, "param_norm": 125.64878664557922}, "train": {"loss": 3.7040305137634277, "accuracy": 0.14453125, "attn_entropy": 0.06515307910740376, "param_norm": 125.63792633406987}, "step": 330, "lr": 0.001}, {"val": {"loss": 5.644801735877991, "accuracy": 0.0224609375, "attn_entropy": 0.15641225408762693, "param_norm": 125.7592172423808}, "ground_truth": {"loss": 5.940473139286041, "accuracy": 0.01904296875, "attn_entropy": 0.154520520940423, "param_norm": 125.7592172423808}, "train": {"loss": 3.6685972213745117, "accuracy": 0.177734375, "attn_entropy": 0.0656109657138586, "param_norm": 125.7478433103469}, "step": 340, "lr": 0.001}, {"val": {"loss": 5.7439218163490295, "accuracy": 0.014892578125, "attn_entropy": 0.15717646945267916, "param_norm": 125.87020123696604}, "ground_truth": {"loss": 5.996939122676849, "accuracy": 0.02001953125, "attn_entropy": 0.15258935932070017, "param_norm": 125.87020123696604}, "train": {"loss": 3.61739182472229, "accuracy": 0.181640625, "attn_entropy": 0.06435095518827438, "param_norm": 125.85925328740451}, "step": 350, "lr": 0.001}, {"val": {"loss": 5.736214637756348, "accuracy": 0.019287109375, "attn_entropy": 0.15673279203474522, "param_norm": 125.97869472151162}, "ground_truth": {"loss": 6.046280384063721, "accuracy": 0.025634765625, "attn_entropy": 0.15374373272061348, "param_norm": 125.97869472151162}, "train": {"loss": 3.576685667037964, "accuracy": 0.1796875, "attn_entropy": 0.062127161771059036, "param_norm": 125.96788129834357}, "step": 360, "lr": 0.001}, {"val": {"loss": 5.782017827033997, "accuracy": 0.020751953125, "attn_entropy": 0.1555442614480853, "param_norm": 126.08690933220498}, "ground_truth": {"loss": 5.994977414608002, "accuracy": 0.02099609375, "attn_entropy": 0.1537030916661024, "param_norm": 126.08690933220498}, "train": {"loss": 3.5317513942718506, "accuracy": 0.21484375, "attn_entropy": 0.06090899929404259, "param_norm": 126.07643387397583}, "step": 370, "lr": 0.001}, {"val": {"loss": 5.808298647403717, "accuracy": 0.02490234375, "attn_entropy": 0.15399721264839172, "param_norm": 126.19550691595069}, "ground_truth": {"loss": 6.092039704322815, "accuracy": 0.020263671875, "attn_entropy": 0.15157882031053305, "param_norm": 126.19550691595069}, "train": {"loss": 3.591052770614624, "accuracy": 0.169921875, "attn_entropy": 0.06119788624346256, "param_norm": 126.18431364252416}, "step": 380, "lr": 0.001}, {"val": {"loss": 5.822998583316803, "accuracy": 0.02734375, "attn_entropy": 0.15731317549943924, "param_norm": 126.31192960048806}, "ground_truth": {"loss": 6.059002757072449, "accuracy": 0.024658203125, "attn_entropy": 0.15287169814109802, "param_norm": 126.31192960048806}, "train": {"loss": 3.3833792209625244, "accuracy": 0.23046875, "attn_entropy": 0.05883431062102318, "param_norm": 126.30009508049777}, "step": 390, "lr": 0.001}, {"val": {"loss": 5.775799870491028, "accuracy": 0.035888671875, "attn_entropy": 0.1549274381250143, "param_norm": 126.42454330833324}, "ground_truth": {"loss": 6.111547291278839, "accuracy": 0.0205078125, "attn_entropy": 0.15079282131046057, "param_norm": 126.42454330833324}, "train": {"loss": 3.521775245666504, "accuracy": 0.18359375, "attn_entropy": 0.05869711562991142, "param_norm": 126.41368246699153}, "step": 400, "lr": 0.001}, {"val": {"loss": 5.8443872928619385, "accuracy": 0.0322265625, "attn_entropy": 0.15451697446405888, "param_norm": 126.53442107795999}, "ground_truth": {"loss": 6.154454171657562, "accuracy": 0.01513671875, "attn_entropy": 0.14906197041273117, "param_norm": 126.53442107795999}, "train": {"loss": 3.5123586654663086, "accuracy": 0.1875, "attn_entropy": 0.061971183866262436, "param_norm": 126.52343544384401}, "step": 410, "lr": 0.001}, {"val": {"loss": 5.8392192125320435, "accuracy": 0.03271484375, "attn_entropy": 0.15171152353286743, "param_norm": 126.64436071352681}, "ground_truth": {"loss": 6.134539484977722, "accuracy": 0.021484375, "attn_entropy": 0.1477947048842907, "param_norm": 126.64436071352681}, "train": {"loss": 3.3427894115448, "accuracy": 0.251953125, "attn_entropy": 0.05836816690862179, "param_norm": 126.63330620988864}, "step": 420, "lr": 0.001}, {"val": {"loss": 5.912562727928162, "accuracy": 0.03564453125, "attn_entropy": 0.15120849292725325, "param_norm": 126.75543658325135}, "ground_truth": {"loss": 6.181281089782715, "accuracy": 0.010009765625, "attn_entropy": 0.14836007356643677, "param_norm": 126.75543658325135}, "train": {"loss": 3.409463882446289, "accuracy": 0.216796875, "attn_entropy": 0.06091800332069397, "param_norm": 126.74408049050965}, "step": 430, "lr": 0.001}, {"val": {"loss": 5.936709523200989, "accuracy": 0.032470703125, "attn_entropy": 0.1512548578903079, "param_norm": 126.86572478025388}, "ground_truth": {"loss": 6.25166642665863, "accuracy": 0.01220703125, "attn_entropy": 0.14806992188096046, "param_norm": 126.86572478025388}, "train": {"loss": 3.4030306339263916, "accuracy": 0.244140625, "attn_entropy": 0.05956925079226494, "param_norm": 126.85503879510964}, "step": 440, "lr": 0.001}, {"val": {"loss": 5.953439116477966, "accuracy": 0.031005859375, "attn_entropy": 0.15011885669082403, "param_norm": 126.97532399150099}, "ground_truth": {"loss": 6.169587969779968, "accuracy": 0.017578125, "attn_entropy": 0.1458144816569984, "param_norm": 126.97532399150099}, "train": {"loss": 3.3270275592803955, "accuracy": 0.21484375, "attn_entropy": 0.05563156493008137, "param_norm": 126.96410563490312}, "step": 450, "lr": 0.001}, {"val": {"loss": 5.89757913351059, "accuracy": 0.039306640625, "attn_entropy": 0.14784324821084738, "param_norm": 127.08899591634169}, "ground_truth": {"loss": 6.24809467792511, "accuracy": 0.0234375, "attn_entropy": 0.1433021673001349, "param_norm": 127.08899591634169}, "train": {"loss": 3.246654987335205, "accuracy": 0.232421875, "attn_entropy": 0.053932687267661095, "param_norm": 127.07752791107264}, "step": 460, "lr": 0.001}, {"val": {"loss": 5.964172780513763, "accuracy": 0.0361328125, "attn_entropy": 0.15022848267108202, "param_norm": 127.20014719695543}, "ground_truth": {"loss": 6.319421648979187, "accuracy": 0.025390625, "attn_entropy": 0.14680309686809778, "param_norm": 127.20014719695543}, "train": {"loss": 3.302743911743164, "accuracy": 0.19921875, "attn_entropy": 0.05479878932237625, "param_norm": 127.18949232673296}, "step": 470, "lr": 0.001}, {"val": {"loss": 5.965970039367676, "accuracy": 0.042236328125, "attn_entropy": 0.1495414860546589, "param_norm": 127.30402539001457}, "ground_truth": {"loss": 6.2507219314575195, "accuracy": 0.015625, "attn_entropy": 0.14454405196011066, "param_norm": 127.30402539001457}, "train": {"loss": 3.3426456451416016, "accuracy": 0.228515625, "attn_entropy": 0.05704818665981293, "param_norm": 127.29366249263354}, "step": 480, "lr": 0.001}, {"val": {"loss": 5.9817004799842834, "accuracy": 0.043701171875, "attn_entropy": 0.15086335223168135, "param_norm": 127.41258709020431}, "ground_truth": {"loss": 6.322962939739227, "accuracy": 0.02587890625, "attn_entropy": 0.14627508260309696, "param_norm": 127.41258709020431}, "train": {"loss": 3.278095006942749, "accuracy": 0.25, "attn_entropy": 0.05302807316184044, "param_norm": 127.4018966762284}, "step": 490, "lr": 0.001}, {"val": {"loss": 6.000580608844757, "accuracy": 0.048583984375, "attn_entropy": 0.1506309099495411, "param_norm": 127.52186143503027}, "ground_truth": {"loss": 6.342121422290802, "accuracy": 0.02685546875, "attn_entropy": 0.14594499580562115, "param_norm": 127.52186143503027}, "train": {"loss": 3.1032869815826416, "accuracy": 0.26953125, "attn_entropy": 0.05755722522735596, "param_norm": 127.51090604903615}, "step": 500, "lr": 0.001}, {"val": {"loss": 6.05521547794342, "accuracy": 0.045654296875, "attn_entropy": 0.14924809709191322, "param_norm": 127.63310033908257}, "ground_truth": {"loss": 6.355819225311279, "accuracy": 0.019775390625, "attn_entropy": 0.14720110781490803, "param_norm": 127.63310033908257}, "train": {"loss": 3.1982178688049316, "accuracy": 0.25, "attn_entropy": 0.05238840915262699, "param_norm": 127.62231444648428}, "step": 510, "lr": 0.001}, {"val": {"loss": 6.0508617758750916, "accuracy": 0.046142578125, "attn_entropy": 0.1481476416811347, "param_norm": 127.73944849305028}, "ground_truth": {"loss": 6.446830332279205, "accuracy": 0.01806640625, "attn_entropy": 0.14592506270855665, "param_norm": 127.73944849305028}, "train": {"loss": 3.116711139678955, "accuracy": 0.271484375, "attn_entropy": 0.055343884974718094, "param_norm": 127.72859044478734}, "step": 520, "lr": 0.001}, {"val": {"loss": 6.154681921005249, "accuracy": 0.046875, "attn_entropy": 0.1473313570022583, "param_norm": 127.84990300317439}, "ground_truth": {"loss": 6.399652600288391, "accuracy": 0.025146484375, "attn_entropy": 0.14365573599934578, "param_norm": 127.84990300317439}, "train": {"loss": 3.107562303543091, "accuracy": 0.25, "attn_entropy": 0.051675599068403244, "param_norm": 127.83877265371437}, "step": 530, "lr": 0.001}, {"val": {"loss": 6.137357711791992, "accuracy": 0.048583984375, "attn_entropy": 0.1462778765708208, "param_norm": 127.96237443642863}, "ground_truth": {"loss": 6.46468448638916, "accuracy": 0.02587890625, "attn_entropy": 0.1418538959696889, "param_norm": 127.96237443642863}, "train": {"loss": 3.0939619541168213, "accuracy": 0.287109375, "attn_entropy": 0.05170224420726299, "param_norm": 127.95101927212609}, "step": 540, "lr": 0.001}, {"val": {"loss": 6.063952803611755, "accuracy": 0.060791015625, "attn_entropy": 0.145398935303092, "param_norm": 128.0759063511}, "ground_truth": {"loss": 6.56752747297287, "accuracy": 0.02294921875, "attn_entropy": 0.14036945346742868, "param_norm": 128.0759063511}, "train": {"loss": 2.9842708110809326, "accuracy": 0.287109375, "attn_entropy": 0.05036565661430359, "param_norm": 128.0647254594151}, "step": 550, "lr": 0.001}, {"val": {"loss": 6.193920910358429, "accuracy": 0.04833984375, "attn_entropy": 0.14260025741532445, "param_norm": 128.18551667590552}, "ground_truth": {"loss": 6.589592278003693, "accuracy": 0.021240234375, "attn_entropy": 0.13802565587684512, "param_norm": 128.18551667590552}, "train": {"loss": 2.983304738998413, "accuracy": 0.294921875, "attn_entropy": 0.04644310940057039, "param_norm": 128.1746321299595}, "step": 560, "lr": 0.001}, {"val": {"loss": 6.118537783622742, "accuracy": 0.06005859375, "attn_entropy": 0.14269937807694077, "param_norm": 128.29439372037407}, "ground_truth": {"loss": 6.572428822517395, "accuracy": 0.029296875, "attn_entropy": 0.13703053584322333, "param_norm": 128.29439372037407}, "train": {"loss": 3.0090396404266357, "accuracy": 0.2734375, "attn_entropy": 0.04914291948080063, "param_norm": 128.28349999953195}, "step": 570, "lr": 0.001}, {"val": {"loss": 6.050354361534119, "accuracy": 0.06201171875, "attn_entropy": 0.14421109715476632, "param_norm": 128.40460857939397}, "ground_truth": {"loss": 6.6142972111701965, "accuracy": 0.0234375, "attn_entropy": 0.13893208838999271, "param_norm": 128.40460857939397}, "train": {"loss": 3.054609537124634, "accuracy": 0.27734375, "attn_entropy": 0.04693836160004139, "param_norm": 128.39358524467946}, "step": 580, "lr": 0.001}, {"val": {"loss": 6.159006357192993, "accuracy": 0.068603515625, "attn_entropy": 0.14531384408473969, "param_norm": 128.51418495762803}, "ground_truth": {"loss": 6.6351438164711, "accuracy": 0.029052734375, "attn_entropy": 0.14250416355207562, "param_norm": 128.51418495762803}, "train": {"loss": 2.9027812480926514, "accuracy": 0.310546875, "attn_entropy": 0.047827763482928276, "param_norm": 128.5033631207043}, "step": 590, "lr": 0.001}, {"val": {"loss": 6.186326563358307, "accuracy": 0.077392578125, "attn_entropy": 0.14711882919073105, "param_norm": 128.62221680922832}, "ground_truth": {"loss": 6.64938497543335, "accuracy": 0.027099609375, "attn_entropy": 0.1438528406433761, "param_norm": 128.62221680922832}, "train": {"loss": 2.890617847442627, "accuracy": 0.302734375, "attn_entropy": 0.050425431691110134, "param_norm": 128.61163048938283}, "step": 600, "lr": 0.001}, {"val": {"loss": 6.136586248874664, "accuracy": 0.072998046875, "attn_entropy": 0.14758290257304907, "param_norm": 128.72810830441043}, "ground_truth": {"loss": 6.678857266902924, "accuracy": 0.02783203125, "attn_entropy": 0.14180140476673841, "param_norm": 128.72810830441043}, "train": {"loss": 2.832008123397827, "accuracy": 0.328125, "attn_entropy": 0.05118764564394951, "param_norm": 128.71755639297518}, "step": 610, "lr": 0.001}, {"val": {"loss": 6.165488660335541, "accuracy": 0.070556640625, "attn_entropy": 0.14469221560284495, "param_norm": 128.83584698979283}, "ground_truth": {"loss": 6.597037494182587, "accuracy": 0.01318359375, "attn_entropy": 0.14043186511844397, "param_norm": 128.83584698979283}, "train": {"loss": 2.7846474647521973, "accuracy": 0.314453125, "attn_entropy": 0.049631085246801376, "param_norm": 128.8249817548608}, "step": 620, "lr": 0.001}, {"val": {"loss": 6.216589868068695, "accuracy": 0.07275390625, "attn_entropy": 0.14209059346467257, "param_norm": 128.94255090707153}, "ground_truth": {"loss": 6.674741744995117, "accuracy": 0.023193359375, "attn_entropy": 0.13679333636537194, "param_norm": 128.94255090707153}, "train": {"loss": 2.6758828163146973, "accuracy": 0.32421875, "attn_entropy": 0.047300102189183235, "param_norm": 128.931953355309}, "step": 630, "lr": 0.001}, {"val": {"loss": 6.267041981220245, "accuracy": 0.077880859375, "attn_entropy": 0.1410833983682096, "param_norm": 129.0477905178319}, "ground_truth": {"loss": 6.714787483215332, "accuracy": 0.024169921875, "attn_entropy": 0.13705691462382674, "param_norm": 129.0477905178319}, "train": {"loss": 2.781353235244751, "accuracy": 0.326171875, "attn_entropy": 0.04747484717518091, "param_norm": 129.03725164034546}, "step": 640, "lr": 0.001}, {"val": {"loss": 6.2068281173706055, "accuracy": 0.09130859375, "attn_entropy": 0.14077314641326666, "param_norm": 129.1557101742998}, "ground_truth": {"loss": 6.693525016307831, "accuracy": 0.033203125, "attn_entropy": 0.13776597892865539, "param_norm": 129.1557101742998}, "train": {"loss": 2.766124963760376, "accuracy": 0.322265625, "attn_entropy": 0.04718185216188431, "param_norm": 129.14464965645385}, "step": 650, "lr": 0.001}, {"val": {"loss": 6.209581911563873, "accuracy": 0.085693359375, "attn_entropy": 0.1409827289171517, "param_norm": 129.26337174838872}, "ground_truth": {"loss": 6.7696123123168945, "accuracy": 0.018310546875, "attn_entropy": 0.13782657589763403, "param_norm": 129.26337174838872}, "train": {"loss": 2.827981472015381, "accuracy": 0.314453125, "attn_entropy": 0.048555062152445316, "param_norm": 129.25285206003738}, "step": 660, "lr": 0.001}, {"val": {"loss": 6.198856890201569, "accuracy": 0.092041015625, "attn_entropy": 0.14233317086473107, "param_norm": 129.3720430977534}, "ground_truth": {"loss": 6.733164370059967, "accuracy": 0.026611328125, "attn_entropy": 0.13806897727772593, "param_norm": 129.3720430977534}, "train": {"loss": 2.5742411613464355, "accuracy": 0.369140625, "attn_entropy": 0.049973826855421066, "param_norm": 129.36117019270463}, "step": 670, "lr": 0.001}, {"val": {"loss": 6.190661430358887, "accuracy": 0.091796875, "attn_entropy": 0.14444036222994328, "param_norm": 129.48035365520855}, "ground_truth": {"loss": 6.705735445022583, "accuracy": 0.036376953125, "attn_entropy": 0.13807234028354287, "param_norm": 129.48035365520855}, "train": {"loss": 2.60992431640625, "accuracy": 0.359375, "attn_entropy": 0.049317313358187675, "param_norm": 129.46970949656472}, "step": 680, "lr": 0.001}, {"val": {"loss": 6.22638738155365, "accuracy": 0.107177734375, "attn_entropy": 0.14286868320778012, "param_norm": 129.5903033091281}, "ground_truth": {"loss": 6.933548748493195, "accuracy": 0.0439453125, "attn_entropy": 0.1359303668141365, "param_norm": 129.5903033091281}, "train": {"loss": 2.4956746101379395, "accuracy": 0.376953125, "attn_entropy": 0.04734800569713116, "param_norm": 129.57935656456542}, "step": 690, "lr": 0.001}, {"val": {"loss": 6.305947721004486, "accuracy": 0.09814453125, "attn_entropy": 0.141803290694952, "param_norm": 129.69176964883988}, "ground_truth": {"loss": 6.849418044090271, "accuracy": 0.039306640625, "attn_entropy": 0.13585489755496383, "param_norm": 129.69176964883988}, "train": {"loss": 2.4538209438323975, "accuracy": 0.412109375, "attn_entropy": 0.04605143517255783, "param_norm": 129.68183366604958}, "step": 700, "lr": 0.001}, {"val": {"loss": 6.2642951011657715, "accuracy": 0.111328125, "attn_entropy": 0.14207493513822556, "param_norm": 129.79408503090352}, "ground_truth": {"loss": 6.930825352668762, "accuracy": 0.03662109375, "attn_entropy": 0.13655178854241967, "param_norm": 129.79408503090352}, "train": {"loss": 2.546834945678711, "accuracy": 0.380859375, "attn_entropy": 0.0463013481348753, "param_norm": 129.78378124332477}, "step": 710, "lr": 0.001}, {"val": {"loss": 6.2475462555885315, "accuracy": 0.125244140625, "attn_entropy": 0.1429090118035674, "param_norm": 129.89433420986157}, "ground_truth": {"loss": 6.978228807449341, "accuracy": 0.0361328125, "attn_entropy": 0.136022686958313, "param_norm": 129.89433420986157}, "train": {"loss": 2.5472540855407715, "accuracy": 0.3671875, "attn_entropy": 0.047622199170291424, "param_norm": 129.88413323380547}, "step": 720, "lr": 0.001}, {"val": {"loss": 6.242454290390015, "accuracy": 0.115234375, "attn_entropy": 0.14359008334577084, "param_norm": 129.99664777507977}, "ground_truth": {"loss": 6.963703513145447, "accuracy": 0.0380859375, "attn_entropy": 0.1372537463903427, "param_norm": 129.99664777507977}, "train": {"loss": 2.3511741161346436, "accuracy": 0.427734375, "attn_entropy": 0.044066001661121845, "param_norm": 129.9863372525483}, "step": 730, "lr": 0.001}, {"val": {"loss": 6.349774777889252, "accuracy": 0.123291015625, "attn_entropy": 0.14382068533450365, "param_norm": 130.0967996736317}, "ground_truth": {"loss": 6.957959413528442, "accuracy": 0.035400390625, "attn_entropy": 0.13656998565420508, "param_norm": 130.0967996736317}, "train": {"loss": 2.3591768741607666, "accuracy": 0.43359375, "attn_entropy": 0.04471198935061693, "param_norm": 130.08667850864668}, "step": 740, "lr": 0.001}, {"val": {"loss": 6.313379526138306, "accuracy": 0.125, "attn_entropy": 0.1450937334448099, "param_norm": 130.201225913615}, "ground_truth": {"loss": 7.1234694719314575, "accuracy": 0.03515625, "attn_entropy": 0.14003823651000857, "param_norm": 130.201225913615}, "train": {"loss": 2.3687686920166016, "accuracy": 0.384765625, "attn_entropy": 0.048983871936798096, "param_norm": 130.19062444631334}, "step": 750, "lr": 0.001}, {"val": {"loss": 6.324999988079071, "accuracy": 0.124267578125, "attn_entropy": 0.143969495780766, "param_norm": 130.301703150752}, "ground_truth": {"loss": 7.009430408477783, "accuracy": 0.03515625, "attn_entropy": 0.1374495760537684, "param_norm": 130.301703150752}, "train": {"loss": 2.3227710723876953, "accuracy": 0.421875, "attn_entropy": 0.04707396402955055, "param_norm": 130.29203743265117}, "step": 760, "lr": 0.001}, {"val": {"loss": 6.383530497550964, "accuracy": 0.1337890625, "attn_entropy": 0.1429606368765235, "param_norm": 130.40107035894331}, "ground_truth": {"loss": 7.019847512245178, "accuracy": 0.0361328125, "attn_entropy": 0.1363246669061482, "param_norm": 130.40107035894331}, "train": {"loss": 2.2849931716918945, "accuracy": 0.423828125, "attn_entropy": 0.043799785897135735, "param_norm": 130.39106681173476}, "step": 770, "lr": 0.001}, {"val": {"loss": 6.323022246360779, "accuracy": 0.137451171875, "attn_entropy": 0.14197804499417543, "param_norm": 130.50050929089346}, "ground_truth": {"loss": 7.133639872074127, "accuracy": 0.039306640625, "attn_entropy": 0.13631078600883484, "param_norm": 130.50050929089346}, "train": {"loss": 2.24739933013916, "accuracy": 0.4375, "attn_entropy": 0.04755863733589649, "param_norm": 130.49032653315678}, "step": 780, "lr": 0.001}, {"val": {"loss": 6.379402756690979, "accuracy": 0.144775390625, "attn_entropy": 0.1437868569046259, "param_norm": 130.60122469830227}, "ground_truth": {"loss": 7.14434415102005, "accuracy": 0.043212890625, "attn_entropy": 0.1367576839402318, "param_norm": 130.60122469830227}, "train": {"loss": 2.38877272605896, "accuracy": 0.392578125, "attn_entropy": 0.04729701578617096, "param_norm": 130.59141565683174}, "step": 790, "lr": 0.001}, {"val": {"loss": 6.4558632373809814, "accuracy": 0.140625, "attn_entropy": 0.14323629066348076, "param_norm": 130.70019259751703}, "ground_truth": {"loss": 7.058175563812256, "accuracy": 0.052978515625, "attn_entropy": 0.13689111964777112, "param_norm": 130.70019259751703}, "train": {"loss": 2.255702495574951, "accuracy": 0.431640625, "attn_entropy": 0.04917013645172119, "param_norm": 130.69022428980486}, "step": 800, "lr": 0.001}, {"val": {"loss": 6.477369427680969, "accuracy": 0.151123046875, "attn_entropy": 0.14391022874042392, "param_norm": 130.79797746530696}, "ground_truth": {"loss": 7.12442684173584, "accuracy": 0.05322265625, "attn_entropy": 0.13661266630515456, "param_norm": 130.79797746530696}, "train": {"loss": 2.1881227493286133, "accuracy": 0.447265625, "attn_entropy": 0.049772437661886215, "param_norm": 130.78855038677005}, "step": 810, "lr": 0.001}, {"val": {"loss": 6.395823836326599, "accuracy": 0.156982421875, "attn_entropy": 0.14090826082974672, "param_norm": 130.8965240602768}, "ground_truth": {"loss": 7.1621533036231995, "accuracy": 0.056884765625, "attn_entropy": 0.13397582294419408, "param_norm": 130.8965240602768}, "train": {"loss": 2.161106824874878, "accuracy": 0.474609375, "attn_entropy": 0.04687691852450371, "param_norm": 130.8864483670991}, "step": 820, "lr": 0.001}, {"val": {"loss": 6.516404390335083, "accuracy": 0.150634765625, "attn_entropy": 0.140530358068645, "param_norm": 130.9975027230395}, "ground_truth": {"loss": 7.2465731501579285, "accuracy": 0.050537109375, "attn_entropy": 0.13411842146888375, "param_norm": 130.9975027230395}, "train": {"loss": 2.21846604347229, "accuracy": 0.443359375, "attn_entropy": 0.04265413526445627, "param_norm": 130.98783207263685}, "step": 830, "lr": 0.001}, {"val": {"loss": 6.495717644691467, "accuracy": 0.1572265625, "attn_entropy": 0.13820259179919958, "param_norm": 131.08721177820692}, "ground_truth": {"loss": 7.325007259845734, "accuracy": 0.05517578125, "attn_entropy": 0.13249540980905294, "param_norm": 131.08721177820692}, "train": {"loss": 2.135287046432495, "accuracy": 0.466796875, "attn_entropy": 0.04230879805982113, "param_norm": 131.07822019994418}, "step": 840, "lr": 0.001}, {"val": {"loss": 6.3908790946006775, "accuracy": 0.16455078125, "attn_entropy": 0.14316147845238447, "param_norm": 131.18322208777172}, "ground_truth": {"loss": 7.204154074192047, "accuracy": 0.0625, "attn_entropy": 0.13515661424025893, "param_norm": 131.18322208777172}, "train": {"loss": 2.1120715141296387, "accuracy": 0.462890625, "attn_entropy": 0.04569968953728676, "param_norm": 131.17329079771022}, "step": 850, "lr": 0.001}, {"val": {"loss": 6.457050859928131, "accuracy": 0.168701171875, "attn_entropy": 0.14056321419775486, "param_norm": 131.2754742301711}, "ground_truth": {"loss": 7.423845827579498, "accuracy": 0.050048828125, "attn_entropy": 0.13412692444399, "param_norm": 131.2754742301711}, "train": {"loss": 1.9871724843978882, "accuracy": 0.4921875, "attn_entropy": 0.04775184579193592, "param_norm": 131.26599939734737}, "step": 860, "lr": 0.001}, {"val": {"loss": 6.509527683258057, "accuracy": 0.167236328125, "attn_entropy": 0.1416478306055069, "param_norm": 131.369623537444}, "ground_truth": {"loss": 7.4941394329071045, "accuracy": 0.064697265625, "attn_entropy": 0.13442921312525868, "param_norm": 131.369623537444}, "train": {"loss": 2.132105588912964, "accuracy": 0.43359375, "attn_entropy": 0.04796666279435158, "param_norm": 131.36012534391992}, "step": 870, "lr": 0.001}, {"val": {"loss": 6.461591422557831, "accuracy": 0.18896484375, "attn_entropy": 0.1442437767982483, "param_norm": 131.46213683482804}, "ground_truth": {"loss": 7.578845798969269, "accuracy": 0.0361328125, "attn_entropy": 0.1364967511035502, "param_norm": 131.46213683482804}, "train": {"loss": 2.0965170860290527, "accuracy": 0.44140625, "attn_entropy": 0.04988506808876991, "param_norm": 131.45290291711436}, "step": 880, "lr": 0.001}, {"val": {"loss": 6.492574751377106, "accuracy": 0.189453125, "attn_entropy": 0.14554487075656652, "param_norm": 131.553033914339}, "ground_truth": {"loss": 7.762352406978607, "accuracy": 0.043701171875, "attn_entropy": 0.13875708635896444, "param_norm": 131.553033914339}, "train": {"loss": 1.8473683595657349, "accuracy": 0.517578125, "attn_entropy": 0.05238153785467148, "param_norm": 131.54387119493413}, "step": 890, "lr": 0.001}, {"val": {"loss": 6.4590365290641785, "accuracy": 0.186767578125, "attn_entropy": 0.14227735251188278, "param_norm": 131.64556559341437}, "ground_truth": {"loss": 7.5142077803611755, "accuracy": 0.0556640625, "attn_entropy": 0.13688084576278925, "param_norm": 131.64556559341437}, "train": {"loss": 1.9514391422271729, "accuracy": 0.494140625, "attn_entropy": 0.04678970202803612, "param_norm": 131.63635286131301}, "step": 900, "lr": 0.001}, {"val": {"loss": 6.55405068397522, "accuracy": 0.190185546875, "attn_entropy": 0.14568407740443945, "param_norm": 131.7353887971435}, "ground_truth": {"loss": 7.7845064997673035, "accuracy": 0.050537109375, "attn_entropy": 0.1369158779270947, "param_norm": 131.7353887971435}, "train": {"loss": 2.041367292404175, "accuracy": 0.4765625, "attn_entropy": 0.04572243243455887, "param_norm": 131.7263470749428}, "step": 910, "lr": 0.001}, {"val": {"loss": 6.56335723400116, "accuracy": 0.205810546875, "attn_entropy": 0.14516055397689342, "param_norm": 131.82469322482507}, "ground_truth": {"loss": 7.63233870267868, "accuracy": 0.052490234375, "attn_entropy": 0.1370488777756691, "param_norm": 131.82469322482507}, "train": {"loss": 1.9345064163208008, "accuracy": 0.525390625, "attn_entropy": 0.047363875433802605, "param_norm": 131.81566816488734}, "step": 920, "lr": 0.001}, {"val": {"loss": 6.641290485858917, "accuracy": 0.200927734375, "attn_entropy": 0.1475987220183015, "param_norm": 131.91326974642428}, "ground_truth": {"loss": 7.734895646572113, "accuracy": 0.056640625, "attn_entropy": 0.13856329955160618, "param_norm": 131.91326974642428}, "train": {"loss": 1.952825665473938, "accuracy": 0.5234375, "attn_entropy": 0.05219038389623165, "param_norm": 131.90410501309827}, "step": 930, "lr": 0.001}, {"val": {"loss": 6.558645009994507, "accuracy": 0.204345703125, "attn_entropy": 0.14816910214722157, "param_norm": 132.00341616259539}, "ground_truth": {"loss": 7.801415383815765, "accuracy": 0.054443359375, "attn_entropy": 0.1393920723348856, "param_norm": 132.00341616259539}, "train": {"loss": 1.9344803094863892, "accuracy": 0.4921875, "attn_entropy": 0.05391700938344002, "param_norm": 131.9945823309931}, "step": 940, "lr": 0.001}, {"val": {"loss": 6.560195207595825, "accuracy": 0.203857421875, "attn_entropy": 0.14669060986489058, "param_norm": 132.09094901042002}, "ground_truth": {"loss": 7.86841481924057, "accuracy": 0.049072265625, "attn_entropy": 0.13706188090145588, "param_norm": 132.09094901042002}, "train": {"loss": 1.8005073070526123, "accuracy": 0.560546875, "attn_entropy": 0.04868750460445881, "param_norm": 132.08210793218439}, "step": 950, "lr": 0.001}, {"val": {"loss": 6.593540728092194, "accuracy": 0.20068359375, "attn_entropy": 0.1483736103400588, "param_norm": 132.18277953968152}, "ground_truth": {"loss": 7.741600513458252, "accuracy": 0.066650390625, "attn_entropy": 0.14070353750139475, "param_norm": 132.18277953968152}, "train": {"loss": 1.7617026567459106, "accuracy": 0.55078125, "attn_entropy": 0.05284767784178257, "param_norm": 132.17378690342778}, "step": 960, "lr": 0.001}, {"val": {"loss": 6.58573055267334, "accuracy": 0.221435546875, "attn_entropy": 0.1468591345474124, "param_norm": 132.26549781924453}, "ground_truth": {"loss": 7.873448550701141, "accuracy": 0.070068359375, "attn_entropy": 0.13993114791810513, "param_norm": 132.26549781924453}, "train": {"loss": 1.7164578437805176, "accuracy": 0.583984375, "attn_entropy": 0.05165478214621544, "param_norm": 132.25734991137713}, "step": 970, "lr": 0.001}, {"val": {"loss": 6.667949438095093, "accuracy": 0.223388671875, "attn_entropy": 0.15163006726652384, "param_norm": 132.3509485949836}, "ground_truth": {"loss": 7.938367784023285, "accuracy": 0.06103515625, "attn_entropy": 0.1440594308078289, "param_norm": 132.3509485949836}, "train": {"loss": 1.7594969272613525, "accuracy": 0.5390625, "attn_entropy": 0.05589145049452782, "param_norm": 132.34226749602044}, "step": 980, "lr": 0.001}, {"val": {"loss": 6.646796643733978, "accuracy": 0.21484375, "attn_entropy": 0.15284765418618917, "param_norm": 132.43472203906927}, "ground_truth": {"loss": 7.940712034702301, "accuracy": 0.071044921875, "attn_entropy": 0.14470329508185387, "param_norm": 132.43472203906927}, "train": {"loss": 1.7162933349609375, "accuracy": 0.5546875, "attn_entropy": 0.05404406227171421, "param_norm": 132.42647254752495}, "step": 990, "lr": 0.001}, {"val": {"loss": 6.672303199768066, "accuracy": 0.224365234375, "attn_entropy": 0.1526948492974043, "param_norm": 132.51731600619743}, "ground_truth": {"loss": 7.950656890869141, "accuracy": 0.0556640625, "attn_entropy": 0.1433371566236019, "param_norm": 132.51731600619743}, "train": {"loss": 1.6110775470733643, "accuracy": 0.583984375, "attn_entropy": 0.05910154990851879, "param_norm": 132.5092843607776}, "step": 1000, "lr": 0.001}, {"val": {"loss": 6.629576146602631, "accuracy": 0.233642578125, "attn_entropy": 0.15090758632868528, "param_norm": 132.60123400025762}, "ground_truth": {"loss": 7.947823762893677, "accuracy": 0.053466796875, "attn_entropy": 0.1430882727727294, "param_norm": 132.60123400025762}, "train": {"loss": 1.687793493270874, "accuracy": 0.552734375, "attn_entropy": 0.05445217899978161, "param_norm": 132.5929268013059}, "step": 1010, "lr": 0.001}, {"val": {"loss": 6.751685738563538, "accuracy": 0.2333984375, "attn_entropy": 0.15208821650594473, "param_norm": 132.6812987085148}, "ground_truth": {"loss": 8.045139133930206, "accuracy": 0.06201171875, "attn_entropy": 0.14058320596814156, "param_norm": 132.6812987085148}, "train": {"loss": 1.689959168434143, "accuracy": 0.546875, "attn_entropy": 0.053136829286813736, "param_norm": 132.67307028042762}, "step": 1020, "lr": 0.001}, {"val": {"loss": 6.748387694358826, "accuracy": 0.243408203125, "attn_entropy": 0.1501161763444543, "param_norm": 132.76335153460397}, "ground_truth": {"loss": 8.103020370006561, "accuracy": 0.0673828125, "attn_entropy": 0.14048578683286905, "param_norm": 132.76335153460397}, "train": {"loss": 1.6739771366119385, "accuracy": 0.583984375, "attn_entropy": 0.05633229576051235, "param_norm": 132.75553786770922}, "step": 1030, "lr": 0.001}, {"val": {"loss": 6.528477370738983, "accuracy": 0.258056640625, "attn_entropy": 0.1505907978862524, "param_norm": 132.84759198087707}, "ground_truth": {"loss": 8.076987564563751, "accuracy": 0.05908203125, "attn_entropy": 0.1404043808579445, "param_norm": 132.84759198087707}, "train": {"loss": 1.554580807685852, "accuracy": 0.599609375, "attn_entropy": 0.05247994139790535, "param_norm": 132.83916781779016}, "step": 1040, "lr": 0.001}, {"val": {"loss": 6.857448756694794, "accuracy": 0.236083984375, "attn_entropy": 0.14997955597937107, "param_norm": 132.92759985689892}, "ground_truth": {"loss": 8.226449429988861, "accuracy": 0.062744140625, "attn_entropy": 0.14051463827490807, "param_norm": 132.92759985689892}, "train": {"loss": 1.4767897129058838, "accuracy": 0.583984375, "attn_entropy": 0.056146085262298584, "param_norm": 132.91941320529236}, "step": 1050, "lr": 0.001}, {"val": {"loss": 6.699424684047699, "accuracy": 0.255126953125, "attn_entropy": 0.15087471529841423, "param_norm": 133.00655819255243}, "ground_truth": {"loss": 8.286276519298553, "accuracy": 0.06396484375, "attn_entropy": 0.14047754742205143, "param_norm": 133.00655819255243}, "train": {"loss": 1.437061071395874, "accuracy": 0.6015625, "attn_entropy": 0.05382053181529045, "param_norm": 132.99869484581473}, "step": 1060, "lr": 0.001}, {"val": {"loss": 6.85980761051178, "accuracy": 0.2529296875, "attn_entropy": 0.15103076491504908, "param_norm": 133.08525914130442}, "ground_truth": {"loss": 8.341799020767212, "accuracy": 0.067626953125, "attn_entropy": 0.1409019296988845, "param_norm": 133.08525914130442}, "train": {"loss": 1.563083529472351, "accuracy": 0.55859375, "attn_entropy": 0.051212161779403687, "param_norm": 133.0773202525901}, "step": 1070, "lr": 0.001}, {"val": {"loss": 6.731541872024536, "accuracy": 0.271728515625, "attn_entropy": 0.1506628943607211, "param_norm": 133.16559269597448}, "ground_truth": {"loss": 8.337157487869263, "accuracy": 0.067626953125, "attn_entropy": 0.14075467642396688, "param_norm": 133.16559269597448}, "train": {"loss": 1.4213273525238037, "accuracy": 0.6328125, "attn_entropy": 0.05565654672682285, "param_norm": 133.15789736024374}, "step": 1080, "lr": 0.001}, {"val": {"loss": 6.671071112155914, "accuracy": 0.275390625, "attn_entropy": 0.14976729732006788, "param_norm": 133.2405621544962}, "ground_truth": {"loss": 8.359228491783142, "accuracy": 0.063720703125, "attn_entropy": 0.13965109176933765, "param_norm": 133.2405621544962}, "train": {"loss": 1.4528290033340454, "accuracy": 0.623046875, "attn_entropy": 0.05431170575320721, "param_norm": 133.2332386166567}, "step": 1090, "lr": 0.001}, {"val": {"loss": 6.8115745186805725, "accuracy": 0.259765625, "attn_entropy": 0.15130396094173193, "param_norm": 133.31863853673383}, "ground_truth": {"loss": 8.52622675895691, "accuracy": 0.063720703125, "attn_entropy": 0.13968581892549992, "param_norm": 133.31863853673383}, "train": {"loss": 1.3960812091827393, "accuracy": 0.626953125, "attn_entropy": 0.05259827896952629, "param_norm": 133.31088121000738}, "step": 1100, "lr": 0.001}, {"val": {"loss": 6.9211424589157104, "accuracy": 0.253662109375, "attn_entropy": 0.1511128507554531, "param_norm": 133.39490016101055}, "ground_truth": {"loss": 8.545704007148743, "accuracy": 0.068603515625, "attn_entropy": 0.14151200093328953, "param_norm": 133.39490016101055}, "train": {"loss": 1.382702350616455, "accuracy": 0.65625, "attn_entropy": 0.05377233773469925, "param_norm": 133.3870569793406}, "step": 1110, "lr": 0.001}, {"val": {"loss": 6.932806134223938, "accuracy": 0.265869140625, "attn_entropy": 0.1523549621924758, "param_norm": 133.47087909442328}, "ground_truth": {"loss": 8.473127722740173, "accuracy": 0.0654296875, "attn_entropy": 0.14168414007872343, "param_norm": 133.47087909442328}, "train": {"loss": 1.4590991735458374, "accuracy": 0.6171875, "attn_entropy": 0.05536677688360214, "param_norm": 133.4631778627477}, "step": 1120, "lr": 0.001}, {"val": {"loss": 6.85315603017807, "accuracy": 0.271728515625, "attn_entropy": 0.15257858950644732, "param_norm": 133.541356946712}, "ground_truth": {"loss": 8.589128494262695, "accuracy": 0.060791015625, "attn_entropy": 0.1408103797584772, "param_norm": 133.541356946712}, "train": {"loss": 1.3092079162597656, "accuracy": 0.66796875, "attn_entropy": 0.05609319359064102, "param_norm": 133.53435386164452}, "step": 1130, "lr": 0.001}, {"val": {"loss": 7.11206978559494, "accuracy": 0.2607421875, "attn_entropy": 0.1543256351724267, "param_norm": 133.6151398115594}, "ground_truth": {"loss": 8.720173239707947, "accuracy": 0.066162109375, "attn_entropy": 0.14355160482227802, "param_norm": 133.6151398115594}, "train": {"loss": 1.39925217628479, "accuracy": 0.642578125, "attn_entropy": 0.05526235140860081, "param_norm": 133.60764706078353}, "step": 1140, "lr": 0.001}, {"val": {"loss": 6.981339752674103, "accuracy": 0.269287109375, "attn_entropy": 0.15670764353126287, "param_norm": 133.68597863118814}, "ground_truth": {"loss": 8.828309893608093, "accuracy": 0.061767578125, "attn_entropy": 0.1468557072803378, "param_norm": 133.68597863118814}, "train": {"loss": 1.3032081127166748, "accuracy": 0.65625, "attn_entropy": 0.058162251487374306, "param_norm": 133.67887952155868}, "step": 1150, "lr": 0.001}, {"val": {"loss": 6.982315361499786, "accuracy": 0.27197265625, "attn_entropy": 0.15654195100069046, "param_norm": 133.75731500471036}, "ground_truth": {"loss": 8.867943525314331, "accuracy": 0.05908203125, "attn_entropy": 0.1464151917025447, "param_norm": 133.75731500471036}, "train": {"loss": 1.2329027652740479, "accuracy": 0.66015625, "attn_entropy": 0.05370042659342289, "param_norm": 133.75030877654805}, "step": 1160, "lr": 0.001}, {"val": {"loss": 7.020178198814392, "accuracy": 0.28173828125, "attn_entropy": 0.15415514819324017, "param_norm": 133.82397857250734}, "ground_truth": {"loss": 8.99224865436554, "accuracy": 0.057373046875, "attn_entropy": 0.14522626623511314, "param_norm": 133.82397857250734}, "train": {"loss": 1.2433075904846191, "accuracy": 0.677734375, "attn_entropy": 0.053304946050047874, "param_norm": 133.817077022862}, "step": 1170, "lr": 0.001}, {"val": {"loss": 7.008580982685089, "accuracy": 0.283447265625, "attn_entropy": 0.15438014548271894, "param_norm": 133.89574505842032}, "ground_truth": {"loss": 8.815431952476501, "accuracy": 0.057373046875, "attn_entropy": 0.1448482284322381, "param_norm": 133.89574505842032}, "train": {"loss": 1.2090719938278198, "accuracy": 0.673828125, "attn_entropy": 0.057020217180252075, "param_norm": 133.88899693271057}, "step": 1180, "lr": 0.001}, {"val": {"loss": 7.091413676738739, "accuracy": 0.283447265625, "attn_entropy": 0.15439036674797535, "param_norm": 133.96392575711528}, "ground_truth": {"loss": 8.967347145080566, "accuracy": 0.038330078125, "attn_entropy": 0.14473753422498703, "param_norm": 133.96392575711528}, "train": {"loss": 1.26643705368042, "accuracy": 0.662109375, "attn_entropy": 0.054513536393642426, "param_norm": 133.9566902818952}, "step": 1190, "lr": 0.001}, {"val": {"loss": 6.996750891208649, "accuracy": 0.296630859375, "attn_entropy": 0.15679377131164074, "param_norm": 134.03584269424914}, "ground_truth": {"loss": 8.93121349811554, "accuracy": 0.046142578125, "attn_entropy": 0.14573390409350395, "param_norm": 134.03584269424914}, "train": {"loss": 1.2200989723205566, "accuracy": 0.69140625, "attn_entropy": 0.05689163878560066, "param_norm": 134.02870360340526}, "step": 1200, "lr": 0.001}, {"val": {"loss": 7.046564221382141, "accuracy": 0.293212890625, "attn_entropy": 0.15575683023780584, "param_norm": 134.1052800089113}, "ground_truth": {"loss": 9.161543846130371, "accuracy": 0.048095703125, "attn_entropy": 0.14419273752719164, "param_norm": 134.1052800089113}, "train": {"loss": 1.3555307388305664, "accuracy": 0.630859375, "attn_entropy": 0.05955065041780472, "param_norm": 134.09813697866787}, "step": 1210, "lr": 0.001}, {"val": {"loss": 7.097596108913422, "accuracy": 0.2900390625, "attn_entropy": 0.15384673699736595, "param_norm": 134.17374805187228}, "ground_truth": {"loss": 9.143656373023987, "accuracy": 0.063232421875, "attn_entropy": 0.14273883122950792, "param_norm": 134.17374805187228}, "train": {"loss": 1.225411295890808, "accuracy": 0.650390625, "attn_entropy": 0.05333707481622696, "param_norm": 134.16727383196195}, "step": 1220, "lr": 0.001}, {"val": {"loss": 7.060463786125183, "accuracy": 0.295166015625, "attn_entropy": 0.1536123352125287, "param_norm": 134.2432177683429}, "ground_truth": {"loss": 9.23625910282135, "accuracy": 0.052001953125, "attn_entropy": 0.14321967959403992, "param_norm": 134.2432177683429}, "train": {"loss": 1.27371084690094, "accuracy": 0.6640625, "attn_entropy": 0.05682270973920822, "param_norm": 134.23629186291993}, "step": 1230, "lr": 0.001}, {"val": {"loss": 7.183671057224274, "accuracy": 0.292724609375, "attn_entropy": 0.1537481565028429, "param_norm": 134.3073114210164}, "ground_truth": {"loss": 9.189489364624023, "accuracy": 0.052734375, "attn_entropy": 0.1432297509163618, "param_norm": 134.3073114210164}, "train": {"loss": 1.150278091430664, "accuracy": 0.6953125, "attn_entropy": 0.05581710860133171, "param_norm": 134.30089925297742}, "step": 1240, "lr": 0.001}, {"val": {"loss": 7.177969932556152, "accuracy": 0.2978515625, "attn_entropy": 0.1539715901017189, "param_norm": 134.37267914120685}, "ground_truth": {"loss": 9.28909945487976, "accuracy": 0.049560546875, "attn_entropy": 0.1438888618722558, "param_norm": 134.37267914120685}, "train": {"loss": 1.046342134475708, "accuracy": 0.708984375, "attn_entropy": 0.056521354243159294, "param_norm": 134.36636642970615}, "step": 1250, "lr": 0.001}, {"val": {"loss": 7.225550472736359, "accuracy": 0.29931640625, "attn_entropy": 0.15429329965263605, "param_norm": 134.43497041626145}, "ground_truth": {"loss": 9.453413605690002, "accuracy": 0.0517578125, "attn_entropy": 0.14399266708642244, "param_norm": 134.43497041626145}, "train": {"loss": 1.1950660943984985, "accuracy": 0.662109375, "attn_entropy": 0.058594200760126114, "param_norm": 134.4288747104693}, "step": 1260, "lr": 0.001}, {"val": {"loss": 7.098898112773895, "accuracy": 0.314453125, "attn_entropy": 0.1548047699034214, "param_norm": 134.49615848760507}, "ground_truth": {"loss": 9.533475041389465, "accuracy": 0.044189453125, "attn_entropy": 0.14545079227536917, "param_norm": 134.49615848760507}, "train": {"loss": 1.148638367652893, "accuracy": 0.701171875, "attn_entropy": 0.053539203479886055, "param_norm": 134.4900054839393}, "step": 1270, "lr": 0.001}, {"val": {"loss": 7.360846102237701, "accuracy": 0.292724609375, "attn_entropy": 0.15567935071885586, "param_norm": 134.55684861745206}, "ground_truth": {"loss": 9.326229214668274, "accuracy": 0.041259765625, "attn_entropy": 0.14446072187274694, "param_norm": 134.55684861745206}, "train": {"loss": 1.150506854057312, "accuracy": 0.6875, "attn_entropy": 0.0559599157422781, "param_norm": 134.55051335294127}, "step": 1280, "lr": 0.001}, {"val": {"loss": 7.287169694900513, "accuracy": 0.290283203125, "attn_entropy": 0.15397128462791443, "param_norm": 134.61829632311344}, "ground_truth": {"loss": 9.4964017868042, "accuracy": 0.03369140625, "attn_entropy": 0.14277732651680708, "param_norm": 134.61829632311344}, "train": {"loss": 1.0486030578613281, "accuracy": 0.708984375, "attn_entropy": 0.05591145344078541, "param_norm": 134.61217128375415}, "step": 1290, "lr": 0.001}, {"val": {"loss": 7.284012794494629, "accuracy": 0.305419921875, "attn_entropy": 0.1531667159870267, "param_norm": 134.67745244985065}, "ground_truth": {"loss": 9.655557036399841, "accuracy": 0.044677734375, "attn_entropy": 0.14285537600517273, "param_norm": 134.67745244985065}, "train": {"loss": 1.157157301902771, "accuracy": 0.697265625, "attn_entropy": 0.05616011843085289, "param_norm": 134.67188971008528}, "step": 1300, "lr": 0.001}, {"val": {"loss": 7.193516671657562, "accuracy": 0.3154296875, "attn_entropy": 0.15761669632047415, "param_norm": 134.73670737372217}, "ground_truth": {"loss": 9.750103235244751, "accuracy": 0.0537109375, "attn_entropy": 0.14358442090451717, "param_norm": 134.73670737372217}, "train": {"loss": 1.119792103767395, "accuracy": 0.69921875, "attn_entropy": 0.05699581652879715, "param_norm": 134.730651224484}, "step": 1310, "lr": 0.001}, {"val": {"loss": 7.3665841817855835, "accuracy": 0.292724609375, "attn_entropy": 0.1562990052625537, "param_norm": 134.79647526897875}, "ground_truth": {"loss": 9.64039957523346, "accuracy": 0.051025390625, "attn_entropy": 0.14582457393407822, "param_norm": 134.79647526897875}, "train": {"loss": 1.1058193445205688, "accuracy": 0.685546875, "attn_entropy": 0.05822897143661976, "param_norm": 134.79088672334782}, "step": 1320, "lr": 0.001}, {"val": {"loss": 7.3337549567222595, "accuracy": 0.308349609375, "attn_entropy": 0.1610816102474928, "param_norm": 134.85933092600774}, "ground_truth": {"loss": 9.825578689575195, "accuracy": 0.044677734375, "attn_entropy": 0.1479335194453597, "param_norm": 134.85933092600774}, "train": {"loss": 0.9866870641708374, "accuracy": 0.734375, "attn_entropy": 0.058770155534148216, "param_norm": 134.85280060325456}, "step": 1330, "lr": 0.001}, {"val": {"loss": 7.286277234554291, "accuracy": 0.3154296875, "attn_entropy": 0.15830580983310938, "param_norm": 134.9162931093292}, "ground_truth": {"loss": 9.753530144691467, "accuracy": 0.052978515625, "attn_entropy": 0.14816882088780403, "param_norm": 134.9162931093292}, "train": {"loss": 0.9864040017127991, "accuracy": 0.732421875, "attn_entropy": 0.05627388320863247, "param_norm": 134.9109165031501}, "step": 1340, "lr": 0.001}, {"val": {"loss": 7.424665689468384, "accuracy": 0.303955078125, "attn_entropy": 0.16015184111893177, "param_norm": 134.97548118001853}, "ground_truth": {"loss": 9.811126828193665, "accuracy": 0.04833984375, "attn_entropy": 0.14855588972568512, "param_norm": 134.97548118001853}, "train": {"loss": 0.9688906073570251, "accuracy": 0.75, "attn_entropy": 0.057312991470098495, "param_norm": 134.96964132295517}, "step": 1350, "lr": 0.001}, {"val": {"loss": 7.338508069515228, "accuracy": 0.311279296875, "attn_entropy": 0.15858780033886433, "param_norm": 135.03132812595513}, "ground_truth": {"loss": 10.059670805931091, "accuracy": 0.04833984375, "attn_entropy": 0.14688625559210777, "param_norm": 135.03132812595513}, "train": {"loss": 1.0346043109893799, "accuracy": 0.71875, "attn_entropy": 0.05830579809844494, "param_norm": 135.0255725771028}, "step": 1360, "lr": 0.001}, {"val": {"loss": 7.458299517631531, "accuracy": 0.30859375, "attn_entropy": 0.16154550854116678, "param_norm": 135.08708448845823}, "ground_truth": {"loss": 9.73602819442749, "accuracy": 0.042236328125, "attn_entropy": 0.14793668035417795, "param_norm": 135.08708448845823}, "train": {"loss": 1.0242887735366821, "accuracy": 0.71875, "attn_entropy": 0.061548760160803795, "param_norm": 135.08181627435766}, "step": 1370, "lr": 0.001}, {"val": {"loss": 7.570370078086853, "accuracy": 0.301025390625, "attn_entropy": 0.16364349331706762, "param_norm": 135.1412487718567}, "ground_truth": {"loss": 10.061837553977966, "accuracy": 0.04296875, "attn_entropy": 0.1497750710695982, "param_norm": 135.1412487718567}, "train": {"loss": 0.9287196397781372, "accuracy": 0.74609375, "attn_entropy": 0.059861911460757256, "param_norm": 135.13584206025624}, "step": 1380, "lr": 0.001}, {"val": {"loss": 7.5443931221961975, "accuracy": 0.311767578125, "attn_entropy": 0.16441610641777515, "param_norm": 135.19917279700334}, "ground_truth": {"loss": 10.072293639183044, "accuracy": 0.03857421875, "attn_entropy": 0.15225686598569155, "param_norm": 135.19917279700334}, "train": {"loss": 0.9859330654144287, "accuracy": 0.744140625, "attn_entropy": 0.062429649755358696, "param_norm": 135.1933385685608}, "step": 1390, "lr": 0.001}, {"val": {"loss": 7.478211462497711, "accuracy": 0.31396484375, "attn_entropy": 0.16359287314116955, "param_norm": 135.2549392386421}, "ground_truth": {"loss": 10.174487233161926, "accuracy": 0.031005859375, "attn_entropy": 0.15085108764469624, "param_norm": 135.2549392386421}, "train": {"loss": 0.9806210994720459, "accuracy": 0.75, "attn_entropy": 0.06192300468683243, "param_norm": 135.2496228210616}, "step": 1400, "lr": 0.001}, {"val": {"loss": 7.2572062611579895, "accuracy": 0.3369140625, "attn_entropy": 0.1598669895902276, "param_norm": 135.30895745164017}, "ground_truth": {"loss": 10.25480592250824, "accuracy": 0.0341796875, "attn_entropy": 0.14713853038847446, "param_norm": 135.30895745164017}, "train": {"loss": 0.8941558003425598, "accuracy": 0.76171875, "attn_entropy": 0.060689652338624, "param_norm": 135.30330855865506}, "step": 1410, "lr": 0.001}, {"val": {"loss": 7.475717902183533, "accuracy": 0.315673828125, "attn_entropy": 0.15788133814930916, "param_norm": 135.36312245285418}, "ground_truth": {"loss": 10.150002837181091, "accuracy": 0.024658203125, "attn_entropy": 0.14528632815927267, "param_norm": 135.36312245285418}, "train": {"loss": 1.026923656463623, "accuracy": 0.703125, "attn_entropy": 0.057355182245373726, "param_norm": 135.3580202797544}, "step": 1420, "lr": 0.001}, {"val": {"loss": 7.358637452125549, "accuracy": 0.329345703125, "attn_entropy": 0.1572586875408888, "param_norm": 135.41357421764954}, "ground_truth": {"loss": 10.587825894355774, "accuracy": 0.025390625, "attn_entropy": 0.1446309443563223, "param_norm": 135.41357421764954}, "train": {"loss": 0.8597816824913025, "accuracy": 0.771484375, "attn_entropy": 0.05592946708202362, "param_norm": 135.40827844314452}, "step": 1430, "lr": 0.001}, {"val": {"loss": 7.483040392398834, "accuracy": 0.327392578125, "attn_entropy": 0.15831044781953096, "param_norm": 135.466090004566}, "ground_truth": {"loss": 10.467555165290833, "accuracy": 0.02001953125, "attn_entropy": 0.1455615498125553, "param_norm": 135.466090004566}, "train": {"loss": 0.8418178558349609, "accuracy": 0.767578125, "attn_entropy": 0.05881011299788952, "param_norm": 135.4605974170788}, "step": 1440, "lr": 0.001}, {"val": {"loss": 7.488398373126984, "accuracy": 0.316162109375, "attn_entropy": 0.16038503870368004, "param_norm": 135.51629386277384}, "ground_truth": {"loss": 10.627864003181458, "accuracy": 0.03564453125, "attn_entropy": 0.14643794111907482, "param_norm": 135.51629386277384}, "train": {"loss": 0.883367657661438, "accuracy": 0.74609375, "attn_entropy": 0.06028851121664047, "param_norm": 135.51118369715863}, "step": 1450, "lr": 0.001}, {"val": {"loss": 7.4854007959365845, "accuracy": 0.322998046875, "attn_entropy": 0.15828975941985846, "param_norm": 135.56941084907328}, "ground_truth": {"loss": 10.477527379989624, "accuracy": 0.031982421875, "attn_entropy": 0.1464856183156371, "param_norm": 135.56941084907328}, "train": {"loss": 0.8994138836860657, "accuracy": 0.751953125, "attn_entropy": 0.05771423690021038, "param_norm": 135.56406940693392}, "step": 1460, "lr": 0.001}, {"val": {"loss": 7.623428583145142, "accuracy": 0.3203125, "attn_entropy": 0.16112243756651878, "param_norm": 135.6220323162124}, "ground_truth": {"loss": 10.483123064041138, "accuracy": 0.036865234375, "attn_entropy": 0.14749518688768148, "param_norm": 135.6220323162124}, "train": {"loss": 0.8053380846977234, "accuracy": 0.779296875, "attn_entropy": 0.06067848205566406, "param_norm": 135.61682291617515}, "step": 1470, "lr": 0.001}, {"val": {"loss": 7.471230149269104, "accuracy": 0.326904296875, "attn_entropy": 0.16128194518387318, "param_norm": 135.67389571324728}, "ground_truth": {"loss": 10.57746422290802, "accuracy": 0.039306640625, "attn_entropy": 0.14927859790623188, "param_norm": 135.67389571324728}, "train": {"loss": 0.8827365040779114, "accuracy": 0.779296875, "attn_entropy": 0.062319282442331314, "param_norm": 135.66856047546383}, "step": 1480, "lr": 0.001}, {"val": {"loss": 7.5666696429252625, "accuracy": 0.327392578125, "attn_entropy": 0.162291357293725, "param_norm": 135.7262490408937}, "ground_truth": {"loss": 10.55542778968811, "accuracy": 0.030029296875, "attn_entropy": 0.15136240608990192, "param_norm": 135.7262490408937}, "train": {"loss": 0.8570640683174133, "accuracy": 0.759765625, "attn_entropy": 0.06695295870304108, "param_norm": 135.721121295797}, "step": 1490, "lr": 0.001}, {"val": {"loss": 7.604844272136688, "accuracy": 0.319091796875, "attn_entropy": 0.16458838619291782, "param_norm": 135.77687111148956}, "ground_truth": {"loss": 10.508495211601257, "accuracy": 0.026123046875, "attn_entropy": 0.15134608186781406, "param_norm": 135.77687111148956}, "train": {"loss": 0.8262671828269958, "accuracy": 0.765625, "attn_entropy": 0.0615474171936512, "param_norm": 135.77168851363902}, "step": 1500, "lr": 0.001}, {"val": {"loss": 7.737722039222717, "accuracy": 0.3173828125, "attn_entropy": 0.16170535050332546, "param_norm": 135.827082515274}, "ground_truth": {"loss": 10.849370241165161, "accuracy": 0.025634765625, "attn_entropy": 0.1505744308233261, "param_norm": 135.827082515274}, "train": {"loss": 0.8243656158447266, "accuracy": 0.76953125, "attn_entropy": 0.06395720317959785, "param_norm": 135.82237581908026}, "step": 1510, "lr": 0.001}, {"val": {"loss": 7.558975040912628, "accuracy": 0.3310546875, "attn_entropy": 0.16108606196939945, "param_norm": 135.87515813086608}, "ground_truth": {"loss": 10.64706027507782, "accuracy": 0.035400390625, "attn_entropy": 0.14998024608939886, "param_norm": 135.87515813086608}, "train": {"loss": 0.8548240065574646, "accuracy": 0.75390625, "attn_entropy": 0.06676315888762474, "param_norm": 135.87031894502732}, "step": 1520, "lr": 0.001}, {"val": {"loss": 7.773721396923065, "accuracy": 0.32177734375, "attn_entropy": 0.16119077242910862, "param_norm": 135.9223567087702}, "ground_truth": {"loss": 10.958425879478455, "accuracy": 0.02978515625, "attn_entropy": 0.14904613513499498, "param_norm": 135.9223567087702}, "train": {"loss": 0.7679884433746338, "accuracy": 0.791015625, "attn_entropy": 0.06164167821407318, "param_norm": 135.917317269846}, "step": 1530, "lr": 0.001}, {"val": {"loss": 7.602978765964508, "accuracy": 0.334228515625, "attn_entropy": 0.15942994970828295, "param_norm": 135.97086881510242}, "ground_truth": {"loss": 11.111221671104431, "accuracy": 0.0341796875, "attn_entropy": 0.1468021608889103, "param_norm": 135.97086881510242}, "train": {"loss": 0.7179608345031738, "accuracy": 0.798828125, "attn_entropy": 0.05818049982190132, "param_norm": 135.96630148643615}, "step": 1540, "lr": 0.001}, {"val": {"loss": 7.527742505073547, "accuracy": 0.34033203125, "attn_entropy": 0.15971802826970816, "param_norm": 136.01576212356247}, "ground_truth": {"loss": 11.092240452766418, "accuracy": 0.02685546875, "attn_entropy": 0.14684996102005243, "param_norm": 136.01576212356247}, "train": {"loss": 0.7218611836433411, "accuracy": 0.796875, "attn_entropy": 0.057102497667074203, "param_norm": 136.0112022546873}, "step": 1550, "lr": 0.001}, {"val": {"loss": 7.631104588508606, "accuracy": 0.333251953125, "attn_entropy": 0.16359766479581594, "param_norm": 136.06190187519232}, "ground_truth": {"loss": 11.059354424476624, "accuracy": 0.031005859375, "attn_entropy": 0.15061301458626986, "param_norm": 136.06190187519232}, "train": {"loss": 0.8083032369613647, "accuracy": 0.77734375, "attn_entropy": 0.062334101647138596, "param_norm": 136.05707614049348}, "step": 1560, "lr": 0.001}, {"val": {"loss": 7.71394681930542, "accuracy": 0.32568359375, "attn_entropy": 0.1632385915145278, "param_norm": 136.10918860455016}, "ground_truth": {"loss": 10.970557928085327, "accuracy": 0.036865234375, "attn_entropy": 0.15045102778822184, "param_norm": 136.10918860455016}, "train": {"loss": 0.7491000890731812, "accuracy": 0.79296875, "attn_entropy": 0.06046794168651104, "param_norm": 136.10448927716166}, "step": 1570, "lr": 0.001}, {"val": {"loss": 7.580925464630127, "accuracy": 0.33935546875, "attn_entropy": 0.16242075711488724, "param_norm": 136.1561716038774}, "ground_truth": {"loss": 11.028897643089294, "accuracy": 0.0322265625, "attn_entropy": 0.15155836567282677, "param_norm": 136.1561716038774}, "train": {"loss": 0.664326012134552, "accuracy": 0.8203125, "attn_entropy": 0.059952473267912865, "param_norm": 136.1516463391965}, "step": 1580, "lr": 0.001}, {"val": {"loss": 7.86687159538269, "accuracy": 0.32666015625, "attn_entropy": 0.16249411087483168, "param_norm": 136.20286119922798}, "ground_truth": {"loss": 11.272935271263123, "accuracy": 0.04150390625, "attn_entropy": 0.15051737055182457, "param_norm": 136.20286119922798}, "train": {"loss": 0.7343317270278931, "accuracy": 0.794921875, "attn_entropy": 0.060127075761556625, "param_norm": 136.19821171576038}, "step": 1590, "lr": 0.001}, {"val": {"loss": 7.820237398147583, "accuracy": 0.3330078125, "attn_entropy": 0.16401969734579325, "param_norm": 136.2462755274929}, "ground_truth": {"loss": 11.19002091884613, "accuracy": 0.0263671875, "attn_entropy": 0.15231568831950426, "param_norm": 136.2462755274929}, "train": {"loss": 0.7032347917556763, "accuracy": 0.8125, "attn_entropy": 0.06677935272455215, "param_norm": 136.2419666623939}, "step": 1600, "lr": 0.001}, {"val": {"loss": 7.689063847064972, "accuracy": 0.345947265625, "attn_entropy": 0.1631861785426736, "param_norm": 136.28973697594958}, "ground_truth": {"loss": 11.352378606796265, "accuracy": 0.031005859375, "attn_entropy": 0.14952854998409748, "param_norm": 136.28973697594958}, "train": {"loss": 0.7585865259170532, "accuracy": 0.796875, "attn_entropy": 0.06065955199301243, "param_norm": 136.285452801798}, "step": 1610, "lr": 0.001}, {"val": {"loss": 7.802693545818329, "accuracy": 0.330078125, "attn_entropy": 0.16186178009957075, "param_norm": 136.33622281472734}, "ground_truth": {"loss": 11.224408030509949, "accuracy": 0.028076171875, "attn_entropy": 0.1482921876013279, "param_norm": 136.33622281472734}, "train": {"loss": 0.7013623118400574, "accuracy": 0.802734375, "attn_entropy": 0.05673351138830185, "param_norm": 136.3316454999021}, "step": 1620, "lr": 0.001}, {"val": {"loss": 7.746614158153534, "accuracy": 0.336669921875, "attn_entropy": 0.16227718349546194, "param_norm": 136.3781201512016}, "ground_truth": {"loss": 11.322236180305481, "accuracy": 0.02978515625, "attn_entropy": 0.15000308211892843, "param_norm": 136.3781201512016}, "train": {"loss": 0.7184271812438965, "accuracy": 0.79296875, "attn_entropy": 0.06592806428670883, "param_norm": 136.3738897054525}, "step": 1630, "lr": 0.001}, {"val": {"loss": 7.989892601966858, "accuracy": 0.325927734375, "attn_entropy": 0.16480651777237654, "param_norm": 136.42329003764874}, "ground_truth": {"loss": 11.401716947555542, "accuracy": 0.031005859375, "attn_entropy": 0.1506693884730339, "param_norm": 136.42329003764874}, "train": {"loss": 0.5996667146682739, "accuracy": 0.837890625, "attn_entropy": 0.06230994686484337, "param_norm": 136.41868981369979}, "step": 1640, "lr": 0.001}, {"val": {"loss": 7.672362148761749, "accuracy": 0.346923828125, "attn_entropy": 0.16354368347674608, "param_norm": 136.46485597439258}, "ground_truth": {"loss": 11.504400491714478, "accuracy": 0.0361328125, "attn_entropy": 0.15247040521353483, "param_norm": 136.46485597439258}, "train": {"loss": 0.6968143582344055, "accuracy": 0.818359375, "attn_entropy": 0.06148197688162327, "param_norm": 136.46075572263112}, "step": 1650, "lr": 0.001}, {"val": {"loss": 7.825592875480652, "accuracy": 0.33837890625, "attn_entropy": 0.16292086243629456, "param_norm": 136.50682676337422}, "ground_truth": {"loss": 11.489547491073608, "accuracy": 0.0341796875, "attn_entropy": 0.15002126712352037, "param_norm": 136.50682676337422}, "train": {"loss": 0.6591188907623291, "accuracy": 0.806640625, "attn_entropy": 0.060931235551834106, "param_norm": 136.50226442427925}, "step": 1660, "lr": 0.001}, {"val": {"loss": 8.016491115093231, "accuracy": 0.32568359375, "attn_entropy": 0.16049166303128004, "param_norm": 136.55252388353645}, "ground_truth": {"loss": 11.430756092071533, "accuracy": 0.03515625, "attn_entropy": 0.1480254800990224, "param_norm": 136.55252388353645}, "train": {"loss": 0.5977620482444763, "accuracy": 0.833984375, "attn_entropy": 0.05915078520774841, "param_norm": 136.54817008870904}, "step": 1670, "lr": 0.001}, {"val": {"loss": 7.885530889034271, "accuracy": 0.331298828125, "attn_entropy": 0.1600722512230277, "param_norm": 136.5931501999382}, "ground_truth": {"loss": 11.54015028476715, "accuracy": 0.0322265625, "attn_entropy": 0.147452799603343, "param_norm": 136.5931501999382}, "train": {"loss": 0.7167213559150696, "accuracy": 0.798828125, "attn_entropy": 0.058232299983501434, "param_norm": 136.5891468546224}, "step": 1680, "lr": 0.001}, {"val": {"loss": 7.817316889762878, "accuracy": 0.336181640625, "attn_entropy": 0.15897560957819223, "param_norm": 136.6360547371236}, "ground_truth": {"loss": 11.544132232666016, "accuracy": 0.0322265625, "attn_entropy": 0.14615665469318628, "param_norm": 136.6360547371236}, "train": {"loss": 0.6382330060005188, "accuracy": 0.82421875, "attn_entropy": 0.05887998826801777, "param_norm": 136.63160007846696}, "step": 1690, "lr": 0.001}, {"val": {"loss": 7.815709948539734, "accuracy": 0.341796875, "attn_entropy": 0.1623210972175002, "param_norm": 136.678804483802}, "ground_truth": {"loss": 11.659167885780334, "accuracy": 0.0361328125, "attn_entropy": 0.14966264367103577, "param_norm": 136.678804483802}, "train": {"loss": 0.6376196146011353, "accuracy": 0.837890625, "attn_entropy": 0.062116919085383415, "param_norm": 136.67465935442098}, "step": 1700, "lr": 0.001}, {"val": {"loss": 7.808459758758545, "accuracy": 0.342529296875, "attn_entropy": 0.1623482508584857, "param_norm": 136.72017346681977}, "ground_truth": {"loss": 11.5364009141922, "accuracy": 0.038330078125, "attn_entropy": 0.1497204266488552, "param_norm": 136.72017346681977}, "train": {"loss": 0.6900621652603149, "accuracy": 0.826171875, "attn_entropy": 0.057825637981295586, "param_norm": 136.716102761825}, "step": 1710, "lr": 0.001}, {"val": {"loss": 7.794691443443298, "accuracy": 0.350341796875, "attn_entropy": 0.16388008650392294, "param_norm": 136.75943882696228}, "ground_truth": {"loss": 11.475510835647583, "accuracy": 0.028076171875, "attn_entropy": 0.14957917761057615, "param_norm": 136.75943882696228}, "train": {"loss": 0.570124626159668, "accuracy": 0.828125, "attn_entropy": 0.06221058592200279, "param_norm": 136.755288819555}, "step": 1720, "lr": 0.001}, {"val": {"loss": 8.115417778491974, "accuracy": 0.328857421875, "attn_entropy": 0.163494230248034, "param_norm": 136.79943912925344}, "ground_truth": {"loss": 11.78539264202118, "accuracy": 0.023193359375, "attn_entropy": 0.1501092417165637, "param_norm": 136.79943912925344}, "train": {"loss": 0.6374607682228088, "accuracy": 0.8203125, "attn_entropy": 0.05952932685613632, "param_norm": 136.7956568232358}, "step": 1730, "lr": 0.001}, {"val": {"loss": 7.807504832744598, "accuracy": 0.346923828125, "attn_entropy": 0.16275541484355927, "param_norm": 136.83791080966424}, "ground_truth": {"loss": 11.995137453079224, "accuracy": 0.023681640625, "attn_entropy": 0.15093547385185957, "param_norm": 136.83791080966424}, "train": {"loss": 0.572967529296875, "accuracy": 0.8515625, "attn_entropy": 0.059728341177105904, "param_norm": 136.8339197436382}, "step": 1740, "lr": 0.001}, {"val": {"loss": 7.956074833869934, "accuracy": 0.3349609375, "attn_entropy": 0.16303091403096914, "param_norm": 136.87765833932826}, "ground_truth": {"loss": 11.889174580574036, "accuracy": 0.033203125, "attn_entropy": 0.15001262445002794, "param_norm": 136.87765833932826}, "train": {"loss": 0.6058731079101562, "accuracy": 0.82421875, "attn_entropy": 0.05839390493929386, "param_norm": 136.87372436048759}, "step": 1750, "lr": 0.001}, {"val": {"loss": 7.837784171104431, "accuracy": 0.354736328125, "attn_entropy": 0.16434897482395172, "param_norm": 136.91785442771643}, "ground_truth": {"loss": 11.977532744407654, "accuracy": 0.0205078125, "attn_entropy": 0.15153337083756924, "param_norm": 136.91785442771643}, "train": {"loss": 0.6496201157569885, "accuracy": 0.81640625, "attn_entropy": 0.06369256600737572, "param_norm": 136.91379093181325}, "step": 1760, "lr": 0.001}, {"val": {"loss": 7.998736619949341, "accuracy": 0.336669921875, "attn_entropy": 0.16444118693470955, "param_norm": 136.95937305202327}, "ground_truth": {"loss": 11.987584352493286, "accuracy": 0.028564453125, "attn_entropy": 0.1509758122265339, "param_norm": 136.95937305202327}, "train": {"loss": 0.6625908613204956, "accuracy": 0.806640625, "attn_entropy": 0.06390095502138138, "param_norm": 136.95518427615522}, "step": 1770, "lr": 0.001}, {"val": {"loss": 8.055183827877045, "accuracy": 0.34375, "attn_entropy": 0.16147432662546635, "param_norm": 136.99842201123792}, "ground_truth": {"loss": 12.03596818447113, "accuracy": 0.0234375, "attn_entropy": 0.14793912880122662, "param_norm": 136.99842201123792}, "train": {"loss": 0.6351462602615356, "accuracy": 0.828125, "attn_entropy": 0.05941385217010975, "param_norm": 136.99459642016922}, "step": 1780, "lr": 0.001}, {"val": {"loss": 8.027560830116272, "accuracy": 0.3408203125, "attn_entropy": 0.15867079608142376, "param_norm": 137.03775441550326}, "ground_truth": {"loss": 12.031989455223083, "accuracy": 0.02197265625, "attn_entropy": 0.14542373083531857, "param_norm": 137.03775441550326}, "train": {"loss": 0.5720100998878479, "accuracy": 0.828125, "attn_entropy": 0.057753756642341614, "param_norm": 137.0337535540286}, "step": 1790, "lr": 0.001}, {"val": {"loss": 7.957510232925415, "accuracy": 0.347900390625, "attn_entropy": 0.16021871753036976, "param_norm": 137.07698749254857}, "ground_truth": {"loss": 11.922525763511658, "accuracy": 0.025146484375, "attn_entropy": 0.1463689161464572, "param_norm": 137.07698749254857}, "train": {"loss": 0.6412110924720764, "accuracy": 0.82421875, "attn_entropy": 0.0582855474203825, "param_norm": 137.0730975288027}, "step": 1800, "lr": 0.001}, {"val": {"loss": 8.139245212078094, "accuracy": 0.332763671875, "attn_entropy": 0.16117547452449799, "param_norm": 137.11506867593337}, "ground_truth": {"loss": 12.332858800888062, "accuracy": 0.01904296875, "attn_entropy": 0.146618340164423, "param_norm": 137.11506867593337}, "train": {"loss": 0.6035759449005127, "accuracy": 0.84375, "attn_entropy": 0.05950525589287281, "param_norm": 137.11132761801113}, "step": 1810, "lr": 0.001}, {"val": {"loss": 8.009160816669464, "accuracy": 0.345947265625, "attn_entropy": 0.16310820821672678, "param_norm": 137.15137328822325}, "ground_truth": {"loss": 12.271630644798279, "accuracy": 0.0185546875, "attn_entropy": 0.14712505415081978, "param_norm": 137.15137328822325}, "train": {"loss": 0.5961481928825378, "accuracy": 0.85546875, "attn_entropy": 0.06337753124535084, "param_norm": 137.14764074845442}, "step": 1820, "lr": 0.001}, {"val": {"loss": 8.21903246641159, "accuracy": 0.325927734375, "attn_entropy": 0.16171448212116957, "param_norm": 137.18612847685338}, "ground_truth": {"loss": 12.261288166046143, "accuracy": 0.0166015625, "attn_entropy": 0.14619608875364065, "param_norm": 137.18612847685338}, "train": {"loss": 0.6188071370124817, "accuracy": 0.828125, "attn_entropy": 0.05806637369096279, "param_norm": 137.18250874458204}, "step": 1830, "lr": 0.001}, {"val": {"loss": 7.901832044124603, "accuracy": 0.34912109375, "attn_entropy": 0.16052212473005056, "param_norm": 137.22534794284877}, "ground_truth": {"loss": 12.437105178833008, "accuracy": 0.0224609375, "attn_entropy": 0.1477269371971488, "param_norm": 137.22534794284877}, "train": {"loss": 0.6521376967430115, "accuracy": 0.826171875, "attn_entropy": 0.059022193774580956, "param_norm": 137.22164134261328}, "step": 1840, "lr": 0.001}, {"val": {"loss": 8.06445962190628, "accuracy": 0.34423828125, "attn_entropy": 0.16025291290134192, "param_norm": 137.2616789126848}, "ground_truth": {"loss": 12.158936500549316, "accuracy": 0.020263671875, "attn_entropy": 0.14747029729187489, "param_norm": 137.2616789126848}, "train": {"loss": 0.4760778248310089, "accuracy": 0.87109375, "attn_entropy": 0.056537926197052, "param_norm": 137.25793104987915}, "step": 1850, "lr": 0.001}, {"val": {"loss": 7.967782616615295, "accuracy": 0.347412109375, "attn_entropy": 0.1599333742633462, "param_norm": 137.2990556706866}, "ground_truth": {"loss": 12.370863556861877, "accuracy": 0.020263671875, "attn_entropy": 0.14818920567631721, "param_norm": 137.2990556706866}, "train": {"loss": 0.5874852538108826, "accuracy": 0.83203125, "attn_entropy": 0.0605491790920496, "param_norm": 137.29542741368368}, "step": 1860, "lr": 0.001}, {"val": {"loss": 8.023061037063599, "accuracy": 0.346923828125, "attn_entropy": 0.16091595124453306, "param_norm": 137.33309025555124}, "ground_truth": {"loss": 12.354142785072327, "accuracy": 0.02978515625, "attn_entropy": 0.14919789135456085, "param_norm": 137.33309025555124}, "train": {"loss": 0.463312566280365, "accuracy": 0.875, "attn_entropy": 0.05857917480170727, "param_norm": 137.32972624483574}, "step": 1870, "lr": 0.001}, {"val": {"loss": 8.126655578613281, "accuracy": 0.341552734375, "attn_entropy": 0.1623969953507185, "param_norm": 137.36782108166847}, "ground_truth": {"loss": 12.573835849761963, "accuracy": 0.025390625, "attn_entropy": 0.15007303189486265, "param_norm": 137.36782108166847}, "train": {"loss": 0.4766070246696472, "accuracy": 0.86328125, "attn_entropy": 0.06016258895397186, "param_norm": 137.36430932783975}, "step": 1880, "lr": 0.001}, {"val": {"loss": 8.282013595104218, "accuracy": 0.3330078125, "attn_entropy": 0.16248143184930086, "param_norm": 137.40282223941475}, "ground_truth": {"loss": 12.63171637058258, "accuracy": 0.019287109375, "attn_entropy": 0.14910193905234337, "param_norm": 137.40282223941475}, "train": {"loss": 0.5130143761634827, "accuracy": 0.8515625, "attn_entropy": 0.059453412890434265, "param_norm": 137.39943635051165}, "step": 1890, "lr": 0.001}, {"val": {"loss": 7.8906373381614685, "accuracy": 0.356689453125, "attn_entropy": 0.1615024320781231, "param_norm": 137.43780124349612}, "ground_truth": {"loss": 12.404910206794739, "accuracy": 0.021240234375, "attn_entropy": 0.14878973364830017, "param_norm": 137.43780124349612}, "train": {"loss": 0.5321568846702576, "accuracy": 0.837890625, "attn_entropy": 0.06009850837290287, "param_norm": 137.43442512049103}, "step": 1900, "lr": 0.001}, {"val": {"loss": 7.903443336486816, "accuracy": 0.350341796875, "attn_entropy": 0.15952936001121998, "param_norm": 137.47288772924037}, "ground_truth": {"loss": 12.414076209068298, "accuracy": 0.015869140625, "attn_entropy": 0.14721358194947243, "param_norm": 137.47288772924037}, "train": {"loss": 0.5557640194892883, "accuracy": 0.83984375, "attn_entropy": 0.05743701383471489, "param_norm": 137.4693306451915}, "step": 1910, "lr": 0.001}, {"val": {"loss": 8.153428494930267, "accuracy": 0.340576171875, "attn_entropy": 0.1597964596003294, "param_norm": 137.51002417459424}, "ground_truth": {"loss": 12.524073839187622, "accuracy": 0.023681640625, "attn_entropy": 0.1462570009753108, "param_norm": 137.51002417459424}, "train": {"loss": 0.551616370677948, "accuracy": 0.8359375, "attn_entropy": 0.059891603887081146, "param_norm": 137.50645309147112}, "step": 1920, "lr": 0.001}, {"val": {"loss": 8.006024062633514, "accuracy": 0.35595703125, "attn_entropy": 0.1628252351656556, "param_norm": 137.54735337181458}, "ground_truth": {"loss": 12.93327534198761, "accuracy": 0.02001953125, "attn_entropy": 0.14906312432140112, "param_norm": 137.54735337181458}, "train": {"loss": 0.5021471977233887, "accuracy": 0.8828125, "attn_entropy": 0.05837886780500412, "param_norm": 137.54387070756957}, "step": 1930, "lr": 0.001}, {"val": {"loss": 7.886105060577393, "accuracy": 0.359375, "attn_entropy": 0.16594602540135384, "param_norm": 137.5814725276346}, "ground_truth": {"loss": 12.426208734512329, "accuracy": 0.020263671875, "attn_entropy": 0.15237452555447817, "param_norm": 137.5814725276346}, "train": {"loss": 0.4749686121940613, "accuracy": 0.841796875, "attn_entropy": 0.06219695508480072, "param_norm": 137.57812099516352}, "step": 1940, "lr": 0.001}, {"val": {"loss": 8.096261143684387, "accuracy": 0.35009765625, "attn_entropy": 0.1651250822469592, "param_norm": 137.61574067414475}, "ground_truth": {"loss": 12.790099620819092, "accuracy": 0.021240234375, "attn_entropy": 0.1500747874379158, "param_norm": 137.61574067414475}, "train": {"loss": 0.5317234396934509, "accuracy": 0.85546875, "attn_entropy": 0.06329628266394138, "param_norm": 137.61222573029593}, "step": 1950, "lr": 0.001}, {"val": {"loss": 8.016584694385529, "accuracy": 0.359130859375, "attn_entropy": 0.1652633398771286, "param_norm": 137.64992104490875}, "ground_truth": {"loss": 12.616896510124207, "accuracy": 0.0185546875, "attn_entropy": 0.15187581535428762, "param_norm": 137.64992104490875}, "train": {"loss": 0.4533194601535797, "accuracy": 0.875, "attn_entropy": 0.06001260504126549, "param_norm": 137.64656041111087}, "step": 1960, "lr": 0.001}, {"val": {"loss": 8.102457582950592, "accuracy": 0.34912109375, "attn_entropy": 0.16667495761066675, "param_norm": 137.68513391147238}, "ground_truth": {"loss": 12.837989926338196, "accuracy": 0.020751953125, "attn_entropy": 0.15150984656065702, "param_norm": 137.68513391147238}, "train": {"loss": 0.4393598735332489, "accuracy": 0.873046875, "attn_entropy": 0.06400642171502113, "param_norm": 137.68171630115276}, "step": 1970, "lr": 0.001}, {"val": {"loss": 8.120375156402588, "accuracy": 0.3544921875, "attn_entropy": 0.16560668125748634, "param_norm": 137.7187834152227}, "ground_truth": {"loss": 12.904126405715942, "accuracy": 0.015869140625, "attn_entropy": 0.15147277154028416, "param_norm": 137.7187834152227}, "train": {"loss": 0.49166297912597656, "accuracy": 0.8671875, "attn_entropy": 0.06484293192625046, "param_norm": 137.71563118863833}, "step": 1980, "lr": 0.001}, {"val": {"loss": 8.130350053310394, "accuracy": 0.350830078125, "attn_entropy": 0.1658051796257496, "param_norm": 137.74921106833676}, "ground_truth": {"loss": 12.805716753005981, "accuracy": 0.016357421875, "attn_entropy": 0.15133121889084578, "param_norm": 137.74921106833676}, "train": {"loss": 0.4238654375076294, "accuracy": 0.892578125, "attn_entropy": 0.061874596402049065, "param_norm": 137.74620973755077}, "step": 1990, "lr": 0.001}, {"val": {"loss": 8.155346274375916, "accuracy": 0.3486328125, "attn_entropy": 0.16405239328742027, "param_norm": 137.78015093912293}, "ground_truth": {"loss": 12.893370151519775, "accuracy": 0.014404296875, "attn_entropy": 0.15168046671897173, "param_norm": 137.78015093912293}, "train": {"loss": 0.5729089975357056, "accuracy": 0.85546875, "attn_entropy": 0.06302362307906151, "param_norm": 137.77677658048458}, "step": 2000, "lr": 0.001}, {"val": {"loss": 8.092188239097595, "accuracy": 0.35498046875, "attn_entropy": 0.16628301981836557, "param_norm": 137.81532094500048}, "ground_truth": {"loss": 12.981521248817444, "accuracy": 0.0263671875, "attn_entropy": 0.15187724586576223, "param_norm": 137.81532094500048}, "train": {"loss": 0.44510942697525024, "accuracy": 0.884765625, "attn_entropy": 0.06587819196283817, "param_norm": 137.81192173831099}, "step": 2010, "lr": 0.001}, {"val": {"loss": 8.124514400959015, "accuracy": 0.345947265625, "attn_entropy": 0.16472778376191854, "param_norm": 137.847365435103}, "ground_truth": {"loss": 12.94009804725647, "accuracy": 0.026123046875, "attn_entropy": 0.14985905773937702, "param_norm": 137.847365435103}, "train": {"loss": 0.4787285625934601, "accuracy": 0.859375, "attn_entropy": 0.06262945756316185, "param_norm": 137.8441785998352}, "step": 2020, "lr": 0.001}, {"val": {"loss": 8.092815816402435, "accuracy": 0.344970703125, "attn_entropy": 0.16424281988292933, "param_norm": 137.87937110747728}, "ground_truth": {"loss": 12.82057499885559, "accuracy": 0.03857421875, "attn_entropy": 0.14979513734579086, "param_norm": 137.87937110747728}, "train": {"loss": 0.4542636573314667, "accuracy": 0.875, "attn_entropy": 0.061733897775411606, "param_norm": 137.87626028122574}, "step": 2030, "lr": 0.001}, {"val": {"loss": 8.133463740348816, "accuracy": 0.347412109375, "attn_entropy": 0.16293365228921175, "param_norm": 137.9130863856269}, "ground_truth": {"loss": 12.769645690917969, "accuracy": 0.0244140625, "attn_entropy": 0.14872945938259363, "param_norm": 137.9130863856269}, "train": {"loss": 0.4928615689277649, "accuracy": 0.86328125, "attn_entropy": 0.06075315363705158, "param_norm": 137.90981262989723}, "step": 2040, "lr": 0.001}, {"val": {"loss": 8.209565162658691, "accuracy": 0.34521484375, "attn_entropy": 0.16516902204602957, "param_norm": 137.94447834663376}, "ground_truth": {"loss": 13.091483354568481, "accuracy": 0.01904296875, "attn_entropy": 0.15107138082385063, "param_norm": 137.94447834663376}, "train": {"loss": 0.5017064213752747, "accuracy": 0.861328125, "attn_entropy": 0.06359906494617462, "param_norm": 137.94146057234565}, "step": 2050, "lr": 0.001}, {"val": {"loss": 8.10530138015747, "accuracy": 0.3515625, "attn_entropy": 0.16394810192286968, "param_norm": 137.97285628493958}, "ground_truth": {"loss": 12.995928168296814, "accuracy": 0.018798828125, "attn_entropy": 0.14972865115851164, "param_norm": 137.97285628493958}, "train": {"loss": 0.4949682652950287, "accuracy": 0.869140625, "attn_entropy": 0.05946624279022217, "param_norm": 137.97000397066975}, "step": 2060, "lr": 0.001}, {"val": {"loss": 8.293056070804596, "accuracy": 0.343994140625, "attn_entropy": 0.1598057709634304, "param_norm": 138.00537348761475}, "ground_truth": {"loss": 13.165487170219421, "accuracy": 0.0185546875, "attn_entropy": 0.14709703996777534, "param_norm": 138.00537348761475}, "train": {"loss": 0.4539288282394409, "accuracy": 0.884765625, "attn_entropy": 0.05977368727326393, "param_norm": 138.00205809238295}, "step": 2070, "lr": 0.001}, {"val": {"loss": 8.299679934978485, "accuracy": 0.3388671875, "attn_entropy": 0.16090380027890205, "param_norm": 138.0378209815559}, "ground_truth": {"loss": 13.233875751495361, "accuracy": 0.020751953125, "attn_entropy": 0.1471169013530016, "param_norm": 138.0378209815559}, "train": {"loss": 0.5081584453582764, "accuracy": 0.85546875, "attn_entropy": 0.06336180865764618, "param_norm": 138.03463209721846}, "step": 2080, "lr": 0.001}, {"val": {"loss": 8.183617889881134, "accuracy": 0.351806640625, "attn_entropy": 0.16376918461173773, "param_norm": 138.06998142395275}, "ground_truth": {"loss": 13.245975852012634, "accuracy": 0.017333984375, "attn_entropy": 0.14909786358475685, "param_norm": 138.06998142395275}, "train": {"loss": 0.4439650774002075, "accuracy": 0.87109375, "attn_entropy": 0.06310129351913929, "param_norm": 138.0668008055333}, "step": 2090, "lr": 0.001}, {"val": {"loss": 8.452593207359314, "accuracy": 0.333740234375, "attn_entropy": 0.16600222885608673, "param_norm": 138.1022242704862}, "ground_truth": {"loss": 13.197138667106628, "accuracy": 0.023193359375, "attn_entropy": 0.1521841548383236, "param_norm": 138.1022242704862}, "train": {"loss": 0.4346092641353607, "accuracy": 0.88671875, "attn_entropy": 0.0597582682967186, "param_norm": 138.09898544094614}, "step": 2100, "lr": 0.001}, {"val": {"loss": 8.367199182510376, "accuracy": 0.337158203125, "attn_entropy": 0.1642915653064847, "param_norm": 138.13290659225598}, "ground_truth": {"loss": 13.240179300308228, "accuracy": 0.019775390625, "attn_entropy": 0.1507652597501874, "param_norm": 138.13290659225598}, "train": {"loss": 0.4248988628387451, "accuracy": 0.89453125, "attn_entropy": 0.061398670077323914, "param_norm": 138.1299473902434}, "step": 2110, "lr": 0.001}, {"val": {"loss": 8.329169929027557, "accuracy": 0.33984375, "attn_entropy": 0.16365489549934864, "param_norm": 138.1620736589155}, "ground_truth": {"loss": 13.322447061538696, "accuracy": 0.0224609375, "attn_entropy": 0.14950927253812551, "param_norm": 138.1620736589155}, "train": {"loss": 0.4530541002750397, "accuracy": 0.875, "attn_entropy": 0.06449264287948608, "param_norm": 138.1592096078277}, "step": 2120, "lr": 0.001}, {"val": {"loss": 8.206868827342987, "accuracy": 0.34521484375, "attn_entropy": 0.16490303073078394, "param_norm": 138.19416973018485}, "ground_truth": {"loss": 13.302193522453308, "accuracy": 0.02685546875, "attn_entropy": 0.1512007638812065, "param_norm": 138.19416973018485}, "train": {"loss": 0.38132697343826294, "accuracy": 0.8828125, "attn_entropy": 0.06497597508132458, "param_norm": 138.19083818242945}, "step": 2130, "lr": 0.001}, {"val": {"loss": 8.213070571422577, "accuracy": 0.349853515625, "attn_entropy": 0.1662154933437705, "param_norm": 138.2244849333754}, "ground_truth": {"loss": 13.265269041061401, "accuracy": 0.03173828125, "attn_entropy": 0.1526426263153553, "param_norm": 138.2244849333754}, "train": {"loss": 0.3917531371116638, "accuracy": 0.888671875, "attn_entropy": 0.061832696199417114, "param_norm": 138.22183658717432}, "step": 2140, "lr": 0.001}, {"val": {"loss": 8.254315257072449, "accuracy": 0.34619140625, "attn_entropy": 0.16469453740864992, "param_norm": 138.25128685742928}, "ground_truth": {"loss": 13.617405772209167, "accuracy": 0.016845703125, "attn_entropy": 0.15090399887412786, "param_norm": 138.25128685742928}, "train": {"loss": 0.40542638301849365, "accuracy": 0.888671875, "attn_entropy": 0.060250261798501015, "param_norm": 138.248571614249}, "step": 2150, "lr": 0.001}, {"val": {"loss": 8.05515569448471, "accuracy": 0.3662109375, "attn_entropy": 0.16518488246947527, "param_norm": 138.27950014763704}, "ground_truth": {"loss": 13.562121629714966, "accuracy": 0.01123046875, "attn_entropy": 0.15088453236967325, "param_norm": 138.27950014763704}, "train": {"loss": 0.3849875032901764, "accuracy": 0.896484375, "attn_entropy": 0.06349536031484604, "param_norm": 138.27673576071743}, "step": 2160, "lr": 0.001}, {"val": {"loss": 8.083684861660004, "accuracy": 0.35693359375, "attn_entropy": 0.16404518950730562, "param_norm": 138.30739325044576}, "ground_truth": {"loss": 13.380857706069946, "accuracy": 0.015380859375, "attn_entropy": 0.15314686205238104, "param_norm": 138.30739325044576}, "train": {"loss": 0.37023913860321045, "accuracy": 0.892578125, "attn_entropy": 0.06510523706674576, "param_norm": 138.30435619811254}, "step": 2170, "lr": 0.001}, {"val": {"loss": 8.306676626205444, "accuracy": 0.342529296875, "attn_entropy": 0.16525491047650576, "param_norm": 138.33716825170518}, "ground_truth": {"loss": 13.42480456829071, "accuracy": 0.013671875, "attn_entropy": 0.1520578907802701, "param_norm": 138.33716825170518}, "train": {"loss": 0.4132547378540039, "accuracy": 0.904296875, "attn_entropy": 0.06400953978300095, "param_norm": 138.33427242740422}, "step": 2180, "lr": 0.001}, {"val": {"loss": 8.369563043117523, "accuracy": 0.344482421875, "attn_entropy": 0.16459270007908344, "param_norm": 138.36509882682822}, "ground_truth": {"loss": 13.683826446533203, "accuracy": 0.01904296875, "attn_entropy": 0.14944863971322775, "param_norm": 138.36509882682822}, "train": {"loss": 0.45820388197898865, "accuracy": 0.869140625, "attn_entropy": 0.06407399103045464, "param_norm": 138.362125011677}, "step": 2190, "lr": 0.001}, {"val": {"loss": 8.194683730602264, "accuracy": 0.35302734375, "attn_entropy": 0.16392398066818714, "param_norm": 138.39574872623544}, "ground_truth": {"loss": 13.828616857528687, "accuracy": 0.021484375, "attn_entropy": 0.1482604155316949, "param_norm": 138.39574872623544}, "train": {"loss": 0.45553821325302124, "accuracy": 0.87890625, "attn_entropy": 0.06079276278614998, "param_norm": 138.39270764370173}, "step": 2200, "lr": 0.001}, {"val": {"loss": 8.301572322845459, "accuracy": 0.348388671875, "attn_entropy": 0.16523619648069143, "param_norm": 138.4261918352356}, "ground_truth": {"loss": 13.581040859222412, "accuracy": 0.0283203125, "attn_entropy": 0.1475621284916997, "param_norm": 138.4261918352356}, "train": {"loss": 0.4705369770526886, "accuracy": 0.87109375, "attn_entropy": 0.06276329234242439, "param_norm": 138.4231942754254}, "step": 2210, "lr": 0.001}, {"val": {"loss": 8.23274314403534, "accuracy": 0.355224609375, "attn_entropy": 0.1601885398849845, "param_norm": 138.454360174286}, "ground_truth": {"loss": 13.596888899803162, "accuracy": 0.025634765625, "attn_entropy": 0.14427162893116474, "param_norm": 138.454360174286}, "train": {"loss": 0.4217759072780609, "accuracy": 0.888671875, "attn_entropy": 0.06004724279046059, "param_norm": 138.45163797826945}, "step": 2220, "lr": 0.001}, {"val": {"loss": 8.095260858535767, "accuracy": 0.356201171875, "attn_entropy": 0.1608675355091691, "param_norm": 138.48284205242803}, "ground_truth": {"loss": 13.61110246181488, "accuracy": 0.022705078125, "attn_entropy": 0.14512186776846647, "param_norm": 138.48284205242803}, "train": {"loss": 0.43388310074806213, "accuracy": 0.87890625, "attn_entropy": 0.05890798196196556, "param_norm": 138.48014016887336}, "step": 2230, "lr": 0.001}, {"val": {"loss": 8.375630378723145, "accuracy": 0.340087890625, "attn_entropy": 0.1629993412643671, "param_norm": 138.5102999987382}, "ground_truth": {"loss": 13.657725930213928, "accuracy": 0.020263671875, "attn_entropy": 0.1467457413673401, "param_norm": 138.5102999987382}, "train": {"loss": 0.48639795184135437, "accuracy": 0.87109375, "attn_entropy": 0.059597065672278404, "param_norm": 138.50748197407552}, "step": 2240, "lr": 0.001}, {"val": {"loss": 8.356200754642487, "accuracy": 0.342041015625, "attn_entropy": 0.16313583217561245, "param_norm": 138.53755346617675}, "ground_truth": {"loss": 13.74669349193573, "accuracy": 0.01220703125, "attn_entropy": 0.14889850094914436, "param_norm": 138.53755346617675}, "train": {"loss": 0.41172945499420166, "accuracy": 0.88671875, "attn_entropy": 0.05946535989642143, "param_norm": 138.53494097543552}, "step": 2250, "lr": 0.001}, {"val": {"loss": 8.313248097896576, "accuracy": 0.346435546875, "attn_entropy": 0.16547857597470284, "param_norm": 138.5655707104879}, "ground_truth": {"loss": 13.798017382621765, "accuracy": 0.01611328125, "attn_entropy": 0.15012426022440195, "param_norm": 138.5655707104879}, "train": {"loss": 0.39770594239234924, "accuracy": 0.880859375, "attn_entropy": 0.060168689116835594, "param_norm": 138.5624661274586}, "step": 2260, "lr": 0.001}, {"val": {"loss": 8.094353973865509, "accuracy": 0.36181640625, "attn_entropy": 0.16451415978372097, "param_norm": 138.59697642731737}, "ground_truth": {"loss": 13.918594121932983, "accuracy": 0.015380859375, "attn_entropy": 0.1498100971803069, "param_norm": 138.59697642731737}, "train": {"loss": 0.3730602264404297, "accuracy": 0.884765625, "attn_entropy": 0.06217347830533981, "param_norm": 138.59401613772235}, "step": 2270, "lr": 0.001}, {"val": {"loss": 8.100450992584229, "accuracy": 0.360107421875, "attn_entropy": 0.16118908394128084, "param_norm": 138.6259744499091}, "ground_truth": {"loss": 13.556133031845093, "accuracy": 0.019287109375, "attn_entropy": 0.14726930763572454, "param_norm": 138.6259744499091}, "train": {"loss": 0.44373661279678345, "accuracy": 0.880859375, "attn_entropy": 0.06125234439969063, "param_norm": 138.6233317548479}, "step": 2280, "lr": 0.001}, {"val": {"loss": 8.327638924121857, "accuracy": 0.350830078125, "attn_entropy": 0.1634922418743372, "param_norm": 138.65279505245553}, "ground_truth": {"loss": 13.823450565338135, "accuracy": 0.016845703125, "attn_entropy": 0.1489959517493844, "param_norm": 138.65279505245553}, "train": {"loss": 0.40481412410736084, "accuracy": 0.884765625, "attn_entropy": 0.06113901175558567, "param_norm": 138.65020838870615}, "step": 2290, "lr": 0.001}, {"val": {"loss": 8.195975065231323, "accuracy": 0.355224609375, "attn_entropy": 0.16476147063076496, "param_norm": 138.6793262069639}, "ground_truth": {"loss": 13.764610290527344, "accuracy": 0.0126953125, "attn_entropy": 0.15186142269521952, "param_norm": 138.6793262069639}, "train": {"loss": 0.36233824491500854, "accuracy": 0.890625, "attn_entropy": 0.05970449559390545, "param_norm": 138.67662975374807}, "step": 2300, "lr": 0.001}, {"val": {"loss": 8.264694631099701, "accuracy": 0.35107421875, "attn_entropy": 0.1678166976198554, "param_norm": 138.70528027566095}, "ground_truth": {"loss": 13.693044066429138, "accuracy": 0.018310546875, "attn_entropy": 0.1521887481212616, "param_norm": 138.70528027566095}, "train": {"loss": 0.3556455969810486, "accuracy": 0.89453125, "attn_entropy": 0.061675652861595154, "param_norm": 138.70275763035104}, "step": 2310, "lr": 0.001}, {"val": {"loss": 8.211180627346039, "accuracy": 0.35400390625, "attn_entropy": 0.16462127398699522, "param_norm": 138.73121348603675}, "ground_truth": {"loss": 13.971386313438416, "accuracy": 0.015625, "attn_entropy": 0.15126989409327507, "param_norm": 138.73121348603675}, "train": {"loss": 0.3471909165382385, "accuracy": 0.91015625, "attn_entropy": 0.06345649436116219, "param_norm": 138.72871950586196}, "step": 2320, "lr": 0.001}, {"val": {"loss": 8.267753541469574, "accuracy": 0.34912109375, "attn_entropy": 0.1623858055099845, "param_norm": 138.75679004723767}, "ground_truth": {"loss": 13.816125750541687, "accuracy": 0.019287109375, "attn_entropy": 0.1479681571945548, "param_norm": 138.75679004723767}, "train": {"loss": 0.3461330831050873, "accuracy": 0.89453125, "attn_entropy": 0.06300424784421921, "param_norm": 138.75402690600515}, "step": 2330, "lr": 0.001}, {"val": {"loss": 8.27627569437027, "accuracy": 0.352294921875, "attn_entropy": 0.16355421114712954, "param_norm": 138.7866141515873}, "ground_truth": {"loss": 13.849374413490295, "accuracy": 0.02294921875, "attn_entropy": 0.1489897407591343, "param_norm": 138.7866141515873}, "train": {"loss": 0.3740719258785248, "accuracy": 0.892578125, "attn_entropy": 0.06358715705573559, "param_norm": 138.78366292316343}, "step": 2340, "lr": 0.001}, {"val": {"loss": 8.202138066291809, "accuracy": 0.36572265625, "attn_entropy": 0.1657590502873063, "param_norm": 138.8140000094171}, "ground_truth": {"loss": 14.027406096458435, "accuracy": 0.018310546875, "attn_entropy": 0.15283906366676092, "param_norm": 138.8140000094171}, "train": {"loss": 0.39762625098228455, "accuracy": 0.888671875, "attn_entropy": 0.062025997787714005, "param_norm": 138.81138978216373}, "step": 2350, "lr": 0.001}, {"val": {"loss": 8.145528554916382, "accuracy": 0.3623046875, "attn_entropy": 0.16914333403110504, "param_norm": 138.84008051678708}, "ground_truth": {"loss": 14.037593603134155, "accuracy": 0.021240234375, "attn_entropy": 0.15707767382264137, "param_norm": 138.84008051678708}, "train": {"loss": 0.39389604330062866, "accuracy": 0.875, "attn_entropy": 0.06600327417254448, "param_norm": 138.83747167514008}, "step": 2360, "lr": 0.001}, {"val": {"loss": 8.532083213329315, "accuracy": 0.342041015625, "attn_entropy": 0.1715304246172309, "param_norm": 138.86501843149384}, "ground_truth": {"loss": 14.034289360046387, "accuracy": 0.0146484375, "attn_entropy": 0.15752962045371532, "param_norm": 138.86501843149384}, "train": {"loss": 0.38546550273895264, "accuracy": 0.892578125, "attn_entropy": 0.06564612686634064, "param_norm": 138.86247666267582}, "step": 2370, "lr": 0.001}, {"val": {"loss": 8.227188527584076, "accuracy": 0.353515625, "attn_entropy": 0.17103358171880245, "param_norm": 138.89043311102498}, "ground_truth": {"loss": 13.86574912071228, "accuracy": 0.017578125, "attn_entropy": 0.1559220477938652, "param_norm": 138.89043311102498}, "train": {"loss": 0.45124268531799316, "accuracy": 0.876953125, "attn_entropy": 0.06941849365830421, "param_norm": 138.8877874901102}, "step": 2380, "lr": 0.001}, {"val": {"loss": 8.176282227039337, "accuracy": 0.366943359375, "attn_entropy": 0.17105901706963778, "param_norm": 138.91716507748208}, "ground_truth": {"loss": 14.059815764427185, "accuracy": 0.013916015625, "attn_entropy": 0.1557327127084136, "param_norm": 138.91716507748208}, "train": {"loss": 0.3709534704685211, "accuracy": 0.884765625, "attn_entropy": 0.06800703890621662, "param_norm": 138.9146504741291}, "step": 2390, "lr": 0.001}, {"val": {"loss": 8.174547255039215, "accuracy": 0.356689453125, "attn_entropy": 0.16839818563312292, "param_norm": 138.94173172172586}, "ground_truth": {"loss": 14.403562307357788, "accuracy": 0.015625, "attn_entropy": 0.15295026451349258, "param_norm": 138.94173172172586}, "train": {"loss": 0.34426558017730713, "accuracy": 0.89453125, "attn_entropy": 0.06219998560845852, "param_norm": 138.93926852000956}, "step": 2400, "lr": 0.001}, {"val": {"loss": 8.304279685020447, "accuracy": 0.353515625, "attn_entropy": 0.16400590725243092, "param_norm": 138.96610757865608}, "ground_truth": {"loss": 14.265564918518066, "accuracy": 0.013671875, "attn_entropy": 0.15046178176999092, "param_norm": 138.96610757865608}, "train": {"loss": 0.40052253007888794, "accuracy": 0.896484375, "attn_entropy": 0.057275254279375076, "param_norm": 138.96356646858882}, "step": 2410, "lr": 0.001}, {"val": {"loss": 8.39447569847107, "accuracy": 0.345703125, "attn_entropy": 0.16655238904058933, "param_norm": 138.9934104208231}, "ground_truth": {"loss": 14.310059547424316, "accuracy": 0.01611328125, "attn_entropy": 0.1502252696081996, "param_norm": 138.9934104208231}, "train": {"loss": 0.37899139523506165, "accuracy": 0.88671875, "attn_entropy": 0.0630513709038496, "param_norm": 138.99062024264737}, "step": 2420, "lr": 0.001}, {"val": {"loss": 8.280089318752289, "accuracy": 0.354248046875, "attn_entropy": 0.16453604120761156, "param_norm": 139.01945867045904}, "ground_truth": {"loss": 14.05718731880188, "accuracy": 0.015869140625, "attn_entropy": 0.14922945480793715, "param_norm": 139.01945867045904}, "train": {"loss": 0.36728712916374207, "accuracy": 0.8984375, "attn_entropy": 0.062404513359069824, "param_norm": 139.01699877451728}, "step": 2430, "lr": 0.001}, {"val": {"loss": 8.200353741645813, "accuracy": 0.364990234375, "attn_entropy": 0.16359774488955736, "param_norm": 139.04413545089298}, "ground_truth": {"loss": 14.141307711601257, "accuracy": 0.018310546875, "attn_entropy": 0.14797300659120083, "param_norm": 139.04413545089298}, "train": {"loss": 0.31894123554229736, "accuracy": 0.896484375, "attn_entropy": 0.06158389337360859, "param_norm": 139.0415496991675}, "step": 2440, "lr": 0.001}, {"val": {"loss": 8.240836143493652, "accuracy": 0.35791015625, "attn_entropy": 0.16712398827075958, "param_norm": 139.07062370858432}, "ground_truth": {"loss": 14.206290125846863, "accuracy": 0.01416015625, "attn_entropy": 0.15171850752085447, "param_norm": 139.07062370858432}, "train": {"loss": 0.37233999371528625, "accuracy": 0.876953125, "attn_entropy": 0.06478390097618103, "param_norm": 139.06808002361842}, "step": 2450, "lr": 0.001}, {"val": {"loss": 8.066963255405426, "accuracy": 0.36181640625, "attn_entropy": 0.16743036545813084, "param_norm": 139.09302044044856}, "ground_truth": {"loss": 14.255345344543457, "accuracy": 0.01806640625, "attn_entropy": 0.15034098830074072, "param_norm": 139.09302044044856}, "train": {"loss": 0.3265857994556427, "accuracy": 0.88671875, "attn_entropy": 0.06589784845709801, "param_norm": 139.09080845008168}, "step": 2460, "lr": 0.001}, {"val": {"loss": 8.275168478488922, "accuracy": 0.354736328125, "attn_entropy": 0.16567345894873142, "param_norm": 139.11542486471004}, "ground_truth": {"loss": 13.993957877159119, "accuracy": 0.01416015625, "attn_entropy": 0.15066850651055574, "param_norm": 139.11542486471004}, "train": {"loss": 0.33958229422569275, "accuracy": 0.904296875, "attn_entropy": 0.06456420943140984, "param_norm": 139.11309228980184}, "step": 2470, "lr": 0.001}, {"val": {"loss": 8.127984166145325, "accuracy": 0.365234375, "attn_entropy": 0.16682336200028658, "param_norm": 139.14128389668372}, "ground_truth": {"loss": 14.221397995948792, "accuracy": 0.015625, "attn_entropy": 0.151998701505363, "param_norm": 139.14128389668372}, "train": {"loss": 0.3808257281780243, "accuracy": 0.88671875, "attn_entropy": 0.06352773681282997, "param_norm": 139.13851941875362}, "step": 2480, "lr": 0.001}, {"val": {"loss": 8.217484712600708, "accuracy": 0.357421875, "attn_entropy": 0.16855361498892307, "param_norm": 139.16703644019768}, "ground_truth": {"loss": 14.303207397460938, "accuracy": 0.01611328125, "attn_entropy": 0.1524882735684514, "param_norm": 139.16703644019768}, "train": {"loss": 0.33517301082611084, "accuracy": 0.916015625, "attn_entropy": 0.0660983044654131, "param_norm": 139.1644305439704}, "step": 2490, "lr": 0.001}, {"val": {"loss": 8.304609656333923, "accuracy": 0.35107421875, "attn_entropy": 0.1645385092124343, "param_norm": 139.19226241999948}, "ground_truth": {"loss": 14.049719095230103, "accuracy": 0.015869140625, "attn_entropy": 0.1493572909384966, "param_norm": 139.19226241999948}, "train": {"loss": 0.36222538352012634, "accuracy": 0.912109375, "attn_entropy": 0.061224304139614105, "param_norm": 139.1897944054969}, "step": 2500, "lr": 0.001}, {"val": {"loss": 8.268598318099976, "accuracy": 0.349853515625, "attn_entropy": 0.16626290697604418, "param_norm": 139.21841160142276}, "ground_truth": {"loss": 14.066169261932373, "accuracy": 0.014892578125, "attn_entropy": 0.15005963016301394, "param_norm": 139.21841160142276}, "train": {"loss": 0.3080225884914398, "accuracy": 0.919921875, "attn_entropy": 0.06634077616035938, "param_norm": 139.215678714661}, "step": 2510, "lr": 0.001}, {"val": {"loss": 8.247898519039154, "accuracy": 0.3583984375, "attn_entropy": 0.1643233885988593, "param_norm": 139.24392348950994}, "ground_truth": {"loss": 14.353173851966858, "accuracy": 0.017578125, "attn_entropy": 0.14982694014906883, "param_norm": 139.24392348950994}, "train": {"loss": 0.26504796743392944, "accuracy": 0.921875, "attn_entropy": 0.06152636185288429, "param_norm": 139.2414186455685}, "step": 2520, "lr": 0.001}, {"val": {"loss": 8.393517136573792, "accuracy": 0.343994140625, "attn_entropy": 0.1667006118223071, "param_norm": 139.26983817706028}, "ground_truth": {"loss": 14.108107447624207, "accuracy": 0.015625, "attn_entropy": 0.149699492380023, "param_norm": 139.26983817706028}, "train": {"loss": 0.3593745529651642, "accuracy": 0.8984375, "attn_entropy": 0.06299909390509129, "param_norm": 139.26732615215442}, "step": 2530, "lr": 0.001}, {"val": {"loss": 8.390659868717194, "accuracy": 0.34814453125, "attn_entropy": 0.16598703805357218, "param_norm": 139.29387494000014}, "ground_truth": {"loss": 14.388317108154297, "accuracy": 0.019775390625, "attn_entropy": 0.1508786929771304, "param_norm": 139.29387494000014}, "train": {"loss": 0.32202503085136414, "accuracy": 0.91015625, "attn_entropy": 0.06527675874531269, "param_norm": 139.29151212437233}, "step": 2540, "lr": 0.001}, {"val": {"loss": 8.367391705513, "accuracy": 0.350341796875, "attn_entropy": 0.1674485057592392, "param_norm": 139.31862897796438}, "ground_truth": {"loss": 14.44896912574768, "accuracy": 0.018798828125, "attn_entropy": 0.1506497422233224, "param_norm": 139.31862897796438}, "train": {"loss": 0.2800585627555847, "accuracy": 0.912109375, "attn_entropy": 0.06452844478189945, "param_norm": 139.31606333177132}, "step": 2550, "lr": 0.001}, {"val": {"loss": 8.35059106349945, "accuracy": 0.346923828125, "attn_entropy": 0.16789263393729925, "param_norm": 139.34381243704777}, "ground_truth": {"loss": 14.415688037872314, "accuracy": 0.0146484375, "attn_entropy": 0.15106352139264345, "param_norm": 139.34381243704777}, "train": {"loss": 0.32662609219551086, "accuracy": 0.904296875, "attn_entropy": 0.06347088888287544, "param_norm": 139.3412341664762}, "step": 2560, "lr": 0.001}, {"val": {"loss": 8.232928454875946, "accuracy": 0.3544921875, "attn_entropy": 0.1674884306266904, "param_norm": 139.36902383984156}, "ground_truth": {"loss": 14.449187874794006, "accuracy": 0.015869140625, "attn_entropy": 0.15151510667055845, "param_norm": 139.36902383984156}, "train": {"loss": 0.2914965748786926, "accuracy": 0.921875, "attn_entropy": 0.06358969584107399, "param_norm": 139.36657215465627}, "step": 2570, "lr": 0.001}, {"val": {"loss": 8.160524308681488, "accuracy": 0.36083984375, "attn_entropy": 0.16606911085546017, "param_norm": 139.39399252476497}, "ground_truth": {"loss": 14.572635769844055, "accuracy": 0.015869140625, "attn_entropy": 0.14911967981606722, "param_norm": 139.39399252476497}, "train": {"loss": 0.26481616497039795, "accuracy": 0.927734375, "attn_entropy": 0.05993582308292389, "param_norm": 139.39166844595005}, "step": 2580, "lr": 0.001}, {"val": {"loss": 8.292382061481476, "accuracy": 0.35693359375, "attn_entropy": 0.16656935308128595, "param_norm": 139.41976510102384}, "ground_truth": {"loss": 14.218679547309875, "accuracy": 0.013427734375, "attn_entropy": 0.15026975888758898, "param_norm": 139.41976510102384}, "train": {"loss": 0.2860914170742035, "accuracy": 0.92578125, "attn_entropy": 0.062407173216342926, "param_norm": 139.41701659864248}, "step": 2590, "lr": 0.001}, {"val": {"loss": 8.320819437503815, "accuracy": 0.353271484375, "attn_entropy": 0.1654788265004754, "param_norm": 139.44778177475266}, "ground_truth": {"loss": 14.474916338920593, "accuracy": 0.0166015625, "attn_entropy": 0.14908605348318815, "param_norm": 139.44778177475266}, "train": {"loss": 0.29694414138793945, "accuracy": 0.9140625, "attn_entropy": 0.05980436131358147, "param_norm": 139.4448881714538}, "step": 2600, "lr": 0.001}, {"val": {"loss": 8.324604094028473, "accuracy": 0.3525390625, "attn_entropy": 0.16739368345588446, "param_norm": 139.47426769526012}, "ground_truth": {"loss": 14.245057344436646, "accuracy": 0.0234375, "attn_entropy": 0.15139526966959238, "param_norm": 139.47426769526012}, "train": {"loss": 0.3424934148788452, "accuracy": 0.892578125, "attn_entropy": 0.06217852234840393, "param_norm": 139.4718420432011}, "step": 2610, "lr": 0.001}, {"val": {"loss": 8.165650725364685, "accuracy": 0.354248046875, "attn_entropy": 0.16660284344106913, "param_norm": 139.49699522229585}, "ground_truth": {"loss": 14.64272165298462, "accuracy": 0.0166015625, "attn_entropy": 0.15170144848525524, "param_norm": 139.49699522229585}, "train": {"loss": 0.28770124912261963, "accuracy": 0.927734375, "attn_entropy": 0.06547126173973083, "param_norm": 139.494787911143}, "step": 2620, "lr": 0.001}, {"val": {"loss": 8.277398765087128, "accuracy": 0.357421875, "attn_entropy": 0.16533811669796705, "param_norm": 139.52041217874987}, "ground_truth": {"loss": 14.808501124382019, "accuracy": 0.01220703125, "attn_entropy": 0.15179029759019613, "param_norm": 139.52041217874987}, "train": {"loss": 0.298576295375824, "accuracy": 0.912109375, "attn_entropy": 0.06344876252114773, "param_norm": 139.51803551997924}, "step": 2630, "lr": 0.001}, {"val": {"loss": 8.378450393676758, "accuracy": 0.355712890625, "attn_entropy": 0.16455049440264702, "param_norm": 139.54508230661511}, "ground_truth": {"loss": 14.720751762390137, "accuracy": 0.01220703125, "attn_entropy": 0.15055141877382994, "param_norm": 139.54508230661511}, "train": {"loss": 0.2650691866874695, "accuracy": 0.9296875, "attn_entropy": 0.06260770745575428, "param_norm": 139.5423691609109}, "step": 2640, "lr": 0.001}, {"val": {"loss": 8.108110964298248, "accuracy": 0.365966796875, "attn_entropy": 0.1623542718589306, "param_norm": 139.57002389767493}, "ground_truth": {"loss": 14.450552225112915, "accuracy": 0.013916015625, "attn_entropy": 0.14682560227811337, "param_norm": 139.57002389767493}, "train": {"loss": 0.22100800275802612, "accuracy": 0.9375, "attn_entropy": 0.06063351780176163, "param_norm": 139.56756974303067}, "step": 2650, "lr": 0.001}, {"val": {"loss": 8.077710211277008, "accuracy": 0.36328125, "attn_entropy": 0.16168785654008389, "param_norm": 139.5933354660128}, "ground_truth": {"loss": 14.533044695854187, "accuracy": 0.01953125, "attn_entropy": 0.1466139806434512, "param_norm": 139.5933354660128}, "train": {"loss": 0.25969645380973816, "accuracy": 0.93359375, "attn_entropy": 0.0582372285425663, "param_norm": 139.5910609832059}, "step": 2660, "lr": 0.001}, {"val": {"loss": 8.15180617570877, "accuracy": 0.36669921875, "attn_entropy": 0.16117314156144857, "param_norm": 139.61479396907234}, "ground_truth": {"loss": 14.697515964508057, "accuracy": 0.0166015625, "attn_entropy": 0.1457118270918727, "param_norm": 139.61479396907234}, "train": {"loss": 0.24976079165935516, "accuracy": 0.93359375, "attn_entropy": 0.05851919576525688, "param_norm": 139.61260537656628}, "step": 2670, "lr": 0.001}, {"val": {"loss": 8.25476324558258, "accuracy": 0.359375, "attn_entropy": 0.16221454087644815, "param_norm": 139.63828501261935}, "ground_truth": {"loss": 14.874717354774475, "accuracy": 0.01318359375, "attn_entropy": 0.14636447187513113, "param_norm": 139.63828501261935}, "train": {"loss": 0.24359862506389618, "accuracy": 0.927734375, "attn_entropy": 0.05901133641600609, "param_norm": 139.6358416455039}, "step": 2680, "lr": 0.001}, {"val": {"loss": 8.354463875293732, "accuracy": 0.353515625, "attn_entropy": 0.16490480117499828, "param_norm": 139.66429915245138}, "ground_truth": {"loss": 14.70250403881073, "accuracy": 0.01220703125, "attn_entropy": 0.14926790166646242, "param_norm": 139.66429915245138}, "train": {"loss": 0.28818202018737793, "accuracy": 0.92578125, "attn_entropy": 0.06126480735838413, "param_norm": 139.6617413203789}, "step": 2690, "lr": 0.001}, {"val": {"loss": 8.44411015510559, "accuracy": 0.3466796875, "attn_entropy": 0.16485721711069345, "param_norm": 139.68770151171773}, "ground_truth": {"loss": 14.801319003105164, "accuracy": 0.01025390625, "attn_entropy": 0.14881430566310883, "param_norm": 139.68770151171773}, "train": {"loss": 0.29318201541900635, "accuracy": 0.904296875, "attn_entropy": 0.06331544369459152, "param_norm": 139.68543070619492}, "step": 2700, "lr": 0.001}, {"val": {"loss": 8.338893294334412, "accuracy": 0.352294921875, "attn_entropy": 0.16401198972016573, "param_norm": 139.71025624962638}, "ground_truth": {"loss": 14.842319250106812, "accuracy": 0.0185546875, "attn_entropy": 0.14715492818504572, "param_norm": 139.71025624962638}, "train": {"loss": 0.30520394444465637, "accuracy": 0.916015625, "attn_entropy": 0.059538641944527626, "param_norm": 139.70793866289884}, "step": 2710, "lr": 0.001}, {"val": {"loss": 8.481018662452698, "accuracy": 0.351806640625, "attn_entropy": 0.16290408372879028, "param_norm": 139.73365396325121}, "ground_truth": {"loss": 14.83154046535492, "accuracy": 0.017822265625, "attn_entropy": 0.14638963714241982, "param_norm": 139.73365396325121}, "train": {"loss": 0.28019893169403076, "accuracy": 0.931640625, "attn_entropy": 0.060336701571941376, "param_norm": 139.73136731824195}, "step": 2720, "lr": 0.001}, {"val": {"loss": 8.343895077705383, "accuracy": 0.354736328125, "attn_entropy": 0.1619033608585596, "param_norm": 139.75696726441114}, "ground_truth": {"loss": 14.73711884021759, "accuracy": 0.013916015625, "attn_entropy": 0.14530420675873756, "param_norm": 139.75696726441114}, "train": {"loss": 0.2596149444580078, "accuracy": 0.923828125, "attn_entropy": 0.057886816561222076, "param_norm": 139.754666107758}, "step": 2730, "lr": 0.001}, {"val": {"loss": 8.199168682098389, "accuracy": 0.362060546875, "attn_entropy": 0.16057755425572395, "param_norm": 139.781031462364}, "ground_truth": {"loss": 14.879559874534607, "accuracy": 0.0146484375, "attn_entropy": 0.14580897893756628, "param_norm": 139.781031462364}, "train": {"loss": 0.23840849101543427, "accuracy": 0.939453125, "attn_entropy": 0.05737854912877083, "param_norm": 139.7785489332316}, "step": 2740, "lr": 0.001}, {"val": {"loss": 8.121777713298798, "accuracy": 0.36865234375, "attn_entropy": 0.16229425091296434, "param_norm": 139.8048866085629}, "ground_truth": {"loss": 15.003015756607056, "accuracy": 0.016357421875, "attn_entropy": 0.14598784875124693, "param_norm": 139.8048866085629}, "train": {"loss": 0.25900930166244507, "accuracy": 0.912109375, "attn_entropy": 0.05890887603163719, "param_norm": 139.80267639248487}, "step": 2750, "lr": 0.001}, {"val": {"loss": 8.324447989463806, "accuracy": 0.351318359375, "attn_entropy": 0.16224583610892296, "param_norm": 139.82645876364106}, "ground_truth": {"loss": 14.835307359695435, "accuracy": 0.015625, "attn_entropy": 0.14767432771623135, "param_norm": 139.82645876364106}, "train": {"loss": 0.2653326094150543, "accuracy": 0.919921875, "attn_entropy": 0.060847945511341095, "param_norm": 139.82428229040522}, "step": 2760, "lr": 0.001}, {"val": {"loss": 8.436955451965332, "accuracy": 0.344970703125, "attn_entropy": 0.16091341990977526, "param_norm": 139.84842570799557}, "ground_truth": {"loss": 14.938928604125977, "accuracy": 0.015869140625, "attn_entropy": 0.14661003462970257, "param_norm": 139.84842570799557}, "train": {"loss": 0.3122747540473938, "accuracy": 0.91796875, "attn_entropy": 0.06321876496076584, "param_norm": 139.8461677586812}, "step": 2770, "lr": 0.001}, {"val": {"loss": 8.220896601676941, "accuracy": 0.361572265625, "attn_entropy": 0.163389902561903, "param_norm": 139.8729125956742}, "ground_truth": {"loss": 15.002240419387817, "accuracy": 0.01611328125, "attn_entropy": 0.14892870653420687, "param_norm": 139.8729125956742}, "train": {"loss": 0.23020830750465393, "accuracy": 0.9375, "attn_entropy": 0.06360145285725594, "param_norm": 139.87053036696815}, "step": 2780, "lr": 0.001}, {"val": {"loss": 8.400156021118164, "accuracy": 0.354736328125, "attn_entropy": 0.16412993893027306, "param_norm": 139.89497158209292}, "ground_truth": {"loss": 15.39641261100769, "accuracy": 0.015869140625, "attn_entropy": 0.14671881310641766, "param_norm": 139.89497158209292}, "train": {"loss": 0.2430298924446106, "accuracy": 0.935546875, "attn_entropy": 0.06470335274934769, "param_norm": 139.89272811817696}, "step": 2790, "lr": 0.001}, {"val": {"loss": 8.291721403598785, "accuracy": 0.3623046875, "attn_entropy": 0.1631983919069171, "param_norm": 139.91834785391748}, "ground_truth": {"loss": 14.877107858657837, "accuracy": 0.0146484375, "attn_entropy": 0.1469989027827978, "param_norm": 139.91834785391748}, "train": {"loss": 0.27917414903640747, "accuracy": 0.927734375, "attn_entropy": 0.059730466455221176, "param_norm": 139.91605383421714}, "step": 2800, "lr": 0.001}, {"val": {"loss": 8.172326982021332, "accuracy": 0.362548828125, "attn_entropy": 0.1627962002530694, "param_norm": 139.94222892359696}, "ground_truth": {"loss": 15.068322539329529, "accuracy": 0.014404296875, "attn_entropy": 0.1474113455042243, "param_norm": 139.94222892359696}, "train": {"loss": 0.3164635896682739, "accuracy": 0.912109375, "attn_entropy": 0.06004960089921951, "param_norm": 139.93985434769087}, "step": 2810, "lr": 0.001}, {"val": {"loss": 8.309932172298431, "accuracy": 0.360107421875, "attn_entropy": 0.16387510020285845, "param_norm": 139.9649943027047}, "ground_truth": {"loss": 15.26457130908966, "accuracy": 0.0107421875, "attn_entropy": 0.14803560823202133, "param_norm": 139.9649943027047}, "train": {"loss": 0.2894541919231415, "accuracy": 0.91015625, "attn_entropy": 0.05784774571657181, "param_norm": 139.96269734426335}, "step": 2820, "lr": 0.001}, {"val": {"loss": 8.200239062309265, "accuracy": 0.357421875, "attn_entropy": 0.16378873959183693, "param_norm": 139.98610135931162}, "ground_truth": {"loss": 15.081050753593445, "accuracy": 0.012451171875, "attn_entropy": 0.1487109661102295, "param_norm": 139.98610135931162}, "train": {"loss": 0.26097267866134644, "accuracy": 0.923828125, "attn_entropy": 0.06066010147333145, "param_norm": 139.98408037191496}, "step": 2830, "lr": 0.001}, {"val": {"loss": 8.331611216068268, "accuracy": 0.35546875, "attn_entropy": 0.16542746126651764, "param_norm": 140.0070755145558}, "ground_truth": {"loss": 14.922357201576233, "accuracy": 0.01123046875, "attn_entropy": 0.14846874587237835, "param_norm": 140.0070755145558}, "train": {"loss": 0.3204573690891266, "accuracy": 0.916015625, "attn_entropy": 0.06135574355721474, "param_norm": 140.00498305823274}, "step": 2840, "lr": 0.001}, {"val": {"loss": 8.281346678733826, "accuracy": 0.355712890625, "attn_entropy": 0.1631315601989627, "param_norm": 140.02935520082846}, "ground_truth": {"loss": 14.897935152053833, "accuracy": 0.016845703125, "attn_entropy": 0.1491320850327611, "param_norm": 140.02935520082846}, "train": {"loss": 0.2375585436820984, "accuracy": 0.923828125, "attn_entropy": 0.06605241447687149, "param_norm": 140.02698122261154}, "step": 2850, "lr": 0.001}, {"val": {"loss": 8.26364940404892, "accuracy": 0.357177734375, "attn_entropy": 0.1628702562302351, "param_norm": 140.05166934898998}, "ground_truth": {"loss": 14.952768206596375, "accuracy": 0.025390625, "attn_entropy": 0.14627385651692748, "param_norm": 140.05166934898998}, "train": {"loss": 0.267803817987442, "accuracy": 0.916015625, "attn_entropy": 0.060606881976127625, "param_norm": 140.0494403392949}, "step": 2860, "lr": 0.001}, {"val": {"loss": 8.241568446159363, "accuracy": 0.362548828125, "attn_entropy": 0.16210780199617147, "param_norm": 140.0729590863319}, "ground_truth": {"loss": 15.343859434127808, "accuracy": 0.01708984375, "attn_entropy": 0.14645542297512293, "param_norm": 140.0729590863319}, "train": {"loss": 0.33583566546440125, "accuracy": 0.919921875, "attn_entropy": 0.059460535645484924, "param_norm": 140.07089492709315}, "step": 2870, "lr": 0.001}, {"val": {"loss": 8.186956763267517, "accuracy": 0.36376953125, "attn_entropy": 0.16211146116256714, "param_norm": 140.09499137600935}, "ground_truth": {"loss": 15.027029037475586, "accuracy": 0.01416015625, "attn_entropy": 0.14659858588129282, "param_norm": 140.09499137600935}, "train": {"loss": 0.24950088560581207, "accuracy": 0.927734375, "attn_entropy": 0.055827829986810684, "param_norm": 140.0926458034763}, "step": 2880, "lr": 0.001}, {"val": {"loss": 8.035139679908752, "accuracy": 0.364990234375, "attn_entropy": 0.1610305095091462, "param_norm": 140.1163324207688}, "ground_truth": {"loss": 14.97884452342987, "accuracy": 0.0126953125, "attn_entropy": 0.14667611196637154, "param_norm": 140.1163324207688}, "train": {"loss": 0.30008023977279663, "accuracy": 0.927734375, "attn_entropy": 0.06220069341361523, "param_norm": 140.11420975927592}, "step": 2890, "lr": 0.001}, {"val": {"loss": 8.188479781150818, "accuracy": 0.355712890625, "attn_entropy": 0.16243493556976318, "param_norm": 140.1369440395126}, "ground_truth": {"loss": 14.845456838607788, "accuracy": 0.014404296875, "attn_entropy": 0.14571833610534668, "param_norm": 140.1369440395126}, "train": {"loss": 0.29833370447158813, "accuracy": 0.91015625, "attn_entropy": 0.06081651896238327, "param_norm": 140.13502392249737}, "step": 2900, "lr": 0.001}, {"val": {"loss": 8.204384088516235, "accuracy": 0.3662109375, "attn_entropy": 0.1608986584469676, "param_norm": 140.15734184715913}, "ground_truth": {"loss": 14.943259000778198, "accuracy": 0.011962890625, "attn_entropy": 0.1446540728211403, "param_norm": 140.15734184715913}, "train": {"loss": 0.2284194529056549, "accuracy": 0.955078125, "attn_entropy": 0.058407969772815704, "param_norm": 140.15529677806876}, "step": 2910, "lr": 0.001}, {"val": {"loss": 8.347946763038635, "accuracy": 0.35009765625, "attn_entropy": 0.16050404030829668, "param_norm": 140.17857403964857}, "ground_truth": {"loss": 15.149465322494507, "accuracy": 0.01025390625, "attn_entropy": 0.14357600454241037, "param_norm": 140.17857403964857}, "train": {"loss": 0.2387227714061737, "accuracy": 0.931640625, "attn_entropy": 0.05550340190529823, "param_norm": 140.1764289215399}, "step": 2920, "lr": 0.001}, {"val": {"loss": 8.12596708536148, "accuracy": 0.363037109375, "attn_entropy": 0.16189788561314344, "param_norm": 140.19873629332855}, "ground_truth": {"loss": 15.288935661315918, "accuracy": 0.01220703125, "attn_entropy": 0.14463628455996513, "param_norm": 140.19873629332855}, "train": {"loss": 0.29503992199897766, "accuracy": 0.921875, "attn_entropy": 0.06256017088890076, "param_norm": 140.19676574964623}, "step": 2930, "lr": 0.001}, {"val": {"loss": 8.458229959011078, "accuracy": 0.343505859375, "attn_entropy": 0.16339483205229044, "param_norm": 140.21862292002356}, "ground_truth": {"loss": 15.344564437866211, "accuracy": 0.017578125, "attn_entropy": 0.14673906844109297, "param_norm": 140.21862292002356}, "train": {"loss": 0.2730308473110199, "accuracy": 0.91796875, "attn_entropy": 0.058458294719457626, "param_norm": 140.21661700349443}, "step": 2940, "lr": 0.001}, {"val": {"loss": 8.23696893453598, "accuracy": 0.357177734375, "attn_entropy": 0.16377810668200254, "param_norm": 140.2404421534684}, "ground_truth": {"loss": 15.447857141494751, "accuracy": 0.01708984375, "attn_entropy": 0.1482077343389392, "param_norm": 140.2404421534684}, "train": {"loss": 0.3079245090484619, "accuracy": 0.91796875, "attn_entropy": 0.06211080588400364, "param_norm": 140.23824193494943}, "step": 2950, "lr": 0.001}, {"val": {"loss": 8.228378415107727, "accuracy": 0.360107421875, "attn_entropy": 0.1675547081977129, "param_norm": 140.26384237414916}, "ground_truth": {"loss": 15.058822631835938, "accuracy": 0.011962890625, "attn_entropy": 0.15008259285241365, "param_norm": 140.26384237414916}, "train": {"loss": 0.2711833715438843, "accuracy": 0.935546875, "attn_entropy": 0.0643517766147852, "param_norm": 140.2615535141345}, "step": 2960, "lr": 0.001}, {"val": {"loss": 8.268938601016998, "accuracy": 0.353515625, "attn_entropy": 0.16564921475946903, "param_norm": 140.2873997472651}, "ground_truth": {"loss": 15.159360885620117, "accuracy": 0.013916015625, "attn_entropy": 0.1496878806501627, "param_norm": 140.2873997472651}, "train": {"loss": 0.27212193608283997, "accuracy": 0.92578125, "attn_entropy": 0.06702454760670662, "param_norm": 140.2851420615943}, "step": 2970, "lr": 0.001}, {"val": {"loss": 8.501301527023315, "accuracy": 0.34423828125, "attn_entropy": 0.16781286802142859, "param_norm": 140.30932245490362}, "ground_truth": {"loss": 15.26177167892456, "accuracy": 0.013671875, "attn_entropy": 0.15205155778676271, "param_norm": 140.30932245490362}, "train": {"loss": 0.23092995584011078, "accuracy": 0.9453125, "attn_entropy": 0.06604807823896408, "param_norm": 140.30718006348044}, "step": 2980, "lr": 0.001}, {"val": {"loss": 8.352144658565521, "accuracy": 0.359375, "attn_entropy": 0.16820912435650826, "param_norm": 140.3311290396064}, "ground_truth": {"loss": 15.528480052947998, "accuracy": 0.014892578125, "attn_entropy": 0.15093448385596275, "param_norm": 140.3311290396064}, "train": {"loss": 0.2412765920162201, "accuracy": 0.927734375, "attn_entropy": 0.06474734097719193, "param_norm": 140.328885787615}, "step": 2990, "lr": 0.001}, {"val": {"loss": 8.365189552307129, "accuracy": 0.356201171875, "attn_entropy": 0.16837482247501612, "param_norm": 140.35320008031502}, "ground_truth": {"loss": 15.645561218261719, "accuracy": 0.01513671875, "attn_entropy": 0.15111095644533634, "param_norm": 140.35320008031502}, "train": {"loss": 0.24468913674354553, "accuracy": 0.939453125, "attn_entropy": 0.0687379278242588, "param_norm": 140.35104455785952}, "step": 3000, "lr": 0.001}, {"val": {"loss": 8.468636631965637, "accuracy": 0.3583984375, "attn_entropy": 0.16738148499280214, "param_norm": 140.37412575764142}, "ground_truth": {"loss": 15.51255214214325, "accuracy": 0.011962890625, "attn_entropy": 0.15212456788867712, "param_norm": 140.37412575764142}, "train": {"loss": 0.3174096941947937, "accuracy": 0.912109375, "attn_entropy": 0.06400852277874947, "param_norm": 140.3719952643667}, "step": 3010, "lr": 0.001}, {"val": {"loss": 8.4011749625206, "accuracy": 0.35009765625, "attn_entropy": 0.1679764948785305, "param_norm": 140.39538004840878}, "ground_truth": {"loss": 15.663791179656982, "accuracy": 0.01171875, "attn_entropy": 0.15226980298757553, "param_norm": 140.39538004840878}, "train": {"loss": 0.19473987817764282, "accuracy": 0.9453125, "attn_entropy": 0.06189393997192383, "param_norm": 140.39330671017382}, "step": 3020, "lr": 0.001}, {"val": {"loss": 8.304939985275269, "accuracy": 0.347900390625, "attn_entropy": 0.16484665870666504, "param_norm": 140.41694961013738}, "ground_truth": {"loss": 15.430042147636414, "accuracy": 0.013916015625, "attn_entropy": 0.14780859649181366, "param_norm": 140.41694961013738}, "train": {"loss": 0.1786608248949051, "accuracy": 0.951171875, "attn_entropy": 0.060655975714325905, "param_norm": 140.41468906108915}, "step": 3030, "lr": 0.001}, {"val": {"loss": 8.399339318275452, "accuracy": 0.347900390625, "attn_entropy": 0.1626698700711131, "param_norm": 140.43734804727953}, "ground_truth": {"loss": 15.451404690742493, "accuracy": 0.01318359375, "attn_entropy": 0.14621342439204454, "param_norm": 140.43734804727953}, "train": {"loss": 0.24915561079978943, "accuracy": 0.919921875, "attn_entropy": 0.05953483283519745, "param_norm": 140.43535883321266}, "step": 3040, "lr": 0.001}, {"val": {"loss": 8.225859701633453, "accuracy": 0.363525390625, "attn_entropy": 0.16467926371842623, "param_norm": 140.45909786022187}, "ground_truth": {"loss": 15.579213500022888, "accuracy": 0.01416015625, "attn_entropy": 0.14830917678773403, "param_norm": 140.45909786022187}, "train": {"loss": 0.2902005910873413, "accuracy": 0.912109375, "attn_entropy": 0.0606745220720768, "param_norm": 140.45691805931335}, "step": 3050, "lr": 0.001}, {"val": {"loss": 8.243284225463867, "accuracy": 0.35791015625, "attn_entropy": 0.1643479699268937, "param_norm": 140.4820328260423}, "ground_truth": {"loss": 15.57813560962677, "accuracy": 0.01513671875, "attn_entropy": 0.14778761845082045, "param_norm": 140.4820328260423}, "train": {"loss": 0.22004163265228271, "accuracy": 0.939453125, "attn_entropy": 0.06065086089074612, "param_norm": 140.4797320572922}, "step": 3060, "lr": 0.001}, {"val": {"loss": 8.49448025226593, "accuracy": 0.344482421875, "attn_entropy": 0.16247730795294046, "param_norm": 140.50336160861525}, "ground_truth": {"loss": 15.279696464538574, "accuracy": 0.017578125, "attn_entropy": 0.14636517688632011, "param_norm": 140.50336160861525}, "train": {"loss": 0.23054386675357819, "accuracy": 0.943359375, "attn_entropy": 0.06442329473793507, "param_norm": 140.50134885257577}, "step": 3070, "lr": 0.001}, {"val": {"loss": 8.231260776519775, "accuracy": 0.359375, "attn_entropy": 0.16407121159136295, "param_norm": 140.52402459716143}, "ground_truth": {"loss": 15.661957383155823, "accuracy": 0.01318359375, "attn_entropy": 0.14672802574932575, "param_norm": 140.52402459716143}, "train": {"loss": 0.21576134860515594, "accuracy": 0.939453125, "attn_entropy": 0.06070766597986221, "param_norm": 140.5219254988434}, "step": 3080, "lr": 0.001}, {"val": {"loss": 8.095507979393005, "accuracy": 0.361328125, "attn_entropy": 0.16654224134981632, "param_norm": 140.54294914129477}, "ground_truth": {"loss": 15.30608880519867, "accuracy": 0.0185546875, "attn_entropy": 0.14893514290452003, "param_norm": 140.54294914129477}, "train": {"loss": 0.25581666827201843, "accuracy": 0.927734375, "attn_entropy": 0.0618707537651062, "param_norm": 140.541043323738}, "step": 3090, "lr": 0.001}, {"val": {"loss": 8.23693698644638, "accuracy": 0.3603515625, "attn_entropy": 0.1683336067944765, "param_norm": 140.56322160786206}, "ground_truth": {"loss": 15.456390619277954, "accuracy": 0.018310546875, "attn_entropy": 0.1506149983033538, "param_norm": 140.56322160786206}, "train": {"loss": 0.24900266528129578, "accuracy": 0.9296875, "attn_entropy": 0.06447471678256989, "param_norm": 140.5611780010321}, "step": 3100, "lr": 0.001}, {"val": {"loss": 8.092418432235718, "accuracy": 0.36767578125, "attn_entropy": 0.16576811578124762, "param_norm": 140.58435640201196}, "ground_truth": {"loss": 15.760033369064331, "accuracy": 0.014892578125, "attn_entropy": 0.14823553804308176, "param_norm": 140.58435640201196}, "train": {"loss": 0.2182483673095703, "accuracy": 0.94140625, "attn_entropy": 0.06168848276138306, "param_norm": 140.5822551317109}, "step": 3110, "lr": 0.001}, {"val": {"loss": 8.447752475738525, "accuracy": 0.348388671875, "attn_entropy": 0.16536241583526134, "param_norm": 140.6051235780269}, "ground_truth": {"loss": 15.671241044998169, "accuracy": 0.014404296875, "attn_entropy": 0.14752996200695634, "param_norm": 140.6051235780269}, "train": {"loss": 0.226919025182724, "accuracy": 0.931640625, "attn_entropy": 0.0626209881156683, "param_norm": 140.60285841909547}, "step": 3120, "lr": 0.001}, {"val": {"loss": 8.224119663238525, "accuracy": 0.36328125, "attn_entropy": 0.1646931180730462, "param_norm": 140.62650741669907}, "ground_truth": {"loss": 15.577409982681274, "accuracy": 0.01611328125, "attn_entropy": 0.1480520088225603, "param_norm": 140.62650741669907}, "train": {"loss": 0.2622911334037781, "accuracy": 0.935546875, "attn_entropy": 0.06300226040184498, "param_norm": 140.62452532126528}, "step": 3130, "lr": 0.001}, {"val": {"loss": 8.109853744506836, "accuracy": 0.365478515625, "attn_entropy": 0.16416138131171465, "param_norm": 140.64829890996495}, "ground_truth": {"loss": 15.605111360549927, "accuracy": 0.013427734375, "attn_entropy": 0.14847599249333143, "param_norm": 140.64829890996495}, "train": {"loss": 0.2212546467781067, "accuracy": 0.927734375, "attn_entropy": 0.06355711072683334, "param_norm": 140.64606644288278}, "step": 3140, "lr": 0.001}, {"val": {"loss": 8.299997508525848, "accuracy": 0.3544921875, "attn_entropy": 0.1644797222688794, "param_norm": 140.6721279807667}, "ground_truth": {"loss": 15.439340949058533, "accuracy": 0.014892578125, "attn_entropy": 0.146964518353343, "param_norm": 140.6721279807667}, "train": {"loss": 0.23741652071475983, "accuracy": 0.93359375, "attn_entropy": 0.06053362041711807, "param_norm": 140.66986435861386}, "step": 3150, "lr": 0.001}, {"val": {"loss": 8.234067022800446, "accuracy": 0.35888671875, "attn_entropy": 0.16678856406360865, "param_norm": 140.69312649742878}, "ground_truth": {"loss": 15.428990244865417, "accuracy": 0.018798828125, "attn_entropy": 0.14904834004119039, "param_norm": 140.69312649742878}, "train": {"loss": 0.2792261242866516, "accuracy": 0.92578125, "attn_entropy": 0.065385352820158, "param_norm": 140.6911392324826}, "step": 3160, "lr": 0.001}, {"val": {"loss": 8.373589813709259, "accuracy": 0.350341796875, "attn_entropy": 0.16703128349035978, "param_norm": 140.71182346483377}, "ground_truth": {"loss": 15.575421214103699, "accuracy": 0.0185546875, "attn_entropy": 0.1500045950524509, "param_norm": 140.71182346483377}, "train": {"loss": 0.23246106505393982, "accuracy": 0.9453125, "attn_entropy": 0.06484348885715008, "param_norm": 140.70999291659808}, "step": 3170, "lr": 0.001}, {"val": {"loss": 8.219740390777588, "accuracy": 0.36279296875, "attn_entropy": 0.1646673958748579, "param_norm": 140.73206923791824}, "ground_truth": {"loss": 15.487494945526123, "accuracy": 0.016845703125, "attn_entropy": 0.14781547337770462, "param_norm": 140.73206923791824}, "train": {"loss": 0.23176829516887665, "accuracy": 0.935546875, "attn_entropy": 0.06502294912934303, "param_norm": 140.7300770030119}, "step": 3180, "lr": 0.001}, {"val": {"loss": 8.514180183410645, "accuracy": 0.3408203125, "attn_entropy": 0.16436726786196232, "param_norm": 140.75231130696397}, "ground_truth": {"loss": 15.706576108932495, "accuracy": 0.02001953125, "attn_entropy": 0.14842612762004137, "param_norm": 140.75231130696397}, "train": {"loss": 0.23029407858848572, "accuracy": 0.93359375, "attn_entropy": 0.06170457601547241, "param_norm": 140.75035062902083}, "step": 3190, "lr": 0.001}, {"val": {"loss": 8.261293053627014, "accuracy": 0.35888671875, "attn_entropy": 0.16536949947476387, "param_norm": 140.77422293305196}, "ground_truth": {"loss": 15.732142686843872, "accuracy": 0.01611328125, "attn_entropy": 0.1476966505870223, "param_norm": 140.77422293305196}, "train": {"loss": 0.22972369194030762, "accuracy": 0.931640625, "attn_entropy": 0.0609106570482254, "param_norm": 140.77202994101637}, "step": 3200, "lr": 0.001}, {"val": {"loss": 8.073422253131866, "accuracy": 0.373046875, "attn_entropy": 0.16646111477166414, "param_norm": 140.7946629930973}, "ground_truth": {"loss": 15.622143507003784, "accuracy": 0.01318359375, "attn_entropy": 0.1472274400293827, "param_norm": 140.7946629930973}, "train": {"loss": 0.1992056965827942, "accuracy": 0.94140625, "attn_entropy": 0.05892803147435188, "param_norm": 140.7928263008365}, "step": 3210, "lr": 0.001}, {"val": {"loss": 8.351851999759674, "accuracy": 0.35498046875, "attn_entropy": 0.16560787335038185, "param_norm": 140.813630101964}, "ground_truth": {"loss": 15.321966171264648, "accuracy": 0.017333984375, "attn_entropy": 0.14613179676234722, "param_norm": 140.813630101964}, "train": {"loss": 0.2845703363418579, "accuracy": 0.91015625, "attn_entropy": 0.06463540345430374, "param_norm": 140.81177752032636}, "step": 3220, "lr": 0.001}, {"val": {"loss": 8.161635518074036, "accuracy": 0.367919921875, "attn_entropy": 0.16901619918644428, "param_norm": 140.83176222327867}, "ground_truth": {"loss": 15.893691658973694, "accuracy": 0.011962890625, "attn_entropy": 0.14900499302893877, "param_norm": 140.83176222327867}, "train": {"loss": 0.18829798698425293, "accuracy": 0.939453125, "attn_entropy": 0.06589380279183388, "param_norm": 140.82972035636027}, "step": 3230, "lr": 0.001}, {"val": {"loss": 8.290381073951721, "accuracy": 0.360107421875, "attn_entropy": 0.16953469533473253, "param_norm": 140.85128437932852}, "ground_truth": {"loss": 15.895981311798096, "accuracy": 0.013427734375, "attn_entropy": 0.1511365259066224, "param_norm": 140.85128437932852}, "train": {"loss": 0.22729137539863586, "accuracy": 0.939453125, "attn_entropy": 0.06676302291452885, "param_norm": 140.8494308374275}, "step": 3240, "lr": 0.001}, {"val": {"loss": 8.209867715835571, "accuracy": 0.361328125, "attn_entropy": 0.16795998718589544, "param_norm": 140.87000155783815}, "ground_truth": {"loss": 15.561959505081177, "accuracy": 0.017822265625, "attn_entropy": 0.14932165341451764, "param_norm": 140.87000155783815}, "train": {"loss": 0.2036144733428955, "accuracy": 0.935546875, "attn_entropy": 0.06602303124964237, "param_norm": 140.86812957453733}, "step": 3250, "lr": 0.001}, {"val": {"loss": 8.092268884181976, "accuracy": 0.36865234375, "attn_entropy": 0.1640704618766904, "param_norm": 140.88961742853533}, "ground_truth": {"loss": 15.522795915603638, "accuracy": 0.015625, "attn_entropy": 0.1477632550522685, "param_norm": 140.88961742853533}, "train": {"loss": 0.22088727355003357, "accuracy": 0.94140625, "attn_entropy": 0.06263778358697891, "param_norm": 140.88756434993613}, "step": 3260, "lr": 0.001}, {"val": {"loss": 8.28686398267746, "accuracy": 0.35302734375, "attn_entropy": 0.16656505968421698, "param_norm": 140.91176599905953}, "ground_truth": {"loss": 15.919847011566162, "accuracy": 0.010498046875, "attn_entropy": 0.1494813058525324, "param_norm": 140.91176599905953}, "train": {"loss": 0.19733037054538727, "accuracy": 0.94140625, "attn_entropy": 0.06337585300207138, "param_norm": 140.90959308966347}, "step": 3270, "lr": 0.001}, {"val": {"loss": 8.183553874492645, "accuracy": 0.359130859375, "attn_entropy": 0.16761437337845564, "param_norm": 140.93270373831933}, "ground_truth": {"loss": 15.714032053947449, "accuracy": 0.012451171875, "attn_entropy": 0.14975870866328478, "param_norm": 140.93270373831933}, "train": {"loss": 0.18435651063919067, "accuracy": 0.94140625, "attn_entropy": 0.06151243671774864, "param_norm": 140.93067003837342}, "step": 3280, "lr": 0.001}, {"val": {"loss": 7.988239884376526, "accuracy": 0.367919921875, "attn_entropy": 0.16868897154927254, "param_norm": 140.95204041102537}, "ground_truth": {"loss": 15.899459958076477, "accuracy": 0.01171875, "attn_entropy": 0.15173515677452087, "param_norm": 140.95204041102537}, "train": {"loss": 0.23676438629627228, "accuracy": 0.931640625, "attn_entropy": 0.06672186963260174, "param_norm": 140.95037120949172}, "step": 3290, "lr": 0.001}, {"val": {"loss": 8.407155752182007, "accuracy": 0.351806640625, "attn_entropy": 0.16902799252420664, "param_norm": 140.97057449485263}, "ground_truth": {"loss": 15.969762563705444, "accuracy": 0.01123046875, "attn_entropy": 0.1523803574964404, "param_norm": 140.97057449485263}, "train": {"loss": 0.17974083125591278, "accuracy": 0.9453125, "attn_entropy": 0.06260852701961994, "param_norm": 140.96868697059116}, "step": 3300, "lr": 0.001}, {"val": {"loss": 8.206118941307068, "accuracy": 0.35498046875, "attn_entropy": 0.16827083565294743, "param_norm": 140.9889227345984}, "ground_truth": {"loss": 16.007164239883423, "accuracy": 0.013427734375, "attn_entropy": 0.15104455687105656, "param_norm": 140.9889227345984}, "train": {"loss": 0.19436924159526825, "accuracy": 0.94140625, "attn_entropy": 0.0596110075712204, "param_norm": 140.98715435868527}, "step": 3310, "lr": 0.001}, {"val": {"loss": 8.063115417957306, "accuracy": 0.370849609375, "attn_entropy": 0.1676288889721036, "param_norm": 141.00723811181663}, "ground_truth": {"loss": 15.82796573638916, "accuracy": 0.016357421875, "attn_entropy": 0.150946456938982, "param_norm": 141.00723811181663}, "train": {"loss": 0.19375799596309662, "accuracy": 0.94140625, "attn_entropy": 0.06696448102593422, "param_norm": 141.00515581255712}, "step": 3320, "lr": 0.001}, {"val": {"loss": 8.26373702287674, "accuracy": 0.357421875, "attn_entropy": 0.16627417597919703, "param_norm": 141.02732915413046}, "ground_truth": {"loss": 15.742536306381226, "accuracy": 0.012939453125, "attn_entropy": 0.1517949067056179, "param_norm": 141.02732915413046}, "train": {"loss": 0.17997200787067413, "accuracy": 0.9453125, "attn_entropy": 0.06503986939787865, "param_norm": 141.025354890414}, "step": 3330, "lr": 0.001}, {"val": {"loss": 8.256457448005676, "accuracy": 0.35791015625, "attn_entropy": 0.16712782718241215, "param_norm": 141.0470925251577}, "ground_truth": {"loss": 15.559687495231628, "accuracy": 0.019287109375, "attn_entropy": 0.15165623556822538, "param_norm": 141.0470925251577}, "train": {"loss": 0.26827380061149597, "accuracy": 0.93359375, "attn_entropy": 0.05946166813373566, "param_norm": 141.04524171556966}, "step": 3340, "lr": 0.001}, {"val": {"loss": 8.238841354846954, "accuracy": 0.35302734375, "attn_entropy": 0.16513195727020502, "param_norm": 141.0678330310395}, "ground_truth": {"loss": 15.513990879058838, "accuracy": 0.01904296875, "attn_entropy": 0.14937538374215364, "param_norm": 141.0678330310395}, "train": {"loss": 0.21101713180541992, "accuracy": 0.939453125, "attn_entropy": 0.060194410383701324, "param_norm": 141.06563896771434}, "step": 3350, "lr": 0.001}, {"val": {"loss": 7.991649150848389, "accuracy": 0.367431640625, "attn_entropy": 0.16409669630229473, "param_norm": 141.0883361866615}, "ground_truth": {"loss": 15.792026162147522, "accuracy": 0.0126953125, "attn_entropy": 0.1461435854434967, "param_norm": 141.0883361866615}, "train": {"loss": 0.2635026276111603, "accuracy": 0.93359375, "attn_entropy": 0.05846800655126572, "param_norm": 141.08621579730067}, "step": 3360, "lr": 0.001}, {"val": {"loss": 8.188490569591522, "accuracy": 0.357177734375, "attn_entropy": 0.1618891851976514, "param_norm": 141.10788252844534}, "ground_truth": {"loss": 15.934442162513733, "accuracy": 0.012451171875, "attn_entropy": 0.14804282411932945, "param_norm": 141.10788252844534}, "train": {"loss": 0.2244102507829666, "accuracy": 0.939453125, "attn_entropy": 0.06271040439605713, "param_norm": 141.1061718227322}, "step": 3370, "lr": 0.001}, {"val": {"loss": 8.254019975662231, "accuracy": 0.357421875, "attn_entropy": 0.16641603130847216, "param_norm": 141.12515911770356}, "ground_truth": {"loss": 15.971530199050903, "accuracy": 0.015869140625, "attn_entropy": 0.1499568559229374, "param_norm": 141.12515911770356}, "train": {"loss": 0.2055751085281372, "accuracy": 0.93359375, "attn_entropy": 0.06563236191868782, "param_norm": 141.12330704212923}, "step": 3380, "lr": 0.001}, {"val": {"loss": 8.235670506954193, "accuracy": 0.35595703125, "attn_entropy": 0.16921484470367432, "param_norm": 141.14407002043205}, "ground_truth": {"loss": 16.083749651908875, "accuracy": 0.013671875, "attn_entropy": 0.15184096712619066, "param_norm": 141.14407002043205}, "train": {"loss": 0.1756362020969391, "accuracy": 0.943359375, "attn_entropy": 0.062450144439935684, "param_norm": 141.14219181549262}, "step": 3390, "lr": 0.001}, {"val": {"loss": 8.30435597896576, "accuracy": 0.34814453125, "attn_entropy": 0.16727914661169052, "param_norm": 141.16403634254095}, "ground_truth": {"loss": 15.99000895023346, "accuracy": 0.01123046875, "attn_entropy": 0.1503617954440415, "param_norm": 141.16403634254095}, "train": {"loss": 0.2504505217075348, "accuracy": 0.93359375, "attn_entropy": 0.06556767411530018, "param_norm": 141.1620942549168}, "step": 3400, "lr": 0.001}, {"val": {"loss": 8.050387799739838, "accuracy": 0.373291015625, "attn_entropy": 0.16576498188078403, "param_norm": 141.18356371264645}, "ground_truth": {"loss": 15.955715537071228, "accuracy": 0.015380859375, "attn_entropy": 0.14784931065514684, "param_norm": 141.18356371264645}, "train": {"loss": 0.3431490361690521, "accuracy": 0.912109375, "attn_entropy": 0.060996510088443756, "param_norm": 141.18157920933697}, "step": 3410, "lr": 0.001}, {"val": {"loss": 8.279084920883179, "accuracy": 0.355224609375, "attn_entropy": 0.1647182246670127, "param_norm": 141.2057328051438}, "ground_truth": {"loss": 16.07880675792694, "accuracy": 0.013427734375, "attn_entropy": 0.1477457950823009, "param_norm": 141.2057328051438}, "train": {"loss": 0.19823621213436127, "accuracy": 0.94921875, "attn_entropy": 0.060666486620903015, "param_norm": 141.2035668310159}, "step": 3420, "lr": 0.001}, {"val": {"loss": 8.311922490596771, "accuracy": 0.35498046875, "attn_entropy": 0.16459411289542913, "param_norm": 141.2259913775517}, "ground_truth": {"loss": 16.006278157234192, "accuracy": 0.014892578125, "attn_entropy": 0.14631966268643737, "param_norm": 141.2259913775517}, "train": {"loss": 0.17667539417743683, "accuracy": 0.95703125, "attn_entropy": 0.06166401877999306, "param_norm": 141.22382969413985}, "step": 3430, "lr": 0.001}, {"val": {"loss": 8.159767210483551, "accuracy": 0.359619140625, "attn_entropy": 0.16621355526149273, "param_norm": 141.24518213516217}, "ground_truth": {"loss": 15.887598156929016, "accuracy": 0.013671875, "attn_entropy": 0.14818724431097507, "param_norm": 141.24518213516217}, "train": {"loss": 0.2130011022090912, "accuracy": 0.94140625, "attn_entropy": 0.06118414178490639, "param_norm": 141.24330971052208}, "step": 3440, "lr": 0.001}, {"val": {"loss": 7.989759802818298, "accuracy": 0.365234375, "attn_entropy": 0.16800568252801895, "param_norm": 141.2638073552056}, "ground_truth": {"loss": 15.714847683906555, "accuracy": 0.014892578125, "attn_entropy": 0.14887650404125452, "param_norm": 141.2638073552056}, "train": {"loss": 0.14596542716026306, "accuracy": 0.962890625, "attn_entropy": 0.06382056698203087, "param_norm": 141.2618681439006}, "step": 3450, "lr": 0.001}, {"val": {"loss": 8.032295405864716, "accuracy": 0.363525390625, "attn_entropy": 0.1664330903440714, "param_norm": 141.2832461112532}, "ground_truth": {"loss": 16.181665658950806, "accuracy": 0.014892578125, "attn_entropy": 0.1503411503508687, "param_norm": 141.2832461112532}, "train": {"loss": 0.1663540154695511, "accuracy": 0.95703125, "attn_entropy": 0.06584677658975124, "param_norm": 141.28124411837544}, "step": 3460, "lr": 0.001}, {"val": {"loss": 8.098376750946045, "accuracy": 0.357421875, "attn_entropy": 0.16649405285716057, "param_norm": 141.30422992572105}, "ground_truth": {"loss": 16.062073826789856, "accuracy": 0.017333984375, "attn_entropy": 0.14798077195882797, "param_norm": 141.30422992572105}, "train": {"loss": 0.2047768086194992, "accuracy": 0.94921875, "attn_entropy": 0.0653536356985569, "param_norm": 141.30198272161795}, "step": 3470, "lr": 0.001}, {"val": {"loss": 7.968012154102325, "accuracy": 0.373046875, "attn_entropy": 0.16683169268071651, "param_norm": 141.32624339078941}, "ground_truth": {"loss": 16.179887771606445, "accuracy": 0.018798828125, "attn_entropy": 0.14941993821412325, "param_norm": 141.32624339078941}, "train": {"loss": 0.19574208557605743, "accuracy": 0.94140625, "attn_entropy": 0.062482140958309174, "param_norm": 141.32396480632698}, "step": 3480, "lr": 0.001}, {"val": {"loss": 7.987864673137665, "accuracy": 0.364990234375, "attn_entropy": 0.16620298568159342, "param_norm": 141.34568429878325}, "ground_truth": {"loss": 16.1296865940094, "accuracy": 0.013671875, "attn_entropy": 0.14769312739372253, "param_norm": 141.34568429878325}, "train": {"loss": 0.17120146751403809, "accuracy": 0.94921875, "attn_entropy": 0.06293508037924767, "param_norm": 141.34391002690248}, "step": 3490, "lr": 0.001}, {"val": {"loss": 8.397769749164581, "accuracy": 0.351806640625, "attn_entropy": 0.16477112285792828, "param_norm": 141.36522808925324}, "ground_truth": {"loss": 16.096993327140808, "accuracy": 0.013916015625, "attn_entropy": 0.1461833929643035, "param_norm": 141.36522808925324}, "train": {"loss": 0.16421836614608765, "accuracy": 0.95703125, "attn_entropy": 0.06466945633292198, "param_norm": 141.363265854282}, "step": 3500, "lr": 0.001}, {"val": {"loss": 8.07140839099884, "accuracy": 0.366455078125, "attn_entropy": 0.16337095759809017, "param_norm": 141.3835201745293}, "ground_truth": {"loss": 16.411528825759888, "accuracy": 0.014404296875, "attn_entropy": 0.14505197573453188, "param_norm": 141.3835201745293}, "train": {"loss": 0.14243023097515106, "accuracy": 0.955078125, "attn_entropy": 0.06499534845352173, "param_norm": 141.38156142426686}, "step": 3510, "lr": 0.001}, {"val": {"loss": 8.15939450263977, "accuracy": 0.365234375, "attn_entropy": 0.16545573342591524, "param_norm": 141.4036299745196}, "ground_truth": {"loss": 16.493688821792603, "accuracy": 0.01708984375, "attn_entropy": 0.1454796316102147, "param_norm": 141.4036299745196}, "train": {"loss": 0.18361495435237885, "accuracy": 0.935546875, "attn_entropy": 0.06582748889923096, "param_norm": 141.40153296005906}, "step": 3520, "lr": 0.001}, {"val": {"loss": 8.306009352207184, "accuracy": 0.363037109375, "attn_entropy": 0.16636397037655115, "param_norm": 141.42402924873684}, "ground_truth": {"loss": 16.512791872024536, "accuracy": 0.01416015625, "attn_entropy": 0.1480942526832223, "param_norm": 141.42402924873684}, "train": {"loss": 0.23059521615505219, "accuracy": 0.9375, "attn_entropy": 0.0640081986784935, "param_norm": 141.42226209672873}, "step": 3530, "lr": 0.001}, {"val": {"loss": 8.006533563137054, "accuracy": 0.370361328125, "attn_entropy": 0.16647514887154102, "param_norm": 141.44120282500373}, "ground_truth": {"loss": 16.40046525001526, "accuracy": 0.017822265625, "attn_entropy": 0.1472899168729782, "param_norm": 141.44120282500373}, "train": {"loss": 0.17359596490859985, "accuracy": 0.958984375, "attn_entropy": 0.0640957411378622, "param_norm": 141.4394786924297}, "step": 3540, "lr": 0.001}, {"val": {"loss": 8.212747871875763, "accuracy": 0.35693359375, "attn_entropy": 0.16532318759709597, "param_norm": 141.4612239045347}, "ground_truth": {"loss": 16.2901451587677, "accuracy": 0.013916015625, "attn_entropy": 0.1462113270536065, "param_norm": 141.4612239045347}, "train": {"loss": 0.1820075511932373, "accuracy": 0.947265625, "attn_entropy": 0.06075752526521683, "param_norm": 141.4591530371144}, "step": 3550, "lr": 0.001}, {"val": {"loss": 8.20618450641632, "accuracy": 0.3583984375, "attn_entropy": 0.16252106055617332, "param_norm": 141.48009474413232}, "ground_truth": {"loss": 16.375261306762695, "accuracy": 0.01123046875, "attn_entropy": 0.14480727445334196, "param_norm": 141.48009474413232}, "train": {"loss": 0.19026847183704376, "accuracy": 0.943359375, "attn_entropy": 0.06576993316411972, "param_norm": 141.47823847549444}, "step": 3560, "lr": 0.001}, {"val": {"loss": 8.095941722393036, "accuracy": 0.359375, "attn_entropy": 0.1654127575457096, "param_norm": 141.49977957837675}, "ground_truth": {"loss": 15.951022624969482, "accuracy": 0.01318359375, "attn_entropy": 0.14653574116528034, "param_norm": 141.49977957837675}, "train": {"loss": 0.16778656840324402, "accuracy": 0.953125, "attn_entropy": 0.06126810051500797, "param_norm": 141.4977921156317}, "step": 3570, "lr": 0.001}, {"val": {"loss": 8.093231201171875, "accuracy": 0.358642578125, "attn_entropy": 0.16557544004172087, "param_norm": 141.51966467894795}, "ground_truth": {"loss": 16.31344783306122, "accuracy": 0.0146484375, "attn_entropy": 0.14828675333410501, "param_norm": 141.51966467894795}, "train": {"loss": 0.24915944039821625, "accuracy": 0.921875, "attn_entropy": 0.06491061672568321, "param_norm": 141.51784035135472}, "step": 3580, "lr": 0.001}, {"val": {"loss": 7.947313606739044, "accuracy": 0.3740234375, "attn_entropy": 0.16567541006952524, "param_norm": 141.53879057743393}, "ground_truth": {"loss": 16.19966423511505, "accuracy": 0.014404296875, "attn_entropy": 0.14782692678272724, "param_norm": 141.53879057743393}, "train": {"loss": 0.15420471131801605, "accuracy": 0.966796875, "attn_entropy": 0.06286389753222466, "param_norm": 141.53685767118893}, "step": 3590, "lr": 0.001}, {"val": {"loss": 7.9966036677360535, "accuracy": 0.37353515625, "attn_entropy": 0.16423429548740387, "param_norm": 141.557389046656}, "ground_truth": {"loss": 16.027058362960815, "accuracy": 0.013427734375, "attn_entropy": 0.14639303786680102, "param_norm": 141.557389046656}, "train": {"loss": 0.17032861709594727, "accuracy": 0.955078125, "attn_entropy": 0.06212330609560013, "param_norm": 141.55545484072854}, "step": 3600, "lr": 0.001}, {"val": {"loss": 8.155750572681427, "accuracy": 0.35693359375, "attn_entropy": 0.16472744569182396, "param_norm": 141.57663427626278}, "ground_truth": {"loss": 16.41249430179596, "accuracy": 0.0146484375, "attn_entropy": 0.14653817657381296, "param_norm": 141.57663427626278}, "train": {"loss": 0.1758769452571869, "accuracy": 0.947265625, "attn_entropy": 0.061774443835020065, "param_norm": 141.57469085544335}, "step": 3610, "lr": 0.001}, {"val": {"loss": 8.137846231460571, "accuracy": 0.364013671875, "attn_entropy": 0.16479173209518194, "param_norm": 141.59575762837713}, "ground_truth": {"loss": 16.462316274642944, "accuracy": 0.014404296875, "attn_entropy": 0.14721601642668247, "param_norm": 141.59575762837713}, "train": {"loss": 0.24370263516902924, "accuracy": 0.916015625, "attn_entropy": 0.06380613148212433, "param_norm": 141.5938685466562}, "step": 3620, "lr": 0.001}, {"val": {"loss": 8.128287971019745, "accuracy": 0.36474609375, "attn_entropy": 0.1647551143541932, "param_norm": 141.61380129520236}, "ground_truth": {"loss": 16.36855697631836, "accuracy": 0.0126953125, "attn_entropy": 0.14736600778996944, "param_norm": 141.61380129520236}, "train": {"loss": 0.2038259208202362, "accuracy": 0.931640625, "attn_entropy": 0.06353320181369781, "param_norm": 141.612193891166}, "step": 3630, "lr": 0.001}, {"val": {"loss": 8.138709425926208, "accuracy": 0.36572265625, "attn_entropy": 0.16537989862263203, "param_norm": 141.62930363553758}, "ground_truth": {"loss": 16.105396389961243, "accuracy": 0.013427734375, "attn_entropy": 0.14777319971472025, "param_norm": 141.62930363553758}, "train": {"loss": 0.16321241855621338, "accuracy": 0.95703125, "attn_entropy": 0.06535888463258743, "param_norm": 141.627781347419}, "step": 3640, "lr": 0.001}, {"val": {"loss": 8.019155859947205, "accuracy": 0.369873046875, "attn_entropy": 0.16612612549215555, "param_norm": 141.64597647657547}, "ground_truth": {"loss": 16.49907946586609, "accuracy": 0.0166015625, "attn_entropy": 0.14764816407114267, "param_norm": 141.64597647657547}, "train": {"loss": 0.23047544062137604, "accuracy": 0.927734375, "attn_entropy": 0.06180662848055363, "param_norm": 141.64423144073584}, "step": 3650, "lr": 0.001}, {"val": {"loss": 8.076141059398651, "accuracy": 0.3642578125, "attn_entropy": 0.166934160515666, "param_norm": 141.66383193151967}, "ground_truth": {"loss": 16.337541103363037, "accuracy": 0.014892578125, "attn_entropy": 0.14946445124223828, "param_norm": 141.66383193151967}, "train": {"loss": 0.16614021360874176, "accuracy": 0.958984375, "attn_entropy": 0.06311538629233837, "param_norm": 141.66197089528384}, "step": 3660, "lr": 0.001}]