{"id":"https://openalex.org/W7160928847","doi":"https://doi.org/10.48550/arxiv.2605.10504","title":"Learning Less Is More: Premature Upper-Layer Attention Specialization Hurts Language Model Pretraining","display_name":"Learning Less Is More: Premature Upper-Layer Attention Specialization Hurts Language Model Pretraining","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160928847","doi":"https://doi.org/10.48550/arxiv.2605.10504"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.10504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.10504","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135965381","display_name":"Jinchang Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Jinchang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135918257","display_name":"Jindong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jindong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135976204","display_name":"Yuwen Hao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hao, Yuwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135968330","display_name":"Chengyu Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Chengyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135991069","display_name":"Rong Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Rong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5044844339","display_name":"Menglin Yang","orcid":"https://orcid.org/0000-0002-5719-4198"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Menglin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.7330999970436096,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.7330999970436096,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.04650000110268593,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.019600000232458115,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6761000156402588},{"id":"https://openalex.org/keywords/commit","display_name":"Commit","score":0.64410001039505},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.6158999800682068},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6025000214576721},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5156000256538391},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4065999984741211},{"id":"https://openalex.org/keywords/multiplicative-function","display_name":"Multiplicative function","score":0.3885999917984009},{"id":"https://openalex.org/keywords/expectancy-theory","display_name":"Expectancy theory","score":0.37229999899864197},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.36899998784065247}],"concepts":[{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6761000156402588},{"id":"https://openalex.org/C153180980","wikidata":"https://www.wikidata.org/wiki/Q19776675","display_name":"Commit","level":2,"score":0.64410001039505},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.6158999800682068},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6025000214576721},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5156000256538391},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5008000135421753},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.49079999327659607},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4787999987602234},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4065999984741211},{"id":"https://openalex.org/C42747912","wikidata":"https://www.wikidata.org/wiki/Q1048447","display_name":"Multiplicative function","level":2,"score":0.3885999917984009},{"id":"https://openalex.org/C188353592","wikidata":"https://www.wikidata.org/wiki/Q450586","display_name":"Expectancy theory","level":2,"score":0.37229999899864197},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.36899998784065247},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.36250001192092896},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.3537999987602234},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.35359999537467957},{"id":"https://openalex.org/C191172861","wikidata":"https://www.wikidata.org/wiki/Q7899321","display_name":"Upstream (networking)","level":2,"score":0.3521000146865845},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.3312999904155731},{"id":"https://openalex.org/C138496976","wikidata":"https://www.wikidata.org/wiki/Q175002","display_name":"Developmental psychology","level":1,"score":0.3303999900817871},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.31929999589920044},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C2779466056","wikidata":"https://www.wikidata.org/wiki/Q107630651","display_name":"Time point","level":2,"score":0.30640000104904175},{"id":"https://openalex.org/C2781039887","wikidata":"https://www.wikidata.org/wiki/Q1391724","display_name":"Factor (programming language)","level":2,"score":0.29980000853538513},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C2986432223","wikidata":"https://www.wikidata.org/wiki/Q5449740","display_name":"Risk model","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C104122410","wikidata":"https://www.wikidata.org/wiki/Q1416406","display_name":"Network model","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27390000224113464},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C12426560","wikidata":"https://www.wikidata.org/wiki/Q189569","display_name":"Basis (linear algebra)","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.10504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.10504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.49203741550445557,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"A":[0,105],"causal-decoder":[1],"block":[2],"is":[3,78],"hierarchical:":[4],"lower":[5],"layers":[6,13,25],"build":[7],"the":[8,75,94,98,103,111,126],"residual":[9,70,100],"basis":[10],"that":[11,96],"upper":[12,24,63],"attend":[14],"over.":[15],"We":[16,35],"identify":[17,132],"a":[18,115,122,137],"failure":[19],"mode":[20],"in":[21],"GPT":[22],"pretraining:":[23],"commit":[26],"to":[27],"sharp":[28],"attention":[29,40,64],"patterns":[30],"before":[31],"lower-layer":[32],"features":[33],"stabilize.":[34],"call":[36],"this":[37],"premature":[38],"upper-layer":[39,45,133],"specialization.":[41],"Temporarily":[42],"slowing":[43],"only":[44],"Q/K":[46,134],"projections":[47],"during":[48],"early":[49],"training":[50],"improves":[51],"final":[52],"perplexity":[53],"and":[54,144],"downstream":[55],"accuracy":[56],"without":[57],"altering":[58],"other":[59],"parameters;":[60],"it":[61],"prevents":[62],"from":[65],"collapsing":[66],"onto":[67],"an":[68],"immature":[69],"basis.":[71],"In":[72],"LLaMA-style":[73],"blocks,":[74],"same":[76,127],"intervention":[77,113],"nearly":[79],"unnecessary.":[80],"Through":[81],"ablations,":[82],"we":[83],"isolate":[84],"multiplicative":[85],"gated":[86,119],"FFNs":[87,120],"(not":[88],"RMSNorm":[89],"or":[90],"bias":[91],"removal)":[92],"as":[93,136],"component":[95],"suppresses":[97],"upstream":[99],"writes":[101],"driving":[102],"failure.":[104],"pathwise":[106],"analysis":[107],"unifies":[108],"both":[109],"findings:":[110],"learning-rate":[112],"reduces":[114],"step-size":[116],"factor,":[117],"while":[118],"reduce":[121],"residual-energy":[123],"factor":[124],"on":[125],"growth":[128],"pathway.":[129],"Our":[130],"results":[131],"timing":[135],"concrete":[138],"interaction":[139],"point":[140],"between":[141],"decoder":[142],"architecture":[143],"optimization.":[145]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
