<?xml version="1.0"?>
<dblpperson name="Kaifeng Lyu" pid="220/3283" n="61">
<person key="homepages/220/3283" mdate="2018-06-04">
<author pid="220/3283">Kaifeng Lyu</author>
</person>
<r><article publtype="informal" key="journals/corr/abs-2603-14493" mdate="2026-04-11">
<author pid="05/4746">He Li</author>
<author pid="77/1630">Yuhui Zhang</author>
<author pid="73/1307">Xiaohan Wang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="147/5023">Serena Yeung-Levy</author>
<title>Fine-tuning MLLMs Without Forgetting Is Easier Than You Think.</title>
<year>2026</year>
<month>March</month>
<volume>abs/2603.14493</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2603.14493</ee>
<url>db/journals/corr/corr2603.html#abs-2603-14493</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2603-22213" mdate="2026-04-15">
<author pid="380/7639">Kexian Tang</author>
<author pid="156/8812">Jiani Wang</author>
<author pid="07/852">Shaowen Wang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>SPA: A Simple but Tough-to-Beat Baseline for Knowledge Injection.</title>
<year>2026</year>
<month>March</month>
<volume>abs/2603.22213</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2603.22213</ee>
<url>db/journals/corr/corr2603.html#abs-2603-22213</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><inproceedings key="conf/iclr/LiPLL25" mdate="2025-05-15">
<author pid="244/9096">Binghui Li</author>
<author pid="387/3522">Zhixuan Pan</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="33/5448">Jian Li</author>
<title>Feature Averaging: An Implicit Bias of Gradient Descent Leading to Non-Robustness in Neural Networks.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=zPHra4V5Mc</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#LiPLL25</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/LuWLJ0W25" mdate="2026-01-22">
<author pid="33/2451-1">Rui Lu 0001</author>
<author pid="153/0092">Runzhe Wang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="401/8285">Xitai Jiang</author>
<author pid="120/2687-1">Gao Huang 0001</author>
<author pid="64/10471-1">Mengdi Wang 0001</author>
<title>Towards Understanding Text Hallucination of Diffusion Models via Local Generation Bias.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=SKW10XJlAI</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#LuWLJ0W25</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/LuoWHSL0LC25" mdate="2025-07-10">
<author pid="402/0024">Kairong Luo</author>
<author pid="375/1042">Haodong Wen</author>
<author pid="268/5534">Shengding Hu</author>
<author pid="280/0130">Zhenbo Sun</author>
<author pid="53/3245-1">Zhiyuan Liu 0001</author>
<author pid="95/3291">Maosong Sun 0001</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="60/810">Wenguang Chen</author>
<title>A Multi-Power Law for Loss Curve Prediction Across Learning Rate Schedules.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=KnoS9XxIlK</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#LuoWHSL0LC25</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/PanigrahiSLMRKK25" mdate="2025-05-15">
<author pid="208/4926">Abhishek Panigrahi</author>
<author pid="199/2236">Nikunj Saunshi</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="243/5898">Sobhan Miryoosefi</author>
<author pid="50/10452">Sashank J. Reddi</author>
<author pid="52/4768">Satyen Kale</author>
<author pid="66/6804">Sanjiv Kumar</author>
<title>Efficient stagewise pretraining via progressive subnetworks.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=Y5LjYI4N6P</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#PanigrahiSLMRKK25</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/QiPL0RBM025" mdate="2025-05-15">
<author pid="274/2321">Xiangyu Qi</author>
<author pid="270/1582">Ashwinee Panda</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="35/573-10">Xiao Ma 0010</author>
<author pid="73/8829">Subhrajit Roy</author>
<author pid="41/9367">Ahmad Beirami</author>
<author pid="39/6266">Prateek Mittal</author>
<author pid="h/PeterHenderson2">Peter Henderson 0002</author>
<title>Safety Alignment Should be Made More Than Just a Few Tokens Deep.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=6Mxhg9PtDE</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#QiPL0RBM025</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/WenDL25" mdate="2025-05-15">
<author pid="322/0395">Kaiyue Wen</author>
<author pid="348/8880">Xingyu Dang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>RNNs are not Transformers (Yet): The Key Bottleneck on In-Context Retrieval.</title>
<year>2025</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=h3wbI8Uk1Z</ee>
<crossref>conf/iclr/2025</crossref>
<url>db/conf/iclr/iclr2025.html#WenDL25</url>
</inproceedings>
</r>
<r><inproceedings key="conf/icml/MedvedevLYA0S25" mdate="2026-02-04">
<author pid="387/1895">Marko Medvedev</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/578">Dingli Yu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="50/3633">Nathan Srebro</author>
<title>Weak-to-Strong Generalization Even in Random Feature Networks, Provably.</title>
<year>2025</year>
<booktitle>ICML</booktitle>
<ee type="oa">https://proceedings.mlr.press/v267/medvedev25a.html</ee>
<ee type="oa">https://openreview.net/forum?id=OUzDIhgiqr</ee>
<crossref>conf/icml/2025</crossref>
<url>db/conf/icml/icml2025.html#MedvedevLYA0S25</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2503-02877" mdate="2025-04-11">
<author pid="387/1895">Marko Medvedev</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/578">Dingli Yu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="50/3633">Nathan Srebro</author>
<title>Weak-to-Strong Generalization Even in Random Feature Networks, Provably.</title>
<year>2025</year>
<month>March</month>
<volume>abs/2503.02877</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2503.02877</ee>
<url>db/journals/corr/corr2503.html#abs-2503-02877</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2503-03595" mdate="2026-01-22">
<author pid="33/2451-1">Rui Lu 0001</author>
<author pid="153/0092">Runzhe Wang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="401/8285">Xitai Jiang</author>
<author pid="120/2687-1">Gao Huang 0001</author>
<author pid="64/10471-1">Mengdi Wang 0001</author>
<title>Towards Understanding Text Hallucination of Diffusion Models via Local Generation Bias.</title>
<year>2025</year>
<month>March</month>
<volume>abs/2503.03595</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2503.03595</ee>
<url>db/journals/corr/corr2503.html#abs-2503-03595</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2503-12811" mdate="2025-04-13">
<author pid="402/0024">Kairong Luo</author>
<author pid="375/1042">Haodong Wen</author>
<author pid="268/5534">Shengding Hu</author>
<author pid="280/0130">Zhenbo Sun</author>
<author pid="53/3245-1">Zhiyuan Liu 0001</author>
<author pid="95/3291">Maosong Sun 0001</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="60/810">Wenguang Chen</author>
<title>A Multi-Power Law for Loss Curve Prediction Across Learning Rate Schedules.</title>
<year>2025</year>
<month>March</month>
<volume>abs/2503.12811</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2503.12811</ee>
<url>db/journals/corr/corr2503.html#abs-2503-12811</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2503-19990" mdate="2025-04-19">
<author pid="380/7639">Kexian Tang</author>
<author pid="67/6659-2">Junyao Gao 0002</author>
<author pid="215/4033">Yanhong Zeng</author>
<author pid="211/7919">Haodong Duan</author>
<author pid="44/8711-5">Yanan Sun 0005</author>
<author pid="276/3671">Zhening Xing</author>
<author pid="216/2664">Wenran Liu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="181/2839-26">Kai Chen 0026</author>
<title>LEGO-Puzzles: How Good Are MLLMs at Multi-Step Spatial Reasoning?</title>
<year>2025</year>
<month>March</month>
<volume>abs/2503.19990</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2503.19990</ee>
<url>db/journals/corr/corr2503.html#abs-2503-19990</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2505-18091" mdate="2025-07-03">
<author pid="294/6587">Xinran Gu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="400/4254">Jiazheng Li 0015</author>
<author pid="220/5559">Jingzhao Zhang</author>
<title>Data Mixing Can Induce Phase Transitions in Knowledge Acquisition.</title>
<year>2025</year>
<month>May</month>
<volume>abs/2505.18091</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2505.18091</ee>
<url>db/journals/corr/corr2505.html#abs-2505-18091</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2506-07104" mdate="2025-07-07">
<author pid="304/2243">Jiaxuan Gao</author>
<author pid="44/5507">Shu Yan</author>
<author pid="391/3109">Qixin Tan</author>
<author pid="58/2893">Lu Yang</author>
<author pid="121/0926">Shusheng Xu</author>
<author pid="26/4472">Wei Fu</author>
<author pid="299/5277">Zhiyu Mei</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="44/3684-13">Yi Wu 0013</author>
<title>How Far Are We from Optimal Reasoning Efficiency?</title>
<year>2025</year>
<month>June</month>
<volume>abs/2506.07104</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2506.07104</ee>
<url>db/journals/corr/corr2506.html#abs-2506-07104</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2510-25108" mdate="2025-11-16">
<author pid="387/1895">Marko Medvedev</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="50/3633">Nathan Srebro</author>
<title>Shift is Good: Mismatched Data Mixing Improves Test Performance.</title>
<year>2025</year>
<month>October</month>
<volume>abs/2510.25108</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2510.25108</ee>
<url>db/journals/corr/corr2510.html#abs-2510-25108</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2511-02773" mdate="2025-11-24">
<author pid="158/3530">Xinghan Li</author>
<author pid="375/1042">Haodong Wen</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Adam Reduces a Unique Form of Sharpness: Theoretical Insights Near the Minimizer Manifold.</title>
<year>2025</year>
<month>November</month>
<volume>abs/2511.02773</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2511.02773</ee>
<url>db/journals/corr/corr2511.html#abs-2511-02773</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2511-07318" mdate="2026-01-02">
<author pid="07/852">Shaowen Wang</author>
<author pid="344/8335">Yiqi Dong</author>
<author pid="352/4488">Ruinian Chang</author>
<author pid="401/0605">Tansheng Zhu</author>
<author pid="393/4439">Yuebo Sun</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="33/5448">Jian Li</author>
<title>When Bias Pretends to Be Truth: How Spurious Correlations Undermine Hallucination Detection in LLMs.</title>
<year>2025</year>
<month>November</month>
<volume>abs/2511.07318</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2511.07318</ee>
<url>db/journals/corr/corr2511.html#abs-2511-07318</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2511-13421" mdate="2026-01-14">
<author pid="349/4767">Tingkai Yan</author>
<author pid="375/1042">Haodong Wen</author>
<author pid="244/9096">Binghui Li</author>
<author pid="402/0024">Kairong Luo</author>
<author pid="60/810">Wenguang Chen</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Larger Datasets Can Be Repeated More: A Theoretical Analysis of Multi-Epoch Scaling in Linear Regression.</title>
<year>2025</year>
<month>November</month>
<volume>abs/2511.13421</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2511.13421</ee>
<url>db/journals/corr/corr2511.html#abs-2511-13421</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2511-18903" mdate="2026-01-14">
<author pid="402/0024">Kairong Luo</author>
<author pid="280/0130">Zhenbo Sun</author>
<author pid="375/1042">Haodong Wen</author>
<author pid="198/2155">Xinyu Shi</author>
<author pid="23/11478">Jiarui Cui</author>
<author pid="423/7113">Chenyi Dang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="60/810">Wenguang Chen</author>
<title>How Learning Rate Decay Wastes Your Best Data in Curriculum-Based LLM Pretraining.</title>
<year>2025</year>
<month>November</month>
<volume>abs/2511.18903</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2511.18903</ee>
<url>db/journals/corr/corr2511.html#abs-2511-18903</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2512-07612" mdate="2026-01-28">
<author pid="402/0024">Kairong Luo</author>
<author pid="280/0130">Zhenbo Sun</author>
<author pid="198/2155">Xinyu Shi</author>
<author pid="156/3418">Shengqi Chen 0001</author>
<author pid="95/10266-3">Bowen Yu 0003</author>
<author pid="208/7906">Yunyi Chen</author>
<author pid="423/7113">Chenyi Dang</author>
<author pid="278/9862">Hengtao Tao</author>
<author pid="39/721">Hui Wang</author>
<author pid="31/6019">Fangming Liu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="60/810">Wenguang Chen</author>
<title>PCMind-2.1-Kaiyuan-2B Technical Report.</title>
<year>2025</year>
<month>December</month>
<volume>abs/2512.07612</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2512.07612</ee>
<url>db/journals/corr/corr2512.html#abs-2512-07612</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2512-24503" mdate="2026-01-26">
<author pid="329/4974">Jiachen T. Wang</author>
<author pid="75/5056">Tong Wu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="72/8399">James Zou 0001</author>
<author pid="s/DXSong">Dawn Song</author>
<author pid="147/5355-1">Ruoxi Jia 0001</author>
<author pid="39/6266">Prateek Mittal</author>
<title>Can Small Training Runs Reliably Guide Data Curation? Rethinking Proxy-Model Practice.</title>
<year>2025</year>
<month>December</month>
<volume>abs/2512.24503</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2512.24503</ee>
<url>db/journals/corr/corr2512.html#abs-2512-24503</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><inproceedings key="conf/iclr/GuLAZH24" mdate="2024-08-07">
<author pid="294/6587">Xinran Gu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="220/5559">Jingzhao Zhang</author>
<author pid="79/7077">Longbo Huang</author>
<title>A Quadratic Synchronization Rule for Distributed Deep Learning.</title>
<year>2024</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=yroyhkhWS6</ee>
<crossref>conf/iclr/2024</crossref>
<url>db/conf/iclr/iclr2024.html#GuLAZH24</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/LyuJ0DL024" mdate="2024-08-07">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="276/0406">Jikai Jin</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="176/5602">Simon Shaolei Du</author>
<author pid="88/3262">Jason D. Lee</author>
<author pid="52/173-14">Wei Hu 0014</author>
<title>Dichotomy of Early and Late Phase Implicit Biases Can Provably Induce Grokking.</title>
<year>2024</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=XsHqr9dEGH</ee>
<crossref>conf/iclr/2024</crossref>
<url>db/conf/iclr/iclr2024.html#LyuJ0DL024</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/WangM0L024" mdate="2024-08-07">
<author pid="153/0092">Runzhe Wang</author>
<author pid="176/9810">Sadhika Malladi</author>
<author pid="296/3981">Tianhao Wang 0017</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<title>The Marginal Value of Momentum for Small Learning Rate SGD.</title>
<year>2024</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=3JjJezzVkT</ee>
<crossref>conf/iclr/2024</crossref>
<url>db/conf/iclr/iclr2024.html#WangM0L024</url>
</inproceedings>
</r>
<r><inproceedings key="conf/iclr/ZhouLRMRKKA24" mdate="2024-08-07">
<author pid="322/0479">Yongchao Zhou</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="69/8761">Ankit Singh Rawat</author>
<author pid="89/3514">Aditya Krishna Menon</author>
<author pid="97/4479">Afshin Rostamizadeh</author>
<author pid="66/6804">Sanjiv Kumar</author>
<author pid="244/2146">Jean-Fran&#231;ois Kagy</author>
<author pid="210/6453">Rishabh Agarwal</author>
<title>DistillSpec: Improving Speculative Decoding via Knowledge Distillation.</title>
<year>2024</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=rsY6J3ZaTF</ee>
<crossref>conf/iclr/2024</crossref>
<url>db/conf/iclr/iclr2024.html#ZhouLRMRKKA24</url>
</inproceedings>
</r>
<r><inproceedings key="conf/nips/LyuZGYGA24" mdate="2025-02-13">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="156/0909">Haoyu Zhao</author>
<author pid="294/6587">Xinran Gu</author>
<author pid="39/578">Dingli Yu</author>
<author pid="172/1039">Anirudh Goyal</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Keeping LLMs Aligned After Fine-tuning: The Crucial Role of Prompt Templates.</title>
<year>2024</year>
<booktitle>NeurIPS</booktitle>
<ee type="oa">http://papers.nips.cc/paper_files/paper/2024/hash/d6f034bb216b472fc7d32ec7aff20342-Abstract-Conference.html</ee>
<crossref>conf/nips/2024</crossref>
<url>db/conf/nips/neurips2024.html#LyuZGYGA24</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2402-05913" mdate="2024-02-14">
<author pid="208/4926">Abhishek Panigrahi</author>
<author pid="199/2236">Nikunj Saunshi</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="243/5898">Sobhan Miryoosefi</author>
<author pid="50/10452">Sashank J. Reddi</author>
<author pid="52/4768">Satyen Kale</author>
<author pid="66/6804">Sanjiv Kumar</author>
<title>Efficient Stagewise Pretraining via Progressive Subnetworks.</title>
<year>2024</year>
<volume>abs/2402.05913</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2402.05913</ee>
<url>db/journals/corr/corr2402.html#abs-2402-05913</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2402-18510" mdate="2024-03-26">
<author pid="322/0395">Kaiyue Wen</author>
<author pid="348/8880">Xingyu Dang</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>RNNs are not Transformers (Yet): The Key Bottleneck on In-context Retrieval.</title>
<year>2024</year>
<volume>abs/2402.18510</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2402.18510</ee>
<url>db/journals/corr/corr2402.html#abs-2402-18510</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2402-18540" mdate="2024-03-26">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="156/0909">Haoyu Zhao</author>
<author pid="294/6587">Xinran Gu</author>
<author pid="39/578">Dingli Yu</author>
<author pid="172/1039">Anirudh Goyal</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Keeping LLMs Aligned After Fine-tuning: The Crucial Role of Prompt Templates.</title>
<year>2024</year>
<volume>abs/2402.18540</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2402.18540</ee>
<url>db/journals/corr/corr2402.html#abs-2402-18540</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2406-05946" mdate="2024-07-13">
<author pid="274/2321">Xiangyu Qi</author>
<author pid="270/1582">Ashwinee Panda</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="35/573-10">Xiao Ma 0010</author>
<author pid="73/8829">Subhrajit Roy</author>
<author pid="41/9367">Ahmad Beirami</author>
<author pid="39/6266">Prateek Mittal</author>
<author pid="h/PeterHenderson2">Peter Henderson 0002</author>
<title>Safety Alignment Should Be Made More Than Just a Few Tokens Deep.</title>
<year>2024</year>
<volume>abs/2406.05946</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2406.05946</ee>
<url>db/journals/corr/corr2406.html#abs-2406-05946</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2407-21009" mdate="2025-04-10">
<author pid="304/8089">Vedant Shah</author>
<author pid="39/578">Dingli Yu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="305/6680-2">Simon Park 0002</author>
<author pid="120/5291">Nan Rosemary Ke</author>
<author pid="m/MichaelCMozer">Michael Mozer</author>
<author pid="56/953">Yoshua Bengio</author>
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="172/1039">Anirudh Goyal</author>
<title>AI-Assisted Generation of Difficult Math Questions.</title>
<year>2024</year>
<volume>abs/2407.21009</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2407.21009</ee>
<url>db/journals/corr/corr2407.html#abs-2407-21009</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2410-10322" mdate="2024-11-25">
<author pid="244/9096">Binghui Li</author>
<author pid="387/3522">Zhixuan Pan</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="33/5448">Jian Li</author>
<title>Feature Averaging: An Implicit Bias of Gradient Descent Leading to Non-Robustness in Neural Networks.</title>
<year>2024</year>
<volume>abs/2410.10322</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2410.10322</ee>
<url>db/journals/corr/corr2410.html#abs-2410-10322</url>
<stream>streams/journals/corr</stream>
</article>
</r>
<r><inproceedings key="conf/iclr/GuLHA23" mdate="2024-07-24">
<author pid="294/6587">Xinran Gu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="79/7077">Longbo Huang</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Why (and When) does Local SGD Generalize Better than SGD?</title>
<year>2023</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=svCcui6Drl</ee>
<crossref>conf/iclr/2023</crossref>
<url>db/conf/iclr/iclr2023.html#GuLHA23</url>
</inproceedings>
</r>
<r><inproceedings key="conf/icml/Jin0LDL23" mdate="2024-10-06">
<author pid="276/0406">Jikai Jin</author>
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="176/5602">Simon Shaolei Du</author>
<author pid="88/3262">Jason D. Lee</author>
<title>Understanding Incremental Learning of Gradient Descent: A Fine-grained Analysis of Matrix Sensing.</title>
<pages>15200-15238</pages>
<year>2023</year>
<booktitle>ICML</booktitle>
<ee type="oa">https://proceedings.mlr.press/v202/jin23a.html</ee>
<crossref>conf/icml/2023</crossref>
<url>db/conf/icml/icml2023.html#Jin0LDL23</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2301-11500" mdate="2023-01-31">
<author pid="276/0406">Jikai Jin</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="176/5602">Simon S. Du</author>
<author pid="88/3262">Jason D. Lee</author>
<title>Understanding Incremental Learning of Gradient Descent: A Fine-grained Analysis of Matrix Sensing.</title>
<year>2023</year>
<volume>abs/2301.11500</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2301.11500</ee>
<url>db/journals/corr/corr2301.html#abs-2301-11500</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2303-01215" mdate="2023-03-06">
<author pid="294/6587">Xinran Gu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="79/7077">Longbo Huang</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Why (and When) does Local SGD Generalize Better than SGD?</title>
<year>2023</year>
<volume>abs/2303.01215</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2303.01215</ee>
<url>db/journals/corr/corr2303.html#abs-2303-01215</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2307-15196" mdate="2024-10-06">
<author pid="153/0092">Runzhe Wang</author>
<author pid="176/9810">Sadhika Malladi</author>
<author pid="296/3981">Tianhao Wang 0017</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<title>The Marginal Value of Momentum for Small Learning Rate SGD.</title>
<year>2023</year>
<volume>abs/2307.15196</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2307.15196</ee>
<url>db/journals/corr/corr2307.html#abs-2307-15196</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2310-08461" mdate="2023-10-25">
<author pid="322/0479">Yongchao Zhou</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="69/8761">Ankit Singh Rawat</author>
<author pid="89/3514">Aditya Krishna Menon</author>
<author pid="97/4479">Afshin Rostamizadeh</author>
<author pid="66/6804">Sanjiv Kumar</author>
<author pid="244/2146">Jean-Fran&#231;ois Kagy</author>
<author pid="210/6453">Rishabh Agarwal</author>
<title>DistillSpec: Improving Speculative Decoding via Knowledge Distillation.</title>
<year>2023</year>
<volume>abs/2310.08461</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2310.08461</ee>
<url>db/journals/corr/corr2310.html#abs-2310-08461</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2310-14423" mdate="2023-10-30">
<author pid="294/6587">Xinran Gu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="220/5559">Jingzhao Zhang</author>
<author pid="79/7077">Longbo Huang</author>
<title>A Quadratic Synchronization Rule for Distributed Deep Learning.</title>
<year>2023</year>
<volume>abs/2310.14423</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2310.14423</ee>
<url>db/journals/corr/corr2310.html#abs-2310-14423</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2311-18817" mdate="2024-10-06">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="276/0406">Jikai Jin</author>
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="176/5602">Simon S. Du</author>
<author pid="88/3262">Jason D. Lee</author>
<author pid="52/173-14">Wei Hu 0014</author>
<title>Dichotomy of Early and Late Phase Implicit Biases Can Provably Induce Grokking.</title>
<year>2023</year>
<volume>abs/2311.18817</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2311.18817</ee>
<url>db/journals/corr/corr2311.html#abs-2311-18817</url>
</article>
</r>
<r><inproceedings key="conf/nips/GuptaSYLA22" mdate="2024-01-08">
<author pid="172/3915">Arushi Gupta</author>
<author pid="199/2236">Nikunj Saunshi</author>
<author pid="39/578">Dingli Yu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>New Definitions and Evaluations for Saliency Methods: Staying Intrinsic, Complete and Sound.</title>
<year>2022</year>
<crossref>conf/nips/2022</crossref>
<booktitle>NeurIPS</booktitle>
<ee type="oa">http://papers.nips.cc/paper_files/paper/2022/hash/d6383e7643415842b48a5077a1b09c98-Abstract-Conference.html</ee>
<url>db/conf/nips/neurips2022.html#GuptaSYLA22</url>
</inproceedings>
</r>
<r><inproceedings key="conf/nips/Lyu0A22" mdate="2024-01-08">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Understanding the Generalization Benefit of Normalization Layers: Sharpness Reduction.</title>
<year>2022</year>
<crossref>conf/nips/2022</crossref>
<booktitle>NeurIPS</booktitle>
<ee type="oa">http://papers.nips.cc/paper_files/paper/2022/hash/dffd1c523512e557f4e75e8309049213-Abstract-Conference.html</ee>
<url>db/conf/nips/neurips2022.html#Lyu0A22</url>
</inproceedings>
</r>
<r><inproceedings key="conf/nips/MalladiLPA22" mdate="2024-01-08">
<author pid="176/9810">Sadhika Malladi</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="208/4926">Abhishek Panigrahi</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>On the SDEs and Scaling Rules for Adaptive Gradient Algorithms.</title>
<year>2022</year>
<crossref>conf/nips/2022</crossref>
<booktitle>NeurIPS</booktitle>
<ee type="oa">http://papers.nips.cc/paper_files/paper/2022/hash/32ac710102f0620d0f28d5d05a44fe08-Abstract-Conference.html</ee>
<url>db/conf/nips/neurips2022.html#MalladiLPA22</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2205-10287" mdate="2022-05-23">
<author pid="176/9810">Sadhika Malladi</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="208/4926">Abhishek Panigrahi</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>On the SDEs and Scaling Rules for Adaptive Gradient Algorithms.</title>
<year>2022</year>
<volume>abs/2205.10287</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2205.10287</ee>
<url>db/journals/corr/corr2205.html#abs-2205-10287</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2206-07085" mdate="2022-06-21">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Understanding the Generalization Benefit of Normalization Layers: Sharpness Reduction.</title>
<year>2022</year>
<volume>abs/2206.07085</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2206.07085</ee>
<url>db/journals/corr/corr2206.html#abs-2206-07085</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2211-02912" mdate="2022-11-10">
<author pid="172/3915">Arushi Gupta</author>
<author pid="199/2236">Nikunj Saunshi</author>
<author pid="39/578">Dingli Yu</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>New Definitions and Evaluations for Saliency Methods: Staying Intrinsic, Complete and Sound.</title>
<year>2022</year>
<volume>abs/2211.02912</volume>
<journal>CoRR</journal>
<ee type="oa">https://doi.org/10.48550/arXiv.2211.02912</ee>
<url>db/journals/corr/corr2211.html#abs-2211-02912</url>
</article>
</r>
<r><inproceedings key="conf/iclr/LiLL21" mdate="2024-10-06">
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="70/4804">Yuping Luo</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Towards Resolving the Implicit Bias of Gradient Descent for Matrix Factorization: Greedy Low-Rank Learning.</title>
<year>2021</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=AHOs7Sm5H7R</ee>
<crossref>conf/iclr/2021</crossref>
<url>db/conf/iclr/iclr2021.html#LiLL21</url>
</inproceedings>
</r>
<r><inproceedings key="conf/nips/LyuLWA21" mdate="2024-10-06">
<author pid="220/3283">Kaifeng Lyu</author>
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="153/0092">Runzhe Wang</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Gradient Descent on Two-layer Nets: Margin Maximization and Simplicity Bias.</title>
<pages>12978-12991</pages>
<year>2021</year>
<booktitle>NeurIPS</booktitle>
<ee type="oa">https://proceedings.neurips.cc/paper/2021/hash/6c351da15b5e8a743a21ee96a86e25df-Abstract.html</ee>
<crossref>conf/nips/2021</crossref>
<url>db/conf/nips/neurips2021.html#LyuLWA21</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2110-13905" mdate="2021-10-29">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="153/0092">Runzhe Wang</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Gradient Descent on Two-layer Nets: Margin Maximization and Simplicity Bias.</title>
<year>2021</year>
<volume>abs/2110.13905</volume>
<journal>CoRR</journal>
<ee type="oa">https://arxiv.org/abs/2110.13905</ee>
<url>db/journals/corr/corr2110.html#abs-2110-13905</url>
</article>
</r>
<r><inproceedings key="conf/iclr/LyuL20" mdate="2020-05-12">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="33/5448-15">Jian Li 0015</author>
<title>Gradient Descent Maximizes the Margin of Homogeneous Neural Networks.</title>
<year>2020</year>
<booktitle>ICLR</booktitle>
<ee type="oa">https://openreview.net/forum?id=SJeLIgBKPS</ee>
<crossref>conf/iclr/2020</crossref>
<url>db/conf/iclr/iclr2020.html#LyuL20</url>
</inproceedings>
</r>
<r><inproceedings key="conf/nips/0005LA20" mdate="2024-10-06">
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Reconciling Modern Deep Learning with Traditional Optimization Analyses: The Intrinsic Learning Rate.</title>
<year>2020</year>
<booktitle>NeurIPS</booktitle>
<ee type="oa">https://proceedings.neurips.cc/paper/2020/hash/a7453a5f026fb6831d68bdc9cb0edcae-Abstract.html</ee>
<crossref>conf/nips/2020</crossref>
<url>db/conf/nips/neurips2020.html#0005LA20</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-2010-02916" mdate="2020-10-13">
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="a/SArora">Sanjeev Arora</author>
<title>Reconciling Modern Deep Learning with Traditional Optimization Analyses: The Intrinsic Learning Rate.</title>
<year>2020</year>
<volume>abs/2010.02916</volume>
<journal>CoRR</journal>
<ee type="oa">https://arxiv.org/abs/2010.02916</ee>
<url>db/journals/corr/corr2010.html#abs-2010-02916</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-2012-09839" mdate="2021-01-04">
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="70/4804">Yuping Luo</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Towards Resolving the Implicit Bias of Gradient Descent for Matrix Factorization: Greedy Low-Rank Learning.</title>
<year>2020</year>
<volume>abs/2012.09839</volume>
<journal>CoRR</journal>
<ee type="oa">https://arxiv.org/abs/2012.09839</ee>
<url>db/journals/corr/corr2012.html#abs-2012-09839</url>
</article>
</r>
<r><inproceedings key="conf/iclr/AroraLL19" mdate="2024-10-06">
<author pid="a/SArora">Sanjeev Arora</author>
<author orcid="0000-0001-8446-0319" pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Theoretical Analysis of Auto Rate-Tuning by Batch Normalization.</title>
<year>2019</year>
<booktitle>ICLR (Poster)</booktitle>
<ee type="oa">https://openreview.net/forum?id=rkxQ-nA9FX</ee>
<crossref>conf/iclr/2019</crossref>
<url>db/conf/iclr/iclr2019.html#AroraLL19</url>
</inproceedings>
</r>
<r><inproceedings key="conf/soda/ChenGLRR19" mdate="2021-07-15">
<author pid="116/0948-1">Lijie Chen 0001</author>
<author pid="g/ShafiGoldwasser">Shafi Goldwasser</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="00/6232">Guy N. Rothblum</author>
<author pid="11/10308">Aviad Rubinstein</author>
<title>Fine-grained Complexity Meets IP = PSPACE.</title>
<pages>1-20</pages>
<year>2019</year>
<booktitle>SODA</booktitle>
<ee type="oa">https://doi.org/10.1137/1.9781611975482.1</ee>
<ee>https://dl.acm.org/citation.cfm?id=3310436</ee>
<crossref>conf/soda/2019</crossref>
<url>db/conf/soda/soda2019.html#ChenGLRR19</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-1906-05890" mdate="2020-05-12">
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="33/5448-15">Jian Li 0015</author>
<title>Gradient Descent Maximizes the Margin of Homogeneous Neural Networks.</title>
<year>2019</year>
<volume>abs/1906.05890</volume>
<journal>CoRR</journal>
<ee type="oa">http://arxiv.org/abs/1906.05890</ee>
<url>db/journals/corr/corr1906.html#abs-1906-05890</url>
</article>
</r>
<r><inproceedings key="conf/icalp/DuanLX18" mdate="2018-08-23">
<author pid="90/5794">Ran Duan</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="222/3274">Yuanhang Xie</author>
<title>Single-Source Bottleneck Path Algorithm Faster than Sorting for Sparse Graphs.</title>
<pages>43:1-43:14</pages>
<year>2018</year>
<booktitle>ICALP</booktitle>
<ee type="oa">https://doi.org/10.4230/LIPIcs.ICALP.2018.43</ee>
<crossref>conf/icalp/2018</crossref>
<url>db/conf/icalp/icalp2018.html#DuanLX18</url>
</inproceedings>
</r>
<r><article publtype="informal" key="journals/corr/abs-1805-02351" mdate="2021-02-05">
<author pid="116/0948-1">Lijie Chen 0001</author>
<author pid="g/ShafiGoldwasser">Shafi Goldwasser</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="00/6232">Guy N. Rothblum</author>
<author pid="11/10308">Aviad Rubinstein</author>
<title>Fine-grained Complexity Meets IP = PSPACE.</title>
<year>2018</year>
<volume>abs/1805.02351</volume>
<journal>CoRR</journal>
<ee type="oa">http://arxiv.org/abs/1805.02351</ee>
<url>db/journals/corr/corr1805.html#abs-1805-02351</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-1808-10658" mdate="2018-09-03">
<author pid="90/5794">Ran Duan</author>
<author pid="220/3283">Kaifeng Lyu</author>
<author pid="224/0209">Hongxun Wu</author>
<author pid="222/3274">Yuanhang Xie</author>
<title>Single-Source Bottleneck Path Algorithm Faster than Sorting for Sparse Graphs.</title>
<year>2018</year>
<volume>abs/1808.10658</volume>
<journal>CoRR</journal>
<ee type="oa">http://arxiv.org/abs/1808.10658</ee>
<url>db/journals/corr/corr1808.html#abs-1808-10658</url>
</article>
</r>
<r><article publtype="informal" key="journals/corr/abs-1812-03981" mdate="2019-11-25">
<author pid="a/SArora">Sanjeev Arora</author>
<author pid="39/7780-5">Zhiyuan Li 0005</author>
<author pid="220/3283">Kaifeng Lyu</author>
<title>Theoretical Analysis of Auto Rate-Tuning by Batch Normalization.</title>
<year>2018</year>
<volume>abs/1812.03981</volume>
<journal>CoRR</journal>
<ee type="oa">http://arxiv.org/abs/1812.03981</ee>
<url>db/journals/corr/corr1812.html#abs-1812-03981</url>
</article>
</r>
<coauthors n="110" nc="1">
<co c="0"><na f="a/Agarwal:Rishabh" pid="210/6453">Rishabh Agarwal</na></co>
<co c="0"><na f="a/Arora:Sanjeev" pid="a/SArora">Sanjeev Arora</na></co>
<co c="0"><na f="b/Beirami:Ahmad" pid="41/9367">Ahmad Beirami</na></co>
<co c="0"><na f="b/Bengio:Yoshua" pid="56/953">Yoshua Bengio</na></co>
<co c="0"><na f="c/Chang:Ruinian" pid="352/4488">Ruinian Chang</na></co>
<co c="0"><na f="c/Chen_0026:Kai" pid="181/2839-26">Kai Chen 0026</na></co>
<co c="0"><na f="c/Chen_0001:Lijie" pid="116/0948-1">Lijie Chen 0001</na></co>
<co c="0"><na f="c/Chen_0001:Shengqi" pid="156/3418">Shengqi Chen 0001</na></co>
<co c="0"><na f="c/Chen:Wenguang" pid="60/810">Wenguang Chen</na></co>
<co c="0"><na f="c/Chen:Yunyi" pid="208/7906">Yunyi Chen</na></co>
<co c="0" n="2"><na f="c/Cui:Jia=Rui" pid="23/11478">Jia-Rui Cui</na><na>Jiarui Cui</na></co>
<co c="0"><na f="d/Dang:Chenyi" pid="423/7113">Chenyi Dang</na></co>
<co c="0"><na f="d/Dang:Xingyu" pid="348/8880">Xingyu Dang</na></co>
<co c="0"><na f="d/Dong:Yiqi" pid="344/8335">Yiqi Dong</na></co>
<co c="0" n="2"><na f="d/Du:Simon_S=" pid="176/5602">Simon S. Du</na><na>Simon Shaolei Du</na></co>
<co c="0"><na f="d/Duan:Haodong" pid="211/7919">Haodong Duan</na></co>
<co c="0"><na f="d/Duan:Ran" pid="90/5794">Ran Duan</na></co>
<co c="0"><na f="f/Fu:Wei" pid="26/4472">Wei Fu</na></co>
<co c="0"><na f="g/Gao:Jiaxuan" pid="304/2243">Jiaxuan Gao</na></co>
<co c="0"><na f="g/Gao_0002:Junyao" pid="67/6659-2">Junyao Gao 0002</na></co>
<co c="0"><na f="g/Goldwasser:Shafi" pid="g/ShafiGoldwasser">Shafi Goldwasser</na></co>
<co c="0"><na f="g/Goyal:Anirudh" pid="172/1039">Anirudh Goyal</na></co>
<co c="0"><na f="g/Gu:Xinran" pid="294/6587">Xinran Gu</na></co>
<co c="0"><na f="g/Gupta:Arushi" pid="172/3915">Arushi Gupta</na></co>
<co c="0"><na f="h/Henderson_0002:Peter" pid="h/PeterHenderson2">Peter Henderson 0002</na></co>
<co c="0"><na f="h/Hu:Shengding" pid="268/5534">Shengding Hu</na></co>
<co c="0"><na f="h/Hu_0014:Wei" pid="52/173-14">Wei Hu 0014</na></co>
<co c="0"><na f="h/Huang_0001:Gao" pid="120/2687-1">Gao Huang 0001</na></co>
<co c="0"><na f="h/Huang:Longbo" pid="79/7077">Longbo Huang</na></co>
<co c="0"><na f="j/Jia_0001:Ruoxi" pid="147/5355-1">Ruoxi Jia 0001</na></co>
<co c="0"><na f="j/Jiang:Xitai" pid="401/8285">Xitai Jiang</na></co>
<co c="0"><na f="j/Jin:Jikai" pid="276/0406">Jikai Jin</na></co>
<co c="0"><na f="k/Kagy:Jean=Fran=ccedil=ois" pid="244/2146">Jean-Fran&#231;ois Kagy</na></co>
<co c="0"><na f="k/Kale:Satyen" pid="52/4768">Satyen Kale</na></co>
<co c="0"><na f="k/Ke:Nan_Rosemary" pid="120/5291">Nan Rosemary Ke</na></co>
<co c="0"><na f="k/Kumar:Sanjiv" pid="66/6804">Sanjiv Kumar</na></co>
<co c="0"><na f="l/Lee:Jason_D=" pid="88/3262">Jason D. Lee</na></co>
<co c="0"><na f="l/Li:Binghui" pid="244/9096">Binghui Li</na></co>
<co c="0"><na f="l/Li:He" pid="05/4746">He Li</na></co>
<co c="0"><na f="l/Li:Jian" pid="33/5448">Jian Li</na></co>
<co c="0"><na f="l/Li_0015:Jian" pid="33/5448-15">Jian Li 0015</na></co>
<co c="0"><na f="l/Li_0015:Jiazheng" pid="400/4254">Jiazheng Li 0015</na></co>
<co c="0"><na f="l/Li:Xinghan" pid="158/3530">Xinghan Li</na></co>
<co c="0"><na f="l/Li_0005:Zhiyuan" pid="39/7780-5">Zhiyuan Li 0005</na></co>
<co c="0"><na f="l/Liu:Fangming" pid="31/6019">Fangming Liu</na></co>
<co c="0"><na f="l/Liu:Wenran" pid="216/2664">Wenran Liu</na></co>
<co c="0"><na f="l/Liu_0001:Zhiyuan" pid="53/3245-1">Zhiyuan Liu 0001</na></co>
<co c="0"><na f="l/Lu_0001:Rui" pid="33/2451-1">Rui Lu 0001</na></co>
<co c="0"><na f="l/Luo:Kairong" pid="402/0024">Kairong Luo</na></co>
<co c="0"><na f="l/Luo:Yuping" pid="70/4804">Yuping Luo</na></co>
<co c="0"><na f="m/Ma_0010:Xiao" pid="35/573-10">Xiao Ma 0010</na></co>
<co c="0"><na f="m/Malladi:Sadhika" pid="176/9810">Sadhika Malladi</na></co>
<co c="0"><na f="m/Medvedev:Marko" pid="387/1895">Marko Medvedev</na></co>
<co c="0"><na f="m/Mei:Zhiyu" pid="299/5277">Zhiyu Mei</na></co>
<co c="0"><na f="m/Menon:Aditya_Krishna" pid="89/3514">Aditya Krishna Menon</na></co>
<co c="0"><na f="m/Miryoosefi:Sobhan" pid="243/5898">Sobhan Miryoosefi</na></co>
<co c="0"><na f="m/Mittal:Prateek" pid="39/6266">Prateek Mittal</na></co>
<co c="0" n="2"><na f="m/Mozer:Michael_C=" pid="m/MichaelCMozer">Michael C. Mozer</na><na>Michael Mozer</na></co>
<co c="0"><na f="p/Pan:Zhixuan" pid="387/3522">Zhixuan Pan</na></co>
<co c="0"><na f="p/Panda:Ashwinee" pid="270/1582">Ashwinee Panda</na></co>
<co c="0"><na f="p/Panigrahi:Abhishek" pid="208/4926">Abhishek Panigrahi</na></co>
<co c="0"><na f="p/Park_0002:Simon" pid="305/6680-2">Simon Park 0002</na></co>
<co c="0"><na f="q/Qi:Xiangyu" pid="274/2321">Xiangyu Qi</na></co>
<co c="0"><na f="r/Rawat:Ankit_Singh" pid="69/8761">Ankit Singh Rawat</na></co>
<co c="0"><na f="r/Reddi:Sashank_J=" pid="50/10452">Sashank J. Reddi</na></co>
<co c="0"><na f="r/Rostamizadeh:Afshin" pid="97/4479">Afshin Rostamizadeh</na></co>
<co c="0"><na f="r/Rothblum:Guy_N=" pid="00/6232">Guy N. Rothblum</na></co>
<co c="0"><na f="r/Roy:Subhrajit" pid="73/8829">Subhrajit Roy</na></co>
<co c="0"><na f="r/Rubinstein:Aviad" pid="11/10308">Aviad Rubinstein</na></co>
<co c="0"><na f="s/Saunshi:Nikunj" pid="199/2236">Nikunj Saunshi</na></co>
<co c="0"><na f="s/Shah:Vedant" pid="304/8089">Vedant Shah</na></co>
<co c="0"><na f="s/Shi:Xinyu" pid="198/2155">Xinyu Shi</na></co>
<co c="0"><na f="s/Song:Dawn" pid="s/DXSong">Dawn Song</na></co>
<co c="0"><na f="s/Srebro:Nathan" pid="50/3633">Nathan Srebro</na></co>
<co c="0"><na f="s/Sun_0001:Maosong" pid="95/3291">Maosong Sun 0001</na></co>
<co c="0"><na f="s/Sun_0005:Yanan" pid="44/8711-5">Yanan Sun 0005</na></co>
<co c="0"><na f="s/Sun:Yuebo" pid="393/4439">Yuebo Sun</na></co>
<co c="0"><na f="s/Sun:Zhenbo" pid="280/0130">Zhenbo Sun</na></co>
<co c="0"><na f="t/Tan:Qixin" pid="391/3109">Qixin Tan</na></co>
<co c="0"><na f="t/Tang:Kexian" pid="380/7639">Kexian Tang</na></co>
<co c="0"><na f="t/Tao:Hengtao" pid="278/9862">Hengtao Tao</na></co>
<co c="0"><na f="w/Wang:Hui" pid="39/721">Hui Wang</na></co>
<co c="0"><na f="w/Wang:Jiachen_T=" pid="329/4974">Jiachen T. Wang</na></co>
<co c="0"><na f="w/Wang:Jiani" pid="156/8812">Jiani Wang</na></co>
<co c="0"><na f="w/Wang_0001:Mengdi" pid="64/10471-1">Mengdi Wang 0001</na></co>
<co c="0"><na f="w/Wang:Runzhe" pid="153/0092">Runzhe Wang</na></co>
<co c="0"><na f="w/Wang:Shaowen" pid="07/852">Shaowen Wang</na></co>
<co c="0"><na f="w/Wang_0017:Tianhao" pid="296/3981">Tianhao Wang 0017</na></co>
<co c="0"><na f="w/Wang:Xiaohan" pid="73/1307">Xiaohan Wang</na></co>
<co c="0"><na f="w/Wen:Haodong" pid="375/1042">Haodong Wen</na></co>
<co c="0"><na f="w/Wen:Kaiyue" pid="322/0395">Kaiyue Wen</na></co>
<co c="0"><na f="w/Wu:Hongxun" pid="224/0209">Hongxun Wu</na></co>
<co c="0"><na f="w/Wu:Tong" pid="75/5056">Tong Wu</na></co>
<co c="0"><na f="w/Wu_0013:Yi" pid="44/3684-13">Yi Wu 0013</na></co>
<co c="0"><na f="x/Xie:Yuanhang" pid="222/3274">Yuanhang Xie</na></co>
<co c="0"><na f="x/Xing:Zhening" pid="276/3671">Zhening Xing</na></co>
<co c="0"><na f="x/Xu:Shusheng" pid="121/0926">Shusheng Xu</na></co>
<co c="0"><na f="y/Yan:Shu" pid="44/5507">Shu Yan</na></co>
<co c="0"><na f="y/Yan:Tingkai" pid="349/4767">Tingkai Yan</na></co>
<co c="0"><na f="y/Yang:Lu" pid="58/2893">Lu Yang</na></co>
<co c="0"><na f="y/Yeung=Levy:Serena" pid="147/5023">Serena Yeung-Levy</na></co>
<co c="0"><na f="y/Yu_0003:Bowen" pid="95/10266-3">Bowen Yu 0003</na></co>
<co c="0"><na f="y/Yu:Dingli" pid="39/578">Dingli Yu</na></co>
<co c="0"><na f="z/Zeng:Yanhong" pid="215/4033">Yanhong Zeng</na></co>
<co c="0"><na f="z/Zhang:Jingzhao" pid="220/5559">Jingzhao Zhang</na></co>
<co c="0"><na f="z/Zhang:Yuhui" pid="77/1630">Yuhui Zhang</na></co>
<co c="0"><na f="z/Zhao:Haoyu" pid="156/0909">Haoyu Zhao</na></co>
<co c="0"><na f="z/Zhou:Yongchao" pid="322/0479">Yongchao Zhou</na></co>
<co c="0"><na f="z/Zhu:Tansheng" pid="401/0605">Tansheng Zhu</na></co>
<co c="0"><na f="z/Zou_0001:James" pid="72/8399">James Zou 0001</na></co>
</coauthors>
</dblpperson>

