{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:09:47Z","timestamp":1750219787199,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,1,4]],"date-time":"2023-01-04T00:00:00Z","timestamp":1672790400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,1,4]]},"DOI":"10.1145\/3570991.3571014","type":"proceedings-article","created":{"date-parts":[[2023,1,5]],"date-time":"2023-01-05T04:13:03Z","timestamp":1672891983000},"page":"10-18","update-policy":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RetroKD : Leveraging Past States for Regularizing Targets in Teacher-Student Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0002-6660-702X","authenticated-orcid":false,"given":"Surgan","family":"Jandial","sequence":"first","affiliation":[{"name":"Adobe, India"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0002-6730-7846","authenticated-orcid":false,"given":"Yash","family":"Khasbage","sequence":"additional","affiliation":[{"name":"Microsoft, India"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0003-4918-2565","authenticated-orcid":false,"given":"Arghya","family":"Pal","sequence":"additional","affiliation":[{"name":"Harvard University, USA"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0002-0366-2427","authenticated-orcid":false,"given":"Balaji","family":"Krishnamurthy","sequence":"additional","affiliation":[{"name":"Adobe, India"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0003-2656-0375","authenticated-orcid":false,"given":"Vineeth N","family":"Balasubramanian","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, IIT Hyderabad, India"}]}],"member":"320","published-online":{"date-parts":[[2023,1,4]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Armen Aghajanyan. 2016. SoftTarget Regularization: An Effective Technique to Reduce Over-Fitting in Neural Networks. arXiv:arXiv:1609.06693","DOI":"10.1109\/CYBConf.2017.7985811"},{"key":"e_1_3_2_1_2_1","volume-title":"Model Compression. In Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining(KDD \u201906)","author":"Bucilua Cristian","year":"2006","unstructured":"Cristian Bucilua, Rich Caruana, and Alexandru Niculescu-Mizil. 2006. Model Compression. In Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining(KDD \u201906). Association for Computing Machinery, 535\u2013541."},{"key":"e_1_3_2_1_3_1","volume-title":"Entropy-SGD: Biasing Gradient Descent Into Wide Valleys. In International Conference on Learning Representations, ICLR","author":"Chaudhari Pratik","year":"2017","unstructured":"Pratik Chaudhari, Anna Choromanska, Stefano Soatto, Yann LeCun, Carlo Baldassi, Christian Borgs, Jennifer Chayes, Levent Sagun, and Riccardo Zecchina. [n. d.]. Entropy-SGD: Biasing Gradient Descent Into Wide Valleys. In International Conference on Learning Representations, ICLR 2017."},{"volume-title":"On the Efficacy of Knowledge Distillation. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4793\u20134801","author":"Cho H.","key":"e_1_3_2_1_5_1","unstructured":"J.\u00a0H. Cho and B. Hariharan. 2019. On the Efficacy of Knowledge Distillation. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4793\u20134801."},{"key":"e_1_3_2_1_6_1","unstructured":"Qianggang Ding Sifan Wu Hao Sun Jiadong Guo and Shu-Tao Xia. 2019. Adaptive Regularization of Labels. arXiv:arXiv:1908.05474"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1088\/0034-4885\/74\/9\/096501"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"T. Fukuda Masayuki Suzuki Gakuto Kurata S. Thomas Jia Cui and B. Ramabhadran. 2017. Efficient Knowledge Distillation from an Ensemble of Teachers. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2017-614"},{"key":"e_1_3_2_1_9_1","volume-title":"Born-Again Neural Networks. In International Conference on Machine Learning, ICML 2018(Proceedings of Machine Learning Research, Vol.\u00a080)","author":"Furlanello Tommaso","year":"2018","unstructured":"Tommaso Furlanello, Zachary\u00a0Chase Lipton, Michael Tschannen, Laurent Itti, and Anima Anandkumar. 2018. Born-Again Neural Networks. In International Conference on Machine Learning, ICML 2018(Proceedings of Machine Learning Research, Vol.\u00a080). PMLR, 1602\u20131611."},{"key":"e_1_3_2_1_10_1","unstructured":"Mengya Gao Yujun Shen Quanquan Li Junjie Yan Liang Wan Dahua Lin Chen\u00a0Change Loy and Xiaoou Tang. 2018. An Embarrassingly Simple Approach for Knowledge Distillation. arXiv:arXiv:1812.01819"},{"key":"e_1_3_2_1_11_1","unstructured":"Sangchul Hahn and Heeyoul Choi. 2019. Self-Knowledge Distillation in Natural Language Processing. In RANLP."},{"key":"e_1_3_2_1_12_1","unstructured":"Haowei He Gao Huang and Yang Yuan. 2019. Asymmetric Valleys: Beyond Sharp and Flat Local Minima. In Advances in Neural Information Processing Systems H.\u00a0Wallach H.\u00a0Larochelle A.\u00a0Beygelzimer F.\u00a0d\u00c1lch\u00e9-Buc E.\u00a0Fox and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates Inc. 2553\u20132564. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/proceedings.neurips.cc\/paper\/2019\/file\/01d8bae291b1e4724443375634ccfa0e-Paper.pdf"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","volume-title":"NIPS Deep Learning and Representation Learning Workshop. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/http\/arxiv.org\/abs\/1503","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. In NIPS Deep Learning and Representation Learning Workshop. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/http\/arxiv.org\/abs\/1503.02531"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.1.1"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403165"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5859"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00143"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Learning Representations, ICLR","author":"Keskar Nitish\u00a0Shirish","year":"2017","unstructured":"Nitish\u00a0Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak\u00a0Peter Tang. 2017. On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima. In International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings."},{"key":"e_1_3_2_1_22_1","volume-title":"Similarity of Neural Network Representations Revisited. In International Conference on Machine Learning, ICML","author":"Kornblith Simon","year":"2019","unstructured":"Simon Kornblith, Mohammad Norouzi, H. Lee, and Geoffrey\u00a0E. Hinton. 2019. Similarity of Neural Network Representations Revisited. In International Conference on Machine Learning, ICML 2019."},{"key":"e_1_3_2_1_23_1","volume-title":"Temporal Ensembling for Semi-Supervised Learning. In International Conference on Learning Representations, ICLR","author":"Laine Samuli","year":"2017","unstructured":"Samuli Laine and Timo Aila. [n. d.]. Temporal Ensembling for Semi-Supervised Learning. In International Conference on Learning Representations, ICLR 2017. arXiv:arXiv:1610.02242"},{"key":"e_1_3_2_1_24_1","unstructured":"Hao Li Zheng Xu Gavin Taylor Christoph Studer and Tom Goldstein. 2018. Visualizing the Loss Landscape of Neural Nets. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_25_1","volume-title":"ResKD: Residual-Guided Knowledge Distillation","author":"Li Xuewei","year":"2021","unstructured":"Xuewei Li, Songyuan Li, Bourahla Omar, Fei Wu, and Xi Li. 2021. ResKD: Residual-Guided Knowledge Distillation. IEEE Transactions on Image Processing(2021)."},{"key":"e_1_3_2_1_26_1","unstructured":"David Lopez-Paz L\u00e9on Bottou Bernhard Sch\u00f6lkopf and Vladimir Vapnik. 2015. Unifying distillation and privileged information. arXiv preprint arXiv:1511.03643(2015)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates","author":"Mobahi Hossein","year":"2020","unstructured":"Hossein Mobahi, Mehrdad Farajtabar, and Peter Bartlett. 2020. Self-Distillation Amplifies Regularization in Hilbert Space. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates, Inc., 3351\u20133361. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/proceedings.neurips.cc\/paper\/2020\/file\/2288f691b58edecadcc9a8691762b4fd-Paper.pdf"},{"key":"e_1_3_2_1_29_1","unstructured":"Ari\u00a0S. Morcos M. Raghu and S. Bengio. 2018. Insights on representational similarity in neural networks with canonical correlation. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_30_1","volume-title":"Triplet Loss for Knowledge Distillation. In International Joint Conference on Neural Networks (IJCNN), 2020","author":"Oki Hideki","year":"2020","unstructured":"Hideki Oki, Motoshi Abe, Junichi Miyao, and Takio Kurita. 2020. Triplet Loss for Knowledge Distillation. In International Joint Conference on Neural Networks (IJCNN), 2020. arXiv:arXiv:2004.08116"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00409"},{"volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","key":"e_1_3_2_1_32_1","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems. 8024\u20138035. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/http\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_33_1","unstructured":"Akshay Rangamani Nam\u00a0H. Nguyen Abhishek Kumar D. Phan S.\u00a0H. Chin and Trac\u00a0D. Tran. 2019. A Scale Invariant Flatness Measure for Deep Network Minima. ArXiv abs\/1902.02434(2019)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA.2015.152"},{"key":"e_1_3_2_1_35_1","volume-title":"FitNets: Hints for Thin Deep Nets. In International Conference on Learning Representations, ICLR","author":"Romero Adriana","year":"2015","unstructured":"Adriana Romero, Nicolas Ballas, Samira\u00a0Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. [n. d.]. FitNets: Hints for Thin Deep Nets. In International Conference on Learning Representations, ICLR 2015, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_1_36_1","unstructured":"Bharat\u00a0Bhusan Sau and Vineeth\u00a0N Balasubramanian. 2016. Deep model compression: Distilling knowledge from noisy teachers. arXiv preprint arXiv:1610.09650(2016)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390273"},{"volume-title":"2015 53rd Annual Allerton Conference on Communication, Control, and Computing (Allerton). 909\u2013910","author":"Shokri R.","key":"e_1_3_2_1_38_1","unstructured":"R. Shokri and V. Shmatikov. 2015. Privacy-preserving deep learning. In 2015 53rd Annual Allerton Conference on Communication, Control, and Computing (Allerton). 909\u2013910."},{"key":"e_1_3_2_1_39_1","unstructured":"Antti Tarvainen and Harri Valpola. 2017. Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results. In Advances in Neural Information Processing Systems. arXiv:arXiv:1703.01780"},{"key":"e_1_3_2_1_40_1","volume-title":"Statistical learning theory new york","author":"Vapnik V","year":"1998","unstructured":"V Vapnik. 1998. Statistical learning theory new york. NY: Wiley (1998)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.31193\/ssap.01.9787509752807"},{"volume-title":"Network Minimization and Transfer Learning. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 7130\u20137138","author":"Yim J.","key":"e_1_3_2_1_42_1","unstructured":"J. Yim, D. Joo, J. Bae, and J. Kim. 2017. A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 7130\u20137138."},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Learning Representations, ICLR","author":"Zagoruyko Sergey","year":"2017","unstructured":"Sergey Zagoruyko and Nikos Komodakis. [n. d.]. Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer. In International Conference on Learning Representations, ICLR 2017. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/arxiv.org\/abs\/1612.03928"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3446776"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"}],"event":{"name":"CODS-COMAD 2023: 6th Joint International Conference on Data Science & Management of Data (10th ACM IKDD CODS and 28th COMAD)","acronym":"CODS-COMAD 2023","location":"Mumbai India"},"container-title":["Proceedings of the 6th Joint International Conference on Data Science &amp; Management of Data (10th ACM IKDD CODS and 28th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/dl.acm.org\/doi\/10.1145\/3570991.3571014","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/dl.acm.org\/doi\/pdf\/10.1145\/3570991.3571014","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:53Z","timestamp":1750178273000},"score":1,"resource":{"primary":{"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/dl.acm.org\/doi\/10.1145\/3570991.3571014"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,4]]},"references-count":43,"alternative-id":["10.1145\/3570991.3571014","10.1145\/3570991"],"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/doi.org\/10.1145\/3570991.3571014","relation":{},"subject":[],"published":{"date-parts":[[2023,1,4]]},"assertion":[{"value":"2023-01-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}