{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T03:59:16Z","timestamp":1769745556311,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3729945","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:18:36Z","timestamp":1752455916000},"page":"884-894","update-policy":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Question-Answering Dense Video Events"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0009-0009-0883-9920","authenticated-orcid":false,"given":"Hangyu","family":"Qin","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0001-5573-6195","authenticated-orcid":false,"given":"Junbin","family":"Xiao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/orcid.org\/0000-0001-7418-6141","authenticated-orcid":false,"given":"Angela","family":"Yao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Bai Ziyi","year":"2024","unstructured":"Ziyi Bai, Ruiping Wang, and Xilin Chen. 2024. Glance and focus: Memory prompting for multi-event video question answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"ECAI","author":"Chen Haoran","year":"2020","unstructured":"Haoran Chen, Jianmin Li, and Xiaolin Hu. 2020. Delving deeper into the decoder for video captioning. In ECAI 2020. IOS Press, 1079--1086."},{"key":"e_1_3_2_1_4_1","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et al. 2024. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16203"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. 49250--49267","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: towards general-purpose vision-language models with instruction tuning. In Proceedings of the 37th International Conference on Neural Information Processing Systems. 49250--49267."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"e_1_3_2_1_8_1","unstructured":"Chaoyou Fu Yuhan Dai Yondong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et al. 2024. Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Google. 2024. Introducing Gemini 2.0: our new AI model for the agentic era. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/blog.google\/technology\/google-deepmind\/google-gemini-ai-update-december-2024\/"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"e_1_3_2_1_11_1","volume-title":"An image grid can be worth a video: Zero-shot video question answering using a vlm. arXiv preprint arXiv:2403.18406","author":"Kim Wonkyun","year":"2024","unstructured":"Wonkyun Kim, Changin Choi, Wonseok Lee, and Wonjong Rhee. 2024. An image grid can be worth a video: Zero-shot video question answering using a vlm. arXiv preprint arXiv:2403.18406 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1167"},{"key":"e_1_3_2_1_15_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al. 2024c. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_18_1","volume-title":"European Conference on Computer Vision.","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024a. LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models. European Conference on Computer Vision."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_2_1_20_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"e_1_3_2_1_22_1","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge.","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. 2024a. Llava-next: Improved reasoning, ocr, and world knowledge."},{"key":"e_1_3_2_1_23_1","volume-title":"European Conference on Computer Vision. Springer, 92--107","author":"Liu Huabin","year":"2024","unstructured":"Huabin Liu, Xiao Ma, Cheng Zhong, Yang Zhang, and Weiyao Lin. 2024b. Timecraft: Navigate weakly-supervised temporal grounded video question answering via bi-directional reasoning. In European Conference on Computer Vision. Springer, 92--107."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3284038"},{"key":"e_1_3_2_1_25_1","volume-title":"Vista-llama: Reliable video narrator via equal distance to visual tokens. arXiv preprint arXiv:2312.08870","author":"Ma Fan","year":"2023","unstructured":"Fan Ma, Xiaojie Jin, Heng Wang, Yuchen Xian, Jiashi Feng, and Yi Yang. 2023. Vista-llama: Reliable video narrator via equal distance to visual tokens. arXiv preprint arXiv:2312.08870 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_2_1_27_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Mangalam Karttikeya","year":"2024","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2024. Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01428"},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2024a. GPT-4. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/openai.com\/gpt-4"},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2024b. Hello GPT-4o. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_33_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Patraucean Viorica","year":"2024","unstructured":"Viorica Patraucean, Lucas Smaira, Ankush Gupta, Adria Recasens, Larisa Markeeva, Dylan Banarse, Skanda Koppula, Mateusz Malinowski, Yi Yang, Carl Doersch, et al. 2024. Perception test: A diagnostic benchmark for multimodal video models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Streaming long video understanding with large language models. arXiv preprint arXiv:2405.16009","author":"Qian Rui","year":"2024","unstructured":"Rui Qian, Xiaoyi Dong, Pan Zhang, Yuhang Zang, Shuangrui Ding, Dahua Lin, and Jiaqi Wang. 2024. Streaming long video understanding with large language models. arXiv preprint arXiv:2405.16009 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.544"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28363"},{"key":"e_1_3_2_1_41_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00443"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"e_1_3_2_1_44_1","volume-title":"InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation. In The Twelfth International Conference on Learning Representations.","author":"Wang Yi","year":"2023","unstructured":"Yi Wang, Yinan He, Yizhuo Li, Kunchang Li, Jiashuo Yu, Xin Ma, Xinhao Li, Guo Chen, Xinyuan Chen, Yaohui Wang, et al. 2023. InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_45_1","volume-title":"VideoQA in the Era of LLMs: An Empirical Study. arXiv preprint arXiv:2408.04223","author":"Xiao Junbin","year":"2024","unstructured":"Junbin Xiao, Nanxin Huang, Hangyu Qin, Dongyang Li, Yicong Li, Fengbin Zhu, Zhulin Tao, Jianxing Yu, Liang Lin, Tat-Seng Chua, and Angela Yao. 2024a. VideoQA in the Era of LLMs: An Empirical Study. arXiv preprint arXiv:2408.04223 (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01254"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_3"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_1_50_1","volume-title":"See Kiong Ng, and Jiashi Feng","author":"Xu Lin","year":"2024","unstructured":"Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, and Jiashi Feng. 2024b. Pllava: Parameter-free llava extension from images to videos for video dense captioning. arXiv preprint arXiv:2404.16994 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.176"},{"key":"e_1_3_2_1_52_1","first-page":"124","article-title":"Zero-shot video question answering via frozen bidirectional language models","volume":"35","author":"Yang Antoine","year":"2022","unstructured":"Antoine Yang, Antoine Miech, Josef Sivic, Ivan Laptev, and Cordelia Schmid. 2022. Zero-shot video question answering via frozen bidirectional language models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 124--141.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080655"},{"key":"e_1_3_2_1_56_1","volume-title":"Cross-Modal Reasoning with Event Correlation for Video Question Answering. arXiv preprint arXiv:2312.12721","author":"Yin Chengxiang","year":"2023","unstructured":"Chengxiang Yin, Zhengping Che, Kun Wu, Zhiyuan Xu, Qinru Qiu, and Jian Tang. 2023. Cross-Modal Reasoning with Event Correlation for Video Question Answering. arXiv preprint arXiv:2312.12721 (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1137"},{"key":"e_1_3_2_1_58_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yu Shoubin","year":"2024","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2024a. Self-chained image-language model for video localization and question answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_1_60_1","volume-title":"Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language. In The Eleventh International Conference on Learning Representations.","author":"Zeng Andy","unstructured":"Andy Zeng, Maria Attarian, Krzysztof Marcin Choromanski, Adrian Wong, Stefan Welker, Federico Tombari, Aveek Purohit, Michael S Ryoo, Vikas Sindhwani, Johnny Lee, et al. [n.,d.]. Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_63_1","volume-title":"Long context transfer from language to vision. arXiv preprint arXiv:2406.16852","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Kaichen Zhang, Bo Li, Guangtao Zeng, Jingkang Yang, Yuanhan Zhang, Ziyue Wang, Haoran Tan, Chunyuan Li, and Ziwei Liu. 2024c. Long context transfer from language to vision. arXiv preprint arXiv:2406.16852 (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199","author":"Zhang Renrui","year":"2023","unstructured":"Renrui Zhang, Jiaming Han, Chris Liu, Peng Gao, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, and Yu Qiao. 2023a. Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li.","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Bo Li, haotian Liu, Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li. 2024a. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/llava-vl.github.io\/blog\/2024-04--30-llava-next-video\/"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"e_1_3_2_1_67_1","volume-title":"MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding. arXiv preprint arXiv:2406.04264","author":"Zhou Junjie","year":"2024","unstructured":"Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, and Zheng Liu. 2024. MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding. arXiv preprint arXiv:2406.04264 (2024)."}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3729945","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:31:46Z","timestamp":1755887506000},"score":1,"resource":{"primary":{"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/dl.acm.org\/doi\/10.1145\/3726302.3729945"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":67,"alternative-id":["10.1145\/3726302.3729945","10.1145\/3726302"],"URL":"https:\/\/summer-heart-0930.chufeiyun1688.workers.dev:443\/https\/doi.org\/10.1145\/3726302.3729945","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}