@inproceedings{wiegreffe-etal-2023-increasing,
title = "Increasing Probability Mass on Answer Choices Does Not Always Improve Accuracy",
author = "Wiegreffe, Sarah and
Finlayson, Matthew and
Tafjord, Oyvind and
Clark, Peter and
Sabharwal, Ashish",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.522/",
doi = "10.18653/v1/2023.emnlp-main.522",
pages = "8392--8417",
abstract = "When pretrained language models (LMs) are applied to discriminative tasks such as multiple-choice questions, they place probability mass on vocabulary tokens that aren`t among the given answer choices. Spreading probability mass across multiple surface forms with identical meaning (such as {\textquotedblleft}bath{\textquotedblright} and {\textquotedblleft}bathtub{\textquotedblright}) is thought to cause an underestimation of a model`s true performance, referred to as the {\textquotedblleft}surface form competition{\textquotedblright} (SFC) hypothesis. This has motivated the introduction of various probability normalization methods. However, many core questions remain unanswered. How do we measure SFC? Are there direct ways of reducing it, and does doing so improve task performance? We propose a mathematical formalism for SFC which allows us to quantify and bound its impact for the first time. We identify a simple method for reducing it{---}namely, increasing probability mass on the given answer choices by a) including them in the prompt and b) using in-context learning with even just one example. We show this method eliminates the impact of SFC in the majority of instances. Our experiments on three diverse datasets and six LMs reveal several additional surprising findings. For example, both normalization and prompting methods for reducing SFC can be ineffective or even detrimental to task performance for some LMs. We conclude with practical insights for effectively prompting LMs for multiple-choice tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wiegreffe-etal-2023-increasing">
<titleInfo>
<title>Increasing Probability Mass on Answer Choices Does Not Always Improve Accuracy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Wiegreffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Finlayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Sabharwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When pretrained language models (LMs) are applied to discriminative tasks such as multiple-choice questions, they place probability mass on vocabulary tokens that aren’t among the given answer choices. Spreading probability mass across multiple surface forms with identical meaning (such as “bath” and “bathtub”) is thought to cause an underestimation of a model’s true performance, referred to as the “surface form competition” (SFC) hypothesis. This has motivated the introduction of various probability normalization methods. However, many core questions remain unanswered. How do we measure SFC? Are there direct ways of reducing it, and does doing so improve task performance? We propose a mathematical formalism for SFC which allows us to quantify and bound its impact for the first time. We identify a simple method for reducing it—namely, increasing probability mass on the given answer choices by a) including them in the prompt and b) using in-context learning with even just one example. We show this method eliminates the impact of SFC in the majority of instances. Our experiments on three diverse datasets and six LMs reveal several additional surprising findings. For example, both normalization and prompting methods for reducing SFC can be ineffective or even detrimental to task performance for some LMs. We conclude with practical insights for effectively prompting LMs for multiple-choice tasks.</abstract>
<identifier type="citekey">wiegreffe-etal-2023-increasing</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.522</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.522/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>8392</start>
<end>8417</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Increasing Probability Mass on Answer Choices Does Not Always Improve Accuracy
%A Wiegreffe, Sarah
%A Finlayson, Matthew
%A Tafjord, Oyvind
%A Clark, Peter
%A Sabharwal, Ashish
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wiegreffe-etal-2023-increasing
%X When pretrained language models (LMs) are applied to discriminative tasks such as multiple-choice questions, they place probability mass on vocabulary tokens that aren’t among the given answer choices. Spreading probability mass across multiple surface forms with identical meaning (such as “bath” and “bathtub”) is thought to cause an underestimation of a model’s true performance, referred to as the “surface form competition” (SFC) hypothesis. This has motivated the introduction of various probability normalization methods. However, many core questions remain unanswered. How do we measure SFC? Are there direct ways of reducing it, and does doing so improve task performance? We propose a mathematical formalism for SFC which allows us to quantify and bound its impact for the first time. We identify a simple method for reducing it—namely, increasing probability mass on the given answer choices by a) including them in the prompt and b) using in-context learning with even just one example. We show this method eliminates the impact of SFC in the majority of instances. Our experiments on three diverse datasets and six LMs reveal several additional surprising findings. For example, both normalization and prompting methods for reducing SFC can be ineffective or even detrimental to task performance for some LMs. We conclude with practical insights for effectively prompting LMs for multiple-choice tasks.
%R 10.18653/v1/2023.emnlp-main.522
%U https://aclanthology.org/2023.emnlp-main.522/
%U https://doi.org/10.18653/v1/2023.emnlp-main.522
%P 8392-8417
Markdown (Informal)
[Increasing Probability Mass on Answer Choices Does Not Always Improve Accuracy](https://aclanthology.org/2023.emnlp-main.522/) (Wiegreffe et al., EMNLP 2023)
ACL
Sarah Wiegreffe, Matthew Finlayson, Oyvind Tafjord, Peter Clark, and Ashish Sabharwal. 2023. Increasing Probability Mass on Answer Choices Does Not Always Improve Accuracy. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 8392–8417, Singapore. Association for Computational Linguistics.
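
As an informal illustration (not part of the Anthology record and not the authors' released code), the sketch below shows one way to score multiple-choice answers with a causal LM, with and without the answer choices listed in the prompt — roughly the manipulation described in the abstract. The model name (gpt2), prompt wording, and example question are illustrative assumptions.

```python
# Informal sketch; model name, prompt wording, and example question are
# illustrative assumptions, not taken from the paper. Each answer choice is
# scored by summing the LM's token log-probabilities, with and without the
# choices listed in the prompt.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "gpt2"  # placeholder LM; the paper evaluates six different LMs
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()


def choice_log_prob(prompt: str, choice: str) -> float:
    """Sum of log-probabilities of the choice's tokens, conditioned on the prompt."""
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    choice_ids = tokenizer(" " + choice, return_tensors="pt").input_ids  # leading space for BPE
    input_ids = torch.cat([prompt_ids, choice_ids], dim=1)
    with torch.no_grad():
        logits = model(input_ids).logits
    log_probs = torch.log_softmax(logits, dim=-1)
    # Logits at position i predict the token at position i + 1, so score each
    # choice token against the distribution from the preceding position.
    total = 0.0
    for pos in range(prompt_ids.shape[1], input_ids.shape[1]):
        total += log_probs[0, pos - 1, input_ids[0, pos]].item()
    return total


question = "Where do people typically wash themselves?"
choices = ["bath", "bathtub", "kitchen"]

# Prompt without the answer choices: the setting where surface form competition
# (e.g., "bath" vs. "bathtub") is hypothesized to depress measured accuracy.
plain_prompt = f"Question: {question}\nAnswer:"

# Prompt with the answer choices listed, which the paper reports increases
# probability mass on the given choices.
choice_list = "\n".join(f"- {c}" for c in choices)
listed_prompt = f"Question: {question}\nChoices:\n{choice_list}\nAnswer:"

for label, prompt in [("no choices in prompt", plain_prompt),
                      ("choices in prompt", listed_prompt)]:
    scores = {c: choice_log_prob(prompt, c) for c in choices}
    best = max(scores, key=scores.get)
    print(f"{label}: predicted '{best}'  scores={scores}")
```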