---
---
@misc{ghosh2024compare,
title={Compare without Despair: Reliable Preference Evaluation with Generation Separability},
  author={Sayan Ghosh and Tejas Srinivasan and Swabha Swayamdipta},
year={2024},
abbr={Preprint},
url={},
selected=true,
preview={separability.jpeg},
abstract={Human evaluation of generated language through pairwise preference judgments is pervasive. However, under common scenarios, such as when generations from a model pair are very similar, or when stochastic decoding results in large variations in generations, it results in inconsistent preference ratings. We address these challenges by introducing a meta-evaluation measure, separability, which estimates how suitable a test instance is for pairwise preference evaluation. For a candidate test instance, separability samples multiple generations from a pair of models, and measures how distinguishable the two sets of generations are. Our experiments show that instances with high separability values yield more consistent preference ratings from both human- and auto-raters. Further, the distribution of separability allows insights into which test benchmarks are more valuable for comparing models. Finally, we incorporate separability into ELO ratings, accounting for how suitable each test instance might be for reliably ranking LLMs. Overall, separability has implications for consistent, efficient and robust preference evaluation of LLMs with both human- and auto-raters.},
}
@misc{ranjit2024oath,
title={OATH-Frames: Characterizing Online Attitudes Towards Homelessness via LLM Assistants},
author={Jaspreet Ranjit and Brihi Joshi and Rebecca Dorn and Laura Petry and Olga Koumoundouros and Jayne Bottarini and Peichen Liu and Eric Rice and Swabha Swayamdipta},
year={2024},
abbr={Preprint},
url={https://dill-lab.github.io/oath-frames/},
selected=true,
preview={oath_frames.jpeg},
  abstract={Homelessness in the U.S. is widespread; individual beliefs and attitudes towards homelessness, often expressed on social media, are complex and nuanced (e.g. critical as well as sympathetic). Such attitudes can be challenging to summarize at scale, obfuscating the broader public opinion which advocacy organizations use to guide public policy and reform efforts. Our work proposes an approach to enable a large-scale study on homelessness via two major contributions. First, with the help of domain experts in social work and their trainees, we characterize Online Attitudes towards Homelessness in nine hierarchical frames (OATH-Frames) on a collection of 4K social media posts. Further, in an effort to ease the annotation of these frames, we employ GPT-4 as an LLM assistant to the experts; GPT-4 + Expert annotation presents an attractive trade-off owing to a 6.5× speedup in annotation time despite only incurring a 2-point F1 difference in annotation performance. Our effort results in a collection of 8K social media posts labeled by domain and trained experts (with and without GPT-4 assistance). Second, using predicted OATH-Frames on a Flan-T5-Large model trained on our data, we perform a large-scale analysis on 2.4M posts on homelessness. We find that posts that contain mentions of west coast states express more harmful generalizations of people experiencing homelessness (PEH) compared to posts about east coast states. We also find marked differences in attitudes across vulnerable populations as they are compared to PEH as being either more or less deserving of aid.},
}
@misc{gulati2024out,
title={Out-of-Distribution Detection through Soft Clustering with Non-Negative Kernel Regression},
author={Aryan Gulati and Xingjian Dong and Carlos Hurtado and Sarath Shekkizhar and Swabha Swayamdipta and Antonio Ortega},
year={2024},
abbr={Preprint},
url={},
abstract={As language models become more general purpose, increased attention needs to be paid to detecting out-of-distribution (OOD) instances, i.e., those not belonging to any of the distributions seen during training. Existing methods for detecting OOD data are computationally complex and storage-intensive. We propose a novel soft clustering approach for OOD detection based on non-negative kernel regression. Our approach greatly reduces computational and space complexities (up to $11\times $ improvement in inference time and 87% reduction in storage requirements) and outperforms existing approaches by up to 4 AUROC points on four different benchmarks. We also introduce an entropy-constrained version of our algorithm, which leads to further reductions in storage requirements (up to 97% lower than comparable approaches) while retaining competitive performance. Our soft clustering approach for OOD detection highlights its potential for detecting tail-end phenomena in extreme-scale data settings.},
}
@misc{khurana2024crowd,
title={Crowd-Calibrator: Can Annotator Disagreement Inform Calibration in Subjective Tasks?},
author={Urja Khurana and Eric Nalisnick and Antske Fokkens and Swabha Swayamdipta},
year={2024},
abbr={Preprint},
url={},
  abstract={Subjective tasks in NLP have been mostly relegated to objective ones where the gold label is decided by taking the majority vote, thereby obfuscating annotator disagreement and inherent uncertainty of instances. We argue that subjectivity should play a role in model decisions, considering a selective prediction setting. However, instead of calibrating confidence purely from the model’s perspective, we calibrate models for subjective tasks based on crowdworker agreement. Our method, Crowd-Calibrator, models annotations from crowdworkers and the distance between crowdworker distribution and the model’s own distribution over labels to inform whether the model should abstain from a decision. On two highly subjective tasks, namely hate speech detection and natural language inference (NLI), our experiments show Crowd-Calibrator either outperforming or achieving competitive performance with selective prediction baselines, highlighting the value of bringing human decision making into model predictions.},
}
@misc{finlayson2024logits,
title={Logits of API-Protected LLMs Leak Proprietary Information},
author={Matthew Finlayson and Xiang Ren and Swabha Swayamdipta},
year={2024},
abbr={Preprint},
url={https://arxiv.org/abs/2403.09539},
preview={logits.png},
selected=true,
abstract={The commercialization of large language models (LLMs) has led to the common practice of high-level API-only access to proprietary models. In this work, we show that even with a conservative assumption about the model architecture, it is possible to learn a surprisingly large amount of non-public information about an API-protected LLM from a relatively small number of API queries (e.g., costing under $1,000 for OpenAI's gpt-3.5-turbo). Our findings are centered on one key observation: most modern LLMs suffer from a softmax bottleneck, which restricts the model outputs to a linear subspace of the full output space. We show that this lends itself to a model image or a model signature which unlocks several capabilities with affordable cost: efficiently discovering the LLM's hidden size, obtaining full-vocabulary outputs, detecting and disambiguating different model updates, identifying the source LLM given a single full LLM output, and even estimating the output layer parameters. Our empirical investigations show the effectiveness of our methods, which allow us to estimate the embedding size of OpenAI's gpt-3.5-turbo to be about 4,096. Lastly, we discuss ways that LLM providers can guard against these attacks, as well as how these capabilities can be viewed as a feature (rather than a bug) by allowing for greater transparency and accountability.},
}
@inproceedings{cui2024annotating,
title={Annotating FrameNet via Structure-Conditioned Language Generation},
author={Xinyue Cui and Swabha Swayamdipta},
url={https://arxiv.org/abs/2406.04834},
abbr={ACL},
booktitle={Proceedings of ACL (to appear)},
year={2024},
selected=true,
preview={fn-conditioned-generation.jpeg},
  abstract={Despite the mounting evidence for generative capabilities of language models in understanding and generating natural language, their effectiveness on explicit manipulation and generation of linguistic structures remains understudied. In this paper, we investigate the task of generating new sentences preserving a given semantic structure, following the FrameNet formalism. We propose a framework to produce novel frame-semantically annotated sentences following an overgenerate-and-filter approach. Our results show that conditioning on rich, explicit semantic information tends to produce generations with high human acceptance, under both prompting and finetuning. Nevertheless, we discover that generated frame-semantic structured data is ineffective at training data augmentation for frame-semantic role labeling. Our study concludes that while generating high-quality, semantically rich data might be within reach, their downstream utility remains to be seen, highlighting the outstanding challenges with automating linguistic annotation tasks.},
}
@inproceedings{nazari2024generative,
title={Generative Explanations for Program Synthesizers},
author={Amirmohammad Nazari and Souti Chattopadhyay and Swabha Swayamdipta and Mukund Raghothaman},
year={2024},
url={https://arxiv.org/abs/2403.03429},
abbr={VL-HCC},
  booktitle={Proceedings of VL/HCC (to appear)},
  abstract={Despite great advances in program synthesis techniques, they remain algorithmic black boxes. Although they guarantee that when synthesis is successful, the implementation satisfies the specification, they provide no additional information regarding how the implementation works or the manner in which the specification is realized. One possibility to answer these questions is to use large language models (LLMs) to construct human-readable explanations. Unfortunately, experiments reveal that LLMs frequently produce nonsensical or misleading explanations when applied to the unidiomatic code produced by program synthesizers. In this paper, we develop an approach to reliably augment the implementation with explanatory names. We recover fine-grained input-output data from the synthesis algorithm to enhance the prompt supplied to the LLM, and use a combination of a program verifier and a second language model to validate the proposed explanations before presenting them to the user. Together, these techniques massively improve the accuracy of the proposed names, from 24% to 79%. Through a pair of small user studies, we find that users significantly prefer the explanations produced by our technique (76% of responses indicating the appropriateness of the presented names) to the baseline (with only 2% of responses approving of the suggestions), and that the proposed names measurably help users in understanding the synthesized implementation.}
}
@inproceedings{vazquez2024proceedings,
title={Proceedings of the 1st Workshop on Uncertainty-Aware NLP (UncertaiNLP 2024)},
author={V{\'a}zquez, Ra{\'u}l and Celikkanat, Hande and Ulmer, Dennis and Tiedemann, J{\"o}rg and Swayamdipta, Swabha and Aziz, Wilker and Plank, Barbara and Baan, Joris and de Marneffe, Marie-Catherine},
booktitle={Proceedings of the 1st Workshop on Uncertainty-Aware NLP (UncertaiNLP 2024)},
url={https://aclanthology.org/2024.uncertainlp-1.0/},
year={2024},
abbr={EACL},
abstract={Human languages are inherently ambiguous and understanding language input is subject to interpretation and complex contextual dependencies. Nevertheless, the main body of research in NLP is still based on the assumption that ambiguities and other types of underspecification can and have to be resolved. The UncertaiNLP workshop (workshop on uncertainty-aware NLP) aims to provide a platform for research that embraces variability in human language and aims to represent and evaluate the uncertainty that arises from it, and from modeling tools themselves.},
}
@inproceedings{finlayson2023closing,
title={Closing the Curious Case of Neural Text Degeneration},
author={Matthew Finlayson and John Hewitt and Alexander Koller and Swabha Swayamdipta and Ashish Sabharwal},
year={2024},
url={https://arxiv.org/abs/2310.01693},
abbr={ICLR},
booktitle={Proceedings of ICLR},
selected=true,
  abstract={Despite their ubiquity in language generation, it remains unknown why truncation sampling heuristics like nucleus sampling are so effective. We provide a theoretical explanation for the effectiveness of truncation sampling by proving that truncation methods that discard tokens below some probability threshold (the most common type of truncation) can guarantee that all sampled tokens have nonzero true probability. However, thresholds are a coarse heuristic, and necessarily discard some tokens with nonzero true probability as well. In pursuit of a more precise sampling strategy, we show that we can leverage a known source of model errors, the softmax bottleneck, to prove that certain tokens have nonzero true probability, without relying on a threshold. Based on our findings, we develop an experimental truncation strategy and present pilot studies demonstrating the promise of this type of algorithm. Our evaluations show that our method outperforms its threshold-based counterparts under automatic and human evaluation metrics for low-entropy (i.e., close to greedy) open-ended text generation. Our theoretical findings and pilot experiments provide both insight into why truncation sampling works, and make progress toward more expressive sampling algorithms that better surface the generative capabilities of large language models.}
}
@inproceedings{nam2023does,
title={Does Video Summarization Require Videos? Quantifying the Effectiveness of Language in Video Summarization},
author={Yoonsoo Nam and Adam Lehavi and Daniel Yang and Digbalay Bose and Swabha Swayamdipta and Shrikanth Narayanan},
year={2024},
url={https://arxiv.org/abs/2309.09405},
booktitle={Proceedings of ICASSP},
abbr={ICASSP},
abstract={Video summarization remains a huge challenge in computer vision due to the size of the input videos to be summarized. We propose an efficient, language-only video summarizer that achieves competitive accuracy with high data efficiency. Using only textual captions obtained via a zero-shot approach, we train a language transformer model and forego image representations. This method allows us to perform filtration amongst the representative text vectors and condense the sequence. With our approach, we gain explainability with natural language that comes easily for human interpretation and textual summaries of the videos. An ablation study that focuses on modality and data compression shows that leveraging text modality only effectively reduces input data processing while retaining comparable results.}
}
@inproceedings{howard2023neurocomparatives,
title={NeuroComparatives: Neuro-Symbolic Distillation of Comparative Knowledge},
author={Phillip Howard and Junlin Wang and Vasudev Lal and Gadi Singer and Yejin Choi and Swabha Swayamdipta},
year={2024},
url={https://arxiv.org/abs/2305.04978},
booktitle={Findings of NAACL},
abbr={NAACL},
abstract={Comparative knowledge (e.g., steel is stronger and heavier than styrofoam) is an essential component of our world knowledge, yet understudied in prior literature. In this paper, we study the task of comparative knowledge acquisition, motivated by the dramatic improvements in the capabilities of extreme-scale language models like GPT-3, which have fueled efforts towards harvesting their knowledge into knowledge bases. However, access to inference API for such models is limited, thereby restricting the scope and the diversity of the knowledge acquisition. We thus ask a seemingly implausible question: whether more accessible, yet considerably smaller and weaker models such as GPT-2, can be utilized to acquire comparative knowledge, such that the resulting quality is on par with their large-scale counterparts? We introduce NeuroComparatives, a novel framework for comparative knowledge distillation using lexically-constrained decoding, followed by stringent filtering of generated knowledge. Our framework acquires comparative knowledge between everyday objects and results in a corpus of 8.7M comparisons over 1.74M entity pairs - 10X larger and 30% more diverse than existing resources. Moreover, human evaluations show that NeuroComparatives outperform existing resources (up to 32% absolute improvement), even including GPT-3, despite using a 100X smaller model. Our results motivate neuro-symbolic manipulation of smaller models as a cost-effective alternative to the currently dominant practice of relying on extreme-scale language models with limited inference access.},
}
@inproceedings{liu2023were,
title={We're Afraid Language Models Aren't Modeling Ambiguity},
author={Alisa Liu and Zhaofeng Wu and Julian Michael and Alane Suhr and Peter West and Alexander Koller and Swabha Swayamdipta and Noah A. Smith and Yejin Choi},
year={2023},
booktitle={Proceedings of EMNLP},
url={https://arxiv.org/abs/2304.14399},
abbr={EMNLP},
selected=true,
abstract={Ambiguity is an intrinsic feature of natural language. Managing ambiguity is a key part of human language understanding, allowing us to anticipate misunderstanding as communicators and revise our interpretations as listeners. As language models (LMs) are increasingly employed as dialogue interfaces and writing aids, handling ambiguous language is critical to their success. We characterize ambiguity in a sentence by its effect on entailment relations with another sentence, and collect AmbiEnt, a linguist-annotated benchmark of 1,645 examples with diverse kinds of ambiguity. We design a suite of tests based on AmbiEnt, presenting the first evaluation of pretrained LMs to recognize ambiguity and disentangle possible meanings. We find that the task remains extremely challenging, including for the recent GPT-4, whose generated disambiguations are considered correct only 32% of the time in human evaluation, compared to 90% for disambiguations in our dataset. Finally, to illustrate the value of ambiguity-sensitive tools, we show that a multilabel NLI model can flag political claims in the wild that are misleading due to ambiguity. We encourage the field to rediscover the importance of ambiguity for NLP.}
}
@article{pillutla2022mauve,
title={MAUVE Scores for Generative Models: Theory and Practice},
author={Krishna Pillutla and Lang Liu and John Thickstun and Sean Welleck and Swabha Swayamdipta and Rowan Zellers and Sewoong Oh and Yejin Choi and Zaid Harchaoui},
year={2023},
  journal={Journal of Machine Learning Research},
url={https://arxiv.org/abs/2212.14578},
abbr={JMLR},
abstract={Generative AI has matured to a point where large-scale models can generate text that seems indistinguishable from human-written text and remarkably photorealistic images. Automatically measuring how close the distribution of generated data is to the target real data distribution is a key step in diagnosing existing models and developing better models. We present MAUVE, a family of comparison measures between pairs of distributions such as those encountered in the generative modeling of text or images. These scores are statistical summaries of divergence frontiers capturing two types of errors in generative modeling. We explore four approaches to statistically estimate these scores: vector quantization, non-parametric estimation, classifier-based estimation, and parametric Gaussian approximations. We provide statistical bounds for the vector quantization approach. Empirically, we find that the proposed scores paired with a range of f-divergences and statistical estimation methods can quantify the gaps between the distributions of human-written text and those of modern neural language models by correlating with human judgments and identifying known properties of the generated texts. We conclude the paper by demonstrating its applications to other AI domains and discussing practical recommendations.}
}
@inproceedings{zhou2023cobra,
title={{COBRA Frames: Contextual Reasoning about Effects and Harms of Offensive Statements}},
author={Zhou, Xuhui and Zhu, Hao and Yerukola, Akhila and Davidson, Thomas and Hwang, Jena D. and Swayamdipta, Swabha and Sap, Maarten},
booktitle={Findings of ACL},
year = {2023},
abbr={ACL},
selected=true,
url={https://swabhs.com/assets/pdf/papers/cobra.pdf},
  abstract={Understanding the harms and offensiveness of statements requires reasoning about the social and situational context in which statements are made. For example, the utterance “your English is very good” may implicitly signal an insult when uttered by a white man to a non-white colleague, but uttered by an ESL teacher to their student would be interpreted as a genuine compliment. Such contextual factors have been largely ignored by previous approaches to toxic language detection. We introduce COBRA, the first context-aware formalism for explaining the intents, reactions, and harms of offensive or biased statements grounded in their social and situational context. We create COBRACORPUS, a dataset of 33k potentially offensive statements paired with machine-generated contexts and free-text explanations of offensiveness, implied biases, speaker intents, and listener reactions. To study the contextual dynamics of offensiveness, we train models to generate COBRA explanations, with and without access to the context. We find that explanations by context-agnostic models are significantly worse than by context-aware ones, especially in situations where the context inverts the statement’s offensiveness (29% accuracy drop). Our work highlights the importance and feasibility of contextualized NLP by modeling social factors.}
}
@inproceedings{bhagavatula2022i2d2,
title={{I2D2: Inductive Knowledge Distillation with NeuroLogic and Self-Imitation}},
author={Chandra Bhagavatula and Jena D. Hwang and Doug Downey and Ronan Le Bras and Ximing Lu and Keisuke Sakaguchi and Swabha Swayamdipta and Peter West and Yejin Choi},
year={2023},
url={https://arxiv.org/abs/2212.09246},
abbr={ACL},
booktitle={Proc. of ACL},
abstract={Pre-trained language models, despite their rapid advancements powered by scale, still fall short of robust commonsense capabilities. And yet, scale appears to be the winning recipe; after all, the largest models seem to have acquired the largest amount of commonsense capabilities. Or is it? In this paper, we investigate the possibility of a seemingly impossible match: can smaller language models with dismal commonsense capabilities (i.e., GPT-2), ever win over models that are orders of magnitude larger and better (i.e., GPT-3), if the smaller models are powered with novel commonsense distillation algorithms? The key intellectual question we ask here is whether it is possible, if at all, to design a learning algorithm that does not benefit from scale, yet leads to a competitive level of commonsense acquisition. In this work, we study the generative models of commonsense knowledge, focusing on the task of generating generics, statements of commonsense facts about everyday concepts, e.g., birds can fly. We introduce a novel commonsense distillation framework, I2D2, that loosely follows the Symbolic Knowledge Distillation of West et al. but breaks the dependence on the extreme-scale models as the teacher model by two innovations: (1) the novel adaptation of NeuroLogic Decoding to enhance the generation quality of the weak, off-the-shelf language models, and (2) self-imitation learning to iteratively learn from the model's own enhanced commonsense acquisition capabilities. Empirical results suggest that scale is not the only way, as novel algorithms can be a promising alternative. Moreover, our study leads to a new corpus of generics, Gen-A-Tomic, that is of the largest and highest quality available to date.}
}
@inproceedings{chen2022rev,
title={{REV}: Information-Theoretic Evaluation of Free-Text Rationales},
author={Hanjie Chen and Faeze Brahman and Xiang Ren and Yangfeng Ji and Yejin Choi and Swabha Swayamdipta},
year={2023},
url={https://arxiv.org/abs/2210.04982},
abbr={ACL},
booktitle={Proc. of ACL},
selected=true,
abstract={Free-text rationales are a promising step towards explainable AI, yet their evaluation remains an open research problem. While existing metrics have mostly focused on measuring the direct association between the rationale and a given label, we argue that an ideal metric should also be able to focus on the new information uniquely provided in the rationale that is otherwise not provided in the input or the label. We investigate this research problem from an information-theoretic perspective using the conditional V-information. More concretely, we propose a metric called REV (Rationale Evaluation with conditional V-information), that can quantify the new information in a rationale supporting a given label beyond the information already available in the input or the label. Experiments on reasoning tasks across four benchmarks, including few-shot prompting with GPT-3, demonstrate the effectiveness of REV in evaluating different types of rationale-label pairs, compared to existing metrics. Through several quantitative comparisons, we demonstrate the capability of REV in providing more sensitive measurements of new information in free-text rationales with respect to a label. Furthermore, REV is consistent with human judgments on rationale evaluations. Overall, when used alongside traditional performance metrics, REV provides deeper insights into a models' reasoning and prediction processes.},
}
@inproceedings{sun2022investigating,
title={Investigating the Benefits of Free-Form Rationales},
author={Jiao Sun and Swabha Swayamdipta and Jonathan May and Xuezhe Ma},
year={2022},
abbr={EMNLP},
booktitle={Findings of EMNLP},
url={https://arxiv.org/abs/2206.11083},
code={https://github.com/sunjiao123sun/rationale-utility},
abstract={Free-form rationales aim to aid model interpretability by supplying the background knowledge that can help understand model decisions. Crowdsourced rationales are provided for commonsense QA instances in popular datasets such as CoS-E and ECQA, but their utility remains under-investigated. We present human studies which show that ECQA rationales indeed provide additional background information to understand a decision, while over 88% of CoS-E rationales do not. Inspired by this finding, we ask: can the additional context provided by free-form rationales benefit models, similar to human users? We investigate the utility of rationales as an additional source of supervision, by varying the quantity and quality of rationales during training. After controlling for instances where rationales leak the correct answer while not providing additional background knowledge, we find that incorporating only 5% of rationales during training can boost model performance by 47.22% for CoS-E and 57.14% for ECQA during inference. Moreover, we also show that rationale quality matters: compared to crowdsourced rationales, T5-generated rationales provide not only weaker supervision to models, but are also not helpful for humans in aiding model interpretability.},
}
@inproceedings{howard2022neurocf,
author = {Phillip Howard and Gadi Singer and Vasudev Lal and Yejin Choi and Swabha Swayamdipta},
title = {NeuroCounterfactuals: Beyond Minimal-Edit Counterfactuals for Richer Data Augmentation},
year = {2022},
abbr={EMNLP},
booktitle={Findings of EMNLP},
code={https://github.com/IntelLabs/NeuroCounterfactuals},
url={https://arxiv.org/abs/2210.12365},
selected=true,
abstract={While counterfactual data augmentation offers a promising step towards robust generalization in natural language processing, producing a set of counterfactuals that offer valuable inductive bias for models remains a challenge. Most existing approaches for producing counterfactuals, manual or automated, rely on small perturbations via minimal edits, resulting in simplistic changes. We introduce NeuroCounterfactuals, designed as loose counterfactuals, allowing for larger edits which result in naturalistic generations containing linguistic diversity, while still bearing similarity to the original document. Our novel generative approach bridges the benefits of constrained decoding, with those of language model adaptation for sentiment steering. Training data augmentation with our generations results in both in-domain and out-of-domain improvements for sentiment classification, outperforming even manually curated counterfactuals, under select settings. We further present detailed analyses to show the advantages of NeuroCounterfactuals over approaches involving simple, minimal edits.}
}
@inproceedings{liu2022wanli,
title={{WaNLI: Worker and AI Collaboration for Natural Language Inference Dataset Creation}},
author={Alisa Liu and Swabha Swayamdipta and Noah A. Smith and Yejin Choi},
year={2022},
abbr={EMNLP},
booktitle={Findings of EMNLP},
url={https://arxiv.org/abs/2201.05955},
code={https://github.com/alisawuffles/wanli},
selected=true,
abstract={A recurring challenge of crowdsourcing NLP datasets at scale is that human writers often rely on repetitive patterns when crafting examples, leading to a lack of linguistic diversity. We introduce a novel paradigm for dataset creation based on human and machine collaboration, which brings together the generative strength of language models and the evaluative strength of humans. Starting with an existing dataset, MultiNLI, our approach uses dataset cartography to automatically identify examples that demonstrate challenging reasoning patterns, and instructs GPT-3 to compose new examples with similar patterns. Machine generated examples are then automatically filtered, and finally revised and labeled by human crowdworkers to ensure quality. The resulting dataset, WANLI, consists of 108,357 natural language inference (NLI) examples that present unique empirical strengths over existing NLI datasets. Remarkably, training a model on WANLI instead of MNLI (which is 4 times larger) improves performance on seven out-of-domain test sets we consider, including by 11% on HANS and 9% on Adversarial NLI. Moreover, combining MNLI with WANLI is more effective than combining with other augmentation sets that have been introduced. Our results demonstrate the potential of natural language generation techniques to curate NLP datasets of enhanced quality and diversity.},
}
@proceedings{deeplo-2022-deep,
title = "Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing",
author = "Cherry, Colin and
Fan, Angela and
Foster, George and
Haffari, Gholamreza (Reza) and
Khadivi, Shahram and
Peng, Nanyun (Violet) and
Ren, Xiang and
Shareghi, Ehsan and
Swayamdipta, Swabha",
month = jul,
year = "2022",
address = "Hybrid",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.deeplo-1.0",
abbr={NAACL},
  abstract={The NAACL 2022 Workshop on Deep Learning Approaches for Low-Resource Natural Language Processing (DeepLo) takes place on Thursday, July 22, in Seattle, Washington, USA, immediately after the main conference. Natural Language Processing is being revolutionized by deep learning. However, deep learning requires large amounts of annotated data, and its advantage over traditional statistical methods typically diminishes when such data is not available. Large amounts of annotated data simply do not exist for many low-resource languages. Even for high-resource languages it can be difficult to find linguistically annotated data of sufficient size and quality to allow neural methods to excel; this remains true even as few-shot learning approaches have gained popularity in recent years. This workshop aims to bring together researchers from the NLP and ML communities who work on learning with neural methods when there is not enough data for those methods to succeed out-of-the-box. Specifically, it will provide attendees with an overview of new and existing approaches from various disciplines, and enable them to distill principles that can be more generally applicable. We will also discuss the main challenges arising in this setting, and outline potential directions for future progress. Our program covers a broad spectrum of applications and techniques. It is augmented by invited talks from Yulia Tsvetkov, Sebastian Ruder, Graham Neubig, and David Ifeoluwa Adelani. We would like to thank the members of our Program Committee for their timely and thoughtful reviews.}
}
@inproceedings{wiegreffe2021reframing,
title={{Reframing Human-AI Collaboration for Generating Free-Text Explanations}},
author={Sarah Wiegreffe and Jack Hessel and Swabha Swayamdipta and Mark Riedl and Yejin Choi},
year={2022},
abbr={NAACL},
booktitle={Proc. of NAACL},
code={https://github.com/allenai/few_shot_explanations/},
url={https://arxiv.org/abs/2112.08674},
selected=true,
abstract={Large language models are increasingly capable of generating fluent-appearing text with relatively little task-specific supervision. But can these models accurately explain classification decisions? We consider the task of generating free-text explanations using a small number of human-written examples (i.e., in a few-shot manner). We find that (1) authoring higher-quality examples for prompting results in higher quality generations; and (2) surprisingly, in a head-to-head comparison, crowdworkers often prefer explanations generated by GPT-3 to crowdsourced human-written explanations contained within existing datasets. Crowdworker ratings also show, however, that while models produce factual, grammatical, and sufficient explanations, they have room to improve, e.g., along axes such as providing novel information and supporting the label. We create a pipeline that combines GPT-3 with a supervised filter that incorporates humans-in-the-loop via binary acceptability judgments. Despite significant subjectivity intrinsic to judging acceptability, our approach is able to consistently filter GPT-3 generated explanations deemed acceptable by humans.}
}
@inproceedings{sap2021annotators,
title={{Annotators with Attitudes: How Annotator Beliefs And Identities Bias Toxic Language Detection}},
author={Maarten Sap and Swabha Swayamdipta and Laura Vianna and Xuhui Zhou and Yejin Choi and Noah A. Smith},
year={2022},
abbr={NAACL},
booktitle={Proc. of NAACL},
url={https://arxiv.org/abs/2111.07997},
selected=true,
abstract={The perceived toxicity of language can vary based on someone's identity and beliefs, but this variation is often ignored when collecting toxic language datasets, resulting in dataset and model biases. We seek to understand the who, why, and what behind biases in toxicity annotations. In two online studies with demographically and politically diverse participants, we investigate the effect of annotator identities (who) and beliefs (why), drawing from social psychology research about hate speech, free speech, racist beliefs, political leaning, and more. We disentangle what is annotated as toxic by considering posts with three characteristics: anti-Black language, African American English (AAE) dialect, and vulgarity. Our results show strong associations between annotator identity and beliefs and their ratings of toxicity. Notably, more conservative annotators and those who scored highly on our scale for racist beliefs were less likely to rate anti-Black language as toxic, but more likely to rate AAE as toxic. We additionally present a case study illustrating how a popular toxicity detection system's ratings inherently reflect only specific beliefs and perspectives. Our findings call for contextualizing toxicity labels in social variables, which raises immense implications for toxic language annotation and detection.}
}
@inproceedings{ethayarajh2021informationtheoretic,
title={{Understanding Dataset Difficulty with 𝒱-Usable Information}},
author={Kawin Ethayarajh and Yejin Choi and Swabha Swayamdipta},
year={2022},
eprint={2110.08420},
abbr={ICML},
booktitle={Proc. of ICML},
code={https://github.com/kawine/dataset_difficulty},
url={https://arxiv.org/abs/2110.08420},
prize={Outstanding Paper Award},
selected=true,
  abstract={Estimating the difficulty of a dataset typically involves comparing state-of-the-art models to humans; the bigger the performance gap, the harder the dataset is said to be. However, this comparison provides little understanding of how difficult each instance in a given distribution is, or what attributes make the dataset difficult for a given model. To address these questions, we frame dataset difficulty -- w.r.t. a model 𝒱 -- as the lack of 𝒱-usable information (Xu et al., 2019), where a lower value indicates a more difficult dataset for 𝒱. We further introduce pointwise 𝒱-information (PVI) for measuring the difficulty of individual instances w.r.t. a given distribution. While standard evaluation metrics typically only compare different models for the same dataset, 𝒱-usable information and PVI also permit the converse: for a given model 𝒱, we can compare different datasets, as well as different instances/slices of the same dataset. Furthermore, our framework allows for the interpretability of different input attributes via transformations of the input, which we use to discover annotation artefacts in widely-used NLP benchmarks.}
}
@inproceedings{pillutla2021mauve,
title={MAUVE: Measuring the Gap Between Neural Text and Human Text using Divergence Frontiers},
  author={Krishna Pillutla and Swabha Swayamdipta and Rowan Zellers and John Thickstun and Sean Welleck and Yejin Choi and Zaid Harchaoui},
year={2021},
booktitle={Proc. of NeurIPS},
abbr={NeurIPS},
url={https://arxiv.org/abs/2102.01454},
code={https://github.com/krishnap25/mauve},
prize={Outstanding Paper Award},
selected=true,
abstract={As major progress is made in open-ended text generation, measuring how close machine-generated text is to human language remains a critical open problem. We introduce MAUVE, a comparison measure for open-ended text generation, which directly compares the learnt distribution from a text generation model to the distribution of human-written text using divergence frontiers. MAUVE scales up to modern text generation models by computing information divergences in a quantized embedding space. Through an extensive empirical study on three open-ended generation tasks, we find that MAUVE identifies known properties of generated text, scales naturally with model size, and correlates with human judgments, with fewer restrictions than existing distributional evaluation metrics.}
}
@inproceedings{pancholy2021sister,
title={Sister Help: Data Augmentation for Frame-Semantic Role Labeling},
author={Ayush Pancholy and Miriam R. L. Petruck and Swabha Swayamdipta},
year={2021},
booktitle={Proc. of LAW-DMR Workshop at EMNLP},
abbr={EMNLP},
url={http://arxiv.org/abs/2109.07725},
code={https://github.com/ayush-pancholy/sister-help},
poster={posters/sisters-ayush-lawdmr-emnlp2021.pdf},
abstract={While FrameNet is widely regarded as a rich resource of semantics in natural language processing, a major criticism concerns its lack of coverage and the relative paucity of its labeled data compared to other commonly used lexical resources such as PropBank and VerbNet. This paper reports on a pilot study to address these gaps. We propose a data augmentation approach, which uses existing frame-specific annotation to automatically annotate other lexical units of the same frame which are unannotated. Our rule-based approach defines the notion of a sister lexical unit and generates frame-specific augmented data for training. We present experiments on frame-semantic role labeling which demonstrate the importance of this data augmentation: we obtain a large improvement to prior results on frame identification and argument identification for FrameNet, utilizing both full-text and lexicographic annotations under FrameNet. Our findings on data augmentation highlight the value of automatic resource creation for improved models in frame-semantic parsing.}
}
@inproceedings{jacovi2021contrastive,
title={Contrastive Explanations for Model Interpretability},
author={Alon Jacovi and Swabha Swayamdipta and Shauli Ravfogel and Yanai Elazar and Yejin Choi and Yoav Goldberg},
year={2021},
booktitle={Proc. of EMNLP},
abbr={EMNLP},
url={https://arxiv.org/abs/2103.01378},
code={https://github.com/allenai/contrastive-explanations},
abstract={Contrastive explanations clarify why an event occurred in contrast to another. They are more inherently intuitive to humans to both produce and comprehend. We propose a methodology to produce contrastive explanations for classification models by modifying the representation to disregard non-contrastive information, and modifying model behavior to only be based on contrastive reasoning. Our method is based on projecting model representation to a latent space that captures only the features that are useful (to the model) to differentiate two potential decisions. We demonstrate the value of contrastive explanations by analyzing two different scenarios, using both high-level abstract concept attribution and low-level input token/span attribution, on two widely used text classification tasks. Specifically, we produce explanations for answering: for which label, and against which alternative label, is some aspect of the input useful? And which aspects of the input are useful for and against particular decisions? Overall, our findings shed light on the ability of label-contrastive explanations to provide a more accurate and finer-grained interpretability of a model's decision.}
}
@inproceedings{liu2021onthefly,
title={{DExperts: Decoding-Time Controlled Text Generation with Experts and Anti-Experts}},
author={Alisa Liu and Maarten Sap and Ximing Lu and Swabha Swayamdipta and Chandra Bhagavatula and Noah A. Smith and Yejin Choi},
year={2021},
url={https://arxiv.org/abs/2105.03023},
booktitle={Proc. of ACL},
abbr={ACL},
code={https://github.com/alisawuffles/DExperts},
selected=true,
abstract={Despite recent advances in natural language generation, it remains challenging to control attributes of generated text. We propose DExperts: Decoding-time Experts, a decoding-time method for controlled text generation that combines a pretrained language model with "expert" LMs and/or "anti-expert" LMs in a product of experts. Intuitively, under the ensemble, tokens only get high probability if they are considered likely by the experts, and unlikely by the anti-experts. We apply DExperts to language detoxification and sentiment-controlled generation, where we outperform existing controllable generation methods on both automatic and human evaluations. Moreover, because DExperts operates only on the output of the pretrained LM, it is effective with (anti-)experts of smaller size, including when operating on GPT-3. Our work highlights the promise of tuning small LMs on text with (un)desirable attributes for efficient decoding-time steering.}
}
@inproceedings{Zhou2021ToxicDebias,
author={Xuhui Zhou and Maarten Sap and Swabha Swayamdipta and Noah A. Smith and Yejin Choi},
title={Challenges in Automated Debiasing for Toxic Language Detection},
booktitle={Proc. of EACL},
abbr={EACL},
year={2021},
url={https://arxiv.org/abs/2102.00086},
code={https://github.com/XuhuiZhou/Toxic_Debias},
selected=true,
abstract={Biased associations have been a challenge in the development of classifiers for detecting toxic language, hindering both fairness and accuracy. As potential solutions, we investigate recently introduced debiasing methods for text classification datasets and models, as applied to toxic language detection. Our focus is on lexical (e.g., swear words, slurs, identity mentions) and dialectal markers (specifically African American English). Our comprehensive experiments establish that existing methods are limited in their ability to prevent biased behavior in current toxicity detectors. We then propose an automatic, dialect-aware data correction method, as a proof-of-concept. Despite the use of synthetic labels, this method reduces dialectal associations with toxicity. Overall, our findings show that debiasing a model trained on biased toxic language data is not as effective as simply relabeling the data to remove existing biases.}
}
@inproceedings{swayamdipta2020datamaps,
title = {Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics},
author = {Swabha Swayamdipta and Roy Schwartz and Nicholas Lourie and Yizhong Wang and Hannaneh Hajishirzi and Noah A. Smith and Yejin Choi},
booktitle = {Proc. of EMNLP},
abbr = {EMNLP},
url = {https://arxiv.org/abs/2009.10795},
year = {2020},
code = {https://github.com/allenai/cartography},
slides = {https://slideslive.com/38939175},
selected=true,
abstract = {Large datasets have become commonplace in NLP research. However, the increased emphasis on data quantity has made it challenging to assess the quality of data. We introduce Data Maps---a model-based tool to characterize and diagnose datasets. We leverage a largely ignored source of information: the behavior of the model on individual instances during training (training dynamics) for building data maps. This yields two intuitive measures for each example---the model's confidence in the true class, and the variability of this confidence across epochs---obtained in a single run of training. Experiments across four datasets show that these model-dependent measures reveal three distinct regions in the data map, each with pronounced characteristics. First, our data maps show the presence of "ambiguous" regions with respect to the model, which contribute the most towards out-of-distribution generalization. Second, the most populous regions in the data are "easy to learn" for the model, and play an important role in model optimization. Finally, data maps uncover a region with instances that the model finds "hard to learn"; these often correspond to labeling errors. Our results indicate that a shift in focus from quantity to quality of data could lead to robust models and improved out-of-distribution generalization.}
}
@inproceedings{gururangan2020dont,
title={Don't Stop Pretraining: Adapt Language Models to Domains and Tasks},
author={Suchin Gururangan and Ana Marasović and Swabha Swayamdipta and Kyle Lo and Iz Beltagy and Doug Downey and Noah A. Smith},
year={2020},
url={https://arxiv.org/abs/2004.10964},
booktitle = {Proc. of ACL},
abbr = {ACL},
code = {https://github.com/allenai/dont-stop-pretraining},
prize = {Best Paper Honorable Mention},
selected=true,
abstract = {Language models pretrained on text from a wide variety of sources form the foundation of today's NLP. In light of the success of these broad-coverage models, we investigate whether it is still helpful to tailor a pretrained model to the domain of a target task. We present a study across four domains (biomedical and computer science publications, news, and reviews) and eight classification tasks, showing that a second phase of pretraining in-domain (domain-adaptive pretraining) leads to performance gains, under both high- and low-resource settings. Moreover, adapting to the task's unlabeled data (task-adaptive pretraining) improves performance even after domain-adaptive pretraining. Finally, we show that adapting to a task corpus augmented using simple data selection strategies is an effective alternative, especially when resources for domain-adaptive pretraining might be unavailable. Overall, we consistently find that multi-phase adaptive pretraining offers large gains in task performance.}
}
@inproceedings{bras2020adversarial,
title={Adversarial Filters of Dataset Biases},
  author={Ronan Le Bras and Swabha Swayamdipta and Chandra Bhagavatula and Rowan Zellers and Matthew E. Peters and Ashish Sabharwal and Yejin Choi},
year={2020},
booktitle={Proc. of ICML},
abbr = {ICML},
url={https://arxiv.org/abs/2002.04108},
code={https://github.com/swabhs/notebooks_for_aflite},
selected=true,
abstract={Large neural models have demonstrated human-level performance on language and vision benchmarks, while their performance degrades considerably on adversarial or out-of-distribution samples. This raises the question of whether these models have learned to solve a dataset rather than the underlying task by overfitting to spurious dataset biases. We investigate one recently proposed approach, AFLite, which adversarially filters such dataset biases, as a means to mitigate the prevalent overestimation of machine performance. We provide a theoretical understanding for AFLite, by situating it in the generalized framework for optimum bias reduction. We present extensive supporting evidence that AFLite is broadly applicable for reduction of measurable dataset biases, and that models trained on the filtered datasets yield better generalization to out-of-distribution tasks. Finally, filtering results in a large drop in model performance (e.g., from 92% to 62% for SNLI), while human performance still remains high. Our work thus shows that such filtered datasets can pose new research challenges for robust generalization by serving as upgraded benchmarks. }
}
@inproceedings{schwartz2020right,
title={The Right Tool for the Job: Matching Model and Instance Complexities},
author={Roy Schwartz and Gabi Stanovsky and Swabha Swayamdipta and Jesse Dodge and Noah A. Smith},
year={2020},
booktitle={Proc. of ACL},
abbr = {ACL},
url={https://arxiv.org/abs/2004.07453},
code={https://github.com/allenai/sledgehammer},
abstract={As NLP models become larger, executing a trained model requires significant computational resources incurring monetary and environmental costs. To better respect a given inference budget, we propose a modification to contextual representation fine-tuning which, during inference, allows for an early (and fast) "exit" from neural network calculations for simple instances, and late (and accurate) exit for hard instances. To achieve this, we add classifiers to different layers of BERT and use their calibrated confidence scores to make early exit decisions. We test our proposed modification on five different datasets in two tasks: three text classification datasets and two natural language inference benchmarks. Our method presents a favorable speed/accuracy tradeoff in almost all cases, producing models which are up to five times faster than the state of the art, while preserving their accuracy. Our method also requires almost no additional training resources (in either time or parameters) compared to the baseline BERT model. Finally, our method alleviates the need for costly retraining of multiple models at different levels of efficiency; we allow users to control the inference speed/accuracy tradeoff using a single trained model, by setting a single variable at inference time. We publicly release our code.}
}
@inproceedings{yang2020gdaug,
title={G-DAUG: Generative Data Augmentation for Commonsense Reasoning},
  author={Yiben Yang and Chaitanya Malaviya and Jared Fernandez and Swabha Swayamdipta and Ronan Le Bras and Ji-Ping Wang and Chandra Bhagavatula and Yejin Choi and Doug Downey},
year={2020},
month={Jun},
url={https://arxiv.org/abs/2004.11546},
booktitle={Findings of EMNLP},
abbr = {EMNLP},
code={https://github.com/yangyiben/G-DAUG-c-Generative-Data-Augmentation-for-Commonsense-Reasoning},
abstract={Recent advances in commonsense reasoning depend on large-scale human-annotated training data to achieve peak performance. However, manual curation of training examples is expensive and has been shown to introduce annotation artifacts that neural models can readily exploit and overfit on. We investigate G-DAUG^C, a novel generative data augmentation method that aims to achieve more accurate and robust learning in the low-resource setting. Our approach generates synthetic examples using pretrained language models, and selects the most informative and diverse set of examples for data augmentation. In experiments with multiple commonsense reasoning benchmarks, G-DAUG^C consistently outperforms existing data augmentation methods based on back-translation, and establishes a new state-of-the-art on WinoGrande, CODAH, and CommonsenseQA. Further, in addition to improvements in in-distribution accuracy, G-DAUG^C-augmented training also enhances out-of-distribution generalization, showing greater robustness against adversarial or perturbed examples. Our analysis demonstrates that G-DAUG^C produces a diverse set of fluent training examples, and that its selection and training approaches are important for performance. Our findings encourage future research toward generative data augmentation to enhance both in-distribution learning and out-of-distribution generalization.}
}
---
---
@phdthesis{swayamdipta2019syntactic,
title={{PhD Thesis: Syntactic Inductive Biases for Natural Language Processing}},
author={Swayamdipta, Swabha},
year={2019},
school={Carnegie Mellon University},
url={https://swabhs.com/assets/pdf/theses/swabha_thesis.pdf},
abbr = {PhD},
abstract = {With the rise in availability of data for language learning, the role of linguistic structure is under scrutiny. The underlying syntactic structure of language allows for composition of simple elements into more complex ones in innumerable ways; generalization to new examples hinges on this structure. We define a syntactic inductive bias as a signal that steers the learning algorithm towards a syntactically robust solution, over others. This thesis explores the need for incorporation of such biases into already powerful neural models of language. We describe three general approaches for incorporating syntactic inductive biases into task-specific models, under different levels of supervision. The first method calls for joint learning of entire syntactic dependency trees with semantic dependency graphs through direct supervision, to facilitate better semantic dependency parsing. Second, we introduce the paradigm of scaffolded learning, which enables us to leverage inductive biases from syntactic sources to predict a related semantic structure, using only as much supervision as is necessary. The third approach yields general-purpose contextualized representations conditioned on large amounts of data along with their shallow syntactic structures, obtained automatically. The linguistic representations learned as a result of syntactic inductive biases are shown to be effective across a range of downstream tasks, but their usefulness is especially pronounced for semantic tasks.}
}
@inproceedings{Ruder:19,
title={Tutorial on Transfer Learning in Natural Language Processing},
author={Ruder, Sebastian and Peters, Matthew E and Swayamdipta, Swabha and Wolf, Thomas},
booktitle={Proc. of NAACL},
abbr = {NAACL},
url={https://www.aclweb.org/anthology/N19-5004/},
pages={15--18},
year={2019},
abstract={The classic supervised machine learning paradigm is based on learning in isolation, a single predictive model for a task using a single dataset. This approach requires a large number of training examples and performs best for well-defined and narrow tasks. Transfer learning refers to a set of methods that extend this approach by leveraging data from additional domains or tasks to train a model with better generalization properties. Over the last two years, the field of Natural Language Processing (NLP) has witnessed the emergence of several transfer learning methods and architectures which significantly improved upon the state-of-the-art on a wide range of NLP tasks. These improvements together with the wide availability and ease of integration of these methods are reminiscent of the factors that led to the success of pretrained word embeddings and ImageNet pretraining in computer vision, and indicate that these methods will likely become a common tool in the NLP landscape as well as an important research direction. We will present an overview of modern transfer learning methods in NLP, how models are pre-trained, what information the representations they learn capture, and review examples and case studies on how these models can be integrated and adapted in downstream NLP tasks.}
}
@misc{Swayamdipta:19,
title={Shallow Syntax in Deep Water},
author={Swabha Swayamdipta and Matthew Peters and Brendan Roof and Chris Dyer and Noah A. Smith},
year={2019},
eprint={1908.11047},
archivePrefix={arXiv},
abbr = {arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1908.11047},
abstract={Shallow syntax provides an approximation of phrase-syntactic structure of sentences; it can be produced with high accuracy, and is computationally cheap to obtain. We investigate the role of shallow syntax-aware representations for NLP tasks using two techniques. First, we enhance the ELMo architecture to allow pretraining on predicted shallow syntactic parses, instead of just raw text, so that contextual embeddings make use of shallow syntactic context. Our second method involves shallow syntactic features obtained automatically on downstream task data. Neither approach leads to a significant gain on any of the four downstream tasks we considered relative to ELMo-only baselines. Further analysis using black-box probes confirms that our shallow-syntax-aware contextual embeddings do not transfer to linguistic tasks any more easily than ELMo's embeddings. We take these findings as evidence that ELMo-style pretraining discovers representations which make additional awareness of shallow syntax redundant.}
}
---
---
@inproceedings{Gururangan:18,
title={Annotation Artifacts in Natural Language Inference Data},
author={Gururangan, Suchin and Swayamdipta, Swabha and Levy, Omer and Schwartz, Roy and Bowman, Samuel and Smith, Noah A.},
booktitle={Proc. of NAACL},
abbr = {NAACL},
year={2018},
url={https://arxiv.org/abs/1803.02324},
poster={posters/artifacts-naacl.pdf},
code={https://github.com/swabhs/notebooks/blob/master/annotation_artifacts.ipynb},
abstract = {Large-scale datasets for natural language inference are created by presenting crowd workers with a sentence (premise), and asking them to generate three new sentences (hypotheses) that it entails, contradicts, or is logically neutral with respect to. We show that, in a significant portion of such data, this protocol leaves clues that make it possible to identify the label by looking only at the hypothesis, without observing the premise. Specifically, we show that a simple text categorization model can correctly classify the hypothesis alone in about 67% of SNLI (Bowman et al., 2015) and 53% of MultiNLI (Williams et al., 2017). Our analysis reveals that specific linguistic phenomena such as negation and vagueness are highly correlated with certain inference classes. Our findings suggest that the success of natural language inference models to date has been overestimated, and that the task remains a hard open problem.}
}
@inproceedings{Swayamdipta:18b,
title = {{Syntactic Scaffolds for Semantic Structures}},
author = {Swayamdipta, Swabha and Thomson, Sam and Lee, Kenton and Zettlemoyer, Luke and
Dyer, Chris and Smith, Noah A.},
booktitle = {Proc. of EMNLP},
abbr = {EMNLP},
year ={2018},
url = {https://arxiv.org/abs/1808.10485},
talk = {https://youtu.be/1YK9dEjIGlU?t=40m},
code = {https://github.com/swabhs/scaffolding},
abstract = {We introduce the syntactic scaffold, an approach to incorporating syntactic information into semantic tasks. Syntactic scaffolds avoid expensive syntactic processing at runtime, only making use of a treebank during training, through a multitask objective. We improve over strong baselines on PropBank semantics, frame semantics, and coreference resolution, achieving competitive performance on all three tasks.}
}
@inproceedings{Swayamdipta:18a,
title={{Multi-Mention Learning for Reading Comprehension with Neural Cascades}},
url = {https://openreview.net/forum?id=HyRnez-RW},
author={Swayamdipta, Swabha and Parikh, Ankur P and Kwiatkowski, Tom},
booktitle={Proc. of ICLR},
abbr = {ICLR},
year={2018},
poster = {posters/triviaqa-iclr.pdf},
abstract={Reading comprehension is a challenging task, especially when executed across longer or across multiple evidence documents, where the answer is likely to reoccur. Existing neural architectures typically do not scale to the entire evidence, and hence, resort to selecting a single passage in the document (either via truncation or other means), and carefully searching for the answer within that passage. However, in some cases, this strategy can be suboptimal, since by focusing on a specific passage, it becomes difficult to leverage multiple mentions of the same answer throughout the document. In this work, we take a different approach by constructing lightweight models that are combined in a cascade to find the answer. Each submodel consists only of feed-forward networks equipped with an attention mechanism, making it trivially parallelizable. We show that our approach can scale to approximately an order of magnitude larger evidence documents and can aggregate information from multiple mentions of each answer candidate across the document. Empirically, our approach achieves state-of-the-art performance on both the Wikipedia and web domains of the TriviaQA dataset, outperforming more complex, recurrent architectures.}
}
@inproceedings{Peng:18a,
title = {Learning Joint Semantic Parsers from Disjoint Data},
author = {Peng, Hao and Thomson, Sam and Swayamdipta, Swabha and Smith, Noah A.},
booktitle = {Proc. of NAACL},
abbr = {NAACL},
year={2018},
url = {http://aclweb.org/anthology/N18-1135},
code = {https://github.com/Noahs-ARK/NeurboParser},
abstract={We present a new approach to learning a semantic parser from multiple datasets, even when the target semantic formalisms are drastically different and the underlying corpora do not overlap. We handle such “disjoint” data by treating annotations for unobserved formalisms as latent structured variables. Building on state-of-the-art baselines, we show improvements both in frame-semantic parsing and semantic dependency parsing by modeling them jointly.}
}
@inproceedings{Mulcaire:18,
title={Polyglot Semantic Role Labeling},
author={Mulcaire, Phoebe and Swayamdipta, Swabha and Smith, Noah A.},
booktitle={Proc. of ACL},
abbr = {ACL},
year={2018},
url={https://aclanthology.org/P18-2106/},
abstract={Previous approaches to multilingual semantic dependency parsing treat languages independently, without exploiting the similarities between semantic structures across languages. We experiment with a new approach where we combine resources from different languages in the CoNLL 2009 shared task to build a single polyglot semantic dependency parser. Notwithstanding the absence of parallel data, and the dissimilarity in annotations between languages, our approach results in improvement in parsing performance on several languages over a monolingual baseline. Analysis of the polyglot models’ performance provides a new understanding of the similarities and differences between languages in the shared task.}
}
@inproceedings{baker-etal-2018-frame,
title = "Frame Semantics across Languages: Towards a Multilingual {F}rame{N}et",
author = "Baker, Collin F. and
Ellsworth, Michael and
Petruck, Miriam R. L. and
Swayamdipta, Swabha",
booktitle = "COLING: Tutorial Abstracts",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-3003",
pages = "9--12",
abbr = {COLING},
abstract={FrameNet is a lexical resource that provides rich semantic representations of the core English vocabulary based on Fillmore’s Frame Semantics, with more than 200k manually annotated examples. Resources based on FrameNet have now been created for roughly a dozen languages. This workshop will present current research on aligning Frame Semantic resources across languages and automatic frame semantic parsing in English and other languages. We will explore the extent to which semantic frames are similar across languages and the implications for theories of semantic universals, the practice of translation (whether human or machine), and multilingual knowledge representation. Does not require prior familiarity with Frame Semantics.}
}
---
---
@misc{Neubig:17,
title = {{DyNet}: The Dynamic Neural Network Toolkit},
author = {Graham Neubig and Chris Dyer and Yoav Goldberg and Austin Matthews and Waleed Ammar and
Antonios Anastasopoulos and Miguel Ballesteros and David Chiang and Daniel Clothiaux and
Trevor Cohn and Kevin Duh and Manaal Faruqui and Cynthia Gan and Dan Garrette and
Yangfeng Ji and Lingpeng Kong and Adhiguna Kuncoro and Gaurav Kumar and
Chaitanya Malaviya and Paul Michel and Yusuke Oda and Matthew Richardson and
Naomi Saphra and Swabha Swayamdipta and Pengcheng Yin},
url = {https://arxiv.org/abs/1701.03980},
abbr = {arXiv},
year = {2017},
code = {https://github.com/clab/dynet},
abstract = {We describe DyNet, a toolkit for implementing neural network models based on dynamic declaration of network structure. In the static declaration strategy that is used in toolkits like Theano, CNTK, and TensorFlow, the user first defines a computation graph (a symbolic representation of the computation), and then examples are fed into an engine that executes this computation and computes its derivatives. In DyNet's dynamic declaration strategy, computation graph construction is mostly transparent, being implicitly constructed by executing procedural code that computes the network outputs, and the user is free to use different network structures for each input. Dynamic declaration thus facilitates the implementation of more complicated network architectures, and DyNet is specifically designed to allow users to implement their models in a way that is idiomatic in their preferred programming language (C++ or Python). One challenge with dynamic declaration is that because the symbolic computation graph is defined anew for every training example, its construction must have low overhead. To achieve this, DyNet has an optimized C++ backend and lightweight graph representation. Experiments show that DyNet's speeds are faster than or comparable with static declaration toolkits, and significantly faster than Chainer, another dynamic declaration toolkit. DyNet is released open-source under the Apache 2.0 license.}
}
@misc{Swayamdipta:17,
author = {Swabha Swayamdipta and Sam Thomson and Chris Dyer and Noah A. Smith},
title = {{Frame-Semantic Parsing with Softmax-Margin Segmental {RNN}s and a Syntactic Scaffold}},
year = {2017},
abbr = {arXiv},
url = {https://arxiv.org/abs/1706.09528},
poster = {posters/open-sesame.pdf},
code = {https://github.com/swabhs/open-sesame},
abstract = {We present a new, efficient frame-semantic parser that labels semantic arguments to FrameNet predicates. Built using an extension to the segmental RNN that emphasizes recall, our basic system achieves competitive performance without any calls to a syntactic parser. We then introduce a method that uses phrase-syntactic annotations from the Penn Treebank during training only, through a multitask objective; no parsing is required at training or test time. This "syntactic scaffold" offers a cheaper alternative to traditional syntactic pipelining, and achieves state-of-the-art performance.}
}
---
---
@inproceedings{Swayamdipta:16,
author = {Swabha Swayamdipta and Miguel Ballesteros and Chris Dyer and Noah A. Smith},
title = {Greedy, Joint Syntactic-Semantic Parsing with {Stack LSTM}s},
booktitle = {Proc. of CoNLL},
abbr = {CoNLL},
url = {https://www.aclweb.org/anthology/K16-1019},
year = {2016},
slides = {talks/conll16.pdf},
code = {https://github.com/swabhs/joint-lstm-parser},
abstract = {We present a transition-based parser that jointly produces syntactic and semantic dependencies. It learns a representation of the entire algorithm state, using stack long short-term memories. Our greedy inference algorithm has linear time, including feature extraction. On the CoNLL 2008–9 English shared tasks, we obtain the best published parsing performance among models that jointly learn syntax and semantics.}
}
---
---
@inproceedings{Kong:2014,
author = {Kong, Lingpeng and Schneider, Nathan and Swayamdipta, Swabha and Bhatia, Archna
and Dyer, Chris and Smith, Noah A.},
title = {{A Dependency Parser for Tweets}},
booktitle = {Proc. of EMNLP},
abbr = {EMNLP},
year = {2014},
url = {https://www.aclweb.org/anthology/D14-1108},
code = {https://github.com/ikekonglp/TweeboParser},
abstract = {We describe a new dependency parser for English tweets, TWEEBOPARSER. The parser builds on several contributions: new syntactic annotations for a corpus of tweets (TWEEBANK), with conventions informed by the domain; adaptations to a statistical parsing algorithm; and a new approach to exploiting out-of-domain Penn Treebank data. Our experiments show that the parser achieves over 80% unlabeled attachment accuracy on our new, high-quality test set and measure the benefit of our contributions.}
}
@inproceedings{thomson-etal-2014-cmu,
title = "{CMU}: Arc-Factored, Discriminative Semantic Dependency Parsing",
author = "Thomson, Sam and O{'}Connor, Brendan and Flanigan, Jeffrey and Bamman, David and
Dodge, Jesse and Swayamdipta, Swabha and Schneider, Nathan and Dyer, Chris and
Smith, Noah A.",
booktitle = "Proc. of {S}em{E}val",
abbr = {SemEval},
year = "2014",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S14-2027",
doi = "10.3115/v1/S14-2027",
pages = "176--180",
abstract={We present an arc-factored statistical model for semantic dependency parsing, as defined by the SemEval 2014 Shared Task 8 on Broad-Coverage Semantic Dependency Parsing. Our entry in the open track placed second in the competition.}
}
@inproceedings{matthews2014cmu,
title={The CMU machine translation systems at WMT 2014},
author={Matthews, Austin and Ammar, Waleed and Bhatia, Archna and Feely, Weston and Hanneman, Greg
and Schlinger, Eva and Swayamdipta, Swabha and Tsvetkov, Yulia and Lavie, Alon and Dyer, Chris},
booktitle={Proc. of WMT},
abbr = {WMT},
pages={142--149},
year={2014},
url={https://www.aclweb.org/anthology/W14-3315},
abstract={We describe the CMU systems submitted to the 2014 WMT shared translation task. We participated in two language pairs, German–English and Hindi–English. Our innovations include: a label coarsening scheme for syntactic tree-to-tree translation, a host of new discriminative features, several modules to create “synthetic translation options” that can generalize beyond what is directly observed in the training data, and a method of combining the output of multiple word aligners to uncover extra phrase pairs and grammar rules.}
}
---
---
@inproceedings{swayamdipta2012pursuit,
title={{The Pursuit of Power and its Manifestation in Written Dialog}},
author={Swayamdipta, Swabha and Rambow, Owen},
booktitle={Proc. of ICSC},
abbr = {ICSC},
pages={22--29},
year={2012},
organization={IEEE},
url={https://ieeexplore.ieee.org/abstract/document/6337078},
abstract={In this paper we explore the written dialog behavior of participants in an online discussion for automatic identification of participants who pursue power within the discussion group. We employ various standard unsupervised machine learning approaches to make this prediction. Our approach relies on the identification of certain discourse structures and linguistic techniques used by participants in the discussion. We achieve an F-measure of 69.5% using unsupervised methods.}
}