import os
import spacy
from glob import glob

nlp = spacy.load('en_core_web_sm')
keyword_path = os.path.join(os.getcwd(), 'keywords')  # path to the keywords folder

TRIGGER_WORDS = frozenset(
    [
        'project', 'framework', 'methodology', 'algorithm', 'approach', 'system',
        'platform', 'prototype', 'work', 'paper', 'article', 'research', 'study',
        'library', 'finding'
    ]
)


def load_keywords(path=keyword_path):
    """
    Load the list of 186 keywords from the ``*.txt`` files in the given path,
    skipping the deictic cliche/phrase lists.
    """
    keyword_files = glob(os.path.join(path, '*.txt'))
    keywords = []
    for keyword_file in keyword_files:
        if not any(d in keyword_file for d in ['deictic_cliche', 'deictic_phrase']):
            with open(keyword_file, 'r') as f:
                keywords.extend([w.strip() for w in f.readlines()])
    return frozenset(keywords)
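

# NOTE: the expected keyword-folder layout (an assumption inferred from
# `load_keywords`, not documented here) is one keyword or phrase per line in
# each `keywords/*.txt` file, e.g. a hypothetical
# `keywords/verb_presentation.txt` containing lines such as
# "present", "propose", "introduce".
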

def find_noun_phrase_offset(token):
    """
    Given a spaCy-tokenized sentence, return the (start, end) positions of its
    noun chunks, where ``end`` is one position past the last chunk token.
    """
    return [(s.start, s.end) for s in token.noun_chunks]


def find_deictic(token, keywords=frozenset()):
    """
    Given a tokenized sentence (and a set of keywords),
    return all (start, end) indices of deictic phrases.
    """
    deictics = []
    n_token = len(token)
    token_zip = list(zip(token, token[1:]))
    for token1, token2 in token_zip:
        # deictic rule 1: "This paper presents a use case of ..."
        # (the lemma is matched as a substring of each keyword)
        if (token1.pos_ == 'DET') and any(token2.lemma_ in g for g in keywords):
            deictics.append((token1.i, token2.i))
        # deictic rule 2: "Here, we demonstrate how our interpretation ..."
        if token1.pos_ == 'ADV' and token2.pos_ == 'PUNCT' and token1.lemma_ not in ['so']:
            deictics.append((token1.i, token2.i))
    # deictic rule 3: "Our approach is compatible with the ..."
    noun_phrase_position = find_noun_phrase_offset(token)
    for (start, end) in noun_phrase_position:
        tag_list = [t.text.lower() for t in token[start:end]]
        is_we_in = any(w in tag_list for w in ['we', 'our', 'paper'])
        if token[min(end, n_token - 1)].pos_ == 'VERB' and is_we_in:
            deictics.append((start, end - 1))
    return list(set(deictics))


def find_meta_discourse(token, keywords=frozenset()):
    """
    Given a tokenized sentence,
    return all (start, end) indices of meta-discourse.
    """
    meta_discourses = []
    deictics = find_deictic(token, keywords)
    n_token = len(token)
    token_zip = list(zip(token, token[1:]))
    for (start, end) in deictics:
        # extend the span to the last noun within the next two tokens
        n_sel = end
        for j in range(end, min(end + 3, n_token)):
            if token[j].pos_ == 'NOUN':
                n_sel = j
        # select the two tokens ahead
        token1 = token[min(n_sel + 1, n_token - 1)]
        token2 = token[min(n_sel + 2, n_token - 1)]
        # meta-discourse rule 1: deictic + verb_presentation
        if token1.pos_ == 'VERB' and (token1.lemma_ in keywords):
            meta_discourses.append((start, n_sel + 1))
        # meta-discourse rule 2: deictic + pronoun + verb_presentation
        if token1.pos_ == 'PRON' and (token2.lemma_ in keywords):
            meta_discourses.append((start, n_sel + 2))
    # meta-discourse rule 3: pronoun + verb_presentation, "We built the first ..."
    for token1, token2 in token_zip:
        if (token1.pos_ == 'PRON') and (token2.pos_ == 'VERB') and (token2.lemma_ in keywords):
            meta_discourses.append((token1.i, token2.i))
    return meta_discourses


def find_contribution(token, keywords=frozenset()):
    """
    Given a tokenized sentence,
    return all (start, end) indices of contribution statements.
    """
    contributions = []
    n_token = len(token)
    meta_discourses = find_meta_discourse(token, keywords)
    skip_noun_phrase_dict = dict(find_noun_phrase_offset(token))
    # contribution rule 1: meta-discourse + noun phrase
    for (start, end) in meta_discourses:
        np_end = skip_noun_phrase_dict.get(end + 1)
        if np_end is not None:
            contributions.append((start, np_end))
        elif token[min(end + 1, n_token - 1)].pos_ == 'NOUN':
            # no noun chunk starts right after the meta-discourse,
            # so end the span at the noun itself
            contributions.append((start, end + 1))
    # contribution rule 2: meta-discourse + adverb + adjective/noun
    for (start, end) in meta_discourses:
        token1 = token[min(end + 1, n_token - 1)]
        token2 = token[min(end + 2, n_token - 1)]
        if token1.pos_ == 'ADV' and token2.pos_ in ('ADJ', 'NOUN'):
            contributions.append((start, end + 2))
    return contributions


def find_claim(token, keywords=frozenset()):
    """
    Given a spaCy-tokenized sentence,
    return all (start, end) indices of claim statements.

    Parameters
    ==========
    token: spaCy tokens of the string
    keywords: set of keywords, defaults to all keywords located in the 'keywords' folder

    Output
    ======
    claims: list of tuples holding the token positions of claims;
        a non-empty list means the given token contains a claim

    Example
    =======
    keywords = load_keywords('keywords')
    find_claim(nlp('In this study ...'), keywords)  # returns positions of claims, if any
    """
    claims = []
    if len(token) < 5 or not any(
        t.text.lower() in TRIGGER_WORDS.union({'we', 'our'}) for t in token
    ):
        return claims
    deictics = find_deictic(token, keywords)
    meta_discourses = find_meta_discourse(token, keywords)
    skip_noun_phrase_dict = dict(find_noun_phrase_offset(token))
    n_token = len(token)
    lemma_all = [t.lemma_ for t in token]
    lemma_in_keywords = any((l in keywords) for l in lemma_all)
    # claim rule 1: meta-discourse + det + adj + trigger
    # "We built the first BauDenkMalNetz prototype using SMW"
    for (start, end) in meta_discourses:
        token1 = token[min(end + 1, n_token - 1)]
        token2 = token[min(end + 2, n_token - 1)]
        if (token1.pos_ == 'DET') and (token2.pos_ in ('ADJ', 'ADV')) and lemma_in_keywords:
            claims.append((start, end + 2))
    for (start, end) in deictics:
        # claim rule 2: deictic + adjective or adverb
        token1 = token[min(end + 1, n_token - 1)]
        token2 = token[min(end + 2, n_token - 1)]
        token3 = token[min(end + 3, n_token - 1)]
        if token1.pos_ == 'VERB' and token2.pos_ in ('ADJ', 'ADV') \
                and 'we ' in ' '.join([t.text.lower() for t in token[start:min(end + 2, n_token - 1)]]):
            claims.append((start, end + 2))
        # claim rule 3: deictic + VBP + ..., "We have found that ...",
        # "This paper has presented a computational strategy for ..."
        is_kw_in = any(t.text.lower() in TRIGGER_WORDS for t in token[start:end])
        if (
            token1.pos_ == 'VERB' and token2.pos_ == 'VERB' and
            (token3.pos_ in ['ADP', 'DET', 'ADJ', 'NUM'])
        ) and is_kw_in:
            claims.append((start, end + 3))
        # additional claim rule: "Our system maintains a set ..."
        deictics_text = ' '.join([t.text for t in token[start:end + 1]])
        if (token1.pos_ == 'VERB') and ('our' in deictics_text.lower()):
            claims.append((start, end + 1))
        # claim rule 5: "This work is an important first step ..."
        if token1.pos_ == 'VERB' and (
            (token2.pos_ == 'DET' and token3.pos_ == 'ADJ') or token2.pos_ == 'ADJ'
        ):
            claims.append((start, end + 2))
        if skip_noun_phrase_dict.get(token1.i + 1) is not None and is_kw_in:
            claims.append((start, skip_noun_phrase_dict.get(token1.i + 1)))
        # "Finally, we point out how to use the FOAF ..."
        if (token1.pos_ == 'PRON' and token1.text.lower() == 'we') and token2.pos_ == 'VERB':
            claims.append((start, end))
    # claim rule 4: "Our study also shows ..."
    noun_phrase_position = find_noun_phrase_offset(token)
    for (start, end) in noun_phrase_position:
        token1 = token[min(end, n_token - 1)]
        token2 = token[min(end + 1, n_token - 1)]
        tag_list = [t.text.lower() for t in token[start:end]]
        is_we_in = any(w in tag_list for w in ['we', 'our', 'paper', 'study'])
        if (token1.lemma_ in keywords) and is_we_in:  # find_extra_claim also accepts token2.lemma_ here
            claims.append((start, end))
        if token1.pos_ == 'ADV' and token2.lemma_ in keywords:
            claims.append((start, end + 1))
        # example: "In this paper, we discuss a web-first approach"
        if token1.pos_ == 'PUNCT' and (token2.pos_ == 'PRON' and token2.text.lower() == 'we'):
            claims.append((start, end + 1))
        # example: "In this manuscript we produce and analyze ..."
        if token1.pos_ == 'PRON' and token1.text.lower() == 'we' and token2.pos_ == 'VERB':
            claims.append((start, end))
    return list(set(claims))


def find_extra_claim(token, keywords=frozenset()):
    """
    Loosen one rule of ``find_claim`` to get more non-claim examples.
    """
    claims = []
    if len(token) < 5 or not any(
        t.text.lower() in TRIGGER_WORDS.union({'we', 'our'}) for t in token
    ):
        return claims
    n_token = len(token)
    noun_phrase_position = find_noun_phrase_offset(token)
    for (start, end) in noun_phrase_position:
        token1 = token[min(end, n_token - 1)]
        token2 = token[min(end + 1, n_token - 1)]
        tag_list = [t.text.lower() for t in token[start:end]]
        is_we_in = any(w in tag_list for w in ['we', 'our', 'paper', 'study'])
        # the loosened rule: also accept a keyword lemma one token past the noun phrase
        if ((token1.lemma_ in keywords) or (token2.lemma_ in keywords)) and is_we_in:
            claims.append((start, end))
        if token1.pos_ == 'ADV' and token2.lemma_ in keywords:
            claims.append((start, end + 1))
        # example: "In this paper, we discuss a web-first approach"
        if token1.pos_ == 'PUNCT' and (token2.pos_ == 'PRON' and token2.text.lower() == 'we'):
            claims.append((start, end + 1))
        # example: "In this manuscript we produce and analyze ..."
        if token1.pos_ == 'PRON' and token1.text.lower() == 'we' and token2.pos_ == 'VERB':
            claims.append((start, end))
    return list(set(claims))
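

# Minimal usage sketch. Assumptions: a 'keywords' folder with *.txt keyword
# lists exists in the working directory (as `load_keywords` expects), and the
# sample sentence is illustrative only; which rules fire depends on the tags
# the spaCy model assigns.
if __name__ == '__main__':
    keywords = load_keywords()
    doc = nlp('In this paper, we present a novel framework for detecting claims.')
    print('deictics:', find_deictic(doc, keywords))
    print('meta-discourse:', find_meta_discourse(doc, keywords))
    print('contributions:', find_contribution(doc, keywords))
    print('claims:', find_claim(doc, keywords))
    print('extra (loosened) claims:', find_extra_claim(doc, keywords))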