from torch.autograd import Variable

# BERT Parameters
- maxlen = 512
+ maxlen = 30
batch_size = 6
- max_pred = 20 # max tokens of prediction
- n_layers = 12
+ max_pred = 5 # max tokens of prediction
+ n_layers = 6
n_heads = 12
d_model = 768
d_ff = 768 * 4 # 4*d_model, FeedForward dimension
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

+ # sample IsNext and NotNext so they are balanced within the small batch
+ def make_batch():
+     batch = []
+     positive = negative = 0
+     while positive != batch_size/2 or negative != batch_size/2:
+         tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
+         tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
+         input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
+         segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
+
+         # MASK LM
+         n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15% of tokens in one sentence
+         cand_maked_pos = [i for i, token in enumerate(input_ids)
+                           if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
+         shuffle(cand_maked_pos)
+         masked_tokens, masked_pos = [], []
+         for pos in cand_maked_pos[:n_pred]:
+             masked_pos.append(pos)
+             masked_tokens.append(input_ids[pos])
+             if random() < 0.8:  # 80%
+                 input_ids[pos] = word_dict['[MASK]'] # make mask
+             elif random() < 0.5:  # 10%
+                 index = randint(0, vocab_size - 1) # random index in vocabulary
+                 input_ids[pos] = word_dict[number_dict[index]] # replace
+
+         # Zero Paddings
+         n_pad = maxlen - len(input_ids)
+         input_ids.extend([0] * n_pad)
+         segment_ids.extend([0] * n_pad)
+
+         # Zero Padding (100% - 15%) tokens
+         if max_pred > n_pred:
+             n_pad = max_pred - n_pred
+             masked_tokens.extend([0] * n_pad)
+             masked_pos.extend([0] * n_pad)
+
+         if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
+             batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
+             positive += 1
+         elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
+             batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
+             negative += 1
+     return batch
+ # Preprocessing Finished
+
+ def get_attn_pad_mask(seq_q, seq_k):
+     batch_size, len_q = seq_q.size()
+     batch_size, len_k = seq_k.size()
+     # eq(zero) is PAD token
+     pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
+     return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k
+
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
@@ -55,21 +107,21 @@ def __init__(self):
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)
-         self.drop = nn.Dropout(0.1)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
-         return self.drop(self.norm(embedding))
+         return self.norm(embedding)

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

-     def forward(self, Q, K, V, attn_mask=None):
+     def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)  # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
+         scores.masked_fill_(attn_mask, -1e9)  # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn
@@ -80,19 +132,18 @@ def __init__(self):
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
-
-     def forward(self, Q, K, V, attn_mask=None):
+     def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # v_s: [batch_size x n_heads x len_k x d_v]

-         if attn_mask is not None:  # attn_mask : [batch_size x len_q x len_k]
-             attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # attn_mask : [batch_size x n_heads x len_q x len_k]
+         attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # attn_mask : [batch_size x n_heads x len_q x len_k]
+
        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
-         context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask=attn_mask)
+         context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn  # output: [batch_size x len_q x d_model]
@@ -113,8 +164,8 @@ def __init__(self):
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

-     def forward(self, enc_inputs):
-         enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs)  # enc_inputs to same Q,K,V
+     def forward(self, enc_inputs, enc_self_attn_mask):
+         enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)  # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs)  # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

@@ -138,9 +189,11 @@ def __init__(self):
    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
+         enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
-             output, enc_self_attn = layer(output)
+             output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, len, len]
+         # sentence classification is decided by the first token ([CLS])
        h_pooled = self.activ1(self.fc(output[:, 0]))  # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled)  # [batch_size, 2]
@@ -151,80 +204,35 @@ def forward(self, input_ids, segment_ids, masked_pos):
        return logits_lm, logits_clsf

- # sample IsNext and NotNext to be same in small batch size
- def make_batch():
-     batch = []
-     positive = negative = 0
-     while positive != batch_size/2 or negative != batch_size/2:
-         tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
-         tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
-         input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
-         segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
-
-         # MASK LM
-         n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15% of tokens in one sentence
-         cand_maked_pos = [i for i, token in enumerate(input_ids)]
-         shuffle(cand_maked_pos)
-         masked_tokens, masked_pos = [], []
-         for pos in cand_maked_pos[:n_pred]:
-             masked_pos.append(pos)
-             masked_tokens.append(input_ids[pos])
-             if random() < 0.8: # 80%
-                 input_ids[pos] = word_dict['[MASK]'] # make mask
-             elif random() < 0.5: # 10%
-                 index = randint(0, vocab_size - 1) # random index in vocabulary
-                 input_ids[pos] = word_dict[number_dict[index]] # replace
-
-         # Zero Paddings
-         n_pad = maxlen - len(input_ids)
-         input_ids.extend([0] * n_pad)
-         segment_ids.extend([0] * n_pad)
-
-         # Zero Padding (100% - 15%) tokens
-         if max_pred > n_pred:
-             n_pad = max_pred - n_pred
-             masked_tokens.extend([0] * n_pad)
-             masked_pos.extend([0] * n_pad)
-
-         if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
-             batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
-             positive += 1
-         elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
-             batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
-             negative += 1
-     return batch
- # Proprecessing Finished
-
model = BERT()
- criterion1 = nn.CrossEntropyLoss(reduction='none')
- criterion2 = nn.CrossEntropyLoss()
- optimizer = optim.Adam(model.parameters(), lr=1e-4)
+ criterion = nn.CrossEntropyLoss()
+ optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
- input_ids = Variable(torch.LongTensor(input_ids))
- segment_ids = Variable(torch.LongTensor(segment_ids))
- masked_pos = Variable(torch.LongTensor(masked_pos))
- masked_tokens = Variable(torch.LongTensor(masked_tokens))
- isNext = Variable(torch.LongTensor(isNext))
+ input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
+     torch.LongTensor(input_ids), torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens), \
+     torch.LongTensor(masked_pos), torch.LongTensor(isNext)

- for epoch in range(25):
+ for epoch in range(100):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
-     loss_lm = criterion1(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
+     loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
-     loss_clsf = criterion2(logits_clsf, isNext) # for sentence classification
+     loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
    loss = loss_lm + loss_clsf
-     print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
+     if (epoch + 1) % 10 == 0:
+         print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

# Predict masked tokens and isNext
- input_ids, segment_ids, masked_tokens, masked_pos, isNext = make_batch()[0]
+ input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[0]
print(text)
print([number_dict[w] for w in input_ids if number_dict[w] != '[PAD]'])

- logits_lm, logits_clsf = model(Variable(torch.LongTensor([input_ids])), Variable(torch.LongTensor([segment_ids])), Variable(torch.LongTensor([masked_pos])))
+ logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
+     torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ', [pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ', [pos for pos in logits_lm if pos != 0])