Commit

upload multiheadattn decoder

demdecuong committed Sep 30, 2020
1 parent 3680c9f commit a158fec
Showing 48 changed files with 241 additions and 68 deletions.
Binary file added src/__pycache__/config.cpython-38.pyc
Binary file not shown.
Binary file added src/__pycache__/loader.cpython-38.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion src/config.py
@@ -9,5 +9,5 @@
vocab_size = top_k + 1
features_shape = 2048
attention_features_shape = 64
-data_path = '/home/levanpon1009/work/project/xray-report/data'
+data_path = '../../xraydata/data'
EPOCHS = 20
8 changes: 5 additions & 3 deletions src/loader.py
@@ -5,13 +5,13 @@
from tensorflow.keras.applications.inception_resnet_v2 import preprocess_input
from sklearn.model_selection import train_test_split

-from src import config
+import config


def load_csv(data_root):
contents = pd.read_csv(os.path.join(data_root, 'data.csv'))
all_text = contents['findings'].map(lambda x: '<start> ' + x + ' <end>').astype(str).to_numpy()
-all_images = contents['filename'].map(lambda x: os.path.join(data_root, 'images', x)).astype(str).to_numpy()
+all_images = contents['filename'].map(lambda x: os.path.join(data_root, 'images/images_normalized', x)).astype(str).to_numpy()

train_images, valid_images, train_texts, valid_texts = train_test_split(all_images, all_text, test_size=0.2,
random_state=42)
@@ -47,7 +47,9 @@ def load_data(data_path):
train_images, valid_images, train_texts, valid_texts, all_text = load_csv(data_path)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=config.top_k,
-                                                  oov_token="<unk>")
+                                                  oov_token="<unk>",
+                                                  filters='!"#$\t\n',
+                                                  lower=True)

tokenizer.fit_on_texts(all_text)
tokenizer.word_index['<pad>'] = 0
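
The snippet below is not part of this commit; it is a quick, illustrative sketch of what the new tokenizer settings do, using made-up text and a placeholder value standing in for config.top_k (whose value is not shown in this diff). Because the custom filter list no longer strips '<' and '>', the '<start>' and '<end>' markers survive as tokens, and lower=True folds case.

import tensorflow as tf

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,  # placeholder for config.top_k
                                                  oov_token="<unk>",
                                                  filters='!"#$\t\n',
                                                  lower=True)
tokenizer.fit_on_texts(['<start> The lungs are clear <end>'])
print(tokenizer.texts_to_sequences(['<start> Lungs are clear <end>']))
# '<start>' and '<end>' map to their own indices instead of being stripped,
# and 'Lungs' is lowercased to match 'lungs' from the fitted text.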
Binary file added src/models/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
Binary file added src/models/__pycache__/decoder.cpython-38.pyc
Binary file not shown.
Binary file added src/models/__pycache__/encoder.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added src/models/__pycache__/transformer.cpython-38.pyc
Binary file not shown.
51 changes: 48 additions & 3 deletions src/models/decoder.py
@@ -1,5 +1,5 @@
import tensorflow as tf

from models.transformer import MultiHeadAttention

class BahdanauAttention(tf.keras.Model):
def __init__(self, units):
@@ -46,11 +46,11 @@ def __init__(self, embedding_dim, units, vocab_size):
self.fc1 = tf.keras.layers.Dense(self.units)
self.fc2 = tf.keras.layers.Dense(vocab_size)

-self.attention = BahdanauAttention(self.units)
+self.attention = MultiHeadAttention(self.units, num_heads=8)

def call(self, x, features, hidden):
# defining attention as a separate model
-context_vector, attention_weights = self.attention(features, hidden)
+context_vector, attention_weights = self.attention(features, features, features)  # multi-head self-attention over the image features; the layer's call signature is (v, k, q)

# x shape after passing through embedding == (batch_size, 1, embedding_dim)
x = self.embedding(x)
@@ -74,3 +74,48 @@ def call(self, x, features, hidden):

def reset_state(self, batch_size):
return tf.zeros((batch_size, self.units))

class MultiheadDecoder(tf.keras.Model):
def __init__(self, embedding_dim, units, vocab_size):
super(MultiheadDecoder, self).__init__()
self.units = units

self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.gru = tf.keras.layers.GRU(self.units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
self.fc1 = tf.keras.layers.Dense(self.units)
self.fc2 = tf.keras.layers.Dense(vocab_size)

self.multiheadattention = MultiHeadAttention(self.units, num_heads= 8)
self.attention = BahdanauAttention(self.units)

def call(self, x, features, hidden):
# defining attention as a separate model

features, _ = self.multiheadattention(features, features, features)
context_vector, attention_weights = self.attention(features, hidden)

# x shape after passing through embedding == (batch_size, 1, embedding_dim)
x = self.embedding(x)

# x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

# passing the concatenated vector to the GRU
output, state = self.gru(x)

# shape == (batch_size, max_length, hidden_size)
x = self.fc1(output)

# x shape == (batch_size * max_length, hidden_size)
x = tf.reshape(x, (-1, x.shape[2]))

# output shape == (batch_size * max_length, vocab)
x = self.fc2(x)

return x, state, attention_weights

def reset_state(self, batch_size):
return tf.zeros((batch_size, self.units))
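
The block below is not part of the diff; it is a minimal sketch of a single decoding step with the new MultiheadDecoder, under assumed sizes (embedding_dim=256, units=512, vocab_size=5001, 64 encoder feature vectors per image) and an assumed '<start>' token id.

import tensorflow as tf
from models.decoder import MultiheadDecoder  # import path as used elsewhere in this repo; adjust if needed

decoder = MultiheadDecoder(embedding_dim=256, units=512, vocab_size=5001)

batch_size = 16
features = tf.random.uniform((batch_size, 64, 256))  # encoder output: one vector per spatial location
hidden = decoder.reset_state(batch_size)              # (16, 512) initial GRU state
dec_input = tf.fill((batch_size, 1), 3)               # assumed id of the '<start>' token

predictions, hidden, attention_weights = decoder(dec_input, features, hidden)
# predictions: (16, 5001) logits over the vocabulary for the next word
# attention_weights: Bahdanau weights over the 64 self-attended feature locations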
2 changes: 1 addition & 1 deletion src/models/encoder.py
@@ -1,7 +1,7 @@
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3

-from src import config
+import config


class Encoder(tf.keras.Model):
49 changes: 0 additions & 49 deletions src/models/multihead_attn.py

This file was deleted.

141 changes: 141 additions & 0 deletions src/models/transformer.py
@@ -0,0 +1,141 @@
import tensorflow as tf

def scaled_dot_product_attention(q, k, v, mask):
"""Calculate the attention weights.
q, k, v must have matching leading dimensions.
k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
The mask has different shapes depending on its type(padding or look ahead)
but it must be broadcastable for addition.
Args:
q: query shape == (..., seq_len_q, depth)
k: key shape == (..., seq_len_k, depth)
v: value shape == (..., seq_len_v, depth_v)
mask: Float tensor with shape broadcastable
to (..., seq_len_q, seq_len_k). Defaults to None.
Returns:
output, attention_weights
"""

matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k)

# scale matmul_qk
dk = tf.cast(tf.shape(k)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

# add the mask to the scaled tensor.
if mask is not None:
scaled_attention_logits += (mask * -1e9)

# softmax is normalized on the last axis (seq_len_k) so that the scores
# add up to 1.
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k)

output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v)

return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model

assert d_model % self.num_heads == 0

self.depth = d_model // self.num_heads

self.wq = tf.keras.layers.Dense(d_model)
self.wk = tf.keras.layers.Dense(d_model)
self.wv = tf.keras.layers.Dense(d_model)

self.dense = tf.keras.layers.Dense(d_model)

def split_heads(self, x, batch_size):
"""Split the last dimension into (num_heads, depth).
Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
"""
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])

def call(self, v, k, q, mask = None):
batch_size = tf.shape(q)[0]

q = self.wq(q) # (batch_size, seq_len, d_model)
k = self.wk(k) # (batch_size, seq_len, d_model)
v = self.wv(v) # (batch_size, seq_len, d_model)

q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)

# scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention, attention_weights = scaled_dot_product_attention(
q, k, v, mask)

scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)

concat_attention = tf.reshape(scaled_attention,
(batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model)

output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model)

return output, attention_weights

def reset_state(self, batch_size):
return tf.zeros((batch_size, self.d_model))  # this layer defines d_model, not units

def create_padding_mask(seq):
seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

# add extra dimensions to add the padding
# to the attention logits.
return seq[:, tf.newaxis, tf.newaxis, :] # (batch_size, 1, 1, seq_len)
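
Not part of the committed file: a small check of create_padding_mask on a toy batch of padded token ids (the values are made up).

import tensorflow as tf
from models.transformer import create_padding_mask  # same module as above

seq = tf.constant([[7, 6, 0, 0, 1],
                   [1, 2, 3, 0, 0]])
mask = create_padding_mask(seq)
print(mask.shape)  # (2, 1, 1, 5)
# Positions holding padding id 0 become 1.0 in the mask; scaled_dot_product_attention
# then adds mask * -1e9 to their logits so they receive near-zero attention weight.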

class FullyConnected(tf.keras.layers.Layer):
    def __init__(self, d_model, dff=2048):
        super(FullyConnected, self).__init__()
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
            tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
        ])

    def call(self, x):
        return self.ffn(x)


class TransformerLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1, with_external = False):
super(TransformerLayer, self).__init__()

self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = FullyConnected(d_model, dff)

self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)

self.with_external = with_external
# if self.with_external:

def call(self, x, training, mask):
attn_output, _ = self.mha(x, x, x, mask) # (batch_size, input_seq_len, d_model)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)

ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)

return out2

class Transformer(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
        super(Transformer, self).__init__()
        self.enc_layers = [TransformerLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
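
The lines below are not part of the commit; they are a rough smoke test of one TransformerLayer from this file, with assumed sizes (d_model=512, num_heads=8, dff=2048) and random inputs.

import tensorflow as tf
from models.transformer import TransformerLayer  # same module as above

layer = TransformerLayer(d_model=512, num_heads=8, dff=2048, rate=0.1)

x = tf.random.uniform((4, 64, 512))        # (batch_size, seq_len, d_model)
out = layer(x, training=False, mask=None)  # self-attention + feed-forward, each with residual and layer norm
print(out.shape)                           # (4, 64, 512)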
