BERT - Assignment - Jupyter Notebook
instructions:
2. Please read the instructions in the code cells and markdown cells. We will explain what to write.
3. Please return outputs in the same format that we asked for. E.g., don't return a List if we are asking for a numpy array.
4. Please read the external links that we have given so that you will learn the concepts behind the code that you are writing.
In [ ]: 1 #all imports
2 import numpy as np
3 import pandas as pd
4 import tensorflow as tf
5 import tensorflow_hub as hub
6 from tensorflow.keras.models import Model
In [ ]: 1 tf.test.gpu_device_name()
Grader function 1
def grader_tf_version(version=None):
    """Grader: assert that the TensorFlow major version is >= 2.

    Parameters
    ----------
    version : str, optional
        Version string to validate. Defaults to the installed
        ``tf.__version__``. Exposed as a parameter (backward-compatible)
        so the check can be exercised without TensorFlow installed.

    Returns
    -------
    bool
        True when the check passes; raises AssertionError otherwise.
    """
    if version is None:
        version = tf.__version__
    # Compare the numeric major version, not the raw string: the original
    # lexicographic check (version > '2') would wrongly reject a
    # hypothetical '10.x' release.
    major = int(version.split('.')[0])
    assert major >= 2
    return True

if __name__ == "__main__":
    # In a notebook __name__ is '__main__', so this still runs on cell
    # execution, but the module now imports cleanly elsewhere.
    grader_tf_version()
Part-1: Preprocessing
Grader function 2
In [ ]: 1 def grader_reviews():
# Grader: verifies the loaded `reviews` DataFrame has the expected
# shape (525814 rows x 2 columns) and, presumably, the expected
# Score value counts -- the condition below is truncated in this
# extraction; TODO confirm the full expression against the original
# notebook.
2 temp_shape = (reviews.shape == (525814, 2)) and (reviews.Score.value_cou
3 assert(temp_shape == True)
4 return True
5 grader_reviews()
def get_wordlen(x):
    """Return the number of whitespace-separated words in *x*."""
    words = x.split()
    return len(words)
# Add a word-count column, keep only short reviews (fewer than 50
# words), then draw a reproducible 100k-row sample.
reviews['len'] = reviews['Text'].apply(get_wordlen)
reviews = reviews.loc[reviews['len'] < 50]
reviews = reviews.sample(n=100000, random_state=30)
In [ ]: 1 #remove HTML from the Text column and save in the Text column only
In [ ]: 1 #print head 5
In [ ]: 1 #split the data into train and validation data (20%) with stratified sampling,
For this assignment, we are using BERT uncased Base model (https://tfhu
b.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1). It uses L=12 hidden
layers (i.e., Transformer blocks), a hidden size of H=768, and A=12 atte
ntion heads.
In [ ]: 1 bert_model.summary()
In [ ]: 1 bert_model.output
Part-3: Tokenization
Grader function 3
Example
In [ ]: 1 import pickle
In [ ]: 1 ##save all your results to disk so that, no need to run all again.
# NOTE(review): both dump calls below are truncated in this extraction;
# they appear to pickle the tokenized train/test arrays to files opened
# inline -- confirm filenames and open modes against the original
# notebook.
2 pickle.dump((X_train, X_train_tokens, X_train_mask, X_train_segment, y_train
3 pickle.dump((X_test, X_test_tokens, X_test_mask, X_test_segment, y_test),ope
Grader function 4
In [ ]: 1 def grader_alltokens_train():
# Grader: validates the BERT inputs built for the TRAIN split.
# Checks performed (several lines are truncated in this extraction --
# confirm the full expressions against the original notebook):
#   * tokens/mask/segment arrays are np.ndarray with
#     shape[1] == max_seq_length
#   * all segment ids are 0 (single-sentence inputs)
#   * count of 0s in the mask equals count of 0 (pad) token ids
#   * presumably: one [CLS] and one [SEP] id per row
2 out = False
3
4 if type(X_train_tokens) == np.ndarray:
5
6 temp_shapes = (X_train_tokens.shape[1]==max_seq_length) and (X_train_
7 (X_train_segment.shape[1]==max_seq_length)
8
# Single-sequence task: np.any is False only when every id is 0.
9 segment_temp = not np.any(X_train_segment)
10
# Padding positions (token id 0) must be exactly the masked-out ones.
11 mask_temp = np.sum(X_train_mask==0) == np.sum(X_train_tokens==0)
12
13 no_cls = np.sum(X_train_tokens==tokenizer.vocab['[CLS]'])==X_train_t
14
15 no_sep = np.sum(X_train_tokens==tokenizer.vocab['[SEP]'])==X_train_t
16
17 out = temp_shapes and segment_temp and mask_temp and no_cls and no_s
18
19 else:
# NOTE(review): this message reads inverted -- the branch above requires
# np.ndarray, so it should say "should be numpy array, not list".
# Runtime string left unchanged here.
20 print('Type of all above token arrays should be list not numpy array
21 out = False
22 assert(out==True)
23 return out
24
25 grader_alltokens_train()
Grader function 5
In [ ]: 1 def grader_alltokens_test():
# Grader: same validation as grader_alltokens_train, applied to the
# TEST split (shapes, zero segment ids, pad/mask agreement, and --
# presumably, given truncation -- one [CLS]/[SEP] per row). Several
# lines are cut off in this extraction; confirm against the original
# notebook.
2 out = False
3 if type(X_test_tokens) == np.ndarray:
4
5 temp_shapes = (X_test_tokens.shape[1]==max_seq_length) and (X_test_m
6 (X_test_segment.shape[1]==max_seq_length)
7
# Single-sequence task: every segment id must be 0.
8 segment_temp = not np.any(X_test_segment)
9
# Pad token ids (0) must line up exactly with mask zeros.
10 mask_temp = np.sum(X_test_mask==0) == np.sum(X_test_tokens==0)
11
12 no_cls = np.sum(X_test_tokens==tokenizer.vocab['[CLS]'])==X_test_tok
13
14 no_sep = np.sum(X_test_tokens==tokenizer.vocab['[SEP]'])==X_test_tok
15
16 out = temp_shapes and segment_temp and mask_temp and no_cls and no_s
17
18 else:
# NOTE(review): message looks inverted -- the check requires a numpy
# array. Runtime string left unchanged here.
19 print('Type of all above token arrays should be list not numpy array
20 out = False
21 assert(out==True)
22 return out
23 grader_alltokens_test()
In [ ]: 1 bert_model.input
In [ ]: 1 bert_model.output
In [ ]: 1 # get the train output, BERT model will give one output so save in
2 # X_train_pooled_output
In [ ]: 1 # get the test output, BERT model will give one output so save in
2 # X_test_pooled_output
In [ ]: 1 ##save all your results to disk so that, no need to run all again.
2 pickle.dump((X_train_pooled_output, X_test_pooled_output),open('final_output
Grader function 6
In [ ]: 1 ##imports
2 from tensorflow.keras.layers import Input, Dense, Activation, Dropout
3 from tensorflow.keras.models import Model
In [ ]: 1 ##create an NN and
2
In [ ]: 1