-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rationale code for Ubuntu data and medical data
- Loading branch information
Tao Lei
committed
Oct 27, 2016
1 parent
2281005
commit bdeaa6f
Showing
8 changed files
with
2,065 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
|
||
import gzip | ||
import random | ||
import json | ||
|
||
import theano | ||
import numpy as np | ||
|
||
from nn import EmbeddingLayer | ||
from utils import say, load_embedding_iterator | ||
|
||
def read_rationales(path):
    """Load rationale annotations from *path*, one JSON object per line.

    Transparently opens gzip-compressed files (".gz" suffix) as well as
    plain text. Returns the decoded objects as a list, in file order.
    """
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path) as fin:
        return [json.loads(line) for line in fin]
|
||
def read_annotations(path):
    """Read tab-separated annotated examples: "labels<TAB>tokens" per line.

    Each line holds whitespace-separated float labels, a tab, then the
    whitespace-separated text tokens. Lines with no tokens are skipped.
    Handles gzip-compressed input (".gz" suffix).

    Returns (data_x, data_y): token lists and float label arrays
    (dtype theano.config.floatX), aligned by index.
    """
    opener = gzip.open if path.endswith(".gz") else open
    data_x, data_y = [], []
    with opener(path) as fin:
        for line in fin:
            label_part, _, text_part = line.partition("\t")
            tokens = text_part.split()
            # skip examples with empty text
            if not tokens:
                continue
            labels = np.asarray([float(v) for v in label_part.split()],
                                dtype=theano.config.floatX)
            data_x.append(tokens)
            data_y.append(labels)
    say("{} examples loaded from {}\n".format(
            len(data_x), path
        ))
    say("max text length: {}\n".format(
            max(len(x) for x in data_x)
        ))
    return data_x, data_y
|
||
def create_embedding_layer(path):
    """Build an EmbeddingLayer over the pre-trained vectors at *path*.

    The vocabulary is seeded with "<unk>" (also the OOV token) and
    "<padding>"; the pre-trained embeddings remain trainable
    (fix_init_embs=False).
    """
    # NOTE(review): n_d=200 is assumed to match the dimensionality of the
    # pre-trained vectors at *path* -- confirm against the embedding file.
    return EmbeddingLayer(
        n_d=200,
        vocab=["<unk>", "<padding>"],
        embs=load_embedding_iterator(path),
        oov="<unk>",
        fix_init_embs=False,
    )
|
||
|
||
def create_batches(x, y, batch_size, padding_id, sort=True):
    """Split aligned (x, y) examples into padded mini-batches.

    x : list of token-id sequences
    y : list of label arrays aligned with x
    batch_size : examples per batch (the last batch may be smaller)
    padding_id : id used to left-pad shorter sequences in a batch
    sort : if True, sort examples by length before batching so each
        batch holds similar-length sequences (minimizing padding), then
        shuffle the batch order with a fixed seed for reproducibility.

    Returns (batches_x, batches_y), lists of per-batch matrices built
    by create_one_batch.
    """
    batches_x, batches_y = [], []
    n = len(x)
    # ceil(n / batch_size). The original used '/', which is floor
    # division on Python 2 ints but true division on Python 3, where it
    # yields a float and breaks range() and slicing below. '//' has the
    # same semantics on both.
    num_batches = (n - 1) // batch_size + 1
    if sort:
        order = sorted(range(n), key=lambda i: len(x[i]))
        x = [x[i] for i in order]
        y = [y[i] for i in order]
    for b in range(num_batches):  # xrange -> range: Py3-compatible, same behavior
        bx, by = create_one_batch(
            x[b * batch_size:(b + 1) * batch_size],
            y[b * batch_size:(b + 1) * batch_size],
            padding_id
        )
        batches_x.append(bx)
        batches_y.append(by)
    if sort:
        # fixed seed so the shuffled batch order is reproducible across runs
        random.seed(5817)
        perm = list(range(num_batches))  # shuffle needs a mutable sequence on Py3
        random.shuffle(perm)
        batches_x = [batches_x[i] for i in perm]
        batches_y = [batches_y[i] for i in perm]
    return batches_x, batches_y
|
||
def create_one_batch(lstx, lsty, padding_id):
    """Pad a list of sequences into one matrix batch.

    Each sequence in lstx is left-padded with padding_id up to the
    length of the longest sequence; the padded sequences become the
    *columns* of bx (shape: max_len x batch). The label arrays in lsty
    are stacked row-wise into by and cast to theano.config.floatX.
    All sequences must be non-empty.
    """
    max_len = max(len(seq) for seq in lstx)
    assert min(len(seq) for seq in lstx) > 0
    columns = [
        np.pad(seq, (max_len - len(seq), 0), "constant",
               constant_values=padding_id)
        for seq in lstx
    ]
    bx = np.column_stack(columns)
    by = np.vstack(lsty).astype(theano.config.floatX)
    return bx, by
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
|
||
import sys | ||
import argparse | ||
|
||
def load_arguments():
    """Parse command-line arguments for rationale-model training.

    All options keep their original names and defaults; this revision
    fixes the misspelled help text for --activation ("activatioin"),
    clarifies the -d2 help, and documents previously undocumented
    options. Returns the parsed argparse.Namespace.
    """
    argparser = argparse.ArgumentParser(sys.argv[0])
    argparser.add_argument("--embedding",
            type = str,
            default = "",
            help = "path to pre-trained word vectors"
        )
    argparser.add_argument("--save_model",
            type = str,
            default = "",
            help = "path to save model parameters"
        )
    argparser.add_argument("--train",
            type = str,
            default = "",
            help = "path to training data"
        )
    argparser.add_argument("--dev",
            type = str,
            default = "",
            help = "path to development data"
        )
    argparser.add_argument("--test",
            type = str,
            default = "",
            help = "path to test data"
        )
    argparser.add_argument("--dump",
            type = str,
            default = "",
            help = "path to dump rationale"
        )
    argparser.add_argument("--max_epochs",
            type = int,
            default = 800,
            help = "maximum # of epochs"
        )
    argparser.add_argument("--eval_period",
            type = int,
            default = -1,
            help = "evaluate model every k examples"
        )
    argparser.add_argument("--batch",
            type = int,
            default = 256,
            help = "mini-batch size"
        )
    argparser.add_argument("--learning",
            type = str,
            default = "adam",
            help = "learning method"
        )
    argparser.add_argument("--learning_rate",
            type = float,
            default = 0.0002,
            help = "learning rate"
        )
    argparser.add_argument("--dropout",
            type = float,
            default = 0.0,
            help = "dropout probability"
        )
    argparser.add_argument("--l2_reg",
            type = float,
            default = 1e-7,
            help = "L2 regularization weight"
        )
    # fixed typo: "activatioin" -> "activation"
    argparser.add_argument("-act", "--activation",
            type = str,
            default = "tanh",
            help = "type of activation function"
        )
    argparser.add_argument("-d", "--hidden_dimension",
            type = int,
            default = 50,
            help = "hidden dimension"
        )
    argparser.add_argument("-d2", "--hidden_dimension2",
            type = int,
            default = 50,
            help = "second hidden dimension"
        )
    argparser.add_argument("--layer",
            type = str,
            default = "rcnn",
            help = "type of recurrent layer"
        )
    argparser.add_argument("--depth",
            type = int,
            default = 1,
            help = "number of layers"
        )
    argparser.add_argument("--pooling",
            type = int,
            default = 0,
            help = "whether to use mean pooling or the last state"
        )
    argparser.add_argument("--order",
            type = int,
            default = 3,
            help = "feature filter width"
        )
    argparser.add_argument("--use_all",
            type = int,
            default = 0,
            help = "whether to use the states of all layers"
        )
    argparser.add_argument("--max_len",
            type = int,
            default = 2048,
            help = "max number of words in input"
        )
    argparser.add_argument("--sparsity",
            type = float,
            default = 0.0004,
            help = "regularization weight encouraging short rationales"
        )
    argparser.add_argument("--coherent",
            type = float,
            default = 3.0,
            help = "regularization weight encouraging contiguous rationales"
        )
    argparser.add_argument("--aspect",
            type = int,
            default = -1,
            help = "which aspect's label to predict (-1 for all)"
        )
    argparser.add_argument("--decay_lr",
            type = int,
            default = 0,
            help = "whether to decay the learning rate"
        )
    argparser.add_argument("--fix_emb",
            type = int,
            default = 1,
            help = "whether to keep word embeddings fixed during training"
        )
    args = argparser.parse_args()
    return args
Oops, something went wrong.