Skip to content

Commit

Permalink
Rationale code for the Ubuntu dataset and the medical dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Tao Lei committed Oct 27, 2016
1 parent 2281005 commit bdeaa6f
Show file tree
Hide file tree
Showing 8 changed files with 2,065 additions and 0 deletions.
83 changes: 83 additions & 0 deletions code/rationale/medical/myio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@

import gzip
import random
import json

import theano
import numpy as np

from nn import EmbeddingLayer
from utils import say, load_embedding_iterator

def read_rationales(path):
    """Load rationale annotations from *path*, one JSON object per line.

    Transparently handles gzip-compressed files (``.gz`` suffix).
    Returns the parsed objects as a list, in file order.
    """
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path) as fin:
        return [ json.loads(line) for line in fin ]

def read_annotations(path):
    """Read labeled examples from *path* (optionally gzipped).

    Each line has the form ``"<labels>\t<text>"``; the label side is a
    whitespace-separated list of numbers, the text side a token sequence.
    Lines whose text side is empty are skipped.

    Returns (data_x, data_y): token lists and parallel floatX label vectors.
    """
    xs, ys = [ ], [ ]
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path) as fin:
        for line in fin:
            label_part, _, text_part = line.partition("\t")
            tokens = text_part.split()
            if not tokens:
                continue
            labels = np.asarray([ float(v) for v in label_part.split() ],
                                dtype = theano.config.floatX)
            xs.append(tokens)
            ys.append(labels)
    say("{} examples loaded from {}\n".format(
            len(xs), path
        ))
    say("max text length: {}\n".format(
            max(len(t) for t in xs)
        ))
    return xs, ys

def create_embedding_layer(path):
    """Build an EmbeddingLayer over the pre-trained vectors stored at *path*.

    The vocabulary is seeded with "<unk>" and "<padding>", and
    out-of-vocabulary words map to "<unk>".  Embeddings are left
    trainable (fix_init_embs is False).
    """
    return EmbeddingLayer(
        n_d = 200,
        vocab = [ "<unk>", "<padding>" ],
        embs = load_embedding_iterator(path),
        oov = "<unk>",
        fix_init_embs = False
    )


def create_batches(x, y, batch_size, padding_id, sort=True):
    """Split examples into padded mini-batches.

    Parameters
    ----------
    x : list of token sequences
    y : list of label vectors, parallel to *x*
    batch_size : int, maximum number of examples per batch
    padding_id : padding token id forwarded to create_one_batch
    sort : bool; when True, sort examples by length before batching (so
        each batch pads less), then shuffle the batch ORDER with a fixed
        seed so the result is still deterministic across runs.

    Returns (batches_x, batches_y), one padded matrix / label stack per
    batch, as produced by create_one_batch.
    """
    batches_x, batches_y = [ ], [ ]
    N = len(x)
    # Floor division keeps M an int under Python 3; the original
    # `(N-1)/batch_size` yields a float there and breaks range() below.
    M = (N-1)//batch_size + 1
    if sort:
        perm = sorted(range(N), key=lambda i: len(x[i]))
        x = [ x[i] for i in perm ]
        y = [ y[i] for i in perm ]
    # `range` replaces Python-2-only `xrange`; semantics are identical here.
    for i in range(M):
        bx, by = create_one_batch(
            x[i*batch_size:(i+1)*batch_size],
            y[i*batch_size:(i+1)*batch_size],
            padding_id
        )
        batches_x.append(bx)
        batches_y.append(by)
    if sort:
        random.seed(5817)
        # list(): random.shuffle needs a mutable sequence (a bare
        # Python-3 range object would raise TypeError).
        perm2 = list(range(M))
        random.shuffle(perm2)
        batches_x = [ batches_x[i] for i in perm2 ]
        batches_y = [ batches_y[i] for i in perm2 ]
    return batches_x, batches_y

def create_one_batch(lstx, lsty, padding_id):
    """Pad a group of sequences to a common length and stack the labels.

    Every sequence in *lstx* is left-padded with *padding_id* up to the
    length of the longest one, then the padded sequences are stacked as
    columns (shape: max_len x batch).  Labels are stacked row-wise and
    cast to theano's floatX.  All sequences must be non-empty.
    """
    max_len = max(len(seq) for seq in lstx)
    assert min(len(seq) for seq in lstx) > 0
    padded = [
        np.pad(seq, (max_len - len(seq), 0), "constant",
               constant_values = padding_id)
        for seq in lstx
    ]
    bx = np.column_stack(padded)
    by = np.vstack(lsty).astype(theano.config.floatX)
    return bx, by
138 changes: 138 additions & 0 deletions code/rationale/medical/options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@

import sys
import argparse

def load_arguments():
    """Parse command-line options for rationale model training/evaluation.

    Returns
    -------
    argparse.Namespace holding data paths, optimization settings and
    model hyper-parameters (all with the defaults below).
    """
    argparser = argparse.ArgumentParser(sys.argv[0])
    # --- data and model paths ---
    argparser.add_argument("--embedding", type=str, default="",
            help="path to pre-trained word vectors")
    argparser.add_argument("--save_model", type=str, default="",
            help="path to save model parameters")
    argparser.add_argument("--train", type=str, default="",
            help="path to training data")
    argparser.add_argument("--dev", type=str, default="",
            help="path to development data")
    argparser.add_argument("--test", type=str, default="",
            help="path to test data")
    argparser.add_argument("--dump", type=str, default="",
            help="path to dump rationale")
    # --- optimization ---
    argparser.add_argument("--max_epochs", type=int, default=800,
            help="maximum # of epochs")
    argparser.add_argument("--eval_period", type=int, default=-1,
            help="evaluate model every k examples")
    argparser.add_argument("--batch", type=int, default=256,
            help="mini-batch size")
    argparser.add_argument("--learning", type=str, default="adam",
            help="learning method")
    argparser.add_argument("--learning_rate", type=float, default=0.0002,
            help="learning rate")
    argparser.add_argument("--dropout", type=float, default=0.0,
            help="dropout probability")
    argparser.add_argument("--l2_reg", type=float, default=1e-7,
            help="L2 regularization weight")
    # --- model architecture ---
    # NOTE: fixed typo "activatioin" -> "activation" in the help string.
    argparser.add_argument("-act", "--activation", type=str, default="tanh",
            help="type of activation function")
    argparser.add_argument("-d", "--hidden_dimension", type=int, default=50,
            help="hidden dimension")
    argparser.add_argument("-d2", "--hidden_dimension2", type=int, default=50,
            help="hidden dimension")
    argparser.add_argument("--layer", type=str, default="rcnn",
            help="type of recurrent layer")
    argparser.add_argument("--depth", type=int, default=1,
            help="number of layers")
    argparser.add_argument("--pooling", type=int, default=0,
            help="whether to use mean pooling or the last state")
    argparser.add_argument("--order", type=int, default=3,
            help="feature filter width")
    argparser.add_argument("--use_all", type=int, default=0,
            help="whether to use the states of all layers")
    argparser.add_argument("--max_len", type=int, default=2048,
            help="max number of words in input")
    # --- rationale regularization and misc flags ---
    argparser.add_argument("--sparsity", type=float, default=0.0004)
    argparser.add_argument("--coherent", type=float, default=3.0)
    argparser.add_argument("--aspect", type=int, default=-1)
    argparser.add_argument("--decay_lr", type=int, default=0)
    argparser.add_argument("--fix_emb", type=int, default=1)
    args = argparser.parse_args()
    return args
Loading

0 comments on commit bdeaa6f

Please sign in to comment.