From 48eec7b02763c4c8f73f6cc9e3a967d0a826769f Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 09:23:22 -0600
Subject: [PATCH 1/5] Add torch preprocessing script

---
 scripts/preprocess.lua | 81 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 scripts/preprocess.lua

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
new file mode 100644
index 00000000..053526f2
--- /dev/null
+++ b/scripts/preprocess.lua
@@ -0,0 +1,81 @@
+require 'pl'
+
+cmd = torch.CmdLine()
+cmd:text()
+cmd:text('Preprocess a text file for training a language model.')
+cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
+cmd:option('--output_t7', 'data/tiny-shakespeare.t7', 'Output data file in torch binary format')
+cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7', 'Output vocab in torch binary format')
+cmd:option('--val_frac', 0.1, 'Validation fraction')
+cmd:option('--test_frac', 0.1, 'Testing fraction')
+cmd:option('--quiet', false, 'Disable all verbose outputs')
+cmd:text()
+opt = cmd:parse(arg or {})
+
+
+-- First pass: collect statistics and build vocab
+char2index = {}
+char_count = 0
+vocab_count = 0
+f = io.open(opt.input_text)
+while true do
+  line = f:read()
+  if not line then break end
+  for c in line:gmatch('.') do
+    if not char2index[c] then
+      vocab_count = vocab_count + 1
+      char2index[c] = vocab_count
+    end
+    char_count = char_count + 1
+  end
+  -- new line
+  char_count = char_count + 1
+end
+f:close()
+-- XXX: hard-coded newline string
+vocab_count = vocab_count + 1
+char2index['\n'] = vocab_count
+index2char = {}
+-- create index to vocab map
+for k, v in pairs(char2index) do table.insert(index2char, k) end
+
+-- compute split size
+val_size = math.floor(opt.val_frac * char_count)
+test_size = math.floor(opt.test_frac * char_count)
+train_size = char_count - val_size - test_size
+
+-- verbose
+if not opt.quiet then
+  print('Total vocabulary size: ' .. #index2char)
+  print('Total tokens in file: ' .. char_count)
+  print(' Training size: ' .. train_size)
+  print(' Val size: ' .. val_size)
+  print(' Test size: ' .. test_size)
+end
+
+train = torch.IntTensor(train_size)
+valid = torch.IntTensor(val_size)
+test = torch.IntTensor(test_size)
+dataset = {train, valid, test}
+
+-- Second pass: read the data into the tensors
+split_idx, cur_idx = 1, 1
+f = io.open(opt.input_text)
+while true do
+  line = f:read()
+  if not line then break end
+  -- XXX: hard-coded newline
+  line = line .. '\n'
+  for c in line:gmatch('.') do
+    dataset[split_idx][cur_idx] = char2index[c]
+    cur_idx = cur_idx + 1
+    if cur_idx > dataset[split_idx]:size(1) then
+      split_idx = split_idx + 1
+      cur_idx = 1
+    end
+  end
+end
+f:close()
+-- save to file
+torch.save(opt.output_t7, dataset)
+torch.save(opt.output_vocab, char2index)
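A quick sanity check of the split arithmetic above, as a standalone Lua sketch (the 1000-character corpus is hypothetical, chosen only to make the numbers round):

    -- Split sizes under the script's default fractions (0.1 val, 0.1 test).
    local char_count = 1000
    local val_size = math.floor(0.1 * char_count)         -- 100
    local test_size = math.floor(0.1 * char_count)        -- 100
    local train_size = char_count - val_size - test_size  -- 800
    print(train_size, val_size, test_size)                -- 800  100  100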
From 2d6f1b274adb51d4409e532f550bbc7b91836b82 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 09:44:11 -0600
Subject: [PATCH 2/5] Split output data into separate files and handle zero valid frac

---
 scripts/preprocess.lua | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
index 053526f2..2f41864a 100644
--- a/scripts/preprocess.lua
+++ b/scripts/preprocess.lua
@@ -4,8 +4,15 @@ cmd = torch.CmdLine()
 cmd:text()
 cmd:text('Preprocess a text file for training a language model.')
 cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
-cmd:option('--output_t7', 'data/tiny-shakespeare.t7', 'Output data file in torch binary format')
-cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7', 'Output vocab in torch binary format')
+-- split output into multiple files
+cmd:option('--train_t7', 'data/train-tiny-shakespeare.t7',
+           'Output training data file in torch binary format')
+cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
+           'Output validation data file in torch binary format')
+cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
+           'Output test data file in torch binary format')
+cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7',
+           'Output vocab in torch binary format')
 cmd:option('--val_frac', 0.1, 'Validation fraction')
 cmd:option('--test_frac', 0.1, 'Testing fraction')
 cmd:option('--quiet', false, 'Disable all verbose outputs')
@@ -67,6 +74,10 @@ while true do
   -- XXX: hard-coded newline
   line = line .. '\n'
   for c in line:gmatch('.') do
+    -- a split may have size 0; skip it
+    while dataset[split_idx]:size():size() == 0 do
+      split_idx = split_idx + 1
+    end
     dataset[split_idx][cur_idx] = char2index[c]
     cur_idx = cur_idx + 1
     if cur_idx > dataset[split_idx]:size(1) then
@@ -77,5 +88,7 @@
 end
 f:close()
 -- save to file
-torch.save(opt.output_t7, dataset)
+torch.save(opt.train_t7, train)
+if val_size > 0 then torch.save(opt.valid_t7, valid) end
+if test_size > 0 then torch.save(opt.test_t7, test) end
 torch.save(opt.output_vocab, char2index)
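Assuming the option names above, the zero-fraction case this patch guards against would be exercised by an invocation like:

    th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt \
      --val_frac 0 --test_frac 0.1

With --val_frac 0 the valid tensor is empty, so the second pass has to skip it when filling the splits, and no valid-*.t7 file is written.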
From 1265acd57327665f5d12027e16a076c15715f567 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 10:43:09 -0600
Subject: [PATCH 3/5] Fix vocab bug

---
 scripts/preprocess.lua | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
index 2f41864a..71a97941 100644
--- a/scripts/preprocess.lua
+++ b/scripts/preprocess.lua
@@ -1,4 +1,5 @@
 require 'pl'
+require 'torch'
 
 cmd = torch.CmdLine()
 cmd:text()
@@ -11,7 +12,7 @@ cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
            'Output validation data file in torch binary format')
 cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
            'Output test data file in torch binary format')
-cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7',
+cmd:option('--output_vocab', 'data/vocab-tiny-shakespeare.t7',
            'Output vocab in torch binary format')
 cmd:option('--val_frac', 0.1, 'Validation fraction')
 cmd:option('--test_frac', 0.1, 'Testing fraction')
@@ -44,7 +45,7 @@ vocab_count = vocab_count + 1
 char2index['\n'] = vocab_count
 index2char = {}
 -- create index to vocab map
-for k, v in pairs(char2index) do table.insert(index2char, k) end
+for k, v in pairs(char2index) do index2char[v] = k end
 
 -- compute split size
 val_size = math.floor(opt.val_frac * char_count)
@@ -91,4 +92,4 @@ f:close()
 torch.save(opt.train_t7, train)
 if val_size > 0 then torch.save(opt.valid_t7, valid) end
 if test_size > 0 then torch.save(opt.test_t7, test) end
-torch.save(opt.output_vocab, char2index)
+torch.save(opt.output_vocab, index2char)
\ No newline at end of file
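The bug fixed here is that pairs() iterates char2index in unspecified order, so the old table.insert(index2char, k) could pair characters with the wrong indices. A standalone sketch of the invariant the one-line fix restores (the three-entry vocab is hypothetical):

    -- index2char must exactly invert char2index.
    local char2index = {a = 2, b = 1, c = 3}
    local index2char = {}
    for k, v in pairs(char2index) do index2char[v] = k end
    for c, i in pairs(char2index) do
      assert(index2char[i] == c)  -- round-trip holds for every character
    end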
From 64b1a66785530d295561f0c24a999dcb8af27546 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 11:53:22 -0600
Subject: [PATCH 4/5] Add data loader that reads torch binary format

---
 util/MiniBatchLoader.lua | 73 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 util/MiniBatchLoader.lua

diff --git a/util/MiniBatchLoader.lua b/util/MiniBatchLoader.lua
new file mode 100644
index 00000000..19dd365c
--- /dev/null
+++ b/util/MiniBatchLoader.lua
@@ -0,0 +1,73 @@
+require 'torch'
+require 'xlua'
+
+local MiniBatchLoader = torch.class('MiniBatchLoader')
+
+function MiniBatchLoader:__init(config)
+  config = config or {}
+  local args
+  args, self.train_file, self.valid_file, self.test_file,
+  self.batch_size, self.seq_length
+  = xlua.unpack(
+    {config},
+    'MiniBatchLoader',
+    'Load data files in torch binary format. Data will be clipped to fit mini-batches',
+    {arg='train_file', type='string', default='data/train-tiny-shakespeare.t7',
+     help='training data in torch binary (see scripts/preprocess.lua)'},
+    {arg='valid_file', type='string', default='data/valid-tiny-shakespeare.t7',
+     help='validation data in torch binary (see scripts/preprocess.lua)'},
+    {arg='test_file', type='string', default='data/test-tiny-shakespeare.t7',
+     help='test data in torch binary (see scripts/preprocess.lua)'},
+    {arg='batch_size', type='number', default=8,
+     help='number of sequences to run for each mini-batch'},
+    {arg='seq_length', type='number', default=6,
+     help='number of characters for each sequence'}
+  )
+  self.x_splits = {}
+  self.y_splits = {}
+  self.split_sizes = {}
+  local b, l = self.batch_size, self.seq_length
+  self.x_splits['train'], self.y_splits['train'] = self:loadData(self.train_file, b, l)
+  self.x_splits['val'], self.y_splits['val'] = self:loadData(self.valid_file, b, l)
+  self.x_splits['test'], self.y_splits['test'] = self:loadData(self.test_file, b, l)
+  self.split_sizes['train'] = self.x_splits['train']:size(1)
+  self.split_sizes['val'] = self.x_splits['val']:size(1)
+  self.split_sizes['test'] = self.x_splits['test']:size(1)
+  self.split_idxs = {train=1, val=1, test=1}
+  collectgarbage()
+end
+
+function MiniBatchLoader:loadData(file_path, b, l)
+  local tensor = torch.load(file_path)
+  local num = tensor:nElement()
+  local extra = num % (b * l)
+  if extra == 0 then extra = b * l end -- always leave one char for the shifted targets
+  -- Chop out the extra bits at the end to make it evenly divide
+  -- Each batch will have a continuous stream of data
+  local vx = tensor[{{1, num - extra}}]:view(b, -1, l)
+  local vy = tensor[{{2, num - extra + 1}}]:view(b, -1, l)
+  -- rearrange data so that the last two dimensions are B and L
+  -- XXX: This is not very efficient.
+  local vxx = torch.IntTensor(vx:size(2), vx:size(1), vx:size(3))
+  local vyy = torch.IntTensor(vy:size(2), vy:size(1), vy:size(3))
+  for i = 1, vyy:size(1) do
+    vyy[i] = vy[{{}, i, {}}]
+    vxx[i] = vx[{{}, i, {}}]
+  end
+  vxx = vxx:contiguous()
+  vyy = vyy:contiguous()
+  return vxx, vyy
+end
+
+function MiniBatchLoader:nextBatch(split)
+  local idx = self.split_idxs[split]
+  assert(idx, 'invalid split ' .. split)
+  local x = self.x_splits[split][idx]
+  local y = self.y_splits[split][idx]
+  if idx == self.split_sizes[split] then
+    self.split_idxs[split] = 1
+  else
+    self.split_idxs[split] = idx + 1
+  end
+  return x, y
+end
\ No newline at end of file
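To see what loadData produces, here is a minimal standalone sketch of the same clip-and-view logic (hypothetical 50-element corpus with batch_size=2 and seq_length=4; not part of the patch):

    require 'torch'

    local data = torch.range(1, 50):int()  -- stand-in for a saved index tensor
    local b, l = 2, 4
    local num = data:nElement()
    local extra = num % (b * l)            -- 50 % 8 = 2 trailing chars dropped
    local vx = data[{{1, num - extra}}]:view(b, -1, l)
    local vy = data[{{2, num - extra + 1}}]:view(b, -1, l)
    print(vx:size())                       -- 2 x 6 x 4
    -- vy is vx shifted by one element: y[t] is the prediction target for x[t]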
From dc7fa244599c7fd022544fcabd58e60eec02374d Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 12:18:03 -0600
Subject: [PATCH 5/5] Add support for torch binary file format

---
 train.lua                | 24 +++++++++++++++++++-----
 util/MiniBatchLoader.lua |  6 +++---
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/train.lua b/train.lua
index 6215443a..76e3ffeb 100644
--- a/train.lua
+++ b/train.lua
@@ -4,6 +4,7 @@ require 'optim'
 
 require 'LanguageModel'
 require 'util.DataLoader'
+require 'util.MiniBatchLoader'
 
 local utils = require 'util.utils'
 
@@ -15,6 +16,12 @@ cmd:option('-input_h5', 'data/tiny-shakespeare.h5')
 cmd:option('-input_json', 'data/tiny-shakespeare.json')
 cmd:option('-batch_size', 50)
 cmd:option('-seq_length', 50)
+-- Optional: load the dataset in torch binary (t7) format
+cmd:option('-format', 'h5')
+cmd:option('-train_t7', 'data/train-tiny-shakespeare.t7')
+cmd:option('-valid_t7', 'data/valid-tiny-shakespeare.t7')
+cmd:option('-test_t7', 'data/test-tiny-shakespeare.t7')
+cmd:option('-vocab_t7', 'data/vocab-tiny-shakespeare.t7')
 
 -- Model options
 cmd:option('-model_type', 'lstm')
@@ -44,7 +51,7 @@ cmd:option('-gpu', 0)
 cmd:option('-gpu_backend', 'cuda')
 
 local opt = cmd:parse(arg)
-
+print(opt)
 
 -- Set up GPU stuff
 local dtype = 'torch.FloatTensor'
@@ -70,13 +77,20 @@ end
 
 
 -- Initialize the DataLoader and vocabulary
-local loader = DataLoader(opt)
-local vocab = utils.read_json(opt.input_json)
+local loader, vocab
 local idx_to_token = {}
-for k, v in pairs(vocab.idx_to_token) do
-  idx_to_token[tonumber(k)] = v
+if opt.format == 't7' then
+  loader = MiniBatchLoader(opt)
+  idx_to_token = torch.load(opt.vocab_t7)
+else
+  loader = DataLoader(opt)
+  vocab = utils.read_json(opt.input_json)
+  for k, v in pairs(vocab.idx_to_token) do
+    idx_to_token[tonumber(k)] = v
+  end
 end
 
+
 -- Initialize the model and criterion
 local opt_clone = torch.deserialize(torch.serialize(opt))
 opt_clone.idx_to_token = idx_to_token

diff --git a/util/MiniBatchLoader.lua b/util/MiniBatchLoader.lua
index 19dd365c..40fb5d1b 100644
--- a/util/MiniBatchLoader.lua
+++ b/util/MiniBatchLoader.lua
@@ -12,11 +12,11 @@ function MiniBatchLoader:__init(config)
     {config},
     'MiniBatchLoader',
     'Load data files in torch binary format. Data will be clipped to fit mini-batches',
-    {arg='train_file', type='string', default='data/train-tiny-shakespeare.t7',
+    {arg='train_t7', type='string', default='data/train-tiny-shakespeare.t7',
      help='training data in torch binary (see scripts/preprocess.lua)'},
-    {arg='valid_file', type='string', default='data/valid-tiny-shakespeare.t7',
+    {arg='valid_t7', type='string', default='data/valid-tiny-shakespeare.t7',
      help='validation data in torch binary (see scripts/preprocess.lua)'},
-    {arg='test_file', type='string', default='data/test-tiny-shakespeare.t7',
+    {arg='test_t7', type='string', default='data/test-tiny-shakespeare.t7',
      help='test data in torch binary (see scripts/preprocess.lua)'},
     {arg='batch_size', type='number', default=8,
      help='number of sequences to run for each mini-batch'},
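With all five patches applied, the t7 code path would presumably be driven end to end like this (all paths are the defaults wired into the options above):

    th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt
    th train.lua -format t7 \
      -train_t7 data/train-tiny-shakespeare.t7 \
      -valid_t7 data/valid-tiny-shakespeare.t7 \
      -test_t7 data/test-tiny-shakespeare.t7 \
      -vocab_t7 data/vocab-tiny-shakespeare.t7

Leaving -format at its default of 'h5' keeps the original HDF5 loader untouched.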