From 48eec7b02763c4c8f73f6cc9e3a967d0a826769f Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 09:23:22 -0600
Subject: [PATCH 1/5] Add torch preprocessing script

---
 scripts/preprocess.lua | 81 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 scripts/preprocess.lua

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
new file mode 100644
index 00000000..053526f2
--- /dev/null
+++ b/scripts/preprocess.lua
@@ -0,0 +1,81 @@
+require 'pl'
+
+cmd = torch.CmdLine()
+cmd:text()
+cmd:text('Preprocess a text file for training a language model.')
+cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
+cmd:option('--output_t7', 'data/tiny-shakespeare.t7', 'Output data file in torch binary format')
+cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7', 'Output vocab in torch binary format')
+cmd:option('--val_frac', 0.1, 'Validation fraction')
+cmd:option('--test_frac', 0.1, 'Testing fraction')
+cmd:option('--quiet', false, 'Disable all verbose outputs')
+cmd:text()
+opt = cmd:parse(arg or {})
+
+
+-- First pass: collect statistics and build vocab
+char2index = {}
+char_count = 0
+vocab_count = 0
+f = io.open(opt.input_text)
+while true do
+  line = f:read()
+  if not line then break end
+  for c in line:gmatch('.') do
+    if not char2index[c] then
+      vocab_count = vocab_count + 1
+      char2index[c] = vocab_count
+    end
+    char_count = char_count + 1
+  end
+  -- new line
+  char_count = char_count + 1
+end
+f:close()
+-- XXX: hard-coded newline string
+vocab_count = vocab_count + 1
+char2index['\n'] = vocab_count
+index2char = {}
+-- create index to vocab map
+for k, v in pairs(char2index) do table.insert(index2char, k) end
+
+-- compute split size
+val_size = math.floor(opt.val_frac * char_count)
+test_size = math.floor(opt.test_frac * char_count)
+train_size = char_count - val_size - test_size
+
+-- verbose
+if not opt.quiet then
+  print('Total vocabulary size: ' .. #index2char)
+  print('Total tokens in file: ' .. char_count)
+  print(' Training size: ' .. train_size)
+  print(' Val size: ' .. val_size)
+  print(' Test size: ' .. test_size)
+end
+
+train = torch.IntTensor(train_size)
+valid = torch.IntTensor(val_size)
+test = torch.IntTensor(test_size)
+dataset = {train, valid, test}
+
+-- Second pass: read the data into the tensors
+split_idx, cur_idx = 1, 1
+f = io.open(opt.input_text)
+while true do
+  line = f:read()
+  if not line then break end
+  -- XXX: hard-coded newline
+  line = line .. '\n'
+  for c in line:gmatch('.') do
+    dataset[split_idx][cur_idx] = char2index[c]
+    cur_idx = cur_idx + 1
+    if cur_idx > dataset[split_idx]:size(1) then
+      split_idx = split_idx + 1
+      cur_idx = 1
+    end
+  end
+end
+f:close()
+-- save to file
+torch.save(opt.output_t7, dataset)
+torch.save(opt.output_vocab, char2index)
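A quick sanity check of the split arithmetic above, as a standalone Lua sketch (the 1000-character corpus is hypothetical, chosen only to make the numbers round):

    -- Split sizes under the script's default fractions (0.1 val, 0.1 test).
    local char_count = 1000
    local val_size = math.floor(0.1 * char_count)         -- 100
    local test_size = math.floor(0.1 * char_count)        -- 100
    local train_size = char_count - val_size - test_size  -- 800
    print(train_size, val_size, test_size)                -- 800  100  100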
From 2d6f1b274adb51d4409e532f550bbc7b91836b82 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 09:44:11 -0600
Subject: [PATCH 2/5] Split output data into separate files and handle zero valid frac

---
 scripts/preprocess.lua | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
index 053526f2..2f41864a 100644
--- a/scripts/preprocess.lua
+++ b/scripts/preprocess.lua
@@ -4,8 +4,15 @@ cmd = torch.CmdLine()
 cmd:text()
 cmd:text('Preprocess a text file for training a language model.')
 cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
-cmd:option('--output_t7', 'data/tiny-shakespeare.t7', 'Output data file in torch binary format')
-cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7', 'Output vocab in torch binary format')
+-- split output into multiple files
+cmd:option('--train_t7', 'data/train-tiny-shakespeare.t7',
+           'Output training data file in torch binary format')
+cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
+           'Output validation data file in torch binary format')
+cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
+           'Output test data file in torch binary format')
+cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7',
+           'Output vocab in torch binary format')
 cmd:option('--val_frac', 0.1, 'Validation fraction')
 cmd:option('--test_frac', 0.1, 'Testing fraction')
 cmd:option('--quiet', false, 'Disable all verbose outputs')
@@ -67,6 +74,10 @@ while true do
   -- XXX: hard-coded newline
   line = line .. '\n'
   for c in line:gmatch('.') do
+    -- a split may have size 0; skip it
+    while dataset[split_idx]:size():size() == 0 do
+      split_idx = split_idx + 1
+    end
     dataset[split_idx][cur_idx] = char2index[c]
     cur_idx = cur_idx + 1
     if cur_idx > dataset[split_idx]:size(1) then
@@ -77,5 +88,7 @@
 end
 f:close()
 -- save to file
-torch.save(opt.output_t7, dataset)
+torch.save(opt.train_t7, train)
+if val_size > 0 then torch.save(opt.valid_t7, valid) end
+if test_size > 0 then torch.save(opt.test_t7, test) end
 torch.save(opt.output_vocab, char2index)
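Assuming the option names above, the zero-fraction case this patch guards against would be exercised by an invocation like:

    th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt \
      --val_frac 0 --test_frac 0.1

With --val_frac 0 the valid tensor is empty, so the second pass has to skip it when filling the splits, and no valid-*.t7 file is written.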
From 1265acd57327665f5d12027e16a076c15715f567 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 10:43:09 -0600
Subject: [PATCH 3/5] Fix vocab bug

---
 scripts/preprocess.lua | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/preprocess.lua b/scripts/preprocess.lua
index 2f41864a..71a97941 100644
--- a/scripts/preprocess.lua
+++ b/scripts/preprocess.lua
@@ -1,4 +1,5 @@
 require 'pl'
+require 'torch'
 
 cmd = torch.CmdLine()
 cmd:text()
@@ -11,7 +12,7 @@ cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
            'Output validation data file in torch binary format')
 cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
            'Output test data file in torch binary format')
-cmd:option('--output_vocab', 'data/tiny-shakespeare.vocab.t7',
+cmd:option('--output_vocab', 'data/vocab-tiny-shakespeare.t7',
            'Output vocab in torch binary format')
 cmd:option('--val_frac', 0.1, 'Validation fraction')
 cmd:option('--test_frac', 0.1, 'Testing fraction')
@@ -44,7 +45,7 @@ vocab_count = vocab_count + 1
 char2index['\n'] = vocab_count
 index2char = {}
 -- create index to vocab map
-for k, v in pairs(char2index) do table.insert(index2char, k) end
+for k, v in pairs(char2index) do index2char[v] = k end
 
 -- compute split size
 val_size = math.floor(opt.val_frac * char_count)
@@ -91,4 +92,4 @@ f:close()
 torch.save(opt.train_t7, train)
 if val_size > 0 then torch.save(opt.valid_t7, valid) end
 if test_size > 0 then torch.save(opt.test_t7, test) end
-torch.save(opt.output_vocab, char2index)
+torch.save(opt.output_vocab, index2char)
\ No newline at end of file
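The bug fixed here is that pairs() iterates char2index in unspecified order, so the old table.insert(index2char, k) could pair characters with the wrong indices. A standalone sketch of the invariant the one-line fix restores (the three-entry vocab is hypothetical):

    -- index2char must exactly invert char2index.
    local char2index = {a = 2, b = 1, c = 3}
    local index2char = {}
    for k, v in pairs(char2index) do index2char[v] = k end
    for c, i in pairs(char2index) do
      assert(index2char[i] == c)  -- round-trip holds for every character
    end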
From 64b1a66785530d295561f0c24a999dcb8af27546 Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 11:53:22 -0600
Subject: [PATCH 4/5] Add data loader that reads torch binary format

---
 util/MiniBatchLoader.lua | 73 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 util/MiniBatchLoader.lua

diff --git a/util/MiniBatchLoader.lua b/util/MiniBatchLoader.lua
new file mode 100644
index 00000000..19dd365c
--- /dev/null
+++ b/util/MiniBatchLoader.lua
@@ -0,0 +1,73 @@
+require 'torch'
+require 'xlua'
+
+local MiniBatchLoader = torch.class('MiniBatchLoader')
+
+function MiniBatchLoader:__init(config)
+  config = config or {}
+  local args
+  args, self.train_file, self.valid_file, self.test_file,
+  self.batch_size, self.seq_length
+  = xlua.unpack(
+    {config},
+    'MiniBatchLoader',
+    'Load data files in torch binary format. Data will be clipped to fit mini-batches',
+    {arg='train_file', type='string', default='data/train-tiny-shakespeare.t7',
+     help='training data in torch binary (see scripts/preprocess.lua)'},
+    {arg='valid_file', type='string', default='data/valid-tiny-shakespeare.t7',
+     help='validation data in torch binary (see scripts/preprocess.lua)'},
+    {arg='test_file', type='string', default='data/test-tiny-shakespeare.t7',
+     help='test data in torch binary (see scripts/preprocess.lua)'},
+    {arg='batch_size', type='number', default=8,
+     help='number of sequences to run for each mini-batch'},
+    {arg='seq_length', type='number', default=6,
+     help='number of characters for each sequence'}
+  )
+  self.x_splits = {}
+  self.y_splits = {}
+  self.split_sizes = {}
+  local b, l = self.batch_size, self.seq_length
+  self.x_splits['train'], self.y_splits['train'] = self:loadData(self.train_file, b, l)
+  self.x_splits['val'], self.y_splits['val'] = self:loadData(self.valid_file, b, l)
+  self.x_splits['test'], self.y_splits['test'] = self:loadData(self.test_file, b, l)
+  self.split_sizes['train'] = self.x_splits['train']:size(1)
+  self.split_sizes['val'] = self.x_splits['val']:size(1)
+  self.split_sizes['test'] = self.x_splits['test']:size(1)
+  self.split_idxs = {train=1, val=1, test=1}
+  collectgarbage()
+end
+
+function MiniBatchLoader:loadData(file_path, b, l)
+  local tensor = torch.load(file_path)
+  local num = tensor:nElement()
+  local extra = num % (b * l)
+  if extra == 0 then extra = b * l end -- always leave one char for the shifted targets
+  -- Chop out the extra bits at the end to make it evenly divide
+  -- Each batch will have a continuous stream of data
+  local vx = tensor[{{1, num - extra}}]:view(b, -1, l)
+  local vy = tensor[{{2, num - extra + 1}}]:view(b, -1, l)
+  -- rearrange data so that the last two dimensions are B and L
+  -- XXX: This is not very efficient.
+  local vxx = torch.IntTensor(vx:size(2), vx:size(1), vx:size(3))
+  local vyy = torch.IntTensor(vy:size(2), vy:size(1), vy:size(3))
+  for i = 1, vyy:size(1) do
+    vyy[i] = vy[{{}, i, {}}]
+    vxx[i] = vx[{{}, i, {}}]
+  end
+  vxx = vxx:contiguous()
+  vyy = vyy:contiguous()
+  return vxx, vyy
+end
+
+function MiniBatchLoader:nextBatch(split)
+  local idx = self.split_idxs[split]
+  assert(idx, 'invalid split ' .. split)
+  local x = self.x_splits[split][idx]
+  local y = self.y_splits[split][idx]
+  if idx == self.split_sizes[split] then
+    self.split_idxs[split] = 1
+  else
+    self.split_idxs[split] = idx + 1
+  end
+  return x, y
+end
\ No newline at end of file
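To see what loadData produces, here is a minimal standalone sketch of the same clip-and-view logic (hypothetical 50-element corpus with batch_size=2 and seq_length=4; not part of the patch):

    require 'torch'

    local data = torch.range(1, 50):int()  -- stand-in for a saved index tensor
    local b, l = 2, 4
    local num = data:nElement()
    local extra = num % (b * l)            -- 50 % 8 = 2 trailing chars dropped
    local vx = data[{{1, num - extra}}]:view(b, -1, l)
    local vy = data[{{2, num - extra + 1}}]:view(b, -1, l)
    print(vx:size())                       -- 2 x 6 x 4
    -- vy is vx shifted by one element: y[t] is the prediction target for x[t]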
From dc7fa244599c7fd022544fcabd58e60eec02374d Mon Sep 17 00:00:00 2001
From: Thanapon Noraset
Date: Thu, 18 Feb 2016 12:18:03 -0600
Subject: [PATCH 5/5] Add support for torch binary file format

---
 train.lua                | 24 +++++++++++++++++++-----
 util/MiniBatchLoader.lua |  6 +++---
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/train.lua b/train.lua
index 6215443a..76e3ffeb 100644
--- a/train.lua
+++ b/train.lua
@@ -4,6 +4,7 @@ require 'optim'
 
 require 'LanguageModel'
 require 'util.DataLoader'
+require 'util.MiniBatchLoader'
 
 local utils = require 'util.utils'
 
@@ -15,6 +16,12 @@ cmd:option('-input_h5', 'data/tiny-shakespeare.h5')
 cmd:option('-input_json', 'data/tiny-shakespeare.json')
 cmd:option('-batch_size', 50)
 cmd:option('-seq_length', 50)
+-- Optional: load the dataset in torch binary (t7) format
+cmd:option('-format', 'h5')
+cmd:option('-train_t7', 'data/train-tiny-shakespeare.t7')
+cmd:option('-valid_t7', 'data/valid-tiny-shakespeare.t7')
+cmd:option('-test_t7', 'data/test-tiny-shakespeare.t7')
+cmd:option('-vocab_t7', 'data/vocab-tiny-shakespeare.t7')
 
 -- Model options
 cmd:option('-model_type', 'lstm')
@@ -44,7 +51,7 @@ cmd:option('-gpu', 0)
 cmd:option('-gpu_backend', 'cuda')
 
 local opt = cmd:parse(arg)
-
+print(opt)
 
 -- Set up GPU stuff
 local dtype = 'torch.FloatTensor'
@@ -70,13 +77,20 @@ end
 
 
 -- Initialize the DataLoader and vocabulary
-local loader = DataLoader(opt)
-local vocab = utils.read_json(opt.input_json)
+local loader, vocab
 local idx_to_token = {}
-for k, v in pairs(vocab.idx_to_token) do
-  idx_to_token[tonumber(k)] = v
+if opt.format == 't7' then
+  loader = MiniBatchLoader(opt)
+  idx_to_token = torch.load(opt.vocab_t7)
+else
+  loader = DataLoader(opt)
+  vocab = utils.read_json(opt.input_json)
+  for k, v in pairs(vocab.idx_to_token) do
+    idx_to_token[tonumber(k)] = v
+  end
 end
 
+
 -- Initialize the model and criterion
 local opt_clone = torch.deserialize(torch.serialize(opt))
 opt_clone.idx_to_token = idx_to_token

diff --git a/util/MiniBatchLoader.lua b/util/MiniBatchLoader.lua
index 19dd365c..40fb5d1b 100644
--- a/util/MiniBatchLoader.lua
+++ b/util/MiniBatchLoader.lua
@@ -12,11 +12,11 @@ function MiniBatchLoader:__init(config)
     {config},
     'MiniBatchLoader',
     'Load data files in torch binary format. Data will be clipped to fit mini-batches',
-    {arg='train_file', type='string', default='data/train-tiny-shakespeare.t7',
+    {arg='train_t7', type='string', default='data/train-tiny-shakespeare.t7',
      help='training data in torch binary (see scripts/preprocess.lua)'},
-    {arg='valid_file', type='string', default='data/valid-tiny-shakespeare.t7',
+    {arg='valid_t7', type='string', default='data/valid-tiny-shakespeare.t7',
      help='validation data in torch binary (see scripts/preprocess.lua)'},
-    {arg='test_file', type='string', default='data/test-tiny-shakespeare.t7',
+    {arg='test_t7', type='string', default='data/test-tiny-shakespeare.t7',
      help='test data in torch binary (see scripts/preprocess.lua)'},
     {arg='batch_size', type='number', default=8,
      help='number of sequences to run for each mini-batch'},
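With all five patches applied, the t7 code path would presumably be driven end to end like this (all paths are the defaults wired into the options above):

    th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt
    th train.lua -format t7 \
      -train_t7 data/train-tiny-shakespeare.t7 \
      -valid_t7 data/valid-tiny-shakespeare.t7 \
      -test_t7 data/test-tiny-shakespeare.t7 \
      -vocab_t7 data/vocab-tiny-shakespeare.t7

Leaving -format at its default of 'h5' keeps the original HDF5 loader untouched.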