import os, sys
import time
import math
import glob
import copy
print("\nUsing torch-0.2.0_3-cp36")
# Problem when using 0.3
''' Ignore Warnings '''
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.optim as O
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext import data
from torchtext import datasets

import torch.optim as optim

import nltk
import re
from custom_snli_loader import CustomSNLI
from enc_dec import EncDec
from vnmt import VRAE_VNMT

import matplotlib
# matplotlib.use('qt5agg')
# matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pickle
from utils import get_args, makedirs, tokenize, load_dataset
from gte import create_example, reverse_input, show_plot, plot_losses

##################################
# Load the entailment only snli
##################################
SOS_TOKEN = 2
EOS_TOKEN = 1
batch_size = 16  # 256 # 128
max_seq_len = 110  # 52 # 35
vocab_size = 60000
word_vectors = 'glove.6B.300d'
vector_cache = os.path.join(os.getcwd(), '.vector_cache/input_vectors.pt')
opt = get_args()

print("Batch Size : " + str(batch_size))
inputs, train_iter, val_iter, test_iter = load_dataset(batch_size, max_seq_len, vocab_size, word_vectors, vector_cache)
print("Dataset Loaded")
config = opt
d_embed = 300
n_hid = 256  # 512 # because we'll concat two hidden tensors later
n_layers = 1  # IMPORTANT
dropout = 0.2  # todo: changed from 0.5
model_name = 'vnmt'
# rnn_type = 'LSTM'
rnn_type = 'GRU'
dec_type = 'attn'
# dec_type = 'vanilla'
config.n_embed = len(inputs.vocab)
ntokens = len(inputs.vocab)
gpu = 0
torch.cuda.set_device(gpu)
# torch.cuda.set_device(1)
finetune = False

##################################
# Load model
##################################
print("Loading model now")
model = VRAE_VNMT(rnn_type, d_embed, n_hid, config.n_embed, max_seq_len, n_layers=n_layers, dropout=dropout)  # , word_dropout=0.5)
# Initialize all embedding tables with the pretrained word vectors and freeze them
model.encoder_prior.embeddings.weight.data = inputs.vocab.vectors
model.encoder_post.embeddings.weight.data = inputs.vocab.vectors
model.decoder.embeddings.weight.data = inputs.vocab.vectors
model.encoder_prior.embeddings.weight.requires_grad = False
model.encoder_post.embeddings.weight.requires_grad = False
model.decoder.embeddings.weight.requires_grad = False
print("model loaded")

if finetune:
    # Initialize enc/dec's weights with the pretrained model
    loaded_model = torch.load('vnmt_pretrain_gru_gte_best.pkl', map_location=lambda storage, location: storage.cuda(gpu))
    print(loaded_model.encoder.hidden_dim)

    # loaded_model = torch.load('vnmt_pretrain_3-gru_12162017/vnmt_pretrain_gru_gte_best.pkl', map_location=lambda storage, location: storage.cuda(gpu))
    # model.encoder_prior = loaded_model.encoder
    model.decoder = loaded_model.decoder
    model.encoder_prior = copy.deepcopy(loaded_model.encoder)
    model.encoder_post = copy.deepcopy(loaded_model.encoder)
    print(model.encoder_prior.hidden_dim)  # 512
    print(model.encoder_post.hidden_dim)  # 512
    # loaded_reverse = torch.load('vnmt_pretrain_reverse_1-gru1e-3_12082017/vnmt_pretrain_reverse_gru_gte_e10.pkl', map_location=lambda storage, location: storage.cuda(gpu))
    # model.encoder_post = loaded_reverse.encoder

    model.encoder_prior.cuda()
    model.encoder_post.cuda()
    model.decoder.cuda()
    # Re-freeze the embeddings after swapping in the pretrained modules
    model.encoder_prior.embeddings.weight.requires_grad = False
    model.encoder_post.embeddings.weight.requires_grad = False
    model.decoder.embeddings.weight.requires_grad = False


# setup optimizer
lr = 1e-4  # 5e-5
epochs = 50
clip = 5.0
log_interval = 50
save_interval = 1
# Only optimize parameters that still require gradients (the embeddings above are frozen)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(model_parameters, lr=lr, betas=(0.9, 0.999))
model.cuda()
print("Model loaded into cuda")

def evaluate(val_iter, model, n_tokens, eval_batch_size, kld_weight=1.0, wv=None):
    """
    Evaluate the model on val_iter and return the average loss (NLL + weighted KLD).
    """

    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.encoder_prior.eval()
    model.encoder_post.eval()
    model.decoder.eval()
    total_loss = 0
    loss = 0

    for batch_idx, batch in enumerate(val_iter):
        batch.premise.data = batch.premise.data.transpose(1, 0)
        batch.hypothesis.data = batch.hypothesis.data.transpose(1, 0)
        _loss, _kld = model.batchNLLLoss(batch.premise, batch.hypothesis, train=False)
        loss += _loss + kld_weight * _kld  # full kld when evaluating?

    return loss / float(len(val_iter))


def kld_coef(i, batch_size):
    # return (math.tanh((i - 17500)/1000) + 1)/2  # 700 minibatches * 25 epochs = 17500
    return (math.tanh((i - int(3500 / (batch_size / float(32)))) / 1000) + 1) / 2  # bs: 256 vs 32. 256/32=8. 3500/8 = 437.5

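# Rough sketch (added for illustration) of how the annealing schedule above behaves with the
# batch_size = 16 used in this script: the offset is int(3500 / (16/32)) = 7000, so
# kld_coef(0, 16) is ~1e-6, kld_coef(7000, 16) = 0.5, and kld_coef(10000, 16) is ~0.998,
# i.e. the KLD term is phased in smoothly around iteration 7000.
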
def plot_vae_loss(nlls, klds, kld_weights, filename):
    # Plot the NLL term, the unweighted KLD term and the KLD weight per iteration.
    plt.clf()
    plt.figure()
    fig, ax = plt.subplots()
    plt.plot(nlls, label='nll')
    plt.plot(klds, label='kld')
    plt.plot(kld_weights, label='kld weight')
    ax.legend()
    plt.xlabel('Number of iterations')
    plt.ylabel('Loss')
    plt.savefig(filename)

''' For testing purposes of the model '''

def test():
    loaded_model = torch.load('./vnmt_gru_gte_best.pkl',
                              map_location=lambda storage, location: storage.cuda(gpu))
    # print(loaded_model.encoder.hidden_dim)
    model.decoder = loaded_model.decoder
    model.encoder_prior = copy.deepcopy(loaded_model.encoder_prior)
    model.encoder_post = copy.deepcopy(loaded_model.encoder_post)
    ntokens = len(inputs.vocab)
    best_val_loss = float('inf')

    sents = [
        'People are celebrating a victory on the square.',
        'Two women who just had lunch hugging and saying goodbye.',
    ]
    example0 = create_example(inputs, sents[0], max_seq_len)
    example1 = create_example(inputs, sents[1], max_seq_len)
    print(model.generate(inputs, ntokens, example0, max_seq_len))
    print(model.generate(inputs, ntokens, example1, max_seq_len))


def train(pretrain=False, kld_annealing=True):
    DEBUG = False
    print('gte_vae.train')
    print('lr=%f' % lr)

    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    total_acc = 0
    # for plotting
    train_losses = []
    val_losses = []
    kld_values = []  # unweighted values
    kld_weights = []
    nlls = []

    ntokens = len(inputs.vocab)
    best_val_loss = float('inf')

    sents = [
        'People are celebrating a victory on the square.',
        'Two women who just had lunch hugging and saying goodbye.',
    ]

    iteration = 0
    if kld_annealing:
        kld_weight = kld_coef(iteration, batch_size)
    else:
        kld_weight = 1.0
    val_loss = evaluate(val_iter, model, ntokens, opt.batch_size, kld_weight=kld_weight)
    val_loss = val_loss.data[0]

    print('kld_annealing:')
    print(kld_annealing)
    print('Evaluating...')
    print(val_loss)
    example0 = create_example(inputs, sents[0], max_seq_len)
    example1 = create_example(inputs, sents[1], max_seq_len)
    print(model.generate(inputs, ntokens, example0, max_seq_len))
    print(model.generate(inputs, ntokens, example1, max_seq_len))

    start_time = time.time()

    # plot / dump check before proceeding with training
    kld_stats = {'nll': nlls, 'kld_values': kld_values, 'kld_weights': kld_weights}
    with open('kld_stats.pkl', 'wb') as f:
        pickle.dump(kld_stats, f, protocol=pickle.HIGHEST_PROTOCOL)
    plot_losses([0, 1, 2, 3, 4], 'train', 'train_loss.eps')

    for epoch in range(epochs):
        train_iter.init_epoch()
        n_correct, n_total = 0, 0
        total_loss = 0
        train_loss = 0

        for batch_idx, batch in enumerate(train_iter):
            # Turn on training mode which enables dropout (evaluate() below switches it off).
            model.train()
            model.encoder_prior.train()
            model.encoder_post.train()
            model.decoder.train()
            optimizer.zero_grad()

            # print(batch.text.data.shape)  # 35 x 64
            # batch.text.data = batch.text.data.view(-1, max_seq_len)  # -1 instead of opt.batch_size to avoid reshaping err at the end of the epoch
            batch.premise.data = batch.premise.data.transpose(1, 0)  # should be 64x35 [batch_size x seq_len]
            batch.hypothesis.data = batch.hypothesis.data.transpose(1, 0)  # should be 64x35 [batch_size x seq_len]
            # nll, kld = model.batchNLLLoss(batch.premise, batch.hypothesis)
            nll, kld = model.batchNLLLoss(batch.premise, batch.hypothesis, train=True)

            # KLD Cost Annealing
            # ref: https://arxiv.org/pdf/1511.06349.pdf
            iteration += 1
            if kld_annealing:
                kld_weight = kld_coef(iteration, batch_size)
            else:
                kld_weight = 1.0
            loss = nll + kld_weight * kld

            nlls.append(nll.data)
            kld_values.append(kld.data)
            kld_weights.append(kld_weight)

            loss.backward()
            torch.nn.utils.clip_grad_norm(model.encoder_prior.parameters(), clip)
            torch.nn.utils.clip_grad_norm(model.encoder_post.parameters(), clip)
            torch.nn.utils.clip_grad_norm(model.decoder.parameters(), clip)
            # torch.nn.utils.clip_grad_norm(model.parameters(), clip)
            optimizer.step()

            batch_loss = loss.data
            total_loss += batch_loss
            train_loss += batch_loss

            if batch_idx % log_interval == 0 and batch_idx > 0:
                print('iteration: %d' % iteration)
                print('kld_weight: %.16f' % kld_weight)
                print('nll: %.16f' % nll.data[0])
                print('kld_value: %.16f' % kld.data[0])
                cur_loss = total_loss[0] / log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch_idx, len(train_iter), lr,
                          elapsed * 1000 / log_interval, cur_loss, 0))  # math.exp(cur_loss)
                total_loss = 0
                start_time = time.time()

                print('Evaluating...')
                val_loss = evaluate(val_iter, model, ntokens, opt.batch_size, kld_weight=kld_weight)
                print(val_loss.data[0])
                print(model.generate(inputs, ntokens, example0, max_seq_len))
                print(model.generate(inputs, ntokens, example1, max_seq_len))

        print(nlls[-1])
        print(kld_values[-1])
        print(kld_weights[-1])
        print('Epoch train loss:')
        print(train_loss[0])
        train_loss = train_loss / float(len(train_iter))
        print(train_loss[0])
        train_losses.append(train_loss[0])

        val_loss = evaluate(val_iter, model, ntokens, opt.batch_size, kld_weight=kld_weight)
        val_loss = val_loss.data[0]
        val_losses.append(val_loss)
        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            with open('%s_%s_gte_best.pkl' % (model_name, rnn_type.lower()), 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            # lr /= 4.0
            # print('lr annealed: %f' % lr)
            pass
        if epoch % save_interval == 0:
            with open('%s_%s_gte_e%d.pkl' % (model_name, rnn_type.lower(), epoch), 'wb') as f:
                torch.save(model, f)

        # save train/val loss lists
        with open('train_losses.pkl', 'wb') as f:
            pickle.dump(train_losses, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open('val_losses.pkl', 'wb') as f:
            pickle.dump(val_losses, f, protocol=pickle.HIGHEST_PROTOCOL)
        kld_stats = {'nll': nlls, 'kld_values': kld_values, 'kld_weights': kld_weights}
        with open('kld_stats.pkl', 'wb') as f:
            pickle.dump(kld_stats, f, protocol=pickle.HIGHEST_PROTOCOL)

        plot_losses(train_losses, 'train', 'train_loss.eps')
        plot_losses(val_losses, 'validation', 'val_loss.eps')
        show_plot(train_losses, val_losses, 'train-val_loss.eps')

    print(train_losses)
    print(val_losses)

    # save train/val loss lists
    with open('train_losses.pickle', 'wb') as f:
        pickle.dump(train_losses, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open('val_losses.pickle', 'wb') as f:
        pickle.dump(val_losses, f, protocol=pickle.HIGHEST_PROTOCOL)
    show_plot(train_losses, val_losses, 'train-val_loss.eps')

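# Usage sketch: train() below runs with KLD annealing disabled; setting finetune = True near
# the top initializes the encoders/decoder from the pretrained checkpoint before training,
# and test() can be called instead of train() to only generate from a saved model.
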
if __name__ == "__main__":
    print('Training VRAE...')
    train(kld_annealing=False)
    # train(kld_annealing=True)
    # train()