From 8b7e88a38756f93a5a037c578480b32b34596933 Mon Sep 17 00:00:00 2001
From: mehrad <mehrad@stanford.edu>
Date: Mon, 6 Jul 2020 22:33:27 -0700
Subject: [PATCH] use transformers library to automate model caching
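
Replace the deprecated pytorch_pretrained_bert imports with the
transformers library so that BERT weights and vocabularies are downloaded
and cached automatically instead of being loaded from a hand-prepared
local directory. Each sumbt_config now carries a hub model name
(bert_model_name) and a cache directory (bert_model_cache_dir), and the
trackers pass both to from_pretrained(). BertAdam is replaced by AdamW
plus an explicit linear warmup schedule.

A minimal, standalone sketch of the caching pattern the trackers now use
(the model name and cache directory below mirror the defaults in
sumbt_config.py; this is illustrative, not an excerpt of the tracker
code):

    from transformers import BertModel, BertTokenizer

    # The first call downloads to cache_dir; subsequent calls reuse the
    # cached files instead of requiring a manually prepared checkpoint.
    tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased", cache_dir="pre-trained-models/")
    model = BertModel.from_pretrained(
        "bert-base-uncased", cache_dir="pre-trained-models/")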

---
 .gitignore                                    |  6 ++++
 .../sumbt/BeliefTrackerSlotQueryMultiSlot.py  | 14 ++++----
 convlab2/dst/sumbt/crosswoz_en/sumbt.py       | 23 +++++++------
 .../dst/sumbt/crosswoz_en/sumbt_config.py     |  4 +--
 convlab2/dst/sumbt/multiwoz/sumbt.py          | 22 ++++++-------
 convlab2/dst/sumbt/multiwoz/sumbt_config.py   |  3 +-
 convlab2/dst/sumbt/multiwoz_zh/sumbt.py       | 32 ++++++++-----------
 .../dst/sumbt/multiwoz_zh/sumbt_config.py     |  5 +--
 8 files changed, 56 insertions(+), 53 deletions(-)
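
Note for reviewers: BertAdam handled learning-rate warmup and gradient
clipping internally, while transformers' AdamW does not, so the training
loops now create a linear warmup schedule and clip gradients explicitly.
A self-contained sketch of that pattern (the toy model and
hyperparameters are placeholders, not ConvLab-2 code):

    import torch
    from transformers import AdamW, get_linear_schedule_with_warmup

    model = torch.nn.Linear(8, 2)          # stand-in for BeliefTracker
    grouped = [{"params": model.parameters(), "weight_decay": 0.01}]
    t_total, warmup_proportion = 100, 0.1

    optimizer = AdamW(grouped, lr=5e-5, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_proportion * t_total),
        num_training_steps=t_total)

    for _ in range(t_total):
        loss = model(torch.randn(4, 8)).sum()
        loss.backward()
        # AdamW, unlike BertAdam, does not clip gradients itself.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()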

diff --git a/.gitignore b/.gitignore
index 2edbed1..b8268b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# extras
+Pipfile*
+results*
+
 *.pyc
 __pycache__
 
@@ -13,6 +17,7 @@ __pycache__
 data/**/train.json
 data/**/val.json
 data/**/test.json
+data/**/human_val.json
 data/camrest/CamRest676_v2.json
 data/multiwoz/annotated_user_da_with_span_full.json
 data/schema/dstc8-schema-guided-dialogue-master
@@ -34,6 +39,7 @@ convlab2/nlg/sclstm/**/generated_sens_sys.json
 convlab2/nlg/template/**/generated_sens_sys.json
 convlab2/nlu/jointBERT/crosswoz/**/data
 convlab2/nlu/jointBERT/multiwoz/**/data
+
 # test script
 *_test.py
 
diff --git a/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py b/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
index f2b5358..bd3dc57 100755
--- a/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
+++ b/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
@@ -1,14 +1,12 @@
-import os.path
 import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 from torch.nn import CrossEntropyLoss
-from torch.nn import CosineEmbeddingLoss
 
-from pytorch_pretrained_bert.modeling import BertModel
-from pytorch_pretrained_bert.modeling import BertPreTrainedModel
+from transformers import BertModel
+from transformers import BertPreTrainedModel
 
 
 class BertForUtteranceEncoding(BertPreTrainedModel):
@@ -19,7 +17,7 @@ class BertForUtteranceEncoding(BertPreTrainedModel):
         self.bert = BertModel(config)
 
     def forward(self, input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False):
-        return self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers)
+        return self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # the default (last hidden layer only) matches output_all_encoded_layers=False
 
 
 class MultiHeadAttention(nn.Module):
@@ -93,7 +91,8 @@ class BeliefTracker(nn.Module):
         self.device = device
 
         ### Utterance Encoder
-        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model)
+        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
+        self.utterance_encoder.train()
         self.bert_output_dim = self.utterance_encoder.config.hidden_size
         self.hidden_dropout_prob = self.utterance_encoder.config.hidden_dropout_prob
         if args.fix_utterance_encoder:
@@ -101,7 +100,8 @@ class BeliefTracker(nn.Module):
                 p.requires_grad = False
 
         ### slot, slot-value Encoder (not trainable)
-        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model)
+        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
+        self.sv_encoder.train()
         for p in self.sv_encoder.bert.parameters():
             p.requires_grad = False
 
diff --git a/convlab2/dst/sumbt/crosswoz_en/sumbt.py b/convlab2/dst/sumbt/crosswoz_en/sumbt.py
index d211db1..f39d1f9 100644
--- a/convlab2/dst/sumbt/crosswoz_en/sumbt.py
+++ b/convlab2/dst/sumbt/crosswoz_en/sumbt.py
@@ -7,15 +7,15 @@ import zipfile
 
 from matplotlib import pyplot as plt
 
-# from tensorboardX.writer import SummaryWriter
+from tensorboardX.writer import SummaryWriter
 from tqdm._tqdm import trange, tqdm
 
 from convlab2.util.file_util import cached_path
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
 from convlab2.dst.sumbt.crosswoz_en.convert_to_glue_format import convert_to_glue_format, trans_value
@@ -114,10 +114,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -196,7 +193,7 @@ class SUMBTTracker(DST):
                 print('loading weights from trained model')
                 self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
             else:
-                raise ValueError('no availabel weights found.')
+                raise ValueError('no available weights found.')
             self.param_restored = True
 
     def construct_query(self, context):
@@ -395,10 +392,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -470,7 +465,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                    if scheduler is not None:
+                        torch.nn.utils.clip_grad_norm_([p for group in optimizer_grouped_parameters for p in group['params']], 1.0)
                     optimizer.step()
+                    if scheduler is not None:
+                        scheduler.step()
                     optimizer.zero_grad()
                     global_step += 1
 
diff --git a/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py b/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
index 93ac2b5..c51665e 100644
--- a/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
+++ b/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
@@ -23,9 +23,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
 
-
         self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-base-uncased")
-
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "bert-base-uncased"
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
diff --git a/convlab2/dst/sumbt/multiwoz/sumbt.py b/convlab2/dst/sumbt/multiwoz/sumbt.py
index 8df123a..9176e54 100755
--- a/convlab2/dst/sumbt/multiwoz/sumbt.py
+++ b/convlab2/dst/sumbt/multiwoz/sumbt.py
@@ -5,15 +5,15 @@ from itertools import chain
 import numpy as np
 import zipfile
 
-# from tensorboardX.writer import SummaryWriter
+from tensorboardX.writer import SummaryWriter
 from tqdm._tqdm import trange, tqdm
 
 from convlab2.util.file_util import cached_path
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
 from convlab2.dst.sumbt.multiwoz.convert_to_glue_format import convert_to_glue_format
@@ -94,10 +94,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -402,6 +399,7 @@ class SUMBTTracker(DST):
 
         t_total = num_train_steps
 
+        scheduler = None
         if args.fp16:
             try:
                 from apex.optimizers import FP16_Optimizer
@@ -420,10 +418,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -492,7 +488,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                    if scheduler is not None:
+                        torch.nn.utils.clip_grad_norm_([p for group in optimizer_grouped_parameters for p in group['params']], 1.0)
                     optimizer.step()
+                    if scheduler is not None:
+                        scheduler.step()
                     optimizer.zero_grad()
                     global_step += 1
 
diff --git a/convlab2/dst/sumbt/multiwoz/sumbt_config.py b/convlab2/dst/sumbt/multiwoz/sumbt_config.py
index 592cd28..5488f8f 100755
--- a/convlab2/dst/sumbt/multiwoz/sumbt_config.py
+++ b/convlab2/dst/sumbt/multiwoz/sumbt_config.py
@@ -29,8 +29,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
 
-
         self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-base-uncased")
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "bert-base-uncased"
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
diff --git a/convlab2/dst/sumbt/multiwoz_zh/sumbt.py b/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
index cd99f39..96e83f0 100644
--- a/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
+++ b/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
@@ -1,23 +1,19 @@
-import os
 import copy
-from pprint import pprint
 import random
 from itertools import chain
 import numpy as np
 import zipfile
 
-# from tensorboardX.writer import SummaryWriter
-from tqdm._tqdm import trange, tqdm
-
-from convlab2.util.file_util import cached_path
+from tensorboardX.writer import SummaryWriter
+from tqdm import trange, tqdm
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
-from convlab2.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format, trans_value
+from convlab2.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format
 from convlab2.util.multiwoz_zh.state import default_state
 
 from convlab2.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
@@ -98,10 +94,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -256,6 +249,7 @@ class SUMBTTracker(DST):
 
         t_total = num_train_steps
 
+        scheduler = None
         if args.fp16:
             try:
                 from apex.optimizers import FP16_Optimizer
@@ -274,10 +268,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -346,7 +338,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                    if scheduler is not None:
+                        torch.nn.utils.clip_grad_norm_([p for group in optimizer_grouped_parameters for p in group['params']], 1.0)
                     optimizer.step()
+                    if scheduler is not None:
+                        scheduler.step()
                     optimizer.zero_grad()
                     global_step += 1
 
@@ -567,7 +563,7 @@ class SUMBTTracker(DST):
                 print('loading weights from trained model')
                 self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
             else:
-            raise ValueError('no available weights found.')
+                raise ValueError('no available weights found.')
             self.param_restored = True
 
     def update(self, user_act=None):
diff --git a/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py b/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
index b8943c1..1f4952b 100644
--- a/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
+++ b/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
@@ -27,8 +27,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
 
-
-        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-chinese-wwm-ext")
+        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/chinese-bert-wwm-ext")
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "hfl/chinese-bert-wwm-ext"
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
-- 
GitLab