From 8b7e88a38756f93a5a037c578480b32b34596933 Mon Sep 17 00:00:00 2001
From: mehrad <mehrad@stanford.edu>
Date: Mon, 6 Jul 2020 22:33:27 -0700
Subject: [PATCH] use transformers library to automate model caching

---
 .gitignore                                   |  6 ++++
 .../sumbt/BeliefTrackerSlotQueryMultiSlot.py | 14 ++++----
 convlab2/dst/sumbt/crosswoz_en/sumbt.py      | 23 +++++++------
 .../dst/sumbt/crosswoz_en/sumbt_config.py    |  4 +--
 convlab2/dst/sumbt/multiwoz/sumbt.py         | 22 ++++++-------
 convlab2/dst/sumbt/multiwoz/sumbt_config.py  |  3 +-
 convlab2/dst/sumbt/multiwoz_zh/sumbt.py      | 32 ++++++++-----------
 .../dst/sumbt/multiwoz_zh/sumbt_config.py    |  5 +--
 8 files changed, 56 insertions(+), 53 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2edbed1..b8268b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# extras
+Pipfile*
+results*
+
 *.pyc
 __pycache__
 
@@ -13,6 +17,7 @@ __pycache__
 data/**/train.json
 data/**/val.json
 data/**/test.json
+data/**/human_val.json
 data/camrest/CamRest676_v2.json
 data/multiwoz/annotated_user_da_with_span_full.json
 data/schema/dstc8-schema-guided-dialogue-master
@@ -34,6 +39,7 @@ convlab2/nlg/sclstm/**/generated_sens_sys.json
 convlab2/nlg/template/**/generated_sens_sys.json
 convlab2/nlu/jointBERT/crosswoz/**/data
 convlab2/nlu/jointBERT/multiwoz/**/data
+
 # test script
 *_test.py
 
diff --git a/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py b/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
index f2b5358..bd3dc57 100755
--- a/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
+++ b/convlab2/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
@@ -1,14 +1,12 @@
-import os.path
 import math
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
-from torch.nn import CosineEmbeddingLoss
 
-from pytorch_pretrained_bert.modeling import BertModel
-from pytorch_pretrained_bert.modeling import BertPreTrainedModel
+from transformers import BertModel
+from transformers import BertPreTrainedModel
 
 
 class BertForUtteranceEncoding(BertPreTrainedModel):
@@ -19,7 +17,7 @@ class BertForUtteranceEncoding(BertPreTrainedModel):
         self.bert = BertModel(config)
 
     def forward(self, input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False):
-        return self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers)
+        return self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, encoder_hidden_states=output_all_encoded_layers)
 
 
 class MultiHeadAttention(nn.Module):
@@ -93,7 +91,8 @@ class BeliefTracker(nn.Module):
         self.device = device
 
         ### Utterance Encoder
-        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model)
+        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
+        self.utterance_encoder.train()
         self.bert_output_dim = self.utterance_encoder.config.hidden_size
         self.hidden_dropout_prob = self.utterance_encoder.config.hidden_dropout_prob
         if args.fix_utterance_encoder:
@@ -101,7 +100,8 @@ class BeliefTracker(nn.Module):
                 p.requires_grad = False
 
         ### slot, slot-value Encoder (not trainable)
-        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model)
+        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
+        self.sv_encoder.train()
         for p in self.sv_encoder.bert.parameters():
             p.requires_grad = False
 
diff --git a/convlab2/dst/sumbt/crosswoz_en/sumbt.py b/convlab2/dst/sumbt/crosswoz_en/sumbt.py
index d211db1..f39d1f9 100644
--- a/convlab2/dst/sumbt/crosswoz_en/sumbt.py
+++ b/convlab2/dst/sumbt/crosswoz_en/sumbt.py
@@ -7,15 +7,15 @@ import zipfile
 
 from matplotlib import pyplot as plt
 
-# from tensorboardX.writer import SummaryWriter
+from tensorboardX.writer import SummaryWriter
 from tqdm._tqdm import trange, tqdm
 
 from convlab2.util.file_util import cached_path
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
 from convlab2.dst.sumbt.crosswoz_en.convert_to_glue_format import convert_to_glue_format, trans_value
@@ -114,10 +114,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -196,7 +193,7 @@ class SUMBTTracker(DST):
                 print('loading weights from trained model')
                 self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
             else:
-                raise ValueError('no availabel weights found.')
+                raise ValueError('no available weights found.')
             self.param_restored = True
 
     def construct_query(self, context):
@@ -395,10 +392,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -470,7 +465,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                if scheduler is not None:
+                    torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                 optimizer.step()
+                if scheduler is not None:
+                    scheduler.step()
                 optimizer.zero_grad()
                 global_step += 1
 
diff --git a/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py b/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
index 93ac2b5..c51665e 100644
--- a/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
+++ b/convlab2/dst/sumbt/crosswoz_en/sumbt_config.py
@@ -23,9 +23,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
 
-        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-base-uncased")
-
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "bert-base-uncased"
 
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
diff --git a/convlab2/dst/sumbt/multiwoz/sumbt.py b/convlab2/dst/sumbt/multiwoz/sumbt.py
index 8df123a..9176e54 100755
--- a/convlab2/dst/sumbt/multiwoz/sumbt.py
+++ b/convlab2/dst/sumbt/multiwoz/sumbt.py
@@ -5,15 +5,15 @@ from itertools import chain
 import numpy as np
 import zipfile
 
-# from tensorboardX.writer import SummaryWriter
+from tensorboardX.writer import SummaryWriter
 from tqdm._tqdm import trange, tqdm
 
 from convlab2.util.file_util import cached_path
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
 from convlab2.dst.sumbt.multiwoz.convert_to_glue_format import convert_to_glue_format
@@ -94,10 +94,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -402,6 +399,7 @@ class SUMBTTracker(DST):
 
         t_total = num_train_steps
 
+        scheduler = None
         if args.fp16:
             try:
                 from apex.optimizers import FP16_Optimizer
@@ -420,10 +418,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -492,7 +488,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                if scheduler is not None:
+                    torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                 optimizer.step()
+                if scheduler is not None:
+                    scheduler.step()
                 optimizer.zero_grad()
                 global_step += 1
 
diff --git a/convlab2/dst/sumbt/multiwoz/sumbt_config.py b/convlab2/dst/sumbt/multiwoz/sumbt_config.py
index 592cd28..5488f8f 100755
--- a/convlab2/dst/sumbt/multiwoz/sumbt_config.py
+++ b/convlab2/dst/sumbt/multiwoz/sumbt_config.py
@@ -29,8 +29,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
 
-        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-base-uncased")
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "bert-base-uncased"
 
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
diff --git a/convlab2/dst/sumbt/multiwoz_zh/sumbt.py b/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
index cd99f39..96e83f0 100644
--- a/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
+++ b/convlab2/dst/sumbt/multiwoz_zh/sumbt.py
@@ -1,23 +1,19 @@
-import os
 import copy
-from pprint import pprint
 import random
 from itertools import chain
 
 import numpy as np
 import zipfile
 
-# from tensorboardX.writer import SummaryWriter
-from tqdm._tqdm import trange, tqdm
-
-from convlab2.util.file_util import cached_path
+from tensorboardX.writer import SummaryWriter
+from tqdm import trange, tqdm
 
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam
+from transformers import BertTokenizer
+from transformers import get_linear_schedule_with_warmup, AdamW
 
 from convlab2.dst.dst import DST
-from convlab2.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format, trans_value
+from convlab2.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format
 from convlab2.util.multiwoz_zh.state import default_state
 from convlab2.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
@@ -98,10 +94,7 @@ class SUMBTTracker(DST):
         num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
 
         # tokenizer
-        # vocab_dir = os.path.join(data_dir, 'model', '%s-vocab.txt' % args.bert_model)
-        # if not os.path.exists(vocab_dir):
-        #     raise ValueError("Can't find %s " % vocab_dir)
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
         random.seed(args.seed)
         np.random.seed(args.seed)
         torch.manual_seed(args.seed)
@@ -256,6 +249,7 @@ class SUMBTTracker(DST):
 
         t_total = num_train_steps
 
+        scheduler = None
         if args.fp16:
             try:
                 from apex.optimizers import FP16_Optimizer
@@ -274,10 +268,8 @@ class SUMBTTracker(DST):
                 optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
 
         else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=t_total)
+            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
         logger.info(optimizer)
 
         # Training code
@@ -346,7 +338,11 @@ class SUMBTTracker(DST):
                         summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
+                if scheduler is not None:
+                    torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                 optimizer.step()
+                if scheduler is not None:
+                    scheduler.step()
                 optimizer.zero_grad()
                 global_step += 1
 
@@ -567,7 +563,7 @@ class SUMBTTracker(DST):
                 print('loading weights from trained model')
                 self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
             else:
-                raise ValueError('no available weights found.')
+                raise ValueError('no available weights found.')
             self.param_restored = True
 
     def update(self, user_act=None):
diff --git a/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py b/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
index b8943c1..1f4952b 100644
--- a/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
+++ b/convlab2/dst/sumbt/multiwoz_zh/sumbt_config.py
@@ -27,8 +27,9 @@ class DotMap():
         self.do_eval = True
         self.num_train_epochs = 300
-
-        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/bert-chinese-wwm-ext")
+        self.bert_model = os.path.join(convlab2.get_root_path(), "pre-trained-models/chinese-bert-wwm-ext")
+        self.bert_model_cache_dir = os.path.join(convlab2.get_root_path(), "pre-trained-models/")
+        self.bert_model_name = "hfl/chinese-bert-wwm-ext"
 
         self.do_lower_case = True
         self.task_name = 'bert-gru-sumbt'
         self.nbt = 'rnn'
--
GitLab
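
For reference, below is a minimal, self-contained sketch (not part of the patch itself) of the pattern the change adopts: transformers' from_pretrained() downloads model and tokenizer files once and caches them under cache_dir, and BertAdam is replaced by AdamW plus an explicit linear warmup schedule with caller-side gradient clipping. It assumes the transformers 3.x-era API the patch targets; the model name, cache directory, learning rate, and step counts are illustrative placeholders standing in for args.bert_model_name, args.bert_model_cache_dir, args.learning_rate, args.warmup_proportion, and num_train_steps.

# Sketch only: placeholder values mirror the patch's config fields.
import torch
from transformers import AdamW, BertModel, BertTokenizer, get_linear_schedule_with_warmup

MODEL_NAME = "bert-base-uncased"   # stands in for args.bert_model_name
CACHE_DIR = "pre-trained-models/"  # stands in for args.bert_model_cache_dir

# The first call downloads from the Hugging Face hub into CACHE_DIR;
# later calls reuse the cached files, so no manual model download step is needed.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
model = BertModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

t_total = 1000            # placeholder for num_train_steps
warmup_proportion = 0.1   # placeholder for args.warmup_proportion
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_proportion * t_total),
    num_training_steps=t_total,
)

# One illustrative training step: backward, clip, optimizer step, then scheduler step.
inputs = tokenizer("find me a cheap hotel in the north", return_tensors="pt")
hidden = model(**inputs)[0]        # last hidden states, shape (1, seq_len, 768)
loss = hidden.mean()               # dummy loss, only to drive a backward pass
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()

Unlike BertAdam, AdamW does not clip gradients or handle warmup internally, which is why the patch adds the clip_grad_norm_ call before optimizer.step() and calls scheduler.step() after it.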