Unverified commit a74d1579 authored by zhuqi, committed by GitHub

Merge pull request #53 from ConvLab/pre-training

Pre training
parents ee8991e9 4b333b1f
Showing 1146 additions and 588 deletions
import os
import json
from tqdm import tqdm
import re
from convlab2.util import load_dataset
def create_lm_data(dataset, data_dir, args):
data_by_split = dataset
os.makedirs(data_dir, exist_ok=True)
data_splits = data_by_split.keys()
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
if args.model_type == 'dialogpt':
dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
else:
dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for language model training")
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ...')
args = parser.parse_args()
print(args)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, dataset_name, args.model_type)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
model_type=dialogpt
dataset_name=multiwoz21
model_name=dialogpt-large
data_dir="data/lm/${dataset_name}/${model_type}"
word_loss_file="${data_dir}/${model_name}_${dataset_name}_word_loss.json"
keywords_num=5
keywords_ratio=1
keywords_th=0
stopwords=True
output_file="${data_dir}/${dataset_name}_keywords_${model_name}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th}_stopwords_${stopwords}.json"
python lmloss2keywords.py \
--model_type ${model_type} \
--word_loss_file ${word_loss_file} \
--keywords_num ${keywords_num} \
--keywords_ratio ${keywords_ratio} \
--keywords_th ${keywords_th} \
--stopwords ${stopwords} \
--output_file ${output_file}
\ No newline at end of file
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
validation_file="${data_dir}/validation.json"
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_eval_batch_size=4
dump_eval_loss_to="${data_dir}/dialogpt-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
dump_eval_loss_to="${data_dir}/dialogpt-large-mwoz_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${output_dir} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
validation_file="${data_dir}/validation.json"
model_name_or_path="gpt2-large"
dump_eval_loss_to="${data_dir}/gpt2-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type gpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
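As a rough sketch, assuming run_clm.py's --dump_eval_loss_to writes one JSON object per line with parallel token and loss lists (the format lmloss2keywords.py below consumes), a single record in the token loss file would look like this; tokens and losses are invented:

import json

# 'tokens' are GPT-2 BPE pieces, 'losses' the per-token LM losses.
record = {
    "tokens": ["Ġuser", ":", "ĠI", "Ġneed", "Ġa", "Ġche", "ap", "Ġhotel"],
    "losses": [0.0, 0.0, 1.2, 2.5, 0.3, 4.1, 3.9, 2.8],
}
print(json.dumps(record, ensure_ascii=False))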
import json
import json_lines
from pprint import pprint
import os
from tqdm import tqdm
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def merge_tokens(tokens, losses, loss_merge_func=np.mean):
res = []
i = 0
while i < len(tokens):
token = tokens[i]
loss = losses[i]
if token in ['Ġ', 'Ċ']:
if token == 'Ċ' and i < len(tokens) - 1:
tokens[i+1] = 'Ġ'+tokens[i+1]
i += 1
continue
if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':':
if i > 0:
tokens[i+1] = '<|endoftext|>'
i += 1
else:
i += 2
continue
if token.startswith('Ġ'):
# Ġ means space
token = token.replace("Ġ", "")
res.append([token, loss])
elif token == '<|endoftext|>':
res.append([token, loss])
else:
assert 'Ġ' not in token
if len(res) > 0:
res[-1][0] += token
res[-1].append(loss)
else:
res.append([token, loss])
i += 1
if loss_merge_func:
for i in range(len(res)):
res[i] = [res[i][0], loss_merge_func(res[i][1:])]
return res
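A small hand-made example of how merge_tokens recombines GPT-2 BPE pieces ("Ġ" marks a leading space, "Ċ" a newline) and averages their losses; the numbers are invented:

tokens = ['Ġuser', ':', 'ĠI', 'Ġneed', 'Ġa', 'Ġche', 'ap', 'Ġhotel', 'Ċ']
losses = [0.0, 0.0, 1.2, 2.5, 0.3, 4.1, 3.9, 2.8, 0.0]
# The leading 'Ġuser' + ':' speaker prefix is dropped; 'Ġche' + 'ap' is merged back
# into 'cheap' with the mean of its two token losses; the trailing newline is skipped.
print(merge_tokens(tokens, losses))
# -> roughly [['I', 1.2], ['need', 2.5], ['a', 0.3], ['cheap', 4.0], ['hotel', 2.8]]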
def convert_token_loss2word_loss(token_loss_file, loss_merge_func=np.mean):
word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word'))
fin = open(token_loss_file, 'rb')
fout = open(word_loss_file, 'w', encoding='utf-8')
lines = []
for item in tqdm(json_lines.reader(fin)):
tokens, losses = item['tokens'], item['losses']
assert len(tokens) == len(losses)
word2losses = merge_tokens(tokens, losses, loss_merge_func)
lines.append({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]})
fout.write(json.dumps(lines[-1], ensure_ascii=False)+'\n')
fin.close()
fout.close()
return lines
def main(args):
if not args.word_loss_file:
word_loss_list = convert_token_loss2word_loss(args.token_loss_file)
else:
fin = open(args.word_loss_file, 'rb')
word_loss_list = []
for item in json_lines.reader(fin):
words, losses = item['words'], item['losses']
word_loss_list.append({"words": words, "losses": losses})
fin.close()
if not args.output_file:
return
stop_words = set(stopwords.words('english'))
dialogs = []
for item in word_loss_list:
words = item['words']
losses = item['losses']
turns = []
turn = {'words': [], 'losses': []}
for word, loss in zip(words, losses):
if word == '<|endoftext|>':
# switch turn
turn['utterance'] = ' '.join(turn['words'])
turn['keywords'] = list(zip(turn['words'], turn['losses']))
if args.stopwords:
turn['keywords'] = [x for x in turn['keywords'] if not any([w.lower() in stop_words for w in word_tokenize(x[0])])]
turn['keywords'] = sorted(turn['keywords'], key=lambda x: x[1], reverse=True)
turn['keywords'] = [x for x in turn['keywords'] if x[1] > args.keywords_th][:min(round(args.keywords_ratio*len(turn['keywords'])), args.keywords_num)]
turn.pop('words')
turn.pop('losses')
turns.append(turn)
turn = {'words': [], 'losses': []}
else:
turn['words'].append(word)
turn['losses'].append(loss)
dialogs.append(turns)
json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="extract keywords according to lm loss")
parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt')
parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]')
parser.add_argument('--word_loss_file', '-w', type=str, help='path to the word loss file that contains two columns: [words, losses]')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
parser.add_argument('--keywords_num', '-n', type=int, default=100, help='maximum number of words in an utterance that serve as keywords')
parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='maximum ratio of words in an utterance that serve as keywords')
parser.add_argument('--keywords_th', '-th', type=float, default=0., help='loss threshold for the keywords')
parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
args = parser.parse_args()
print(args)
main(args)
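Based on main() above, the --output_file holds one list of turns per dialogue; each turn keeps its utterance plus the selected (word, loss) pairs after stop-word filtering, the loss threshold, and the top-k/ratio cut. A hand-made excerpt with invented values:

example_output = [
    [   # one dialogue
        {
            "utterance": "I need a cheap hotel .",
            # top keywords_num words above keywords_th, sorted by loss, stop words removed
            "keywords": [["cheap", 4.0], ["hotel", 2.8]],
        },
    ],
]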
import json
def main(args):
filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files}
first_filename = args.keywords_files[0].split('/')[-1]
dialogs = []
for i in range(len(filename2data[first_filename])):
turns = []
for j in range(len(filename2data[first_filename][i])):
utt = filename2data[first_filename][i][j]['utterance']
keywords = {filename.split('_')[2]+'_nonstopword'+filename.split('_')[-1]: ' | '.join([x[0] for x in filename2data[filename][i][j]['keywords']]) for filename in filename2data}
turns.append({
"utterance": utt,
**keywords
})
dialogs.append(turns)
json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
args = parser.parse_args()
print(args)
main(args)
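Assuming keyword files named as in the scripts above (e.g. multiwoz21_keywords_dialogpt-large_topk_5_..._stopwords_True.json), the merged output pairs each utterance with one ' | '-joined keyword string per input file, keyed by filename.split('_')[2] + '_nonstopword' + filename.split('_')[-1]. An invented record:

merged_turn = {
    "utterance": "I need a cheap hotel .",
    "dialogpt-large_nonstopwordTrue.json": "cheap | hotel",
    "gpt2-large_nonstopwordTrue.json": "need | cheap",
}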
set -e
n_gpus=1
task_name="nlu"
dataset_name="tm1"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_eval_batch_size=64
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
@@ -39,30 +39,9 @@ python -m torch.distributed.launch \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
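For the DialoGPT fine-tuning configuration above, the effective batch size works out as follows, assuming a single GPU:

# per_device_train_batch_size * gradient_accumulation_steps * n_gpus
effective_batch_size = 16 * 4 * 1
print(effective_batch_size)  # 64 sequences of up to max_length=512 tokens per optimizer step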
@@ -2,6 +2,7 @@ import os
import json
from tqdm import tqdm
import re
from transformers import AutoTokenizer
from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
@@ -15,13 +16,16 @@ def create_rg_data(dataset, data_dir, args):
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']])
response = f"{sample['speaker']}: {sample['utterance']}"
data.append(json.dumps({'context': context, 'response': response}, ensure_ascii=False)+'\n')
if len(sample['context']) == 0:
continue
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
data.append(json.dumps({'context': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_nlu_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -34,7 +38,7 @@ def create_nlu_data(dataset, data_dir, args):
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
response = f"{sample['speaker']}: {sample['utterance']}"
if args.context_window_size>0:
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
else:
context = response
dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
@@ -44,6 +48,8 @@ def create_nlu_data(dataset, data_dir, args):
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_dst_data(dataset, data_dir, args):
data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -56,7 +62,7 @@ def create_dst_data(dataset, data_dir, args):
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
response = f"{sample['speaker']}: {sample['utterance']}"
if args.context_window_size>0:
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
else:
context = response
state_seq = serialize_dialogue_state(sample['state'])
@@ -66,6 +72,32 @@ def create_dst_data(dataset, data_dir, args):
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_nlg_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}')
os.makedirs(data_dir, exist_ok=True)
data_splits = data_by_split.keys()
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
if args.context_window_size>0:
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
context = f'{dialogue_acts_seq}\n{context}'
else:
context = f'{dialogue_acts_seq}\n{sample["speaker"]}: '
assert equal_da_seq(sample['dialogue_acts'], dialogue_acts_seq), print(sample['dialogue_acts'], dialogue_acts_seq, deserialize_dialogue_acts(dialogue_acts_seq))
data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
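A hand-made sketch of one NLG training record produced by create_nlg_data above (dialogue acts serialized in the new format, context joined with newlines, response held separately); the content is invented:

import json

# Hypothetical system-side sample with context_window_size > 0.
nlg_record = {
    'context+da': '[inform][hotel]([price range][cheap])\n'
                  'user: I need a cheap hotel .\n'
                  'system: ',
    'response': 'Okay , I found several cheap hotels for you .',
}
print(json.dumps(nlg_record, ensure_ascii=False))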
def create_goal2dialogue_data(dataset, data_dir, args):
data_by_split = dataset
@@ -76,25 +108,44 @@ def create_goal2dialogue_data(dataset, data_dir, args):
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
goal = re.sub(r'<.*?>', '', sample['goal']['description'])
dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'goal': goal, 'dialogue': dialogue}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def get_max_len(data_by_split, tokenizer):
for data_split in data_by_split.keys():
seq_len = {}
for line in data_by_split[data_split]:
item = json.loads(line.strip())
for column, seq in item.items():
seq_len.setdefault(column, [])
seq_len[column].append(len(tokenizer.tokenize(seq)))
print(f"data split: {data_split}")
for column, lens in seq_len.items():
print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for seq2seq training")
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'goal2dialogue'], help='names of tasks')
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'nlg', 'goal2dialogue'], help='names of tasks')
parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of the tokenizer used to compute sequence length statistics')
args = parser.parse_args()
print(args)
if args.len_tokenizer:
tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, dataset_name)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
if args.len_tokenizer:
get_max_len(data_by_split, tokenizer)
def serialize_dialogue_state(state):
state_seqs = []
state_dict = {}
for domain in state:
for slot, value in state[domain].items():
for slot, value in sorted(state[domain].items()):
if len(value) > 0:
state_seqs.append(f'[{domain}][{slot}][{value}]')
return ';'.join(state_seqs)
state_dict.setdefault(f'[{domain}]', [])
state_dict[f'[{domain}]'].append(f'[{slot}][{value}]')
return ';'.join([domain+'('+','.join(slot_values)+')' for domain, slot_values in state_dict.items()])
def deserialize_dialogue_state(state_seq):
state = {}
if len(state_seq) == 0:
return state
state_seqs = state_seq.split('];[')
state_seqs = state_seq.split(']);[') # will consume "])" and "["
for i, state_seq in enumerate(state_seqs):
if len(state_seq) == 0:
if len(state_seq) == 0 or len(state_seq.split(']([')) != 2:
continue
if i == 0:
if state_seq[0] == '[':
state_seq = state_seq[1:]
if i == len(state_seqs) - 1:
if state_seq[-1] == ']':
state_seq = state_seq[:-1]
s = state_seq.split('][')
if len(s) != 3:
if state_seq[-2:] == '])':
state_seq = state_seq[:-2]
try:
domain, slot_values = state_seq.split(']([')
except:
continue
for slot_value in slot_values.split('],['):
try:
slot, value = slot_value.split('][')
except:
continue
domain, slot, value = s
state.setdefault(domain, {})
state[domain][slot] = value
return state
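To make the change above concrete, a hand-made comparison of the old and new state serializations, with invented state values:

# Example belief state with one empty slot value.
example_state = {'hotel': {'price range': 'cheap', 'area': ''}, 'taxi': {'leave at': '17:15'}}
# Old format: one [domain][slot][value] triple per non-empty slot, ';'-joined.
old_seq = '[hotel][price range][cheap];[taxi][leave at][17:15]'
# New format: slot-value pairs grouped under their domain, also ';'-joined.
new_seq = '[hotel]([price range][cheap]);[taxi]([leave at][17:15])'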
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLG Metric"""
import datasets
import sacrebleu
# TODO: Add BibTeX citation
_CITATION = """\
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
Metric to evaluate text-to-text models on the natural language generation task.
"""
_KWARGS_DESCRIPTION = """
Calculates corpus-bleu4
Args:
predictions: list of predictions to score. Each prediction
should be a string.
references: list of references, one per prediction. Each
reference should be a string.
Returns:
bleu: corpus-bleu score
Examples:
>>> nlg_metric = datasets.load_metric("nlg_metric.py")
>>> predictions = ["hello there general kenobi", "foo bar foobar"]
>>> references = ["hello there kenobi", "foo bar foobar"]
>>> results = nlg_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'bleu': 35.35533905932737}
"""
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NLGMetrics(datasets.Metric):
"""Metric to evaluate text-to-text models on the natural language generation task."""
def _info(self):
return datasets.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'predictions': datasets.Value('string'),
'references': datasets.Value('string'),
})
)
def _compute(self, predictions, references):
"""Returns the scores: bleu"""
bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
return {
"bleu": bleu
}
@@ -36,21 +36,15 @@ Returns:
seq_em: sequence exact match
accuracy: dialog acts accuracy
overall_f1: dialog acts overall f1
binary_f1: binary dialog acts f1
categorical_f1: categorical dialog acts f1
non-categorical_f1: non-categorical dialog acts f1
Examples:
>>> nlu_metric = datasets.load_metric("nlu_metric.py")
>>> predictions = ["[binary][thank][general][]", "[non-categorical][inform][taxi][leave at][17:15]"]
>>> references = ["[binary][thank][general][]", "[non-categorical][inform][train][leave at][17:15]"]
>>> predictions = ["[thank][general]{[][]}", "[inform][taxi]{[leave at][17:15]}"]
>>> references = ["[thank][general]{[][]}", "[inform][train]{[leave at][17:15]}"]
>>> results = nlu_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'seq_em': 0.5, 'accuracy': 0.5,
'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5,
'binary_f1': 1.0, 'binary_precision': 1.0, 'binary_recall': 1.0,
'categorical_f1': 0.0, 'categorical_precision': 0.0, 'categorical_recall': 0.0,
'non-categorical_f1': 0.0, 'non-categorical_precision': 0.0, 'non-categorical_recall': 0.0}
'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5}
"""
@@ -74,45 +68,33 @@ class NLUMetrics(datasets.Metric):
"""Returns the scores: sequence exact match, dialog acts accuracy and f1"""
seq_em = []
acc = []
f1_metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']}
f1_metrics = {'TP':0, 'FP':0, 'FN':0}
for prediction, reference in zip(predictions, references):
seq_em.append(prediction.strip()==reference.strip())
pred_da = deserialize_dialogue_acts(prediction)
gold_da = deserialize_dialogue_acts(reference)
flag = True
for da_type in ['binary', 'categorical', 'non-categorical']:
if da_type == 'binary':
predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]}))
labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]}))
pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in pred_da}))
gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in gold_da}))
acc.append(pred_da==gold_da)
for ele in pred_da:
if ele in gold_da:
f1_metrics['TP'] += 1
else:
predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]}))
labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]}))
for ele in predicts:
if ele in labels:
f1_metrics['overall']['TP'] += 1
f1_metrics[da_type]['TP'] += 1
else:
f1_metrics['overall']['FP'] += 1
f1_metrics[da_type]['FP'] += 1
for ele in labels:
if ele not in predicts:
f1_metrics['overall']['FN'] += 1
f1_metrics[da_type]['FN'] += 1
flag &= (predicts==labels)
acc.append(flag)
f1_metrics['FP'] += 1
for ele in gold_da:
if ele not in pred_da:
f1_metrics['FN'] += 1
for metric in list(f1_metrics.keys()):
TP = f1_metrics[metric].pop('TP')
FP = f1_metrics[metric].pop('FP')
FN = f1_metrics[metric].pop('FN')
TP = f1_metrics.pop('TP')
FP = f1_metrics.pop('FP')
FN = f1_metrics.pop('FN')
precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
f1_metrics.pop(metric)
f1_metrics[f'{metric}_f1'] = f1
f1_metrics[f'{metric}_precision'] = precision
f1_metrics[f'{metric}_recall'] = recall
f1_metrics[f'overall_f1'] = f1
f1_metrics[f'overall_precision'] = precision
f1_metrics[f'overall_recall'] = recall
return {
"seq_em": sum(seq_em)/len(seq_em),
n_gpus=1
task_name="nlu"
dataset_name="multiwoz21"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="multiwoz21"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=4
task_name="dst"
dataset_name="multiwoz21"
n_gpus=1
task_name="nlu"
dataset_name=$1
speaker="user"
context_window_size=100
context_window_size=$2
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
@@ -10,28 +10,26 @@ logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="dst_metric.py"
metric_for_best_model="accuracy"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="state_seq"
target_column="dialogue_acts_seq"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
@@ -40,7 +38,6 @@ python -m torch.distributed.launch \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--prediction_loss_only \
@@ -58,8 +55,7 @@ python -m torch.distributed.launch \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
@@ -76,8 +72,8 @@ python -m torch.distributed.launch \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm1"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm2"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm2"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm3"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm3"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
def serialize_dialogue_acts(dialogue_acts):
da_seqs = []
da_dict = {}
for da_type in dialogue_acts:
for da in dialogue_acts[da_type]:
intent, domain, slot = da['intent'], da['domain'], da['slot']
if da_type == 'binary':
da_seq = f'[{da_type}][{intent}][{domain}][{slot}]'
else:
value = da['value']
da_seq = f'[{da_type}][{intent}][{domain}][{slot}][{value}]'
da_seqs.append(da_seq)
return ';'.join(da_seqs)
intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
intent_domain = f'[{intent}][{domain}]'
da_dict.setdefault(intent_domain, [])
da_dict[intent_domain].append(f'[{slot}][{value}]')
return ';'.join([intent_domain+'('+','.join(slot_values)+')' for intent_domain, slot_values in da_dict.items()])
def deserialize_dialogue_acts(das_seq):
dialogue_acts = {'binary': [], 'categorical': [], 'non-categorical': []}
dialogue_acts = []
if len(das_seq) == 0:
return dialogue_acts
da_seqs = das_seq.split('];[')
da_seqs = das_seq.split(']);[') # will consume "])" and "["
for i, da_seq in enumerate(da_seqs):
if len(da_seq) == 0:
if len(da_seq) == 0 or len(da_seq.split(']([')) != 2:
continue
if i == 0:
if da_seq[0] == '[':
da_seq = da_seq[1:]
if i == len(da_seqs) - 1:
if da_seq[-1] == ']':
da_seq = da_seq[:-1]
da = da_seq.split('][')
if len(da) == 0:
if da_seq[-2:] == '])':
da_seq = da_seq[:-2]
try:
intent_domain, slot_values = da_seq.split(']([')
intent, domain = intent_domain.split('][')
except:
continue
for slot_value in slot_values.split('],['):
try:
slot, value = slot_value.split('][')
except:
continue
da_type = da[0]
if len(da) == 5 and da_type in ['categorical', 'non-categorical']:
dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3], 'value': da[4]})
elif len(da) == 4 and da_type == 'binary':
dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3]})
else:
# invalid da format, skip
# print(das_seq)
# print(da_seq)
# print()
pass
dialogue_acts.append({'intent': intent, 'domain': domain, 'slot': slot, 'value': value})
return dialogue_acts
def equal_da_seq(dialogue_acts, das_seq):
predict_dialogue_acts = deserialize_dialogue_acts(das_seq)
for da_type in ['binary', 'categorical', 'non-categorical']:
das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in dialogue_acts[da_type]])
predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts[da_type]])
das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da_type in ['binary', 'categorical', 'non-categorical'] for da in dialogue_acts[da_type]])
predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts])
if das != predict_das:
return False
return True
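Analogously, a hand-made example of the old and new dialogue act serializations, with invented acts:

# One non-categorical act and one binary act.
# Old format: [da_type][intent][domain][slot][value] per act (no value for binary acts).
old_seq = '[non-categorical][inform][taxi][leave at][17:15];[binary][thank][general][]'
# New format: slot-value pairs grouped under [intent][domain]; binary acts get empty brackets.
new_seq = '[inform][taxi]([leave at][17:15]);[thank][general]([][])'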