Commit 26a6256d authored by Christian


Merge branch 'github_master' of gitlab.cs.uni-duesseldorf.de:dsml/convlab/ConvLab3 into github_master
parents 0cd19bb3 ef06e29f
Showing 315 additions and 138 deletions
@@ -16,11 +16,12 @@
 ## Updates

+- **2023.2.26**: Update ConvLab on PyPI to 3.0.1 to reflect bug fixes.
 - **2022.11.30**: ConvLab-3 release.

 ## Installation

-You can install ConvLab-3 in one of the following ways according to your need. Higher versions of `torch` and `transformers` may also work.
+You can install ConvLab-3 in one of the following ways according to your need. We use `torch>=1.10.1,<=1.13` and `transformers>=4.17.0,<=4.24.0`. Higher versions of `torch` and `transformers` may also work.

 ### Git clone and pip install in development mode (Recommend)
...
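
For orientation, the "Git clone and pip install in development mode" route named above usually boils down to the commands below. This is a minimal sketch, not part of the diff; the GitHub mirror URL is an assumption.

```bash
# Assumed public mirror; substitute your own clone URL if it differs.
git clone https://github.com/ConvLab/ConvLab-3.git
cd ConvLab-3
pip install -e .
```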
@@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_dst_data
 from convlab.base_models.t5.dst.serialization import deserialize_dialogue_state

-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     if save_dir is None:
@@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_dst_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
         for sample in data:
             sample['predictions'] = {'state': predict_result[i]}
             i += 1
+            single.append(sample)
             merged.append(sample)
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
     json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
@@ -35,8 +39,8 @@ if __name__ == '__main__':
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances')
     parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
-    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
+    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file test_generated_predictions.json')
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
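
Besides the merged `predictions.json`, the updated script now also dumps one prediction file per dataset (the new `single` list), which is what the per-dataset evaluation loops added to the shell scripts below consume. A sketch of a call, with a hypothetical dataset combination and output path:

```bash
# Hypothetical example: merge T5-DST predictions from a combined multiwoz21+sgd run.
# Writes predictions.json plus multiwoz21_predictions.json and sgd_predictions.json
# into the directory of test_generated_predictions.json (no --save_dir given).
python merge_predict_res.py -d multiwoz21+sgd -s user -c 100 \
    -p output/dst/multiwoz21+sgd/user/context_100/test_generated_predictions.json
```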
 n_gpus=1
 task_name="dst"
-dataset_name=$1
+dataset_name=crosswoz
 speaker="user"
 context_window_size=100
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
@@ -17,10 +17,10 @@ target_column="state_seq"
 truncation_side="left"
 max_source_length=1024
 max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=64
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
+model_name_or_path="/data/zhuqi/pre-trained-models/mt5-small"
+per_device_train_batch_size=16
+per_device_eval_batch_size=16
+gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
@@ -80,6 +80,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -82,6 +82,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -o ${dial_ids_order}
 python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -30,7 +30,7 @@ mkdir -p ${data_dir}
 for name in ${names};
 do
     echo "preprocessing ${name}"
-    # python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
+    python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
 done
 python merge_data.py $(echo ${dataset_name} | tr "+" " ")
@@ -89,6 +89,10 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json
+done
\ No newline at end of file
@@ -3,7 +3,7 @@ import os
 from convlab.util import load_dataset, load_nlg_data

-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     if save_dir is None:
@@ -16,7 +16,8 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_nlg_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
         for sample in data:
@@ -24,8 +25,11 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
                 continue
             sample['predictions'] = {'utterance': predict_result[i]}
             i += 1
+            single.append(sample)
             merged.append(sample)
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
     json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
@@ -36,8 +40,8 @@ if __name__ == '__main__':
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances')
     parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
-    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
+    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file test_generated_predictions.json')
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
 n_gpus=1
 task_name="nlg"
-dataset_name=$1
-speaker="system"
-context_window_size=$2
+dataset_name=crosswoz
+speaker="all"
+context_window_size=0
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 cache_dir="../cache"
@@ -17,10 +17,10 @@ target_column="response"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
+model_name_or_path="/data/zhuqi/pre-trained-models/mt5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=16
+gradient_accumulation_steps=8
 lr=1e-3
 num_train_epochs=10
@@ -80,6 +80,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}

@@ -83,6 +83,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -o ${dial_ids_order}
 python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}

@@ -89,6 +89,10 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json --dataset_name ${name}
+done
\ No newline at end of file
@@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_nlu_data
 from convlab.base_models.t5.nlu.serialization import deserialize_dialogue_acts

-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     if save_dir is None:
@@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_nlu_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
         for sample in data:
             sample['predictions'] = {'dialogue_acts': predict_result[i]}
             i += 1
+            single.append(sample)
             merged.append(sample)
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
     json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
@@ -35,8 +39,8 @@ if __name__ == '__main__':
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances')
     parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
-    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
+    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file test_generated_predictions.json')
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
 n_gpus=1
 task_name="nlu"
-dataset_name=$1
-speaker="user"
-context_window_size=$2
+dataset_name=crosswoz
+speaker="all"
+context_window_size=0
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 cache_dir="../cache"
@@ -17,10 +17,10 @@ target_column="dialogue_acts_seq"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
+model_name_or_path="/data/zhuqi/pre-trained-models/mt5-small"
+per_device_train_batch_size=16
+per_device_eval_batch_size=16
+gradient_accumulation_steps=16
 lr=1e-3
 num_train_epochs=10
@@ -80,6 +80,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -83,6 +83,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -o ${dial_ids_order}
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -89,6 +89,10 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json
+done
\ No newline at end of file

@@ -81,6 +81,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -84,6 +84,6 @@ num_train_epochs=100
 # --optim adafactor \
 # --gradient_checkpointing
-# python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
+# python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -o ${dial_ids_order}
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -81,6 +81,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json

@@ -84,6 +84,6 @@ python ../run_seq2seq.py \
     --optim adafactor \
     --gradient_checkpointing
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -o ${dial_ids_order}
 python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
@@ -37,6 +37,8 @@ from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
+    T5ForConditionalGeneration,
+    T5Tokenizer,
     DataCollatorForSeq2Seq,
     HfArgumentParser,
     EarlyStoppingCallback,
@@ -358,6 +360,7 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
+    try:
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
@@ -374,6 +377,23 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
+    except:
+        tokenizer = T5Tokenizer.from_pretrained(
+            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+            cache_dir=model_args.cache_dir,
+            use_fast=model_args.use_fast_tokenizer,
+            truncation_side=model_args.truncation_side,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+        model = T5ForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
     model.resize_token_embeddings(len(tokenizer))
@@ -612,16 +632,17 @@ def main():
     # Predict
     if training_args.do_predict:
-        logger.info("*** Predict ***")
-        predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict")
+        file_prefix = os.path.splitext(os.path.basename(data_args.test_file))[0]
+        logger.info(f"*** Predict {file_prefix}***")
+        predict_results = trainer.predict(predict_dataset, metric_key_prefix=file_prefix)
         metrics = predict_results.metrics
         max_predict_samples = (
             data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
         )
-        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
-        trainer.log_metrics("predict", metrics)
-        trainer.save_metrics("predict", metrics)
+        metrics[f"{file_prefix}_samples"] = min(max_predict_samples, len(predict_dataset))
+        trainer.log_metrics(file_prefix, metrics)
+        trainer.save_metrics(file_prefix, metrics)

         if trainer.is_world_process_zero():
             if training_args.predict_with_generate:
@@ -629,10 +650,13 @@ def main():
                     predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                 )
                 predictions = [pred.strip() for pred in predictions]
-                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.json")
+                output_prediction_file = os.path.join(training_args.output_dir, f"{file_prefix}_generated_predictions.json")
                 with open(output_prediction_file, "w", encoding='utf-8') as writer:
-                    for sample, pred in zip(raw_datasets["test"], predictions):
-                        sample["predictions"] = pred
+                    for idx, sample in enumerate(raw_datasets["test"]):
+                        if training_args.num_return_sequences > 1:
+                            sample["predictions"] = predictions[idx*training_args.num_return_sequences:(idx+1)*training_args.num_return_sequences]
+                        else:
+                            sample["predictions"] = predictions[idx]
                         writer.write(json.dumps(sample, ensure_ascii=False)+'\n')

     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": data_args.task_name}
...
@@ -70,82 +70,88 @@ if __name__ == '__main__':
     dst = RuleDST()

-    # Action is a dict. Its keys are strings (domain-type pairs, both uppercase and lowercase are OK) and its values are lists of lists.
-    # The domain may be one of ('Attraction', 'Hospital', 'Booking', 'Hotel', 'Restaurant', 'Taxi', 'Train', 'Police').
-    # The type may be "inform" or "request".
-    # For example, the action below has a key "Hotel-Inform", in which "Hotel" is domain and "Inform" is action type.
-    # Each list in the value of "Hotel-Inform" is a slot-value pair. "Area" is slot and "east" is value. "Star" is slot and "4" is value.
+    # Action (dialog acts) is a list of (intent, domain, slot, value) tuples.
+    # RuleDST will only handle `inform` and `request` actions.
     action = [
-        ["Inform", "Hotel", "Area", "east"],
-        ["Inform", "Hotel", "Stars", "4"]
+        ["inform", "hotel", "area", "east"],
+        ["inform", "hotel", "stars", "4"]
     ]

     # method `update` updates the attribute `state` of tracker, and returns it.
     state = dst.update(action)
     assert state == dst.state
-    assert state == {'user_action': [],
-                     'system_action': [],
-                     'belief_state': {'police': {'book': {'booked': []}, 'semi': {}},
-                                      'hotel': {'book': {'booked': [], 'people': '', 'day': '', 'stay': ''},
-                                                'semi': {'name': '', 'area': 'east', 'parking': '', 'price range': '', 'stars': '4', 'internet': '', 'type': ''}},
-                                      'attraction': {'book': {'booked': []}, 'semi': {'type': '', 'name': '', 'area': ''}},
-                                      'restaurant': {'book': {'booked': [], 'people': '', 'day': '', 'time': ''},
-                                                     'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}},
-                                      'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
-                                      'taxi': {'book': {'booked': []},
-                                               'semi': {'leaveAt': '', 'destination': '', 'departure': '', 'arriveBy': ''}},
-                                      'train': {'book': {'booked': [], 'people': ''},
-                                                'semi': {'leaveAt': '', 'destination': '', 'day': '', 'arriveBy': '', 'departure': ''}}},
-                     'request_state': {},
-                     'terminated': False,
-                     'history': []}
+    assert state == {'belief_state': {'attraction': {'area': '', 'name': '', 'type': ''},
+                                      'hospital': {'department': ''},
+                                      'hotel': {'area': 'east', 'book day': '', 'book people': '', 'book stay': '', 'internet': '', 'name': '', 'parking': '', 'price range': '', 'stars': '4', 'type': ''},
+                                      'restaurant': {'area': '', 'book day': '', 'book people': '', 'book time': '', 'food': '', 'name': '', 'price range': ''},
+                                      'taxi': {'arrive by': '', 'departure': '', 'destination': '', 'leave at': ''},
+                                      'train': {'arrive by': '', 'book people': '', 'day': '', 'departure': '', 'destination': '', 'leave at': ''}},
+                     'booked': {},
+                     'history': [],
+                     'request_state': {},
+                     'system_action': [],
+                     'terminated': False,
+                     'user_action': []}

     # Please call `init_session` before a new dialog. This initializes the attribute `state` of tracker with a default state, which `convlab.util.multiwoz.state.default_state` returns. But you needn't call it before the first dialog, because tracker gets a default state in its constructor.
     dst.init_session()
-    action = [["Inform", "Train", "Arrive", "19:45"]]
+    action = [["inform", "train", "arrive by", "19:45"]]
     state = dst.update(action)
-    assert state == {'user_action': [],
-                     'system_action': [],
-                     'belief_state': {'police': {'book': {'booked': []}, 'semi': {}},
-                                      'hotel': {'book': {'booked': [], 'people': '', 'day': '', 'stay': ''},
-                                                'semi': {'name': '', 'area': '', 'parking': '', 'price range': '', 'stars': '', 'internet': '', 'type': ''}},
-                                      'attraction': {'book': {'booked': []}, 'semi': {'type': '', 'name': '', 'area': ''}},
-                                      'restaurant': {'book': {'booked': [], 'people': '', 'day': '', 'time': ''},
-                                                     'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}},
-                                      'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
-                                      'taxi': {'book': {'booked': []},
-                                               'semi': {'leaveAt': '', 'destination': '', 'departure': '', 'arriveBy': ''}},
-                                      'train': {'book': {'booked': [], 'people': ''},
-                                                'semi': {'leaveAt': '', 'destination': '', 'day': '', 'arriveBy': '19:45', 'departure': ''}}},
-                     'request_state': {},
-                     'terminated': False,
-                     'history': []}
+    assert state == {'belief_state': {'attraction': {'area': '', 'name': '', 'type': ''},
+                                      'hospital': {'department': ''},
+                                      'hotel': {'area': '', 'book day': '', 'book people': '', 'book stay': '', 'internet': '', 'name': '', 'parking': '', 'price range': '', 'stars': '', 'type': ''},
+                                      'restaurant': {'area': '', 'book day': '', 'book people': '', 'book time': '', 'food': '', 'name': '', 'price range': ''},
+                                      'taxi': {'arrive by': '', 'departure': '', 'destination': '', 'leave at': ''},
+                                      'train': {'arrive by': '19:45', 'book people': '', 'day': '', 'departure': '', 'destination': '', 'leave at': ''}},
+                     'booked': {},
+                     'history': [],
+                     'request_state': {},
+                     'system_action': [],
+                     'terminated': False,
+                     'user_action': []}

# -*- coding: utf-8 -*-
# Copyright 2023 DSML Group, Heinrich Heine University, Düsseldorf
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MultiWOZ Test data inference for RuleDST and BERTNLU+RuleDST"""
import json
from copy import deepcopy
import os
from tqdm import tqdm
from convlab.util import load_dataset, load_dst_data
from convlab.dst.rule.multiwoz.dst import RuleDST
from convlab.nlu.jointBERT.unified_datasets.nlu import BERTNLU
BERTNLU_PATH = "https://huggingface.co/ConvLab/bert-base-nlu/resolve/main/bertnlu_unified_multiwoz21_user_context3.zip"
def flatten_act(acts: dict) -> list:
acts_list = list()
for act_type, _acts in acts.items():
for act in _acts:
if 'value' in act:
_act = [act['intent'], act['domain'], act['slot'], act['value']]
else:
_act = [act['intent'], act['domain'], act['slot'], '']
acts_list.append(_act)
return acts_list
def load_act_data(dataset: dict) -> list:
data = list()
for dialogue in tqdm(dataset['test']):
dial = []
for _turn in dialogue['turns']:
if _turn['speaker'] == 'user':
turn = {'user_acts': flatten_act(_turn['dialogue_acts']),
'state': _turn['state']}
dial.append(turn)
data.append(dial)
return data
def load_text_data(dataset: dict) -> list:
data = list()
for dialogue in tqdm(dataset['test']):
dial = []
turn = {'user': '', 'system': 'Start', 'state': None}
for _turn in dialogue['turns']:
if _turn['speaker'] == 'user':
turn['user'] = _turn['utterance']
turn['state'] = _turn['state']
elif _turn['speaker'] == 'system':
turn['system'] = _turn['utterance']
if turn['user'] and turn['system']:
if turn['system'] == 'Start':
turn['system'] = ''
dial.append(deepcopy(turn))
turn = {'user': '', 'system': '', 'state': None}
data.append(dial)
return data
def predict_acts(data: list, nlu: BERTNLU) -> list:
processed_data = list()
for dialogue in tqdm(data):
context = list()
dial = list()
for turn in dialogue:
context.append(['sys', turn['system']])
acts = nlu.predict(turn['user'], context=context)
context.append(['usr', turn['user']])
dial.append({'user_acts': deepcopy(acts), 'state': turn['state']})
processed_data.append(dial)
return processed_data
def predict_states(data: list):
dst = RuleDST()
processed_data = list()
for dialogue in tqdm(data):
dst.init_session()
for turn in dialogue:
pred = dst.update(turn['user_acts'])
dial = {'state': turn['state'],
'predictions': {'state': deepcopy(pred['belief_state'])}}
processed_data.append(dial)
return processed_data
if __name__ == '__main__':
dataset = load_dataset(dataset_name='multiwoz21')
dataset = load_dst_data(dataset, data_split='test', speaker='all', dialogue_acts=True, split_to_turn=False)
data = load_text_data(dataset)
nlu = BERTNLU(mode='user', config_file='multiwoz21_user_context3.json', model_file=BERTNLU_PATH)
bertnlu_data = predict_acts(data, nlu)
golden_data = load_act_data(dataset)
bertnlu_data = predict_states(bertnlu_data)
golden_data = predict_states(golden_data)
path = os.path.dirname(os.path.realpath(__file__))
writer = open(os.path.join(path, f"predictions_BERTNLU-RuleDST.json"), 'w')
json.dump(bertnlu_data, writer)
writer.close()
writer = open(os.path.join(path, f"predictions_RuleDST.json"), 'w')
json.dump(golden_data, writer)
writer.close()
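
The dumped files pair each gold `state` with a predicted `belief_state`, the same layout the merged T5 prediction files use. A plausible follow-up, assuming these dumps are meant for the unified DST evaluation script referenced throughout this commit (paths relative to the repository root are an illustration, not part of the diff):

```bash
# Score rule-based tracking on golden user acts and on BERTNLU-predicted acts.
python convlab/dst/evaluate_unified_datasets.py -p predictions_RuleDST.json
python convlab/dst/evaluate_unified_datasets.py -p predictions_BERTNLU-RuleDST.json
```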