From edbebd06994214d3e354ea487014370978477d5c Mon Sep 17 00:00:00 2001
From: zqwerty <zhuq96@hotmail.com>
Date: Tue, 2 Aug 2022 08:39:17 +0800
Subject: [PATCH] Revert "update ft data processing"

This reverts commit df7e29a34156b0be1cb14a723a8cc6ea436ed464.
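
Restores the key2gen fine-tuning pipeline to its previous state:
create_data.py builds samples from the unified data loaders
(load_nlu_data/load_unified_data) and applies few-shot selection by
slicing train/validation; finetune.sh hardcodes its configuration and
loops over the five datasets, with separate seen/unseen test runs for
WoW; evaluate.py aggregates the WoW test splits accordingly; metric.py
reports F1 on a 0-1 scale; and the early-stopping patience default in
run_seq2seq.py returns to 10.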
---
 convlab/base_models/t5/key2gen/create_data.py | 104 +++------
 convlab/base_models/t5/key2gen/evaluate.py    |  45 ++--
 convlab/base_models/t5/key2gen/finetune.sh    | 219 ++++++++++--------
 convlab/base_models/t5/key2gen/metric.py      |  13 +-
 convlab/base_models/t5/run_seq2seq.py         |   2 +-
 5 files changed, 196 insertions(+), 187 deletions(-)
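
Note (this section is ignored by `git am`): after the revert,
create_nlg_data() consumes per-turn samples from load_nlu_data()
instead of iterating raw dialogues itself. A minimal sketch of the
restored sample shape, for orientation only -- field names are taken
from the diff below, the concrete values are made up:

    # illustrative sample, as produced by
    # load_nlu_data(dataset, speaker='system', use_context=True, context_window_size=3)
    sample = {
        'context': [                  # preceding turns, oldest first
            {'speaker': 'user', 'utterance': 'I need a cheap hotel.'},
            {'speaker': 'system', 'utterance': 'Any area preference?'},
            {'speaker': 'user', 'utterance': 'In the centre, please.'},
        ],
        'utterance': 'Okay, the Alexander B&B is cheap and central.',  # -> 'response'
        'dialogue_acts': {},          # -> 'knowledge'
    }
    context = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
    response = sample['utterance']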

diff --git a/convlab/base_models/t5/key2gen/create_data.py b/convlab/base_models/t5/key2gen/create_data.py
index 54cb8e2a..cb4e12c8 100644
--- a/convlab/base_models/t5/key2gen/create_data.py
+++ b/convlab/base_models/t5/key2gen/create_data.py
@@ -4,26 +4,18 @@ from tqdm import tqdm
 from convlab.util import load_dataset, load_unified_data, load_nlu_data
 
 def create_nlg_data(dataset, data_dir, args):
-    data_by_split = dataset
+    data_by_split = load_nlu_data(dataset, speaker='system', use_context=True, context_window_size=3)
     os.makedirs(data_dir, exist_ok=True)
 
     data_splits = data_by_split.keys()
     for data_split in data_splits:
         data = []
-        num_dial = 0
-        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            context = []
-            is_valid = False
-            for turn in dial['turns']:
-                response = turn['utterance']
-                context.append((turn['speaker'], turn['utterance']))
-                if turn['speaker'] == 'system' and len(context) > 1 and len(response) > 0:
-                    data.append(json.dumps({'context': context[-4:-1], 'knowledge': turn['dialogue_acts'], 'response': response}, ensure_ascii=False)+'\n')
-                    is_valid = True
-            if is_valid:
-                num_dial += 1
-            if 'test' not in data_split and args.shot and isinstance(args.shot, int) and args.shot >= 1 and args.shot == num_dial:
-                break
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
+            response = sample['utterance']
+            if len(context) > 0 and len(response) > 0:
+                knowledge = sample['dialogue_acts']
+                data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
 
         if 'test' in data_split:
             file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
@@ -35,34 +27,24 @@ def create_nlg_data(dataset, data_dir, args):
     return data_by_split
 
 def create_kvret_data(dataset, data_dir, args):
-    data_by_split = dataset
+    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
     os.makedirs(data_dir, exist_ok=True)
 
     domain2entity_col = {'schedule': 'event' ,'navigate': 'poi', 'weather': 'location'}
     data_splits = data_by_split.keys()
     for data_split in data_splits:
         data = []
-        num_dial = 0
-        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            context = []
-            is_valid = False
-            for turn in dial['turns']:
-                response = turn['utterance']
-                context.append((turn['speaker'], turn['utterance']))
-                if turn['speaker'] == 'system' and len(context) > 1 and len(response) > 0:
-                    knowledge = turn['db_results']
-                    if dial['domains'][0] == 'schedule' and len(knowledge['schedule']) == 0:
-                        continue
-                    for domain, db_items in knowledge.items():
-                        entity_col = domain2entity_col[domain]
-                        for db_item in db_items:
-                            db_item['entity'] = db_item.pop(entity_col)
-                    data.append(json.dumps({'context': context[:-1], 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
-                    is_valid = True
-            if is_valid:
-                num_dial += 1
-            if 'test' not in data_split and args.shot and isinstance(args.shot, int) and args.shot >= 1 and args.shot == num_dial:
-                break
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
+            response = sample['utterance']
+            if len(context) > 0 and len(response) > 0:
+                knowledge = sample['db_results']
+                for domain, db_items in knowledge.items():
+                    entity_col = domain2entity_col[domain]
+                    for db_item in db_items:
+                        db_item['entity'] = db_item.pop(entity_col)
+
+                data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
 
         if 'test' in data_split:
             file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
@@ -80,21 +62,14 @@ def create_personachat_data(dataset, data_dir, args):
     data_splits = data_by_split.keys()
     for data_split in data_splits:
         data = []
-        num_dial = 0
         for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             knowledge = dial['persona']['system']
             context = []
-            is_valid = False
             for turn in dial['turns']:
                 response = turn['utterance']
+                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
                 context.append((turn['speaker'], turn['utterance']))
-                if turn['speaker'] == 'system' and len(context) > 1 and len(response) > 0:
-                    data.append(json.dumps({'context': context[:-1], 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
-                    is_valid = True
-            if is_valid:
-                num_dial += 1
-            if 'test' not in data_split and args.shot and isinstance(args.shot, int) and args.shot >= 1 and args.shot == num_dial:
-                break
 
         if 'test' in data_split:
             file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
@@ -108,30 +83,22 @@ def create_personachat_data(dataset, data_dir, args):
 def create_wow_data(dataset, data_dir, args):
     data_by_split = dataset
     os.makedirs(data_dir, exist_ok=True)
-    data_by_split['test'] = data_by_split['test_seen'] + data_by_split['test_unseen']
-    data_by_split.pop('test_seen')
-    data_by_split.pop('test_unseen')
 
     data_splits = data_by_split.keys()
     for data_split in data_splits:
         data = []
-        num_dial = 0
         for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             context = []
-            is_valid = False
             for turn in dial['turns']:
                 response = turn['utterance']
-                context.append((turn['speaker'], turn['utterance']))
-                if turn['speaker'] == 'system' and len(context) > 1 and len(response) > 0:
+                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
                     knowledge = turn['checked_passage']
                     if knowledge is None:
-                        continue
-                    data.append(json.dumps({'context': context[:-1], 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
-                    is_valid = True
-            if is_valid:
-                num_dial += 1
-            if 'test' not in data_split and args.shot and isinstance(args.shot, int) and args.shot >= 1 and args.shot == num_dial:
-                break
+                        knowledge = []
+                    elif isinstance(knowledge, str):
+                        knowledge = [knowledge]
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+                context.append((turn['speaker'], turn['utterance']))
 
         if 'test' in data_split:
             file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
@@ -149,22 +116,14 @@ def create_opendialkg_data(dataset, data_dir, args):
     data_splits = data_by_split.keys()
     for data_split in data_splits:
         data = []
-        num_dial = 0
         for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             context = []
-            is_valid = False
             for turn in dial['turns']:
                 response = turn['utterance']
-                context.append((turn['speaker'], turn['utterance']))
                 if turn['speaker'] == 'system' and 'kg_path' in turn and len(context) > 0 and len(response) > 0:
                     knowledge = turn['kg_path']['triples']
-                    assert len(knowledge) > 0
-                    data.append(json.dumps({'context': context[:-1], 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
-                    is_valid = True
-            if is_valid:
-                num_dial += 1
-            if 'test' not in data_split and args.shot and isinstance(args.shot, int) and args.shot >= 1 and args.shot == num_dial:
-                break
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+                context.append((turn['speaker'], turn['utterance']))
 
         if 'test' in data_split:
             file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
@@ -188,14 +147,13 @@ if __name__ == '__main__':
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name, dial_ids_order=args.dial_ids_order)
         if args.shot:
-            # few-shot
             if args.shot < 1:
-                # percentage
                 dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.shot)]
                 dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.shot)]
             else:
-                # absolute, handle inside process function
                 args.shot = int(args.shot)
+                dataset['train'] = dataset['train'][:args.shot]
+                dataset['validation'] = dataset['validation'][:args.shot]
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join('data', task_name, (dataset_name if not args.shot else f'{dataset_name}_{args.shot}shot_order{args.dial_ids_order}'))
             data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
diff --git a/convlab/base_models/t5/key2gen/evaluate.py b/convlab/base_models/t5/key2gen/evaluate.py
index 769fdfcf..7acb1118 100644
--- a/convlab/base_models/t5/key2gen/evaluate.py
+++ b/convlab/base_models/t5/key2gen/evaluate.py
@@ -59,26 +59,31 @@ if __name__ == '__main__':
         for shot in tqdm(args.shots, desc='shots', leave=False):
             for output_dir in tqdm(args.output_dirs, desc='models', leave=False):
                 model_name = output_dir.split('/')[-1]
-                results = []
-                for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders', leave=False):
-                    result_dir = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen")
-                    result_file = os.path.join(result_dir, "result.json")
-                    if not os.path.exists(result_file):
-                        filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen/generated_predictions.json")
-                        result = evaluate(filename, metric)
-                        json.dump(result, open(result_file, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
-                    else:
-                        result = json.load(open(result_file))
-                    results.append(result)
-                res = {
-                    "dataset": f"{task_name}-{shot}shot",
-                    "model": f"{model_name}",
-                    **avg_result(results)
-                }
-                table.append(res)
-                for k in res:
-                    if k not in fieldnames:
-                        fieldnames.append(k)
+                if task_name == "wow":
+                    test_splits = ["_seen", "_unseen"]
+                else:
+                    test_splits = [""]
+                for test_split in test_splits:
+                    results = []
+                    for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders', leave=False):
+                        result_dir = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}")
+                        result_file = os.path.join(result_dir, "result.json")
+                        if not os.path.exists(result_file):
+                            filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}/generated_predictions.json")
+                            result = evaluate(filename, metric)
+                            json.dump(result, open(result_file, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+                        else:
+                            result = json.load(open(result_file))
+                        results.append(result)
+                    res = {
+                        "dataset": f"{task_name}{test_split}-{shot}shot",
+                        "model": f"{model_name}",
+                        **avg_result(results)
+                    }
+                    table.append(res)
+                    for k in res:
+                        if k not in fieldnames:
+                            fieldnames.append(k)
                     
     res = tabulate(table, headers='keys', tablefmt='github')
     with open(f'eval_results.txt', 'w', encoding='utf-8') as f:
diff --git a/convlab/base_models/t5/key2gen/finetune.sh b/convlab/base_models/t5/key2gen/finetune.sh
index df534068..390ea1a9 100644
--- a/convlab/base_models/t5/key2gen/finetune.sh
+++ b/convlab/base_models/t5/key2gen/finetune.sh
@@ -1,24 +1,15 @@
-set -e
-dataset_path=$1
-model_name=$2
-model_name_or_path=$3
-dataset_name=$4
-if [ "${dataset_name}" == "multiwoz21" ]
-then
-    task_name="nlg"
-else
-    task_name=${dataset_name}
-fi
-master_port=$5
-
 n_gpus=2
+master_port=23456
 cache_dir="../cache"
+dataset_path="dataset_vanilla.py"
 metric_name_or_path="metric.py"
 source_column="context+knowledge"
 target_column="response"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
+model_name="t5-small"
+model_name_or_path="t5-small"
 per_device_train_batch_size=64
 per_device_eval_batch_size=64
 gradient_accumulation_steps=1
@@ -26,91 +17,137 @@ num_workers=16
 lr=1e-3
 num_train_epochs=100
 
-for shot in 50 100 200
+for dataset_name in multiwoz21 kvret opendialkg wow personachat
 do
-    for dial_ids_order in 0 1 2 3 4
+    if [ "${dataset_name}" == "multiwoz21" ]
+    then
+        task_name="nlg"
+    else
+        task_name=${dataset_name}
+    fi
+    for shot in 50 100 200
     do
-        python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}
+        for dial_ids_order in 0 1 2 3 4
+        do
+            python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}
 
-        data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
-        output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
-        logging_dir="${output_dir}/runs"
-        train_file="${data_dir}/train.json"
-        validation_file="${data_dir}/validation.json"
+            data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+            output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+            logging_dir="${output_dir}/runs"
+            train_file="${data_dir}/train.json"
+            validation_file="${data_dir}/validation.json"
 
-        # training
-        python -m torch.distributed.launch --master_port ${master_port} \
-            --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-            --task_name ${task_name} \
-            --dataset_name ${dataset_path} \
-            --dataset_config_name ${task_name} \
-            --train_file ${train_file} \
-            --validation_file ${validation_file} \
-            --source_column ${source_column} \
-            --target_column ${target_column} \
-            --max_source_length ${max_source_length} \
-            --max_target_length ${max_target_length} \
-            --truncation_side ${truncation_side} \
-            --model_name_or_path ${model_name_or_path} \
-            --do_train \
-            --do_eval \
-            --save_strategy epoch \
-            --evaluation_strategy epoch \
-            --save_total_limit 1 \
-            --prediction_loss_only \
-            --load_best_model_at_end \
-            --overwrite_output_dir \
-            --cache_dir ${cache_dir} \
-            --output_dir ${output_dir} \
-            --logging_dir ${logging_dir} \
-            --preprocessing_num_workers ${num_workers} \
-            --dataloader_num_workers ${num_workers} \
-            --per_device_train_batch_size ${per_device_train_batch_size} \
-            --per_device_eval_batch_size ${per_device_eval_batch_size} \
-            --gradient_accumulation_steps ${gradient_accumulation_steps} \
-            --learning_rate ${lr} \
-            --num_train_epochs ${num_train_epochs} \
-            --optim adafactor \
-            --lr_scheduler_type constant \
-            --gradient_checkpointing
+            # training
+            python -m torch.distributed.launch --master_port ${master_port} \
+                --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                --task_name ${task_name} \
+                --dataset_name ${dataset_path} \
+                --dataset_config_name ${task_name} \
+                --train_file ${train_file} \
+                --validation_file ${validation_file} \
+                --source_column ${source_column} \
+                --target_column ${target_column} \
+                --max_source_length ${max_source_length} \
+                --max_target_length ${max_target_length} \
+                --truncation_side ${truncation_side} \
+                --model_name_or_path ${model_name_or_path} \
+                --do_train \
+                --do_eval \
+                --save_strategy epoch \
+                --evaluation_strategy epoch \
+                --save_total_limit 1 \
+                --prediction_loss_only \
+                --load_best_model_at_end \
+                --overwrite_output_dir \
+                --cache_dir ${cache_dir} \
+                --output_dir ${output_dir} \
+                --logging_dir ${logging_dir} \
+                --preprocessing_num_workers ${num_workers} \
+                --dataloader_num_workers ${num_workers} \
+                --per_device_train_batch_size ${per_device_train_batch_size} \
+                --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                --learning_rate ${lr} \
+                --num_train_epochs ${num_train_epochs} \
+                --optim adafactor \
+                --lr_scheduler_type constant \
+                --gradient_checkpointing
 
-        # inference
-        test_file="data/${task_name}/test.json"
-        gen_output_dir="${output_dir}/gen"
+            # inference
+            if [ "${dataset_name}" == "wow" ]
+            then
+                for test_split in seen unseen
+                do
+                    test_file="data/${task_name}/test_${test_split}.json"
+                    gen_output_dir="${output_dir}/gen_${test_split}"
 
-        python -m torch.distributed.launch --master_port ${master_port} \
-            --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-            --task_name ${task_name} \
-            --dataset_name ${dataset_path} \
-            --dataset_config_name ${task_name} \
-            --metric_name_or_path ${metric_name_or_path} \
-            --metric_config_name ${task_name} \
-            --test_file ${test_file} \
-            --source_column ${source_column} \
-            --target_column ${target_column} \
-            --max_source_length ${max_source_length} \
-            --max_target_length ${max_target_length} \
-            --truncation_side ${truncation_side} \
-            --model_name_or_path ${output_dir} \
-            --do_predict \
-            --predict_with_generate \
-            --cache_dir ${cache_dir} \
-            --output_dir ${gen_output_dir} \
-            --logging_dir ${logging_dir} \
-            --overwrite_output_dir \
-            --preprocessing_num_workers ${num_workers} \
-            --dataloader_num_workers ${num_workers} \
-            --per_device_train_batch_size ${per_device_train_batch_size} \
-            --per_device_eval_batch_size ${per_device_eval_batch_size} \
-            --gradient_accumulation_steps ${gradient_accumulation_steps} \
-            --learning_rate ${lr} \
-            --num_train_epochs ${num_train_epochs} \
-            --optim adafactor \
-            --lr_scheduler_type constant \
-            --gradient_checkpointing
-        
+                    python -m torch.distributed.launch --master_port ${master_port} \
+                        --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                        --task_name ${task_name} \
+                        --dataset_name ${dataset_path} \
+                        --dataset_config_name ${task_name} \
+                        --test_file ${test_file} \
+                        --source_column ${source_column} \
+                        --target_column ${target_column} \
+                        --max_source_length ${max_source_length} \
+                        --max_target_length ${max_target_length} \
+                        --truncation_side ${truncation_side} \
+                        --model_name_or_path ${output_dir} \
+                        --do_predict \
+                        --predict_with_generate \
+                        --cache_dir ${cache_dir} \
+                        --output_dir ${gen_output_dir} \
+                        --logging_dir ${logging_dir} \
+                        --overwrite_output_dir \
+                        --preprocessing_num_workers ${num_workers} \
+                        --dataloader_num_workers ${num_workers} \
+                        --per_device_train_batch_size ${per_device_train_batch_size} \
+                        --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                        --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                        --learning_rate ${lr} \
+                        --num_train_epochs ${num_train_epochs} \
+                        --optim adafactor \
+                        --lr_scheduler_type constant \
+                        --gradient_checkpointing
+                done
+            else
+                test_file="data/${task_name}/test.json"
+                gen_output_dir="${output_dir}/gen"
+            
+                python -m torch.distributed.launch --master_port ${master_port} \
+                    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                    --task_name ${task_name} \
+                    --dataset_name ${dataset_path} \
+                    --dataset_config_name ${task_name} \
+                    --metric_name_or_path ${metric_name_or_path} \
+                    --metric_config_name ${task_name} \
+                    --test_file ${test_file} \
+                    --source_column ${source_column} \
+                    --target_column ${target_column} \
+                    --max_source_length ${max_source_length} \
+                    --max_target_length ${max_target_length} \
+                    --truncation_side ${truncation_side} \
+                    --model_name_or_path ${output_dir} \
+                    --do_predict \
+                    --predict_with_generate \
+                    --cache_dir ${cache_dir} \
+                    --output_dir ${gen_output_dir} \
+                    --logging_dir ${logging_dir} \
+                    --overwrite_output_dir \
+                    --preprocessing_num_workers ${num_workers} \
+                    --dataloader_num_workers ${num_workers} \
+                    --per_device_train_batch_size ${per_device_train_batch_size} \
+                    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                    --learning_rate ${lr} \
+                    --num_train_epochs ${num_train_epochs} \
+                    --optim adafactor \
+                    --lr_scheduler_type constant \
+                    --gradient_checkpointing
+            fi
+        done
     done
 done
 
 # evaluation
-python evaluate.py --output_dirs output/${model_name} -t ${task_name} -s 50 100 200 -o 0 1 2 3 4
\ No newline at end of file
+python evaluate.py --output_dirs output/${model_name} -t nlg kvret opendialkg personachat wow -s 50 100 200 -o 0 1 2 3 4
\ No newline at end of file
diff --git a/convlab/base_models/t5/key2gen/metric.py b/convlab/base_models/t5/key2gen/metric.py
index ce385c9d..d3e49318 100644
--- a/convlab/base_models/t5/key2gen/metric.py
+++ b/convlab/base_models/t5/key2gen/metric.py
@@ -65,6 +65,15 @@ Returns:
     unigram f1: unigram overlap, from parlai
     distinct-1/2: from parlai
     other knowledge utility score: task-specific knowledge utility metrics
+
+Examples:
+
+    >>> nlg_metric = datasets.load_metric("metric.py", "nlg")
+    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
+    >>> references = ["hello there kenobi", "foo bar foobar"]
+    >>> results = nlg_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {"bleu": 35.35533905932737}
 """
 
 re_art = re.compile(r'\b(a|an|the)\b')
@@ -316,11 +325,11 @@ def f1_score(y_pred, y_true, average="micro"):
 
     if average == "macro":
         F1_macro_score = F1_pred / float(F1_count) if F1_count != 0 else 0
-        return F1_macro_score * 100
+        return F1_macro_score
     elif average == "micro":
         P_score = TP_all / float(TP_all + FP_all) if (TP_all + FP_all) != 0 else 0
         R_score = TP_all / float(TP_all + FN_all) if (TP_all + FN_all) != 0 else 0
-        F1_micro_score = _compute_F1(P_score, R_score) * 100
+        F1_micro_score = _compute_F1(P_score, R_score)
         return F1_micro_score
     else:
         raise ValueError("Options other than micro/macro are not supported.")
diff --git a/convlab/base_models/t5/run_seq2seq.py b/convlab/base_models/t5/run_seq2seq.py
index 1566cd4b..5fa921f0 100644
--- a/convlab/base_models/t5/run_seq2seq.py
+++ b/convlab/base_models/t5/run_seq2seq.py
@@ -221,7 +221,7 @@ class DataTrainingArguments:
         default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
     )
     early_stopping_patience: Optional[int] = field(
-        default=5, metadata={"help": "early stopping patience, set to 0 if you do not want to use early stopping."},
+        default=10, metadata={"help": "early stopping patience, set to 0 if you do not want to use early stopping."},
     )
 
     def __post_init__(self):
-- 
GitLab