Skip to content
Snippets Groups Projects
Commit 196e2ade authored by zqwerty's avatar zqwerty
Browse files

add benchmark res for bertnlu

parent e2572afb
No related branches found
No related tags found
No related merge requests found
Showing
with 268 additions and 2 deletions
......@@ -6,8 +6,10 @@ def evaluate(predict_result):
predict_result = json.load(open(predict_result))
metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']}
acc = []
for sample in predict_result:
flag = True
for da_type in ['binary', 'categorical', 'non-categorical']:
if da_type == 'binary':
predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
......@@ -26,6 +28,8 @@ def evaluate(predict_result):
if ele not in predicts:
metrics['overall']['FN'] += 1
metrics[da_type]['FN'] += 1
flag &= (sorted(predicts)==sorted(labels))
acc.append(flag)
for metric in metrics:
TP = metrics[metric].pop('TP')
......@@ -37,6 +41,7 @@ def evaluate(predict_result):
metrics[metric]['precision'] = precision
metrics[metric]['recall'] = recall
metrics[metric]['f1'] = f1
metrics['accuracy'] = sum(acc)/len(acc)
return metrics
......
......@@ -31,6 +31,50 @@ $ python test.py --config_path path_to_a_config_file
The result (`output.json`) will be saved under the `output_dir` of the config file. Also, it will be zipped as `zipped_model_path` in the config file.
## Performance on unified format datasets
To illustrate that it is easy to use the model for any dataset that is in our unified format, we report the performance on several datasets in our unified format. We follow the `README.md` and config files in `unified_datasets/` to generate `predictions.json`, then evaluate it using `../evaluate_unified_datasets.py`. Note that we use almost the same hyper-parameters for different datasets, which may not be optimal.
<table>
<thead>
<tr>
<th></th>
<th colspan=2>MultiWOZ 2.1</th>
<th colspan=2>Taskmaster-1</th>
<th colspan=2>Taskmaster-2</th>
<th colspan=2>Taskmaster-3</th>
</tr>
</thead>
<thead>
<tr>
<th>Model</th>
<th>Acc</th><th>F1</th>
<th>Acc</th><th>F1</th>
<th>Acc</th><th>F1</th>
<th>Acc</th><th>F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>BERTNLU</td>
<td>74.5</td><td>85.9</td>
<td>72.8</td><td>50.6</td>
<td>79.2</td><td>70.6</td>
<td>86.1</td><td>81.9</td>
</tr>
<tr>
<td>BERTNLU (context=3)</td>
<td>80.6</td><td>90.3</td>
<td>74.2</td><td>52.7</td>
<td>80.9</td><td>73.3</td>
<td>87.8</td><td>83.8</td>
</tr>
</tbody>
</table>
- Acc: the proportion of utterances for which all dialogue acts are correctly predicted.
- F1: the F1 measure of the dialogue act predictions over the corpus.
## References
```
......
......@@ -2,7 +2,7 @@
"dataset_name": "multiwoz21",
"data_dir": "unified_datasets/data/multiwoz21/user/context_window_size_0",
"output_dir": "unified_datasets/output/multiwoz21/user/context_window_size_0",
"zipped_model_path": "unified_datasets/output/multiwoz21/user/context_window_size_0/bertnlu_unified_multiwoz_user_context0.zip",
"zipped_model_path": "unified_datasets/output/multiwoz21/user/context_window_size_0/bertnlu_unified_multiwoz21_user_context0.zip",
"log_dir": "unified_datasets/output/multiwoz21/user/context_window_size_0/log",
"DEVICE": "cuda:0",
"seed": 2019,
......
......@@ -2,7 +2,7 @@
"dataset_name": "multiwoz21",
"data_dir": "unified_datasets/data/multiwoz21/user/context_window_size_3",
"output_dir": "unified_datasets/output/multiwoz21/user/context_window_size_3",
"zipped_model_path": "unified_datasets/output/multiwoz21/user/context_window_size_3/bertnlu_unified_multiwoz_user_context3.zip",
"zipped_model_path": "unified_datasets/output/multiwoz21/user/context_window_size_3/bertnlu_unified_multiwoz21_user_context3.zip",
"log_dir": "unified_datasets/output/multiwoz21/user/context_window_size_3/log",
"DEVICE": "cuda:0",
"seed": 2019,
......
{
"dataset_name": "sgd",
"data_dir": "unified_datasets/data/sgd/user/context_window_size_0",
"output_dir": "unified_datasets/output/sgd/user/context_window_size_0",
"zipped_model_path": "unified_datasets/output/sgd/user/context_window_size_0/bertnlu_unified_sgd_user_context0.zip",
"log_dir": "unified_datasets/output/sgd/user/context_window_size_0/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 0,
"model": {
"finetune": true,
"context": false,
"context_grad": false,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 768
}
}
\ No newline at end of file
{
"dataset_name": "sgd",
"data_dir": "unified_datasets/data/sgd/user/context_window_size_3",
"output_dir": "unified_datasets/output/sgd/user/context_window_size_3",
"zipped_model_path": "unified_datasets/output/sgd/user/context_window_size_3/bertnlu_unified_sgd_user_context3.zip",
"log_dir": "unified_datasets/output/sgd/user/context_window_size_3/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 3,
"model": {
"finetune": true,
"context": true,
"context_grad": true,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 1536
}
}
\ No newline at end of file
{
"dataset_name": "tm1",
"data_dir": "unified_datasets/data/tm1/user/context_window_size_0",
"output_dir": "unified_datasets/output/tm1/user/context_window_size_0",
"zipped_model_path": "unified_datasets/output/tm1/user/context_window_size_0/bertnlu_unified_tm1_user_context0.zip",
"log_dir": "unified_datasets/output/tm1/user/context_window_size_0/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 0,
"model": {
"finetune": true,
"context": false,
"context_grad": false,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 768
}
}
\ No newline at end of file
{
"dataset_name": "tm1",
"data_dir": "unified_datasets/data/tm1/user/context_window_size_3",
"output_dir": "unified_datasets/output/tm1/user/context_window_size_3",
"zipped_model_path": "unified_datasets/output/tm1/user/context_window_size_3/bertnlu_unified_tm1_user_context3.zip",
"log_dir": "unified_datasets/output/tm1/user/context_window_size_3/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 3,
"model": {
"finetune": true,
"context": true,
"context_grad": true,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 1536
}
}
\ No newline at end of file
{
"dataset_name": "tm2",
"data_dir": "unified_datasets/data/tm2/user/context_window_size_0",
"output_dir": "unified_datasets/output/tm2/user/context_window_size_0",
"zipped_model_path": "unified_datasets/output/tm2/user/context_window_size_0/bertnlu_unified_tm2_user_context0.zip",
"log_dir": "unified_datasets/output/tm2/user/context_window_size_0/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 0,
"model": {
"finetune": true,
"context": false,
"context_grad": false,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 768
}
}
\ No newline at end of file
{
"dataset_name": "tm2",
"data_dir": "unified_datasets/data/tm2/user/context_window_size_3",
"output_dir": "unified_datasets/output/tm2/user/context_window_size_3",
"zipped_model_path": "unified_datasets/output/tm2/user/context_window_size_3/bertnlu_unified_tm2_user_context3.zip",
"log_dir": "unified_datasets/output/tm2/user/context_window_size_3/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 3,
"model": {
"finetune": true,
"context": true,
"context_grad": true,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 1536
}
}
\ No newline at end of file
{
"dataset_name": "tm3",
"data_dir": "unified_datasets/data/tm3/user/context_window_size_0",
"output_dir": "unified_datasets/output/tm3/user/context_window_size_0",
"zipped_model_path": "unified_datasets/output/tm3/user/context_window_size_0/bertnlu_unified_tm3_user_context0.zip",
"log_dir": "unified_datasets/output/tm3/user/context_window_size_0/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 0,
"model": {
"finetune": true,
"context": false,
"context_grad": false,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 10000,
"batch_size": 128,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 768
}
}
\ No newline at end of file
{
"dataset_name": "tm3",
"data_dir": "unified_datasets/data/tm3/user/context_window_size_3",
"output_dir": "unified_datasets/output/tm3/user/context_window_size_3",
"zipped_model_path": "unified_datasets/output/tm3/user/context_window_size_3/bertnlu_unified_tm3_user_context3.zip",
"log_dir": "unified_datasets/output/tm3/user/context_window_size_3/log",
"DEVICE": "cuda:0",
"seed": 2019,
"cut_sen_len": 40,
"use_bert_tokenizer": true,
"context_window_size": 3,
"model": {
"finetune": true,
"context": true,
"context_grad": true,
"pretrained_weights": "bert-base-uncased",
"check_step": 1000,
"max_step": 20000,
"batch_size": 64,
"learning_rate": 1e-4,
"adam_epsilon": 1e-8,
"warmup_steps": 0,
"weight_decay": 0.0,
"dropout": 0.1,
"hidden_units": 1536
}
}
\ No newline at end of file
......@@ -41,6 +41,7 @@ setup(
'numpy',
'nltk',
'scipy',
'tensorboard',
'torch>=1.6',
'transformers>=4.0',
'datasets>=1.8',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment