diff --git a/convlab/dst/setsumbt/README.md b/convlab/dst/setsumbt/README.md
index c081dedbde4e2e2a6aa706f7495456f735cef37a..96701aef8ced46dc0f08d50f3e6ce72b8f04673d 100644
--- a/convlab/dst/setsumbt/README.md
+++ b/convlab/dst/setsumbt/README.md
@@ -1,63 +1,183 @@
-# Our paper
-[Uncertainty Measures in Neural Belief Tracking and the Effects on Dialogue Policy Performance](https://todo.pdf)
+# SetSUMBT & SUMBT
+## Dialogue State Tracking and Language Understanding
-## Structure
-
+SUMBT and SetSUMBT are a family of dialogue state tracking models.
+These models include natural language understanding prediction heads which
+provide crucial information, such as the user request actions, required to
+incorporate the model in a pipeline dialogue system. [SUMBT](https://arxiv.org/pdf/1907.07421.pdf)
+utilises a Slot-Utterance Matching (SUM) attention mechanism for information extraction,
+a recurrent module for latent information tracking and a picklist state
+prediction head using similarity based matching. [SetSUMBT](https://aclanthology.org/2021.emnlp-main.623/)
+extends SUMBT by replacing the Slot-Utterance matching module with a set-based
+Slot-Utterance matching module and by using a set-based similarity matching
+prediction head. SetSUMBT also introduces the language understanding prediction
+heads required for predicting this additional crucial information. In addition,
+the code allows training an ensemble of models and distilling the ensemble into a
+single belief tracking model which predicts well-calibrated belief states.
+
+
+## Our paper
+[Uncertainty Measures in Neural Belief Tracking and the Effects on Dialogue Policy Performance](https://aclanthology.org/2021.emnlp-main.623/)
+
+## SetSUMBT Model Architecture
+
+The default configurations of the models are as follows:
+
+| Hyperparameter | SetSUMBT | SUMBT |
+|:----------------------------|:---------------------------------------------------:|:-------------------------------------------------------------:|
+| Max Turns | 12 | 12 |
+| Max Turn Length | 64 | 64 |
+| Max Candidate Desc. Length | 12 | 12 |
+| Encoder model | [roberta-base](https://huggingface.co/roberta-base) | [bert-base-uncased](https://huggingface.co/bert-base-uncased) |
+| Hidden Size | 768 | 768 |
+| SUM Attention Heads | 12 | 12 |
+| Dropout rate | 0.3 | 0.3 |
+| Tracker type | GRU | GRU |
+| Tracker Hidden Size | 300 | 300 |
+| Tracker RNN Layers | 1 | 1 |
+| Set Pooler type | CNN | No Set Pooler |
+| Candidate Desc. Pooler type | No Pooler | CLS Token |
+| Loss Function | Label smoothing | Label smoothing |
+| Epochs | 50 | 50 |
+| Early stopping criteria | 20 Epochs | 20 Epochs |
+| Learning rate | 5e-5 | 5e-5 |
+| LR Scheduler | Linear(0.2) | Linear(0.2) |
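+
+The per-dataset training setups described in the Usages section below are selected via
+starting configuration files. As a quick orientation (a minimal sketch, assuming you are
+working from the ConvLab-3 repository root), the available configurations can be listed with:
+```
+ls convlab/dst/setsumbt/configs/
+```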
 ## Usages
-### Data preprocessing
+### Data sets
 We conduct experiments on the following datasets:
-* MultiWOZ 2.1 [Download](https://github.com/budzianowski/multiwoz/raw/master/data/MultiWOZ_2.1.zip) to get `MULTIWOZ2.1.zip`
+* [MultiWOZ 2.1](https://huggingface.co/datasets/ConvLab/multiwoz21)
+* [Schema Guided Dialogue (SGD)](https://huggingface.co/datasets/ConvLab/sgd)
+* [Taskmaster 1](https://huggingface.co/datasets/ConvLab/tm1)
+* [Taskmaster 2](https://huggingface.co/datasets/ConvLab/tm2)
+* [Taskmaster 3](https://huggingface.co/datasets/ConvLab/tm3)
+
+### Model checkpoints available on Huggingface
+
+The following pre-trained model checkpoints are available via the Hugging Face hub:
+
+| Model | Dataset | Training Setup | Checkpoint |
+|:---------|:-------------|:---------------------------------|:----------------------------------------------------------------------------------|
+| SetSUMBT | MultiWOZ 2.1 | Full dataset | [setsumbt-dst-multiwoz21](https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21) |
+| SetSUMBT | SGD | Full dataset | [setsumbt-dst-sgd](https://huggingface.co/ConvLab/setsumbt-dst-sgd) |
+| SetSUMBT | TM1+TM2+TM3 | Full dataset | [setsumbt-dst-tm123](https://huggingface.co/ConvLab/setsumbt-dst-tm123) |
+| SetSUMBT | MultiWOZ 2.1 | DST+NLU tasks + Uncertainty Est. | [setsumbt-dst_nlu-multiwoz21-EnD2](https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2) |
 ### Train
 **Train baseline single instance SetSUMBT**
+
+The command below trains the model on the MultiWOZ 2.1 dataset. To train the model on
+other datasets/setups or to train the SUMBT model, set the relevant `starting_config_name`.
+To fine-tune a pre-trained model, set `model_name_or_path` to the path of the pre-trained
+model.
+See the table below for the other available configurations of these models:
+
+| Model | Dataset | Training Setup | Starting Config Name |
+|:---------|:---------------------|:-----------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------|
+| SetSUMBT | MultiWOZ21 | Full dataset | [setsumbt_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json) |
+| SetSUMBT | MultiWOZ21 | DST and NLU Tasks | [setsumbt_nlu_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json) |
+| SetSUMBT | MultiWOZ21 | Ensemble Distillation | [setsumbt_nlu_multiwoz21_end](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json) |
+| SetSUMBT | MultiWOZ21 | Ensemble Distribution Distillation | [setsumbt_nlu_multiwoz21_end2](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json) |
+| SetSUMBT | MultiWOZ21 | 10% of the training data | [setsumbt_multiwoz21_10p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json) |
+| SetSUMBT | MultiWOZ21 | 1% of the training data | [setsumbt_multiwoz21_1p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json) |
+| SetSUMBT | TM1+TM2+TM3 | Full dataset | [setsumbt_tm](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_tm.json) |
+| SetSUMBT | SGD | Full dataset | [setsumbt_sgd](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_sgd.json) |
+| SetSUMBT | MW21+SGD+TM1+TM2+TM3 | Joint training | [setsumbt_joint](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_joint.json) |
+| SetSUMBT | SGD+TM1+TM2+TM3 | Pre-training | [setsumbt_pretrain](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_pretrain.json) |
+| SUMBT | MultiWOZ21 | Full dataset | [sumbt_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json) |
+| SUMBT | MultiWOZ21 | 10% of the training data | [sumbt_multiwoz21_10p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json) |
+| SUMBT | MultiWOZ21 | 1% of the training data | [sumbt_multiwoz21_1p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json) |
+| SUMBT | TM1+TM2+TM3 | Full dataset | [sumbt_tm](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_tm.json) |
+| SUMBT | SGD | Full dataset | [sumbt_sgd](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_sgd.json) |
+| SUMBT | MW21+SGD+TM1+TM2+TM3 | Joint training | [sumbt_joint](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_joint.json) |
+| SUMBT | SGD+TM1+TM2+TM3 | Pre-training | [sumbt_pretrain](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_pretrain.json) |
+
 ```
-python run.py --run_nbt \
-    --use_descriptions --set_similarity \
-    --do_train --do_eval \
-    --seed 20211202
+python3 run.py \
+    --starting_config_name setsumbt_multiwoz21 \
+    --seed 0 \
+    --do_train
 ```
 **Train ensemble SetSUMBT**
 ```
 SEED=20211202
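+# A sketch of the workflow below (as suggested by the flags used in this README):
+# the first run.py call sets up the ensemble, sub-sampling a training set of size
+# DATA_SIZE for each of the ENSEMBLE_SIZE members, and the loop then trains each
+# ensemble member in its own sub-directory of MODEL_PATH.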
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./configure_ensemble.sh $SEED $MODEL_PATH
-./train_ensemble.sh $SEED $MODEL_PATH
+ENSEMBLE_SIZE=10
+DATA_SIZE=7500
+
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21 \
+    --output_dir $MODEL_PATH \
+    --ensemble_size $ENSEMBLE_SIZE \
+    --data_sampling_size $DATA_SIZE \
+    --seed $SEED
+
+ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
+for e in $(seq 0 $ENSEMBLE_SIZE);do
+    python3 run.py \
+        --starting_config_name setsumbt_nlu_multiwoz21 \
+        --output_dir "$MODEL_PATH/ens-$e" \
+        --do_train \
+        --seed $SEED
+done
 ```
 **Distill Ensemble SetSUMBT**
 ```
 SEED=20211202
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./distill_end.sh $SEED $MODEL_PATH
+for SUBSET in train dev test;do
+    python3 distillation_setup.py \
+        --model_path $MODEL_PATH \
+        --set_type $SUBSET \
+        --reduction mean \
+        --get_ensemble_distributions \
+        --convert_distributions_to_predictions
+done
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21_end \
+    --seed $SEED \
+    --output_dir $MODEL_PATH \
+    --do_train
 ```
 **Distribution Distill Ensemble SetSUMBT**
 ```
 SEED=20211202
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./distill_end2.sh $SEED $MODEL_PATH
+for SUBSET in train dev test;do
+    python3 distillation_setup.py \
+        --model_path $MODEL_PATH \
+        --set_type $SUBSET \
+        --reduction none \
+        --get_ensemble_distributions \
+        --convert_distributions_to_predictions
+done
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21_end2 \
+    --seed $SEED \
+    --output_dir $MODEL_PATH \
+    --do_train
 ```
 ### Evaluation
-```
-SEED=20211202
-MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-python run.py --run_calibration \
-    --seed $SEED \
-    --output_dir $MODEL_PATH
-```
-
-### Convert training setup to convlab model
+To evaluate a model, set `$MODEL_PATH` to the path or URL of that model.
+The URL should be the download URL of the archive for the relevant pre-trained model;
+for example, for `setsumbt-dst-multiwoz21` the URL is
+https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21/resolve/main/SetSUMBT-multiwoz21-roberta-gru-cosine-labelsmoothing-Seed0.zip.
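+
+For example, a minimal sketch of pointing `$MODEL_PATH` at the pre-trained
+`setsumbt-dst-multiwoz21` archive (the URL given above) before running the
+evaluation commands below; a local training output directory works the same way:
+```
+MODEL_PATH="https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21/resolve/main/SetSUMBT-multiwoz21-roberta-gru-cosine-labelsmoothing-Seed0.zip"
+```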
``` -SEED=20211202 -MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')" -OUT_PATH="models/labelsmoothing" -./configure_model.sh $MODEL_PATH data $OUT_PATH +python3 run.py \ + --starting_config_name setsumbt_multiwoz21 \ + --output_dir $MODEL_PATH \ + --do_test +python3 get_golden_labels.py \ + --dataset_name multiwoz21 \ + --model_path $MODEL_PATH +python3 ../evaluate_unified_datasets.py \ + -p "$MODEL_PATH/predictions/test_multiwoz21.json" ``` ### Training PPO policy using SetSUMBT tracker and uncertainty @@ -68,5 +188,5 @@ cd ../../policy/ppo ``` In this directory run the relevant train script, for example to train the policy using END-SetSUMBT using no uncertainty metrics run: ``` -./train_setsumbt_end_baseline.sh +python3 train.py --path setsumbt_config.json ``` diff --git a/convlab/dst/setsumbt/configs/setsumbt_multitask.json b/convlab/dst/setsumbt/configs/setsumbt_joint.json similarity index 61% rename from convlab/dst/setsumbt/configs/setsumbt_multitask.json rename to convlab/dst/setsumbt/configs/setsumbt_joint.json index c076a557cb3e1d567784c70559fb1922fe05c545..b97920f28538ebfd8f026e5c966434334e7257b4 100644 --- a/convlab/dst/setsumbt/configs/setsumbt_multitask.json +++ b/convlab/dst/setsumbt/configs/setsumbt_joint.json @@ -2,8 +2,8 @@ "model_type": "SetSUMBT", "dataset": "multiwoz21+sgd+tm1+tm2+tm3", "no_action_prediction": true, - "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base", - "transformers_local_files_only": true, + "model_name_or_path": "roberta-base", + "transformers_local_files_only": false, "train_batch_size": 3, "dev_batch_size": 8, "test_batch_size": 8, diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json index 0bff751c16f0bdcdf61f04ce33d616370c0d32d8..57a245518aae0a111f6220b1a088943b8b64ee4c 100644 --- a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json +++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json @@ -2,8 +2,9 @@ "model_type": "SetSUMBT", "dataset": "multiwoz21", "no_action_prediction": true, - "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base", - "transformers_local_files_only": true, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, "train_batch_size": 3, "dev_batch_size": 16, "test_batch_size": 16, diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b54276f657930e46d14a20e677685153217e6a --- /dev/null +++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json @@ -0,0 +1,16 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "dataset_train_ratio": 0.1, + "no_action_prediction": true, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, + "num_train_epochs": 500, + "patience": 50, + "warmup_proportion": 0.02, + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json new file mode 100644 index 0000000000000000000000000000000000000000..c902d0f7a456f453ff208f258594a4d090376628 --- /dev/null +++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json @@ -0,0 +1,16 
@@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "dataset_train_ratio": 0.01, + "no_action_prediction": true, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, + "num_train_epochs": 1000, + "patience": 100, + "warmup_proportion": 0.01, + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json new file mode 100644 index 0000000000000000000000000000000000000000..59c272c171d7e0b6bc1a2e87b49b18a863464681 --- /dev/null +++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json @@ -0,0 +1,12 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "no_action_prediction": false, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json new file mode 100644 index 0000000000000000000000000000000000000000..7ae2f58885d07aa057d73ba475e48b24dc6e6861 --- /dev/null +++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json @@ -0,0 +1,13 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "no_action_prediction": false, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, + "loss_function": "distillation", + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json new file mode 100644 index 0000000000000000000000000000000000000000..41f20072f3fa4706c5872c08e6b4b3f8b7bb1c47 --- /dev/null +++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json @@ -0,0 +1,13 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "no_action_prediction": false, + "model_name_or_path": "roberta-base", + "candidate_embedding_model_name": "roberta-base", + "transformers_local_files_only": false, + "loss_function": "distribution_distillation", + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/setsumbt_pretrain.json b/convlab/dst/setsumbt/configs/setsumbt_pretrain.json index fdc22d157840e7494b0266d0bd99f8a99d242969..3fa4104ceeca964986199a366a024bbdcf03847a 100644 --- a/convlab/dst/setsumbt/configs/setsumbt_pretrain.json +++ b/convlab/dst/setsumbt/configs/setsumbt_pretrain.json @@ -2,8 +2,8 @@ "model_type": "SetSUMBT", "dataset": "sgd+tm1+tm2+tm3", "no_action_prediction": true, - "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base", - "transformers_local_files_only": true, + "model_name_or_path": "roberta-base", + "transformers_local_files_only": false, "train_batch_size": 3, "dev_batch_size": 12, "test_batch_size": 12, diff --git a/convlab/dst/setsumbt/configs/setsumbt_sgd.json b/convlab/dst/setsumbt/configs/setsumbt_sgd.json index 97f5818334af4c7984ec24448861b627315820e3..7e6946126d6595e903bd03dfb346a9d1ba1068cd 100644 --- 
a/convlab/dst/setsumbt/configs/setsumbt_sgd.json +++ b/convlab/dst/setsumbt/configs/setsumbt_sgd.json @@ -2,8 +2,8 @@ "model_type": "SetSUMBT", "dataset": "sgd", "no_action_prediction": true, - "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base", - "transformers_local_files_only": true, + "model_name_or_path": "roberta-base", + "transformers_local_files_only": false, "train_batch_size": 3, "dev_batch_size": 6, "test_batch_size": 3, diff --git a/convlab/dst/setsumbt/configs/setsumbt_tm.json b/convlab/dst/setsumbt/configs/setsumbt_tm.json index 138f84c358067389d5f7b478ae94c3eb2aa90ea3..1cf54f443ee444a2035ecfb7e6ecb9ee7a9084bd 100644 --- a/convlab/dst/setsumbt/configs/setsumbt_tm.json +++ b/convlab/dst/setsumbt/configs/setsumbt_tm.json @@ -2,8 +2,8 @@ "model_type": "SetSUMBT", "dataset": "tm1+tm2+tm3", "no_action_prediction": true, - "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base", - "transformers_local_files_only": true, + "model_name_or_path": "roberta-base", + "transformers_local_files_only": false, "train_batch_size": 3, "dev_batch_size": 8, "test_batch_size": 8, diff --git a/convlab/dst/setsumbt/configs/sumbt_joint.json b/convlab/dst/setsumbt/configs/sumbt_joint.json new file mode 100644 index 0000000000000000000000000000000000000000..958a4261cfeaf5858112d225f994c8ccc6075c91 --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_joint.json @@ -0,0 +1,14 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21+sgd+tm1+tm2+tm3", + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "train_batch_size": 3, + "dev_batch_size": 8, + "test_batch_size": 8, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json b/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json new file mode 100644 index 0000000000000000000000000000000000000000..ebebe8a6631a03aff85a08ee6608b0d757f2a33d --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json @@ -0,0 +1,15 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "candidate_embedding_model_name": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json new file mode 100644 index 0000000000000000000000000000000000000000..f3e5454e34b4001b7dcab50975a15a75f58a44ba --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json @@ -0,0 +1,19 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "dataset_train_ratio": 0.1, + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "candidate_embedding_model_name": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "num_train_epochs": 500, + "patience": 50, + "warmup_proportion": 0.02, + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json 
b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json new file mode 100644 index 0000000000000000000000000000000000000000..45f684db422b695019c173ad670206129fee793c --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json @@ -0,0 +1,19 @@ +{ + "model_type": "SetSUMBT", + "dataset": "multiwoz21", + "dataset_train_ratio": 0.01, + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "candidate_embedding_model_name": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "num_train_epochs": 1000, + "patience": 100, + "warmup_proportion": 0.01, + "train_batch_size": 3, + "dev_batch_size": 16, + "test_batch_size": 16, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_pretrain.json b/convlab/dst/setsumbt/configs/sumbt_pretrain.json new file mode 100644 index 0000000000000000000000000000000000000000..bf8a22d6a93be24daf63c4931c1323fcfefe7d12 --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_pretrain.json @@ -0,0 +1,14 @@ +{ + "model_type": "SetSUMBT", + "dataset": "sgd+tm1+tm2+tm3", + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "train_batch_size": 3, + "dev_batch_size": 12, + "test_batch_size": 12, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_sgd.json b/convlab/dst/setsumbt/configs/sumbt_sgd.json new file mode 100644 index 0000000000000000000000000000000000000000..307bca67f2996e1ecf5a29eaf231804367964934 --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_sgd.json @@ -0,0 +1,14 @@ +{ + "model_type": "SetSUMBT", + "dataset": "sgd", + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "train_batch_size": 3, + "dev_batch_size": 6, + "test_batch_size": 3, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configs/sumbt_tm.json b/convlab/dst/setsumbt/configs/sumbt_tm.json new file mode 100644 index 0000000000000000000000000000000000000000..c1263fd384e37c0b708a8771e0412b2c8fbc9cd4 --- /dev/null +++ b/convlab/dst/setsumbt/configs/sumbt_tm.json @@ -0,0 +1,14 @@ +{ + "model_type": "SetSUMBT", + "dataset": "tm1+tm2+tm3", + "no_action_prediction": true, + "model_type": "bert", + "model_name_or_path": "bert-base-uncased", + "transformers_local_files_only": false, + "no_set_similarity": false, + "candidate_pooling": "cls", + "train_batch_size": 3, + "dev_batch_size": 8, + "test_batch_size": 8, + "run_nbt": true +} \ No newline at end of file diff --git a/convlab/dst/setsumbt/configure_ensemble.sh b/convlab/dst/setsumbt/configure_ensemble.sh deleted file mode 100755 index fc33df5a6f7e0bdbfae103b5999d212a73f7f4a3..0000000000000000000000000000000000000000 --- a/convlab/dst/setsumbt/configure_ensemble.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -ENSEMBLE_SIZE=10 -DATA_SIZE=7500 -SEED=$1 -OUT=$2 - -python run.py --run_nbt \ - --output_dir $OUT \ - --use_descriptions --set_similarity \ - --ensemble_size $ENSEMBLE_SIZE \ - --data_sampling_size $DATA_SIZE \ - --seed $SEED - -ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1)) -for e in $(seq 0 $ENSEMBLE_SIZE);do - mkdir -p "$OUT/ensemble-$e/dataloaders" - - mv "$OUT/ensemble-$e/train.dataloader" 
"$OUT/ensemble-$e/dataloaders/" - cp "$OUT/dataloaders/dev.dataloader" "$OUT/ensemble-$e/dataloaders/" - cp "$OUT/dataloaders/test.dataloader" "$OUT/ensemble-$e/dataloaders/" - cp -r $OUT/database "$OUT/ensemble-$e/" -done diff --git a/convlab/dst/setsumbt/configure_model.sh b/convlab/dst/setsumbt/configure_model.sh deleted file mode 100755 index cea833bf9df741409f9c7fb28454493acc08cc49..0000000000000000000000000000000000000000 --- a/convlab/dst/setsumbt/configure_model.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -IN=$1 -IN_DATA=$2 -OUT=$3 - -mkdir -p $OUT -cp "$IN/database/test.db" "$OUT/ontology.db" -cp "$IN_DATA/ontology_test.db" "$OUT/ontology.json" -cp "$IN/pytorch_model.bin" "$OUT/pytorch_model.bin" -cp "$IN/config.json" "$OUT/config.json" diff --git a/convlab/dst/setsumbt/distill_end.sh b/convlab/dst/setsumbt/distill_end.sh deleted file mode 100755 index 76db8dfdb64204938c1e657b87562a4731cdf3a2..0000000000000000000000000000000000000000 --- a/convlab/dst/setsumbt/distill_end.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -ENSEMBLE_SIZE=10 -SEED=$1 -OUT=$2 - -ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1)) -for e in $(seq 0 $ENSEMBLE_SIZE);do - cp "$OUT/ensemble-$e/pytorch_model.bin" "$OUT/pytorch_model_$e.bin" -done -cp "$OUT/ensemble-0/config.json" "$OUT/config.json" - -for SET in "train" "dev" "test";do - python distillation_setup.py --get_ensemble_distributions \ - --model_path $OUT \ - --model_type roberta \ - --set_type $SET \ - --ensemble_size $ENSEMBLE_SIZE \ - --reduction mean -done - -python distillation_setup.py --build_dataloaders \ - --model_path $OUT \ - --set_type train \ - --batch_size 3 - -for SET in "dev" "test";do - python distillation_setup.py --build_dataloaders \ - --model_path $OUT \ - --set_type $SET \ - --batch_size 16 -done - -python run.py --run_nbt \ - --output_dir $OUT \ - --loss_function distillation \ - --use_descriptions --set_similarity \ - --do_train --do_eval \ - --seed $SEED diff --git a/convlab/dst/setsumbt/distill_end2.sh b/convlab/dst/setsumbt/distill_end2.sh deleted file mode 100755 index 434375a2363a79d98f99a7eecd20d1dc44c4dbec..0000000000000000000000000000000000000000 --- a/convlab/dst/setsumbt/distill_end2.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -ENSEMBLE_SIZE=10 -SEED=$1 -OUT=$2 - -ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1)) -for e in $(seq 0 $ENSEMBLE_SIZE);do - cp "$OUT/ensemble-$e/pytorch_model.bin" "$OUT/pytorch_model_$e.bin" -done -cp "$OUT/ensemble-0/config.json" "$OUT/config.json" - -for SET in "train" "dev" "test";do - python distillation_setup.py --get_ensemble_distributions \ - --model_path $OUT \ - --model_type roberta \ - --set_type $SET \ - --ensemble_size $ENSEMBLE_SIZE \ - --reduction none -done - -python distillation_setup.py --build_dataloaders \ - --model_path $OUT \ - --set_type train \ - --batch_size 3 - -for SET in "dev" "test";do - python distillation_setup.py --build_dataloaders \ - --model_path $OUT \ - --set_type $SET \ - --batch_size 16 -done - -python run.py --run_nbt \ - --output_dir $OUT \ - --loss_function "distribution_distillation" \ - --use_descriptions --set_similarity \ - --do_train --do_eval \ - --seed $SEED diff --git a/convlab/dst/setsumbt/train_ensemble.sh b/convlab/dst/setsumbt/train_ensemble.sh deleted file mode 100755 index 911f8baa93966a1c3030655de97a263f492e7a0d..0000000000000000000000000000000000000000 --- a/convlab/dst/setsumbt/train_ensemble.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -ENSEMBLE_SIZE=10 -SEED=$1 -OUT=$2 - -ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1)) -for e in $(seq 0 $ENSEMBLE_SIZE);do - 
python run.py --run_nbt \ - --output_dir "$OUT/ensemble-$e" \ - --use_descriptions --set_similarity \ - --do_train --do_eval \ - --seed $SEED -done diff --git a/convlab/dst/sumbt/.gitignore b/convlab/dst/sumbt/.gitignore deleted file mode 100644 index 4629906dfe3c88b813e109039739c38a0a890228..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*/model_output/ diff --git a/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py b/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py deleted file mode 100755 index 6897678bc5dbf8760738cd118b836d0c0bb4c355..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py +++ /dev/null @@ -1,300 +0,0 @@ -import math -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch.nn import CrossEntropyLoss -from transformers import BertModel -from transformers import BertPreTrainedModel - - -class BertForUtteranceEncoding(BertPreTrainedModel): - def __init__(self, config): - super(BertForUtteranceEncoding, self).__init__(config) - - self.config = config - self.bert = BertModel(config) - - def forward(self, input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False): - - return self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, encoder_hidden_states=output_all_encoded_layers) - - -class MultiHeadAttention(nn.Module): - def __init__(self, heads, d_model, dropout=0.1): - super().__init__() - - self.d_model = d_model - self.d_k = d_model // heads - self.h = heads - - self.q_linear = nn.Linear(d_model, d_model) - self.v_linear = nn.Linear(d_model, d_model) - self.k_linear = nn.Linear(d_model, d_model) - self.dropout = nn.Dropout(dropout) - self.out = nn.Linear(d_model, d_model) - - self.scores = None - - def attention(self, q, k, v, d_k, mask=None, dropout=None): - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) - - if mask is not None: - mask = mask.unsqueeze(1) - scores = scores.masked_fill(mask == 0, -1e9) - scores = F.softmax(scores, dim=-1) - - if dropout is not None: - scores = dropout(scores) - - self.scores = scores - output = torch.matmul(scores, v) - return output - - def forward(self, q, k, v, mask=None): - bs = q.size(0) - - # perform linear operation and split into h heads - k = self.k_linear(k).view(bs, -1, self.h, self.d_k) - q = self.q_linear(q).view(bs, -1, self.h, self.d_k) - v = self.v_linear(v).view(bs, -1, self.h, self.d_k) - - # transpose to get dimensions bs * h * sl * d_model - k = k.transpose(1, 2) - q = q.transpose(1, 2) - v = v.transpose(1, 2) - - scores = self.attention(q, k, v, self.d_k, mask, self.dropout) - - # concatenate heads and put through final linear layer - concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model) - output = self.out(concat) - return output - - def get_scores(self): - return self.scores - - -class BeliefTracker(nn.Module): - def __init__(self, args, num_labels, device="cuda"): - super(BeliefTracker, self).__init__() - - self.hidden_dim = args.hidden_dim - self.rnn_num_layers = args.num_rnn_layers - self.zero_init_rnn = args.zero_init_rnn - self.max_seq_length = args.max_seq_length - self.max_label_length = args.max_label_length - self.num_labels = num_labels - self.num_slots = len(num_labels) - self.attn_head = args.attn_head - self.device = device - - ### Utterance Encoder - self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir) - 
self.utterance_encoder.train() - self.bert_output_dim = self.utterance_encoder.config.hidden_size - self.hidden_dropout_prob = self.utterance_encoder.config.hidden_dropout_prob - if args.fix_utterance_encoder: - for p in self.utterance_encoder.bert.pooler.parameters(): - p.requires_grad = False - - ### slot, slot-value Encoder (not trainable) - self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir) - self.sv_encoder.train() - for p in self.sv_encoder.bert.parameters(): - p.requires_grad = False - - self.slot_lookup = nn.Embedding(self.num_slots, self.bert_output_dim) - self.value_lookup = nn.ModuleList([nn.Embedding(num_label, self.bert_output_dim) for num_label in num_labels]) - - ### Attention layer - self.attn = MultiHeadAttention(self.attn_head, self.bert_output_dim, dropout=0) - - ### RNN Belief Tracker - self.nbt = None - if args.task_name.find("gru") != -1: - self.nbt = nn.GRU(input_size=self.bert_output_dim, - hidden_size=self.hidden_dim, - num_layers=self.rnn_num_layers, - dropout=self.hidden_dropout_prob, - batch_first=True) - self.init_parameter(self.nbt) - elif args.task_name.find("lstm") != -1: - self.nbt = nn.LSTM(input_size=self.bert_output_dim, - hidden_size=self.hidden_dim, - num_layers=self.rnn_num_layers, - dropout=self.hidden_dropout_prob, - batch_first=True) - self.init_parameter(self.nbt) - if not self.zero_init_rnn: - self.rnn_init_linear = nn.Sequential( - nn.Linear(self.bert_output_dim, self.hidden_dim), - nn.ReLU(), - nn.Dropout(self.hidden_dropout_prob) - ) - - self.linear = nn.Linear(self.hidden_dim, self.bert_output_dim) - self.layer_norm = nn.LayerNorm(self.bert_output_dim) - - ### Measure - self.distance_metric = args.distance_metric - if self.distance_metric == "cosine": - self.metric = torch.nn.CosineSimilarity(dim=-1, eps=1e-08) - elif self.distance_metric == "euclidean": - self.metric = torch.nn.PairwiseDistance(p=2.0, eps=1e-06, keepdim=False) - - ### Classifier - self.nll = CrossEntropyLoss(ignore_index=-1) - - ### Etc. 
- self.dropout = nn.Dropout(self.hidden_dropout_prob) - - # default evaluation mode - self.eval() - - def initialize_slot_value_lookup(self, label_ids, slot_ids): - - self.sv_encoder.eval() - - # Slot encoding - slot_type_ids = torch.zeros(slot_ids.size(), dtype=torch.long).to(self.device) - slot_mask = slot_ids > 0 - - hid_slot, _ = self.sv_encoder(slot_ids.view(-1, self.max_label_length), - slot_type_ids.view(-1, self.max_label_length), - slot_mask.view(-1, self.max_label_length), - output_all_encoded_layers=False) - hid_slot = hid_slot[:, 0, :] - hid_slot = hid_slot.detach() - self.slot_lookup = nn.Embedding.from_pretrained(hid_slot, freeze=True) - - for s, label_id in enumerate(label_ids): - label_type_ids = torch.zeros(label_id.size(), dtype=torch.long).to(self.device) - label_mask = label_id > 0 - hid_label, _ = self.sv_encoder(label_id.view(-1, self.max_label_length), - label_type_ids.view(-1, self.max_label_length), - label_mask.view(-1, self.max_label_length), - output_all_encoded_layers=False) - hid_label = hid_label[:, 0, :] - hid_label = hid_label.detach() - self.value_lookup[s] = nn.Embedding.from_pretrained(hid_label, freeze=True) - self.value_lookup[s].padding_idx = -1 - - print("Complete initialization of slot and value lookup") - - def _make_aux_tensors(self, ids, len): - token_type_ids = torch.zeros(ids.size(), dtype=torch.long).to(self.device) - for i in range(len.size(0)): - for j in range(len.size(1)): - if len[i, j, 0] == 0: # padding - break - elif len[i, j, 1] > 0: # escape only text_a case - start = len[i, j, 0] - ending = len[i, j, 0] + len[i, j, 1] - token_type_ids[i, j, start:ending] = 1 - attention_mask = ids > 0 - return token_type_ids, attention_mask - - def forward(self, input_ids, input_len, labels, n_gpu=1, target_slot=None): - - # if target_slot is not specified, output values corresponding all slot-types - if target_slot is None: - target_slot = list(range(0, self.num_slots)) - - ds = input_ids.size(0) # dialog size - ts = input_ids.size(1) # turn size - bs = ds * ts - slot_dim = len(target_slot) - - # Utterance encoding - token_type_ids, attention_mask = self._make_aux_tensors(input_ids, input_len) - - hidden, _ = self.utterance_encoder(input_ids.view(-1, self.max_seq_length), - token_type_ids.view(-1, self.max_seq_length), - attention_mask.view(-1, self.max_seq_length), - output_all_encoded_layers=False) - hidden = torch.mul(hidden, attention_mask.view(-1, self.max_seq_length, 1).expand(hidden.size()).float()) - hidden = hidden.repeat(slot_dim, 1, 1) # [(slot_dim*ds*ts), bert_seq, hid_size] - - hid_slot = self.slot_lookup.weight[target_slot, :] # Select target slot embedding - hid_slot = hid_slot.repeat(1, bs).view(bs * slot_dim, -1) # [(slot_dim*ds*ts), bert_seq, hid_size] - - # Attended utterance vector - hidden = self.attn(hid_slot, hidden, hidden, - mask=attention_mask.view(-1, 1, self.max_seq_length).repeat(slot_dim, 1, 1)) - hidden = hidden.squeeze() # [slot_dim*ds*ts, bert_dim] - hidden = hidden.view(slot_dim, ds, ts, -1).view(-1, ts, self.bert_output_dim) - - # NBT - if self.zero_init_rnn: - h = torch.zeros(self.rnn_num_layers, input_ids.shape[0] * slot_dim, self.hidden_dim).to( - self.device) # [1, slot_dim*ds, hidden] - else: - h = hidden[:, 0, :].unsqueeze(0).repeat(self.rnn_num_layers, 1, 1) - h = self.rnn_init_linear(h) - - if isinstance(self.nbt, nn.GRU): - rnn_out, _ = self.nbt(hidden, h) # [slot_dim*ds, turn, hidden] - elif isinstance(self.nbt, nn.LSTM): - c = torch.zeros(self.rnn_num_layers, input_ids.shape[0] * slot_dim, 
self.hidden_dim).to( - self.device) # [1, slot_dim*ds, hidden] - rnn_out, _ = self.nbt(hidden, (h, c)) # [slot_dim*ds, turn, hidden] - rnn_out = self.layer_norm(self.linear(self.dropout(rnn_out))) - - hidden = rnn_out.view(slot_dim, ds, ts, -1) - - # Label (slot-value) encoding - loss = 0 - loss_slot = [] - pred_slot = [] - output = [] - for s, slot_id in enumerate(target_slot): ## note: target_slots are successive - # loss calculation - hid_label = self.value_lookup[slot_id].weight - num_slot_labels = hid_label.size(0) - - _hid_label = hid_label.unsqueeze(0).unsqueeze(0).repeat(ds, ts, 1, 1).view(ds * ts * num_slot_labels, -1) - _hidden = hidden[s, :, :, :].unsqueeze(2).repeat(1, 1, num_slot_labels, 1).view(ds * ts * num_slot_labels, - -1) - _dist = self.metric(_hid_label, _hidden).view(ds, ts, num_slot_labels) - - if self.distance_metric == "euclidean": - _dist = -_dist - _, pred = torch.max(_dist, -1) - pred_slot.append(pred.view(ds, ts, 1)) - output.append(_dist) - - if labels is not None: - _loss = self.nll(_dist.view(ds * ts, -1), labels[:, :, s].view(-1)) - loss_slot.append(_loss.item()) - loss += _loss - - if labels is None: - return output, torch.cat(pred_slot, 2) - - # calculate joint accuracy - pred_slot = torch.cat(pred_slot, 2) - # print('pred slot:', pred_slot[0][0]) - # print('labels:', labels[0][0]) - accuracy = (pred_slot == labels).view(-1, slot_dim) - acc_slot = torch.sum(accuracy, 0).float() \ - / torch.sum(labels.view(-1, slot_dim) > -1, 0).float() - acc = sum(torch.sum(accuracy, 1) / slot_dim).float() \ - / torch.sum(labels[:, :, 0].view(-1) > -1, 0).float() # joint accuracy - - if n_gpu == 1: - return loss, loss_slot, acc, acc_slot, pred_slot - else: - return loss.unsqueeze(0), None, acc.unsqueeze(0), acc_slot.unsqueeze(0), pred_slot.unsqueeze(0) - - @staticmethod - def init_parameter(module): - if isinstance(module, nn.Linear): - torch.nn.init.xavier_normal_(module.weight) - torch.nn.init.constant_(module.bias, 0.0) - elif isinstance(module, nn.GRU) or isinstance(module, nn.LSTM): - torch.nn.init.xavier_normal_(module.weight_ih_l0) - torch.nn.init.xavier_normal_(module.weight_hh_l0) - torch.nn.init.constant_(module.bias_ih_l0, 0.0) - torch.nn.init.constant_(module.bias_hh_l0, 0.0) diff --git a/convlab/dst/sumbt/README.md b/convlab/dst/sumbt/README.md index 67d0a80ae67f417e93306d2808233bb967ddfcd2..7509ae2eed889bbc85f7eca3716d7dbfbc0beb6f 100755 --- a/convlab/dst/sumbt/README.md +++ b/convlab/dst/sumbt/README.md @@ -1,73 +1 @@ -# SUMBT on Multiwoz - -SUMBT (Slot-Utterance Matching Belief Tracker) is a belief tracking model that -utilizes semantic similarity between dialogue utterances and slot-values -, which is proposed by [Hwaran Lee et al., 2019](https://www.aclweb.org/anthology/P19-1546.pdf). - -The code derives from [github](https://github.com/SKTBrain/SUMBT). We modify it to support user DST. - -## Usage - - -### Train & Evaluate - -from Convlab root directory -```python -from convlab.dst.sumbt.multiwoz.sumbt import * -m = SUMBTTracker() -m.train() # will train and output the model checkpoint in the output_path defined in 'sumbt_config.py' file -# m.test(mode, model_path) # where testset in ['dev', 'test'], respectively run evaluation on dev/test set of MultiWoz, model_path specify the model you want to evaluate with. will create 2 files containing evaluation metrics in the output_path defined in config file. 
- -``` - - -### Track -from Convlab root directory -```python -from convlab.dst.sumbt.multiwoz.sumbt import * -test_update() -``` - -At the first run, the SumbtTracker will download a pre-trained model and save it into 'downloaded_model/' directory. - -## Data - -We use the multiwoz data. - -## Performance on Multiwoz - -`mode` determines the data we use: if mode=`usr`, use user utterances to train; if mode=`sys`, use system utterances to train. - -We evaluate the Joint accuracy and Slot accuracy on Multiwoz 2.0 validation and test set. -The accuracy on validation set are slightly higher than the results reported in the paper, -because in the evaluation code all undefined values in ontology are set `none` but predictions -will always be wrong for all undefined domain-slots. - -| | Joint acc | Slot acc | Joint acc (Restaurant) | Slot acc (Restaurant)| -| ----- | ----- | ------ | ------ | ---- | -| dev | 0.47 | 0.97 | 0.83 | 0.97 | -| test | 0.51 | 0.97 | 0.84 | 0.97 - -## Model Structure - -SUMBT considers a domain-slot type (e.g., 'restaurant-food') as a query and finds the corresponding -slot-value in a pair of system-user utterances, under the assumption that the answer appear in the utterances. - -The model encodes domain-slot with a fixed BERT model and encodes utterances with another BERT -of which parameters are fine-tuned during training. A MultiHead attention layer is -employed to capture slot-specific information, and the attention context vector is fed -into an RNN to model the flow of dialogues. - - -## Reference - -``` -@inproceedings{lee2019sumbt, - title={SUMBT: Slot-Utterance Matching for Universal and Scalable Belief Tracking}, - author={Lee, Hwaran and Lee, Jinsik and Kim, Tae-Yoon}, - booktitle={Proceedings of the 57th Conference of the Association for Computational Linguistics}, - pages={5478--5483}, - year={2019} -} -``` - +See the [SetSUMBT Code](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt) for the new SUMBT code. 
\ No newline at end of file diff --git a/convlab/dst/sumbt/__init__.py b/convlab/dst/sumbt/__init__.py deleted file mode 100755 index 71cad5770be9eb7bf253646f060d84fbc024270a..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker - diff --git a/convlab/dst/sumbt/crosswoz_en/.gitignore b/convlab/dst/sumbt/crosswoz_en/.gitignore deleted file mode 100644 index eb23bddd0762d03a8d0a3d25aaf033ef22948b43..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -model_output/ -pre-trained/ diff --git a/convlab/dst/sumbt/crosswoz_en/__init__.py b/convlab/dst/sumbt/crosswoz_en/__init__.py deleted file mode 100644 index 91fa08da403f338d95db032b621c74904e56f986..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from convlab.dst.sumbt.crosswoz_en.sumbt import SUMBTTracker as SUMBT diff --git a/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py b/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py deleted file mode 100644 index bb56b27334cdec6ffe98a3ad21efdb29d93e64e9..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py +++ /dev/null @@ -1,139 +0,0 @@ -import json -import zipfile -from convlab.dst.sumbt.crosswoz_en.sumbt_config import * - -null = 'none' - -def trans_value(value): - trans = { - '': 'none', - } - value = value.strip() - value = trans.get(value, value) - value = value.replace('’', "'") - value = value.replace('‘', "'") - return value - -def convert_to_glue_format(data_dir, sumbt_dir): - - if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)): - os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir)) - - ### Read ontology file - with open(os.path.join(data_dir, "ontology.json"), "r") as fp_ont: - data_ont = json.load(fp_ont) - ontology = {} - facilities = [] - for domain_slot in data_ont: - domain, slot = domain_slot.split('-', 1) - if domain not in ontology: - ontology[domain] = {} - if slot.startswith('Hotel Facilities'): - facilities.append(slot.split(' - ')[1]) - ontology[domain][slot] = set(map(str.lower, data_ont[domain_slot])) - - ### Read woz logs and write to tsv files - tsv_filename = os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv") - print('tsv file: ', os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")) - if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")): - print('data has been processed!') - return 0 - else: - print('processing data') - - with open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w") as fp_train, \ - open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w") as fp_dev, \ - open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w") as fp_test: - - fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - fp_train.write(f'{str(domain)}-{str(slot)}\t') - fp_dev.write(f'{str(domain)}-{str(slot)}\t') - fp_test.write(f'{str(domain)}-{str(slot)}\t') - - fp_train.write('\n') - fp_dev.write('\n') - fp_test.write('\n') - - # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r") - # data = json.load(fp_data) - - 
file_split = ['train', 'val', 'test'] - fp = [fp_train, fp_dev, fp_test] - - for split_type, split_fp in zip(file_split, fp): - - zipfile_name = "{}.json.zip".format(split_type) - zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name)) - data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8')) - - for file_id in data: - user_utterance = '' - system_response = '' - turn_idx = 0 - messages = data[file_id]['messages'] - for idx, turn in enumerate(messages): - if idx % 2 == 0: # user turn - user_utterance = turn['content'] - else: # system turn - user_utterance = user_utterance.replace('\t', ' ') - user_utterance = user_utterance.replace('\n', ' ') - user_utterance = user_utterance.replace(' ', ' ') - - system_response = system_response.replace('\t', ' ') - system_response = system_response.replace('\n', ' ') - system_response = system_response.replace(' ', ' ') - - split_fp.write(str(file_id)) # 0: dialogue ID - split_fp.write('\t' + str(turn_idx)) # 1: turn index - split_fp.write('\t' + str(user_utterance)) # 2: user utterance - split_fp.write('\t' + str(system_response)) # 3: system response - - # hardcode the value of facilities as 'yes' and 'no' - belief = {f'Hotel-Hotel Facilities - {str(facility)}': null for facility in facilities} - sys_state_init = turn['sys_state_init'] - for domain, slots in sys_state_init.items(): - for slot, value in slots.items(): - # skip selected results - if isinstance(value, list): - continue - if domain not in ontology: - print("domain (%s) is not defined" % domain) - continue - - if slot == 'Hotel Facilities': - for facility in value.split(','): - belief[f'{str(domain)}-Hotel Facilities - {str(facility)}'] = 'yes' - else: - if slot not in ontology[domain]: - print("slot (%s) in domain (%s) is not defined" % (slot, domain)) # bus-arriveBy not defined - continue - - value = trans_value(value).lower() - - if value not in ontology[domain][slot] and value != null: - print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" % - (file_id, value, domain, slot)) - value = null - - belief[f'{str(domain)}-{str(slot)}'] = value - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - key = str(domain) + '-' + str(slot) - if key in belief: - val = belief[key] - split_fp.write('\t' + val) - else: - split_fp.write(f'\t{null}') - - split_fp.write('\n') - split_fp.flush() - - system_response = turn['content'] - turn_idx += 1 - print('data has been processed!') diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt.py b/convlab/dst/sumbt/crosswoz_en/sumbt.py deleted file mode 100644 index 8b0b5cc82c5588f591d97f1292fc6eb61f22d98d..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/sumbt.py +++ /dev/null @@ -1,683 +0,0 @@ -import copy -from pprint import pprint -import random -from itertools import chain -import numpy as np -import zipfile - -from matplotlib import pyplot as plt - -from tensorboardX.writer import SummaryWriter -from tqdm._tqdm import trange, tqdm - -from convlab.util.file_util import cached_path - -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler - -from transformers import BertTokenizer -from transformers import get_linear_schedule_with_warmup, AdamW - -from convlab.dst.dst import DST -from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import convert_to_glue_format, trans_value -from convlab.util.crosswoz_en.state import default_state - -from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker -from 
convlab.dst.sumbt.crosswoz_en.sumbt_utils import * -from convlab.dst.sumbt.crosswoz_en.sumbt_config import * - -from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import null - -USE_CUDA = torch.cuda.is_available() -N_GPU = torch.cuda.device_count() if USE_CUDA else 1 -DEVICE = "cuda" if USE_CUDA else "cpu" -ROOT_PATH = convlab.get_root_path() -SUMBT_PATH = os.path.dirname(os.path.abspath(__file__)) -DATA_PATH = os.path.join(ROOT_PATH, 'data/crosswoz_en') -DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, "pre-trained/") -crosswoz_en_slot_list = ['Attraction-duration', 'Attraction-fee', 'Attraction-name', 'Attraction-nearby attract.', 'Attraction-nearby hotels', 'Attraction-nearby rest.', 'Attraction-rating', 'Hotel-Hotel Facilities - 24-hour Hot Water', 'Hotel-Hotel Facilities - Bar', 'Hotel-Hotel Facilities - Breakfast Service', 'Hotel-Hotel Facilities - Broadband Internet', 'Hotel-Hotel Facilities - Business Center', 'Hotel-Hotel Facilities - Car Rental', 'Hotel-Hotel Facilities - Chess-Poker Room', 'Hotel-Hotel Facilities - Childcare Services', 'Hotel-Hotel Facilities - Chinese Restaurant', 'Hotel-Hotel Facilities - Disabled Facilities', 'Hotel-Hotel Facilities - Foreign Guests Reception', 'Hotel-Hotel Facilities - Free Breakfast Service', 'Hotel-Hotel Facilities - Free Domestic Long Distance Call', 'Hotel-Hotel Facilities - Free Local Calls', 'Hotel-Hotel Facilities - Gym', 'Hotel-Hotel Facilities - Hair Dryer', 'Hotel-Hotel Facilities - Heating', 'Hotel-Hotel Facilities - Hot Spring', 'Hotel-Hotel Facilities - Indoor Swimming Pool', 'Hotel-Hotel Facilities - International Call', 'Hotel-Hotel Facilities - Laundry Service', 'Hotel-Hotel Facilities - Luggage Storage', 'Hotel-Hotel Facilities - Meeting Room', 'Hotel-Hotel Facilities - Non-smoking Room', 'Hotel-Hotel Facilities - Outdoor Swimming Pool', 'Hotel-Hotel Facilities - Pay Parking', 'Hotel-Hotel Facilities - Pick-up Service', 'Hotel-Hotel Facilities - SPA', 'Hotel-Hotel Facilities - Sauna', 'Hotel-Hotel Facilities - Wake Up Service', 'Hotel-Hotel Facilities - Western Restaurant', 'Hotel-Hotel Facilities - WiFi in All Rooms', 'Hotel-Hotel Facilities - WiFi in Public Areas', 'Hotel-Hotel Facilities - WiFi in Public Areas and Some Rooms', 'Hotel-Hotel Facilities - WiFi in Some Rooms', 'Hotel-Hotel Facilities - WiFi throughout the Hotel', 'Hotel-name', 'Hotel-nearby attract.', 'Hotel-nearby hotels', 'Hotel-nearby rest.', 'Hotel-price', 'Hotel-rating', 'Hotel-type', 'Metro-from', 'Metro-to', 'Restaurant-cost', 'Restaurant-dishes', 'Restaurant-name', 'Restaurant-nearby attract.', 'Restaurant-nearby hotels', 'Restaurant-nearby rest.', 'Restaurant-rating', 'Taxi-from', 'Taxi-to'] - -def plot(x, y): - a, b = [], [] - for x, y in sorted(zip(x, y)): - a.append(x) - b.append(y) - plt.plot(a, b) - -# def get_label_embedding(labels, max_seq_length, tokenizer, device): -# features = [] -# for label in labels: -# label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] -# label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens) -# label_len = len(label_token_ids) - -# label_padding = [0] * (max_seq_length - len(label_token_ids)) -# label_token_ids += label_padding -# assert len(label_token_ids) == max_seq_length - -# features.append((label_token_ids, label_len)) - -# all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) -# all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - -# return all_label_token_ids, all_label_len - - -def _truncate_seq_pair(tokens_a, 
tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class SUMBTTracker(DST): - """ - Transferable multi-domain dialogue state tracker, adopted from https://github.com/SKTBrain/SUMBT - """ - - @staticmethod - def init_data(): - if not os.path.exists(os.path.join(DATA_PATH, 'train.json.zip')): - with zipfile.ZipFile(os.path.join(DATA_PATH, 'mt.zip')) as f: - f.extractall(DATA_PATH) - - for split in ['train', 'test', 'val']: - with zipfile.ZipFile(os.path.join(DATA_PATH, f'{split}.json.zip'), 'w') as f: - f.write(os.path.join(DATA_PATH, f'{split}.json'), f'{split}.json') - - def __init__(self, data_dir=DATA_PATH): - - DST.__init__(self) - - # if not os.path.exists(data_dir): - # if model_file == '': - # raise Exception( - # 'Please provide remote model file path in config') - # resp = urllib.request.urlretrieve(model_file)[0] - # temp_file = tarfile.open(resp) - # temp_file.extractall('data') - # assert os.path.exists(data_dir) - - processor = Processor(args) - self.processor = processor - # values of each slot e.g. values_list - label_list = processor.get_labels() - num_labels = [len(labels) for labels in label_list] # number of slot-values in each slot-type - - # tokenizer - self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir) - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - self.device = torch.device("cuda" if USE_CUDA else "cpu") - - self.sumbt_model = BeliefTracker(args, num_labels, self.device) - if USE_CUDA and N_GPU > 1: - self.sumbt_model = torch.nn.DataParallel(self.sumbt_model) - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(self.device) - - ## Get slot-value embeddings - self.label_token_ids, self.label_len = [], [] - for labels in label_list: - # encoding values - token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device) - self.label_token_ids.append(token_ids) - self.label_len.append(lens) - self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list] - self.label_list = label_list - self.target_slot = processor.target_slot - ## Get domain-slot-type embeddings - self.slot_token_ids, self.slot_len = \ - get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device) - - self.args = args - self.state = default_state() - self.param_restored = False - if USE_CUDA and N_GPU == 1: - self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - elif USE_CUDA and N_GPU > 1: - self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - - self.cached_res = {} - convert_to_glue_format(DATA_PATH, SUMBT_PATH) - if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)): - os.makedirs(os.path.join(SUMBT_PATH, args.output_dir)) - self.train_examples = 
processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - - def load_weights(self, model_path=None): - if model_path is None: - model_ckpt = os.path.join(SUMBT_PATH, 'pre-trained/pytorch_model.bin') - else: - model_ckpt = model_path - model = self.sumbt_model - # in the case that slot and values are different between the training and evaluation - if not USE_CUDA: - ptr_model = torch.load(model_ckpt, map_location=torch.device('cpu')) - else: - ptr_model = torch.load(model_ckpt) - print('loading pretrained weights') - - if not USE_CUDA or N_GPU == 1: - state = model.state_dict() - state.update(ptr_model) - model.load_state_dict(state) - else: - # print("Evaluate using only one device!") - model.module.load_state_dict(ptr_model) - - if USE_CUDA: - model.to("cuda") - - def init_session(self): - self.state = default_state() - if not self.param_restored: - if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')): - print('loading weights from downloaded model') - self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')) - elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')): - print('loading weights from trained model') - self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')) - else: - raise ValueError('no available weights found.') - self.param_restored = True - - def construct_query(self, context): - '''Construct query from context''' - ids = [] - lens = [] - context_len = len(context) - if context[0][0] != 'sys': - context = [['sys', '']] + context - for i in range(0, context_len, 2): - # utt_user = '' - # utt_sys = '' - # for evaluation - utt_sys = context[i][1] - utt_user = context[i + 1][1] - - tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)] - tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)] - - _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3) - tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"] - input_len = [len(tokens_user) + 2, len(tokens_sys) + 1] - - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - padding = [0] * (self.args.max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == self.args.max_seq_length - ids.append(input_ids) - lens.append(input_len) - - return (ids, lens) - - - def update(self, user_act=None): - if not isinstance(user_act, str): - raise Exception( - 'Expected user_act is str but found {}'.format(type(user_act)) - ) - prev_state = self.state - - actual_history = copy.deepcopy(prev_state['history']) - - # if actual_history[-1][0] == 'user': - # actual_history[-1][1] += user_act - # else: - # actual_history.append(['user', user_act]) - query = self.construct_query(actual_history) - pred_states = self.predict(query) - - new_belief_state = copy.deepcopy(prev_state['belief_state']) - for domain_slot, value in pred_states: - domain, slot = domain_slot.split('-', 1) - value = trans_value(value) - - # print(domain, slot, value) - - if domain not in new_belief_state: - raise Exception( - 'Error: domain <{}> not in belief state'.format(domain)) - - domain_dic = new_belief_state[domain] - if slot in domain_dic: - domain_dic[slot] = value - else: - with 
open('sumbt_tracker_unknown_slot.log', 'a+') as f: - f.write( - 'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, state) - ) - - new_state = copy.deepcopy(dict(prev_state)) - new_state['belief_state'] = new_belief_state - self.state = new_state - return self.state - - def predict(self, query): - cache_query_key = ''.join(str(list(chain.from_iterable(query[0])))) - if cache_query_key in self.cached_res.keys(): - return self.cached_res[cache_query_key] - - input_ids, input_len = query - input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0) - input_len = torch.tensor(input_len).to(self.device).unsqueeze(0) - labels = None - _, pred_slot = self.sumbt_model(input_ids, input_len, labels) - pred_slot_t = pred_slot[0][-1].tolist() - predict_belief = [] - for idx, i in enumerate(pred_slot_t): - predict_belief.append((self.target_slot[idx], self.label_map_inv[idx][i])) - # predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i])) - self.cached_res[cache_query_key] = predict_belief - - return predict_belief - - def train(self, load_model=False, model_path=None): - if load_model: - if model_path is not None: - self.load_weights(model_path) - ## Training utterances - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - print('all input ids size: ', all_input_ids.size()) - num_train_batches = all_input_ids.size(0) - num_train_steps = int( - num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) - - logger.info("***** training *****") - logger.info(" Num examples = %d", len(self.train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_steps) - - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - - train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features( - self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - logger.info("***** validation *****") - logger.info(" Num examples = %d", len(self.dev_examples)) - logger.info(" Batch size = %d", args.dev_batch_size) - - all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \ - all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE) - - dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev) - dev_sampler = SequentialSampler(dev_data) - dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size) - - logger.info("Loaded data!") - - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(DEVICE) - - # ## Get domain-slot-type embeddings - # slot_token_ids, slot_len = \ - # get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE) - - # # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot): - # # self.idx2slot[slot_idx] = slot_str - - # ## Get slot-value embeddings - # label_token_ids, label_len = [], [] - # for slot_idx, labels in zip(slot_token_ids, self.label_list): - # # self.idx2value[slot_idx] = {} - # token_ids, lens = 
get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE) - # label_token_ids.append(token_ids) - # label_len.append(lens) - # # for label, token_id in zip(labels, token_ids): - # # self.idx2value[slot_idx][token_id] = label - - # logger.info('embeddings prepared') - - # if USE_CUDA and N_GPU > 1: - # self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - # else: - # self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - - def get_optimizer_grouped_parameters(model): - param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, - 'lr': args.learning_rate}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, - 'lr': args.learning_rate}, - ] - return optimizer_grouped_parameters - - if not USE_CUDA or N_GPU == 1: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model) - else: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module) - - t_total = num_train_steps - - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.fp16_loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale) - - else: - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total) - logger.info(optimizer) - - # Training code - ############################################################################### - - print(torch.cuda.memory_allocated()) - - logger.info("Training...") - - global_step = 0 - last_update = None - best_loss = None - model = self.sumbt_model - if not args.do_not_use_tensorboard: - summary_writer = None - else: - summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/") - - for epoch in trange(int(args.num_train_epochs), desc="Epoch"): - # Train - model.train() - tr_loss = 0 - nb_tr_examples = 0 - nb_tr_steps = 0 - - for step, batch in enumerate(tqdm(train_dataloader)): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - # print(input_ids.size()) - - # Forward - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - # Backward - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - # tensrboard logging - if summary_writer is not None: - summary_writer.add_scalar("Epoch", epoch, global_step) - summary_writer.add_scalar("Train/Loss", loss, global_step) - summary_writer.add_scalar("Train/JointAcc", acc, global_step) - if 
N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i], - global_step) - summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step) - - tr_loss += loss.item() - nb_tr_examples += input_ids.size(0) - nb_tr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - # modify lealrning rate with special warm up BERT uses - lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) - if summary_writer is not None: - summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - if scheduler is not None: - torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0) - optimizer.step() - if scheduler is not None: - scheduler.step() - optimizer.zero_grad() - global_step += 1 - - - # Perform evaluation on validation dataset - model.eval() - dev_loss = 0 - dev_acc = 0 - dev_loss_slot, dev_acc_slot = None, None - nb_dev_examples, nb_dev_steps = 0, 0 - - for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - if input_ids.dim() == 2: - input_ids = input_ids.unsqueeze(0) - input_len = input_len.unsqueeze(0) - label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item() - dev_loss += loss.item() * num_valid_turn - dev_acc += acc.item() * num_valid_turn - - if N_GPU == 1: - if dev_loss_slot is None: - dev_loss_slot = [l * num_valid_turn for l in loss_slot] - dev_acc_slot = acc_slot * num_valid_turn - else: - for i, l in enumerate(loss_slot): - dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn - dev_acc_slot += acc_slot * num_valid_turn - - nb_dev_examples += num_valid_turn - - - dev_loss = dev_loss / nb_dev_examples - dev_acc = dev_acc / nb_dev_examples - - if N_GPU == 1: - dev_acc_slot = dev_acc_slot / nb_dev_examples - - # tensorboard logging - if summary_writer is not None: - summary_writer.add_scalar("Validate/Loss", dev_loss, global_step) - summary_writer.add_scalar("Validate/Acc", dev_acc, global_step) - if N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'), - dev_loss_slot[i] / nb_dev_examples, global_step) - summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i], - global_step) - - dev_loss = round(dev_loss, 6) - - output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin") - - if last_update is None or dev_loss < best_loss: - last_update = epoch - best_loss = dev_loss - best_acc = dev_acc - if not USE_CUDA or N_GPU == 1: - torch.save(model.state_dict(), output_model_file) - else: - torch.save(model.module.state_dict(), output_model_file) - - logger.info( - "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % ( - last_update, best_loss, best_acc, global_step)) - else: - logger.info( - "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, 
global_step=%d ***" % ( - epoch, dev_loss, dev_acc, global_step)) - - if last_update + args.patience <= epoch: - break - - def test(self, mode='dev', model_path=os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")): - '''Testing funciton of TRADE (to be added)''' - # Evaluation - self.load_weights(model_path) - - if mode == 'test': - eval_examples = self.dev_examples - elif mode == 'dev': - eval_examples = self.test_examples - - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.dev_batch_size) - - eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size) - - model = self.sumbt_model - eval_loss, eval_accuracy = 0, 0 - eval_loss_slot, eval_acc_slot = None, None - nb_eval_steps, nb_eval_examples = 0, 0 - - accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0, - 'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0} - - for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - # if input_ids.dim() == 2: - # input_ids = input_ids.unsqueeze(0) - # input_len = input_len.unsqueeze(0) - # label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if not USE_CUDA or N_GPU == 1: - loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1) - else: - loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU) - nbatch = label_ids.size(0) - nslot = pred_slot.size(3) - pred_slot = pred_slot.view(nbatch, -1, nslot) - - accuracies = eval_all_accs(pred_slot, label_ids, accuracies) - - nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item() - nb_eval_examples += nb_eval_ex - nb_eval_steps += 1 - - if not USE_CUDA or N_GPU == 1: - eval_loss += loss.item() * nb_eval_ex - eval_accuracy += acc.item() * nb_eval_ex - if eval_loss_slot is None: - eval_loss_slot = [l * nb_eval_ex for l in loss_slot] - eval_acc_slot = acc_slot * nb_eval_ex - else: - for i, l in enumerate(loss_slot): - eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex - eval_acc_slot += acc_slot * nb_eval_ex - else: - eval_loss += sum(loss) * nb_eval_ex - eval_accuracy += sum(acc) * nb_eval_ex - - # exit(1) - - eval_loss = eval_loss / nb_eval_examples - eval_accuracy = eval_accuracy / nb_eval_examples - if not USE_CUDA or N_GPU == 1: - eval_acc_slot = eval_acc_slot / nb_eval_examples - - loss = None - - if not USE_CUDA or N_GPU == 1: - result = { - # 'num': '\t'.join([str(x) for x in model.num_labels]), - 'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss, - 'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]), - 'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]), - } - else: - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss - } - - out_file_name = 'eval_results' - # if TARGET_SLOT == 'all': - # out_file_name += '_all' - output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % 
out_file_name) - - if not USE_CUDA or N_GPU == 1: - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - out_file_name = 'eval_all_accuracies' - with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f: - s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format( - 'joint acc (7 domain)', - 'slot acc (7 domain)', - 'joint acc (5 domain)', - 'slot acc (5 domain)', - 'joint restaurant', - 'slot acc restaurant') - f.write(s + '\n') - print(s) - s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format( - (accuracies['joint7'] / accuracies['num_turn']).item(), - (accuracies['slot7'] / accuracies['num_slot7']).item(), - (accuracies['joint5'] / accuracies['num_turn']).item(), - (accuracies['slot5'] / accuracies['num_slot5']).item(), - (accuracies['joint_rest'] / accuracies['num_turn']).item(), - (accuracies['slot_rest'] / accuracies['num_slot_rest']).item() - ) - f.write(s + '\n') - print(s) diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt_config.py b/convlab/dst/sumbt/crosswoz_en/sumbt_config.py deleted file mode 100644 index a31551177b947697bb751799191af2ed6a25a703..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/sumbt_config.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import convlab -class DotMap(): - def __init__(self): - self.max_label_length = 35 - self.num_rnn_layers = 1 - self.zero_init_rnn = False - self.attn_head = 4 - self.do_eval = True - self.do_train = False - self.train_batch_size = 3 - self.dev_batch_size = 1 - self.eval_batch_size = 16 - self.learning_rate = 5e-5 - self.warmup_proportion = 0.1 - self.local_rank = -1 - self.seed = 42 - self.gradient_accumulation_steps = 1 - self.fp16 = False - self.loss_scale = 0 - self.do_not_use_tensorboard = False - self.fix_utterance_encoder = False - self.do_eval = True - self.num_train_epochs = 300 - - self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/bert-base-uncased") - self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/") - self.bert_model_name = "bert-base-uncased" - self.do_lower_case = True - self.task_name = 'bert-gru-sumbt' - self.nbt = 'rnn' - self.target_slot = 'all' - self.distance_metric = 'euclidean' - self.patience = 15 - - self.hidden_dim = 300 - self.max_seq_length = 35 - self.max_turn_length = 23 - - self.fp16_loss_scale = 0.0 - self.data_dir = 'data/crosswoz_en/' - self.tf_dir = 'tensorboard' - self.tmp_data_dir = 'processed_data/' - self.output_dir = 'model_output/' - -args = DotMap() \ No newline at end of file diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py b/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py deleted file mode 100644 index a67a291a4f8d9ef4d1cd6762742faaa151f109e3..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py +++ /dev/null @@ -1,449 +0,0 @@ -import csv -import os -import json -import collections -import logging -import re -import torch - -from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import null - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO) -logger = logging.getLogger(__name__) - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, 
data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding='utf-8') as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if len(line) > 0 and line[0][0] == '#': # ignore comments (starting with '#') - continue - lines.append(line) - return lines - - -class Processor(DataProcessor): - """Processor for the belief tracking dataset (GLUE version).""" - - def __init__(self, config): - super(Processor, self).__init__() - - # crosswoz dataset - with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), config.data_dir, "ontology.json"), "r") as fp_ontology: - ontology = json.load(fp_ontology) - for slot in ontology.keys(): - ontology[slot].append(null) - - assert config.target_slot == 'all' - # if not config.target_slot == 'all': - # slot_idx = {'attraction': '0:1:2', 'bus': '3:4:5:6', 'hospital': '7', - # 'hotel': '8:9:10:11:12:13:14:15:16:17', \ - # 'restaurant': '18:19:20:21:22:23:24', 'taxi': '25:26:27:28', 'train': '29:30:31:32:33:34'} - # target_slot = [] - # for key, value in slot_idx.items(): - # if key != config.target_slot: - # target_slot.append(value) - # config.target_slot = ':'.join(target_slot) - - # sorting the ontology according to the alphabetic order of the slots - ontology = collections.OrderedDict(sorted(ontology.items())) - - # select slots to train - nslots = len(ontology.keys()) - target_slot = list(ontology.keys()) - if config.target_slot == 'all': - self.target_slot_idx = [*range(0, nslots)] - else: - self.target_slot_idx = sorted([int(x) for x in config.target_slot.split(':')]) - - for idx in range(0, nslots): - if not idx in self.target_slot_idx: - del ontology[target_slot[idx]] - - self.ontology = ontology - self.target_slot = list(self.ontology.keys()) - # for i, slot in enumerate(self.target_slot): - # if slot == "pricerange": - # self.target_slot[i] = "price range" - logger.info('Processor: target_slot') - logger.info(self.target_slot) - - def get_train_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation) - - def get_dev_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation) - - def get_test_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation) - - def get_labels(self): - """See base class.""" - return [list(map(str.lower, self.ontology[slot])) for slot in self.target_slot] - - def _create_examples(self, lines, set_type, accumulation=False): - """Creates examples for the training and dev sets.""" - prev_dialogue_index = None - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s-%s" % (set_type, line[0], line[1]) # line[0]: dialogue index, line[1]: turn index - if accumulation: - if prev_dialogue_index is None or prev_dialogue_index != 
line[0]: - text_a = line[2] - text_b = line[3] - prev_dialogue_index = line[0] - else: - # The symbol '#' will be replaced with '[SEP]' after tokenization. - text_a = line[2] + " # " + text_a - text_b = line[3] + " # " + text_b - else: - text_a = line[2] # line[2]: user utterance - text_b = line[3] # line[3]: system response - - label = [line[4 + idx] for idx in self.target_slot_idx] - - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def normalize_text(text): - global replacements - # lower case every word - text = text.lower() - # replace white spaces in front and end - text = re.sub(r'^\s*|\s*$', '', text) - - # hotel domain pfb30 - text = re.sub(r"b&b", "bed and breakfast", text) - text = re.sub(r"b and b", "bed and breakfast", text) - - # replace st. - text = text.replace(';', ',') - text = re.sub('$\/', '', text) - text = text.replace('/', ' and ') - - # replace other special characters - text = text.replace('-', ' ') - text = re.sub('[\"\<>@\(\)]', '', text) # remove - - # insert white space before and after tokens: - for token in ['?', '.', ',', '!']: - text = insertSpace(token, text) - - # insert white space for 's - text = insertSpace('\'s', text) - - # replace it's, does't, you'd ... etc - text = re.sub('^\'', '', text) - text = re.sub('\'$', '', text) - text = re.sub('\'\s', ' ', text) - text = re.sub('\s\'', ' ', text) - for fromx, tox in replacements: - text = ' ' + text + ' ' - text = text.replace(fromx, tox)[1:-1] - - # remove multiple spaces - text = re.sub(' +', ' ', text) - - # concatenate numbers - tmp = text - tokens = text.split() - i = 1 - while i < len(tokens): - if re.match(u'^\d+$', tokens[i]) and \ - re.match(u'\d+$', tokens[i - 1]): - tokens[i - 1] += tokens[i] - del tokens[i] - else: - i += 1 - text = ' '.join(tokens) - - return text - - -def insertSpace(token, text): - sidx = 0 - while True: - sidx = text.find(token, sidx) - if sidx == -1: - break - if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \ - re.match('[0-9]', text[sidx + 1]): - sidx += 1 - continue - if text[sidx - 1] != ' ': - text = text[:sidx] + ' ' + text[sidx:] - sidx += 1 - if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ': - text = text[:sidx + 1] + ' ' + text[sidx + 1:] - sidx += 1 - return text - -# convert tokens in labels to the identifier in vocabulary -def get_label_embedding(labels, max_seq_length, tokenizer, device): - features = [] - for label in labels: - label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] - # just truncate, some names are unreasonable long - label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)[:max_seq_length] - label_len = len(label_token_ids) - - label_padding = [0] * (max_seq_length - len(label_token_ids)) - label_token_ids += label_padding - assert len(label_token_ids) == max_seq_length - - features.append((label_token_ids, label_len)) - - all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) - all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - - return all_label_token_ids, all_label_len - - -def warmup_linear(x, warmup=0.002): - if x < warmup: - return x / warmup - return 1.0 - x - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. 
This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_len, label_id): - self.input_ids = input_ids - self.input_len = input_len - self.label_id = label_id - - -def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - slot_dim = len(label_list) - - features = [] - prev_dialogue_idx = None - all_padding = [0] * max_seq_length - all_padding_len = [0, 0] - - max_turn = 0 - for (ex_index, example) in enumerate(examples): - if max_turn < int(example.guid.split('-')[2]): - max_turn = int(example.guid.split('-')[2]) - max_turn_length = min(max_turn + 1, max_turn_length) - logger.info("max_turn_length = %d" % max_turn) - - for (ex_index, example) in enumerate(examples): - tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)] - tokens_b = None - if example.text_b: - tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)] - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. 
- - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - input_len = [len(tokens), 0] - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - input_len[1] = len(tokens_b) + 1 - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # Zero-pad up to the sequence length. - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == max_seq_length - - FLAG_TEST = False - if example.label is not None: - label_id = [] - label_info = 'label: ' - for i, label in enumerate(example.label): - if label == 'dontcare': - label = 'do not care' - label_id.append(label_map[i][label]) - label_info += '%s (id = %d) ' % (label, label_map[i][label]) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % example.guid) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_len: %s" % " ".join([str(x) for x in input_len])) - logger.info("label: " + label_info) - else: - FLAG_TEST = True - label_id = None - - curr_dialogue_idx = example.guid.split('-')[1] - curr_turn_idx = int(example.guid.split('-')[2]) - - if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx: - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - # print(len(features), max_turn_length) - assert len(features) % max_turn_length == 0 - - if prev_dialogue_idx is None or prev_turn_idx < max_turn_length: - features.append( - InputFeatures(input_ids=input_ids, - input_len=input_len, - label_id=label_id)) - - prev_dialogue_idx = curr_dialogue_idx - prev_turn_idx = curr_turn_idx - - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - assert len(features) % max_turn_length == 0 - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long) - if not FLAG_TEST: - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - - # reshape tensors to [#batch, #max_turn_length, #max_seq_length] - all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length) - all_input_len = all_input_len.view(-1, max_turn_length, 2) - if not FLAG_TEST: - all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim) - else: - all_label_ids = None - - return all_input_ids, all_input_len, all_label_ids - - -def eval_all_accs(pred_slot, labels, accuracies): - - def _eval_acc(_pred_slot, _labels): - slot_dim = _labels.size(-1) - accuracy = (_pred_slot == _labels).view(-1, slot_dim) - num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float() - num_data = torch.sum(_labels > -1).float() - # joint accuracy - joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float() - # slot accuracy - slot_acc = torch.sum(accuracy).float() - return joint_acc, slot_acc, num_turn, num_data - - # 7 domains - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels) - accuracies['joint7'] += joint_acc - accuracies['slot7'] += slot_acc - accuracies['num_turn'] += num_turn - accuracies['num_slot7'] += num_data - - # restaurant domain - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25]) - accuracies['joint_rest'] += joint_acc - accuracies['slot_rest'] += 
slot_acc - accuracies['num_slot_rest'] += num_data - - pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2) - label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2) - - # 5 domains (excluding bus and hotel domain) - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5) - accuracies['joint5'] += joint_acc - accuracies['slot5'] += slot_acc - accuracies['num_slot5'] += num_data - - return accuracies diff --git a/convlab/dst/sumbt/multiwoz/__init__.py b/convlab/dst/sumbt/multiwoz/__init__.py deleted file mode 100755 index 2072ea48e1cb4f7c682c4d5dbf9be152db9ad4d0..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from convlab.dst.sumbt.multiwoz.sumbt import SUMBTTracker as SUMBT diff --git a/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py b/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py deleted file mode 100755 index f6645d11ccc564bb13bb251d6b1b01291ef04b34..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py +++ /dev/null @@ -1,167 +0,0 @@ -import json -import zipfile -from convlab.dst.sumbt.multiwoz.sumbt_config import * - - -def convert_to_glue_format(data_dir, sumbt_dir): - - if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)): - os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir)) - - ### Read ontology file - fp_ont = open(os.path.join(data_dir, "ontology_sumbt.json"), "r") - data_ont = json.load(fp_ont) - ontology = {} - for domain_slot in data_ont: - domain, slot = domain_slot.split('-') - if domain not in ontology: - ontology[domain] = {} - ontology[domain][slot] = {} - for value in data_ont[domain_slot]: - ontology[domain][slot][value] = 1 - fp_ont.close() - - ### Read woz logs and write to tsv files - if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")): - print('data has been processed!') - return 0 - - fp_train = open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w") - fp_dev = open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w") - fp_test = open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w") - - fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - fp_train.write(str(domain) + '-' + str(slot) + '\t') - fp_dev.write(str(domain) + '-' + str(slot) + '\t') - fp_test.write(str(domain) + '-' + str(slot) + '\t') - - fp_train.write('\n') - fp_dev.write('\n') - fp_test.write('\n') - - # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r") - # data = json.load(fp_data) - - file_split = ['train', 'val', 'test'] - fp = [fp_train, fp_dev, fp_test] - - for split_type, split_fp in zip(file_split, fp): - - zipfile_name = "{}.json.zip".format(split_type) - zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name)) - data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8')) - - for file_id in data: - user_utterance = '' - system_response = '' - turn_idx = 0 - for idx, turn in enumerate(data[file_id]['log']): - if idx % 2 == 0: # user turn - user_utterance = data[file_id]['log'][idx]['text'] - else: # system turn - user_utterance = user_utterance.replace('\t', ' ') - user_utterance = user_utterance.replace('\n', ' ') - user_utterance = user_utterance.replace(' ', ' ') - - 
system_response = system_response.replace('\t', ' ') - system_response = system_response.replace('\n', ' ') - system_response = system_response.replace(' ', ' ') - - split_fp.write(str(file_id)) # 0: dialogue ID - split_fp.write('\t' + str(turn_idx)) # 1: turn index - split_fp.write('\t' + str(user_utterance)) # 2: user utterance - split_fp.write('\t' + str(system_response)) # 3: system response - - belief = {} - for domain in data[file_id]['log'][idx]['metadata'].keys(): - for slot in data[file_id]['log'][idx]['metadata'][domain]['semi'].keys(): - value = data[file_id]['log'][idx]['metadata'][domain]['semi'][slot].strip() - value = value.lower() - if value == '' or value == 'not mentioned' or value == 'not given': - value = 'none' - - if slot == "leaveAt" and domain != "bus": - slot = "leave at" - elif slot == "arriveBy" and domain != "bus": - slot = "arrive by" - elif slot == "pricerange": - slot = "price range" - - if value == "doesn't care" or value == "don't care" or value == "dont care" or value == "does not care" or value == 'dontcare': - value = "do not care" - elif value == "guesthouse" or value == "guesthouses": - value = "guest house" - elif value == "city center" or value == "town centre" or value == "town center" or \ - value == "centre of town" or value == "center" or value == "center of town": - value = "centre" - elif value == "west part of town": - value = "west" - elif value == "mutliple sports": - value = "multiple sports" - elif value == "swimmingpool": - value = "swimming pool" - elif value == "concerthall": - value = "concert hall" - - if domain not in ontology: - # print("domain (%s) is not defined" % domain) - continue - - if slot not in ontology[domain]: - # print("slot (%s) in domain (%s) is not defined" % (slot, domain)) # bus-arriveBy not defined - continue - - if value not in ontology[domain][slot] and value != 'none': - # print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" % - # (file_id, value, domain, slot)) - value = 'none' - - belief[str(domain) + '-' + str(slot)] = value - - for slot in data[file_id]['log'][idx]['metadata'][domain]['book'].keys(): - if slot == 'booked': - continue - if domain == 'bus' and slot == 'people': - continue # not defined in ontology - - value = data[file_id]['log'][idx]['metadata'][domain]['book'][slot].strip() - value = value.lower() - - if value == '' or value == 'not mentioned' or value == 'not given': - value = 'none' - elif value == "doesn't care" or value == "don't care" or value == "dont care" or value == "does not care" or value == 'dontcare': - value = "do not care" - - if str('book ' + slot) not in ontology[domain]: - # print("book %s is not defined in domain %s" % (slot, domain)) - continue - - if value not in ontology[domain]['book ' + slot] and value != 'none': - # print("%s: value (%s) in domain (%s) slot (book %s) is not defined in ontology" % - # (file_id, value, domain, slot)) - value = 'none' - - belief[str(domain) + '-book ' + str(slot)] = value - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - key = str(domain) + '-' + str(slot) - if key in belief: - split_fp.write('\t' + belief[key]) - else: - split_fp.write('\tnone') - - split_fp.write('\n') - split_fp.flush() - - system_response = data[file_id]['log'][idx]['text'] - turn_idx += 1 - - fp_train.close() - fp_dev.close() - fp_test.close() \ No newline at end of file diff --git a/convlab/dst/sumbt/multiwoz/sumbt.py b/convlab/dst/sumbt/multiwoz/sumbt.py deleted file mode 100755 index 
506ea358018d451f6a76d003f50272c0b89fab5d..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz/sumbt.py +++ /dev/null @@ -1,818 +0,0 @@ -import copy -from pprint import pprint -import random -from itertools import chain -import numpy as np -import zipfile - -from tensorboardX.writer import SummaryWriter -from tqdm._tqdm import trange, tqdm - -from convlab.util.file_util import cached_path - -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler - -from transformers import BertTokenizer -from transformers import get_linear_schedule_with_warmup, AdamW - -from convlab.dst.dst import DST -from convlab.dst.sumbt.multiwoz.convert_to_glue_format import convert_to_glue_format -from convlab.util.multiwoz.state import default_state -from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker -from convlab.dst.sumbt.multiwoz.sumbt_utils import * -from convlab.dst.sumbt.multiwoz.sumbt_config import * -from convlab.util.multiwoz.multiwoz_slot_trans import REF_SYS_DA, REF_USR_DA - -USE_CUDA = torch.cuda.is_available() -N_GPU = torch.cuda.device_count() if USE_CUDA else 1 -DEVICE = "cuda" if USE_CUDA else "cpu" -ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) -SUMBT_PATH = os.path.dirname(os.path.abspath(__file__)) -DATA_PATH = os.path.join(ROOT_PATH, 'data/multiwoz') -DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, 'downloaded_model') -multiwoz_slot_list = ['attraction-area', 'attraction-name', 'attraction-type', 'hotel-day', 'hotel-people', 'hotel-stay', 'hotel-area', 'hotel-internet', 'hotel-name', 'hotel-parking', 'hotel-pricerange', 'hotel-stars', 'hotel-type', 'restaurant-day', 'restaurant-people', 'restaurant-time', 'restaurant-area', 'restaurant-food', 'restaurant-name', 'restaurant-pricerange', 'taxi-arriveby', 'taxi-departure', 'taxi-destination', 'taxi-leaveat', 'train-people', 'train-arriveby', 'train-day', 'train-departure', 'train-destination', 'train-leaveat'] - - -def get_label_embedding(labels, max_seq_length, tokenizer, device): - features = [] - for label in labels: - label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] - label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens) - label_len = len(label_token_ids) - - label_padding = [0] * (max_seq_length - len(label_token_ids)) - label_token_ids += label_padding - assert len(label_token_ids) == max_seq_length - - features.append((label_token_ids, label_len)) - - all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) - all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - - return all_label_token_ids, all_label_len - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class SUMBTTracker(DST): - """ - Transferable multi-domain dialogue state tracker, adopted from https://github.com/SKTBrain/SUMBT - """ - - - def __init__(self, data_dir=DATA_PATH, model_file='https://huggingface.co/ConvLab/ConvLab-2_models/resolve/main/sumbt.tar.gz', eval_slots=multiwoz_slot_list): - - DST.__init__(self) - - # if not os.path.exists(data_dir): - # if model_file == '': - # raise Exception( - # 'Please provide remote model file path in config') - # resp = urllib.request.urlretrieve(model_file)[0] - # temp_file = tarfile.open(resp) - # temp_file.extractall('data') - # assert os.path.exists(data_dir) - - processor = Processor(args) - self.processor = processor - label_list = processor.get_labels() - num_labels = [len(labels) for labels in label_list] # number of slot-values in each slot-type - - # tokenizer - self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir) - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - self.device = torch.device("cuda" if USE_CUDA else "cpu") - - self.sumbt_model = BeliefTracker(args, num_labels, self.device) - if USE_CUDA and N_GPU > 1: - self.sumbt_model = torch.nn.DataParallel(self.sumbt_model) - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(self.device) - - ## Get slot-value embeddings - self.label_token_ids, self.label_len = [], [] - for labels in label_list: - token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device) - self.label_token_ids.append(token_ids) - self.label_len.append(lens) - self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list] - self.label_list = label_list - self.target_slot = processor.target_slot - ## Get domain-slot-type embeddings - self.slot_token_ids, self.slot_len = \ - get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device) - - self.args = args - self.state = default_state() - self.param_restored = False - if USE_CUDA and N_GPU == 1: - self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - elif USE_CUDA and N_GPU > 1: - self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - - self.det_dic = {} - for domain, dic in REF_USR_DA.items(): - for key, value in dic.items(): - assert '-' not in key - self.det_dic[key.lower()] = key + '-' + domain - self.det_dic[value.lower()] = key + '-' + domain - - self.cached_res = {} - convert_to_glue_format(DATA_PATH, SUMBT_PATH) - if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)): - os.makedirs(os.path.join(SUMBT_PATH, args.output_dir)) - self.train_examples = processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.eval_slots = eval_slots - self.download_model() - - def download_model(self): - if not os.path.isdir(DOWNLOAD_DIRECTORY): - os.mkdir(DOWNLOAD_DIRECTORY) - # model_file = os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.zip') - - # 
if not os.path.isfile(model_file): - model_file = 'https://huggingface.co/ConvLab/ConvLab-2_models/resolve/main/sumbt.tar.gz' - - import tarfile - if not os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')): - archive_file = cached_path(model_file) - # archive = zipfile.ZipFile(archive_file, 'r') - t = tarfile.open(archive_file) - t.extractall(path=DOWNLOAD_DIRECTORY) - # archive.extractall(DOWNLOAD_DIRECTORY) - - def load_weights(self, model_path=None): - if model_path is None: - model_ckpt = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), 'pytorch_model.bin') - else: - model_ckpt = model_path - model = self.sumbt_model - # in the case that slot and values are different between the training and evaluation - if not USE_CUDA: - ptr_model = torch.load(model_ckpt, map_location=torch.device('cpu')) - else: - ptr_model = torch.load(model_ckpt) - print('loading pretrained weights') - - if not USE_CUDA or N_GPU == 1: - state = model.state_dict() - state.update(ptr_model) - model.load_state_dict(state) - else: - # print("Evaluate using only one device!") - model.module.load_state_dict(ptr_model) - - if USE_CUDA: - model.to("cuda") - - def init_session(self): - self.state = default_state() - if not self.param_restored: - if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')): - print('loading weights from downloaded model') - self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')) - elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')): - print('loading weights from trained model') - self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')) - else: - raise ValueError('no availabel weights found.') - self.param_restored = True - - def update(self, user_act=None): - """Update the dialogue state with the generated tokens from TRADE""" - if not user_act: - user_act = "" - if not isinstance(user_act, str): - raise Exception( - 'Expected user_act is str but found {}'.format(type(user_act)) - ) - prev_state = self.state - - actual_history = copy.deepcopy(prev_state['history']) - # if actual_history[-1][0] == 'user': - # actual_history[-1][1] += user_act - # else: - # actual_history.append(['user', user_act]) - query = self.construct_query(actual_history) - pred_states = self.predict(query) - - new_belief_state = copy.deepcopy(prev_state['belief_state']) - for state in pred_states: - domain, slot, value = state.split('-', 2) - value = '' if value == 'none' else value - value = 'dontcare' if value == 'do not care' else value - value = 'guesthouse' if value == 'guest house' else value - if slot not in ['name', 'book']: - if domain not in new_belief_state: - if domain == 'bus': - continue - else: - raise Exception( - 'Error: domain <{}> not in belief state'.format(domain)) - slot = REF_SYS_DA[domain.capitalize()].get(slot, slot) - assert 'semi' in new_belief_state[domain] - assert 'book' in new_belief_state[domain] - if 'book' in slot: - assert slot.startswith('book ') - slot = slot.strip().split()[1] - if slot == 'arrive by': - slot = 'arriveBy' - elif slot == 'leave at': - slot = 'leaveAt' - elif slot == 'price range': - slot = 'pricerange' - domain_dic = new_belief_state[domain] - if slot in domain_dic['semi']: - new_belief_state[domain]['semi'][slot] = value - # normalize_value(self.value_dict, domain, slot, value) - elif slot in domain_dic['book']: - new_belief_state[domain]['book'][slot] = value - elif slot.lower() in domain_dic['book']: - 
new_belief_state[domain]['book'][slot.lower()] = value - else: - with open('trade_tracker_unknown_slot.log', 'a+') as f: - f.write( - 'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, state) - ) - new_request_state = copy.deepcopy(prev_state['request_state']) - # update request_state - user_request_slot = self.detect_requestable_slots(user_act) - for domain in user_request_slot: - for key in user_request_slot[domain]: - if domain not in new_request_state: - new_request_state[domain] = {} - if key not in new_request_state[domain]: - new_request_state[domain][key] = user_request_slot[domain][key] - - new_state = copy.deepcopy(dict(prev_state)) - new_state['belief_state'] = new_belief_state - new_state['request_state'] = new_request_state - self.state = new_state - # print((pred_states, query)) - return self.state - - def predict(self, query): - cache_query_key = ''.join(str(list(chain.from_iterable(query[0])))) - if cache_query_key in self.cached_res.keys(): - return self.cached_res[cache_query_key] - - input_ids, input_len = query - input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0) - input_len = torch.tensor(input_len).to(self.device).unsqueeze(0) - labels = None - _, pred_slot = self.sumbt_model(input_ids, input_len, labels) - pred_slot_t = pred_slot[0][-1].tolist() - predict_belief = [] - for idx, i in enumerate(pred_slot_t): - predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i])) - self.cached_res[cache_query_key] = predict_belief - - # print(predict_belief) - fixed_belief = [] - for sv in predict_belief: - d, s, v = sv.split('-', 2) - if s == 'book day': - s = 'day' - elif s == 'book people': - s = 'people' - elif s == 'book stay': - s = 'stay' - elif s == 'price range': - s = 'pricerange' - elif s == 'book time': - s = 'time' - elif s == 'arrive by': - s = 'arriveby' - elif s == 'leave at': - s = 'leaveat' - elif s == 'arrive by': - s = 'arriveby' - _fixed_slot = d + '-' + s - if _fixed_slot in self.eval_slots: - fixed_belief.append(_fixed_slot+'-'+v) - return predict_belief - - def train(self, load_model=False, model_path=None): - - if load_model: - if model_path is not None: - self.load_weights(model_path) - ## Training utterances - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - num_train_batches = all_input_ids.size(0) - num_train_steps = int( - num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) - - logger.info("***** training *****") - logger.info(" Num examples = %d", len(self.train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_steps) - - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - - train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features( - self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - logger.info("***** validation *****") - logger.info(" Num examples = %d", len(self.dev_examples)) - logger.info(" Batch size = %d", args.dev_batch_size) - - all_input_ids_dev, 
all_input_len_dev, all_label_ids_dev = \ - all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE) - - dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev) - dev_sampler = SequentialSampler(dev_data) - dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size) - - logger.info("Loaded data!") - - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(DEVICE) - - ## Get domain-slot-type embeddings - slot_token_ids, slot_len = \ - get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE) - - # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot): - # self.idx2slot[slot_idx] = slot_str - - ## Get slot-value embeddings - label_token_ids, label_len = [], [] - for slot_idx, labels in zip(slot_token_ids, self.label_list): - # self.idx2value[slot_idx] = {} - token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE) - label_token_ids.append(token_ids) - label_len.append(lens) - # for label, token_id in zip(labels, token_ids): - # self.idx2value[slot_idx][token_id] = label - - logger.info('embeddings prepared') - - if USE_CUDA and N_GPU > 1: - self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - else: - self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - - def get_optimizer_grouped_parameters(model): - param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, - 'lr': args.learning_rate}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, - 'lr': args.learning_rate}, - ] - return optimizer_grouped_parameters - - if not USE_CUDA or N_GPU == 1: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model) - else: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module) - - t_total = num_train_steps - - scheduler = None - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.fp16_loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale) - - else: - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total) - logger.info(optimizer) - - # Training code - ############################################################################### - - logger.info("Training...") - - global_step = 0 - last_update = None - best_loss = None - model = self.sumbt_model - if not args.do_not_use_tensorboard: - summary_writer = None - else: - summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/") - - for epoch in trange(int(args.num_train_epochs), desc="Epoch"): - # Train - model.train() - tr_loss = 0 - nb_tr_examples = 0 - nb_tr_steps 
= 0 - - for step, batch in enumerate(tqdm(train_dataloader)): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - - # Forward - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - # Backward - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - # tensrboard logging - if summary_writer is not None: - summary_writer.add_scalar("Epoch", epoch, global_step) - summary_writer.add_scalar("Train/Loss", loss, global_step) - summary_writer.add_scalar("Train/JointAcc", acc, global_step) - if N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i], - global_step) - summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step) - - tr_loss += loss.item() - nb_tr_examples += input_ids.size(0) - nb_tr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - # modify lealrning rate with special warm up BERT uses - lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) - if summary_writer is not None: - summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - if scheduler is not None: - torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0) - optimizer.step() - if scheduler is not None: - scheduler.step() - optimizer.zero_grad() - global_step += 1 - - - # Perform evaluation on validation dataset - model.eval() - dev_loss = 0 - dev_acc = 0 - dev_loss_slot, dev_acc_slot = None, None - nb_dev_examples, nb_dev_steps = 0, 0 - - for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - if input_ids.dim() == 2: - input_ids = input_ids.unsqueeze(0) - input_len = input_len.unsqueeze(0) - label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item() - dev_loss += loss.item() * num_valid_turn - dev_acc += acc.item() * num_valid_turn - - if N_GPU == 1: - if dev_loss_slot is None: - dev_loss_slot = [l * num_valid_turn for l in loss_slot] - dev_acc_slot = acc_slot * num_valid_turn - else: - for i, l in enumerate(loss_slot): - dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn - dev_acc_slot += acc_slot * num_valid_turn - - nb_dev_examples += num_valid_turn - - - dev_loss = dev_loss / nb_dev_examples - dev_acc = dev_acc / nb_dev_examples - - if N_GPU == 1: - dev_acc_slot = dev_acc_slot / nb_dev_examples - - # tensorboard logging - if summary_writer is not None: - summary_writer.add_scalar("Validate/Loss", dev_loss, global_step) - summary_writer.add_scalar("Validate/Acc", dev_acc, global_step) - if N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - 
summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'), - dev_loss_slot[i] / nb_dev_examples, global_step) - summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i], - global_step) - - dev_loss = round(dev_loss, 6) - - output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin") - - if last_update is None or dev_loss < best_loss: - - if not USE_CUDA or N_GPU == 1: - torch.save(model.state_dict(), output_model_file) - else: - torch.save(model.module.state_dict(), output_model_file) - - last_update = epoch - best_loss = dev_loss - best_acc = dev_acc - - logger.info( - "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % ( - last_update, best_loss, best_acc, global_step)) - else: - logger.info( - "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % ( - epoch, dev_loss, dev_acc, global_step)) - - if last_update + args.patience <= epoch: - break - - - - def test(self, mode='dev', model_path=None): - '''Testing funciton of TRADE (to be added)''' - # Evaluation - self.load_weights(model_path) - - if mode == 'test': - eval_examples = self.dev_examples - elif mode == 'dev': - eval_examples = self.test_examples - - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.dev_batch_size) - - eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size) - - model = self.sumbt_model - eval_loss, eval_accuracy = 0, 0 - eval_loss_slot, eval_acc_slot = None, None - nb_eval_steps, nb_eval_examples = 0, 0 - - accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0, - 'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0} - - for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - # if input_ids.dim() == 2: - # input_ids = input_ids.unsqueeze(0) - # input_len = input_len.unsqueeze(0) - # label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if not USE_CUDA or N_GPU == 1: - loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1) - else: - loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU) - nbatch = label_ids.size(0) - nslot = pred_slot.size(3) - pred_slot = pred_slot.view(nbatch, -1, nslot) - - accuracies = eval_all_accs(pred_slot, label_ids, accuracies) - - nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item() - nb_eval_examples += nb_eval_ex - nb_eval_steps += 1 - - if not USE_CUDA or N_GPU == 1: - eval_loss += loss.item() * nb_eval_ex - eval_accuracy += acc.item() * nb_eval_ex - if eval_loss_slot is None: - eval_loss_slot = [l * nb_eval_ex for l in loss_slot] - eval_acc_slot = acc_slot * nb_eval_ex - else: - for i, l in enumerate(loss_slot): - eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex - eval_acc_slot += acc_slot * nb_eval_ex - else: - eval_loss += sum(loss) * nb_eval_ex - eval_accuracy += sum(acc) * nb_eval_ex - - 
eval_loss = eval_loss / nb_eval_examples - eval_accuracy = eval_accuracy / nb_eval_examples - if not USE_CUDA or N_GPU == 1: - eval_acc_slot = eval_acc_slot / nb_eval_examples - - loss = None - - if not USE_CUDA or N_GPU == 1: - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss, - 'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]), - 'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]) - } - else: - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss - } - - out_file_name = 'eval_results' - # if TARGET_SLOT == 'all': - # out_file_name += '_all' - output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name) - - if not USE_CUDA or N_GPU == 1: - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - out_file_name = 'eval_all_accuracies' - with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f: - f.write( - 'joint acc (7 domain) : slot acc (7 domain) : joint acc (5 domain): slot acc (5 domain): joint ' - 'restaurant : slot acc restaurant \n') - f.write('%.5f : %.5f : %.5f : %.5f : %.5f : %.5f \n' % ( - (accuracies['joint7'] / accuracies['num_turn']).item(), - (accuracies['slot7'] / accuracies['num_slot7']).item(), - (accuracies['joint5'] / accuracies['num_turn']).item(), - (accuracies['slot5'] / accuracies['num_slot5']).item(), - (accuracies['joint_rest'] / accuracies['num_turn']).item(), - (accuracies['slot_rest'] / accuracies['num_slot_rest']).item() - )) - - def construct_query(self, context): - '''Construct query from context''' - ids = [] - lens = [] - context_len = len(context) - if context[0][0] != 'usr': - context = [['usr', '']] + context - for i in range(0, context_len, 2): - # utt_user = '' - # utt_sys = '' - # for evaluation - utt_sys = context[i][1] - if context_len < 2: - utt_user = " " - else: - # print(context_len) - utt_user = context[i + 1][1] - - tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)] - tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)] - - _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3) - tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"] - input_len = [len(tokens_user) + 2, len(tokens_sys) + 1] - - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - padding = [0] * (self.args.max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == self.args.max_seq_length - ids.append(input_ids) - lens.append(input_len) - - return (ids, lens) - - def detect_requestable_slots(self, observation): - result = {} - observation = observation.lower() - _observation = ' {} '.format(observation) - for value in self.det_dic.keys(): - _value = ' {} '.format(value.strip()) - if _value in _observation: - key, domain = self.det_dic[value].split('-') - if domain not in result: - result[domain] = {} - result[domain][key] = 0 - return result - - -def test_update(): - sumbt_tracker = SUMBTTracker() - sumbt_tracker.init_session() - - sumbt_tracker.state['history'] = [ - ['sys', ''], - ['user', 'Could you book a 4 stars hotel for one night, 1 person?'], - ['sys', 'If you\'d like something cheap, I recommend the Allenbell'] - ] - sumbt_tracker.state['history'].append(['user', 
'Friday and Can you book it for me and get a reference number ?']) - from timeit import default_timer as timer - start = timer() - pprint(sumbt_tracker.update('Friday and Can you book it for me and get a reference number ?')) - end = timer() - print(end - start) - # - start = timer() - sumbt_tracker.state['history'].append(['sys', 'what is the area']) - sumbt_tracker.state['history'].append(['user', "it doesn't matter. I don't care"]) - pprint(sumbt_tracker.update('in the east area of cambridge')) - end = timer() - print(end - start) - - start = timer() - # sumbt_tracker.state['history'].append(['what is the area']) - pprint(sumbt_tracker.update('in the east area of cambridge')) - end = timer() - print(end - start) - - -def test_update_bak(): - - sumbt_tracker = SUMBTTracker() - sumbt_tracker.init_session() - - sumbt_tracker.state['history'] = [ - ['null', 'Could you book a 4 stars hotel for one night, 1 person?'], - ['If you\'d like something cheap, I recommend the Allenbell'] - ] - from timeit import default_timer as timer - start = timer() - pprint(sumbt_tracker.update('Friday and Can you book it for me and get a reference number ?')) - sumbt_tracker.state['history'][-1].append('Friday and Can you book it for me and get a reference number ?') - end = timer() - print(end - start) - # - start = timer() - sumbt_tracker.state['history'].append(['what is the area']) - pprint(sumbt_tracker.update('i do not care')) - # pprint(sumbt_tracker.update('in the east area of cambridge')) - end = timer() - print(end - start) - - start = timer() - # sumbt_tracker.state['history'].append(['what is the area']) - pprint(sumbt_tracker.update('in the east area of cambridge')) - end = timer() - print(end - start) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument('--train', action='store_true') -parser.add_argument('--dev', action='store_true') -parser.add_argument('--test', action='store_true') - - -if __name__ == '__main__': - test_update() diff --git a/convlab/dst/sumbt/multiwoz/sumbt_config.py b/convlab/dst/sumbt/multiwoz/sumbt_config.py deleted file mode 100755 index df96eded9c7de28ebe16dd7df3b6f2b4d9fffc37..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz/sumbt_config.py +++ /dev/null @@ -1,57 +0,0 @@ -import os - -import convlab -class DotMap(): - def __init__(self): - self.max_label_length = 32 - self.max_turn_length = 22 - self.hidden_dim = 100 - self.num_rnn_layers = 1 - self.zero_init_rnn = False - self.attn_head = 4 - self.do_eval = True - self.do_train = False - self.distance_metric = 'cosine' - self.train_batch_size = 4 - self.dev_batch_size = 1 - self.eval_batch_size = 16 - self.learning_rate = 5e-5 - self.num_train_epochs = 3 - self.patience = 10 - self.warmup_proportion = 0.1 - self.local_rank = -1 - self.seed = 42 - self.gradient_accumulation_steps = 1 - self.fp16 = False - self.loss_scale = 0 - self.do_not_use_tensorboard = False - self.fix_utterance_encoder = False - self.do_eval = True - self.num_train_epochs = 300 - - self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/bert-base-uncased") - self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/") - self.bert_model_name = "bert-base-uncased" - self.do_lower_case = True - self.task_name = 'bert-gru-sumbt' - self.nbt = 'rnn' - # self.output_dir = os.path.join(path, 'ckpt/') - self.target_slot = 'all' - self.learning_rate = 5e-5 - self.train_batch_size = 4 - self.eval_batch_size = 16 - self.distance_metric = 'euclidean' - self.patience 
= 15 - - self.hidden_dim = 300 - self.max_label_length = 32 - self.max_seq_length = 64 - self.max_turn_length = 22 - - self.fp16_loss_scale = 0.0 - self.data_dir = 'data/multiwoz/' - self.tf_dir = 'tensorboard' - self.tmp_data_dir = 'processed_data/' - self.output_dir = 'model_output/' - -args = DotMap() diff --git a/convlab/dst/sumbt/multiwoz/sumbt_utils.py b/convlab/dst/sumbt/multiwoz/sumbt_utils.py deleted file mode 100755 index 489e5eca3535de4b951246a103b323f339082afa..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz/sumbt_utils.py +++ /dev/null @@ -1,450 +0,0 @@ -import csv -import os -import json -import collections -import logging -import re -import torch - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO) -logger = logging.getLogger(__name__) - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding='utf-8') as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if len(line) > 0 and line[0][0] == '#': # ignore comments (starting with '#') - continue - lines.append(line) - return lines - - -class Processor(DataProcessor): - """Processor for the belief tracking dataset (GLUE version).""" - - def __init__(self, config): - super(Processor, self).__init__() - - print(config) - # MultiWOZ dataset - fp_ontology = open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), config.data_dir, "ontology_sumbt.json"), "r") - - ontology = json.load(fp_ontology) - for slot in ontology.keys(): - ontology[slot].append("none") - fp_ontology.close() - - if not config.target_slot == 'all': - slot_idx = {'attraction': '0:1:2', 'bus': '3:4:5:6', 'hospital': '7', - 'hotel': '8:9:10:11:12:13:14:15:16:17', \ - 'restaurant': '18:19:20:21:22:23:24', 'taxi': '25:26:27:28', 'train': '29:30:31:32:33:34'} - target_slot = [] - for key, value in slot_idx.items(): - if key != config.target_slot: - target_slot.append(value) - config.target_slot = ':'.join(target_slot) - - # sorting the ontology according to the alphabetic order of the slots - ontology = collections.OrderedDict(sorted(ontology.items())) - - # select slots to train - nslots = len(ontology.keys()) - target_slot = list(ontology.keys()) - if config.target_slot == 'all': - self.target_slot_idx = [*range(0, nslots)] - else: - self.target_slot_idx = sorted([int(x) for x in config.target_slot.split(':')]) - - for idx in range(0, nslots): - if not idx in self.target_slot_idx: - del ontology[target_slot[idx]] - - self.ontology = ontology - self.target_slot = list(self.ontology.keys()) - for i, slot in enumerate(self.target_slot): - if slot == "pricerange": - self.target_slot[i] = "price range" - - logger.info('Processor: target_slot') - logger.info(self.target_slot) - - def get_train_examples(self, data_dir, accumulation=False): - """See base class.""" - return 
self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation) - - def get_dev_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation) - - def get_test_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation) - - def get_labels(self): - """See base class.""" - return [self.ontology[slot] for slot in self.target_slot] - - def _create_examples(self, lines, set_type, accumulation=False): - """Creates examples for the training and dev sets.""" - prev_dialogue_index = None - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s-%s" % (set_type, line[0], line[1]) # line[0]: dialogue index, line[1]: turn index - if accumulation: - if prev_dialogue_index is None or prev_dialogue_index != line[0]: - text_a = line[2] - text_b = line[3] - prev_dialogue_index = line[0] - else: - # The symbol '#' will be replaced with '[SEP]' after tokenization. - text_a = line[2] + " # " + text_a - text_b = line[3] + " # " + text_b - else: - text_a = line[2] # line[2]: user utterance - text_b = line[3] # line[3]: system response - - label = [line[4 + idx] for idx in self.target_slot_idx] - - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def normalize_text(text): - global replacements - # lower case every word - text = text.lower() - # replace white spaces in front and end - text = re.sub(r'^\s*|\s*$', '', text) - - # hotel domain pfb30 - text = re.sub(r"b&b", "bed and breakfast", text) - text = re.sub(r"b and b", "bed and breakfast", text) - - # replace st. - text = text.replace(';', ',') - text = re.sub('$\/', '', text) - text = text.replace('/', ' and ') - - # replace other special characters - text = text.replace('-', ' ') - text = re.sub('[\"\<>@\(\)]', '', text) # remove - - # insert white space before and after tokens: - for token in ['?', '.', ',', '!']: - text = insertSpace(token, text) - - # insert white space for 's - text = insertSpace('\'s', text) - - # replace it's, does't, you'd ... 
etc - text = re.sub('^\'', '', text) - text = re.sub('\'$', '', text) - text = re.sub('\'\s', ' ', text) - text = re.sub('\s\'', ' ', text) - for fromx, tox in replacements: - text = ' ' + text + ' ' - text = text.replace(fromx, tox)[1:-1] - - # remove multiple spaces - text = re.sub(' +', ' ', text) - - # concatenate numbers - tmp = text - tokens = text.split() - i = 1 - while i < len(tokens): - if re.match(u'^\d+$', tokens[i]) and \ - re.match(u'\d+$', tokens[i - 1]): - tokens[i - 1] += tokens[i] - del tokens[i] - else: - i += 1 - text = ' '.join(tokens) - - return text - - -def insertSpace(token, text): - sidx = 0 - while True: - sidx = text.find(token, sidx) - if sidx == -1: - break - if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \ - re.match('[0-9]', text[sidx + 1]): - sidx += 1 - continue - if text[sidx - 1] != ' ': - text = text[:sidx] + ' ' + text[sidx:] - sidx += 1 - if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ': - text = text[:sidx + 1] + ' ' + text[sidx + 1:] - sidx += 1 - return text - - -def get_label_embedding(labels, max_seq_length, tokenizer, device): - features = [] - for label in labels: - label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] - label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens) - label_len = len(label_token_ids) - - label_padding = [0] * (max_seq_length - len(label_token_ids)) - label_token_ids += label_padding - assert len(label_token_ids) == max_seq_length - - features.append((label_token_ids, label_len)) - - all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) - all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - - return all_label_token_ids, all_label_len - - -def warmup_linear(x, warmup=0.002): - if x < warmup: - return x / warmup - return 1.0 - x - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_len, label_id): - self.input_ids = input_ids - self.input_len = input_len - self.label_id = label_id - - -def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - slot_dim = len(label_list) - - features = [] - prev_dialogue_idx = None - all_padding = [0] * max_seq_length - all_padding_len = [0, 0] - - max_turn = 0 - for (ex_index, example) in enumerate(examples): - if max_turn < int(example.guid.split('-')[2]): - max_turn = int(example.guid.split('-')[2]) - max_turn_length = min(max_turn + 1, max_turn_length) - logger.info("max_turn_length = %d" % max_turn) - - for (ex_index, example) in enumerate(examples): - tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)] - tokens_b = None - if example.text_b: - tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)] - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - input_len = [len(tokens), 0] - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - input_len[1] = len(tokens_b) + 1 - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # Zero-pad up to the sequence length. 
- padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == max_seq_length - - FLAG_TEST = False - if example.label is not None: - label_id = [] - label_info = 'label: ' - for i, label in enumerate(example.label): - if label == 'dontcare': - label = 'do not care' - label_id.append(label_map[i][label]) - label_info += '%s (id = %d) ' % (label, label_map[i][label]) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % example.guid) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_len: %s" % " ".join([str(x) for x in input_len])) - logger.info("label: " + label_info) - else: - FLAG_TEST = True - label_id = None - - curr_dialogue_idx = example.guid.split('-')[1] - curr_turn_idx = int(example.guid.split('-')[2]) - - if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx: - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - assert len(features) % max_turn_length == 0 - - if prev_dialogue_idx is None or prev_turn_idx < max_turn_length: - features.append( - InputFeatures(input_ids=input_ids, - input_len=input_len, - label_id=label_id)) - - prev_dialogue_idx = curr_dialogue_idx - prev_turn_idx = curr_turn_idx - - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - assert len(features) % max_turn_length == 0 - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long) - if not FLAG_TEST: - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - - # reshape tensors to [#batch, #max_turn_length, #max_seq_length] - all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length) - all_input_len = all_input_len.view(-1, max_turn_length, 2) - if not FLAG_TEST: - all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim) - else: - all_label_ids = None - - return all_input_ids, all_input_len, all_label_ids - - -def eval_all_accs(pred_slot, labels, accuracies): - - def _eval_acc(_pred_slot, _labels): - slot_dim = _labels.size(-1) - accuracy = (_pred_slot == _labels).view(-1, slot_dim) - num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float() - num_data = torch.sum(_labels > -1).float() - # joint accuracy - # joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float() - num_slots = accuracy.shape[1] - joint_acc = sum(torch.sum(accuracy, 1) == num_slots) - # slot accuracy - slot_acc = torch.sum(accuracy).float() - return joint_acc, slot_acc, num_turn, num_data - - # 7 domains - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels) - accuracies['joint7'] += joint_acc - accuracies['slot7'] += slot_acc - accuracies['num_turn'] += num_turn - accuracies['num_slot7'] += num_data - - # restaurant domain - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25]) - accuracies['joint_rest'] += joint_acc - accuracies['slot_rest'] += slot_acc - accuracies['num_slot_rest'] += num_data - - pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2) - label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2) - - # 5 domains (excluding bus 
and hotel domain) - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5) - accuracies['joint5'] += joint_acc - accuracies['slot5'] += slot_acc - accuracies['num_slot5'] += num_data - - return accuracies diff --git a/convlab/dst/sumbt/multiwoz_zh/.gitignore b/convlab/dst/sumbt/multiwoz_zh/.gitignore deleted file mode 100644 index 5b558554b2dea1b954e9bbcacb01bda105f03f17..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -pre-trained/ -model_output/ diff --git a/convlab/dst/sumbt/multiwoz_zh/__init__.py b/convlab/dst/sumbt/multiwoz_zh/__init__.py deleted file mode 100644 index 1550c99e34cef1733e369510958291e7d1f98998..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from convlab.dst.sumbt.multiwoz_zh.sumbt import SUMBTTracker as SUMBT diff --git a/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py b/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py deleted file mode 100644 index b8fa6c6b9f4dae0c20476a1334d23f4ad2f5576b..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py +++ /dev/null @@ -1,158 +0,0 @@ -import json -import zipfile -from convlab.dst.sumbt.multiwoz_zh.sumbt_config import * - -def trans_value(value): - trans = { - '': '未提及', - '没有提到': '未提及', - '没有': '未提及', - '未提到': '未提及', - '一个也没有': '未提及', - '无': '未提及', - '是的': '有', - '不是': '没有', - '不关心': '不在意', - '不在乎': '不在意', - } - - return trans.get(value, value) - - -def convert_to_glue_format(data_dir, sumbt_dir): - - if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)): - os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir)) - - ### Read ontology file - with open(os.path.join(data_dir, "ontology.json"), "r") as fp_ont: - data_ont = json.load(fp_ont) - ontology = {} - for domain_slot in data_ont: - domain, slot = domain_slot.split('-') - if domain not in ontology: - ontology[domain] = {} - ontology[domain][slot] = {} - for value in data_ont[domain_slot]: - ontology[domain][slot][value] = 1 - - ### Read woz logs and write to tsv files - if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")): - print('data has been processed!') - return 0 - print('begin processing data') - - fp_train = open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w") - fp_dev = open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w") - fp_test = open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w") - - fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t') - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - fp_train.write(str(domain) + '-' + str(slot) + '\t') - fp_dev.write(str(domain) + '-' + str(slot) + '\t') - fp_test.write(str(domain) + '-' + str(slot) + '\t') - - fp_train.write('\n') - fp_dev.write('\n') - fp_test.write('\n') - - # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r") - # data = json.load(fp_data) - - file_split = ['train', 'val', 'test'] - fp = [fp_train, fp_dev, fp_test] - - for split_type, split_fp in zip(file_split, fp): - - zipfile_name = "{}.json.zip".format(split_type) - zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name)) - data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8')) - - for file_id in data: - 
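# Each dialogue 'log' alternates user and system turns: even indices hold the user
# utterance, odd indices the system reply. At every odd index one TSV row is written
# with the dialogue ID, turn index, user utterance, the preceding system response and
# the belief state read from the turn's 'metadata' ('semi' and 'book' sections);
# values not covered by the ontology are mapped to '未提及' (not mentioned).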
user_utterance = '' - system_response = '' - turn_idx = 0 - for idx, turn in enumerate(data[file_id]['log']): - if idx % 2 == 0: # user turn - user_utterance = data[file_id]['log'][idx]['text'] - else: # system turn - user_utterance = user_utterance.replace('\t', ' ') - user_utterance = user_utterance.replace('\n', ' ') - user_utterance = user_utterance.replace(' ', ' ') - - system_response = system_response.replace('\t', ' ') - system_response = system_response.replace('\n', ' ') - system_response = system_response.replace(' ', ' ') - - split_fp.write(str(file_id)) # 0: dialogue ID - split_fp.write('\t' + str(turn_idx)) # 1: turn index - split_fp.write('\t' + str(user_utterance)) # 2: user utterance - split_fp.write('\t' + str(system_response)) # 3: system response - - belief = {} - - for domain in data[file_id]['log'][idx]['metadata'].keys(): - for slot in data[file_id]['log'][idx]['metadata'][domain]['semi'].keys(): - value = data[file_id]['log'][idx]['metadata'][domain]['semi'][slot].strip() - # value = value_trans.get(value, value) - value = trans_value(value) - - if domain not in ontology: - print("domain (%s) is not defined" % domain) - continue - - if slot not in ontology[domain]: - print("slot (%s) in domain (%s) is not defined" % (slot, domain)) # bus-arriveBy not defined - continue - - if value not in ontology[domain][slot] and value != '未提及': - print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" % - (file_id, value, domain, slot)) - value = '未提及' - - belief[str(domain) + '-' + str(slot)] = value - - for slot in data[file_id]['log'][idx]['metadata'][domain]['book'].keys(): - if slot == 'booked': - continue - if domain == '公共汽车' and slot == '人数' or domain == '列车' and slot == '票价': - continue # not defined in ontology - - value = data[file_id]['log'][idx]['metadata'][domain]['book'][slot].strip() - value = trans_value(value) - - if str('预订' + slot) not in ontology[domain]: - print("预订%s is not defined in domain %s" % (slot, domain)) - continue - - if value not in ontology[domain]['预订' + slot] and value != '未提及': - print("%s: value (%s) in domain (%s) slot (预订%s) is not defined in ontology" % - (file_id, value, domain, slot)) - value = '未提及' - - belief[str(domain) + '-预订' + str(slot)] = value - - for domain in sorted(ontology.keys()): - for slot in sorted(ontology[domain].keys()): - key = str(domain) + '-' + str(slot) - if key in belief: - split_fp.write('\t' + belief[key]) - else: - split_fp.write('\t未提及') - - split_fp.write('\n') - split_fp.flush() - - system_response = data[file_id]['log'][idx]['text'] - turn_idx += 1 - - fp_train.close() - fp_dev.close() - fp_test.close() - - print('data has been processed!') - \ No newline at end of file diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt.py b/convlab/dst/sumbt/multiwoz_zh/sumbt.py deleted file mode 100644 index 16dc674c0a3d2f1faa8c85194c610f7ef41b987b..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/sumbt.py +++ /dev/null @@ -1,682 +0,0 @@ -import copy -import random -from itertools import chain -import numpy as np -import zipfile - -from tensorboardX.writer import SummaryWriter -from tqdm import trange, tqdm - -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler - -from transformers import BertTokenizer -from transformers import get_linear_schedule_with_warmup, AdamW - -from convlab.dst.dst import DST -from convlab.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format -from convlab.util.multiwoz_zh.state import 
default_state - -from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker -from convlab.dst.sumbt.multiwoz_zh.sumbt_utils import * -from convlab.dst.sumbt.multiwoz_zh.sumbt_config import * - -USE_CUDA = torch.cuda.is_available() -N_GPU = torch.cuda.device_count() if USE_CUDA else 1 -DEVICE = "cuda" if USE_CUDA else "cpu" -ROOT_PATH = convlab.get_root_path() -SUMBT_PATH = os.path.dirname(os.path.abspath(__file__)) -DATA_PATH = os.path.join(ROOT_PATH, 'data/multiwoz_zh') -DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, 'pre-trained') -multiwoz_zh_slot_list = ['公共汽车-出发地', '公共汽车-出发时间', '公共汽车-到达时间', '公共汽车-日期', '公共汽车-目的地', '出租车-出发地', '出租车-出发时间', '出租车-到达时间', '出租车-目的地', '列车-出发地', '列车-出发时间', '列车-到达时间', '列车-日期', '列车-目的地', '列车-预订人数', '医院-科室', '旅馆-互联网', '旅馆-价格范围', '旅馆-停车处', '旅馆-区域', '旅馆-名称', '旅馆-星级', '旅馆-类型', '旅馆-预订人数', '旅馆-预订停留天数', '旅馆-预订日期', '景点-区域', '景点-名称', '景点-类型', '餐厅-价格范围', '餐厅-区域', '餐厅-名称', '餐厅-预订人数', '餐厅-预订日期', '餐厅-预订时间', '餐厅-食物'] - - -def get_label_embedding(labels, max_seq_length, tokenizer, device): - features = [] - for label in labels: - label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] - label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens) - label_len = len(label_token_ids) - - label_padding = [0] * (max_seq_length - len(label_token_ids)) - label_token_ids += label_padding - assert len(label_token_ids) == max_seq_length - - features.append((label_token_ids, label_len)) - - all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) - all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - - return all_label_token_ids, all_label_len - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
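# For example, with max_length=5, tokens_a=[a, b, c, d] and tokens_b=[x, y, z] are
# trimmed to [a, b, c] and [x, y]: the longer list loses one token per iteration
# until the combined length fits.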
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class SUMBTTracker(DST): - """ - Transferable multi-domain dialogue state tracker, adopted from https://github.com/SKTBrain/SUMBT - """ - - # adapt data provider - # unzip mt.zip, and zip each [train|val|test].json - @staticmethod - def init_data(): - if not os.path.exists(os.path.join(DATA_PATH, 'train.json.zip')): - with zipfile.ZipFile(os.path.join(DATA_PATH, 'mt.zip')) as f: - f.extractall(DATA_PATH) - - for split in ['train', 'test', 'val']: - with zipfile.ZipFile(os.path.join(DATA_PATH, f'{split}.json.zip'), 'w') as f: - f.write(os.path.join(DATA_PATH, f'{split}.json'), f'{split}.json') - - def __init__(self, data_dir=DATA_PATH, eval_slots=multiwoz_zh_slot_list): - DST.__init__(self) - - self.init_data() - - processor = Processor(args) - self.processor = processor - label_list = processor.get_labels() - num_labels = [len(labels) for labels in label_list] # number of slot-values in each slot-type - - # tokenizer - self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir) - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - self.device = torch.device("cuda" if USE_CUDA else "cpu") - - self.sumbt_model = BeliefTracker(args, num_labels, self.device) - if USE_CUDA and N_GPU > 1: - self.sumbt_model = torch.nn.DataParallel(self.sumbt_model) - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(self.device) - - ## Get slot-value embeddings - self.label_token_ids, self.label_len = [], [] - for labels in label_list: - token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device) - self.label_token_ids.append(token_ids) - self.label_len.append(lens) - self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list] - self.label_list = label_list - self.target_slot = processor.target_slot - ## Get domain-slot-type embeddings - self.slot_token_ids, self.slot_len = \ - get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device) - - self.args = args - self.state = default_state() - self.param_restored = False - if USE_CUDA and N_GPU == 1: - self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - elif USE_CUDA and N_GPU > 1: - self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids) - - self.cached_res = {} - convert_to_glue_format(DATA_PATH, SUMBT_PATH) - if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)): - os.makedirs(os.path.join(SUMBT_PATH, args.output_dir)) - self.train_examples = processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False) - self.eval_slots = eval_slots - - def load_weights(self, model_path=None): - if model_path is None: - model_ckpt = os.path.join(SUMBT_PATH, 'pre-trained/pytorch_model.bin') - else: - model_ckpt = model_path - model = self.sumbt_model - # in the case that slot and values are different between the training and evaluation - if not USE_CUDA: - ptr_model = 
torch.load(model_ckpt, map_location=torch.device('cpu')) - else: - ptr_model = torch.load(model_ckpt) - print('loading pretrained weights') - - if not USE_CUDA or N_GPU == 1: - state = model.state_dict() - state.update(ptr_model) - model.load_state_dict(state) - else: - # print("Evaluate using only one device!") - model.module.load_state_dict(ptr_model) - - if USE_CUDA: - model.to("cuda") - - def train(self, load_model=False, model_path=None): - if load_model: - if model_path is not None: - self.load_weights(model_path) - ## Training utterances - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - num_train_batches = all_input_ids.size(0) - num_train_steps = int( - num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) - - logger.info("***** training *****") - logger.info(" Num examples = %d", len(self.train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_steps) - - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - - train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features( - self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - - logger.info("***** validation *****") - logger.info(" Num examples = %d", len(self.dev_examples)) - logger.info(" Batch size = %d", args.dev_batch_size) - - all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \ - all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE) - - dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev) - dev_sampler = SequentialSampler(dev_data) - dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size) - - logger.info("Loaded data!") - - if args.fp16: - self.sumbt_model.half() - self.sumbt_model.to(DEVICE) - - ## Get domain-slot-type embeddings - slot_token_ids, slot_len = \ - get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE) - - # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot): - # self.idx2slot[slot_idx] = slot_str - - ## Get slot-value embeddings - label_token_ids, label_len = [], [] - for slot_idx, labels in zip(slot_token_ids, self.label_list): - # self.idx2value[slot_idx] = {} - token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE) - label_token_ids.append(token_ids) - label_len.append(lens) - # for label, token_id in zip(labels, token_ids): - # self.idx2value[slot_idx][token_id] = label - - logger.info('embeddings prepared') - - if USE_CUDA and N_GPU > 1: - self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - else: - self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids) - - def get_optimizer_grouped_parameters(model): - param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in 
no_decay)], 'weight_decay': 0.01, - 'lr': args.learning_rate}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, - 'lr': args.learning_rate}, - ] - return optimizer_grouped_parameters - - if not USE_CUDA or N_GPU == 1: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model) - else: - optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module) - - t_total = num_train_steps - - scheduler = None - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.fp16_loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale) - - else: - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total) - logger.info(optimizer) - - # Training code - ############################################################################### - - logger.info("Training...") - - global_step = 0 - last_update = None - best_loss = None - model = self.sumbt_model - if not args.do_not_use_tensorboard: - summary_writer = None - else: - summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/") - - for epoch in trange(int(args.num_train_epochs), desc="Epoch"): - # Train - model.train() - tr_loss = 0 - nb_tr_examples = 0 - nb_tr_steps = 0 - - for step, batch in enumerate(tqdm(train_dataloader)): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - - # Forward - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - # Backward - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - # tensrboard logging - if summary_writer is not None: - summary_writer.add_scalar("Epoch", epoch, global_step) - summary_writer.add_scalar("Train/Loss", loss, global_step) - summary_writer.add_scalar("Train/JointAcc", acc, global_step) - if N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i], - global_step) - summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step) - - tr_loss += loss.item() - nb_tr_examples += input_ids.size(0) - nb_tr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - # modify lealrning rate with special warm up BERT uses - lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) - if summary_writer is not None: - summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - if scheduler is not None: - torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0) - 
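# Only the non-fp16 branch creates a scheduler, so the gradient clipping above and
# the scheduler step below run on the AdamW path; the apex FusedAdam path is
# constructed with max_grad_norm=1.0 instead.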
optimizer.step() - if scheduler is not None: - scheduler.step() - optimizer.zero_grad() - global_step += 1 - - - # Perform evaluation on validation dataset - model.eval() - dev_loss = 0 - dev_acc = 0 - dev_loss_slot, dev_acc_slot = None, None - nb_dev_examples, nb_dev_steps = 0, 0 - - for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")): - batch = tuple(t.to(DEVICE) for t in batch) - input_ids, input_len, label_ids = batch - if input_ids.dim() == 2: - input_ids = input_ids.unsqueeze(0) - input_len = input_len.unsqueeze(0) - label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if N_GPU == 1: - loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - else: - loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) - - # average to multi-gpus - loss = loss.mean() - acc = acc.mean() - acc_slot = acc_slot.mean(0) - - num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item() - dev_loss += loss.item() * num_valid_turn - dev_acc += acc.item() * num_valid_turn - - if N_GPU == 1: - if dev_loss_slot is None: - dev_loss_slot = [l * num_valid_turn for l in loss_slot] - dev_acc_slot = acc_slot * num_valid_turn - else: - for i, l in enumerate(loss_slot): - dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn - dev_acc_slot += acc_slot * num_valid_turn - - nb_dev_examples += num_valid_turn - - - dev_loss = dev_loss / nb_dev_examples - dev_acc = dev_acc / nb_dev_examples - - if N_GPU == 1: - dev_acc_slot = dev_acc_slot / nb_dev_examples - - # tensorboard logging - if summary_writer is not None: - summary_writer.add_scalar("Validate/Loss", dev_loss, global_step) - summary_writer.add_scalar("Validate/Acc", dev_acc, global_step) - if N_GPU == 1: - for i, slot in enumerate(self.processor.target_slot): - summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'), - dev_loss_slot[i] / nb_dev_examples, global_step) - summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i], - global_step) - - dev_loss = round(dev_loss, 6) - - output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin") - - if last_update is None or dev_loss < best_loss: - - if not USE_CUDA or N_GPU == 1: - torch.save(model.state_dict(), output_model_file) - else: - torch.save(model.module.state_dict(), output_model_file) - - last_update = epoch - best_loss = dev_loss - best_acc = dev_acc - - logger.info( - "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % ( - last_update, best_loss, best_acc, global_step)) - else: - logger.info( - "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % ( - epoch, dev_loss, dev_acc, global_step)) - - if last_update + args.patience <= epoch: - break - - def test(self, mode='dev', model_path=None): - '''Testing funciton of TRADE (to be added)''' - # Evaluation - self.load_weights(model_path) - - if mode == 'test': - eval_examples = self.dev_examples - elif mode == 'dev': - eval_examples = self.test_examples - - all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( - eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) - all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to( - DEVICE), all_label_ids.to(DEVICE) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - - 
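# The feature tensors are shaped [dialogue, max_turn_length, max_seq_length], so each
# dataset element below is one whole dialogue and the SequentialSampler keeps the
# dialogues in order. Note that the DataLoader uses args.dev_batch_size although the
# log line above reports args.eval_batch_size.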
eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) - - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size) - - model = self.sumbt_model - eval_loss, eval_accuracy = 0, 0 - eval_loss_slot, eval_acc_slot = None, None - nb_eval_steps, nb_eval_examples = 0, 0 - - accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0, - 'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0} - - for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - # if input_ids.dim() == 2: - # input_ids = input_ids.unsqueeze(0) - # input_len = input_len.unsqueeze(0) - # label_ids = label_ids.unsuqeeze(0) - - with torch.no_grad(): - if not USE_CUDA or N_GPU == 1: - loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1) - else: - loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU) - nbatch = label_ids.size(0) - nslot = pred_slot.size(3) - pred_slot = pred_slot.view(nbatch, -1, nslot) - - accuracies = eval_all_accs(pred_slot, label_ids, accuracies) - - nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item() - nb_eval_examples += nb_eval_ex - nb_eval_steps += 1 - - if not USE_CUDA or N_GPU == 1: - eval_loss += loss.item() * nb_eval_ex - eval_accuracy += acc.item() * nb_eval_ex - if eval_loss_slot is None: - eval_loss_slot = [l * nb_eval_ex for l in loss_slot] - eval_acc_slot = acc_slot * nb_eval_ex - else: - for i, l in enumerate(loss_slot): - eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex - eval_acc_slot += acc_slot * nb_eval_ex - else: - eval_loss += sum(loss) * nb_eval_ex - eval_accuracy += sum(acc) * nb_eval_ex - - eval_loss = eval_loss / nb_eval_examples - eval_accuracy = eval_accuracy / nb_eval_examples - if not USE_CUDA or N_GPU == 1: - eval_acc_slot = eval_acc_slot / nb_eval_examples - - loss = None - - if not USE_CUDA or N_GPU == 1: - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss, - 'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]), - 'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]) - } - else: - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'loss': loss - } - - out_file_name = 'eval_results' - # if TARGET_SLOT == 'all': - # out_file_name += '_all' - output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name) - - if not USE_CUDA or N_GPU == 1: - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - out_file_name = 'eval_all_accuracies' - with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f: - s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format( - 'joint acc (7 domain)', - 'slot acc (7 domain)', - 'joint acc (5 domain)', - 'slot acc (5 domain)', - 'joint restaurant', - 'slot acc restaurant') - f.write(s + '\n') - print(s) - s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format( - (accuracies['joint7'] / accuracies['num_turn']).item(), - (accuracies['slot7'] / accuracies['num_slot7']).item(), - (accuracies['joint5'] / accuracies['num_turn']).item(), - (accuracies['slot5'] / accuracies['num_slot5']).item(), - 
(accuracies['joint_rest'] / accuracies['num_turn']).item(), - (accuracies['slot_rest'] / accuracies['num_slot_rest']).item() - ) - f.write(s + '\n') - print(s) - - def init_session(self): - self.state = default_state() - if not self.param_restored: - if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')): - print('loading weights from downloaded model') - self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')) - elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')): - print('loading weights from trained model') - self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')) - else: - raise ValueError('no available weights found.') - self.param_restored = True - - def update(self, user_act=None): - """Update the dialogue state with the generated tokens from TRADE""" - if not isinstance(user_act, str): - raise Exception( - 'Expected user_act is str but found {}'.format(type(user_act)) - ) - prev_state = self.state - - actual_history = copy.deepcopy(prev_state['history']) - - query = self.construct_query(actual_history) - pred_states = self.predict(query) - - new_belief_state = copy.deepcopy(prev_state['belief_state']) - for state in pred_states: - domain, slot, value = state.split('-', 2) - # slot = REF_SYS_DA[domain.capitalize()].get(slot, slot) - assert 'semi' in new_belief_state[domain] - assert 'book' in new_belief_state[domain] - domain_dic = new_belief_state[domain] - if '预订' in slot: - assert slot.startswith('预订') - slot = slot[2:] - assert slot in domain_dic['book'] - - if slot in domain_dic['semi']: - new_belief_state[domain]['semi'][slot] = value - # normalize_value(self.value_dict, domain, slot, value) - elif slot in domain_dic['book']: - new_belief_state[domain]['book'][slot] = value - else: - with open('trade_tracker_unknown_slot.log', 'a+') as f: - f.write( - 'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, state) - ) - - # new_request_state = copy.deepcopy(prev_state['request_state']) - # # update request_state - # user_request_slot = self.detect_requestable_slots(user_act) - # for domain in user_request_slot: - # for key in user_request_slot[domain]: - # if domain not in new_request_state: - # new_request_state[domain] = {} - # if key not in new_request_state[domain]: - # new_request_state[domain][key] = user_request_slot[domain][key] - - new_state = copy.deepcopy(dict(prev_state)) - new_state['belief_state'] = new_belief_state - # new_state['request_state'] = new_request_state - self.state = new_state - # print((pred_states, query)) - return self.state - - def predict(self, query): - cache_query_key = ''.join(str(list(chain.from_iterable(query[0])))) - if cache_query_key in self.cached_res.keys(): - return self.cached_res[cache_query_key] - - input_ids, input_len = query - input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0) - input_len = torch.tensor(input_len).to(self.device).unsqueeze(0) - labels = None - _, pred_slot = self.sumbt_model(input_ids, input_len, labels) - pred_slot_t = pred_slot[0][-1].tolist() - predict_belief = [] - for idx, i in enumerate(pred_slot_t): - predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i])) - self.cached_res[cache_query_key] = predict_belief - - return predict_belief - - - def construct_query(self, context): - '''Construct query from context''' - ids = [] - lens = [] - context_len = len(context) - if context[0][0] != 'sys': - context = 
[['sys', '']] + context - for i in range(0, context_len, 2): - # utt_user = '' - # utt_sys = '' - # for evaluation - utt_sys = context[i][1] - utt_user = context[i + 1][1] - - tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)] - tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)] - - _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3) - tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"] - input_len = [len(tokens_user) + 2, len(tokens_sys) + 1] - - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - padding = [0] * (self.args.max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == self.args.max_seq_length - ids.append(input_ids) - lens.append(input_len) - - return (ids, lens) - - def detect_requestable_slots(self, observation): - result = {} - observation = observation.lower() - _observation = ' {} '.format(observation) - for value in self.det_dic.keys(): - _value = ' {} '.format(value.strip()) - if _value in _observation: - key, domain = self.det_dic[value].split('-') - if domain not in result: - result[domain] = {} - result[domain][key] = 0 - return result diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py b/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py deleted file mode 100644 index f0bfd90ff474d903329c6956f0ade7a861caedfc..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import convlab - -class DotMap(): - def __init__(self): - self.max_label_length = 32 - self.max_turn_length = 22 - self.num_rnn_layers = 1 - self.zero_init_rnn = False - self.attn_head = 4 - self.do_eval = True - self.do_train = False - self.train_batch_size = 3 - self.dev_batch_size = 1 - self.eval_batch_size = 1 - self.learning_rate = 5e-5 - self.num_train_epochs = 3 - self.patience = 10 - self.warmup_proportion = 0.1 - self.local_rank = -1 - self.seed = 42 - self.gradient_accumulation_steps = 1 - self.fp16 = False - self.loss_scale = 0 - self.do_not_use_tensorboard = False - self.fix_utterance_encoder = False - self.do_eval = True - self.num_train_epochs = 300 - - self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/chinese-bert-wwm-ext") - self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/") - self.bert_model_name = "hfl/chinese-bert-wwm-ext" - self.do_lower_case = True - self.task_name = 'bert-gru-sumbt' - self.nbt = 'rnn' - # self.output_dir = os.path.join(path, 'ckpt/') - self.target_slot = 'all' - self.learning_rate = 5e-5 - self.distance_metric = 'euclidean' - self.patience = 15 - - self.hidden_dim = 300 - self.max_label_length = 32 - self.max_seq_length = 64 - self.max_turn_length = 22 - - self.fp16_loss_scale = 0.0 - self.data_dir = 'data/multiwoz_zh/' - self.tf_dir = 'tensorboard' - self.tmp_data_dir = 'processed_data/' - self.output_dir = 'model_output/' - -args = DotMap() \ No newline at end of file diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py b/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py deleted file mode 100644 index 19b6864115e0b7129546097f033b2efa6ccba9a3..0000000000000000000000000000000000000000 --- a/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py +++ /dev/null @@ -1,440 +0,0 @@ -import csv -import os -import json -import collections -import logging -import re -import torch - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - 
level=logging.INFO) -logger = logging.getLogger(__name__) - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding='utf-8') as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if len(line) > 0 and line[0][0] == '#': # ignore comments (starting with '#') - continue - lines.append(line) - return lines - - -class Processor(DataProcessor): - """Processor for the belief tracking dataset (GLUE version).""" - - def __init__(self, config): - super(Processor, self).__init__() - - print(config) - # MultiWOZ dataset - - with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), config.data_dir, "ontology.json"), "r") as fp_ontology: - ontology = json.load(fp_ontology) - for slot in ontology.keys(): - ontology[slot].append("未提及") - - if config.target_slot != 'all': - raise Exception('unsupported') - - # sorting the ontology according to the alphabetic order of the slots - ontology = collections.OrderedDict(sorted(ontology.items())) - - # select slots to train - nslots = len(ontology.keys()) - target_slot = list(ontology.keys()) - - self.target_slot_idx = [*range(0, nslots)] - - for idx in range(0, nslots): - if not idx in self.target_slot_idx: - del ontology[target_slot[idx]] - - self.ontology = ontology - self.target_slot = list(self.ontology.keys()) - for i, slot in enumerate(self.target_slot): - if slot == "pricerange": - self.target_slot[i] = "price range" - - logger.info('Processor: target_slot') - logger.info(self.target_slot) - - def get_train_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation) - - def get_dev_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation) - - def get_test_examples(self, data_dir, accumulation=False): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation) - - def get_labels(self): - """See base class.""" - return [self.ontology[slot] for slot in self.target_slot] - - def _create_examples(self, lines, set_type, accumulation=False): - """Creates examples for the training and dev sets.""" - prev_dialogue_index = None - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s-%s" % (set_type, line[0], line[1]) # line[0]: dialogue index, line[1]: turn index - if accumulation: - if prev_dialogue_index is None or prev_dialogue_index != line[0]: - text_a = line[2] - text_b = line[3] - prev_dialogue_index = line[0] - else: - # The symbol '#' will be replaced with '[SEP]' after tokenization. 
- text_a = line[2] + " # " + text_a - text_b = line[3] + " # " + text_b - else: - text_a = line[2] # line[2]: user utterance - text_b = line[3] # line[3]: system response - - label = [line[4 + idx] for idx in self.target_slot_idx] - - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def normalize_text(text): - global replacements - # lower case every word - text = text.lower() - # replace white spaces in front and end - text = re.sub(r'^\s*|\s*$', '', text) - - # hotel domain pfb30 - text = re.sub(r"b&b", "bed and breakfast", text) - text = re.sub(r"b and b", "bed and breakfast", text) - - # replace st. - text = text.replace(';', ',') - text = re.sub('$\/', '', text) - text = text.replace('/', ' and ') - - # replace other special characters - text = text.replace('-', ' ') - text = re.sub('[\"\<>@\(\)]', '', text) # remove - - # insert white space before and after tokens: - for token in ['?', '.', ',', '!']: - text = insertSpace(token, text) - - # insert white space for 's - text = insertSpace('\'s', text) - - # replace it's, does't, you'd ... etc - text = re.sub('^\'', '', text) - text = re.sub('\'$', '', text) - text = re.sub('\'\s', ' ', text) - text = re.sub('\s\'', ' ', text) - for fromx, tox in replacements: - text = ' ' + text + ' ' - text = text.replace(fromx, tox)[1:-1] - - # remove multiple spaces - text = re.sub(' +', ' ', text) - - # concatenate numbers - tmp = text - tokens = text.split() - i = 1 - while i < len(tokens): - if re.match(u'^\d+$', tokens[i]) and \ - re.match(u'\d+$', tokens[i - 1]): - tokens[i - 1] += tokens[i] - del tokens[i] - else: - i += 1 - text = ' '.join(tokens) - - return text - - -def insertSpace(token, text): - sidx = 0 - while True: - sidx = text.find(token, sidx) - if sidx == -1: - break - if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \ - re.match('[0-9]', text[sidx + 1]): - sidx += 1 - continue - if text[sidx - 1] != ' ': - text = text[:sidx] + ' ' + text[sidx:] - sidx += 1 - if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ': - text = text[:sidx + 1] + ' ' + text[sidx + 1:] - sidx += 1 - return text - - -def get_label_embedding(labels, max_seq_length, tokenizer, device): - features = [] - for label in labels: - label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"] - label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens) - label_len = len(label_token_ids) - - label_padding = [0] * (max_seq_length - len(label_token_ids)) - label_token_ids += label_padding - assert len(label_token_ids) == max_seq_length - - features.append((label_token_ids, label_len)) - - all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device) - all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device) - - return all_label_token_ids, all_label_len - - -def warmup_linear(x, warmup=0.002): - if x < warmup: - return x / warmup - return 1.0 - x - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_len, label_id): - self.input_ids = input_ids - self.input_len = input_len - self.label_id = label_id - - -def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list] - slot_dim = len(label_list) - - features = [] - prev_dialogue_idx = None - all_padding = [0] * max_seq_length - all_padding_len = [0, 0] - - max_turn = 0 - for (ex_index, example) in enumerate(examples): - if max_turn < int(example.guid.split('-')[2]): - max_turn = int(example.guid.split('-')[2]) - max_turn_length = min(max_turn + 1, max_turn_length) - logger.info("max_turn_length = %d" % max_turn) - - for (ex_index, example) in enumerate(examples): - tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)] - tokens_b = None - if example.text_b: - tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)] - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. 
- - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - input_len = [len(tokens), 0] - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - input_len[1] = len(tokens_b) + 1 - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # Zero-pad up to the sequence length. - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - assert len(input_ids) == max_seq_length - - FLAG_TEST = False - if example.label is not None: - label_id = [] - label_info = 'label: ' - for i, label in enumerate(example.label): - if label == 'dontcare': - label = 'do not care' - label_id.append(label_map[i][label]) - label_info += '%s (id = %d) ' % (label, label_map[i][label]) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % example.guid) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_len: %s" % " ".join([str(x) for x in input_len])) - logger.info("label: " + label_info) - else: - FLAG_TEST = True - label_id = None - - curr_dialogue_idx = example.guid.split('-')[1] - curr_turn_idx = int(example.guid.split('-')[2]) - - if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx: - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - assert len(features) % max_turn_length == 0 - - if prev_dialogue_idx is None or prev_turn_idx < max_turn_length: - features.append( - InputFeatures(input_ids=input_ids, - input_len=input_len, - label_id=label_id)) - - prev_dialogue_idx = curr_dialogue_idx - prev_turn_idx = curr_turn_idx - - if prev_turn_idx < max_turn_length: - features += [InputFeatures(input_ids=all_padding, - input_len=all_padding_len, - label_id=[-1] * slot_dim)] \ - * (max_turn_length - prev_turn_idx - 1) - assert len(features) % max_turn_length == 0 - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long) - if not FLAG_TEST: - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - - # reshape tensors to [#batch, #max_turn_length, #max_seq_length] - all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length) - all_input_len = all_input_len.view(-1, max_turn_length, 2) - if not FLAG_TEST: - all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim) - else: - all_label_ids = None - - return all_input_ids, all_input_len, all_label_ids - - -def eval_all_accs(pred_slot, labels, accuracies): - - def _eval_acc(_pred_slot, _labels): - slot_dim = _labels.size(-1) - accuracy = (_pred_slot == _labels).view(-1, slot_dim) - num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float() - num_data = torch.sum(_labels > -1).float() - # joint accuracy - # joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float() - num_slots = accuracy.shape[1] - joint_acc = sum(torch.sum(accuracy, 1) == num_slots) - # slot accuracy - slot_acc = torch.sum(accuracy).float() - return joint_acc, slot_acc, num_turn, num_data - - # 7 domains - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels) - accuracies['joint7'] += joint_acc - accuracies['slot7'] += slot_acc - accuracies['num_turn'] += num_turn - accuracies['num_slot7'] += num_data - - # restaurant domain - joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25]) - accuracies['joint_rest'] 
+= joint_acc
-    accuracies['slot_rest'] += slot_acc
-    accuracies['num_slot_rest'] += num_data
-
-    pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2)
-    label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2)
-
-    # 5 domains (excluding bus and hotel domain)
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5)
-    accuracies['joint5'] += joint_acc
-    accuracies['slot5'] += slot_acc
-    accuracies['num_slot5'] += num_data
-
-    return accuracies
diff --git a/convlab/policy/ppo/semantic_level_config.json b/convlab/policy/ppo/semantic_level_config.json
index 04b0626a10bc8d48add16732df26a7cc00a35088..b9908c9cb7717515775221227f3fba19636d20dc 100644
--- a/convlab/policy/ppo/semantic_level_config.json
+++ b/convlab/policy/ppo/semantic_level_config.json
@@ -6,7 +6,7 @@
         "batchsz": 1000,
         "seed": 0,
         "epoch": 10,
-        "eval_frequency": 5,
+        "eval_frequency": 1,
         "process_num": 4,
         "sys_semantic_to_usr": false,
         "num_eval_dialogues": 500
diff --git a/convlab/policy/ppo/setsumbt_config.json b/convlab/policy/ppo/setsumbt_config.json
index 31a8ac6d275166e4163e416e0dbef6f742cddb7f..b6a02adbf371bfea63e3e156a2d9e47f13456c78 100644
--- a/convlab/policy/ppo/setsumbt_config.json
+++ b/convlab/policy/ppo/setsumbt_config.json
@@ -26,7 +26,7 @@
         "setsumbt-mul": {
             "class_path": "convlab.dst.setsumbt.SetSUMBTTracker",
             "ini_params": {
-                "model_path": "/gpfs/project/niekerk/models/setsumbt_models/SetSUMBT+ActPrediction-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0-30-08-22-15-00"
+                "model_path": "https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2/resolve/main/SetSUMBT-nlu-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0.zip"
             }
         }
     },
diff --git a/convlab/policy/ppo/setsumbt_unc_config.json b/convlab/policy/ppo/setsumbt_unc_config.json
index 6b7d115aafa53a2bfc5c58672e67086f04d5884d..fafdb3fc9bd8f7fe09e3759d58a591cf964fb93b 100644
--- a/convlab/policy/ppo/setsumbt_unc_config.json
+++ b/convlab/policy/ppo/setsumbt_unc_config.json
@@ -1,6 +1,6 @@
 {
     "model": {
-        "load_path": "/gpfs/project/niekerk/src/ConvLab3/convlab/policy/mle/experiments/experiment_2022-11-10-10-37-30/save/supervised",
+        "load_path": "",
         "pretrained_load_path": "",
         "use_pretrained_initialisation": false,
         "batchsz": 1000,
@@ -28,7 +28,7 @@
         "setsumbt-mul": {
            "class_path": "convlab.dst.setsumbt.SetSUMBTTracker",
            "ini_params": {
-                "model_path": "/gpfs/project/niekerk/models/setsumbt_models/SetSUMBT+ActPrediction-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0-30-08-22-15-00",
+                "model_path": "https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2/resolve/main/SetSUMBT-nlu-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0.zip",
                "return_confidence_scores": true,
                "return_belief_state_mutual_info": true
            }
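
The config changes above retarget the PPO pipeline at the publicly hosted SetSUMBT EnD2 checkpoint instead of cluster-local paths. The snippet below is a minimal sketch of what those `ini_params` amount to when the tracker is constructed directly: the keyword arguments simply mirror the JSON above, and the `init_session`/`update` calls follow ConvLab's usual DST interface. Treat anything beyond what the config itself shows as an assumption rather than the definitive API.

```python
# Minimal sketch: construct the tracker with the same ini_params as
# setsumbt_unc_config.json. The kwargs mirror the JSON config; the exact shape
# of the returned state is assumed to follow ConvLab's standard DST interface.
from convlab.dst.setsumbt import SetSUMBTTracker

tracker = SetSUMBTTracker(
    model_path=(
        "https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2/"
        "resolve/main/SetSUMBT-nlu-multiwoz21-roberta-gru-cosine-"
        "distribution_distillation-Seed0.zip"
    ),
    return_confidence_scores=True,          # per-slot confidence estimates
    return_belief_state_mutual_info=True,   # knowledge uncertainty (mutual information)
)

tracker.init_session()
state = tracker.update("i am looking for a cheap hotel in the centre")
print(state["belief_state"])
```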