From ecfa6933c008f8cfe0621574ff625dbd4ae0a06a Mon Sep 17 00:00:00 2001
From: Christian <christian.geishauser@hhu.de>
Date: Thu, 24 Nov 2022 19:30:34 +0100
Subject: [PATCH] added readmes, only need to update the references

---
 convlab/policy/README.md                      | 16 ++++-
 convlab/policy/gdpl/README.md                 | 61 ++++++++++++-----
 .../policy/gdpl/semantic_level_config.json    |  2 +-
 convlab/policy/mle/README.md                  | 19 ++++++
 convlab/policy/pg/README.md                   | 66 ++++++++++++-------
 convlab/policy/ppo/README.md                  | 64 ++++++++++++------
 convlab/policy/ppo/semantic_level_config.json |  2 +-
 convlab/policy/vtrace_DPT/README.md           | 65 ++++++++++++++++++
 8 files changed, 232 insertions(+), 63 deletions(-)
 create mode 100644 convlab/policy/mle/README.md
 create mode 100644 convlab/policy/vtrace_DPT/README.md

diff --git a/convlab/policy/README.md b/convlab/policy/README.md
index fab0e3e4..6cdac456 100755
--- a/convlab/policy/README.md
+++ b/convlab/policy/README.md
@@ -88,7 +88,9 @@ The necessary step before starting a training is to set up the environment and p
 
 Once you set up your configuration, you are ready to start an experiment by executing
 
-```python convlab/policy/policy_subfolder/train.py --path=your_environment_config --seed=your_seed```
+```sh
+$ python convlab/policy/policy_subfolder/train.py --path=your_environment_config --seed=your_seed
+```
 
 You can specify the seed either in the environment config or through the argument parser. If you do not specify an environment config, it will automatically load the default config.
 
@@ -105,6 +107,18 @@ Once the training finished, it will move the experiment-TIMESTAMP folder into th
 
 The evaluation tools can be found in the folder convlab/policy/plot_results. Please have a look in the README for detailed instructions.
 
+#### Running Evaluation Dialogues
+
+You can run evaluation dialogues with a trained model using
+
+```sh
+$ python convlab/policy/evaluate.py --model_name=NAME --config_path=PATH --num_dialogues=NUM --verbose
+```
+
+- model_name: specify which model is used, e.g. MLE, PPO, PG or DDPT
+- config_path: the config path that was used during RL training, for instance semantic_level_config.json
+- num_dialogues: the number of evaluation dialogues
+- verbose: optional flag. If set, the dialogues are printed in the terminal together with their goals, which helps in analysing the behaviour of the policy.
 
 ## Adding a new policy
 
diff --git a/convlab/policy/gdpl/README.md b/convlab/policy/gdpl/README.md
index e9e62e96..fc9ffa32 100755
--- a/convlab/policy/gdpl/README.md
+++ b/convlab/policy/gdpl/README.md
@@ -1,33 +1,58 @@
-# GDPL
+# Guided Dialogue Policy Learning (GDPL)
 
-A join policy optimization and reward estimation method using adversarial inverse reinforcement learning that learns a dialog policy and builds a reward estimator simultaneously. The reward estimator evaluates the state-action pairs to guide the dialog policy at each dialog turn.
+GDPL uses the PPO algorithm to optimize the policy. The difference to vanilla PPO is that it does not use the extrinsic reward for optimization but instead leverages inverse reinforcement learning to train a reward estimator. This reward estimator provides the reward that is optimized; a rough sketch of the idea is given further below.
 
-## Train
+## Supervised pre-training
 
-Run `train.py` in the `gdpl` directory:
+If you want to obtain a supervised model for pre-training, please have a look in the MLE policy folder.
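+
+## Reward estimator sketch
+
+The snippet below is a minimal, illustrative sketch of the reward-estimator idea described in the introduction; it is not the implementation used in this folder, and all class, function and variable names are placeholders. A discriminator-style network is trained to separate expert state-action pairs from pairs generated by the current policy, and its output then replaces the extrinsic reward in the PPO update.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class RewardEstimator(nn.Module):
+    """Scores state-action pairs; higher scores mean 'more expert-like'."""
+
+    def __init__(self, state_dim, action_dim, hidden=128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+    def forward(self, state, action):
+        return self.net(torch.cat([state, action], dim=-1)).squeeze(-1)
+
+
+def irl_update(estimator, optimizer, expert_s, expert_a, policy_s, policy_a):
+    """One IRL step: expert pairs are labelled 1, policy-generated pairs 0."""
+    expert_logits = estimator(expert_s, expert_a)
+    policy_logits = estimator(policy_s, policy_a)
+    loss = (F.binary_cross_entropy_with_logits(expert_logits, torch.ones_like(expert_logits))
+            + F.binary_cross_entropy_with_logits(policy_logits, torch.zeros_like(policy_logits)))
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+
+def estimated_reward(estimator, state, action):
+    """Reward handed to the PPO update in place of the extrinsic reward."""
+    with torch.no_grad():
+        return F.logsigmoid(estimator(state, action))
+```
+
+Roughly speaking, estimator and policy are updated in alternation: dialogues collected with the current policy are used to update the estimator against the expert data, and the estimated reward is then used for the PPO policy update.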
-```bash
-python train.py
+
+## RL training
+
+Starting an RL training is as easy as executing
+
+```sh
+$ python train.py --path=your_environment_config --seed=SEED
 ```
 
-For better performance, we can do immitating learning before reinforcement learning. The immitating learning is implemented in the `mle` directory.
+One example of an environment config is **semantic_level_config.json**, which specifies the training parameters, for instance
 
-For example, if the trained model of immitating learning is saved at FOLDER_OF_MODEL/best_mle.pol.mdl, then you can run
+- load_path: provide a path to initialise the model with a pre-trained model, omitting the .pol.mdl ending
+- process_num: the number of processes to use during evaluation to speed it up
+- num_eval_dialogues: how many evaluation dialogues should be used
+- epoch: how many training epochs to run. One epoch consists of collecting dialogues + performing an update
+- eval_frequency: after how many epochs an evaluation is performed
+- batchsz: the number of training dialogues collected before doing an update
 
-```bash
-python train.py --load_path FOLDER_OF_MODEL/best_mle
-```
+Moreover, you can specify the full dialogue pipeline here, such as the user policy, NLU for system and user, etc.
+
+Parameters that are tied to the RL algorithm and the model architecture can be changed in config.json.
 
-Note that the *.pol.mdl* suffix should not appear in the --load_path argument.
 
-## Reference
+## Evaluation
+
+For creating evaluation plots and running evaluation dialogues, please have a look in the README of the policy folder.
+
+## References
 
 ```
-@inproceedings{takanobu2019guided,
-  title={Guided Dialog Policy Learning: Reward Estimation for Multi-Domain Task-Oriented Dialog},
-  author={Takanobu, Ryuichi and Zhu, Hanlin and Huang, Minlie},
-  booktitle={EMNLP-IJCNLP},
-  pages={100--110},
+@inproceedings{takanobu2019guided,
+  title={Guided Dialog Policy Learning: Reward Estimation for Multi-Domain Task-Oriented Dialog},
+  author={Takanobu, Ryuichi and Zhu, Hanlin and Huang, Minlie},
+  booktitle={EMNLP-IJCNLP},
+  pages={100--110},
   year={2019}
 }
+
+@inproceedings{zhu-etal-2020-convlab,
+  title = "{C}onv{L}ab-2: An Open-Source Toolkit for Building, Evaluating, and Diagnosing Dialogue Systems",
+  author = "Zhu, Qi and Zhang, Zheng and Fang, Yan and Li, Xiang and Takanobu, Ryuichi and Li, Jinchao and Peng, Baolin and Gao, Jianfeng and Zhu, Xiaoyan and Huang, Minlie",
+  booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+  month = jul,
+  year = "2020",
+  address = "Online",
+  publisher = "Association for Computational Linguistics",
+  url = "https://aclanthology.org/2020.acl-demos.19",
+  doi = "10.18653/v1/2020.acl-demos.19",
+  pages = "142--149"
+}
 ```
\ No newline at end of file
diff --git a/convlab/policy/gdpl/semantic_level_config.json b/convlab/policy/gdpl/semantic_level_config.json
index e64159c7..0dcd4666 100644
--- a/convlab/policy/gdpl/semantic_level_config.json
+++ b/convlab/policy/gdpl/semantic_level_config.json
@@ -5,7 +5,7 @@
     "pretrained_load_path": "",
     "batchsz": 1000,
     "seed": 0,
-    "epoch": 50,
+    "epoch": 10,
     "eval_frequency": 5,
     "process_num": 4,
     "sys_semantic_to_usr": false,
diff --git a/convlab/policy/mle/README.md b/convlab/policy/mle/README.md
new file mode 100644
index 00000000..db62faae
--- /dev/null
+++ b/convlab/policy/mle/README.md
@@ -0,0 +1,19 @@
+# Maximum Likelihood Estimator (MLE)
+
+MLE learns an MLP model in a supervised way using a provided dataset. The trained model can be used as an initialisation point for RL training with, for instance, PPO or GDPL.
+
+## Supervised Training
+
+Starting a training is as easy as executing
+
+```sh
+$ python train.py --dataset_name=DATASET_NAME --seed=SEED --eval_freq=FREQ
+```
+
+The dataset name can be "multiwoz21" or "sgd" for instance. The first time you run that command, it will take longer as the dataset needs to be pre-processed. The evaluation frequency determines after how many epochs an evaluation is performed.
+
+Other hyperparameters such as the learning rate or the number of epochs can be set in the config.json file.
+
+## Evaluation
+
+Evaluation on the validation set takes place during training.
\ No newline at end of file
diff --git a/convlab/policy/pg/README.md b/convlab/policy/pg/README.md
index d6136527..d37dd020 100755
--- a/convlab/policy/pg/README.md
+++ b/convlab/policy/pg/README.md
@@ -1,36 +1,58 @@
-# REINFORCE
+# Policy Gradient (PG)
 
-A simple stochastic gradient algorithm for policy gradient reinforcement learning. We adapt REINFORCE to the dialog policy.
+PG is an on-policy reinforcement learning algorithm that uses the policy gradient theorem to perform policy updates, using the return directly as the value estimate. A short sketch of this update rule is given further below.
+
+## Supervised pre-training
 
-## Train
+If you want to obtain a supervised model for pre-training, please have a look in the MLE policy folder.
 
-Run `train.py` in the `pg` directory:
+## RL training
 
-```bash
-python train.py
+Starting an RL training is as easy as executing
+
+```sh
+$ python train.py --path=your_environment_config --seed=SEED
 ```
 
-For better performance, we can do immitating learning before reinforcement learning. The immitating learning is implemented in the `mle` directory.
+One example of an environment config is **semantic_level_config.json**, which specifies the training parameters, for instance
 
-For example, if the trained model of immitating learning is saved at FOLDER_OF_MODEL/best_mle.pol.mdl, then you can run
+- load_path: provide a path to initialise the model with a pre-trained model, omitting the .pol.mdl ending
+- process_num: the number of processes to use during evaluation to speed it up
+- num_eval_dialogues: how many evaluation dialogues should be used
+- epoch: how many training epochs to run. One epoch consists of collecting dialogues + performing an update
+- eval_frequency: after how many epochs an evaluation is performed
+- batchsz: the number of training dialogues collected before doing an update
+
+Moreover, you can specify the full dialogue pipeline here, such as the user policy, NLU for system and user, etc.
+
+Parameters that are tied to the RL algorithm and the model architecture can be changed in config.json.
 
-```bash
-python train.py --load_path FOLDER_OF_MODEL/best_mle
-```
 
-Note that the *.pol.mdl* suffix should not appear in the --load_path argument.
+## Evaluation
 
-## Reference
+For creating evaluation plots and running evaluation dialogues, please have a look in the README of the policy folder.
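+
+## Update rule sketch
+
+The snippet below is a minimal, generic illustration of the policy-gradient update described at the top of this README, i.e. a REINFORCE-style step in which the Monte-Carlo return is used directly as the value estimate. It is not the implementation in this folder, and all names are placeholders.
+
+```python
+import torch
+
+
+def discounted_returns(rewards, gamma=0.99):
+    """Monte-Carlo return G_t for every turn of a single dialogue."""
+    returns, g = [], 0.0
+    for r in reversed(rewards):
+        g = r + gamma * g
+        returns.insert(0, g)
+    return torch.tensor(returns)
+
+
+def policy_gradient_step(optimizer, log_probs, rewards, gamma=0.99):
+    """One update that maximises sum_t log pi(a_t | s_t) * G_t."""
+    returns = discounted_returns(rewards, gamma)
+    # Normalising the returns is a common variance-reduction trick.
+    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
+    loss = -(torch.stack(log_probs) * returns).sum()
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+```
+
+Because the return is used directly instead of a learned value function, the gradient estimate is unbiased but has high variance, which is why the returns are typically normalised as above.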
+
+## References
 
 ```
-@article{williams1992simple,
-  title={Simple statistical gradient-following algorithms for connectionist reinforcement learning},
-  author={Williams, Ronald J},
-  journal={Machine learning},
-  volume={8},
-  number={3-4},
-  pages={229--256},
-  year={1992},
-  publisher={Springer}
+@article{williams1992simple,
+  title={Simple statistical gradient-following algorithms for connectionist reinforcement learning},
+  author={Williams, Ronald J},
+  journal={Machine learning},
+  volume={8},
+  number={3-4},
+  pages={229--256},
+  year={1992},
+  publisher={Springer}
+}
+
+@inproceedings{zhu-etal-2020-convlab,
+  title = "{C}onv{L}ab-2: An Open-Source Toolkit for Building, Evaluating, and Diagnosing Dialogue Systems",
+  author = "Zhu, Qi and Zhang, Zheng and Fang, Yan and Li, Xiang and Takanobu, Ryuichi and Li, Jinchao and Peng, Baolin and Gao, Jianfeng and Zhu, Xiaoyan and Huang, Minlie",
+  booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+  month = jul,
+  year = "2020",
+  address = "Online",
+  publisher = "Association for Computational Linguistics",
+  url = "https://aclanthology.org/2020.acl-demos.19",
+  doi = "10.18653/v1/2020.acl-demos.19",
+  pages = "142--149"
 }
 ```
\ No newline at end of file
diff --git a/convlab/policy/ppo/README.md b/convlab/policy/ppo/README.md
index 6bf87252..0228adb8 100755
--- a/convlab/policy/ppo/README.md
+++ b/convlab/policy/ppo/README.md
@@ -1,34 +1,58 @@
-# PPO
+# Proximal Policy Optimization (PPO)
 
-A policy optimization method in policy based reinforcement learning that uses
-multiple epochs of stochastic gradient ascent and a constant
-clipping mechanism as the soft constraint to perform each policy update. We adapt PPO to the dialog policy.
+Proximal Policy Optimization (Schulman et al. 2017) is an on-policy reinforcement learning algorithm. The architecture used is a simple MLP and thus not transferable to new ontologies.
 
-## Train
+## Supervised pre-training
 
-Run `train.py` in the `ppo` directory:
+If you want to obtain a supervised model for pre-training, please have a look in the MLE policy folder.
 
-```bash
-python train.py
+## RL training
+
+Starting an RL training is as easy as executing
+
+```sh
+$ python train.py --path=your_environment_config --seed=SEED
 ```
 
-For better performance, we can do immitating learning before reinforcement learning. The immitating learning is implemented in the `mle` directory.
+One example of an environment config is **semantic_level_config.json**, which specifies the training parameters, for instance
 
-For example, if the trained model of immitating learning is saved at FOLDER_OF_MODEL/best_mle.pol.mdl, then you can run
+- load_path: provide a path to initialise the model with a pre-trained model, omitting the .pol.mdl ending
+- process_num: the number of processes to use during evaluation to speed it up
+- num_eval_dialogues: how many evaluation dialogues should be used
+- epoch: how many training epochs to run. One epoch consists of collecting dialogues + performing an update
+- eval_frequency: after how many epochs an evaluation is performed
+- batchsz: the number of training dialogues collected before doing an update
 
-```bash
-python train.py --load_path FOLDER_OF_MODEL/best_mle
-```
+Moreover, you can specify the full dialogue pipeline here, such as the user policy, NLU for system and user, etc.
+
+Parameters that are tied to the RL algorithm and the model architecture can be changed in config.json.
 
-Note that the *.pol.mdl* suffix should not appear in the --load_path argument.
 
-## Reference
+## Evaluation
+
+For creating evaluation plots and running evaluation dialogues, please have a look in the README of the policy folder.
+
+## References
 
 ```
-@article{schulman2017proximal,
-  title={Proximal policy optimization algorithms},
-  author={Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
-  journal={arXiv preprint arXiv:1707.06347},
-  year={2017}
+@article{schulman2017proximal,
+  title={Proximal policy optimization algorithms},
+  author={Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
+  journal={arXiv preprint arXiv:1707.06347},
+  year={2017}
+}
+
+@inproceedings{zhu-etal-2020-convlab,
+  title = "{C}onv{L}ab-2: An Open-Source Toolkit for Building, Evaluating, and Diagnosing Dialogue Systems",
+  author = "Zhu, Qi and Zhang, Zheng and Fang, Yan and Li, Xiang and Takanobu, Ryuichi and Li, Jinchao and Peng, Baolin and Gao, Jianfeng and Zhu, Xiaoyan and Huang, Minlie",
+  booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+  month = jul,
+  year = "2020",
+  address = "Online",
+  publisher = "Association for Computational Linguistics",
+  url = "https://aclanthology.org/2020.acl-demos.19",
+  doi = "10.18653/v1/2020.acl-demos.19",
+  pages = "142--149"
 }
 ```
\ No newline at end of file
diff --git a/convlab/policy/ppo/semantic_level_config.json b/convlab/policy/ppo/semantic_level_config.json
index a4a24598..ac266e27 100644
--- a/convlab/policy/ppo/semantic_level_config.json
+++ b/convlab/policy/ppo/semantic_level_config.json
@@ -5,7 +5,7 @@
     "pretrained_load_path": "",
     "batchsz": 1000,
     "seed": 0,
-    "epoch": 50,
+    "epoch": 10,
    "eval_frequency": 5,
     "process_num": 4,
     "sys_semantic_to_usr": false,
diff --git a/convlab/policy/vtrace_DPT/README.md b/convlab/policy/vtrace_DPT/README.md
new file mode 100644
index 00000000..0e6ed7d9
--- /dev/null
+++ b/convlab/policy/vtrace_DPT/README.md
@@ -0,0 +1,65 @@
+# Dynamic Dialogue Policy Transformer (DDPT)
+
+The dynamic dialogue policy transformer (Geishauser et al. 2022) is a model built for continual reinforcement learning. It uses a pre-trained RoBERTa language model to construct embeddings for each piece of state information and for every domain, slot and value in the action set. As a consequence, it can be used for different ontologies and is able to deal with new state information as well as new actions. The backbone architecture is a transformer encoder-decoder.
+
+It uses the CLEAR algorithm (Rolnick et al. 2019) for continual reinforcement learning, which builds on top of VTRACE (Espeholt et al. 2018). The current folder supports only training in a stationary environment without continual learning, in which case VTRACE is used as the algorithm. A simplified sketch of the V-trace target computation is given below.
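+
+The snippet below is a simplified, generic sketch of how V-trace value targets (Espeholt et al. 2018) can be computed from behaviour- and target-policy log-probabilities. It is meant purely as an illustration of the algorithm and is not the implementation used in this folder; all names are placeholders.
+
+```python
+import torch
+
+
+@torch.no_grad()
+def vtrace_targets(rewards, values, bootstrap_value, target_log_probs,
+                   behaviour_log_probs, gamma=0.99, rho_bar=1.0, c_bar=1.0):
+    """Compute V-trace value targets v_t for one trajectory of length T.
+
+    rewards, values, target_log_probs, behaviour_log_probs: tensors of shape [T];
+    bootstrap_value: value estimate V(x_T) for the state after the last step.
+    """
+    # Truncated importance weights between target and behaviour policy.
+    rhos = torch.exp(target_log_probs - behaviour_log_probs)
+    clipped_rhos = torch.clamp(rhos, max=rho_bar)
+    clipped_cs = torch.clamp(rhos, max=c_bar)
+
+    next_values = torch.cat([values[1:], bootstrap_value.reshape(1)])
+    deltas = clipped_rhos * (rewards + gamma * next_values - values)
+
+    # Backward recursion: v_t - V(x_t) = delta_t + gamma * c_t * (v_{t+1} - V(x_{t+1})).
+    corrections = []
+    next_correction = torch.zeros(())
+    for t in reversed(range(rewards.shape[0])):
+        next_correction = deltas[t] + gamma * clipped_cs[t] * next_correction
+        corrections.append(next_correction)
+    corrections.reverse()
+    return values + torch.stack(corrections)
+```
+
+The resulting targets serve as regression targets for the critic and enter the policy-gradient advantage, which is how VTRACE corrects for the mismatch between the behaviour policy that collected the dialogues and the current policy.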
+
+## Supervised pre-training
+
+If you want to pre-train the model on a dataset, use the command
+
+```sh
+$ python supervised/train_supervised.py --dataset_name=DATASET_NAME --seed=SEED --model_path=""
+```
+
+The first time you run that command, it will take longer as the dataset needs to be pre-processed.
+
+This will create a corresponding experiments folder under supervised/experiments, where the model is saved in /save.
+
+You can specify the dataset that you would like to use, e.g. "multiwoz21" or "sgd". You can also specify a model_path if you already have a pre-trained model, for instance when you first train on SGD before fine-tuning on multiwoz21 data.
+
+You can specify hyperparameters such as epoch, supervised_lr and data_percentage (how much of the data you want to use) in the config.json file.
+
+## RL training
+
+Starting an RL training is as easy as executing
+
+```sh
+$ python train.py --path=your_environment_config --seed=SEED
+```
+
+One example of an environment config is **semantic_level_config.json**, which specifies the training parameters, for instance
+
+- load_path: provide a path to initialise the model with a pre-trained model, omitting the .pol.mdl ending
+- process_num: the number of processes to use during evaluation to speed it up
+- num_eval_dialogues: how many evaluation dialogues should be used
+- eval_frequency: after how many training dialogues an evaluation should be performed
+- total_dialogues: how many training dialogues should be done in total
+- new_dialogues: how many new dialogues should be collected before a policy update
+
+Moreover, you can specify the full dialogue pipeline here, such as the user policy, NLU for system and user, etc.
+
+Parameters that are tied to the RL algorithm and the model architecture can be changed in config.json.
+
+## Evaluation
+
+For creating evaluation plots and running evaluation dialogues, please have a look in the README of the policy folder.
+
+## References
+
+```
+@inproceedings{NEURIPS2019_fa7cdfad,
+ author = {Rolnick, David and Ahuja, Arun and Schwarz, Jonathan and Lillicrap, Timothy and Wayne, Gregory},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Experience Replay for Continual Learning},
+ url = {https://proceedings.neurips.cc/paper/2019/file/fa7cdfad1a5aaf8370ebeda47a1ff1c3-Paper.pdf},
+ volume = {32},
+ year = {2019}
+}
+```
\ No newline at end of file
--
GitLab