Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
Loading items

Target

Select target project
  • general/dsml/trippy-public
1 result
Select Git revision
Loading items
Show changes
Commits on Source (3)
Showing
with 2467 additions and 226 deletions
# Store binaries in LFS
## Custom paths
data/ filter=lfs diff=lfs merge=lfs -text
## Archive/Compressed
*.7z filter=lfs diff=lfs merge=lfs -text
*.cpio filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.iso filter=lfs diff=lfs merge=lfs -text
*.bz filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.bzip filter=lfs diff=lfs merge=lfs -text
*.bzip2 filter=lfs diff=lfs merge=lfs -text
*.cab filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.gzip filter=lfs diff=lfs merge=lfs -text
*.lz filter=lfs diff=lfs merge=lfs -text
*.lzma filter=lfs diff=lfs merge=lfs -text
*.lzo filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.z filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.ace filter=lfs diff=lfs merge=lfs -text
*.dmg filter=lfs diff=lfs merge=lfs -text
*.dd filter=lfs diff=lfs merge=lfs -text
*.apk filter=lfs diff=lfs merge=lfs -text
*.ear filter=lfs diff=lfs merge=lfs -text
*.jar filter=lfs diff=lfs merge=lfs -text
*.deb filter=lfs diff=lfs merge=lfs -text
*.cue filter=lfs diff=lfs merge=lfs -text
*.dump filter=lfs diff=lfs merge=lfs -text
## Image
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.psd filter=lfs diff=lfs merge=lfs -text
*.bmp filter=lfs diff=lfs merge=lfs -text
*.dng filter=lfs diff=lfs merge=lfs -text
*.cdr filter=lfs diff=lfs merge=lfs -text
*.indd filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
*.tif filter=lfs diff=lfs merge=lfs -text
*.psp filter=lfs diff=lfs merge=lfs -text
*.tga filter=lfs diff=lfs merge=lfs -text
*.eps filter=lfs diff=lfs merge=lfs -text
*.svg filter=lfs diff=lfs merge=lfs -text
## Documents
*.pdf filter=lfs diff=lfs merge=lfs -text
*.doc filter=lfs diff=lfs merge=lfs -text
*.docx filter=lfs diff=lfs merge=lfs -text
*.xls filter=lfs diff=lfs merge=lfs -text
*.xlsx filter=lfs diff=lfs merge=lfs -text
*.ppt filter=lfs diff=lfs merge=lfs -text
*.pptx filter=lfs diff=lfs merge=lfs -text
*.ppz filter=lfs diff=lfs merge=lfs -text
*.dot filter=lfs diff=lfs merge=lfs -text
*.dotx filter=lfs diff=lfs merge=lfs -text
*.lwp filter=lfs diff=lfs merge=lfs -text
*.odm filter=lfs diff=lfs merge=lfs -text
*.odt filter=lfs diff=lfs merge=lfs -text
*.ott filter=lfs diff=lfs merge=lfs -text
*.ods filter=lfs diff=lfs merge=lfs -text
*.ots filter=lfs diff=lfs merge=lfs -text
*.odp filter=lfs diff=lfs merge=lfs -text
*.otp filter=lfs diff=lfs merge=lfs -text
*.odg filter=lfs diff=lfs merge=lfs -text
*.otg filter=lfs diff=lfs merge=lfs -text
*.wps filter=lfs diff=lfs merge=lfs -text
*.wpd filter=lfs diff=lfs merge=lfs -text
*.wpt filter=lfs diff=lfs merge=lfs -text
*.xps filter=lfs diff=lfs merge=lfs -text
*.ttf filter=lfs diff=lfs merge=lfs -text
*.otf filter=lfs diff=lfs merge=lfs -text
*.dvi filter=lfs diff=lfs merge=lfs -text
*.pages filter=lfs diff=lfs merge=lfs -text
*.key filter=lfs diff=lfs merge=lfs -text
## Audio/Video
*.mpg filter=lfs diff=lfs merge=lfs -text
*.mpeg filter=lfs diff=lfs merge=lfs -text
*.mp3 filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
*.mkv filter=lfs diff=lfs merge=lfs -text
*.3gp filter=lfs diff=lfs merge=lfs -text
*.flv filter=lfs diff=lfs merge=lfs -text
*.m4v filter=lfs diff=lfs merge=lfs -text
*.ogg filter=lfs diff=lfs merge=lfs -text
*.mov filter=lfs diff=lfs merge=lfs -text
*.wmv filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text
## VM
*.vfd filter=lfs diff=lfs merge=lfs -text
*.vhd filter=lfs diff=lfs merge=lfs -text
*.vmdk filter=lfs diff=lfs merge=lfs -text
*.vmsd filter=lfs diff=lfs merge=lfs -text
*.vmsn filter=lfs diff=lfs merge=lfs -text
*.vmss filter=lfs diff=lfs merge=lfs -text
*.dsk filter=lfs diff=lfs merge=lfs -text
*.vdi filter=lfs diff=lfs merge=lfs -text
*.cow filter=lfs diff=lfs merge=lfs -text
*.qcow filter=lfs diff=lfs merge=lfs -text
*.qcow2 filter=lfs diff=lfs merge=lfs -text
*.qed filter=lfs diff=lfs merge=lfs -text
## Other
*.exe filter=lfs diff=lfs merge=lfs -text
*.sxi filter=lfs diff=lfs merge=lfs -text
*.dat filter=lfs diff=lfs merge=lfs -text
*.data filter=lfs diff=lfs merge=lfs -text
......@@ -2,14 +2,30 @@
# Parameters ------------------------------------------------------
# --- Sim-M dataset
#TASK="sim-m"
#DATA_DIR="data/simulated-dialogue/sim-M"
#DATASET_CONFIG="dataset_config/sim-m.json"
# --- Sim-R dataset
#TASK="sim-r"
#DATA_DIR="data/simulated-dialogue/sim-R"
#DATASET_CONFIG="dataset_config/sim-r.json"
# --- WOZ 2.0 dataset
#TASK="woz2"
#DATA_DIR="data/woz2"
#DATASET_CONFIG="dataset_config/woz2.json"
# --- MultiWOZ 2.1 legacy version dataset
#TASK="multiwoz21_legacy"
#DATA_DIR="data/MULTIWOZ2.1"
#DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 dataset
TASK="multiwoz21"
DATA_DIR="data/MULTIWOZ2.1"
DATA_DIR="data/multiwoz/data/MultiWOZ_2.1"
DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 in ConvLab3's unified data format
#TASK="unified"
#DATA_DIR=""
#DATASET_CONFIG="dataset_config/unified_multiwoz21.json"
# Project paths etc. ----------------------------------------------
......@@ -29,34 +45,28 @@ for step in train dev test; do
python3 run_dst.py \
--task_name=${TASK} \
--data_dir=${DATA_DIR} \
--dataset_config=dataset_config/${TASK}.json \
--dataset_config=${DATASET_CONFIG} \
--model_type="roberta" \
--model_name_or_path="roberta-base" \
--do_lower_case \
--learning_rate=1e-4 \
--num_train_epochs=10 \
--max_seq_length=180 \
--per_gpu_train_batch_size=16 \
--per_gpu_eval_batch_size=16 \
--per_gpu_train_batch_size=32 \
--per_gpu_eval_batch_size=32 \
--output_dir=${OUT_DIR} \
--save_epochs=2 \
--logging_steps=10 \
--warmup_proportion=0.1 \
--eval_all_checkpoints \
--adam_epsilon=1e-6 \
--weight_decay=0.01 \
--label_value_repetitions \
--swap_utterances \
--append_history \
--use_history_labels \
${args_add} \
2>&1 | tee ${OUT_DIR}/${step}.log
if [ "$step" = "dev" ] || [ "$step" = "test" ]; then
python3 metric_bert_dst.py \
${TASK} \
dataset_config/${TASK}.json \
"${OUT_DIR}/pred_res.${step}*json" \
python3 metric_dst.py \
--dataset_config=${DATASET_CONFIG} \
--file_list="${OUT_DIR}/pred_res.${step}*json" \
2>&1 | tee ${OUT_DIR}/eval_pred_${step}.log
fi
done
......@@ -2,14 +2,30 @@
# Parameters ------------------------------------------------------
# --- Sim-M dataset
#TASK="sim-m"
#DATA_DIR="data/simulated-dialogue/sim-M"
#DATASET_CONFIG="dataset_config/sim-m.json"
# --- Sim-R dataset
#TASK="sim-r"
#DATA_DIR="data/simulated-dialogue/sim-R"
#DATASET_CONFIG="dataset_config/sim-r.json"
# --- WOZ 2.0 dataset
#TASK="woz2"
#DATA_DIR="data/woz2"
#DATASET_CONFIG="dataset_config/woz2.json"
# --- MultiWOZ 2.1 legacy version dataset
#TASK="multiwoz21_legacy"
#DATA_DIR="data/MULTIWOZ2.1"
#DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 dataset
TASK="multiwoz21"
DATA_DIR="data/MULTIWOZ2.1"
DATA_DIR="data/multiwoz/data/MultiWOZ_2.1"
DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 in ConvLab3's unified data format
#TASK="unified"
#DATA_DIR=""
#DATASET_CONFIG="dataset_config/unified_multiwoz21.json"
AUX_TASK="cola"
AUX_DATA_DIR="data/aux/roberta_base_cased_lower"
......@@ -24,7 +40,7 @@ mkdir -p ${OUT_DIR}
for step in train dev test; do
args_add=""
if [ "$step" = "train" ]; then
args_add="--do_train --predict_type=dummy"
args_add="--do_train --predict_type=dummy" # INFO: For sim-M, we recommend to add "--svd=0.3"
elif [ "$step" = "dev" ] || [ "$step" = "test" ]; then
args_add="--do_eval --predict_type=${step}"
fi
......@@ -32,7 +48,7 @@ for step in train dev test; do
python3 run_dst_mtl.py \
--task_name=${TASK} \
--data_dir=${DATA_DIR} \
--dataset_config=dataset_config/${TASK}.json \
--dataset_config=${DATASET_CONFIG} \
--model_type="roberta" \
--model_name_or_path="roberta-base" \
--do_lower_case \
......@@ -43,16 +59,11 @@ for step in train dev test; do
--per_gpu_eval_batch_size=1 \
--output_dir=${OUT_DIR} \
--save_epochs=2 \
--logging_steps=10 \
--warmup_proportion=0.1 \
--eval_all_checkpoints \
--adam_epsilon=1e-6 \
--weight_decay=0.01 \
--heads_dropout=0.1 \
--label_value_repetitions \
--swap_utterances \
--append_history \
--use_history_labels \
--delexicalize_sys_utts \
--class_aux_feats_inform \
--class_aux_feats_ds \
......@@ -65,10 +76,9 @@ for step in train dev test; do
2>&1 | tee ${OUT_DIR}/${step}.log
if [ "$step" = "dev" ] || [ "$step" = "test" ]; then
python3 metric_bert_dst.py \
${TASK} \
dataset_config/${TASK}.json \
"${OUT_DIR}/pred_res.${step}*json" \
python3 metric_dst.py \
--dataset_config=${DATASET_CONFIG} \
--file_list="${OUT_DIR}/pred_res.${step}*json" \
2>&1 | tee ${OUT_DIR}/eval_pred_${step}.log
fi
done
......@@ -2,14 +2,30 @@
# Parameters ------------------------------------------------------
# --- Sim-M dataset
#TASK="sim-m"
#DATA_DIR="data/simulated-dialogue/sim-M"
#DATASET_CONFIG="dataset_config/sim-m.json"
# --- Sim-R dataset
#TASK="sim-r"
#DATA_DIR="data/simulated-dialogue/sim-R"
TASK="woz2"
DATA_DIR="data/woz2"
#TASK="multiwoz21"
#DATASET_CONFIG="dataset_config/sim-r.json"
# --- WOZ 2.0 dataset
#TASK="woz2"
#DATA_DIR="data/woz2"
#DATASET_CONFIG="dataset_config/woz2.json"
# --- MultiWOZ 2.1 legacy version dataset
#TASK="multiwoz21_legacy"
#DATA_DIR="data/MULTIWOZ2.1"
#DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 dataset
TASK="multiwoz21"
DATA_DIR="data/multiwoz/data/MultiWOZ_2.1"
DATASET_CONFIG="dataset_config/multiwoz21.json"
# --- MultiWOZ 2.1 in ConvLab3's unified data format
#TASK="unified"
#DATA_DIR=""
#DATASET_CONFIG="dataset_config/unified_multiwoz21.json"
# Project paths etc. ----------------------------------------------
......@@ -21,7 +37,7 @@ mkdir -p ${OUT_DIR}
for step in train dev test; do
args_add=""
if [ "$step" = "train" ]; then
args_add="--do_train --predict_type=dummy"
args_add="--do_train --predict_type=dummy" # INFO: For sim-M, we recommend to add "--svd=0.3"
elif [ "$step" = "dev" ] || [ "$step" = "test" ]; then
args_add="--do_eval --predict_type=${step}"
fi
......@@ -29,7 +45,7 @@ for step in train dev test; do
python3 run_dst.py \
--task_name=${TASK} \
--data_dir=${DATA_DIR} \
--dataset_config=dataset_config/${TASK}.json \
--dataset_config=${DATASET_CONFIG} \
--model_type="bert" \
--model_name_or_path="bert-base-uncased" \
--do_lower_case \
......@@ -40,14 +56,9 @@ for step in train dev test; do
--per_gpu_eval_batch_size=1 \
--output_dir=${OUT_DIR} \
--save_epochs=2 \
--logging_steps=10 \
--warmup_proportion=0.1 \
--eval_all_checkpoints \
--adam_epsilon=1e-6 \
--label_value_repetitions \
--swap_utterances \
--append_history \
--use_history_labels \
--delexicalize_sys_utts \
--class_aux_feats_inform \
--class_aux_feats_ds \
......@@ -55,10 +66,9 @@ for step in train dev test; do
2>&1 | tee ${OUT_DIR}/${step}.log
if [ "$step" = "dev" ] || [ "$step" = "test" ]; then
python3 metric_bert_dst.py \
${TASK} \
dataset_config/${TASK}.json \
"${OUT_DIR}/pred_res.${step}*json" \
python3 metric_dst.py \
--dataset_config=${DATASET_CONFIG} \
--file_list="${OUT_DIR}/pred_res.${step}*json" \
2>&1 | tee ${OUT_DIR}/eval_pred_${step}.log
fi
done
#!/bin/bash
# Train a TripPy DST model and evaluate it on the dev and test splits.
# Select the dataset by uncommenting exactly one TASK/DATA_DIR pair.

# Parameters ------------------------------------------------------
#TASK="sim-m"
#DATA_DIR="data/simulated-dialogue/sim-M"
#TASK="sim-r"
#DATA_DIR="data/simulated-dialogue/sim-R"
TASK="woz2"
DATA_DIR="data/woz2"
#TASK="multiwoz21"
#DATA_DIR="data/MULTIWOZ2.1"

# Project paths etc. ----------------------------------------------
OUT_DIR=results
mkdir -p ${OUT_DIR}

# Main ------------------------------------------------------------
# One pass per phase: train first, then predict on dev and test.
for phase in train dev test; do
    case "${phase}" in
        train)
            args_add="--do_train --predict_type=dummy"
            ;;
        dev|test)
            args_add="--do_eval --predict_type=${phase}"
            ;;
        *)
            args_add=""
            ;;
    esac

    python3 run_dst.py \
        --task_name=${TASK} \
        --data_dir=${DATA_DIR} \
        --dataset_config=dataset_config/${TASK}.json \
        --model_type="bert" \
        --model_name_or_path="bert-base-uncased" \
        --do_lower_case \
        --learning_rate=1e-4 \
        --num_train_epochs=10 \
        --max_seq_length=180 \
        --per_gpu_train_batch_size=48 \
        --per_gpu_eval_batch_size=1 \
        --output_dir=${OUT_DIR} \
        --save_epochs=2 \
        --logging_steps=10 \
        --warmup_proportion=0.1 \
        --eval_all_checkpoints \
        --adam_epsilon=1e-6 \
        --label_value_repetitions \
        ${args_add} \
        2>&1 | tee ${OUT_DIR}/${phase}.log

    # Score the prediction files produced by the eval phases.
    case "${phase}" in
        dev|test)
            python3 metric_bert_dst.py \
                ${TASK} \
                dataset_config/${TASK}.json \
                "${OUT_DIR}/pred_res.${phase}*json" \
                2>&1 | tee ${OUT_DIR}/eval_pred_${phase}.log
            ;;
    esac
done
# *** The upcoming update will add ConvLab-3 support, faster caching, transformers 4.X support and more ***
## Introduction
TripPy is a new approach to dialogue state tracking (DST) which makes use of various copy mechanisms to fill slots with values. Our model has no need to maintain a list of candidate values. Instead, all values are extracted from the dialog context on-the-fly.
TripPy is an approach to dialogue state tracking (DST) that makes use of various copy mechanisms to fill slots with values. Our model has no need to maintain a list of candidate values. Instead, all values are extracted from the dialog context on-the-fly.
A slot is filled by one of three copy mechanisms:
1. Span prediction may extract values directly from the user input;
2. a value may be copied from a system inform memory that keeps track of the system’s inform operations;
3. a value may be copied over from a different slot that is already contained in the dialog state to resolve coreferences within and across domains.
Our approach combines the advantages of span-based slot filling methods with memory methods to avoid the use of value picklists altogether. We argue that our strategy simplifies the DST task while at the same time achieving state-of-the-art performance on various popular evaluation sets including MultiWOZ 2.1.
## Recent updates
- 2022.12.19: Added support for ConvLab-3's unified data format. Added faster caching. Added transformers 4 support.
- 2022.02.15: Added support for MultiWOZ versions 2.2, 2.3, 2.4
## How to run
Two example scripts are provided for how to use TripPy. `DO.example.simple` will train and evaluate a simpler model, whereas `DO.example.advanced` uses the parameters that will result in performance similar to the reported ones. `DO.example.recommended` uses RoBERTa as encoder and the currently recommended set of hyperparameters. For more challenging datasets with longer dialogues, better performance may be achieved by using the maximum sequence length of 512.
Two example scripts are provided for how to use TripPy. `DO.example` will train and evaluate a model with recommended settings. See below list for expected performance per dataset. `DO.example.paper` uses the parameters that were used for experiments in our paper "TripPy: A Triple Copy Strategy for Value Independent Neural Dialog State Tracking". Thus, performance will be similar to the reported ones. For more challenging datasets with longer dialogues, better performance may be achieved by using the maximum sequence length of 512.
`DO.example.mtl` will train a model with multi-task learning (MTL) using an auxiliary task (See our paper "Out-of-Task Training for Dialog State Tracking Models" for details).
`DO.example.mtl` will train a model with multi-task learning (MTL) using an auxiliary task, using the parameters that we used in our paper "Out-of-Task Training for Dialog State Tracking Models".
## Datasets
......@@ -22,13 +26,25 @@ Supported datasets are:
- sim-R (https://github.com/google-research-datasets/simulated-dialogue.git)
- WOZ 2.0 (see data/)
- MultiWOZ 2.0 (https://github.com/budzianowski/multiwoz.git)
- MultiWOZ 2.1 (see data/, https://github.com/budzianowski/multiwoz.git)
- MultiWOZ 2.1 (https://github.com/budzianowski/multiwoz.git)
- MultiWOZ 2.1 legacy version (see data/)
- MultiWOZ 2.2 (https://github.com/budzianowski/multiwoz.git)
- MultiWOZ 2.3 (https://github.com/lexmen318/MultiWOZ-coref.git)
- MultiWOZ 2.4 (https://github.com/smartyfh/MultiWOZ2.4.git)
- Unified data format (currently supported: MultiWOZ 2.1) (see https://github.com/ConvLab/ConvLab-3)
See the README file in `data/` for more details on how to obtain and prepare the datasets for use in TripPy.
The ```--task_name``` is
- 'sim-m', for sim-M
- 'sim-r', for sim-R
- 'woz2', for WOZ 2.0
- 'multiwoz21', for MultiWOZ 2.0-2.4
- 'multiwoz21_legacy', for MultiWOZ 2.1 legacy version
With a sequence length of 180, you should expect the following average JGA:
- 53% for MultiWOZ 2.0
- 56% for MultiWOZ 2.1 legacy version
- 56% for MultiWOZ 2.1
- 56% for MultiWOZ 2.2
- 63% for MultiWOZ 2.3
......@@ -37,11 +53,17 @@ With a sequence length of 180, you should expect the following average JGA:
- 90% for sim-R
- 92% for WOZ 2.0
## ConvLab-3
TripPy is integrated in ConvLab-3 as a ready-to-use dialogue state tracker. A checkpoint is available at HuggingFace (see the ConvLab-3 repo for more details).
If you want to train your own TripPy model for ConvLab-3 from scratch, you can do so by using this code, setting ```--task_name='unified'```. The ```--data_dir``` parameter will be ignored in that case. Pick the file for ```--dataset_config``` according to the dataset you want to train for. For MultiWOZ, this would be 'dataset_config/unified_multiwoz21.json'.
## Requirements
- torch (tested: 1.4.0)
- transformers (tested: 2.9.1)
- tensorboardX (tested: 2.0)
- torch (tested: 1.8.0)
- transformers (tested: 4.18.0)
- tensorboardX (tested: 2.1)
## Citation
......
## Supported datasets
Datasets should go into the ```data/``` folder.
### sim-M & sim-R:
```
git clone https://github.com/google-research-datasets/simulated-dialogue.git
```
### WOZ 2.0
The original URL (http://mi.eng.cam.ac.uk/~nm480/woz_2.0.zip) is not active anymore.
We provide the dataset in ```data/woz2```.
### MultiWOZ 2.0, 2.1 & 2.2
```
git clone https://github.com/budzianowski/multiwoz.git
unzip multiwoz/data/MultiWOZ_2.0.zip -d multiwoz/data/
unzip multiwoz/data/MultiWOZ_2.1.zip -d multiwoz/data/
mv multiwoz/data/MULTIWOZ2\ 2/ multiwoz/data/MultiWOZ_2.0
python3 multiwoz/data/MultiWOZ_2.2/convert_to_multiwoz_format.py --multiwoz21_data_dir=multiwoz/data/MultiWOZ_2.1 --output_file=multiwoz/data/MultiWOZ_2.2/data.json
cp multiwoz/data/MultiWOZ_2.1/valListFile.txt multiwoz/data/MultiWOZ_2.2/
cp multiwoz/data/MultiWOZ_2.1/testListFile.txt multiwoz/data/MultiWOZ_2.2/
python split_multiwoz_data.py --data_dir multiwoz/data/MultiWOZ_2.0
python split_multiwoz_data.py --data_dir multiwoz/data/MultiWOZ_2.1
python split_multiwoz_data.py --data_dir multiwoz/data/MultiWOZ_2.2
```
### MultiWOZ 2.1 legacy version
With "legacy version" we refer to the mid 2019 version of MultiWOZ 2.1, which can be found at https://doi.org/10.17863/CAM.41572
We used this version when we built TripPy. We provide the exact data that we used in ```data/MULTIWOZ2.1_legacy```.
The dataset has since been updated and the most recent version of MultiWOZ 2.1 differs slightly from the version we used for the experiments that we report in [TripPy: A Triple Copy Strategy for Value Independent Neural Dialog State Tracking](https://www.aclweb.org/anthology/2020.sigdial-1.4/). Our code supports both the new version as well as the legacy version of MultiWOZ.
### MultiWOZ 2.3
```
git clone https://github.com/lexmen318/MultiWOZ-coref.git
```
### MultiWOZ 2.4
```
git clone https://github.com/smartyfh/MultiWOZ2.4.git
```
# coding=utf-8
#
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
def main():
    """Split a MultiWOZ-style ``data.json`` into train/val/test dialogue files.

    Reads ``data.json`` from ``--data_dir`` together with the validation and
    test dialogue-ID list files (``valListFile``/``testListFile``, preferring
    the ``.json`` variant and falling back to ``.txt``), partitions the
    dialogues by ID, and writes ``train_dials.json``, ``val_dials.json`` and
    ``test_dials.json`` back into the same directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True, help="Task database.")
    args = parser.parse_args()

    with open(os.path.join(args.data_dir, "data.json"), encoding="utf-8") as f:
        data = json.load(f)

    def _read_id_list(basename):
        # Newer MultiWOZ releases ship the lists as .json, older ones as .txt;
        # both are plain newline-separated dialogue IDs. Return a set so the
        # per-dialogue membership tests below are O(1) instead of O(n).
        path = os.path.join(args.data_dir, basename + ".json")
        if not os.path.isfile(path):
            path = os.path.join(args.data_dir, basename + ".txt")
        with open(path, encoding="utf-8") as f:
            return set(f.read().splitlines())

    val_set = _read_id_list("valListFile")
    test_set = _read_id_list("testListFile")

    train, val, test = {}, {}, {}
    for dial_id, dial in data.items():
        if dial_id in val_set:
            val[dial_id] = dial
        elif dial_id in test_set:
            test[dial_id] = dial
        else:
            train[dial_id] = dial
    # Report split sizes (total, train, val, test) for a quick sanity check.
    print(len(data), len(train), len(val), len(test))

    for out_name, split in (("train_dials.json", train),
                            ("val_dials.json", val),
                            ("test_dials.json", test)):
        with open(os.path.join(args.data_dir, out_name), "w", encoding="utf-8") as f:
            json.dump(split, f, indent=4)


if __name__ == "__main__":
    main()
# coding=utf-8
#
# Copyright 2020 Heinrich Heine University Duesseldorf
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Part of this code is based on the source code of BERT-DST
# (arXiv:1907.03040)
......@@ -23,24 +23,39 @@ import json
import dataset_woz2
import dataset_sim
import dataset_multiwoz21
import dataset_multiwoz21_legacy
import dataset_aux_task
import dataset_unified
class DataProcessor(object):
dataset_name = ""
class_types = []
slot_list = []
label_maps = {}
def __init__(self, dataset_config):
# Load dataset config file.
with open(dataset_config, "r", encoding='utf-8') as f:
raw_config = json.load(f)
self.class_types = raw_config['class_types']
self.slot_list = raw_config['slots']
self.label_maps = raw_config['label_maps']
self.class_types = raw_config['class_types'] # Required
self.slot_list = raw_config['slots'] if 'slots' in raw_config else []
self.label_maps = raw_config['label_maps'] if 'label_maps' in raw_config else {}
self.dataset_name = raw_config['dataset_name'] if 'dataset_name' in raw_config else ""
# If no slot list is provided, generate it from the data.
if len(self.slot_list) == 0:
self.slot_list = self._get_slot_list()
def _get_slot_list(self):
raise NotImplementedError()
def get_train_examples(self, data_dir, **args):
def get_train_examples(self):
raise NotImplementedError()
def get_dev_examples(self, data_dir, **args):
def get_dev_examples(self):
raise NotImplementedError()
def get_test_examples(self, data_dir, **args):
def get_test_examples(self):
raise NotImplementedError()
......@@ -61,16 +76,30 @@ class Woz2Processor(DataProcessor):
class Multiwoz21Processor(DataProcessor):
def get_train_examples(self, data_dir, args):
return dataset_multiwoz21.create_examples(os.path.join(data_dir, 'train_dials.json'),
'train', self.class_types, self.slot_list, self.label_maps, **args)
def get_dev_examples(self, data_dir, args):
return dataset_multiwoz21.create_examples(os.path.join(data_dir, 'val_dials.json'),
'dev', self.class_types, self.slot_list, self.label_maps, **args)
def get_test_examples(self, data_dir, args):
return dataset_multiwoz21.create_examples(os.path.join(data_dir, 'test_dials.json'),
'test', self.class_types, self.slot_list, self.label_maps, **args)
class Multiwoz21LegacyProcessor(DataProcessor):
def get_train_examples(self, data_dir, args):
return dataset_multiwoz21_legacy.create_examples(os.path.join(data_dir, 'train_dials.json'),
os.path.join(data_dir, 'dialogue_acts.json'),
'train', self.slot_list, self.label_maps, **args)
def get_dev_examples(self, data_dir, args):
return dataset_multiwoz21.create_examples(os.path.join(data_dir, 'val_dials.json'),
return dataset_multiwoz21_legacy.create_examples(os.path.join(data_dir, 'val_dials.json'),
os.path.join(data_dir, 'dialogue_acts.json'),
'dev', self.slot_list, self.label_maps, **args)
def get_test_examples(self, data_dir, args):
return dataset_multiwoz21.create_examples(os.path.join(data_dir, 'test_dials.json'),
return dataset_multiwoz21_legacy.create_examples(os.path.join(data_dir, 'test_dials.json'),
os.path.join(data_dir, 'dialogue_acts.json'),
'test', self.slot_list, self.label_maps, **args)
......@@ -89,6 +118,23 @@ class SimProcessor(DataProcessor):
'test', self.slot_list, **args)
class UnifiedDatasetProcessor(DataProcessor):
def _get_slot_list(self):
return dataset_unified.get_slot_list(self.dataset_name)
def get_train_examples(self, data_dir, args):
return dataset_unified.create_examples('train', self.dataset_name, self.class_types,
self.slot_list, self.label_maps, **args)
def get_dev_examples(self, data_dir, args):
return dataset_unified.create_examples('validation', self.dataset_name, self.class_types,
self.slot_list, self.label_maps, **args)
def get_test_examples(self, data_dir, args):
return dataset_unified.create_examples('test', self.dataset_name, self.class_types,
self.slot_list, self.label_maps, **args)
class AuxTaskProcessor(object):
def get_aux_task_examples(self, data_dir, data_name, max_seq_length):
file_path = os.path.join(data_dir, '{}_train.json'.format(data_name))
......@@ -99,4 +145,6 @@ PROCESSORS = {"woz2": Woz2Processor,
"sim-m": SimProcessor,
"sim-r": SimProcessor,
"multiwoz21": Multiwoz21Processor,
"multiwoz21_legacy": Multiwoz21LegacyProcessor,
"unified": UnifiedDatasetProcessor,
"aux_task": AuxTaskProcessor}
# coding=utf-8
#
# Copyright 2020 Heinrich Heine University Duesseldorf
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
This diff is collapsed.
# coding=utf-8
#
# Copyright 2020 Heinrich Heine University Duesseldorf
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Part of this code is based on the source code of BERT-DST
# (arXiv:1907.03040)
......@@ -19,6 +19,7 @@
import json
import re
from tqdm import tqdm
from utils_dst import (DSTExample, convert_to_unicode)
......@@ -64,54 +65,6 @@ ACTS_DICT = {'taxi-depart': 'taxi-departure',
}
LABEL_MAPS = {} # Loaded from file
# Load dialogue_acts.json and collect, per (dialogue, turn, slot),
# the values that the system informed.
def load_acts(input_file):
    """Return a dict mapping (dialogue_id, turn_id, slot) -> [informed value]."""
    with open(input_file) as handle:
        acts = json.load(handle)

    informed = {}
    for dial_id, turns in acts.items():
        for turn_id, annotation in turns.items():
            # Only process turns that actually carry an annotation dict.
            if not isinstance(annotation, dict):
                continue
            # MultiWOZ 2.2 format nests the acts under 'dialog_act' and
            # uses a flat turn numbering where even indices are skipped.
            is_22_format = 'dialog_act' in annotation
            if is_22_format:
                if int(turn_id) % 2 == 0:
                    continue
                act_list = annotation['dialog_act']
            else:
                act_list = annotation
            for act_name in act_list:
                name_parts = act_name.lower().split('-')
                if name_parts[1] not in ('inform', 'recommend', 'select', 'book'):
                    continue
                for pair in act_list[act_name]:
                    slot_name = pair[0].lower()
                    value = pair[1].lower().strip()
                    if slot_name == 'none' or value in ('?', 'none'):
                        continue
                    slot = name_parts[0] + '-' + slot_name
                    slot = ACTS_DICT.get(slot, slot)
                    if is_22_format:
                        # Convert the 2.2 turn index back to the legacy numbering.
                        key = (dial_id, str(int(int(turn_id) / 2 + 1)), slot)
                    else:
                        key = (dial_id + '.json', turn_id, slot)
                    # In case of multiple mentioned values, keep the first
                    # informed one (the alternative would be to keep the last).
                    informed.setdefault(key, [value])
    return informed
def normalize_time(text):
text = re.sub("(\d{1})(a\.?m\.?|p\.?m\.?)", r"\1 \2", text) # am/pm without space
text = re.sub("(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)", r"\1\2:00 \3", text) # am/pm short to long form
......@@ -143,8 +96,7 @@ def normalize_text(text):
return text
# This should only contain label normalizations. All other mappings should
# be defined in LABEL_MAPS.
# This should only contain label normalizations, no label mappings.
def normalize_label(slot, value_label):
# Normalization of capitalization
if isinstance(value_label, str):
......@@ -166,7 +118,7 @@ def normalize_label(slot, value_label):
return "dontcare"
# Normalization of time slots
if "leaveAt" in slot or "arriveBy" in slot or slot == 'restaurant-book_time':
if "leave" in slot or "arrive" in slot or "time" in slot:
return normalize_time(value_label)
# Normalization
......@@ -203,18 +155,18 @@ def get_token_pos(tok_list, value_label):
return found, find_pos
def check_label_existence(value_label, usr_utt_tok):
def check_label_existence(value_label, usr_utt_tok, label_maps={}):
in_usr, usr_pos = get_token_pos(usr_utt_tok, value_label)
# If no hit even though there should be one, check for value label variants
if not in_usr and value_label in LABEL_MAPS:
for value_label_variant in LABEL_MAPS[value_label]:
if not in_usr and value_label in label_maps:
for value_label_variant in label_maps[value_label]:
in_usr, usr_pos = get_token_pos(usr_utt_tok, value_label_variant)
if in_usr:
break
return in_usr, usr_pos
def check_slot_referral(value_label, slot, seen_slots):
def check_slot_referral(value_label, slot, seen_slots, label_maps={}):
referred_slot = 'none'
if slot == 'hotel-stars' or slot == 'hotel-internet' or slot == 'hotel-parking':
return referred_slot
......@@ -231,8 +183,8 @@ def check_slot_referral(value_label, slot, seen_slots):
if seen_slots[s] == value_label:
referred_slot = s
break
elif value_label in LABEL_MAPS:
for value_label_variant in LABEL_MAPS[value_label]:
elif value_label in label_maps:
for value_label_variant in label_maps[value_label]:
if seen_slots[s] == value_label_variant:
referred_slot = s
break
......@@ -266,7 +218,7 @@ def delex_utt(utt, values, unk_token="[UNK]"):
# Fuzzy matching to label informed slot values
def check_slot_inform(value_label, inform_label):
def check_slot_inform(value_label, inform_label, label_maps={}):
result = False
informed_value = 'none'
vl = ' '.join(tokenize(value_label))
......@@ -277,8 +229,8 @@ def check_slot_inform(value_label, inform_label):
result = True
elif is_in_list(vl, il):
result = True
elif il in LABEL_MAPS:
for il_variant in LABEL_MAPS[il]:
elif il in label_maps:
for il_variant in label_maps[il]:
if vl == il_variant:
result = True
break
......@@ -288,8 +240,8 @@ def check_slot_inform(value_label, inform_label):
elif is_in_list(vl, il_variant):
result = True
break
elif vl in LABEL_MAPS:
for value_label_variant in LABEL_MAPS[vl]:
elif vl in label_maps:
for value_label_variant in label_maps[vl]:
if value_label_variant == il:
result = True
break
......@@ -305,15 +257,15 @@ def check_slot_inform(value_label, inform_label):
return result, informed_value
def get_turn_label(value_label, inform_label, sys_utt_tok, usr_utt_tok, slot, seen_slots, slot_last_occurrence):
def get_turn_label(value_label, inform_label, sys_utt_tok, usr_utt_tok, slot, seen_slots, slot_last_occurrence, label_maps={}):
usr_utt_tok_label = [0 for _ in usr_utt_tok]
informed_value = 'none'
referred_slot = 'none'
if value_label == 'none' or value_label == 'dontcare' or value_label == 'true' or value_label == 'false':
class_type = value_label
else:
in_usr, usr_pos = check_label_existence(value_label, usr_utt_tok)
is_informed, informed_value = check_slot_inform(value_label, inform_label)
in_usr, usr_pos = check_label_existence(value_label, usr_utt_tok, label_maps)
is_informed, informed_value = check_slot_inform(value_label, inform_label, label_maps)
if in_usr:
class_type = 'copy_value'
if slot_last_occurrence:
......@@ -327,7 +279,7 @@ def get_turn_label(value_label, inform_label, sys_utt_tok, usr_utt_tok, slot, se
elif is_informed:
class_type = 'inform'
else:
referred_slot = check_slot_referral(value_label, slot, seen_slots)
referred_slot = check_slot_referral(value_label, slot, seen_slots, label_maps)
if referred_slot != 'none':
class_type = 'refer'
else:
......@@ -335,6 +287,21 @@ def get_turn_label(value_label, inform_label, sys_utt_tok, usr_utt_tok, slot, se
return informed_value, referred_slot, usr_utt_tok_label, class_type
# Requestable slots, general acts and domain indicator slots
def is_request(slot, user_act, turn_domains):
    """Decide whether `slot` should get class type 'request' in this turn.

    Returns True when the user act(s) recorded for `slot` carry a
    request-like intent ('request', 'bye', 'thank', 'greet'), or when
    `slot` is a domain indicator slot ('<domain>-none') whose domain is
    among the domains active in this turn.
    """
    request_intents = ['request', 'bye', 'thank', 'greet']
    if slot in user_act:
        acts = user_act[slot]
        # The act entry may be a single act dict or a list of act dicts.
        candidates = acts if isinstance(acts, list) else [acts]
        if any(a['intent'] in request_intents for a in candidates):
            return True
    domain, slot_name = slot.split('-')
    return slot_name == 'none' and domain in turn_domains
def tokenize(utt):
utt_lower = convert_to_unicode(utt).lower()
utt_lower = normalize_text(utt_lower)
......@@ -346,27 +313,22 @@ def utt_to_token(utt):
return [tok for tok in map(lambda x: re.sub(" ", "", x), re.split("(\W+)", utt)) if len(tok) > 0]
def create_examples(input_file, acts_file, set_type, slot_list,
def create_examples(input_file, set_type, class_types, slot_list,
label_maps={},
append_history=False,
use_history_labels=False,
no_append_history=False,
no_use_history_labels=False,
no_label_value_repetitions=False,
swap_utterances=False,
label_value_repetitions=False,
delexicalize_sys_utts=False,
unk_token="[UNK]",
analyze=False):
"""Read a DST json file into a list of DSTExample."""
sys_inform_dict = load_acts(acts_file)
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)
global LABEL_MAPS
LABEL_MAPS = label_maps
examples = []
for dialog_id in input_data:
for d_itr, dialog_id in enumerate(tqdm(input_data)):
entry = input_data[dialog_id]
utterances = entry['log']
......@@ -376,6 +338,9 @@ def create_examples(input_file, acts_file, set_type, slot_list,
# First system utterance is empty, since multiwoz starts with user input
utt_tok_list = [[]]
mod_slots_list = [{}]
inform_dict_list = [{}]
user_act_dict_list = [{}]
mod_domains_list = [{}]
# Collect all utterances and their metadata
usr_sys_switch = True
......@@ -391,17 +356,46 @@ def create_examples(input_file, acts_file, set_type, slot_list,
if is_sys_utt:
turn_itr += 1
# Delexicalize sys utterance
if delexicalize_sys_utts and is_sys_utt:
inform_dict = {slot: 'none' for slot in slot_list}
for slot in slot_list:
if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict:
inform_dict[slot] = sys_inform_dict[(str(dialog_id), str(turn_itr), slot)]
utt_tok_list.append(delex_utt(utt['text'], inform_dict, unk_token)) # normalize utterances
else:
utt_tok_list.append(tokenize(utt['text'])) # normalize utterances
# Extract dialog_act information for sys and usr utts.
inform_dict = {}
user_act_dict = {}
modified_slots = {}
modified_domains = set()
if 'dialog_act' in utt:
for a in utt['dialog_act']:
aa = a.lower().split('-')
for i in utt['dialog_act'][a]:
s = i[0].lower()
# Some special intents are modeled as slots in TripPy
if aa[0] == 'general':
cs = "%s-%s" % (aa[0], aa[1])
else:
cs = "%s-%s" % (aa[0], s)
if cs in ACTS_DICT:
cs = ACTS_DICT[cs]
v = normalize_label(cs, i[1].lower().strip())
if cs in ['hotel-internet', 'hotel-parking']:
v = 'true'
modified_domains.add(aa[0]) # Remember domains
if is_sys_utt and aa[1] in ['inform', 'recommend', 'select', 'book'] and v != 'none':
if cs not in inform_dict:
inform_dict[cs] = []
inform_dict[cs].append(v)
elif not is_sys_utt:
if cs not in user_act_dict:
user_act_dict[cs] = []
user_act_dict[cs] = {'domain': aa[0], 'intent': aa[1], 'slot': s, 'value': v}
# INFO: Since the model has no mechanism to predict
# one among several informed value candidates, we
# keep only one informed value. For fairness, we
# apply a global rule:
for e in inform_dict:
# ... Option 1: Always keep first informed value
inform_dict[e] = list([inform_dict[e][0]])
# ... Option 2: Always keep last informed value
#inform_dict[e] = list([inform_dict[e][-1]])
else:
print("WARN: dialogue %s is missing dialog_act information." % dialog_id)
# If sys utt, extract metadata (identify and collect modified slots)
if is_sys_utt:
......@@ -424,8 +418,20 @@ def create_examples(input_file, acts_file, set_type, slot_list,
if cs in slot_list and cumulative_labels[cs] != value_label:
modified_slots[cs] = value_label
cumulative_labels[cs] = value_label
modified_domains.add(cs.split("-")[0]) # Remember domains
# Delexicalize sys utterance
if delexicalize_sys_utts and is_sys_utt:
utt_tok_list.append(delex_utt(utt['text'], inform_dict, unk_token)) # normalizes utterances
else:
utt_tok_list.append(tokenize(utt['text'])) # normalizes utterances
inform_dict_list.append(inform_dict.copy())
user_act_dict_list.append(user_act_dict.copy())
mod_slots_list.append(modified_slots.copy())
modified_domains = list(modified_domains)
modified_domains.sort()
mod_domains_list.append(modified_domains)
# Form proper (usr, sys) turns
turn_itr = 0
......@@ -446,14 +452,17 @@ def create_examples(input_file, acts_file, set_type, slot_list,
class_type_dict = {}
# Collect turn data
if append_history:
if swap_utterances:
if not no_append_history:
if not swap_utterances:
hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok
else:
hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok
sys_utt_tok = utt_tok_list[i - 1]
usr_utt_tok = utt_tok_list[i]
turn_slots = mod_slots_list[i + 1]
inform_mem = inform_dict_list[i - 1]
user_act = user_act_dict_list[i]
turn_domains = mod_domains_list[i + 1]
guid = '%s-%s-%s' % (set_type, str(dialog_id), str(turn_itr))
......@@ -472,17 +481,18 @@ def create_examples(input_file, acts_file, set_type, slot_list,
# modify any of the original labels for test sets,
# since this would make comparison difficult.
value_dict[slot] = value_label
elif label_value_repetitions and slot in diag_seen_slots_dict:
elif not no_label_value_repetitions and slot in diag_seen_slots_dict:
value_label = diag_seen_slots_value_dict[slot]
# Get dialog act annotations
inform_label = list(['none'])
inform_slot_dict[slot] = 0
if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict:
inform_label = list([normalize_label(slot, i) for i in sys_inform_dict[(str(dialog_id), str(turn_itr), slot)]])
booking_slot = 'booking-' + slot.split('-')[1]
if slot in inform_mem:
inform_label = inform_mem[slot]
inform_slot_dict[slot] = 1
elif (str(dialog_id), str(turn_itr), 'booking-' + slot.split('-')[1]) in sys_inform_dict:
inform_label = list([normalize_label(slot, i) for i in sys_inform_dict[(str(dialog_id), str(turn_itr), 'booking-' + slot.split('-')[1])]])
elif booking_slot in inform_mem:
inform_label = inform_mem[booking_slot]
inform_slot_dict[slot] = 1
(informed_value,
......@@ -494,17 +504,25 @@ def create_examples(input_file, acts_file, set_type, slot_list,
usr_utt_tok,
slot,
diag_seen_slots_value_dict,
slot_last_occurrence=True)
slot_last_occurrence=True,
label_maps=label_maps)
inform_dict[slot] = informed_value
# Requestable slots, domain indicator slots and general slots
# should have class_type 'request', if they ought to be predicted.
# Give other class_types preference.
if 'request' in class_types:
if class_type in ['none', 'unpointable'] and is_request(slot, user_act, turn_domains):
class_type = 'request'
# Generally don't use span prediction on sys utterance (but inform prediction instead).
sys_utt_tok_label = [0 for _ in sys_utt_tok]
# Determine what to do with value repetitions.
# If value is unique in seen slots, then tag it, otherwise not,
# since correct slot assignment can not be guaranteed anymore.
if label_value_repetitions and slot in diag_seen_slots_dict:
if not no_label_value_repetitions and slot in diag_seen_slots_dict:
if class_type == 'copy_value' and list(diag_seen_slots_value_dict.values()).count(value_label) > 1:
class_type = 'none'
usr_utt_tok_label = [0 for _ in usr_utt_tok_label]
......@@ -512,9 +530,9 @@ def create_examples(input_file, acts_file, set_type, slot_list,
sys_utt_tok_label_dict[slot] = sys_utt_tok_label
usr_utt_tok_label_dict[slot] = usr_utt_tok_label
if append_history:
if use_history_labels:
if swap_utterances:
if not no_append_history:
if not no_use_history_labels:
if not swap_utterances:
new_hst_utt_tok_label_dict[slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[slot]
else:
new_hst_utt_tok_label_dict[slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[slot]
......@@ -556,7 +574,7 @@ def create_examples(input_file, acts_file, set_type, slot_list,
if analyze:
print("]")
if swap_utterances:
if not swap_utterances:
txt_a = usr_utt_tok
txt_b = sys_utt_tok
txt_a_lbl = usr_utt_tok_label_dict
......
# coding=utf-8
#
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Part of this code is based on the source code of BERT-DST
# (arXiv:1907.03040)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from tqdm import tqdm
from utils_dst import (DSTExample)
from dataset_multiwoz21 import (ACTS_DICT, is_request,
tokenize, normalize_label,
get_turn_label, delex_utt)
# Loads the dialogue_acts.json and returns a list
# of slot-value pairs.
def load_acts(input_file):
    """Load a MultiWOZ dialogue_acts.json file and collect informed slot-value pairs.

    Handles both the 2.0/2.1 act file layout and the 2.2 layout, where the
    acts are nested under a 'dialog_act' key and user/system turns share one
    joint turn numbering. Returns a dict mapping
    (dialog_id, turn_id, slot) -> [value].
    """
    with open(input_file) as handle:
        raw_acts = json.load(handle)

    collected = {}
    for dial_id, dial_turns in raw_acts.items():
        for turn_id, turn_acts in dial_turns.items():
            # Turns without annotation are not dicts; skip them.
            if not isinstance(turn_acts, dict):
                continue
            is_22_format = 'dialog_act' in turn_acts
            if is_22_format:
                act_map = turn_acts['dialog_act']
                # In the 2.2 layout, even joint turn indices are user turns
                # and carry no system inform acts.
                if int(turn_id) % 2 == 0:
                    continue
            else:
                act_map = turn_acts
            for act_name, act_items in act_map.items():
                domain, intent = act_name.lower().split('-')[:2]
                if intent not in ['inform', 'recommend', 'select', 'book']:
                    continue
                for item in act_items:
                    slot_name = item[0].lower()
                    value = item[1].lower().strip()
                    if slot_name == 'none' or value in ('?', 'none'):
                        continue
                    slot = domain + '-' + slot_name
                    slot = ACTS_DICT.get(slot, slot)
                    if is_22_format:
                        # Map the joint turn numbering back to system-turn numbering.
                        t_key = str(int(int(turn_id) / 2 + 1))
                        d_key = dial_id
                    else:
                        t_key = turn_id
                        d_key = dial_id + '.json'
                    key = d_key, t_key, slot
                    # INFO: Since the model has no mechanism to predict
                    # one among several informed value candidates, we
                    # keep only one informed value. For fairness, we
                    # apply a global rule:
                    # ... Option 1: Keep first informed value
                    if key not in collected:
                        collected[key] = [value]
                    # ... Option 2: Keep last informed value
                    #collected[key] = [value]
    return collected
def create_examples(input_file, acts_file, set_type, slot_list,
                    label_maps={},
                    no_append_history=False,
                    no_use_history_labels=False,
                    no_label_value_repetitions=False,
                    swap_utterances=False,
                    delexicalize_sys_utts=False,
                    unk_token="[UNK]",
                    analyze=False):
    """Read a MultiWOZ DST json file into a list of DSTExample.

    Args:
        input_file: Path to the MultiWOZ dialogues json (dialog_id -> {'log': [...]}).
        acts_file: Path to dialogue_acts.json; parsed via load_acts() to find
            values informed by the system.
        set_type: Dataset split name, used as prefix of each example guid.
        slot_list: List of tracked slot names ('<domain>-<slot>').
        label_maps: Dict of value -> list of equivalent value variants, passed
            through to get_turn_label() for fuzzy label matching.
            NOTE(review): mutable default argument; appears to be read-only
            here, but consider defaulting to None.
        no_append_history: If True, do not prepend previous turns to the history.
        no_use_history_labels: If True, zero out span labels in the history.
        no_label_value_repetitions: If True, do not re-label values already
            seen earlier in the dialog.
        swap_utterances: Controls whether (usr, sys) or (sys, usr) becomes
            (text_a, text_b) in the examples and the history order.
        delexicalize_sys_utts: If True, replace informed values in system
            utterances with unk_token.
        unk_token: Replacement token used by delex_utt().
        analyze: If True, print per-turn label analysis to stdout.

    Returns:
        List of DSTExample, one per (usr, sys) turn pair.
    """
    sys_inform_dict = load_acts(acts_file)

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    examples = []
    for d_itr, dialog_id in enumerate(tqdm(input_data)):
        entry = input_data[dialog_id]
        utterances = entry['log']

        # Collects all slot changes throughout the dialog
        cumulative_labels = {slot: 'none' for slot in slot_list}

        # First system utterance is empty, since multiwoz starts with user input
        utt_tok_list = [[]]
        mod_slots_list = [{}]

        # Collect all utterances and their metadata
        usr_sys_switch = True
        turn_itr = 0
        for utt in utterances:
            # Assert that system and user utterances alternate.
            # System turns are recognized by a non-empty 'metadata' field.
            is_sys_utt = utt['metadata'] != {}
            if usr_sys_switch == is_sys_utt:
                print("WARN: Wrong order of system and user utterances. Skipping rest of dialog %s" % (dialog_id))
                break
            usr_sys_switch = is_sys_utt

            if is_sys_utt:
                turn_itr += 1

            # Delexicalize sys utterance: mask all values the system informed
            # in this turn (looked up in the acts file) with unk_token.
            if delexicalize_sys_utts and is_sys_utt:
                inform_dict = {slot: 'none' for slot in slot_list}
                for slot in slot_list:
                    if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict:
                        inform_dict[slot] = sys_inform_dict[(str(dialog_id), str(turn_itr), slot)]
                utt_tok_list.append(delex_utt(utt['text'], inform_dict, unk_token)) # normalize utterances
            else:
                utt_tok_list.append(tokenize(utt['text'])) # normalize utterances

            modified_slots = {}

            # If sys utt, extract metadata (identify and collect modified slots)
            if is_sys_utt:
                for d in utt['metadata']:
                    booked = utt['metadata'][d]['book']['booked']
                    booked_slots = {}
                    # Check the booked section
                    if booked != []:
                        for s in booked[0]:
                            booked_slots[s] = normalize_label('%s-%s' % (d, s), booked[0][s]) # normalize labels
                    # Check the semi and the inform slots
                    for category in ['book', 'semi']:
                        for s in utt['metadata'][d][category]:
                            cs = '%s-book_%s' % (d, s) if category == 'book' else '%s-%s' % (d, s)
                            value_label = normalize_label(cs, utt['metadata'][d][category][s]) # normalize labels
                            # Prefer the slot value as stored in the booked section
                            if s in booked_slots:
                                value_label = booked_slots[s]
                            # Remember modified slots and entire dialog state
                            if cs in slot_list and cumulative_labels[cs] != value_label:
                                modified_slots[cs] = value_label
                                cumulative_labels[cs] = value_label

            mod_slots_list.append(modified_slots.copy())

        # Form proper (usr, sys) turns
        turn_itr = 0
        diag_seen_slots_dict = {}
        diag_seen_slots_value_dict = {slot: 'none' for slot in slot_list}
        diag_state = {slot: 'none' for slot in slot_list}
        sys_utt_tok = []
        usr_utt_tok = []
        hst_utt_tok = []
        hst_utt_tok_label_dict = {slot: [] for slot in slot_list}
        # utt_tok_list alternates sys/usr starting with the artificial empty
        # first system utterance, so i indexes the user turn of each pair.
        for i in range(1, len(utt_tok_list) - 1, 2):
            sys_utt_tok_label_dict = {}
            usr_utt_tok_label_dict = {}
            value_dict = {}
            inform_dict = {}
            inform_slot_dict = {}
            referral_dict = {}
            class_type_dict = {}

            # Collect turn data: fold the previous turn pair into the history
            # before loading the current pair.
            if not no_append_history:
                if not swap_utterances:
                    hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok
                else:
                    hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok
            sys_utt_tok = utt_tok_list[i - 1]
            usr_utt_tok = utt_tok_list[i]
            # Slot changes are attached to the system turn that FOLLOWS the
            # user turn, hence i + 1.
            turn_slots = mod_slots_list[i + 1]

            guid = '%s-%s-%s' % (set_type, str(dialog_id), str(turn_itr))

            if analyze:
                print("%15s %2s %s ||| %s" % (dialog_id, turn_itr, ' '.join(sys_utt_tok), ' '.join(usr_utt_tok)))
                print("%15s %2s [" % (dialog_id, turn_itr), end='')

            new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy()
            new_diag_state = diag_state.copy()
            for slot in slot_list:
                value_label = 'none'
                if slot in turn_slots:
                    value_label = turn_slots[slot]
                    # We keep the original labels so as to not
                    # overlook unpointable values, as well as to not
                    # modify any of the original labels for test sets,
                    # since this would make comparison difficult.
                    value_dict[slot] = value_label
                elif not no_label_value_repetitions and slot in diag_seen_slots_dict:
                    value_label = diag_seen_slots_value_dict[slot]

                # Get dialog act annotations: values the system informed for
                # this slot, either directly or via the generic 'booking' domain.
                inform_label = list(['none'])
                inform_slot_dict[slot] = 0
                if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict:
                    inform_label = list([normalize_label(slot, i) for i in sys_inform_dict[(str(dialog_id), str(turn_itr), slot)]])
                    inform_slot_dict[slot] = 1
                elif (str(dialog_id), str(turn_itr), 'booking-' + slot.split('-')[1]) in sys_inform_dict:
                    inform_label = list([normalize_label(slot, i) for i in sys_inform_dict[(str(dialog_id), str(turn_itr), 'booking-' + slot.split('-')[1])]])
                    inform_slot_dict[slot] = 1

                (informed_value,
                 referred_slot,
                 usr_utt_tok_label,
                 class_type) = get_turn_label(value_label,
                                              inform_label,
                                              sys_utt_tok,
                                              usr_utt_tok,
                                              slot,
                                              diag_seen_slots_value_dict,
                                              slot_last_occurrence=True,
                                              label_maps=label_maps)

                inform_dict[slot] = informed_value

                # Generally don't use span prediction on sys utterance (but inform prediction instead).
                sys_utt_tok_label = [0 for _ in sys_utt_tok]

                # Determine what to do with value repetitions.
                # If value is unique in seen slots, then tag it, otherwise not,
                # since correct slot assignment can not be guaranteed anymore.
                if not no_label_value_repetitions and slot in diag_seen_slots_dict:
                    if class_type == 'copy_value' and list(diag_seen_slots_value_dict.values()).count(value_label) > 1:
                        class_type = 'none'
                        usr_utt_tok_label = [0 for _ in usr_utt_tok_label]

                sys_utt_tok_label_dict[slot] = sys_utt_tok_label
                usr_utt_tok_label_dict[slot] = usr_utt_tok_label

                # Extend the history span labels with this turn's labels
                # (or with zeros, if history labels are disabled).
                if not no_append_history:
                    if not no_use_history_labels:
                        if not swap_utterances:
                            new_hst_utt_tok_label_dict[slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[slot]
                        else:
                            new_hst_utt_tok_label_dict[slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[slot]
                    else:
                        new_hst_utt_tok_label_dict[slot] = [0 for _ in sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[slot]]

                # For now, we map all occurences of unpointable slot values
                # to none. However, since the labels will still suggest
                # a presence of unpointable slot values, the task of the
                # DST is still to find those values. It is just not
                # possible to do that via span prediction on the current input.
                if class_type == 'unpointable':
                    class_type_dict[slot] = 'none'
                    referral_dict[slot] = 'none'
                    if analyze:
                        if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[slot]:
                            print("(%s): %s, " % (slot, value_label), end='')
                elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] and class_type != 'copy_value' and class_type != 'inform':
                    # If slot has seen before and its class type did not change, label this slot a not present,
                    # assuming that the slot has not actually been mentioned in this turn.
                    # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform,
                    # this must mean there is evidence in the original labels, therefore consider
                    # them as mentioned again.
                    class_type_dict[slot] = 'none'
                    referral_dict[slot] = 'none'
                else:
                    class_type_dict[slot] = class_type
                    referral_dict[slot] = referred_slot
                # Remember that this slot was mentioned during this dialog already.
                if class_type != 'none':
                    diag_seen_slots_dict[slot] = class_type
                    diag_seen_slots_value_dict[slot] = value_label
                    new_diag_state[slot] = class_type
                    # Unpointable is not a valid class, therefore replace with
                    # some valid class for now...
                    if class_type == 'unpointable':
                        new_diag_state[slot] = 'copy_value'

            if analyze:
                print("]")

            # text_a/text_b order is user-first by default, system-first
            # when swap_utterances is set.
            if not swap_utterances:
                txt_a = usr_utt_tok
                txt_b = sys_utt_tok
                txt_a_lbl = usr_utt_tok_label_dict
                txt_b_lbl = sys_utt_tok_label_dict
            else:
                txt_a = sys_utt_tok
                txt_b = usr_utt_tok
                txt_a_lbl = sys_utt_tok_label_dict
                txt_b_lbl = usr_utt_tok_label_dict
            examples.append(DSTExample(
                guid=guid,
                text_a=txt_a,
                text_b=txt_b,
                history=hst_utt_tok,
                text_a_label=txt_a_lbl,
                text_b_label=txt_b_lbl,
                history_label=hst_utt_tok_label_dict,
                values=diag_seen_slots_value_dict.copy(),
                inform_label=inform_dict,
                inform_slot_label=inform_slot_dict,
                refer_label=referral_dict,
                diag_state=diag_state,
                class_label=class_type_dict))

            # Update some variables.
            hst_utt_tok_label_dict = new_hst_utt_tok_label_dict.copy()
            diag_state = new_diag_state.copy()

            turn_itr += 1

        if analyze:
            print("----------------------------------------------------------------------")

    return examples
# coding=utf-8
#
# Copyright 2020 Heinrich Heine University Duesseldorf
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Part of this code is based on the source code of BERT-DST
# (arXiv:1907.03040)
......@@ -166,11 +166,10 @@ def get_turn_label(turn, prev_dialogue_state, slot_list, dial_id, turn_id, sys_i
def create_examples(input_file, set_type, slot_list,
label_maps={},
append_history=False,
use_history_labels=False,
no_append_history=False,
no_use_history_labels=False,
no_label_value_repetitions=False,
swap_utterances=False,
label_value_repetitions=False,
delexicalize_sys_utts=False,
unk_token="[UNK]",
analyze=False):
......@@ -211,7 +210,7 @@ def create_examples(input_file, set_type, slot_list,
unk_token=unk_token,
slot_last_occurrence=True)
if swap_utterances:
if not swap_utterances:
txt_a = text_b
txt_b = text_a
txt_a_lbl = text_b_label
......@@ -230,8 +229,8 @@ def create_examples(input_file, set_type, slot_list,
value_dict[slot] = 'none'
if class_label[slot] != 'none':
ds_lbl_dict[slot] = class_label[slot]
if append_history:
if use_history_labels:
if not no_append_history:
if not no_use_history_labels:
hst_lbl_dict[slot] = txt_a_lbl[slot] + txt_b_lbl[slot] + hst_lbl_dict[slot]
else:
hst_lbl_dict[slot] = [0 for _ in txt_a_lbl[slot] + txt_b_lbl[slot] + hst_lbl_dict[slot]]
......@@ -255,7 +254,7 @@ def create_examples(input_file, set_type, slot_list,
prev_ds_lbl_dict = ds_lbl_dict.copy()
prev_hst_lbl_dict = hst_lbl_dict.copy()
if append_history:
if not no_append_history:
hst = txt_a + txt_b + hst
return examples
# coding=utf-8
#
# Copyright 2020-2022 Heinrich Heine University Duesseldorf
#
# Part of this code is based on the source code of BERT-DST
# (arXiv:1907.03040)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from tqdm import tqdm
from utils_dst import (DSTExample)
try:
from convlab.util import (load_dataset, load_ontology, load_dst_data)
except ModuleNotFoundError as e:
print(e)
print("Ignore this error if you don't intend to use the data processor for ConvLab3's unified data format.")
print("Otherwise, make sure you have ConvLab3 installed and added to your PYTHONPATH.")
def get_ontology_slots(ontology):
    """Collect the slot names defined per domain in a ConvLab ontology.

    Returns a dict mapping each domain to the alphabetically sorted list
    of its slot names.
    """
    return {domain: sorted(ontology['domains'][domain]['slots'])
            for domain in ontology['domains']}
def get_slot_list(dataset_name):
    """Build the full TripPy slot list for a ConvLab3 dataset.

    The list contains one '<domain>-<slot>' entry per ontology slot, plus a
    '<domain>-none' entry per domain (the none slot indicates domain
    activation in ConvLab3), plus 'general-*' pseudo slots for the special
    intents bye/thank/greet when the ontology defines them.
    """
    ontology = load_ontology(dataset_name)
    domain_slots = get_ontology_slots(ontology)
    slot_list = []
    for domain in domain_slots:
        for slot in domain_slots[domain]:
            slot_list.append("%s-%s" % (domain, slot))
        # none slot indicates domain activation in ConvLab3
        slot_list.append("%s-none" % (domain))
    # Some special intents are modeled as 'request' slots in TripPy
    for intent in ['bye', 'thank', 'greet']:
        if intent in ontology['intents']:
            slot_list.append("general-%s" % (intent))
    return slot_list
def create_examples(set_type, dataset_name="multiwoz21", class_types=[], slot_list=[], label_maps={},
no_append_history=False,
no_use_history_labels=False,
no_label_value_repetitions=False,
swap_utterances=False,
delexicalize_sys_utts=False,
unk_token="[UNK]",
analyze=False):
"""Read a DST json file into a list of DSTExample."""
# TODO: Make sure normalization etc. will be compatible with or suitable for SGD and
# other datasets as well.
if dataset_name == "multiwoz21":
from dataset_multiwoz21 import (tokenize, normalize_label,
get_turn_label, delex_utt,
is_request)
else:
raise ValueError("Unknown dataset_name.")
dataset_args = {"dataset_name": dataset_name}
dataset_dict = load_dataset(**dataset_args)
if slot_list == []:
slot_list = get_slot_list()
data = load_dst_data(dataset_dict, data_split=set_type, speaker='all', dialogue_acts=True, split_to_turn=False)
examples = []
for d_itr, entry in enumerate(tqdm(data[set_type])):
dialog_id = entry['dialogue_id']
#dialog_id = entry['original_id']
original_id = entry['original_id']
domains = entry['domains']
# NOTE(review): this span is the per-dialog body of the enclosing example-creation
# routine (its `def` header and the loop binding `entry`/`dialog_id` are above this
# chunk). `slot_list`, `label_maps`, `class_types`, `examples` and the boolean flags
# (`analyze`, `swap_utterances`, `no_append_history`, `no_use_history_labels`,
# `no_label_value_repetitions`, `delexicalize_sys_utts`, `unk_token`) come from that
# outer scope — confirm against the full file.
# Assumes the unified dialog format: each turn carries 'utterance', 'speaker',
# 'dialogue_acts' and (user turns only, presumably) 'state' — TODO confirm schema.
turns = entry['turns']

# Collects all slot changes throughout the dialog; starts with every slot unset.
cumulative_labels = {slot: 'none' for slot in slot_list}

# First system utterance is empty, since multiwoz starts with user input.
# The parallel lists below are index-aligned with utt_tok_list.
utt_tok_list = [[]]
mod_slots_list = [{}]
inform_dict_list = [{}]
user_act_dict_list = [{}]
mod_domains_list = [{}]

# --- Pass 1: collect all utterances and their metadata ---
usr_sys_switch = True
for turn in turns:
    utterance = turn['utterance']
    # 'state' is absent on some turns (system turns, presumably) — default to empty.
    state = turn['state'] if 'state' in turn else {}
    acts = [item for sublist in list(turn['dialogue_acts'].values()) for item in sublist]  # flatten list

    # Assert that system and user utterances alternate; a repeated speaker
    # truncates the dialog rather than producing misaligned examples.
    is_sys_utt = turn['speaker'] in ['sys', 'system']
    if usr_sys_switch == is_sys_utt:
        print("WARN: Wrong order of system and user utterances. Skipping rest of dialog %s" % (dialog_id))
        break
    usr_sys_switch = is_sys_utt

    # Extract metadata: identify modified slots and values informed by the system.
    inform_dict = {}
    user_act_dict = {}
    modified_slots = {}
    modified_domains = set()
    for act in acts:
        # Slot key is "<domain>-<slot>"; an act without a slot maps to "<domain>-none".
        slot = "%s-%s" % (act['domain'], act['slot'] if act['slot'] != '' else 'none')
        # Domain-independent courtesy intents get the pseudo-domain "general".
        if act['intent'] in ['bye', 'thank', 'hello']:
            slot = "general-%s" % (act['intent'])
        # Acts without a 'value' key: a named slot implies 'yes', otherwise 'none'.
        value_label = act['value'] if 'value' in act else 'yes' if act['slot'] != '' else 'none'
        value_label = normalize_label(slot, value_label)
        modified_domains.add(act['domain'])  # Remember domains
        if is_sys_utt and act['intent'] in ['inform', 'recommend', 'select', 'book'] and value_label != 'none':
            # System side: remember values the system offered for this slot.
            if slot not in inform_dict:
                inform_dict[slot] = []
            inform_dict[slot].append(value_label)
        elif not is_sys_utt:
            # User side: remember the raw acts per slot (consumed later by is_request).
            if slot not in user_act_dict:
                user_act_dict[slot] = []
            user_act_dict[slot].append(act)

    # INFO: Since the model has no mechanism to predict
    # one among several informed value candidates, we
    # keep only one informed value. For fairness, we
    # apply a global rule:
    for e in inform_dict:
        # ... Option 1: Always keep first informed value
        inform_dict[e] = list([inform_dict[e][0]])
        # ... Option 2: Always keep last informed value
        #inform_dict[e] = list([inform_dict[e][-1]])

    # Diff the turn's belief state against the cumulative state to find what changed.
    for d in state:
        for s in state[d]:
            slot = "%s-%s" % (d, s)
            value_label = normalize_label(slot, state[d][s])
            # Remember modified slots and entire dialog state
            if slot in slot_list and cumulative_labels[slot] != value_label:
                modified_slots[slot] = value_label
                cumulative_labels[slot] = value_label
                modified_domains.add(d)  # Remember domains

    # Delexicalize sys utterance (replace informed values with unk_token) when enabled.
    if delexicalize_sys_utts and is_sys_utt:
        utt_tok_list.append(delex_utt(utterance, inform_dict, unk_token))  # normalizes utterances
    else:
        utt_tok_list.append(tokenize(utterance))  # normalizes utterances

    inform_dict_list.append(inform_dict.copy())
    user_act_dict_list.append(user_act_dict.copy())
    mod_slots_list.append(modified_slots.copy())
    modified_domains = list(modified_domains)
    modified_domains.sort()  # deterministic domain order
    mod_domains_list.append(modified_domains)

# --- Pass 2: form proper (usr, sys) turns and emit one DSTExample per pair ---
turn_itr = 0
diag_seen_slots_dict = {}  # slot -> last non-'none' class_type seen in this dialog
diag_seen_slots_value_dict = {slot: 'none' for slot in slot_list}  # slot -> last seen value
diag_state = {slot: 'none' for slot in slot_list}  # dialog state *before* the current turn
sys_utt_tok = []
usr_utt_tok = []
hst_utt_tok = []
hst_utt_tok_label_dict = {slot: [] for slot in slot_list}
# Step by 2: utt_tok_list alternates sys/usr, index 0 is the empty first sys turn.
for i in range(1, len(utt_tok_list) - 1, 2):
    sys_utt_tok_label_dict = {}
    usr_utt_tok_label_dict = {}
    value_dict = {}
    inform_dict = {}
    inform_slot_dict = {}
    referral_dict = {}
    class_type_dict = {}

    # Collect turn data: fold the previous turn pair into the history first.
    if not no_append_history:
        if not swap_utterances:
            hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok
        else:
            hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok
    sys_utt_tok = utt_tok_list[i - 1]
    usr_utt_tok = utt_tok_list[i]
    turn_slots = mod_slots_list[i]
    inform_mem = inform_dict_list[i - 1]  # values informed by the *preceding* sys turn
    user_act = user_act_dict_list[i]
    turn_domains = mod_domains_list[i]
    guid = '%s-%s' % (dialog_id, turn_itr)

    if analyze:
        print("%15s %2s %s ||| %s" % (dialog_id, turn_itr, ' '.join(sys_utt_tok), ' '.join(usr_utt_tok)))
        print("%15s %2s [" % (dialog_id, turn_itr), end='')

    # Work on copies so this turn's labels/state only take effect for the *next* turn.
    new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy()
    new_diag_state = diag_state.copy()

    for slot in slot_list:
        value_label = 'none'
        if slot in turn_slots:
            value_label = turn_slots[slot]
            # We keep the original labels so as to not
            # overlook unpointable values, as well as to not
            # modify any of the original labels for test sets,
            # since this would make comparison difficult.
            value_dict[slot] = value_label
        elif not no_label_value_repetitions and slot in diag_seen_slots_dict:
            # Carry the previously seen value forward when repetitions are labeled.
            value_label = diag_seen_slots_value_dict[slot]

        # Get dialog act annotations
        inform_label = list(['none'])
        inform_slot_dict[slot] = 0
        if slot in inform_mem:
            inform_label = inform_mem[slot]
            inform_slot_dict[slot] = 1

        # Resolve the slot's class type, span labels, and referral for this turn.
        (informed_value,
         referred_slot,
         usr_utt_tok_label,
         class_type) = get_turn_label(value_label,
                                      inform_label,
                                      sys_utt_tok,
                                      usr_utt_tok,
                                      slot,
                                      diag_seen_slots_value_dict,
                                      slot_last_occurrence=True,
                                      label_maps=label_maps)
        inform_dict[slot] = informed_value

        # Requestable slots, domain indicator slots and general slots
        # should have class_type 'request', if they ought to be predicted.
        # Give other class_types preference.
        if 'request' in class_types:
            if class_type in ['none', 'unpointable'] and is_request(slot, user_act, turn_domains):
                class_type = 'request'

        # Generally don't use span prediction on sys utterance (but inform prediction instead).
        sys_utt_tok_label = [0 for _ in sys_utt_tok]

        # Determine what to do with value repetitions.
        # If value is unique in seen slots, then tag it, otherwise not,
        # since correct slot assignment can not be guaranteed anymore.
        if not no_label_value_repetitions and slot in diag_seen_slots_dict:
            if class_type == 'copy_value' and list(diag_seen_slots_value_dict.values()).count(value_label) > 1:
                class_type = 'none'
                usr_utt_tok_label = [0 for _ in usr_utt_tok_label]

        sys_utt_tok_label_dict[slot] = sys_utt_tok_label
        usr_utt_tok_label_dict[slot] = usr_utt_tok_label

        # Thread this turn's token labels into the history labels for the next turn.
        if not no_append_history:
            if not no_use_history_labels:
                if not swap_utterances:
                    new_hst_utt_tok_label_dict[slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[slot]
                else:
                    new_hst_utt_tok_label_dict[slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[slot]
            else:
                new_hst_utt_tok_label_dict[slot] = [0 for _ in sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[slot]]

        # For now, we map all occurences of unpointable slot values
        # to none. However, since the labels will still suggest
        # a presence of unpointable slot values, the task of the
        # DST is still to find those values. It is just not
        # possible to do that via span prediction on the current input.
        if class_type == 'unpointable':
            class_type_dict[slot] = 'none'
            referral_dict[slot] = 'none'
            if analyze:
                if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[slot]:
                    print("(%s): %s, " % (slot, value_label), end='')
        elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] and class_type != 'copy_value' and class_type != 'inform':
            # If slot has seen before and its class type did not change, label this slot a not present,
            # assuming that the slot has not actually been mentioned in this turn.
            # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform,
            # this must mean there is evidence in the original labels, therefore consider
            # them as mentioned again.
            class_type_dict[slot] = 'none'
            referral_dict[slot] = 'none'
        else:
            class_type_dict[slot] = class_type
            referral_dict[slot] = referred_slot

        # Remember that this slot was mentioned during this dialog already.
        if class_type != 'none':
            diag_seen_slots_dict[slot] = class_type
            diag_seen_slots_value_dict[slot] = value_label
            new_diag_state[slot] = class_type
            # Unpointable is not a valid class, therefore replace with
            # some valid class for now...
            if class_type == 'unpointable':
                new_diag_state[slot] = 'copy_value'

    if analyze:
        print("]")

    # text_a/text_b ordering is controlled by swap_utterances.
    if not swap_utterances:
        txt_a = usr_utt_tok
        txt_b = sys_utt_tok
        txt_a_lbl = usr_utt_tok_label_dict
        txt_b_lbl = sys_utt_tok_label_dict
    else:
        txt_a = sys_utt_tok
        txt_b = usr_utt_tok
        txt_a_lbl = sys_utt_tok_label_dict
        txt_b_lbl = usr_utt_tok_label_dict
    examples.append(DSTExample(
        guid=guid,
        text_a=txt_a,
        text_b=txt_b,
        history=hst_utt_tok,
        text_a_label=txt_a_lbl,
        text_b_label=txt_b_lbl,
        history_label=hst_utt_tok_label_dict,
        values=diag_seen_slots_value_dict.copy(),
        inform_label=inform_dict,
        inform_slot_label=inform_slot_dict,
        refer_label=referral_dict,
        diag_state=diag_state,
        class_label=class_type_dict))

    # Update some variables: the pre-turn state/history labels for the next iteration.
    hst_utt_tok_label_dict = new_hst_utt_tok_label_dict.copy()
    diag_state = new_diag_state.copy()

    turn_itr += 1

if analyze:
    print("----------------------------------------------------------------------")
# All dialogs processed: hand the accumulated DSTExample list back to the caller.
return examples