Commit 9bb720b5 authored by zqwerty

milu now can use datasets in unified format

parent 95eb2201
import json
import os
from collections import Counter
from convlab2.util import load_dataset, load_ontology, load_nlu_data
from convlab2.util import load_dataset, load_nlu_data
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from tqdm import tqdm
......
@@ -5,16 +5,41 @@ MILU is a joint neural model that allows you to simultaneously predict multiple
## Example usage
We based our implementation on the [AllenNLP library](https://github.com/allenai/allennlp). For an introduction to this library, you should check [these tutorials](https://allennlp.org/tutorials).
To use this model, you additionally need to install `overrides==4.1.2` and `allennlp==0.9.0`, and use `python>=3.6,<=3.8`.
### On MultiWOZ dataset
```bash
$ PYTHONPATH=../../.. python train.py multiwoz/configs/[base|context3].jsonnet -s serialization_dir
$ PYTHONPATH=../../.. python evaluate.py serialization_dir/model.tar.gz {test_file} --cuda-device {CUDA_DEVICE}
$ python train.py multiwoz/configs/[base|context3].jsonnet -s serialization_dir
$ python evaluate.py serialization_dir/model.tar.gz {test_file} --cuda-device {CUDA_DEVICE}
```
If you want to perform end-to-end evaluation, you can include the trained model by adding the model path (serialization_dir/model.tar.gz) to your ConvLab spec file.
## Data
#### Data
We use the multiwoz data (data/multiwoz/[train|val|test].json.zip).
### MILU on datasets in unified format
We support training MILU on datasets that are in our unified format.
- For **non-categorical** dialogue acts whose values are in the utterances, we use **slot tagging** to extract the values.
- For **categorical** and **binary** dialogue acts, whose values may not appear verbatim in the utterance, we treat them as **intents** of the utterance (see the sketch below).
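
The label conversion mirrors the dataset reader changes further down in this commit: slot tags use the `B-intent+domain+slot` / `I-intent+domain+slot` scheme, and categorical/binary acts become stringified `(intent, domain, slot[, value])` tuples used as multi-label intents. A minimal illustrative sketch (the utterance and values are made up):

```python
# Illustrative only: how one sample's dialogue acts become training labels.
tokens = ["i", "want", "a", "cheap", "hotel"]
tags = ["O"] * len(tokens)

# Non-categorical act with a span annotation covering the token "cheap" (word index 3).
da = {"intent": "inform", "domain": "hotel", "slot": "pricerange", "value": "cheap"}
word_start, word_end = 3, 4
tags[word_start] = f"B-{da['intent']}+{da['domain']}+{da['slot']}"
for i in range(word_start + 1, word_end):
    tags[i] = f"I-{da['intent']}+{da['domain']}+{da['slot']}"

# Categorical act: the whole (intent, domain, slot, value) tuple is one "intent" label.
intents = [str(("inform", "hotel", "parking", "yes"))]
# Binary act: no value, so only (intent, domain, slot).
intents.append(str(("request", "hotel", "phone")))

print(tags)     # ['O', 'O', 'O', 'B-inform+hotel+pricerange', 'O']
print(intents)  # ["('inform', 'hotel', 'parking', 'yes')", "('request', 'hotel', 'phone')"]
```
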
Taking MultiWOZ 2.1 (unified format) as an example:
```bash
$ python train.py unified_datasets/configs/multiwoz21_user_context3.jsonnet -s serialization_dir
$ python evaluate.py serialization_dir/model.tar.gz test --cuda-device {CUDA_DEVICE}
```
Note that the config file differs from the MultiWOZ-specific ones above. You should set the following (a data-loading sketch follows this list):
- `"use_unified_datasets": true` in `dataset_reader` and `model`
- `"dataset_name": "multiwoz21"` in `dataset_reader`
- `"train_data_path": "train"`
- `"validation_data_path": "validation"`
- `"test_data_path": "test"`
## Predict
See `nlu.py` under the `multiwoz` and `unified_datasets` directories.
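
For the unified-format model, a minimal usage sketch (mirroring the `__main__` block of `unified_datasets/nlu.py` added in this commit; the archive path is illustrative):

```python
from convlab2.nlu.milu.unified_datasets.nlu import MILU

# The archive path is illustrative; point it at your own serialization_dir/model.tar.gz.
nlu = MILU(archive_file='../output/multiwoz21_user/model.tar.gz',
           cuda_device=0, model_file=None, context_size=3)
# predict() returns a list of [intent, domain, slot, value] items.
print(nlu.predict("i want to book a table for 6 at 18:45 on thursday"))
```
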
## References
```
@inproceedings{lee2019convlab,
......
@@ -9,7 +9,7 @@ from allennlp.training.metrics.metric import Metric
class DialogActItemF1Measure(Metric):
    """
    """
    def __init__(self) -> None:
    def __init__(self, use_unified_datasets) -> None:
        """
        Parameters
        ----------
......
@@ -18,6 +18,7 @@ class DialogActItemF1Measure(Metric):
        self._true_positives = 0
        self._false_positives = 0
        self._false_negatives = 0
        self.use_unified_datasets = use_unified_datasets

    def __call__(self,
......
@@ -32,6 +33,25 @@
        A tensor of integer class label of shape (batch_size, sequence_length). It must be the same
        shape as the ``predictions`` tensor without the ``num_classes`` dimension.
        """
        if self.use_unified_datasets:
            for prediction, gold_label in zip(predictions, gold_labels):
                for da_type in ['non-categorical', 'categorical', 'binary']:
                    if da_type == 'binary':
                        predicts = [(x['intent'], x['domain'], x['slot']) for x in prediction[da_type]]
                        labels = [(x['intent'], x['domain'], x['slot']) for x in gold_label[da_type]]
                    else:
                        predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in prediction[da_type]]
                        labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_label[da_type]]
                    for ele in predicts:
                        if ele in labels:
                            self._true_positives += 1
                        else:
                            self._false_positives += 1
                    for ele in labels:
                        if ele not in predicts:
                            self._false_negatives += 1
        else:
            for prediction, gold_label in zip(predictions, gold_labels):
                for dat in prediction:
                    for sv in prediction[dat]:
......
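
The diff above only changes how true positives, false positives, and false negatives are counted; the `get_metric` side is unchanged and not shown. For reference, a standard way to turn such counters into precision/recall/F1 (a sketch, not the repository's exact code):

```python
def f1_from_counts(tp: int, fp: int, fn: int):
    # Micro-averaged precision/recall/F1 over predicted dialogue-act items.
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1
```
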
@@ -13,6 +13,8 @@ from allennlp.data.fields import TextField, SequenceLabelField, MultiLabelField,
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from convlab2.util import load_dataset, load_nlu_data
from overrides import overrides
from convlab2.util.file_util import cached_path
......
@@ -45,6 +47,8 @@ class MILUDatasetReader(DatasetReader):
    def __init__(self,
                 context_size: int = 0,
                 agent: str = None,
                 use_unified_datasets: bool = False,
                 dataset_name: str = None,
                 random_context_size: bool = True,
                 token_delimiter: str = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
......
@@ -52,12 +56,81 @@
        super().__init__(lazy)
        self._context_size = context_size
        self._agent = agent
        self.use_unified_datasets = use_unified_datasets
        if self.use_unified_datasets:
            self._dataset_name = dataset_name
            self._dataset = load_dataset(self._dataset_name)
        self._random_context_size = random_context_size
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._token_delimiter = token_delimiter
        self._sent_tokenizer = PunktSentenceTokenizer()
        self._word_tokenizer = TreebankWordTokenizer()
    @overrides
    def _read(self, file_path):
        if self.use_unified_datasets:
            data_split = file_path
            logger.info("Reading instances from unified dataset %s[%s]", self._dataset_name, data_split)
            data = load_nlu_data(self._dataset, data_split=data_split, speaker=self._agent, use_context=self._context_size>0, context_window_size=self._context_size)[data_split]

            for sample in data:
                utterance = sample['utterance']
                sentences = self._sent_tokenizer.tokenize(utterance)
                sent_spans = self._sent_tokenizer.span_tokenize(utterance)
                tokens = [token for sent in sentences for token in self._word_tokenizer.tokenize(sent)]
                token_spans = [(sent_span[0]+token_span[0], sent_span[0]+token_span[1]) for sent, sent_span in zip(sentences, sent_spans) for token_span in self._word_tokenizer.span_tokenize(sent)]
                tags = ['O'] * len(tokens)
                for da in sample['dialogue_acts']['non-categorical']:
                    if 'start' not in da:
                        # skip da that doesn't have span annotation
                        continue
                    char_start = da['start']
                    char_end = da['end']
                    word_start, word_end = -1, -1
                    for i, token_span in enumerate(token_spans):
                        if char_start == token_span[0]:
                            word_start = i
                        if char_end == token_span[1]:
                            word_end = i + 1
                    if word_start == -1 and word_end == -1:
                        # char span does not match word, maybe there is an error in the annotation, skip
                        print('char span does not match word, skipping')
                        print('\t', 'utterance:', utterance)
                        print('\t', 'value:', utterance[char_start: char_end])
                        print('\t', 'da:', da, '\n')
                        continue
                    intent, domain, slot = da['intent'], da['domain'], da['slot']
                    tags[word_start] = f"B-{intent}+{domain}+{slot}"
                    for i in range(word_start+1, word_end):
                        tags[i] = f"I-{intent}+{domain}+{slot}"

                intents = []
                for da in sample['dialogue_acts']['categorical']:
                    intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da['value'].strip().lower()
                    intent = str((intent, domain, slot, value))
                    intents.append(intent)
                for da in sample['dialogue_acts']['binary']:
                    intent, domain, slot = da['intent'], da['domain'], da['slot']
                    intent = str((intent, domain, slot))
                    intents.append(intent)

                wrapped_tokens = [Token(token) for token in tokens]

                wrapped_context_tokens = []
                num_context = random.randint(0, self._context_size) if self._random_context_size else self._context_size
                if num_context > 0 and len(sample['context']) > 0:
                    for utt in sample['context']:
                        for sent in self._sent_tokenizer.tokenize(utt['utterance']):
                            for token in self._word_tokenizer.tokenize(sent):
                                wrapped_context_tokens.append(Token(token))
                        wrapped_context_tokens.append(Token("SENT_END"))
                else:
                    wrapped_context_tokens = [Token("SENT_END")]

                yield self.text_to_instance(wrapped_context_tokens, wrapped_tokens, tags, intents, sample['dialogue_acts'])
        else:
            # if `file_path` is a URL, redirect to the cache
            file_path = cached_path(file_path)
......
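
To make the character-span to word-span alignment in `_read` concrete, here is a small self-contained example (the utterance and offsets are made up; the tokenizer calls are the same ones the reader uses):

```python
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

sent_tok, word_tok = PunktSentenceTokenizer(), TreebankWordTokenizer()
utterance = "I need a cheap hotel. It should have free parking."

sentences = sent_tok.tokenize(utterance)
sent_spans = sent_tok.span_tokenize(utterance)
# Character offsets of every word token, relative to the whole utterance.
token_spans = [(s0 + t0, s0 + t1)
               for sent, (s0, _) in zip(sentences, sent_spans)
               for (t0, t1) in word_tok.span_tokenize(sent)]

# Suppose a non-categorical act annotates the value "cheap" at characters 9..14.
char_start, char_end = 9, 14
word_start = next(i for i, (t0, _) in enumerate(token_spans) if t0 == char_start)
word_end = next(i for i, (_, t1) in enumerate(token_spans) if t1 == char_end) + 1
print(utterance[char_start:char_end], (word_start, word_end))  # cheap (3, 4)
```
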
@@ -57,6 +57,7 @@ class MILU(Model):
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 use_unified_datasets: bool = False,
                 crf_decoding: bool = False,
                 constrain_crf_decoding: bool = None,
                 focal_loss_gamma: float = None,
......
@@ -83,6 +84,7 @@
        self.tag_encoder = intent_encoder
        self._feedforward = feedforward
        self._verbose_metrics = verbose_metrics
        self.use_unified_datasets = use_unified_datasets
        self.rl = False

        if attention:
......
@@ -164,7 +166,7 @@
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=sequence_label_namespace,
                                             label_encoding=label_encoding)
        self._dai_f1_metric = DialogActItemF1Measure()
        self._dai_f1_metric = DialogActItemF1Measure(self.use_unified_datasets)
        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
......
@@ -355,6 +357,41 @@
        for i, tags in enumerate(output_dict["tags"]):
            seq_len = len(output_dict["words"][i])
            spans = bio_tags_to_spans(tags[:seq_len])
            if self.use_unified_datasets:
                dialog_act = {
                    'categorical': [],
                    'non-categorical': [],
                    'binary': []
                }
                for span in spans:
                    intent, domain, slot = span[0].split("+")
                    value = " ".join(output_dict["words"][i][span[1][0]:span[1][1]+1])
                    dialog_act['non-categorical'].append({
                        'intent': intent,
                        'domain': domain,
                        'slot': slot,
                        'value': value
                    })
                for intent in output_dict["intents"][i]:
                    intent = eval(intent)
                    if len(intent) == 3:
                        dialog_act['binary'].append({
                            'intent': intent[0],
                            'domain': intent[1],
                            'slot': intent[2]
                        })
                    else:
                        assert len(intent) == 4
                        dialog_act['categorical'].append({
                            'intent': intent[0],
                            'domain': intent[1],
                            'slot': intent[2],
                            'value': intent[3]
                        })
                output_dict["dialog_act"].append(dialog_act)
            else:
                dialog_act = {}
                for span in spans:
                    domain_act = span[0].split("+")[0]
......
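
With `use_unified_datasets` enabled, each entry appended to `output_dict["dialog_act"]` follows the unified dialogue-act structure. An illustrative example of one decoded entry (values are made up):

```python
dialog_act = {
    "non-categorical": [
        {"intent": "inform", "domain": "restaurant", "slot": "food", "value": "italian"}
    ],
    "categorical": [
        {"intent": "inform", "domain": "hotel", "slot": "parking", "value": "yes"}
    ],
    "binary": [
        {"intent": "request", "domain": "restaurant", "slot": "phone"}
    ],
}
```
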
from convlab2.nlu.milu.unified_datasets.nlu import MILU
{
"dataset_reader": {
"type": "milu",
"token_indexers": {
"tokens": {
"type": "single_id",
"lowercase_tokens": true
},
"token_characters": {
"type": "characters",
"min_padding_length": 3
},
},
"context_size": 0,
"agent": "user",
"use_unified_datasets": true,
"dataset_name": "multiwoz21",
"random_context_size": false
},
"train_data_path": "train",
"validation_data_path": "validation",
"test_data_path": "test",
"model": {
"type": "milu",
"label_encoding": "BIO",
"use_unified_datasets": true,
"dropout": 0.3,
"include_start_end_transitions": false,
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "embedding",
"embedding_dim": 50,
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.50d.txt.gz",
"trainable": true
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"embedding_dim": 16,
"num_filters": 128,
"ngram_filter_sizes": [3],
"conv_layer_activation": "relu"
}
}
}
},
"encoder": {
"type": "lstm",
"input_size": 178,
"hidden_size": 200,
"num_layers": 1,
"dropout": 0.5,
"bidirectional": true
},
"intent_encoder": {
"type": "lstm",
"input_size": 400,
"hidden_size": 200,
"num_layers": 1,
"dropout": 0.5,
"bidirectional": true
},
"attention": {
"type": "bilinear",
"vector_dim": 400,
"matrix_dim": 400
},
"context_for_intent": true,
"context_for_tag": false,
"attention_for_intent": false,
"attention_for_tag": false,
"regularizer": [
[
"scalar_parameters",
{
"type": "l2",
"alpha": 0.1
}
]
]
},
"iterator": {
"type": "basic",
"batch_size": 64
},
"trainer": {
"optimizer": {
"type": "adam",
"lr": 0.001
},
"validation_metric": "+f1-measure",
"num_serialized_models_to_keep": 3,
"num_epochs": 40,
"grad_norm": 5.0,
"patience": 75,
"cuda_device": 4
},
"evaluate_on_test": true
}
{
"dataset_reader": {
"type": "milu",
"token_indexers": {
"tokens": {
"type": "single_id",
"lowercase_tokens": true
},
"token_characters": {
"type": "characters",
"min_padding_length": 3
},
},
"context_size": 3,
"agent": "user",
"use_unified_datasets": true,
"dataset_name": "multiwoz21",
"random_context_size": false
},
"train_data_path": "train",
"validation_data_path": "validation",
"test_data_path": "test",
"model": {
"type": "milu",
"label_encoding": "BIO",
"use_unified_datasets": true,
"dropout": 0.3,
"include_start_end_transitions": false,
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "embedding",
"embedding_dim": 50,
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.50d.txt.gz",
"trainable": true
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"embedding_dim": 16,
"num_filters": 128,
"ngram_filter_sizes": [3],
"conv_layer_activation": "relu"
}
}
}
},
"encoder": {
"type": "lstm",
"input_size": 178,
"hidden_size": 200,
"num_layers": 1,
"dropout": 0.5,
"bidirectional": true
},
"intent_encoder": {
"type": "lstm",
"input_size": 400,
"hidden_size": 200,
"num_layers": 1,
"dropout": 0.5,
"bidirectional": true
},
"attention": {
"type": "bilinear",
"vector_dim": 400,
"matrix_dim": 400
},
"context_for_intent": true,
"context_for_tag": false,
"attention_for_intent": false,
"attention_for_tag": false,
"regularizer": [
[
"scalar_parameters",
{
"type": "l2",
"alpha": 0.1
}
]
]
},
"iterator": {
"type": "basic",
"batch_size": 64
},
"trainer": {
"optimizer": {
"type": "adam",
"lr": 0.001
},
"validation_metric": "+f1-measure",
"num_serialized_models_to_keep": 3,
"num_epochs": 40,
"grad_norm": 5.0,
"patience": 75,
"cuda_device": 0
},
"evaluate_on_test": true
}
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
"""

import os
from pprint import pprint

import torch
from allennlp.common.checks import check_for_gpu
from allennlp.data import DatasetReader
from allennlp.models.archival import load_archive
from allennlp.data.tokenizers import Token
from convlab2.util.file_util import cached_path
from convlab2.nlu.milu import dataset_reader, model
from convlab2.nlu.nlu import NLU
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer

DEFAULT_CUDA_DEVICE = -1
DEFAULT_DIRECTORY = "models"
DEFAULT_ARCHIVE_FILE = os.path.join(DEFAULT_DIRECTORY, "milu_multiwoz_all_context.tar.gz")


class MILU(NLU):
    """Multi-intent language understanding model."""

    def __init__(self,
                 archive_file,
                 cuda_device,
                 model_file,
                 context_size):
        """ Constructor for NLU class. """
        self.context_size = context_size
        cuda_device = 0 if torch.cuda.is_available() else DEFAULT_CUDA_DEVICE
        check_for_gpu(cuda_device)

        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for MILU is specified!")
            archive_file = cached_path(model_file)

        archive = load_archive(archive_file,
                               cuda_device=cuda_device)
        self.sent_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = TreebankWordTokenizer()

        dataset_reader_params = archive.config["dataset_reader"]
        self.dataset_reader = DatasetReader.from_params(dataset_reader_params)
        self.model = archive.model
        self.model.eval()

    def predict(self, utterance, context=list()):
        """
        Predict the dialog act of a natural language utterance and apply error model.
        Args:
            utterance (str): A natural language utterance.
        Returns:
            output (dict): The dialog act of utterance.
        """
        if len(utterance) == 0:
            return []

        if self.context_size > 0 and len(context) > 0:
            context_tokens = []
            for utt in context[-self.context_size:]:
                for sent in self.sent_tokenizer.tokenize(utt):
                    for token in self.word_tokenizer.tokenize(sent):
                        context_tokens.append(Token(token))
                context_tokens.append(Token("SENT_END"))
        else:
            context_tokens = [Token("SENT_END")]
        sentences = self.sent_tokenizer.tokenize(utterance)
        tokens = [Token(token) for sent in sentences for token in self.word_tokenizer.tokenize(sent)]
        instance = self.dataset_reader.text_to_instance(context_tokens, tokens)
        outputs = self.model.forward_on_instance(instance)

        tuples = []
        for da_type in outputs['dialog_act']:
            for da in outputs['dialog_act'][da_type]:
                tuples.append([da['intent'], da['domain'], da['slot'], da.get('value', '')])
        return tuples


if __name__ == "__main__":
    nlu = MILU(archive_file='../output/multiwoz21_user/model.tar.gz', cuda_device=3, model_file=None, context_size=3)
    test_utterances = [
        "What type of accommodations are they. No , i just need their address . Can you tell me if the hotel has internet available ?",
        "What type of accommodations are they.",
        "No , i just need their address .",
        "Can you tell me if the hotel has internet available ?",
        "yes. it should be moderately priced.",
        "i want to book a table for 6 at 18:45 on thursday",
        "i will be departing out of stevenage.",
        "What is the name of attraction ?",
        "Can I get the name of restaurant?",
        "Can I get the address and phone number of the restaurant?",
        "do you have a specific area you want to stay in?"
    ]
    for utt in test_utterances:
        print(utt)
        pprint(nlu.predict(utt))
......
@@ -46,7 +46,6 @@ setup(
        'datasets>=1.8',
        'seqeval',
        'spacy',
        'allennlp',
        'simplejson',
        'unidecode',
        'jieba',
......