diff --git a/data/unified_datasets/camrest/README.md b/data/unified_datasets/camrest/README.md index ae06dbe9dd5cd62c980f6b9bc731a1542bfd1aad..1ec70f427fdb6fa28d9b28d8635af46a8bdaed7c 100644 --- a/data/unified_datasets/camrest/README.md +++ b/data/unified_datasets/camrest/README.md @@ -1,24 +1,61 @@ -# README +# Dataset Card for Camrest -## Features +- **Repository:** https://www.repository.cam.ac.uk/handle/1810/260970 +- **Paper:** https://aclanthology.org/D16-1233/ +- **Leaderboard:** None +- **Who transforms the dataset:** Qi Zhu (zhuq96 at gmail dot com) -- Annotations: dialogue act, character-level span for non-categorical slots. +### Dataset Summary -Statistics: +Cambridge restaurant dialogue domain dataset collected for developing neural-network-based dialogue systems. The two papers published based on this dataset are: 1. A Network-based End-to-End Trainable Task-oriented Dialogue System 2. Conditional Generation and Snapshot Learning in Neural Dialogue Systems. The dataset was collected based on the Wizard of Oz experiment on Amazon MTurk. Each dialogue contains a goal label and several exchanges between a customer and the system. Each user turn was labelled by a set of slot-value pairs representing a coarse representation of dialogue state (`slu` field). There are 676 dialogues in total; most of them are finished, but some are not. -| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | -| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | -| train | 406 | 2936 | 7.23 | 11.36 | 1 | -| dev | 135 | 941 | 6.97 | 11.99 | 1 | -| train | 135 | 935 | 6.93 | 11.87 | 1 | +- **How to get the transformed data from original data:** + - Run `python preprocess.py` in the current directory. Requires `../../camrest/` as the original data. +- **Main changes of the transformation:** + - Add dialogue act annotation according to the state change. 
This step was done by ConvLab-2, and we use the processed dialogue acts here. + - Rename `pricerange` to `price range` + - Add character-level span annotation for non-categorical slots. +- **Annotations:** + - user goal, dialogue acts, state. -## Main changes +### Supported Tasks and Leaderboards -- domain is set to **restaurant** -- ignore some rare pair -- 3 values are not found in original utterances -- **dontcare** values in non-categorical slots are calculated in `evaluate.py` so `da_match` in evaluation is lower than actual number. +NLU, DST, Policy, NLG, E2E, User simulator -## Original data +### Languages -camrest used in convlab2, included in `data/` path \ No newline at end of file +English + +### Data Splits + +| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) | +| ---------- | --------- | ---------- | ------- | ---------- | ----------- | --------------------- | -------------------- | ---------------------------- | ------------------------------- | +| train | 406 | 3342 | 8.23 | 10.6 | 1 | 100 | 100 | 100 | 99.83 | +| validation | 135 | 1076 | 7.97 | 11.26 | 1 | 100 | 100 | 100 | 100 | +| test | 135 | 1070 | 7.93 | 11.01 | 1 | 100 | 100 | 100 | 100 | +| all | 676 | 5488 | 8.12 | 10.81 | 1 | 100 | 100 | 100 | 99.9 | + +1 domain: ['restaurant'] +- **cat slot match**: the percentage of categorical slot values that appear among the possible values defined in the ontology. +- **non-cat slot span**: the percentage of non-categorical slot values that have a span annotation. + +### Citation + +``` +@inproceedings{wen-etal-2016-conditional, + title = "Conditional Generation and Snapshot Learning in Neural Dialogue Systems", + author = "Wen, Tsung-Hsien and Ga{\v{s}}i{\'c}, Milica and Mrk{\v{s}}i{\'c}, Nikola and Rojas-Barahona, Lina M. 
and Su, Pei-Hao and Ultes, Stefan and Vandyke, David and Young, Steve", + booktitle = "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing", + month = nov, + year = "2016", + address = "Austin, Texas", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/D16-1233", + doi = "10.18653/v1/D16-1233", + pages = "2153--2162", +} +``` + +### Licensing Information + +[**CC BY 4.0**](https://creativecommons.org/licenses/by/4.0/) \ No newline at end of file diff --git a/data/unified_datasets/camrest/data.zip b/data/unified_datasets/camrest/data.zip index b0d3db9f816f377f431e33d4a43ab0b9eb668f2a..cd9822c65929d46b992b511787893bc92725a76f 100644 Binary files a/data/unified_datasets/camrest/data.zip and b/data/unified_datasets/camrest/data.zip differ diff --git a/data/unified_datasets/camrest/dummy_data.json b/data/unified_datasets/camrest/dummy_data.json new file mode 100644 index 0000000000000000000000000000000000000000..bd24e654272429e1bec3d92037b773c2b03bf1d9 --- /dev/null +++ b/data/unified_datasets/camrest/dummy_data.json @@ -0,0 +1,2276 @@ +[ + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-0", + "original_id": 0, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 11193: You are looking for an expensive restaurant and it should be in the south part of town. 
Make sure you get the address of the venue.", + "inform": { + "restaurant": { + "price range": "expensive", + "area": "south" + } + }, + "request": { + "restaurant": { + "address": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I need to find an expensive restauant that's in the south section of the city.", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "south" + } + ], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "south", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are several restaurants in the south part of town that serve expensive food. Do you have a cuisine preference?", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "south" + } + ], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No I don't care about the type of cuisine.", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "dontcare" + } + ] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "south", + "food": "dontcare" + } + } + }, + { + "speaker": "system", + "utterance": "Chiquito Restaurant Bar is a Mexican restaurant located in the south part of town.", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": 
"area", + "value": "south" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Chiquito Restaurant Bar", + "start": 0, + "end": 23 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Mexican", + "start": 29, + "end": 36 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is their address?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "south", + "food": "dontcare" + } + } + }, + { + "speaker": "system", + "utterance": "There address is 2G Cambridge Leisure Park Cherry Hinton Road Cherry Hinton, it there anything else I can help you with?", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "2G Cambridge Leisure Park Cherry Hinton Road Cherry Hinton", + "start": 17, + "end": 75 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is the food type they serve ?", + "utt_idx": 6, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "south", + "food": "dontcare" + } + } + }, + { + "speaker": "system", + "utterance": "They serve Mexican food .", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Mexican", + "start": 11, + "end": 18 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Nope, that's what I was looking for. 
Thanks a lot!", + "utt_idx": 8, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "south", + "food": "dontcare" + } + } + }, + { + "speaker": "system", + "utterance": "Thank you very much for using the Cambridge Restaurant System.", + "utt_idx": 9, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-1", + "original_id": 1, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 02790: You want to find a moderately priced restaurant and it should serve cantonese food. If there is no such venue how about chinese type of food. You want to know the phone number of the venue.", + "inform": { + "restaurant": { + "food": "chinese", + "price range": "moderate" + } + }, + "request": { + "restaurant": { + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I am looking for a restaurant that is moderately priced and serves Cantonese food.", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Cantonese", + "start": 67, + "end": 76 + } + ] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "cantonese" + } + } + }, + { + "speaker": "system", + "utterance": "There are no restaurants that serve Cantonese food in the moderate price range.", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "How about chinese type 
of food?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "chinese", + "start": 10, + "end": 17 + } + ] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "the Golden wok serves chinese food and is in the moderate price range. Would you like their location? ", + "utt_idx": 3, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "area" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Golden wok", + "start": 4, + "end": 14 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "chinese", + "start": 22, + "end": 29 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Their phone number please.", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "Their phone number is 01223 350688.", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 350688", + "start": 22, + "end": 34 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is the area?", + "utt_idx": 6, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "area" + } + ], + "categorical": [], + "non-categorical": [] + }, + 
"state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "it is located in the north part of town", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "north" + } + ], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you for your help. Good bye.", + "utt_idx": 8, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "Goodbye.", + "utt_idx": 9, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-2", + "original_id": 2, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 10230: You are looking for an expensive restaurant and it should serve australian food. If there is no such venue how about british type of food. You want to know the phone number of the venue.", + "inform": { + "restaurant": { + "food": "british", + "price range": "expensive" + } + }, + "request": { + "restaurant": { + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "Hello, I am looking for an expensive restaurant that serves Australian food. 
", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Australian", + "start": 60, + "end": 70 + } + ] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "", + "food": "australian" + } + } + }, + { + "speaker": "system", + "utterance": "There are no records matching your request for an expensive restaurant serving Australian food. May I help you find a different restaurant? ", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What about British food?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "British", + "start": 11, + "end": 18 + } + ] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "", + "food": "british" + } + } + }, + { + "speaker": "system", + "utterance": "There are six restaurants matching your query. 
One of them is travellers rest.", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "travellers rest", + "start": 62, + "end": 77 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Could I have the phone number for one of the restaurants?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "", + "food": "british" + } + } + }, + { + "speaker": "system", + "utterance": "there phone number is 01223352500 is there anything else i can help you with?", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223352500", + "start": 22, + "end": 33 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thanks, goodbye.", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "", + "food": "british" + } + } + }, + { + "speaker": "system", + "utterance": "Thank you, goodbye.", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-3", + "original_id": 3, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 01215: You want to find an expensive restaurant and it should serve african food. 
You want to know the phone number.", + "inform": { + "restaurant": { + "food": "african", + "price range": "expensive" + } + }, + "request": { + "restaurant": { + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I am wanting an expensive restaurant that offers African food. What is their number?", + "utt_idx": 0, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "African", + "start": 49, + "end": 56 + } + ] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "", + "food": "african" + } + } + }, + { + "speaker": "system", + "utterance": "Where are you located? I see two that might work but Id like to offer the closest.", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "area" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I do not care about the area of town.", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "dontcare" + } + ], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "dontcare", + "food": "african" + } + } + }, + { + "speaker": "system", + "utterance": "Bedouin is an expensive African restaurant in the city centre. 
", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "centre" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "African", + "start": 24, + "end": 31 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Bedouin", + "start": 0, + "end": 7 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "That sounds good, what is their number so that I can ring them?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "dontcare", + "food": "african" + } + } + }, + { + "speaker": "system", + "utterance": "Their phone number is 01223 367660.", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 367660", + "start": 22, + "end": 34 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Great! Thank you. Good Bye. 
", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "expensive", + "area": "dontcare", + "food": "african" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome, goodbye.", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-4", + "original_id": 4, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 07173: You are looking for a moderately priced restaurant and it should be in the north part of town. Make sure you get the address and type of food.", + "inform": { + "restaurant": { + "price range": "moderate", + "area": "north" + } + }, + "request": { + "restaurant": { + "address": "", + "food": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I would like a moderately priced restaurant in the north part of town.", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "north" + } + ], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "north", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "Golden Wok is a moderately priced restaurant in the north side of town. 
", + "utt_idx": 1, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "north" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Golden Wok", + "start": 0, + "end": 10 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What type of food does Golden Wok serve?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "north", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "The Golden Wok serves Chinese food. Would you like more information?", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Golden Wok", + "start": 4, + "end": 14 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Chinese", + "start": 22, + "end": 29 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Yes what is the address for the business?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "north", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "They are located at 191 Histon Road Chesterton", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "191 Histon Road Chesterton", + "start": 20, + "end": 46 + } + ] + }, + "db_results": {} + }, + { + "speaker": 
"user", + "utterance": "Great. I have everything I need, Thank you ", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "north", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "You're welconme", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-5", + "original_id": 5, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 11163: You want to find a restaurant in the north part of town and it should serve traditional food. If there is no such venue how about asian oriental type of food. You want to know the address and phone number of the venue.", + "inform": { + "restaurant": { + "food": "asian oriental", + "area": "north" + } + }, + "request": { + "restaurant": { + "address": "", + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "i want somewhere that serves traditional food.", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "traditional", + "start": 29, + "end": 40 + } + ] + }, + "state": { + "restaurant": { + "price range": "", + "area": "", + "food": "traditional" + } + } + }, + { + "speaker": "system", + "utterance": "There are no traditional restaurants in the city. Can I help you with a different type of cuisine? 
", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "How abou asian oriental food", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "asian oriental", + "start": 9, + "end": 23 + } + ] + }, + "state": { + "restaurant": { + "price range": "", + "area": "", + "food": "asian oriental" + } + } + }, + { + "speaker": "system", + "utterance": "There are 5 restaurants meeting your criteria. What area and price range did you have in mind?", + "utt_idx": 3, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "area" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "price range" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is the address and phone number?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "", + "area": "", + "food": "asian oriental" + } + } + }, + { + "speaker": "system", + "utterance": "The dojo noodle bar serves asian oriental food. They are located at 40210 Millers Yard City Centre and their phone number is 01223 363471. 
", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "dojo noodle bar", + "start": 4, + "end": 19 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "asian oriental", + "start": 27, + "end": 41 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "40210 Millers Yard City Centre", + "start": 69, + "end": 99 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 363471", + "start": 126, + "end": 138 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "thank you. good bye.", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "", + "area": "", + "food": "asian oriental" + } + } + }, + { + "speaker": "system", + "utterance": "Good bye", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-6", + "original_id": 6, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 07425: You are looking for a cheap restaurant and it should be in the north part of town. Don't go for the first venue the system offers you, ask if there is anything else. Make sure you get the phone number of the venue.", + "inform": { + "restaurant": { + "price range": "cheap", + "area": "north" + } + }, + "request": { + "restaurant": { + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "Is there a cheap place in the north part of town? 
", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "cheap" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "north" + } + ], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "cheap", + "area": "north", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are two restaurants that meet your specification. One is Italian and the other is Indian. Which would you prefer?", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + } + ], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Italian", + "start": 63, + "end": 70 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Indian", + "start": 88, + "end": 94 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Italian, please. 
Can you give me the address?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Italian", + "start": 0, + "end": 7 + } + ] + }, + "state": { + "restaurant": { + "price range": "cheap", + "area": "north", + "food": "italian" + } + } + }, + { + "speaker": "system", + "utterance": "da vinci pizzeria is in the north part of town their address is 20 Milton Road Chesterton.", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "north" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "20 Milton Road Chesterton", + "start": 64, + "end": 89 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "da vinci pizzeria", + "start": 0, + "end": 17 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is the phone number please?", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "cheap", + "area": "north", + "food": "italian" + } + } + }, + { + "speaker": "system", + "utterance": "da vinci pizzeria's phone number is 01223351707. Can I help you with anything else? 
", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "da vinci pizzeria", + "start": 0, + "end": 17 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223351707", + "start": 36, + "end": 47 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No that will be it. Thank you ", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "cheap", + "area": "north", + "food": "italian" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome. Goodbye.", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-7", + "original_id": 7, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 09135: You want to find a moderately priced restaurant and it should be in the east part of town. Make sure you get the phone number of the venue.", + "inform": { + "restaurant": { + "price range": "moderate", + "area": "east" + } + }, + "request": { + "restaurant": { + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I want a moderately priced restaurant in the east part of town. 
What is the phone number?", + "utt_idx": 0, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "east" + } + ], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "east", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "Would you prefer Indian food or Italian food?", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food" + } + ], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Italian", + "start": 32, + "end": 39 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Indian", + "start": 17, + "end": 23 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I don't care. Give me their phone number.", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "east", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "Curry Prince is a moderately priced restaurant in the east part of town that serves Indian food. 
Would you like the phone number?", + "utt_idx": 3, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "east" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Curry Prince", + "start": 0, + "end": 12 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Indian", + "start": 84, + "end": 90 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Yes give me the phone number.", + "utt_idx": 4, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "east", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "The phone number for Curry Prince is 01223 566388. ", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 566388", + "start": 37, + "end": 49 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Curry Prince", + "start": 21, + "end": 33 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Awesome, thank you! 
Goodbye!", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "east", + "food": "" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome, goodbye.", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-8", + "original_id": 8, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 11820: You are looking for a restaurant in the east part of town and it should serve world food. If there is no such venue how about indian type of food. You want to know the address and phone number of the venue.", + "inform": { + "restaurant": { + "food": "indian", + "area": "east" + } + }, + "request": { + "restaurant": { + "address": "", + "phone": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I'd like a place that serves world food in the east part of town. ", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "east" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "world", + "start": 29, + "end": 34 + } + ] + }, + "state": { + "restaurant": { + "price range": "", + "area": "east", + "food": "world" + } + } + }, + { + "speaker": "system", + "utterance": "Sorry but no world food restaurants in the east is found. can i provide you something else. 
", + "utt_idx": 1, + "dialogue_acts": { + "binary": [ + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "How about Indian food?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Indian", + "start": 10, + "end": 16 + } + ] + }, + "state": { + "restaurant": { + "price range": "", + "area": "east", + "food": "indian" + } + } + }, + { + "speaker": "system", + "utterance": "Your options are Curry Prince, Pipasha Restaurant, Rajmahal, and Sitar Tandoori. Which one would you like more information regarding?", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Curry Prince", + "start": 17, + "end": 29 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Pipasha Restaurant", + "start": 31, + "end": 49 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Sitar Tandoori", + "start": 65, + "end": 79 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Rajmahal", + "start": 51, + "end": 59 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I would like the address and phone number of Curry Prince.", + "utt_idx": 4, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "", + "area": "east", + "food": "indian" + } + } + }, + { + "speaker": "system", + "utterance": "The curry prince is located at 451 Newmarket Road Fen Ditton and 
their phone number is 01223 566388. Can I help you with anything else today? ", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "451 Newmarket Road Fen Ditton", + "start": 31, + "end": 60 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 566388", + "start": 87, + "end": 99 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "curry prince", + "start": 4, + "end": 16 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No, thank you goobye.", + "utt_idx": 6, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "", + "area": "east", + "food": "indian" + } + } + }, + { + "speaker": "system", + "utterance": "thank you, goodbye", + "utt_idx": 7, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + }, + { + "dataset": "camrest", + "data_split": "train", + "dialogue_id": "camrest-train-9", + "original_id": 9, + "domains": [ + "restaurant" + ], + "goal": { + "description": "Task 06252: You are looking for a moderately priced restaurant and it should serve chinese food. 
Make sure you get the address, phone number, and postcode.", + "inform": { + "restaurant": { + "food": "chinese", + "price range": "moderate" + } + }, + "request": { + "restaurant": { + "address": "", + "phone": "", + "postcode": "" + } + } + }, + "finished": true, + "turns": [ + { + "speaker": "user", + "utterance": "I'd like to find a moderately priced chinese restaurant.", + "utt_idx": 0, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "chinese", + "start": 37, + "end": 44 + } + ] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "Golden Wok is a moderately priced Chinese restaurant in North.", + "utt_idx": 1, + "dialogue_acts": { + "binary": [], + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "moderate" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "North" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Golden Wok", + "start": 0, + "end": 10 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Chinese", + "start": 34, + "end": 41 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Can you give me the address, phone number, and area code?", + "utt_idx": 2, + "dialogue_acts": { + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "phone" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": 
"moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "The Golden Wok is at 191 Histon Road Chesterton, post code C.B 4, 3 H.L. The phone number is 01223 350688.", + "utt_idx": 3, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "191 Histon Road Chesterton", + "start": 21, + "end": 47 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "postcode", + "value": "C.B 4, 3 H.L", + "start": 59, + "end": 71 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "Golden Wok", + "start": 4, + "end": 14 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "phone", + "value": "01223 350688", + "start": 93, + "end": 105 + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you very much for the address of the golden wok!", + "utt_idx": 4, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "state": { + "restaurant": { + "price range": "moderate", + "area": "", + "food": "chinese" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome. 
good bye!", + "utt_idx": 5, + "dialogue_acts": { + "binary": [], + "categorical": [], + "non-categorical": [] + }, + "db_results": {} + } + ] + } +] \ No newline at end of file diff --git a/data/unified_datasets/camrest/preprocess.py b/data/unified_datasets/camrest/preprocess.py index 510df266aa4d528bfe19fd5c496be5d2f96521cb..5841c98434d0e97c398c5034622a6042b07e36c4 100644 --- a/data/unified_datasets/camrest/preprocess.py +++ b/data/unified_datasets/camrest/preprocess.py @@ -2,107 +2,117 @@ import zipfile import json import os import copy -import logging - -logging.basicConfig(level=logging.INFO) -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) -# print(sys.path[-1]) - -from convlab2.util.file_util import read_zipped_json, write_zipped_json - -self_dir = os.path.dirname(os.path.abspath(__file__)) - -cat_slot_values = { - 'area': ['north', 'east', 'west', 'south', 'centre'], - 'pricerange': ['cheap', 'moderate', 'expensive'] -} - -camrest_desc = { - 'restaurant': { - 'domain': 'find a restaurant to eat', - 'food': 'food type the restaurant serves', - 'area': 'area where the restaurant is located', - 'name': 'name of the restaurant', - 'pricerange': 'price range of the restaurant', - 'phone': 'phone number of the restaurant', - 'address': 'exact location of the restaurant', - 'postcode': 'postal code of the restaurant', +from shutil import copy2, rmtree +from zipfile import ZipFile, ZIP_DEFLATED + +ontology = { + 'domains': { + 'restaurant': { + 'description': 'find a restaurant to eat', + 'slots': { + 'area': { + 'description': 'area where the restaurant is located', + 'is_categorical': True, + 'possible_values': ["centre","north","west","south","east"] + }, + 'price range': { + 'description': 'price range of the restaurant', + 'is_categorical': True, + 'possible_values': ["cheap","moderate","expensive"] + }, + 'food': { + 'description': 'the cuisine of the restaurant', + 'is_categorical': False, + 
'possible_values': ["afghan","african","afternoon tea","asian oriental","australasian","australian","austrian","barbeque","basque","belgian","bistro","brazilian","british","canapes","cantonese","caribbean","catalan","chinese","christmas","corsica","creative","crossover","cuban","danish","eastern european","english","eritrean","european","french","fusion","gastropub","german","greek","halal","hungarian","indian","indonesian","international","irish","italian","jamaican","japanese","korean","kosher","latin american","lebanese","light bites","malaysian","mediterranean","mexican","middle eastern","modern american","modern eclectic","modern european","modern global","molecular gastronomy","moroccan","new zealand","north african","north american","north indian","northern european","panasian","persian","polish","polynesian","portuguese","romanian","russian","scandinavian","scottish","seafood","singaporean","south african","south indian","spanish","sri lankan","steakhouse","swedish","swiss","thai","the americas","traditional","turkish","tuscan","unusual","vegetarian","venetian","vietnamese","welsh","world"] + }, + 'name': { + 'description': 'name of the restaurant', + 'is_categorical': False, + 'possible_values': [] + }, + 'phone': { + 'description': 'phone number of the restaurant', + 'is_categorical': False, + 'possible_values': [] + }, + 'address': { + 'description': 'exact location of the restaurant', + 'is_categorical': False, + 'possible_values': [] + }, + 'postcode': { + 'description': 'postcode of the restaurant', + 'is_categorical': False, + 'possible_values': [] + } + } + } }, 'intents': { - 'inform': 'inform user of value of a slot', - 'request': 'ask for value of a slot', - 'nooffer': 'inform user that no restaurant matches his request' + 'inform': { + 'description': 'inform the value of a slot' + }, + 'request': { + 'description': 'ask for the value of a slot' + }, + 'nooffer': { + 'description': 'inform the user that there is no result satisfies user 
requirements' + } + }, + 'state': { + 'restaurant': { + 'price range': '', + 'area': '', + 'food': '' + } + }, + 'dialogue_acts': { + "categorical": {}, + "non-categorical": {}, + "binary": {} } } -all_slots = ['food', 'area', 'name', 'pricerange', 'phone', 'address', 'postcode'] - -def convert_da(utt, da, all_intent, all_binary_das): +def convert_da(utt, da): + global ontology converted_da = { 'binary': [], 'categorical': [], 'non-categorical': [] } - for _intent, svs in da.items(): - if _intent not in all_intent: - all_intent.append(_intent) - - if _intent == 'nooffer': + for intent, svs in da.items(): + assert intent in ontology['intents'] + if intent == 'nooffer': + assert svs == [['none', 'none']] converted_da['binary'].append({ - 'intent': _intent, + 'intent': intent, 'domain': 'restaurant', 'slot': '', - 'value': '' }) - - if { - 'intent': _intent, - 'domain': 'restaurant', - 'slot': '', - 'value': '' - } not in all_binary_das: - all_binary_das.append({ - 'intent': _intent, - 'domain': 'restaurant', - 'slot': '', - 'value': '' - }) continue for s, v in svs: if 'care' in v: - v = 'dontcare' - s = s.lower() - v = v.lower() - if _intent == 'request': + assert v == 'dontcare', print(v) + assert s == s.lower() + if s == 'pricerange': + s = 'price range' + v = v + if intent == 'request': + assert v == '?' 
converted_da['binary'].append({ - 'intent': _intent, + 'intent': intent, 'domain': 'restaurant', - 'slot': s, - 'value': '' + 'slot': s }) - - if { - 'intent': _intent, - 'domain': 'restaurant', - 'slot': s, - 'value': '' - } not in all_binary_das: - all_binary_das.append({ - 'intent': _intent, - 'domain': 'restaurant', - 'slot': s, - 'value': '' - }) continue - if s in cat_slot_values: - assert v in cat_slot_values[s] + ['dontcare'] + if s in ['price range', 'area']: + assert v.lower() in ontology['domains']['restaurant']['slots'][s]['possible_values'] + ['dontcare'], print(s, v) converted_da['categorical'].append({ - 'intent': _intent, + 'intent': intent, 'domain': 'restaurant', 'slot': s, 'value': v @@ -110,240 +120,158 @@ def convert_da(utt, da, all_intent, all_binary_das): else: # non-categorical - start_ch = utt.find(v) + start_ch = utt.lower().find(v.lower()) if start_ch == -1: - # if not v == 'dontcare': - # logging.info('non-categorical slot value not found') - # logging.info('value: {}'.format(v)) - # logging.info('sentence: {}'.format(utt)) - # continue + if not v == 'dontcare': + print('non-categorical slot value not found') + print('value: {}'.format(v)) + print('sentence: {}'.format(utt)) + print() converted_da['non-categorical'].append({ - 'intent': _intent, + 'intent': intent, 'domain': 'restaurant', 'slot': s, 'value': v, - # 'start': 0, - # 'end': 0 }) - continue - - converted_da['non-categorical'].append({ - 'intent': _intent, - 'domain': 'restaurant', - 'slot': s, - 'value': utt[start_ch: start_ch + len(v)], - 'start': start_ch, - 'end': start_ch + len(v) - }) - assert utt[start_ch: start_ch + len(v)] == v + else: + converted_da['non-categorical'].append({ + 'intent': intent, + 'domain': 'restaurant', + 'slot': s, + 'value': utt[start_ch: start_ch + len(v)], + 'start': start_ch, + 'end': start_ch + len(v) + }) + assert utt[start_ch: start_ch + len(v)].lower() == v.lower() return converted_da -def convert_state(state, state_slots): - ret_state = 
{'restaurant': {k: '' for k in state_slots}} - for da in state: +def convert_state(slu): + global ontology + ret_state = copy.deepcopy(ontology['state']) + for da in slu: if da['act'] != 'inform': continue for s, v in da['slots']: - s = s.lower() - v = v.lower() - - if not s in all_slots: - logging.info('state slot {} not in all_slots!'.format(s)) + s = s if s != 'pricerange' else 'price range' + if s not in ret_state['restaurant']: + print('slot not in state') + print(da) + print() continue - ret_state['restaurant'][s] = v - if s not in state_slots: - print(s) - raise - return ret_state -def get_state_update(prev_state, cur_state, prev_turns, cur_user_da, dialog_id): - # cur_user_da: List of non-categorical slot-values - diff_state = {} - state_update = {'categorical': [], 'non-categorical':[]} - for s, v in cur_state.items(): - if s in prev_state and prev_state[s] == v: - continue - diff_state[s] = v - - for s, v in diff_state.items(): - if v == '': - continue - if s in cat_slot_values: - assert v in cat_slot_values[s] + ['dontcare'] - state_update['categorical'].append({ - 'domain': 'restaurant', - 'slot': s, - 'value': v, - }) - else: - # non-categorical slot - found = False - for _usr_da in cur_user_da: - if _usr_da['slot'] == s and _usr_da['value'] == v : - found = True - if v != 'dontcare' and 'start' in _usr_da: - state_update['non-categorical'].append({ - 'domain': 'restaurant', - 'slot': s, - 'value': v, - 'utt_idx': len(prev_turns), - 'start': _usr_da['start'], - 'end': _usr_da['end'] - }) - else: - state_update['non-categorical'].append({ - 'domain': 'restaurant', - 'slot': s, - 'value': v, - }) - if found: - continue - - prev_sys_da = [] if len(prev_turns) == 0 else prev_turns[-1]['dialogue_act']['non-categorical'] - for _sys_da in prev_sys_da: - if _sys_da['slot'] == s and _sys_da['value'] == v and 'start' in _sys_da: - if _sys_da['slot'] == s and _sys_da['value'] == v: - state_update['non-categorical'].append({ - 'domain': 'restaurant', - 'slot': s, 
- 'value': v, - 'utt_idx': len(prev_turns) - 1, - 'start': _sys_da['start'], - 'end': _sys_da['end'] - }) - found = True - - if not found: - state_update['non-categorical'].append({ - 'domain': 'restaurant', - 'slot': s, - 'value': v - }) - - return state_update - - def preprocess(): - original_zipped_path = os.path.join(self_dir, 'original_data.zip') - if not os.path.exists(original_zipped_path): - raise FileNotFoundError(original_zipped_path) - if not os.path.exists(os.path.join(self_dir, 'data.zip')) or not os.path.exists( - os.path.join(self_dir, 'ontology.json')): - # print('unzip to', new_dir) - # print('This may take several minutes') - archive = zipfile.ZipFile(original_zipped_path, 'r') - archive.extractall(self_dir) - - all_data = [] - all_intent = [] - all_binary_das = [] - all_state_slots = ['pricerange', 'area', 'food'] - - data_splits = ['train', 'val', 'test'] - extract_dir = os.path.join(self_dir, 'original_data') - - if not os.path.exists('data.zip') or not os.path.exists('ontology.json'): - - dialog_id = 1 - for data_split in data_splits: - data = json.load(open(os.path.join(self_dir, extract_dir, '{}.json'.format(data_split)))) - - for i, d in enumerate(data): - - dialogue = d['dial'] - converted_dialogue = { - 'dataset': 'camrest', - 'data_split': data_split, - 'dialogue_id': 'camrest_' + str(dialog_id), - 'original_id': d['dialogue_id'], - 'domains': ['restaurant'], - 'turns': [] - } - - prev_state = {'restaurant': {}} - for turn in dialogue: - usr_text = turn['usr']['transcript'].lower() - usr_da = turn['usr']['dialog_act'] - - sys_text = turn['sys']['sent'].lower() - sys_da = turn['sys']['dialog_act'] - - cur_state = convert_state(turn['usr']['slu'], all_state_slots) - cur_user_da = convert_da(usr_text, usr_da, all_intent, all_binary_das) - - usr_turn = { - 'utt_idx': len(converted_dialogue['turns']), - 'speaker': 'user', - 'utterance': usr_text, - 'dialogue_act': cur_user_da, - 'state': copy.deepcopy(cur_state), - 'state_update': 
get_state_update(prev_state['restaurant'], cur_state['restaurant'], converted_dialogue['turns'], cur_user_da['non-categorical'], converted_dialogue['dialogue_id']) - } - - sys_turn = { - 'utt_idx': len(converted_dialogue['turns'])+1, - 'speaker': 'system', - 'utterance': sys_text, - 'dialogue_act': convert_da(sys_text, sys_da, all_intent, all_binary_das), - } - - prev_state = copy.deepcopy(cur_state) - - converted_dialogue['turns'].append(usr_turn) - converted_dialogue['turns'].append(sys_turn) - if converted_dialogue['turns'][-1]['speaker'] == 'system': - converted_dialogue['turns'].pop(-1) - all_data.append(converted_dialogue) - dialog_id += 1 - - json.dump(all_data, open('./data.json', 'w'), indent=4) - write_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') - os.remove('data.json') - - new_ont = { - 'domains': {}, - 'intents': {}, - 'binary_dialogue_act': [], - 'state': {} - } + # use convlab-2 version camrest which already has dialog act annotation + original_data_dir = '../../camrest/' + new_data_dir = 'data' + + os.makedirs(new_data_dir, exist_ok=True) + + copy2(f'{original_data_dir}/db/CamRestDB.json', new_data_dir) + + dataset = 'camrest' + domain = 'restaurant' + splits = ['train', 'validation', 'test'] + dialogues_by_split = {split:[] for split in splits} + + for split in ['train', 'val', 'test']: + data = json.load(zipfile.ZipFile(os.path.join(original_data_dir, f'{split}.json.zip'), 'r').open(f'{split}.json')) + if split == 'val': + split = 'validation' + + cur_domains = [domain] + + for ori_dialog in data: + dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}' + + goal = { + 'description': ori_dialog['goal']['text'], + 'inform': {'restaurant': {}}, + 'request': {'restaurant': {}} + } + for slot, value in ori_dialog['goal']['info'].items(): + if slot == 'pricerange': + slot = 'price range' + goal['inform'][domain][slot] = value + for slot in ori_dialog['goal']['reqt']: + if slot == 'pricerange': + slot = 'price range' + 
goal['request'][domain][slot] = '' + + dialogue = { + 'dataset': dataset, + 'data_split': split, + 'dialogue_id': dialogue_id, + 'original_id': ori_dialog['dialogue_id'], + 'domains': cur_domains, + 'goal': goal, + 'finished': ori_dialog['finished'], + 'turns': [] + } - new_ont['state']['restaurant'] = {} - for ss in all_state_slots: - new_ont['state']['restaurant'][ss] = '' + for turn in ori_dialog['dial']: + usr_text = turn['usr']['transcript'] + usr_da = turn['usr']['dialog_act'] - for b in all_binary_das: - new_ont['binary_dialogue_act'].append(b) + sys_text = turn['sys']['sent'] + sys_da = turn['sys']['dialog_act'] - for i in all_intent: - new_ont['intents'][i] = {'description': camrest_desc['intents'][i]} - - new_ont['domains']['restaurant'] = { - 'description': camrest_desc['restaurant']['domain'], - 'slots': {} - } - for s in all_slots: - new_ont['domains']['restaurant']['slots'][s] = { - "description": camrest_desc['restaurant'][s], - "is_categorical": True if s in cat_slot_values else False, - "possible_values": cat_slot_values[s] if s in cat_slot_values else [] - } - json.dump(new_ont, open(os.path.join(self_dir, './ontology.json'), 'w'), indent=4) + cur_state = convert_state(turn['usr']['slu']) + cur_user_da = convert_da(usr_text, usr_da) + usr_turn = { + 'speaker': 'user', + 'utterance': usr_text, + 'utt_idx': len(dialogue['turns']), + 'dialogue_acts': cur_user_da, + 'state': cur_state, + } - else: - all_data = read_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') - new_ont = json.load(open(os.path.join(self_dir, './ontology.json'), 'r')) + sys_turn = { + 'speaker': 'system', + 'utterance': sys_text, + 'utt_idx': len(dialogue['turns'])+1, + 'dialogue_acts': convert_da(sys_text, sys_da), + 'db_results': {} + } - return all_data, new_ont + dialogue['turns'].append(usr_turn) + dialogue['turns'].append(sys_turn) + + for turn in dialogue['turns']: + speaker = turn['speaker'] + dialogue_acts = turn['dialogue_acts'] + + # add to dialogue_acts 
dictionary in the ontology + for da_type in dialogue_acts: + das = dialogue_acts[da_type] + for da in das: + ontology["dialogue_acts"][da_type].setdefault((da['intent'], da['domain'], da['slot']), {}) + ontology["dialogue_acts"][da_type][(da['intent'], da['domain'], da['slot'])][speaker] = True + dialogues_by_split[split].append(dialogue) + + dialogues = [] + for split in splits: + dialogues += dialogues_by_split[split] + for da_type in ontology['dialogue_acts']: + ontology["dialogue_acts"][da_type] = sorted([str({'user': speakers.get('user', False), 'system': speakers.get('system', False), 'intent':da[0],'domain':da[1], 'slot':da[2]}) for da, speakers in ontology["dialogue_acts"][da_type].items()]) + json.dump(dialogues[:10], open(f'dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf: + for filename in os.listdir(new_data_dir): + zf.write(f'{new_data_dir}/{filename}') + rmtree(original_data_dir) + rmtree(new_data_dir) + return dialogues, ontology if __name__ == '__main__':