diff --git a/convlab2/nlu/jointBERT/multiwoz/nlu.py b/convlab2/nlu/jointBERT/multiwoz/nlu.py index 8dd0014b1b3226b8ef83c98efc655e674c0dfe15..8b9f0dcf9fb6c4d8bd426b8423f9158e881d0c0c 100755 --- a/convlab2/nlu/jointBERT/multiwoz/nlu.py +++ b/convlab2/nlu/jointBERT/multiwoz/nlu.py @@ -1,4 +1,5 @@ import os +import re import zipfile import json import torch @@ -66,6 +67,8 @@ class BERTNLU(NLU): print("BERTNLU loaded") def predict(self, utterance, context=list()): + # Note: spacy cannot tokenize 'id' or 'Id' correctly. + utterance = re.sub(r'\b(id|Id)\b', 'ID', utterance) # tokenization first, very important! ori_word_seq = [token.text for token in self.nlp(unidecode(utterance)) if token.text.strip()] # print(ori_word_seq)