# Select Git revision
# Code owners
# Assign users and groups as approvers for specific file changes. Learn more.
# RegexSemI.py 17.86 KiB
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
"""
RegexSemI.py - Regular expressions SemI parser base class
==========================================================
.. note::
This implementation is based on the following assumptions:
- obs can be a ASR n-best list - potential sentence inputs each with a probability (Currently - no probabilities - will have to slightly amend code to deal with these.)
- will only output text semantic acts (plus probs maybe) -- wont be instances of DiaAct for example
.. warning::
Remember that this is the base class for all of the regex parsers. Making changes here could possibly fix a parser
in your domain, but (at worst!) break/weaken parsers in all other domains! i.e. -- 1st, 2nd and 3rd approaches
should be to tweak the derived class for the domain of interest. You can redefine anything in the derived class.
.. seealso:: CUED Imports/Dependencies:
import :mod:`utils.Settings` |.|
import :class:`semi.SemI.SemI` |.|
import :class:`semi.SemIContextUtils` |.|
import :mod:`ontology.Ontology`
"""
__author__ = "cued_dialogue_systems_group"
'''
Modifications History
===============================
Date Author Description
===============================
Jul 21 2016 lmr46 Refactoring, creating inheritance and grouping functionalities together
'''
import re,os
import importlib
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.sys.path.insert(0,parentdir)
from utils import ContextLogger
from ontology import Ontology
logger = ContextLogger.getLogger('')
from semi.SemI import SemI
from semi import SemIContextUtils as contextUtils
class RegexSemI(SemI):
    """Base class for each domain's regular-expression based semantic parser.

    The primary focus here is on the **user's intent**. The derived semantic parser of each
    domain deals with the constraints (slots, values) by overriding the ``_decode_*`` hooks
    that are left empty in this class.
    """

    def __init__(self):
        # rTYPE is the regex describing the entity at hand (used for request alternatives).
        # It is expected to have been set by the child class before this constructor runs;
        # otherwise a placeholder is used.
        if not hasattr(self, 'rTYPE'):
            self.rTYPE = "(thereisnovalueyet)"
            logger.warning("No rTYPE regex has been set.")
        self.domainTag = None
        self.init_regular_expressions()

    def _domain_init(self, dstring):
        """Load slot information for domain ``dstring`` from the global ontology.

        Will be used by all classes that extend this one. Sets ``USER_REQUESTABLE``,
        ``USER_INFORMABLE``, ``domains_type`` and ``slot_values``; the latter maps each
        informable slot to a dict ``{value: "(value)"}`` of per-value regular expressions.

        :param dstring: domain tag handed to the ontology look-ups.
        """
        ontology = Ontology.global_ontology
        self.USER_REQUESTABLE = ontology.get_requestable_slots(dstring)
        self.USER_INFORMABLE = ontology.get_informable_slots(dstring)
        self.domains_type = ontology.get_type(dstring)

        # Generate regular expressions for informs:
        self.slot_values = {}
        for slot in self.USER_INFORMABLE:
            values = ontology.get_informable_slot_values(dstring, slot)
            # NOTE: values are embedded verbatim (not regex-escaped), as derived classes may rely on this.
            self.slot_values[slot] = {value: "(" + str(value) + ")" for value in values}

    # Optional arguments sys_act and turn here, needed only by DLSemI
    def decode(self, obs, sys_act=None, turn=None):
        """Parse a sentence, or an n-best list of sentences, into semantic acts.

        :param obs: either a single sentence (str) or a list whose items are sentences (str)
            or ``(sentence, probability)`` tuples as in an n-best list.
        :param sys_act: system act forwarded to :meth:`decode_single_hypothesis` (used to add context).
        :param turn: not used by this class.
        :returns: the output of :meth:`combine_parses` applied to a list of
            ``(semantic acts, probability)`` tuples, one per decoded hypothesis. The probability
            is ``None`` for hypotheses that were given as plain strings.
        """
        # bundle up into a list if a single sentence (ie string) is input
        if isinstance(obs, str):
            obs = [obs]
        if not isinstance(obs, list):
            logger.error("Expecting a list or string as input")

        all_hyps = []
        for ob in obs:
            if isinstance(ob, tuple):
                sentence, sentence_prob = ob[0], ob[1]
            elif isinstance(ob, str):
                sentence, sentence_prob = ob, None
            else:
                sentence, sentence_prob = None, None
            if not isinstance(sentence, str):
                logger.error("For decoding, expected either a str or (str,probability) tuple")
                # If the logger returns, skip this hypothesis instead of parsing an undefined
                # or stale sentence.
                continue
            all_hyps.append((self.decode_single_hypothesis(sentence, sys_act), sentence_prob))
        return self.combine_parses(all_hyps)

    def combine_parses(self, nbest_parses):
        """Combine the parses of an n-best list; currently returns them unchanged.

        TODO - should return a list of tuples (semantic act, prob) - in order

        :param nbest_parses: list of ``(semantic acts, probability)`` tuples.
        :returns: ``nbest_parses`` as given.
        """
        # TODO JUST A HACK FOR NOW - needs to combine probs and order to get single most likely semantic hypothesis
        # Should probably also be amending self.clean() or doing what it does here or ...
        # will work for now on assumption that there was only 1 sentence (not an nbest list) to parse anyway
        return nbest_parses

    def decode_single_hypothesis(self, obs, sys_act=None):
        """Run one sentence through every act decoder and return the cleaned result.

        :param obs: (str) sentence (an ASR hypothesis)
        :param sys_act: system act passed on to :meth:`clean`.
        :returns: ``self.semanticActs`` as left by :meth:`clean`.
        """
        self.semanticActs = []
        # run the obs thru all possible semantic acts (order kept as acts are appended in turn):
        self._decode_request(obs)
        self._decode_affirm(obs)
        self._decode_inform(obs)
        self._decode_confirm(obs)
        self._decode_hello(obs)
        self._decode_negate(obs)
        self._decode_repeat(obs)
        self._decode_reqalts(obs)
        self._decode_bye(obs)
        self._decode_type(obs)
        # probably need to then do some cleaning on acts in semanticActs
        self.clean(sys_act)
        return self.semanticActs

    def init_regular_expressions(self):
        """Define the domain-independent regular expressions as instance attributes.

        All patterns are raw strings so that ``\\b`` reaches the regex engine as a word boundary.
        In a normal string literal ``"\\b"`` is a backspace character, which made patterns such
        as ``no\\b`` or ``ok\\b`` practically unmatchable.
        """
        self.rHELLO = r"(\b|^|\ )(hi|hello)\s"
        self.rNEG = r"(\b|^|\ )(no\b|wrong|incorrect|error)|not\ (true|correct|right)\s"
        self.rAFFIRM = r"(yes|yeah|(\b|^)ok\b|(\b|^)OK\b|okay|sure|(that('?s| is) )?(?<!not\ )(?<!no\ )(right|correct|confirm))"
        self.rBYE = r"(\b|^|\ )(bye|goodbye|that'?s?\ (is\ )*all)(\s|$|\ |\.)"
        self.GREAT = r"(great|good|awesome)"
        self.HELPFUL = r"(that((\')?s|\ (is|was))\ (very\ )?helpful)"
        self.THANK = r"(thank(s|\ you)(\ (very|so)\ much)?)"
        self.rTHANKS = (r"(^(\ )*)((" + self.GREAT + r"\ )?(" + self.HELPFUL + r"\ )?" + self.THANK
                        + r"(\ " + self.HELPFUL + r")?|(" + self.GREAT + r"\ )?" + self.HELPFUL
                        + r"|" + self.GREAT + r")((\ )*$)")
        self.rREQALTS = r"(\b|^|\ )((something|anything)\ else)|(different(\ one)*)|(another\ one)|(alternatives*)"
        self.rREQALTS += r"|(other options*)|((don\'*t|do not) (want|like)\ (that|this)(\ one)*)"
        self.rREQALTS += r"|(others|other\ " + self.rTYPE + r"(s)?)"
        self.rREPEAT = r"(\b|^|\ )(repeat\ that)|(say\ that\ again)"

        # The remaining regex are for the slot,value dependent acts - and so here in the base class are
        # just aiming to catch intent.
        # REQUESTS:
        self.WHAT = r"(what\'*s*|which|does|where)(\ (its|the))*"
        self.IT = r"(it\'*s*|it\ have|is\ it\'*s*|is\ (the|their))(\ for)*"
        self.CYTM = r"(can\ you\ tell\ me\ (the|it\'*s|their))"
        self.CIG = r"(can\ I\ get\ (the|it\'*s|their))"
        self.NEGATE = r"((i\ )*(don\'?t|do\ not|does\ not|does\'?nt)\ (care|mind|matter)(\ (about|what))*(\ (the|it\'?s*))*)"
        # Can't create variable lengths with negative lookback... else merge following:
        self.DONTCARE = r"(i\ dont\ care)"
        self.DONTCAREWHAT = r"(i\ dont\ care\ what\ )"
        self.DONTCAREABOUT = r"(i\ dont\ care\ about\ )"
        self.rREQUEST = (r"(\b|^|\ )(?<!" + self.DONTCARE + r")(" + self.WHAT + r"\ " + self.IT
                         + r"|" + self.CYTM + r"|" + self.CIG + r")")

        # INFORMS:
        self.WANT = r"(what\ about|want|have|need|looking\ for|used\ for)(\ a(n)?)*"
        self.WBG = r"(\ ((would|seems\ to)\ be\ (good|nice)($|[^\?]$)|seems\ (good|nice)($|[^\?]$)))"
        self.rINFORM = r"(\b|^|\ )" + self.WANT
        self.rINFORM_DONTCARE = self.DONTCARE + r"((what|which|about)(\ (it\'*s*|the))*)+"
        self.rINFORM_DONTWANT = r"(((i\ )*(don\'*t\ want))|it\ (shouldn\'*t|should\ not)\ (have|be))+"
        # Contextual dontcares: i.e things that should be labelled inform(=dontcare)
        self.rCONTEXTUAL_DONTCARE = r"(anything(?!\ else)|((any$|any\ kind)|(i\ )*(don\'?t|do\ not)\ (care|know))($|(?!\ (a?bout|of|what))|(\ (a?bout|of|what)\ (type|kind)(?!\ of))|\ a?bout\ (that|this))|(any(thing)?\ (is\ )*(fine|ok\b|okay|will\ do))($|\ and|\ but)|(it )?(doesn\'?t|does not) matter)+"

        # The following are NOT regular expressions, but EXACT string matching:
        self.COMMON_CONTEXTUAL_DONTCARES = ["i dont care", "any", "anything", "i dont mind"]
        self.COMMON_CONTEXTUAL_DONTCARES += ["it doesn't matter", "dont care"]

    def _decode_request(self, obs):
        """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
        pass

    def _decode_inform(self, obs):
        """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
        pass

    def _decode_confirm(self, obs):
        """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
        pass

    def _decode_type(self, obs):
        """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
        pass

    def _decode_hello(self, obs):
        """Append ``hello()`` if ``obs`` matches ``rHELLO`` (case-insensitive)."""
        if self._check(re.search(self.rHELLO, obs, re.I)):
            self.semanticActs.append('hello()')

    def _decode_negate(self, obs):
        """Append ``negate()`` if ``obs`` matches ``rNEG`` or is exactly 'no' or 'wrong'."""
        # TODO - is this a used act? syntax: neg()?
        # The exact-match test is kept from the original code; duplicates would be merged by clean() anyway.
        if self._check(re.search(self.rNEG, obs, re.I)) or obs in ('no', 'wrong'):
            self.semanticActs.append('negate()')

    def _decode_affirm(self, obs):
        """Append ``affirm()`` if ``obs`` matches ``rAFFIRM`` (case-insensitive)."""
        if self._check(re.search(self.rAFFIRM, obs, re.I)):
            self.semanticActs.append('affirm()')

    def _decode_bye(self, obs):
        """Append ``bye()`` if ``obs`` matches ``rBYE`` (case-insensitive).

        ``rTHANKS`` is deliberately not treated as a goodbye here.
        """
        if self._check(re.search(self.rBYE, obs, re.I)):
            self.semanticActs.append('bye()')

    def _decode_reqalts(self, obs):
        """Append ``reqalts()`` if ``obs`` matches ``rREQALTS`` (case-insensitive)."""
        if self._check(re.search(self.rREQALTS, obs, re.I)):
            self.semanticActs.append('reqalts()')

    def _decode_repeat(self, obs):
        """Append ``repeat()`` if ``obs`` matches ``rREPEAT`` (case-insensitive)."""
        if self._check(re.search(self.rREPEAT, obs, re.I)):
            self.semanticActs.append('repeat()')

    def clean(self, sys_act=None):
        """Merge the collected acts per intent and store them as one string.

        Each act in ``self.semanticActs`` first has context added via
        ``contextUtils._add_context_to_user_act``; acts sharing an intent are then merged into a
        single act. Afterwards ``self.semanticActs`` is a ``"|"``-joined string of the merged acts.

        :param sys_act: system act used for adding context to the user acts.
        """
        # TODO - deal with duplicates, probabilities, others?
        merged = {}
        # first add context to user acts
        for act in self.semanticActs:
            [contextualised] = contextUtils._add_context_to_user_act(sys_act, [[act, 1.0]], self.domainTag)
            intent, slotValues = self._parseDialogueAct(contextualised[0])
            if intent not in merged:
                merged[intent] = slotValues
                continue
            known = merged[intent]
            for slot, value in slotValues.items():
                if slot not in known:
                    known[slot] = value
                elif value != known[slot]:
                    # Conflicting values: keep the first one seen and report the clash.
                    logger.warning('Sematic decoding of input lead to different interpretations within one hypothesis. Slot {} has values {} and {}.'.format(slot, value, known[slot]))

        hypos = []
        for intent, slots in merged.items():
            if not slots:
                logger.warning("intent {} found in input without arguments".format(intent))
                continue
            # Format every slot on its own: bare "slot" when it has no value, "slot=value" otherwise.
            items = [slot if value is None else '%s=%s' % (slot, value) for slot, value in slots.items()]
            hypos.append('{}({})'.format(intent, ','.join(items)))
        self.semanticActs = "|".join(hypos)

    def _contextual_inform(self, obs):
        """Append ``inform(=dontcare)`` for contextual/implicit "don't care" statements.

        These are statements that don't mention the slot or value explicitly. The act may be
        appended twice (regex match and exact match).
        """
        if self._check(re.search(self.rCONTEXTUAL_DONTCARE, obs, re.I)):
            self.semanticActs.append('inform(=dontcare)')
        if self._exact_match(self.COMMON_CONTEXTUAL_DONTCARES, obs):
            self.semanticActs.append('inform(=dontcare)')

    def _exact_match(self, strings, obs):
        """Return True if ``obs`` equals one of ``strings`` after normalisation.

        Both sides are stripped of surrounding whitespace, lower-cased and have apostrophes
        removed, so that candidates containing an apostrophe (e.g. "it doesn't matter") can match.

        :param strings: iterable of candidate strings.
        :param obs: (str) the sentence to test.
        """
        def _normalise(text):
            return text.strip().lower().replace("'", "")

        return _normalise(obs) in {_normalise(candidate) for candidate in strings}

    def _domain_independent_requests(self, obs):
        """Append ``request(name)`` if ``obs`` asks what the entity is called."""
        rDOM_IN_REQ_NAME = r"((what(s*)|what\ is)\ it called)"
        if self._check(re.search(rDOM_IN_REQ_NAME, obs, re.I)):
            self.semanticActs.append('request(name)')

    def _check(self, re_object):
        """Return True if ``re_object`` is a match with at least one group that is not None.

        :param re_object: result of ``re.search`` (a match object or None).
        """
        if re_object is None:
            return False
        return any(group is not None for group in re_object.groups())

    def _parseDialogueAct(self, act):
        """Split an act string such as ``intent(slot=value,slot2)`` into its parts.

        :param act: (str or None) textual dialogue act.
        :returns: ``(intent, slotValues)``; ``slotValues`` maps each slot to its value, or to None
            for slots given without ``=``. ``(None, {})`` if ``act`` is None or cannot be parsed.
        """
        slotValues = {}
        intent = None
        if act is not None:
            match = re.match(r"([^\(]+)\(([^\)]*)\)", act)
            if match is not None:
                intent = match.group(1)
                slots = match.group(2).split(',')
                # Split on the first '=' only, so that a value containing '=' is kept whole.
                slotValues = dict(slot.split('=', 1) for slot in slots if '=' in slot)
                slotValues.update({slot: None for slot in slots if '=' not in slot})
        return intent, slotValues
class FileParser(object):
    """Run a domain's regex semantic parser over a file of sentences, one sentence per line.

    The parser class ``RegexSemI_<domainTag>`` is loaded from the module of the same name.
    """

    def __init__(self, filename, domainTag="CamRestaurants"):
        """
        :param filename: path of the file holding the example sentences.
        :param domainTag: domain tag used to build the parser module/class name.
        """
        self.domain_tag = domainTag
        self.filename = filename
        self.JOINER = " <=> "
        # Filled in by decode_file(); initialised here so test_file() is safe to call before it.
        self.inputs = []
        self.results = []
        # TODO - note that can import parser into Hubs via below 2 lines:
        parser_name = "RegexSemI_" + self.domain_tag
        parser_module = importlib.import_module(parser_name)
        self.parser = getattr(parser_module, parser_name)()

    def decode_file(self, DOPRINT=True):
        """Parse every line of ``self.filename`` and store inputs and results.

        ``self.inputs`` and ``self.results`` are reset and then hold, in file order, each line
        (without its trailing newline) and the first parse returned for it.

        :param DOPRINT: if True, print ``input <=> parse`` for every line.
        """
        self.inputs = []
        self.results = []
        with open(self.filename, "r") as f:
            for line in f:
                sentence = line.strip('\n')
                # The raw line, including its trailing newline, is what gets decoded (as in the
                # original code).
                result = self.parser.decode(line)[0][0]
                self.results.append(result)  # list order is persistent, as required here
                self.inputs.append(sentence)
                if DOPRINT:
                    print(sentence + self.JOINER + result)

    def test_file(self, referenceFile=None):
        """Compare the stored parses against a reference file and print any differences.

        Note this just has some **very basic checking** that the ref and parsed file match up appropriately.
        A guide to using this function for developing Regex SemI parsers:
        0. create a list of example sentences for parsing
        1. get a parser working a little
        2. Dump the output of parsing the example sentences file
        >> python RegexSemI.py _resources/EXAMPLE_INPUT_SENTENCES_FOR_DOMAIN DOMAINTAG PATH_TO_REPO_ROOT > OUTfile
        3. Fix the semantic parsers in the OUTfile so that it can be used as a reference
        4. Improve the parser
        5. Check the improvements against the reference OUTfile
        >> python RegexSemI.py _resources/EXAMPLE_INPUT_SENTENCES_FOR_DOMAIN DOMAINTAG PATH_TO_REPO_ROOT OUTfile
        6. go back to 4, add more sentences to examples file etc etc

        :param referenceFile: path of a file with lines ``input <=> expected parse``; nothing is
            done if it is None.
        :raises ValueError: if a reference line does not consist of exactly two parts separated
            by ``self.JOINER``.
        """
        if referenceFile is None:
            return
        with open(referenceFile, "r") as f:
            reference_lines = [line.strip('\n') for line in f]

        for lineNum, line in enumerate(reference_lines):
            bits = line.split(self.JOINER)
            if len(bits) != 2:
                raise ValueError("Malformed reference line {}: {!r}".format(lineNum + 1, line))
            userinput, reference = bits
            if lineNum >= len(self.inputs):
                # More reference lines than parsed inputs: report instead of failing on the index.
                print("MISMATCH ERROR: " + userinput + " has no corresponding parsed input")
            elif userinput != self.inputs[lineNum]:
                print("MISMATCH ERROR: " + userinput + " != " + self.inputs[lineNum])
            elif self.results[lineNum] != reference:
                print("INCORRECT PARSE: " + userinput)
                print('\t\t' + self.results[lineNum] + " != " + reference)

        if len(reference_lines) < len(self.inputs):
            print("MISMATCH ERROR: reference file has {} lines but {} inputs were parsed".format(
                len(reference_lines), len(self.inputs)))
#-------------------------------------------------------------------------------------------
# Main
#-------------------------------------------------------------------------------------------
if __name__=="__main__":
import sys
importlib.reload(sys)
# sys.setdefaultencoding('utf8') # Sometime the encoding in the example files may cause trouble otherwise
if len(sys.argv) < 4:
exit("Usage: python RegexSemi.py EXAMPLE_SENTENCES_FILEPATH DOMAIN_TAG REPOSITORY_ROOT [optional: REFERENCE_FILE]")
if len(sys.argv) == 5:
refFileIn = sys.argv[4]
else:
refFileIn = None
from utils import Settings
Settings.load_root(rootIn=sys.argv[3]) #when runing with FileParser() -- since without a config, root may be unavailable.
Settings.load_config(None)
Settings.config.add_section("GENERAL")
Settings.config.set("GENERAL",'domains', sys.argv[2])
Ontology.init_global_ontology()
fp = FileParser(filename=sys.argv[1], domainTag=sys.argv[2])
fp.decode_file(DOPRINT=refFileIn is None)
fp.test_file(referenceFile=refFileIn)
#END OF FILE