    ###############################################################################
    # PyDial: Multi-domain Statistical Spoken Dialogue System Software
    ###############################################################################
    #
    # Copyright 2015 - 2019
    # Cambridge University Engineering Department Dialogue Systems Group
    #
    # 
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    # http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #
    ###############################################################################
    
    """
    RegexSemI.py - Regular expressions SemI parser base class
    ==========================================================
    
    .. note::
    
        This implementation is based on the following assumptions:
        
        - obs can be an ASR n-best list: potential sentence inputs, each with a probability. (Currently there are no probabilities; the code will have to be slightly amended to deal with these.)
        - will only output text semantic acts (plus probabilities, maybe) -- won't be instances of DiaAct, for example
    
    
    .. warning::
    
        Remember that this is the base class for all of the regex parsers. Making changes here could possibly fix a parser
        in your domain, but (at worst!) break or weaken the parsers in all other domains. Your first, second and third
        approaches should therefore be to tweak the derived class for the domain of interest; you can redefine anything in the derived class.
        
    .. seealso:: CUED Imports/Dependencies: 
    
        import :mod:`utils.Settings` |.|
        import :class:`semi.SemI.SemI` |.|
        import :class:`semi.SemIContextUtils` |.|
        import :mod:`ontology.Ontology`
    
    
    """
    
    __author__ = "cued_dialogue_systems_group"
    
    '''
        Modifications History
        ===============================
        Date        Author  Description
        ===============================
        Jul 21 2016 lmr46   Refactoring, creating inheritance and grouping functionalities together
    '''
    
    import re, os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir) 
    from utils import ContextLogger
    from ontology import Ontology
    logger = ContextLogger.getLogger('')
    from semi.SemI import SemI
    from semi import SemIContextUtils as contextUtils
    
    class RegexSemI(SemI):
        """Is a  base class for each domains Regular Expressions based semantic parser. Primary focus is on **users intent**.
            The derived semantic parsers of each domain can deal with the constraints (slots,values).
        """
        def __init__(self):
            # Test whether the regex describing the entity at hand has been set in the child class; used for request alternatives.
            try:
                self.rTYPE
            except AttributeError:
                self.rTYPE = "(thereisnovalueyet)"
                logger.warning("No rTYPE regex has been set.")
                
            self.domainTag = None
                
            self.init_regular_expressions()
    
        def _domain_init(self, dstring):
            '''Load the domain's requestable and informable slots and their values from the ontology.
            Used by all classes that extend this one.
            '''
            self.USER_REQUESTABLE = Ontology.global_ontology.get_requestable_slots(dstring)
            self.USER_INFORMABLE = Ontology.global_ontology.get_informable_slots(dstring)
            self.domains_type = Ontology.global_ontology.get_type(dstring)
            # Generate regular expressions for informs:
            self.slot_values = dict.fromkeys(self.USER_INFORMABLE)
            for slot in list(self.slot_values.keys()):
                slot_values = Ontology.global_ontology.get_informable_slot_values(dstring, slot)
                self.slot_values[slot] = {value:"("+str(value)+")" for value in slot_values}
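            # e.g. (illustrative) self.slot_values["food"] == {"chinese": "(chinese)", "thai": "(thai)", ...}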
    
        # Optional arguments sys_act and turn here, needed only by DLSemI
        def decode(self, obs, sys_act=None, turn=None):
            """Assumes input is either a single string or a list of tuples (string, prob) as in a nbest list.
            """ 
            # bundle up into a list if a single sentence (i.e. a string) is input
            if isinstance(obs,str):
                obs = [obs]
            if not isinstance(obs,list):
                logger.error("Expecting a list or string as input") 
           
            all_hyps = []
            for ob in obs:
                if isinstance(ob,tuple):
                    sentence,sentence_prob = ob[0],ob[1]
                elif isinstance(ob,str):
                    sentence,sentence_prob = ob, None
                else:
                    logger.error("For decoding, expected either a str or (str,probability) tuple") 
                assert isinstance(sentence, str)
                all_hyps.append((self.decode_single_hypothesis(sentence, sys_act),sentence_prob)) 
            return self.combine_parses(all_hyps)
            #"""
    
        def combine_parses(self, nbest_parses):
            """TODO - should return a list of tuples (semantic act, prob) - in order
            """
            #TODO  JUST A HACK FOR NOW - needs to combine probs and order to get single most likely semantic hypothesis
            # Should probably also be ammending self.clean() or doing what it does here or ...
            # will work for now on assumption that there was only 1 sentence (not an nbest list) to parse anyway
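            # A possible combination scheme (illustrative sketch only, not wired in): sum the
            # sentence probabilities over identical act strings, then rank, e.g.
            #   scores = {}
            #   for acts, prob in nbest_parses:
            #       scores[acts] = scores.get(acts, 0.0) + (prob if prob is not None else 1.0)
            #   return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)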
            return nbest_parses 
    
        def decode_single_hypothesis(self,obs, sys_act = None):
            """
            :param obs: (str) sentence (an ASR hypothesis)
            """ 
            self.semanticActs = []
            # run the obs through all possible semantic acts:
            self._decode_request(obs)
            self._decode_affirm(obs)
            self._decode_inform(obs)
            self._decode_confirm(obs)
            self._decode_hello(obs)
            self._decode_negate(obs)
            self._decode_repeat(obs)
            self._decode_reqalts(obs)
            self._decode_bye(obs)
            self._decode_type(obs)
            # probably need to then do some cleaning on acts in semanticActs
            self.clean(sys_act)
            return self.semanticActs
    
        def init_regular_expressions(self):
            """
            """
            # Raw strings throughout, so that \b is a regex word boundary rather than a backspace character.
            self.rHELLO = r"(\b|^|\ )(hi|hello)\s"
            self.rNEG = r"(\b|^|\ )(no\b|wrong|incorrect|error)|not\ (true|correct|right)\s"
            self.rAFFIRM = r"(yes|yeah|(\b|^)ok\b|(\b|^)OK\b|okay|sure|(that('?s| is) )?(?<!not\ )(?<!no\ )(right|correct|confirm))"
            self.rBYE = r"(\b|^|\ )(bye|goodbye|that'?s?\ (is\ )*all)(\s|$|\ |\.)"
            self.GREAT = r"(great|good|awesome)"
            self.HELPFUL = r"(that((\')?s|\ (is|was))\ (very\ )?helpful)"
            self.THANK = r"(thank(s|\ you)(\ (very|so)\ much)?)"
            self.rTHANKS = r"(^(\ )*)((" + self.GREAT + r"\ )?(" + self.HELPFUL + r"\ )?" + self.THANK + r"(\ " + self.HELPFUL + r")?|(" + self.GREAT + r"\ )?" + self.HELPFUL + r"|" + self.GREAT + r")((\ )*$)"
            self.rREQALTS = r"(\b|^|\ )((something|anything)\ else)|(different(\ one)*)|(another\ one)|(alternatives*)"
            self.rREQALTS += r"|(other options*)|((don\'*t|do not) (want|like)\ (that|this)(\ one)*)"
            self.rREQALTS += r"|(others|other\ " + self.rTYPE + r"(s)?)"
            self.rREPEAT = r"(\b|^|\ )(repeat\ that)|(say\ that\ again)"
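            # Illustrative matches (assumed, based on the patterns above):
            #   rHELLO: "hello there"; rNEG: "wrong"; rAFFIRM: "that's correct";
            #   rBYE: "goodbye"; rREQALTS: "anything else"; rREPEAT: "say that again".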
            # The remaining regexes are for the slot/value-dependent acts, so here in the base class
            # they just aim to catch the intent.
            # REQUESTS:
            self.WHAT = r"(what\'*s*|which|does|where)(\ (its|the))*"
            self.IT = r"(it\'*s*|it\ have|is\ it\'*s*|is\ (the|their))(\ for)*"
            self.CYTM = r"(can\ you\ tell\ me\ (the|it\'*s|their))"
            self.CIG = r"(can\ I\ get\ (the|it\'*s|their))"
            self.NEGATE = r"((i\ )*(don\'?t|do\ not|does\ not|does\'?nt)\ (care|mind|matter)(\ (about|what))*(\ (the|it\'?s*))*)"
            self.DONTCARE = r"(i\ dont\ care)"  # Can't use variable-length patterns in a negative lookbehind... else merge the following:
            self.DONTCAREWHAT = r"(i\ dont\ care\ what\ )"
            self.DONTCAREABOUT = r"(i\ dont\ care\ about\ )"
            self.rREQUEST = r"(\b|^|\ )(?<!"+self.DONTCARE+")("+self.WHAT+"\ "+self.IT+"|"+self.CYTM+"|"+self.CIG+")"
            # INFORMS:
            self.WANT = r"(what\ about|want|have|need|looking\ for|used\ for)(\ a(n)?)*"
            self.WBG = r"(\ ((would|seems\ to)\ be\ (good|nice)($|[^\?]$)|seems\ (good|nice)($|[^\?]$)))"
            self.rINFORM = r"(\b|^|\ )" + self.WANT
            self.rINFORM_DONTCARE = self.DONTCARE+r"((what|which|about)(\ (it\'*s*|the))*)+" 
            self.rINFORM_DONTWANT = r"(((i\ )*(don\'*t\ want))|it\ (shouldn\'*t|should\ not)\ (have|be))+" 
            # Contextual dontcares: i.e things that should be labelled inform(=dontcare)
            self.rCONTEXTUAL_DONTCARE = r"(anything(?!\ else)|((any$|any\ kind)|(i\ )*(don\'?t|do\ not)\ (care|know))($|(?!\ (a?bout|of|what))|(\ (a?bout|of|what)\ (type|kind)(?!\ of))|\ a?bout\ (that|this))|(any(thing)?\ (is\ )*(fine|ok\b|okay|will\ do))($|\ and|\ but)|(it )?(doesn\'?t|does not) matter)+" 
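            # Illustrative matches (assumed): "anything", "any kind", "it doesnt matter", "dont care".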
            # The following are NOT regular expressions, but EXACT string matches:
            self.COMMON_CONTEXTUAL_DONTCARES = ["i dont care","any","anything", "i dont mind"]
            self.COMMON_CONTEXTUAL_DONTCARES += ["it doesn\'t matter", "dont care"]
    
    
    
        def _decode_request(self,obs):
            """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
            pass
    
        def _decode_inform(self,obs):
            """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
            pass
    
        def _decode_confirm(self, obs):
            """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
            pass
    
        def _decode_type(self,obs):
            """TO BE DEFINED IN DOMAIN SPECIFIC WAY IN DERIVED CLASS"""
            pass
    
        def _decode_hello(self,obs):
            """
            """  
            if self._check(re.search(self.rHELLO,obs, re.I)): #DEL is not None:
                self.semanticActs.append('hello()')
    
        def _decode_negate(self, obs):
            """
            """
            if self._check(re.search(self.rNEG,obs, re.I)): #DEL  is not None:
                self.semanticActs.append('negate()')  #TODO - is this a used act? syntax: neg()?
            if obs in ['no','wrong']:  # deal with negate a little differently
                self.semanticActs.append('negate()')
    
        def _decode_affirm(self,obs):
            """
            """ 
            if self._check(re.search(self.rAFFIRM,obs, re.I)): #DEL is not None:
                self.semanticActs.append('affirm()')
    
        def _decode_bye(self, obs):
            """
            """
            if self._check(re.search(self.rBYE,obs, re.I)): #DEL is not None:
                self.semanticActs.append('bye()')
    #         elif self._check(re.search(self.rTHANKS,obs,re.I)):
    #             self.semanticActs.append('bye()')
    
        def _decode_reqalts(self,obs):
            """
            """
            if self._check(re.search(self.rREQALTS,obs, re.I)): #DEL is not None:
                self.semanticActs.append('reqalts()')
    
        def _decode_repeat(self, obs):
            """
            """
            if self._check(re.search(self.rREPEAT,obs, re.I)): #DEL is not None:
                self.semanticActs.append('repeat()')
    
        def clean(self, sys_act = None):
            """
            """
            #TODO - deal with duplicates, probabilities, others?
            tempActs = {}
            
            #first add context to user acts
            for act in self.semanticActs:
                [act] = contextUtils._add_context_to_user_act(sys_act,[[act,1.0]],self.domainTag)
                act = act[0]
                intent, slotValues = self._parseDialogueAct(act)
                if intent not in tempActs:
                    tempActs[intent] = slotValues
                else:
                    for slot in slotValues:
                        if slot in tempActs[intent]:
                            if slotValues[slot] != tempActs[intent][slot]:
                                logger.warning('Semantic decoding of the input led to different interpretations within one hypothesis. Slot {} has values {} and {}.'.format(slot, slotValues[slot], tempActs[intent][slot]))
                        else:
                            tempActs[intent][slot] = slotValues[slot]
            hypos = []              
            for intent in tempActs:
                if len(tempActs[intent]):
                    (key, value) = list(tempActs[intent].items())[0]
                    if value is not None:
                        hypos.append('{}({})'.format(intent,','.join('%s=%s' % (key, value) for (key, value) in list(tempActs[intent].items()))))
                    else:
                        hypos.append('{}({})'.format(intent,','.join('%s' % (key) for (key, value) in list(tempActs[intent].items()))))
                else:
                    logger.warning("intent {} found in input without arguments")
            self.semanticActs =  "|".join(hypos)
    
        def _contextual_inform(self,obs):
            """
            """
            # Statements that are contextual/implicit (ie dont mention slot or value explicitly):
            if self._check(re.search(self.rCONTEXTUAL_DONTCARE, obs, re.I)): #DEL is not None: 
                self.semanticActs.append('inform(=dontcare)')
            if self._exact_match(self.COMMON_CONTEXTUAL_DONTCARES, obs):
                self.semanticActs.append('inform(=dontcare)')
    
        def _exact_match(self, strings, obs):
            """
            """
            if obs.lstrip().lower().replace("'","") in strings:
                return True
            return False
    
        def _domain_independent_requests(self,obs):
            """  
            """
            rDOM_IN_REQ_NAME = r"((what(s*)|what\ is)\ it called)"
            if self._check(re.search(rDOM_IN_REQ_NAME, obs, re.I)): #DEL is not None: 
                self.semanticActs.append('request(name)')
    
        def _check(self,re_object):
            """
            """
            if re_object is None:
                return False
            for o in re_object.groups():
                if o is not None:
                    return True
            return False
        
        def _parseDialogueAct(self, act):
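            """Split a dialogue act string into its intent and a slot-value dictionary.

            For example (illustrative): "inform(food=chinese,area)" ->
            ("inform", {"food": "chinese", "area": None}).
            """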
            slotValues = {}
            intent = None
            if act is not None:
                match = re.match(r"([^\(]+)\(([^\)]*)\)", act)
                if match is not None:
                    intent = match.group(1)
                    slotValueString = match.group(2)
                    slots = slotValueString.split(',')
                    slotValues = {slot.split('=')[0] : slot.split('=')[1] for slot in slots if '=' in slot}
                    slotValues.update({slot : None for slot in slots if '=' not in slot})
                        
            return intent, slotValues
    
    
    class FileParser(object):
        """
        """
        def __init__(self,filename, domainTag="CamRestaurants"):
            self.domain_tag = domainTag
            self.filename=filename
            self.JOINER = " <=> "
            # TODO - note that a parser can be imported into Hubs via the 2 lines below:
            parser_module = __import__("RegexSemI_"+self.domain_tag, fromlist=["RegexSemI_"+self.domain_tag]) 
            self.parser = getattr(parser_module, "RegexSemI_"+self.domain_tag)()
    
        def decode_file(self, DOPRINT=True):
            """
            """
            self.inputs = []
            self.results = []
            with open(self.filename,"r") as f:
                for line in f:
                    parse = (line.strip('\n'), self.parser.decode(line)[0][0])
                    self.results.append(parse[1]) #list order is persistent, as required here
                    self.inputs.append(parse[0])
                    if DOPRINT:
                        print(parse[0] + self.JOINER + parse[1])
    
        def test_file(self, referenceFile=None):
            """
            Note this only does some **very basic checking** that the reference and parsed files match up appropriately.

            A guide to using this function for developing Regex SemI parsers:
            0. Create a list of example sentences for parsing.
            1. Get a parser working a little.
            2. Dump the output of parsing the example sentences file:
            >> python RegexSemI.py _resources/EXAMPLE_INPUT_SENTENCES_FOR_DOMAIN DOMAINTAG PATH_TO_REPO_ROOT > OUTfile
            3. Fix the semantic parses in the OUTfile so that it can be used as a reference.
            4. Improve the parser.
            5. Check the improvements against the reference OUTfile:
            >> python RegexSemI.py _resources/EXAMPLE_INPUT_SENTENCES_FOR_DOMAIN DOMAINTAG PATH_TO_REPO_ROOT OUTfile
            6. Go back to 4, add more sentences to the examples file, and so on.
            """
            if referenceFile is None:
                return
            lineNum = -1
            with open(referenceFile,"r") as f:
                for line in f:
                    lineNum += 1
                    line = line.strip('\n')
                    bits = line.split(self.JOINER)
                    assert(len(bits)==2)
                    userinput,reference = bits[0],bits[1]
                    if userinput != self.inputs[lineNum]:
                        print("MISMATCH ERROR: " + userinput + " != "  + self.inputs[lineNum])
                    elif self.results[lineNum] != reference:
                        print("INCORRECT PARSE: " + userinput)
                        print('\t\t'+self.results[lineNum] + " != "  + reference)
                    else:
                        pass
                        # print("CORRECT: " + self.results[lineNum] + self.JOINER + reference)
                    
    
    
    #-------------------------------------------------------------------------------------------
    #  Main
    #-------------------------------------------------------------------------------------------
    if __name__=="__main__":
        import sys
        if len(sys.argv) < 4:
            exit("Usage: python RegexSemi.py EXAMPLE_SENTENCES_FILEPATH DOMAIN_TAG REPOSITORY_ROOT [optional: REFERENCE_FILE]")
        if len(sys.argv) == 5:
            refFileIn = sys.argv[4]
        else:
            refFileIn = None
            
        from utils import Settings
        Settings.load_root(rootIn=sys.argv[3])  # when running with FileParser() -- since without a config, root may be unavailable.
        Settings.load_config(None)
        Settings.config.add_section("GENERAL")
        Settings.config.set("GENERAL",'domains', sys.argv[2])
        
    
        Ontology.init_global_ontology()
        fp = FileParser(filename=sys.argv[1], domainTag=sys.argv[2])
        fp.decode_file(DOPRINT=refFileIn is None)
        fp.test_file(referenceFile=refFileIn)
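        # Example invocation (illustrative): python RegexSemI.py _resources/EXAMPLE_INPUT_SENTENCES_FOR_DOMAIN CamRestaurants PATH_TO_REPO_ROOT > OUTfile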
    
    
    #END OF FILE