model validation and check unused parameters/molecules

a569e10b · Peter Schubert · 3cb87afe · a569e10b · a569e10b · a569e10b
Commit a569e10b authored 2 years ago by Peter Schubert
--- a/rbaxdf/model/rba_density.py
+++ b/rbaxdf/model/rba_density.py
-"""Implementation of RbaDensity class.
+"""Implementation of RbaDensities and RbaDensity classes.

 Peter Schubert, CCB, HHU Duesseldorf, December 2022
 """
@@ -10,15 +10,12 @@ import xml.etree.ElementTree
 from .rba_target_value import RbaTargetValue


-class RbaDensity:
-
-    def __init__(self, cid):
-        self.id = cid
-        self.target_value = None
+class RbaDensities:

-    @staticmethod
-    def get_xml_items(model_dir):
+    def __init__(self):
+        """Create an empty container; get_xml_items() fills ``densities``."""
+        self.densities = dict()

+    def get_xml_items(self, model_dir):
        file_name = os.path.join(model_dir, 'density.xml')
        if os.path.exists(file_name) is False:
            print(f'{file_name} not found!')
@@ -28,6 +25,37 @@ class RbaDensity:
        root = tree.getroot()
        assert root.tag == 'RBADensity'

+        self.densities = RbaDensity.get_xml_items(root)
+
+    def get_df_items(self):
+        """Return the stored densities as a DataFrame indexed by 'compartment'.
+
+        Each row is the ``to_dict()`` record of one stored density. The
+        column names are passed explicitly so that an empty container still
+        yields an (empty) frame that has a 'compartment' column; without
+        them ``set_index`` raises a KeyError when there are no records.
+
+        :return: DataFrame with index 'compartment' and column 'targetValue'
+        """
+        records = [item.to_dict() for item in self.densities.values()]
+        # column names mirror the keys produced by RbaDensity.to_dict()
+        df = pd.DataFrame(records, columns=['compartment', 'targetValue'])
+        df.set_index('compartment', inplace=True)
+        return df
+
+    def validate(self, component_ids):
+        """Check that every parameter referenced by a density is defined.
+
+        :param component_ids: dict of id collections; the 'functions' and
+            'aggregates' entries are consulted
+        :return: True if no referenced id is missing, otherwise False
+        """
+        missing_components = self.ref_parameters().difference(
+            component_ids['functions'], component_ids['aggregates'])
+        if missing_components:
+            print('function/aggregates used in densities not defined:', missing_components)
+            return False
+        return True
+
+    def ref_parameters(self):
+        """Collect the ids referenced by the target values of all densities.
+
+        The string from ``target_value.get_str()`` is split on ','; from
+        each piece the second '='-separated field, stripped of whitespace,
+        is taken as the referenced id.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for density in self.densities.values():
+            for assignment in density.target_value.get_str().split(','):
+                refs.add(assignment.split('=')[1].strip())
+        return refs
+
+
+class RbaDensity:
+
+    def __init__(self, cid):
+        """Create a density record with id ``cid`` and no target value yet."""
+        self.target_value = None
+        self.id = cid
+
+    @staticmethod
+    def get_xml_items(root):
        data = {}
        target_densities = root.find('listOfTargetDensities')
        for target_density in target_densities.findall('targetDensity'):
@@ -37,12 +65,6 @@ class RbaDensity:
            data[cid] = rba_density
        return data

-    @staticmethod
-    def get_df_items(items):
-        df = pd.DataFrame([item.to_dict() for item in items.values()])
-        df.set_index('compartment', inplace=True)
-        return df
-
    def to_dict(self):
        """Return this density as a flat record with string values.

        :return: dict with keys 'compartment' (the id) and 'targetValue'
            (the string form of the target value)
        """
        record = {'compartment': self.id}
        record['targetValue'] = self.target_value.get_str()
        return record
--- a/rbaxdf/model/rba_enzyme.py
+++ b/rbaxdf/model/rba_enzyme.py
-"""Implementation of RbaEnzyme class.
+"""Implementation of RbaEnzymes and RbaEnzyme classes.

 Peter Schubert, CCB, HHU Duesseldorf, December 2022
 """
@@ -12,19 +12,12 @@ import xml.etree.ElementTree
 from rbaxdf.utils.et_utils import get_species_refs


-class RbaEnzyme:
+class RbaEnzymes:

-    def __init__(self, eid):
-        self.id = eid
-        self.reaction = ''
-        self.forward_eff = ''
-        self.backward_eff = ''
-        self.zero_cost = False
-        self.mach_reactants = {}
-        self.mach_products = {}
+    def __init__(self):
+        """Create an empty container; get_xml_items() fills ``enzymes``."""
+        self.enzymes = dict()

-    @staticmethod
-    def get_xml_items(model_dir):
+    def get_xml_items(self, model_dir):

        file_name = os.path.join(model_dir, 'enzymes.xml')
        if os.path.exists(file_name) is False:
@@ -35,6 +28,57 @@ class RbaEnzyme:
        root = tree.getroot()
        assert root.tag == 'RBAEnzymes'

+        self.enzymes = RbaEnzyme.get_xml_items(root)
+
+    def get_df_items(self):
+        """Return the stored enzymes as a DataFrame indexed by 'enzyme'.
+
+        Each row is the ``to_dict()`` record of one stored enzyme. With no
+        enzymes stored an empty frame with an 'enzyme' index is returned.
+
+        :return: DataFrame indexed by 'enzyme'
+        """
+        records = [item.to_dict() for item in self.enzymes.values()]
+        if not records:
+            # a frame built from no records has no 'enzyme' column, so
+            # set_index('enzyme') would raise KeyError
+            return pd.DataFrame(index=pd.Index([], name='enzyme'))
+        df = pd.DataFrame(records)
+        df.set_index('enzyme', inplace=True)
+        return df
+
+    def validate(self, component_ids):
+        """Check molecules and parameters referenced by the enzymes.
+
+        Machinery molecules are looked up in 'species', 'rnas' and
+        'proteins'; referenced parameters in 'functions' and 'aggregates'.
+        Each group of undefined ids is printed.
+
+        :param component_ids: dict of id collections keyed by component type
+        :return: True if nothing is missing, otherwise False
+        """
+        undefined_molecules = self.ref_molecules().difference(
+            component_ids['species'], component_ids['rnas'], component_ids['proteins'])
+        if undefined_molecules:
+            print('species/macromolecules used in enzyme machinery not defined:', undefined_molecules)
+
+        undefined_parameters = self.ref_parameters().difference(
+            component_ids['functions'], component_ids['aggregates'])
+        if undefined_parameters:
+            print('function/aggregates used in enzymes not defined:', undefined_parameters)
+
+        return not undefined_molecules and not undefined_parameters
+
+    def ref_molecules(self):
+        """Collect the ids used as machinery reactants or products.
+
+        :return: set of the keys of ``mach_reactants`` and ``mach_products``
+            over all stored enzymes
+        """
+        refs = set()
+        for enzyme in self.enzymes.values():
+            refs.update(enzyme.mach_reactants)
+            refs.update(enzyme.mach_products)
+        return refs
+
+    def ref_parameters(self):
+        """Collect the forward and backward efficiency ids of all enzymes.
+
+        :return: set of ``forward_eff`` and ``backward_eff`` values
+        """
+        # NOTE(review): RbaEnzyme initialises both efficiencies to ''; if the
+        # XML parsing can leave that default in place, '' ends up in the
+        # returned set - confirm against the parser.
+        refs = set()
+        for enzyme in self.enzymes.values():
+            refs.update((enzyme.forward_eff, enzyme.backward_eff))
+        return refs
+
+
+class RbaEnzyme:
+
+    def __init__(self, eid):
+        """Create an enzyme record with id ``eid`` and empty defaults."""
+        self.id = eid
+        # string-valued fields start out empty
+        self.reaction = self.forward_eff = self.backward_eff = ''
+        self.zero_cost = False
+        # each machinery mapping gets its own dict object
+        self.mach_reactants = dict()
+        self.mach_products = dict()
+
+    @staticmethod
+    def get_xml_items(root):
        data = {}
        enzymes = root.find('listOfEnzymes')
        for enzyme in enzymes.findall('enzyme'):
@@ -53,12 +97,6 @@ class RbaEnzyme:
            data[eid] = rba_enzyme
        return data

-    @staticmethod
-    def get_df_items(items):
-        df = pd.DataFrame([item.to_dict() for item in items.values()])
-        df.set_index('enzyme', inplace=True)
-        return df
-
    def to_dict(self):
        mach_reactants = '; '.join([f'species={species}, stoic={stoic}'
                                    for species, stoic in self.mach_reactants.items()])

--- a/rbaxdf/model/rba_metabolism.py
+++ b/rbaxdf/model/rba_metabolism.py
@@ -47,6 +47,21 @@ class RbaMetabolism:
            print(f'wrong metabolism type: {m_type}')
        return df

+    def validate(self, component_ids):
+        """Check that every species used in a reaction is defined.
+
+        :param component_ids: dict of id collections; 'species' is consulted
+        :return: True if no referenced species is missing, otherwise False
+        """
+        missing = self.ref_molecules().difference(component_ids['species'])
+        if missing:
+            print('species used in reactions not defined:', missing)
+            return False
+        return True
+
+    def ref_molecules(self):
+        """Collect the ids used as reactants or products of any reaction.
+
+        :return: set of the keys of ``reactants`` and ``products`` over all
+            stored reactions
+        """
+        refs = set()
+        for reaction in self.reactions.values():
+            refs.update(reaction.reactants)
+            refs.update(reaction.products)
+        return refs
+

 class RbaCompartment:


--- a/rbaxdf/model/rba_model.py
+++ b/rbaxdf/model/rba_model.py
@@ -9,10 +9,10 @@ import pandas as pd
 from .rba_macromolecules import RbaMacromolecules
 from .rba_metabolism import RbaMetabolism
 from .rba_parameters import RbaParameters
-from .rba_process import RbaProcesses
-from .rba_enzyme import RbaEnzyme
-from .rba_density import RbaDensity
-from .rba_target_group import RbaTargetGroup
+from .rba_processes import RbaProcesses
+from .rba_enzymes import RbaEnzymes
+from .rba_densities import RbaDensities
+from .rba_targets import RbaTargets


 class RbaModel:
@@ -28,9 +28,9 @@ class RbaModel:
        self.metabolism = RbaMetabolism()
        self.parameters = RbaParameters()
        self.processes = RbaProcesses()
-        self.density = None
-        self.targets = None
-        self.enzymes = None
+        self.densities = RbaDensities()
+        self.targets = RbaTargets()
+        self.enzymes = RbaEnzymes()

        if os.path.exists(model_dir) is False:
            print(f'{model_dir} not found!')
@@ -43,10 +43,9 @@ class RbaModel:
        self.proteins.get_xml_items(self.model_dir)
        self.metabolism.get_xml_items(self.model_dir)
        self.processes.get_xml_items(self.model_dir)
-
-        self.density = RbaDensity.get_xml_items(self.model_dir)
-        self.targets = RbaTargetGroup.get_xml_items(self.model_dir)
-        self.enzymes = RbaEnzyme.get_xml_items(self.model_dir)
+        self.densities.get_xml_items(self.model_dir)
+        self.enzymes.get_xml_items(self.model_dir)
+        self.targets.get_xml_items(self.model_dir)

        self.is_model = True

@@ -59,11 +58,9 @@ class RbaModel:
            m_dict['rnas'] = self.rnas.get_df_items()
            m_dict['dna'] = self.dna.get_df_items()
            m_dict['proteins'] = self.proteins.get_df_items()
-
-            m_dict['density'] = RbaDensity.get_df_items(self.density)
-            m_dict['targets'] = RbaTargetGroup.get_df_items(self.targets)
-            m_dict['enzymes'] = RbaEnzyme.get_df_items(self.enzymes)
-
+            m_dict['enzymes'] = self.enzymes.get_df_items()
+            m_dict['densities'] = self.densities.get_df_items()
+            m_dict['targets'] = self.targets.get_df_items()
            m_dict['compartments'] = self.metabolism.get_df_items('compartments')
            m_dict['species'] = self.metabolism.get_df_items('species')
            m_dict['reactions'] = self.metabolism.get_df_items('reactions')
@@ -86,9 +83,9 @@ class RbaModel:
            if '=' in row['machineryCapacity']:
                first_value = row['machineryCapacity'].split('=')[1]
                m_dict['processes'].at[idx, 'capacity_info'] = self.parameters.get_value_info(first_value)
-        for idx, row in m_dict['density'].iterrows():
+        for idx, row in m_dict['densities'].iterrows():
            first_value = row['targetValue'].split('=')[1]
-            m_dict['density'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value)
+            m_dict['densities'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value)
        for idx, row in m_dict['targets'].iterrows():
            first_value = row['targetValue'].split('=')[1]
            m_dict['targets'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value)
@@ -99,6 +96,54 @@ class RbaModel:
                df.to_excel(writer, sheet_name=name, index=keep_index)
            print(f'model exported to {xlsx_name}')

+    def validate(self):
+        """Validate the cross-references between the model components.
+
+        Builds the collections of defined ids (species, dna, rnas, proteins,
+        functions, aggregates) and hands them to the ``validate`` method of
+        each component. Every component is validated even after an earlier
+        one has failed, so all problems are reported in a single run.
+
+        :return: True if every component validated successfully, else False
+        """
+        component_ids = {'species': set(self.metabolism.species),
+                         'dna': set(self.dna.macromolecules),
+                         'rnas': set(self.rnas.macromolecules),
+                         'proteins': set(self.proteins.macromolecules),
+                         'functions': set(self.parameters.functions),
+                         'aggregates': set(self.parameters.aggregates)}
+
+        components = [self.metabolism, self.processes, self.densities,
+                      self.enzymes, self.targets, self.parameters]
+        # materialise all results first: chaining with `and` would skip the
+        # remaining validate() calls (and their messages) after a failure
+        results = [component.validate(component_ids) for component in components]
+        return all(results)
+
+    def check_unused(self):
+        """Print the defined parameters and molecules that are never referenced.
+
+        Defined molecules are the species plus the dna, rna and protein
+        macromolecules; defined parameters are the functions and aggregates.
+        They are compared with the ids returned by the ``ref_parameters`` /
+        ``ref_molecules`` methods of the components. Nothing is returned.
+        """
+        defined_molecules = set().union(self.metabolism.species,
+                                        self.dna.macromolecules,
+                                        self.rnas.macromolecules,
+                                        self.proteins.macromolecules)
+        defined_parameters = set().union(self.parameters.functions,
+                                         self.parameters.aggregates)
+
+        used_parameters = set().union(self.processes.ref_parameters(),
+                                      self.densities.ref_parameters(),
+                                      self.targets.ref_parameters(),
+                                      self.parameters.ref_parameters(),
+                                      self.enzymes.ref_parameters())
+        used_molecules = set().union(self.metabolism.ref_molecules(),
+                                     self.processes.ref_molecules(),
+                                     self.enzymes.ref_molecules(),
+                                     self.targets.ref_molecules())
+
+        unused_parameters = defined_parameters - used_parameters
+        unused_molecules = defined_molecules - used_molecules
+
+        if unused_parameters:
+            print(f'{len(unused_parameters)} unused parameters:', unused_parameters)
+        if unused_molecules:
+            print(f'{len(unused_molecules)} unused molecules:', unused_molecules)
+        if not unused_parameters and not unused_molecules:
+            print('no unused parameters/molecules')
+
    def from_df(self):
        """Placeholder: currently does nothing and returns None."""
        pass


--- a/rbaxdf/model/rba_parameters.py
+++ b/rbaxdf/model/rba_parameters.py
@@ -46,6 +46,20 @@ class RbaParameters:
            print(f'wrong parameter type: {p_type}')
        return df

+    def validate(self, component_ids):
+        """Check that every function referenced by an aggregate is defined.
+
+        :param component_ids: dict of id collections; 'functions' is consulted
+        :return: True if no referenced function is missing, otherwise False
+        """
+        missing = self.ref_parameters().difference(component_ids['functions'])
+        if missing:
+            print('functions used in aggregates not defined:', missing)
+            return False
+        return True
+
+    def ref_parameters(self):
+        """Collect the function ids referenced by all aggregates.
+
+        :return: set of the entries of ``functions`` over all aggregates
+        """
+        refs = set()
+        for aggregate in self.aggregates.values():
+            refs.update(set(aggregate.functions))
+        return refs
+

 class RbaFunction:


--- a/rbaxdf/model/rba_process.py
+++ b/rbaxdf/model/rba_process.py
@@ -55,6 +55,74 @@ class RbaProcesses:
            print(f'wrong parameter type: {p_type}')
        return df

+    def validate(self, component_ids):
+        """Check molecules and parameters referenced by the processes.
+
+        Four checks are made and each group of undefined ids is printed:
+        processing-map molecules against 'species'; machinery molecules
+        against 'species', 'rnas' and 'proteins'; production/degradation
+        inputs against 'dna', 'rnas' and 'proteins'; machinery capacity
+        parameters against 'functions' and 'aggregates'.
+
+        :param component_ids: dict of id collections keyed by component type
+        :return: True if nothing is missing, otherwise False
+        """
+        valid = True
+
+        missing = self.ref_molecules_pmaps().difference(component_ids['species'])
+        if len(missing) > 0:
+            print('species used in processingMaps not defined:', missing)
+            valid = False
+
+        missing = self.ref_molecules_machinery().difference(component_ids['species']) \
+            .difference(component_ids['rnas']) \
+            .difference(component_ids['proteins'])
+        if len(missing) > 0:
+            print('species/macromolecules used in processes not defined:', missing)
+            valid = False
+
+        missing = self.ref_molecules_inputs().difference(component_ids['dna']) \
+            .difference(component_ids['rnas']) \
+            .difference(component_ids['proteins'])
+        if len(missing) > 0:
+            print('macromolecules used in processes not defined:', missing)
+            valid = False
+
+        missing = self.ref_parameters().difference(component_ids['functions']) \
+            .difference(component_ids['aggregates'])
+        if len(missing) > 0:
+            # these ids come from the process machinery capacities, so the
+            # message names processes (it previously said "aggregates")
+            print('function/aggregates used in processes not defined:', missing)
+            valid = False
+
+        return valid
+
+    def ref_molecules_pmaps(self):
+        """Collect the ids used as reactants/products in the processing maps.
+
+        Both the constant processing and every component processing of each
+        map are considered. A constant processing may lack the 'reactants'
+        or 'products' key; component processings are indexed directly.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for pmap in self.processing_maps.values():
+            constant = pmap.constant_processing
+            refs.update(constant.get('reactants', {}))
+            refs.update(constant.get('products', {}))
+            for processing in pmap.component_processings.values():
+                refs.update(processing['reactants'])
+                refs.update(processing['products'])
+        return refs
+
+    def ref_molecules_machinery(self):
+        """Collect the ids used as machinery reactants/products of processes.
+
+        A machinery without the 'reactants' or 'products' key contributes
+        nothing for that key.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for process in self.processes.values():
+            for side in ('reactants', 'products'):
+                refs.update(process.machinery.get(side, {}))
+        return refs
+
+    def ref_molecules_inputs(self):
+        """Collect the ids listed as production/degradation inputs.
+
+        Processes whose ``productions`` or ``degradations`` lack the
+        'inputs' key contribute nothing for that part.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for process in self.processes.values():
+            refs.update(process.productions.get('inputs', {}))
+            refs.update(process.degradations.get('inputs', {}))
+        return refs
+
+    def ref_molecules(self):
+        """Return all molecule ids referenced by processes.
+
+        :return: union of the processing-map, machinery and input references
+        """
+        return (set()
+                | self.ref_molecules_pmaps()
+                | self.ref_molecules_machinery()
+                | self.ref_molecules_inputs())
+
+    def ref_parameters(self):
+        """Collect the capacity parameter ids of the process machineries.
+
+        Processes whose machinery has no 'capacity' entry are skipped.
+
+        :return: set of the ``value`` attributes of the capacity entries
+        """
+        return {process.machinery['capacity'].value
+                for process in self.processes.values()
+                if 'capacity' in process.machinery}
+

 class RbaProcess:

@@ -134,8 +202,8 @@ class RbaProcess:
        return {'process': self.id, 'name': self.name,
                'machineryCapacity': mach_capacity, 'machineryReactants': mach_reactants,
                'machineryProducts': mach_products,
-                'productionsProcessingMap': prod_pmap, 'productionsSet': prod_set,
-                'productionsInputs': prod_inputs,
+                'productionProcessingMap': prod_pmap, 'productionSet': prod_set,
+                'productionInputs': prod_inputs,
                'degradationProcessingMap': degr_pmap, 'degradationSet': degr_set,
                'degradationInputs': degr_inputs}


--- a/rbaxdf/model/rba_target_group.py
+++ b/rbaxdf/model/rba_target_group.py
-"""Implementation of RbaTargetGroup class.
+"""Implementation of RbaTargets and RbaTargetGroup classes.

 Peter Schubert, CCB, HHU Duesseldorf, December 2022
 """
@@ -10,18 +10,12 @@ import xml.etree.ElementTree
 from rbaxdf.utils.et_utils import get_target_species, get_target_reactions


-class RbaTargetGroup:
-
-    def __init__(self, tgid):
-        self.id = tgid
-        self.concentrations = {}
-        self.production_fluxes = {}
-        self.degradation_fluxes = {}
-        self.reaction_fluxes = {}
+class RbaTargets:

-    @staticmethod
-    def get_xml_items(model_dir):
+    def __init__(self):
+        """Create an empty container; get_xml_items() fills ``target_groups``."""
+        self.target_groups = dict()

+    def get_xml_items(self, model_dir):
        file_name = os.path.join(model_dir, 'targets.xml')
        if os.path.exists(file_name) is False:
            print(f'{file_name} not found!')
@@ -31,6 +25,66 @@ class RbaTargetGroup:
        root = tree.getroot()
        assert root.tag == 'RBATargets'

+        self.target_groups = RbaTargetGroup.get_xml_items(root)
+
+    def get_df_items(self):
+        """Return all targets as a DataFrame indexed by 'targetGroup'.
+
+        One row is produced per target, with the columns 'targetType',
+        'target' and 'targetValue'; the values are taken from ``to_dict()``
+        of each target group.
+
+        :return: DataFrame indexed by 'targetGroup'
+        """
+        target_types = ('concentrations', 'productionFluxes', 'degradationFluxes', 'reactionFluxes')
+        rows = []
+        for tgid, tg in self.target_groups.items():
+            group_record = tg.to_dict()
+            rows.extend([tgid, target_type, target, value]
+                        for target_type in target_types
+                        for target, value in group_record[target_type].items())
+        frame = pd.DataFrame(rows, columns=['targetGroup', 'targetType', 'target', 'targetValue'])
+        return frame.set_index('targetGroup')
+
+    def validate(self, component_ids):
+        """Check molecules and parameters referenced by the targets.
+
+        Target molecules are looked up in 'species', 'rnas', 'dna' and
+        'proteins'; referenced parameters in 'functions' and 'aggregates'.
+        Each group of undefined ids is printed.
+
+        :param component_ids: dict of id collections keyed by component type
+        :return: True if nothing is missing, otherwise False
+        """
+        undefined_molecules = self.ref_molecules().difference(
+            component_ids['species'], component_ids['rnas'],
+            component_ids['dna'], component_ids['proteins'])
+        if undefined_molecules:
+            print('species/macromolecules used in targets not defined:', undefined_molecules)
+
+        undefined_parameters = self.ref_parameters().difference(
+            component_ids['functions'], component_ids['aggregates'])
+        if undefined_parameters:
+            print('function/aggregates used in targets not defined:', undefined_parameters)
+
+        return not undefined_molecules and not undefined_parameters
+
+    def ref_molecules(self):
+        """Collect the ids targeted by concentrations and production/degradation fluxes.
+
+        The keys of ``reaction_fluxes`` are not included.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for group in self.target_groups.values():
+            refs.update(group.concentrations)
+            refs.update(group.production_fluxes)
+            refs.update(group.degradation_fluxes)
+        return refs
+
+    def ref_parameters(self):
+        """Collect the ids referenced by the target values of all target groups.
+
+        All four target collections of a group are scanned. The string from
+        each target's ``get_str()`` is split on ','; from each piece the
+        second '='-separated field, stripped of whitespace, is taken.
+
+        :return: set of referenced ids
+        """
+        refs = set()
+        for group in self.target_groups.values():
+            collections = (group.concentrations, group.production_fluxes,
+                           group.degradation_fluxes, group.reaction_fluxes)
+            for targets in collections:
+                for target in targets.values():
+                    for assignment in target.get_str().split(','):
+                        refs.add(assignment.split('=')[1].strip())
+        return refs
+
+
+class RbaTargetGroup:
+
+    def __init__(self, tgid):
+        """Create a target group with id ``tgid`` and no targets."""
+        self.id = tgid
+        # one separate, initially empty mapping per target type
+        self.concentrations = dict()
+        self.production_fluxes = dict()
+        self.degradation_fluxes = dict()
+        self.reaction_fluxes = dict()
+
+    @staticmethod
+    def get_xml_items(root):
        data = {}
        target_groups = root.find('listOfTargetGroups')
        for target_group in target_groups.findall('targetGroup'):
@@ -43,19 +97,10 @@ class RbaTargetGroup:
            data[tgid] = rba_target
        return data

-    @staticmethod
-    def get_df_items(items):
-        data = []
-        for tgid, tg in items.items():
-            for target, target_value in tg.concentrations.items():
-                data.append([tgid, 'concentrations', target, target_value.get_str()])
-            for target, target_value in tg.production_fluxes.items():
-                data.append([tgid, 'productionFluxes', target, target_value.get_str()])
-            for target, target_value in tg.degradation_fluxes.items():
-                data.append([tgid, 'degradationFluxes', target, target_value.get_str()])
-            for target, target_value in tg.reaction_fluxes.items():
-                data.append([tgid, 'reactionFluxes', target, target_value.get_str()])
-
-        df = pd.DataFrame(data, columns=['targetGroup', 'targetType', 'target', 'targetValue'])
-        df.set_index('targetGroup', inplace=True)
-        return df
+    def to_dict(self):
+        """Return the target group as a dict with string target values.
+
+        :return: dict with the key 'targetGroup' (the id) and one
+            ``{target: value.get_str()}`` mapping under each of
+            'concentrations', 'productionFluxes', 'degradationFluxes' and
+            'reactionFluxes'
+        """
+        def as_strings(targets):
+            return {target: value.get_str() for target, value in targets.items()}
+
+        return {'targetGroup': self.id,
+                'concentrations': as_strings(self.concentrations),
+                'productionFluxes': as_strings(self.production_fluxes),
+                'degradationFluxes': as_strings(self.degradation_fluxes),
+                'reactionFluxes': as_strings(self.reaction_fluxes)}