From a569e10be62f34308017696c0cb3afb04476798e Mon Sep 17 00:00:00 2001 From: Peter Schubert <Peter.Schubert@hhu.de> Date: Tue, 20 Dec 2022 22:03:45 +0100 Subject: [PATCH] model validation and check unused parameters/molecules --- .../{rba_density.py => rba_densities.py} | 50 ++++++--- .../model/{rba_enzyme.py => rba_enzymes.py} | 74 +++++++++--- rbaxdf/model/rba_metabolism.py | 15 +++ rbaxdf/model/rba_model.py | 81 ++++++++++--- rbaxdf/model/rba_parameters.py | 14 +++ .../{rba_process.py => rba_processes.py} | 72 +++++++++++- rbaxdf/model/rba_target_group.py | 61 ---------- rbaxdf/model/rba_targets.py | 106 ++++++++++++++++++ 8 files changed, 360 insertions(+), 113 deletions(-) rename rbaxdf/model/{rba_density.py => rba_densities.py} (53%) rename rbaxdf/model/{rba_enzyme.py => rba_enzymes.py} (62%) rename rbaxdf/model/{rba_process.py => rba_processes.py} (78%) delete mode 100644 rbaxdf/model/rba_target_group.py create mode 100644 rbaxdf/model/rba_targets.py diff --git a/rbaxdf/model/rba_density.py b/rbaxdf/model/rba_densities.py similarity index 53% rename from rbaxdf/model/rba_density.py rename to rbaxdf/model/rba_densities.py index 41bf12f..3ea8e1e 100644 --- a/rbaxdf/model/rba_density.py +++ b/rbaxdf/model/rba_densities.py @@ -1,4 +1,4 @@ -"""Implementation of RbaDensity class. +"""Implementation of RbaDensities and RbaDensity classes. Peter Schubert, CCB, HHU Duesseldorf, December 2022 """ @@ -10,15 +10,12 @@ import xml.etree.ElementTree from .rba_target_value import RbaTargetValue -class RbaDensity: - - def __init__(self, cid): - self.id = cid - self.target_value = None +class RbaDensities: - @staticmethod - def get_xml_items(model_dir): + def __init__(self): + self.densities = {} + def get_xml_items(self, model_dir): file_name = os.path.join(model_dir, 'density.xml') if os.path.exists(file_name) is False: print(f'{file_name} not found!') @@ -28,6 +25,37 @@ class RbaDensity: root = tree.getroot() assert root.tag == 'RBADensity' + self.densities = RbaDensity.get_xml_items(root) + + def get_df_items(self): + df = pd.DataFrame([item.to_dict() for item in self.densities.values()]) + df.set_index('compartment', inplace=True) + return df + + def validate(self, component_ids): + valid = True + missing_components = self.ref_parameters().difference(component_ids['functions'])\ + .difference(component_ids['aggregates']) + if len(missing_components) > 0: + print('function/aggregates used in densities not defined:', missing_components) + valid = False + return valid + + def ref_parameters(self): + refs = set() + for d in self.densities.values(): + refs |= {tv.split('=')[1].strip() for tv in d.target_value.get_str().split(',')} + return refs + + +class RbaDensity: + + def __init__(self, cid): + self.id = cid + self.target_value = None + + @staticmethod + def get_xml_items(root): data = {} target_densities = root.find('listOfTargetDensities') for target_density in target_densities.findall('targetDensity'): @@ -37,12 +65,6 @@ class RbaDensity: data[cid] = rba_density return data - @staticmethod - def get_df_items(items): - df = pd.DataFrame([item.to_dict() for item in items.values()]) - df.set_index('compartment', inplace=True) - return df - def to_dict(self): return {'compartment': self.id, 'targetValue': self.target_value.get_str()} diff --git a/rbaxdf/model/rba_enzyme.py b/rbaxdf/model/rba_enzymes.py similarity index 62% rename from rbaxdf/model/rba_enzyme.py rename to rbaxdf/model/rba_enzymes.py index 12439d8..2fe00fd 100644 --- a/rbaxdf/model/rba_enzyme.py +++ b/rbaxdf/model/rba_enzymes.py @@ -1,4 +1,4 @@ -"""Implementation of RbaEnzyme class. +"""Implementation of RbaEnzymes and RbaEnzyme classes. Peter Schubert, CCB, HHU Duesseldorf, December 2022 """ @@ -12,19 +12,12 @@ import xml.etree.ElementTree from rbaxdf.utils.et_utils import get_species_refs -class RbaEnzyme: +class RbaEnzymes: - def __init__(self, eid): - self.id = eid - self.reaction = '' - self.forward_eff = '' - self.backward_eff = '' - self.zero_cost = False - self.mach_reactants = {} - self.mach_products = {} + def __init__(self): + self.enzymes = {} - @staticmethod - def get_xml_items(model_dir): + def get_xml_items(self, model_dir): file_name = os.path.join(model_dir, 'enzymes.xml') if os.path.exists(file_name) is False: @@ -35,6 +28,57 @@ class RbaEnzyme: root = tree.getroot() assert root.tag == 'RBAEnzymes' + self.enzymes = RbaEnzyme.get_xml_items(root) + + def get_df_items(self): + df = pd.DataFrame([item.to_dict() for item in self.enzymes.values()]) + df.set_index('enzyme', inplace=True) + return df + + def validate(self, component_ids): + valid = True + missing = self.ref_molecules().difference(component_ids['species']) \ + .difference(component_ids['rnas']) \ + .difference(component_ids['proteins']) + if len(missing) > 0: + print('species/macromolecules used in enzyme machinery not defined:', missing) + valid = False + + missing = self.ref_parameters().difference(component_ids['functions']) \ + .difference(component_ids['aggregates']) + if len(missing) > 0: + print('function/aggregates used in enzymes not defined:', missing) + valid = False + return valid + + def ref_molecules(self): + refs = set() + for e in self.enzymes.values(): + refs |= {sid for sid in e.mach_reactants} + refs |= {sid for sid in e.mach_products} + return refs + + def ref_parameters(self): + refs = set() + for e in self.enzymes.values(): + refs.add(e.forward_eff) + refs.add(e.backward_eff) + return refs + + +class RbaEnzyme: + + def __init__(self, eid): + self.id = eid + self.reaction = '' + self.forward_eff = '' + self.backward_eff = '' + self.zero_cost = False + self.mach_reactants = {} + self.mach_products = {} + + @staticmethod + def get_xml_items(root): data = {} enzymes = root.find('listOfEnzymes') for enzyme in enzymes.findall('enzyme'): @@ -53,12 +97,6 @@ class RbaEnzyme: data[eid] = rba_enzyme return data - @staticmethod - def get_df_items(items): - df = pd.DataFrame([item.to_dict() for item in items.values()]) - df.set_index('enzyme', inplace=True) - return df - def to_dict(self): mach_reactants = '; '.join([f'species={species}, stoic={stoic}' for species, stoic in self.mach_reactants.items()]) diff --git a/rbaxdf/model/rba_metabolism.py b/rbaxdf/model/rba_metabolism.py index a47b3e3..bd80540 100644 --- a/rbaxdf/model/rba_metabolism.py +++ b/rbaxdf/model/rba_metabolism.py @@ -47,6 +47,21 @@ class RbaMetabolism: print(f'wrong metabolism type: {m_type}') return df + def validate(self, component_ids): + valid = True + missing = self.ref_molecules().difference(component_ids['species']) + if len(missing) > 0: + print('species used in reactions not defined:', missing) + valid = False + return valid + + def ref_molecules(self): + refs = set() + for r in self.reactions.values(): + refs |= {sid for sid in r.reactants} + refs |= {sid for sid in r.products} + return refs + class RbaCompartment: diff --git a/rbaxdf/model/rba_model.py b/rbaxdf/model/rba_model.py index 5fd4ce1..966ffe7 100644 --- a/rbaxdf/model/rba_model.py +++ b/rbaxdf/model/rba_model.py @@ -9,10 +9,10 @@ import pandas as pd from .rba_macromolecules import RbaMacromolecules from .rba_metabolism import RbaMetabolism from .rba_parameters import RbaParameters -from .rba_process import RbaProcesses -from .rba_enzyme import RbaEnzyme -from .rba_density import RbaDensity -from .rba_target_group import RbaTargetGroup +from .rba_processes import RbaProcesses +from .rba_enzymes import RbaEnzymes +from .rba_densities import RbaDensities +from .rba_targets import RbaTargets class RbaModel: @@ -28,9 +28,9 @@ class RbaModel: self.metabolism = RbaMetabolism() self.parameters = RbaParameters() self.processes = RbaProcesses() - self.density = None - self.targets = None - self.enzymes = None + self.densities = RbaDensities() + self.targets = RbaTargets() + self.enzymes = RbaEnzymes() if os.path.exists(model_dir) is False: print(f'{model_dir} not found!') @@ -43,10 +43,9 @@ class RbaModel: self.proteins.get_xml_items(self.model_dir) self.metabolism.get_xml_items(self.model_dir) self.processes.get_xml_items(self.model_dir) - - self.density = RbaDensity.get_xml_items(self.model_dir) - self.targets = RbaTargetGroup.get_xml_items(self.model_dir) - self.enzymes = RbaEnzyme.get_xml_items(self.model_dir) + self.densities.get_xml_items(self.model_dir) + self.enzymes.get_xml_items(self.model_dir) + self.targets.get_xml_items(self.model_dir) self.is_model = True @@ -59,11 +58,9 @@ class RbaModel: m_dict['rnas'] = self.rnas.get_df_items() m_dict['dna'] = self.dna.get_df_items() m_dict['proteins'] = self.proteins.get_df_items() - - m_dict['density'] = RbaDensity.get_df_items(self.density) - m_dict['targets'] = RbaTargetGroup.get_df_items(self.targets) - m_dict['enzymes'] = RbaEnzyme.get_df_items(self.enzymes) - + m_dict['enzymes'] = self.enzymes.get_df_items() + m_dict['densities'] = self.densities.get_df_items() + m_dict['targets'] = self.targets.get_df_items() m_dict['compartments'] = self.metabolism.get_df_items('compartments') m_dict['species'] = self.metabolism.get_df_items('species') m_dict['reactions'] = self.metabolism.get_df_items('reactions') @@ -86,9 +83,9 @@ class RbaModel: if '=' in row['machineryCapacity']: first_value = row['machineryCapacity'].split('=')[1] m_dict['processes'].at[idx, 'capacity_info'] = self.parameters.get_value_info(first_value) - for idx, row in m_dict['density'].iterrows(): + for idx, row in m_dict['densities'].iterrows(): first_value = row['targetValue'].split('=')[1] - m_dict['density'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value) + m_dict['densities'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value) for idx, row in m_dict['targets'].iterrows(): first_value = row['targetValue'].split('=')[1] m_dict['targets'].at[idx, 'value_info'] = self.parameters.get_value_info(first_value) @@ -99,6 +96,54 @@ class RbaModel: df.to_excel(writer, sheet_name=name, index=keep_index) print(f'model exported to {xlsx_name}') + def validate(self): + component_ids = {'species': set(self.metabolism.species), + 'dna': set(self.dna.macromolecules), + 'rnas': set(self.rnas.macromolecules), + 'proteins': set(self.proteins.macromolecules), + 'functions': set(self.parameters.functions), + 'aggregates': set(self.parameters.aggregates)} + + valid = True + valid = valid and self.metabolism.validate(component_ids) + valid = valid and self.processes.validate(component_ids) + valid = valid and self.densities.validate(component_ids) + valid = valid and self.enzymes.validate(component_ids) + valid = valid and self.targets.validate(component_ids) + valid = valid and self.parameters.validate(component_ids) + return valid + + def check_unused(self): + unused = 0 + molecules = (set(self.metabolism.species) | set(self.dna.macromolecules) | + set(self.rnas.macromolecules) | set(self.proteins.macromolecules)) + parameters = set(self.parameters.functions) | set(self.parameters.aggregates) + + ref_parameters = set() + ref_parameters |= self.processes.ref_parameters() + ref_parameters |= self.densities.ref_parameters() + ref_parameters |= self.targets.ref_parameters() + ref_parameters |= self.parameters.ref_parameters() + ref_parameters |= self.enzymes.ref_parameters() + + ref_molecules = set() + ref_molecules |= self.metabolism.ref_molecules() + ref_molecules |= self.processes.ref_molecules() + ref_molecules |= self.enzymes.ref_molecules() + ref_molecules |= self.targets.ref_molecules() + + unused_parameters = parameters.difference(ref_parameters) + unused_molecules = molecules.difference(ref_molecules) + + if len(unused_parameters) > 0: + print(f'{len(unused_parameters)} unused parameters:', unused_parameters) + unused += len(unused_parameters) + if len(unused_molecules) > 0: + print(f'{len(unused_molecules)} unused molecules:', unused_molecules) + unused += len(unused_molecules) + if unused == 0: + print('no unused parameters/molecules') + def from_df(self): pass diff --git a/rbaxdf/model/rba_parameters.py b/rbaxdf/model/rba_parameters.py index 633170d..2e6eef8 100644 --- a/rbaxdf/model/rba_parameters.py +++ b/rbaxdf/model/rba_parameters.py @@ -46,6 +46,20 @@ class RbaParameters: print(f'wrong parameter type: {p_type}') return df + def validate(self, component_ids): + valid = True + missing = self.ref_parameters().difference(component_ids['functions']) + if len(missing) > 0: + print('functions used in aggregates not defined:', missing) + valid = False + return valid + + def ref_parameters(self): + refs = set() + for a in self.aggregates.values(): + refs |= set(a.functions) + return refs + class RbaFunction: diff --git a/rbaxdf/model/rba_process.py b/rbaxdf/model/rba_processes.py similarity index 78% rename from rbaxdf/model/rba_process.py rename to rbaxdf/model/rba_processes.py index 001a81a..e5d256c 100644 --- a/rbaxdf/model/rba_process.py +++ b/rbaxdf/model/rba_processes.py @@ -55,6 +55,74 @@ class RbaProcesses: print(f'wrong parameter type: {p_type}') return df + def validate(self, component_ids): + valid = True + + missing = self.ref_molecules_pmaps().difference(component_ids['species']) + if len(missing) > 0: + print('species used in processingMaps not defined:', missing) + valid = False + + missing = self.ref_molecules_machinery().difference(component_ids['species']) \ + .difference(component_ids['rnas']) \ + .difference(component_ids['proteins']) + if len(missing) > 0: + print('species/macromolecules used in processes not defined:', missing) + valid = False + + missing = self.ref_molecules_inputs().difference(component_ids['dna']) \ + .difference(component_ids['rnas']) \ + .difference(component_ids['proteins']) + if len(missing) > 0: + print('macromolecules used in processes not defined:', missing) + valid = False + + missing = self.ref_parameters().difference(component_ids['functions']) \ + .difference(component_ids['aggregates']) + if len(missing) > 0: + print('function/aggregates used in aggregates not defined:', missing) + valid = False + + return valid + + def ref_molecules_pmaps(self): + refs = set() + for pmap in self.processing_maps.values(): + refs |= {sid for sid in pmap.constant_processing.get('reactants', {})} + refs |= {sid for sid in pmap.constant_processing.get('products', {})} + for comp_proc in pmap.component_processings.values(): + refs |= {sid for sid in comp_proc['reactants']} + refs |= {sid for sid in comp_proc['products']} + return refs + + def ref_molecules_machinery(self): + refs = set() + for p in self.processes.values(): + refs |= {sid for sid in p.machinery.get('reactants', {})} + refs |= {sid for sid in p.machinery.get('products', {})} + return refs + + def ref_molecules_inputs(self): + refs = set() + for p in self.processes.values(): + refs |= {sid for sid in p.productions.get('inputs', {})} + refs |= {sid for sid in p.degradations.get('inputs', {})} + return refs + + def ref_molecules(self): + refs = set() + refs |= self.ref_molecules_pmaps() + refs |= self.ref_molecules_machinery() + refs |= self.ref_molecules_inputs() + return refs + + def ref_parameters(self): + refs = set() + for p in self.processes.values(): + if 'capacity' in p.machinery: + refs.add(p.machinery['capacity'].value) + return refs + class RbaProcess: @@ -134,8 +202,8 @@ class RbaProcess: return {'process': self.id, 'name': self.name, 'machineryCapacity': mach_capacity, 'machineryReactants': mach_reactants, 'machineryProducts': mach_products, - 'productionsProcessingMap': prod_pmap, 'productionsSet': prod_set, - 'productionsInputs': prod_inputs, + 'productionProcessingMap': prod_pmap, 'productionSet': prod_set, + 'productionInputs': prod_inputs, 'degradationProcessingMap': degr_pmap, 'degradationSet': degr_set, 'degradationInputs': degr_inputs} diff --git a/rbaxdf/model/rba_target_group.py b/rbaxdf/model/rba_target_group.py deleted file mode 100644 index 0892650..0000000 --- a/rbaxdf/model/rba_target_group.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Implementation of RbaTargetGroup class. - -Peter Schubert, CCB, HHU Duesseldorf, December 2022 -""" - -import os -import pandas as pd -import xml.etree.ElementTree - -from rbaxdf.utils.et_utils import get_target_species, get_target_reactions - - -class RbaTargetGroup: - - def __init__(self, tgid): - self.id = tgid - self.concentrations = {} - self.production_fluxes = {} - self.degradation_fluxes = {} - self.reaction_fluxes = {} - - @staticmethod - def get_xml_items(model_dir): - - file_name = os.path.join(model_dir, 'targets.xml') - if os.path.exists(file_name) is False: - print(f'{file_name} not found!') - return {} - - tree = xml.etree.ElementTree.parse(file_name) - root = tree.getroot() - assert root.tag == 'RBATargets' - - data = {} - target_groups = root.find('listOfTargetGroups') - for target_group in target_groups.findall('targetGroup'): - tgid = target_group.attrib.get('id', '') - rba_target = RbaTargetGroup(tgid) - rba_target.concentrations = get_target_species(target_group.find('listOfConcentrations')) - rba_target.production_fluxes = get_target_species(target_group.find('listOfProductionFluxes')) - rba_target.degradation_fluxes = get_target_species(target_group.find('listOfDegradationFluxes')) - rba_target.reaction_fluxes = get_target_reactions(target_group.find('listOfReactionFluxes')) - data[tgid] = rba_target - return data - - @staticmethod - def get_df_items(items): - data = [] - for tgid, tg in items.items(): - for target, target_value in tg.concentrations.items(): - data.append([tgid, 'concentrations', target, target_value.get_str()]) - for target, target_value in tg.production_fluxes.items(): - data.append([tgid, 'productionFluxes', target, target_value.get_str()]) - for target, target_value in tg.degradation_fluxes.items(): - data.append([tgid, 'degradationFluxes', target, target_value.get_str()]) - for target, target_value in tg.reaction_fluxes.items(): - data.append([tgid, 'reactionFluxes', target, target_value.get_str()]) - - df = pd.DataFrame(data, columns=['targetGroup', 'targetType', 'target', 'targetValue']) - df.set_index('targetGroup', inplace=True) - return df diff --git a/rbaxdf/model/rba_targets.py b/rbaxdf/model/rba_targets.py new file mode 100644 index 0000000..11b2cdc --- /dev/null +++ b/rbaxdf/model/rba_targets.py @@ -0,0 +1,106 @@ +"""Implementation of RbaTargets and RbaTargetGroup classes. + +Peter Schubert, CCB, HHU Duesseldorf, December 2022 +""" + +import os +import pandas as pd +import xml.etree.ElementTree + +from rbaxdf.utils.et_utils import get_target_species, get_target_reactions + + +class RbaTargets: + + def __init__(self): + self.target_groups = {} + + def get_xml_items(self, model_dir): + file_name = os.path.join(model_dir, 'targets.xml') + if os.path.exists(file_name) is False: + print(f'{file_name} not found!') + return {} + + tree = xml.etree.ElementTree.parse(file_name) + root = tree.getroot() + assert root.tag == 'RBATargets' + + self.target_groups = RbaTargetGroup.get_xml_items(root) + + def get_df_items(self): + data = [] + for tgid, tg in self.target_groups.items(): + tdict = tg.to_dict() + for target_type in ['concentrations', 'productionFluxes', 'degradationFluxes', 'reactionFluxes']: + for target, value in tdict[target_type].items(): + data.append([tgid, target_type, target, value]) + df = pd.DataFrame(data, columns=['targetGroup', 'targetType', 'target', 'targetValue']) + df.set_index('targetGroup', inplace=True) + return df + + def validate(self, component_ids): + valid = True + missing = self.ref_molecules().difference(component_ids['species']) \ + .difference(component_ids['rnas']) \ + .difference(component_ids['dna']) \ + .difference(component_ids['proteins']) + if len(missing) > 0: + print('species/macromolecules used in targets not defined:', missing) + valid = False + + missing = self.ref_parameters().difference(component_ids['functions']) \ + .difference(component_ids['aggregates']) + if len(missing) > 0: + print('function/aggregates used in targets not defined:', missing) + valid = False + + return valid + + def ref_molecules(self): + refs = set() + for tg in self.target_groups.values(): + refs |= {sid for sid in tg.concentrations} + refs |= {sid for sid in tg.production_fluxes} + refs |= {sid for sid in tg.degradation_fluxes} + return refs + + def ref_parameters(self): + refs = set() + for tg in self.target_groups.values(): + for target_type in ['concentrations', 'production_fluxes', 'degradation_fluxes', 'reaction_fluxes']: + targets = getattr(tg, target_type) + for target in targets.values(): + refs |= {tv.split('=')[1].strip() for tv in target.get_str().split(',')} + return refs + + +class RbaTargetGroup: + + def __init__(self, tgid): + self.id = tgid + self.concentrations = {} + self.production_fluxes = {} + self.degradation_fluxes = {} + self.reaction_fluxes = {} + + @staticmethod + def get_xml_items(root): + data = {} + target_groups = root.find('listOfTargetGroups') + for target_group in target_groups.findall('targetGroup'): + tgid = target_group.attrib.get('id', '') + rba_target = RbaTargetGroup(tgid) + rba_target.concentrations = get_target_species(target_group.find('listOfConcentrations')) + rba_target.production_fluxes = get_target_species(target_group.find('listOfProductionFluxes')) + rba_target.degradation_fluxes = get_target_species(target_group.find('listOfDegradationFluxes')) + rba_target.reaction_fluxes = get_target_reactions(target_group.find('listOfReactionFluxes')) + data[tgid] = rba_target + return data + + def to_dict(self): + conc = {target: value.get_str() for target, value in self.concentrations.items()} + prod_fluxes = {target: value.get_str() for target, value in self.production_fluxes.items()} + degr_fluxes = {target: value.get_str() for target, value in self.degradation_fluxes.items()} + reac_fluxes = {target: value.get_str() for target, value in self.reaction_fluxes.items()} + return {'targetGroup': self.id, 'concentrations': conc, 'productionFluxes': prod_fluxes, + 'degradationFluxes': degr_fluxes, 'reactionFluxes': reac_fluxes} -- GitLab