diff --git a/README.md b/README.md index 9d4440f457f610640bfc936ada9f83ac0ac10a4a..f37176efe9bc757d78860706ba57d4dc0369b9bb 100644 --- a/README.md +++ b/README.md @@ -3,66 +3,77 @@ This is a port to Python and improvement of the network reduction MATLAB code (CNAreduceMFNetwork.m) implemented in **CellNetAnalyzer** [1]. -Making use of functions provided by **glpk** [2] and **sbmlxdf**. +GLPK (GNU Linear Programming Kit) [2] is used as local solver to solve +related FBA (flux balance analysis) [4] and +FVA (flux variability analysis) [5] problems. -The network reduction method is based on **An algorithm for the reduction +The network pruning method is based on **An algorithm for the reduction of genome-scale metabolic network models to meaningful core models** [3]. This port does not implement the additional (optional) step of -network compression, which converts linear pathways to single reactions +network compression, which compresses linear pathways to single reactions and suppresses some biomass metabolites and transport reactions. ## Benefits of this port: -This is a port of the respective MATLAB code, mainly the functions -CNAreduceMFNetwork, CNAcompressMFNetwork, with following -enhancements: +This package is based on exising CellNetAnalzyer MATLAB code, mainly functions +in CNAreduceMFNetwork.m, with following enhancements: - significant speedup (e.g. network reduction of genome-scale E. coli network - 30 h -> 1 h on a laptop) -- direct interface to SBML coded models (import from SBML / export to SBML) + 30 h -> less than 1 h on a laptop) +- direct interface to SBML coded models via **sbmlxdf** - use of open software -- modifications can be implemented easier in Python compared to - MATLAB (fewer people program in MATLAB) -- SBML identifiers, annotations, gene-product-associations, - genes and groups carried over transparently - from the full network to the pruned network, making integration - with external databases easier +- modifications can be implemented easier in Python compared to MATLAB +- SBML identifiers, annotations, gene-product-associations, genes and + groups carried over transparently from the full network to the pruned + network, making subsequent interaction with external databases easier - protected reactions, metabolites, functions, flux bound overwrites can be - provided in an Excel spreadsheet, alternatively as a Python dict + provided in an Excel spreadsheet, alternatively as a Python dict + +Code has been developped and validated on Apple MacBook Pro 2019. Speed-up improvements mainly due to faster implementation of flux variability -function calls on glpk. Additional speed improvements achieved by -continually removing reactions and blocked metabolites during the network -reduction process. Essential reactions identified earlier. +function calls on glpk and using muliprocessing. Speed improvements could +also be achieved by removing candidate reactions and blocked metabolites +at each iteration step, i.e. continuously reducing the size of the model. ## install package: ```shell -$ pip3 install networkred@git+https://gitlab.cs.uni-duesseldorf.de/schubert/modelprune.git +$ pip3 install modelprune@git+https://gitlab.cs.uni-duesseldorf.de/schubert/modelprune.git ``` ## Small Python example -```python +```python 3 import modelpruner -# file names full_sbml = 'sample_data/SBML_models/Deniz_model_fba.xml' -pruned_sbml = 'sample_data/SBML_models/Deniz_model_fba_pruned.xml' -protected_parts = 'sample_data/data/Deniz_model_fba_nrp.xlsx' - +pruned_sbml = 'sample_data/SBML_models/Deniz_model_fba_reduced.xml' +protected_name = 'sample_data/data/Deniz_model_fba_nrp2.xlsx' + # load the original model -mp = modelpruner.ModelPruner(full_sbml, protected_parts) +mp = modelpruner.ModelPruner(full_sbml, protected_name) -# prune the network +print('---- network reduction -----') mp.prune() - -# export reduced model to sbml -mp.write_sbml(pruned_sbml) -``` +# export pruned model in sbml format +mp.export_pruned_model(pruned_sbml) +print(mp.print_status()) +``` ## License + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/> + [GPLv3](LICENSE.txt) @@ -73,8 +84,16 @@ Peter Schubert, Computational Cell Biology, HHU Duesseldorf, July 2022 in biotechnology and metabolic engineering. Journal of Biotechnolgy 261: 221-228. https://doi.org/10.1016/j.jbiotec.2017.05.001 -[2] GNU gklp reference #### TO BE UPDATED +[2] The GLPK package is part of the GNU Project released under the aegis of GNU. [3] Erdrich P, Steuer R, Klamt S. (2015) An algorithm for the reduction of genome-scale metabolic network models to meaningful core models. BMC Syst Biol 9, 48. https://doi.org/10.1186/s12918-015-0191-x + +[4] Fell, DA; Small, JR (1986). "Fat synthesis in adipose tissue. +An examination of stoichiometric constraints". +Biochem J. 238 (3): 781–786. doi:10.1042/bj2380781. PMC 1147204. PMID 3800960. + +[5] Mahadevan, R., & Schilling, C. H. (2003). The effects of alternate optimal +solutions in constraint-based genome-scale metabolic models. +Metabolic Engineering, 5(4), 264–276. https://doi.org/10.1016/j.ymben.2003.09.002 \ No newline at end of file diff --git a/modelpruner/_version.py b/modelpruner/_version.py index 0b6fe6ada0bff6a7b153de5775dca38cc1d8f544..dc8bf378878b039932ead25e856f1f986ca74294 100644 --- a/modelpruner/_version.py +++ b/modelpruner/_version.py @@ -1,5 +1,5 @@ """ Definition of version string. """ -__version__ = "0.2.0" +__version__ = "0.3.0" program_name = 'modelpruner' diff --git a/modelpruner/core/model_pruner.py b/modelpruner/core/model_pruner.py index d83be0016e73ed30a63600a8566989e3844223ea..7c97eb3f9285687bc7219d2e8739df4dbe0ba890 100644 --- a/modelpruner/core/model_pruner.py +++ b/modelpruner/core/model_pruner.py @@ -66,7 +66,7 @@ def _worker(rids): class ModelPruner: - def __init__(self, sbml_fname, protected_parts, resume=False): + def __init__(self, sbml_fname, protected_parts, resume=False, processes=100): """Init :param sbml_fname: pathname of original model (ending with '.xml') @@ -75,10 +75,13 @@ class ModelPruner: :type protected_parts: str or dict with protected parts information :param resume: Indicator if to resume pruning from stored snapshot :type resume: bool, optional (default: False) + :param processes: maximal number of processes for multiprocessing + :type processes: int (default: 100) """ self.tolerance = 1e-7 # later accessible by set_params() ? - self.cpus = int(os.cpu_count() - 0) + self.cpus = min(processes, os.cpu_count()) self.min_mp_total_flux = 1000 + self.full_sbmnl_fname = sbml_fname self._snapshot_sbml = re.sub(r'.xml$', '_snapshot.xml', sbml_fname) self._snapshot_json = re.sub(r'.xml$', '_snapshot.json', sbml_fname) @@ -431,19 +434,24 @@ class ModelPruner: f"{status['n_full_rids' ] - status['n_rids']:4d}/{status['n_rids']:4d}" f"/{len(self.protected_rids):4d}; dof: {status['dof']:3d}") - def export_pruned_model(self, pruned_sbml): + def export_pruned_model(self, pruned_sbml=None): """Export the pruned network (or snapshot) to SBML. Component information of initial network will be carried over, e.g. annotation data, groups, gene products :param pruned_sbml: pathname of pruned network, ending with '.xml' - :type pruned_sbml: str + :type pruned_sbml: str, optional (default: None, construct a name) """ + n_s, n_r = self.fba_model.s_mat_coo.shape + if pruned_sbml is None: + pruned_sbml = re.sub(r'.xml$', f'_pruned_{n_s}x{n_r}.xml', self.full_sbmnl_fname) + success = self.fba_model.export_pruned_model(pruned_sbml) if success is True: print('Pruned model exported to', pruned_sbml) - # self.remove_snapshot_files() + if len(self.protected_rids) == n_r: + self.remove_snapshot_files() else: print('Pruned model could not be exported') diff --git a/modelpruner/models/fba_model.py b/modelpruner/models/fba_model.py index 5712705a75df445f4c5abaeb8eb55d3942a46ed3..68b2437c822a7e16d27818fe10022ba01477ef08 100644 --- a/modelpruner/models/fba_model.py +++ b/modelpruner/models/fba_model.py @@ -8,7 +8,7 @@ import numpy as np from scipy import sparse import sbmlxdf -from modelpruner.problems.lp_problem import LpProblem +from modelpruner.problems.glpk_linear_problem import GlpkLinearProblem class FbaModel: @@ -45,7 +45,7 @@ class FbaModel: # using a sparse matrix speeds up significantly pickling (factor > 350 for S-matrix) # pickling required to support multiprocessing # also data exchange from sbmlxdf.Model and towards LpProblem improves - self.s_mat_coo = sparse.coo_matrix(sbml_model.get_s_matrix(sparse=True)) + self.s_mat_coo = sparse.coo_array(sbml_model.get_s_matrix(sparse=True)) # lp problem will only be created once required. # this gives chances to update e.g. flux bounds and to support pickling (multiprocessing) @@ -104,7 +104,8 @@ class FbaModel: group = [idx] for adjacent_pos in range(pos + 1, len(sorted_idx)): candiate_idx = sorted_idx[adjacent_pos] - if (fwd_rev_s_mat_coo.getcol(idx) != fwd_rev_s_mat_coo.getcol(candiate_idx)).getnnz() == 0: + if (fwd_rev_s_mat_coo.getcol(idx) != fwd_rev_s_mat_coo.getcol(candiate_idx))\ + .getnnz() == 0: consumed_idxs.add(candiate_idx) group.append(candiate_idx) else: @@ -194,8 +195,8 @@ class FbaModel: if 'assoc=' in gpr: gpr = re.sub('assoc=', '', gpr) gpr = re.sub(r'[()]', ' ', gpr) - gpr = re.sub(r'and', ' ', gpr) - gpr = re.sub(r'or', ' ', gpr) + gpr = re.sub(r' and ', ' ', gpr) + gpr = re.sub(r' or ', ' ', gpr) gp_ids = gp_ids.union(set(re.findall(r'\w+', gpr))) if 'fbcGeneProducts' in pruned_mdict: pruned_mdict['fbcGeneProducts'] = pruned_mdict['fbcGeneProducts'].loc[list(gp_ids)] @@ -204,7 +205,8 @@ class FbaModel: pruned_model.from_df(pruned_mdict) err = pruned_model.validate_sbml() if len(err) > 0: - print('SBML validation result: %s', ', '.join([k + ': ' + str(v) for k, v in err.items()])) + print('SBML validation result:', ', '.join([k + ': ' + str(v) for k, v in err.items()])) + print('see ./results/tmp.txt for details') return pruned_model.export_sbml(pruned_sbml) def create_core_fba_lp(self): @@ -225,7 +227,7 @@ class FbaModel: fwd_rev_col_bnds = np.hstack((np.fmax(self.flux_bounds[:], 0), np.fmax(np.flip(-self.flux_bounds[:, self.reversible], axis=0), 0))) row_bnds = np.zeros((fwd_rev_s_mat_coo.shape[0], 2)) - lp = LpProblem(fwd_rev_s_mat_coo, row_bnds, fwd_rev_col_bnds) + lp = GlpkLinearProblem(fwd_rev_s_mat_coo, row_bnds, fwd_rev_col_bnds) return lp def delete_fba_lp(self): @@ -401,6 +403,10 @@ class FbaModel: def fva_pf_flux_ranges(self, pf, rids): """FVA for a specific protected function for all or selected reactions + Reference: Mahadevan R, Schilling CH: The effects of alternate optimal + solutions in constraint-based genome-scale metabolic models. + Metab Eng 2003, 5(4):264-76. + :param pf: protected function (objective) :type pf: class ProtectedFunction :param rids: list of reaction ids to get flux range @@ -436,18 +442,6 @@ class FbaModel: # print(res) return res - def fva_single_rid(self, rid): - """FVA - flux variability analysis for all or selected reactions. - - :param rid: selected reaction ids to run FVA (alternatively, all) - :type rid: list of strings (default: None - all reactions) - :return: rid, min and max flux values - :rtype: dict - """ - len(self.rids) - print('fva_single_rid not implemented', rid) - return {'success': False} - def copy(self): """Copy the FbaModel instance (while removing linear problem). diff --git a/modelpruner/problems/__init__.py b/modelpruner/problems/__init__.py index 95f537df2d71bef5d10257c57151bd7ba23665bd..1c1ef53afa9784f99b4a4166b95b50d2548f4797 100644 --- a/modelpruner/problems/__init__.py +++ b/modelpruner/problems/__init__.py @@ -1,5 +1,5 @@ """Subpackage with LpProblem Class """ -from .lp_problem import LpProblem +from .glpk_linear_problem import GlpkLinearProblem -__all__ = ['LpProblem'] +__all__ = ['GlpkLinearProblem'] diff --git a/modelpruner/problems/lp_problem.py b/modelpruner/problems/glpk_linear_problem.py similarity index 97% rename from modelpruner/problems/lp_problem.py rename to modelpruner/problems/glpk_linear_problem.py index 8324ce1562ae8f351209effcb443197dc34ffb9b..e5bac42cc937362241ccad54a3ebdcdae51b7e5f 100644 --- a/modelpruner/problems/lp_problem.py +++ b/modelpruner/problems/glpk_linear_problem.py @@ -57,7 +57,7 @@ scaling_options = {'GM': glpk.GLP_SF_GM, 'AUTO': glpk.GLP_SF_AUTO} -class LpProblem: +class GlpkLinearProblem: def __init__(self, constr_coefs_coo, row_bnds, col_bnds): """create core linear problem (constraints, variable and row bounds) @@ -71,9 +71,9 @@ class LpProblem: :param col_bnds: lower and upper bounds of structural (col) variables :type col_bnds: 2D ndarray of shape(2, ncols) (1st row: lower bounds, 2nd row: upper bounds) """ - glp_msg_lev = glpk.GLP_MSG_ERR + glp_msg_lev = glpk.GLP_MSG_OFF # GLP_MSG_OFF, GLP_MSG_ERR, GLP_MSG_ON, GLP_MSG_ALL glp_tm_lim = 10 * 1000 - glp_presolve = glpk.GLP_OFF # or glpk.GLP_OFF (default) or GLP_ON + glp_presolve = glpk.GLP_OFF # or glpk.GLP_OFF (default) or GLP_ON self.res = {} self.nrows, self.ncols = constr_coefs_coo.shape @@ -251,6 +251,11 @@ class LpProblem: glpk.glp_del_rows(self.lp, len(row_idxs0), num) self.nrows -= len(row_idxs0) + def construct_inital_basis(self): + """Re-construct an initial basis, e.g. after problem update + """ + glpk.glp_adv_basis(self.lp, 0) + def solve(self, short_result=True, scaling=None): """Solve the Linear Programming Problem. diff --git a/sample_prune_model.py b/sample_prune_model.py index 134b319997d42b59982438764b1b2a031879c8a1..ce6edbd15c851b7367504d467df798269a71ccbc 100644 --- a/sample_prune_model.py +++ b/sample_prune_model.py @@ -1,35 +1,21 @@ -# sample_reduce_model.py -# sample script for network reduction of a SBML fbc model +# sample_prune_model.py +# sample script to prune a metabolic model (coded in SBML) using modelpruner + +# Peter Schubert, HHU, Institute of Computational Cell Biology, Prof. Martin Lercher +# July, 2022 -import sys import os -# import logging import modelpruner def prune_model(): - # activate logging - # logfile = './log/nr.log' - # logformat = '%(asctime)s %(levelname)s:%(name)s %(message)s' - # logdir = os.path.dirname(logfile) - # if not os.path.exists(logdir): - # os.mkdir(logdir) - # logging.basicConfig(filename=logfile, filemode='w', - # level=logging.INFO, format=logformat) - # logging.info('Started') - - # print some environment information - print('Python version: {:d}.{:d}.{:d}'.format(sys.version_info.major, - sys.version_info.minor, - sys.version_info.micro)) print('modelpruner version: {:s}'.format(modelpruner.__version__)) print('working directory :', os.getcwd()) # file names full_sbml = 'sample_data/SBML_models/Deniz_model_fba.xml' - pruned_sbml = 'sample_data/SBML_models/Deniz_model_fba_reduced.xml' protected_fname = 'sample_data/data/Deniz_model_fba_nrp2.xlsx' # load the original model @@ -39,11 +25,10 @@ def prune_model(): mp.prune() # export pruned model in sbml format - mp.export_pruned_model(pruned_sbml) - print('pruned model converted to SBML:', pruned_sbml) + mp.export_pruned_model() + print('pruned model converted to SBML:') print('finally protected reactions:', mp.protected_rids) print(mp.print_status()) - # logging.info('Finished') if __name__ == '__main__': diff --git a/setup.py b/setup.py index a2c3986889a5adee62bb7a3df1ab6991351755b0..fbf505834aaf7e319e91b9d00a45578d8f61e35f 100755 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( 'numpy>= 1.20.0', 'sbmlxdf>=0.2.7', 'swiglpk>=5.0.3', - 'scipy>=1.7.0', + 'scipy>=1.8.1', ], python_requires=">=3.7", keywords=['modeling', 'standardization', 'network reduction', 'SBML'],