Source code for deeprank_gnn.GraphGenMP

import os
import sys
import glob
import h5py
from tqdm import tqdm
import time
import multiprocessing as mp
from functools import partial
import pickle

from .ResidueGraph import ResidueGraph
from .Graph import Graph


[docs]class GraphHDF5(object): def __init__(self, pdb_path, ref_path=None, graph_type='residue', pssm_path=None, select=None, outfile='graph.hdf5', nproc=1, use_tqdm=True, tmpdir='./', limit=None, biopython=False): """Master class from which graphs are computed Args: pdb_path (str): path to the docking models ref_path (str, optional): path to the reference model. Defaults to None. graph_type (str, optional): Defaults to 'residue'. pssm_path ([type], optional): path to the pssm file. Defaults to None. select (str, optional): filter files that starts with 'input'. Defaults to None. outfile (str, optional): Defaults to 'graph.hdf5'. nproc (int, optional): number of processors. Default to 1. use_tqdm (bool, optional): Default to True. tmpdir (str, optional): Default to `./`. limit (int, optional): Default to None. >>> pdb_path = './data/pdb/1ATN/' >>> pssm_path = './data/pssm/1ATN/' >>> ref = './data/ref/1ATN/' >>> GraphHDF5(pdb_path=pdb_path, ref_path=ref, pssm_path=pssm_path, graph_type='residue', outfile='1AK4_residue.hdf5') """ # get the list of PDB names pdbs = list(filter(lambda x: x.endswith( '.pdb'), os.listdir(pdb_path))) if select is not None: pdbs = list(filter(lambda x: x.startswith(select), pdbs)) # get the full path of the pdbs pdbs = [os.path.join(pdb_path, name) for name in pdbs] if limit is not None: if isinstance(limit, list): pdbs = pdbs[limit[0]:limit[1]] else: pdbs = pdbs[:limit] # get the pssm data pssm = {} for p in pdbs: base = os.path.basename(p) mol_name = os.path.splitext(base)[0] base_name = mol_name.split('_')[0] if pssm_path is not None: pssm[p] = self._get_pssm( pssm_path, mol_name, base_name) else: pssm[p] = None # get the ref path if ref_path is None: ref = None else: ref = os.path.join(ref_path, base_name+'.pdb') # compute all the graphs on 1 core and directly # store the graphs the HDF5 file if nproc == 1: graphs = self.get_all_graphs( pdbs, pssm, ref, outfile, use_tqdm, biopython) else: if not os.path.isdir(tmpdir): os.mkdir(tmpdir) pool = mp.Pool(nproc) part_process = partial( self._pickle_one_graph, pssm=pssm, ref=ref, tmpdir=tmpdir, biopython=biopython) pool.map(part_process, pdbs) # get teh graph names graph_names = [os.path.join(tmpdir, f) for f in os.listdir(tmpdir)] graph_names = list( filter(lambda x: x.endswith('.pkl'), graph_names)) if select is not None: graph_names = list( filter(lambda x: x.startswith(tmpdir+select), graph_names)) # transfer them to the hdf5 with h5py.File(outfile, 'w') as f5: desc = '{:25s}'.format(' Store in HDF5') for name in graph_names: f = open(name, 'rb') g = pickle.load(f) try: g.nx2h5(f5) except Exception as e: print( 'Issue encountered while computing graph ', name) print(e) f.close() os.remove(name) # clean up rmfiles = glob.glob( '*.izone') + glob.glob('*.lzone') + glob.glob('*.refpairs') for f in rmfiles: os.remove(f)
[docs] def get_all_graphs(self, pdbs, pssm, ref, outfile, use_tqdm=True, biopython=False): graphs = [] if use_tqdm: desc = '{:25s}'.format(' Create HDF5') lst = tqdm(pdbs, desc=desc, file=sys.stdout) else: lst = pdbs for name in lst: try: graphs.append(self._get_one_graph( name, pssm, ref, biopython)) except Exception as e: print('Issue encountered while computing graph ', name) print(e) with h5py.File(outfile, 'w') as f5: for g in graphs: try: g.nx2h5(f5) except Exception as e: print('Issue encountered while storing graph ', g.pdb) print(e)
@staticmethod def _pickle_one_graph(name, pssm, ref, tmpdir='./', biopython=False): # get the graph try: g = ResidueGraph( pdb=name, pssm=pssm[name], biopython=biopython) if ref is not None: g.get_score(ref) # pickle it mol_name = os.path.basename(name) mol_name = os.path.splitext(mol_name)[0] fname = os.path.join(tmpdir, mol_name+'.pkl') f = open(fname, 'wb') pickle.dump(g, f) f.close() except Exception as e: print('Issue encountered while storing graph ', name) print(e) @staticmethod def _get_one_graph(name, pssm, ref, biopython): # get the graph g = ResidueGraph( pdb=name, pssm=pssm[name], biopython=biopython) if ref is not None: g.get_score(ref) return g @staticmethod def _get_pssm(pssm_path, mol_name, base_name): if pssm_path is None: return None pssmA = os.path.join(pssm_path, base_name+'.A.pssm') pssmB = os.path.join(pssm_path, base_name+'.B.pssm') # check if the pssms exists if os.path.isfile(pssmA) and os.path.isfile(pssmB): pssm = {'A': pssmA, 'B': pssmB} else: pssmA = os.path.join(pssm_path, base_name+'.A.pdb.pssm') pssmB = os.path.join(pssm_path, base_name+'.B.pdb.pssm') if os.path.isfile(pssmA) and os.path.isfile(pssmB): pssm = {'A': pssmA, 'B': pssmB} else: pssmA = os.path.join(pssm_path, mol_name+'.A.pdb.pssm') pssmB = os.path.join(pssm_path, mol_name+'.B.pdb.pssm') if os.path.isfile(pssmA) and os.path.isfile(pssmB): pssm = {'A': pssmA, 'B': pssmB} else: raise FileNotFoundError( 'PSSM file for ' + mol_name + ' not found') return pssm