From d370a903157f3b482ccf2eec51ab12da9da72acd Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 22 Sep 2023 16:37:33 +0200 Subject: [PATCH] refactor: simplify code to read atom data from pdb2sql objects --- deeprank2/utils/buildgraph.py | 153 +++++++++++----------------------- 1 file changed, 49 insertions(+), 104 deletions(-) diff --git a/deeprank2/utils/buildgraph.py b/deeprank2/utils/buildgraph.py index a22657c03..38da5632f 100644 --- a/deeprank2/utils/buildgraph.py +++ b/deeprank2/utils/buildgraph.py @@ -14,6 +14,9 @@ _log = logging.getLogger(__name__) +PDB_GET = "x,y,z,name,altLoc,occ,element,chainID,resSeq,resName,iCode" + + def _add_atom_to_residue(atom: Atom, residue: Residue): """Adds an `Atom` to a `Residue` if not already there. @@ -30,57 +33,56 @@ def _add_atom_to_residue(atom: Atom, residue: Residue): residue.add_atom(atom) -def _add_atom_data_to_structure(structure: PDBStructure, # pylint: disable=too-many-arguments, too-many-locals - x: float, y: float, z: float, - atom_name: str, - altloc: str, occupancy: float, - element_name: str, - chain_id: str, - residue_number: int, - residue_name: str, - insertion_code: str): - """ - This is a subroutine, to be used in other methods for converting pdb2sql atomic data into a +def _add_atom_data_to_structure( + structure: PDBStructure, + pdb_obj: pdb2sql_object | pdb2sql_interface, + **kwargs +): + """This is a subroutine, to be used in other methods for converting pdb2sql atomic data into a deeprank structure object. It should be called for one atom. Args: - structure (:class:`PDBStructure`): Where this atom should be added to. - x (float): x-coordinate of atom. - y (float): y-coordinate of atom. - z (float): z-coordinate of atom. - atom_name (str): Name of atom: 'CA', 'C', 'N', 'O', 'CB', etc. - altloc (str): Pdb alternative location id for this atom (can be empty): 'A', 'B', 'C', etc. - occupancy (float): Pdb occupancy of this atom, ranging from 0.0 to 1.0. Should be used with altloc. 
- element_name (str): Pdb element symbol of this atom: 'C', 'O', 'H', 'N', 'S'. - chain_id (str): Pdb chain identifier: 'A', 'B', 'C', etc. - residue_number (int): Pdb residue number, a positive integer. - residue_name (str): Pdb residue name: "ALA", "CYS", "ASP", etc. - insertion_code (str): Pdb residue insertion code (can be empty) : '', 'A', 'B', 'C', etc. + structure (:class:`PDBStructure`): The structure to which the retrieved atoms should be added. + pdb_obj (pdb2sql_object | pdb2sql_interface): The pdb2sql object to retrieve the data from. + kwargs: Keyword arguments forwarded to `pdb_obj.get` (e.g. `model=0` or `rowID=...`). """ - # Make sure not to take the same atom twice. - if altloc is not None and altloc != "" and altloc != "A": - return + retrieved_data = PDB_GET.split(sep=',') + for data_values in pdb_obj.get(PDB_GET, **kwargs): + atom_data = dict(zip(retrieved_data, data_values)) - insertion_code = None if insertion_code == "" else insertion_code - amino_acid = amino_acids_by_code[residue_name] if residue_name in amino_acids_by_code else None - atom_position = np.array([x, y, z]) + # Make sure not to take the same atom twice. 
+ if atom_data["altLoc"] is not None and atom_data["altLoc"] != "" and atom_data["altLoc"] != "A": + continue + atom_data["iCode"] = None if atom_data["iCode"] == "" else atom_data["iCode"] - if not structure.has_chain(chain_id): - structure.add_chain(Chain(structure, chain_id)) - chain = structure.get_chain(chain_id) + try: + atom_data["aa"] = amino_acids_by_code[atom_data["resName"]] + except KeyError: + atom_data["aa"] = None + atom_data["coordinates"] = np.array(data_values[:3]) - if not chain.has_residue(residue_number, insertion_code): - chain.add_residue(Residue(chain, residue_number, amino_acid, insertion_code)) - residue = chain.get_residue(residue_number, insertion_code) + if not structure.has_chain(atom_data["chainID"]): + structure.add_chain(Chain(structure, atom_data["chainID"])) + chain = structure.get_chain(atom_data["chainID"]) - atom = Atom( - residue, atom_name, AtomicElement[element_name], atom_position, occupancy - ) - _add_atom_to_residue(atom, residue) -def get_structure(pdb: pdb2sql_object, id_: str) -> PDBStructure: + if not chain.has_residue(atom_data["resSeq"], atom_data["iCode"]): + chain.add_residue(Residue(chain, atom_data["resSeq"], atom_data["aa"], atom_data["iCode"])) + residue = chain.get_residue(atom_data["resSeq"], atom_data["iCode"]) + + atom = Atom( + residue, + atom_data["name"], + AtomicElement[atom_data["element"]], + atom_data["coordinates"], + atom_data["occ"], + ) + _add_atom_to_residue(atom, residue) + + +def get_structure(pdb_obj: pdb2sql_object, id_: str) -> PDBStructure: """Builds a structure from rows in a pdb file. Args: @@ -91,41 +93,11 @@ def get_structure(pdb: pdb2sql_object, id_: str) -> PDBStructure: PDBStructure: The structure object, giving access to chains, residues, atoms. 
""" structure = PDBStructure(id_) - - # Iterate over the atom output from pdb2sql - for row in pdb.get( - "x,y,z,rowID,name,altLoc,occ,element,chainID,resSeq,resName,iCode", model=0 - ): - - ( - x, - y, - z, - _, - atom_name, - altloc, - occupancy, - element_name, - chain_id, - residue_number, - residue_name, - insertion_code, - ) = row - - _add_atom_data_to_structure(structure, - x, y, z, - atom_name, - altloc, occupancy, - element_name, - chain_id, - residue_number, - residue_name, - insertion_code) - + _add_atom_data_to_structure(structure, pdb_obj, model=0) return structure -def get_contact_atoms( # pylint: disable=too-many-locals +def get_contact_atoms( pdb_path: str, chain_ids: list[str], interaction_radius: float @@ -133,47 +105,20 @@ def get_contact_atoms( # pylint: disable=too-many-locals """Gets the contact atoms from pdb2sql and wraps them in python objects.""" interface = pdb2sql_interface(pdb_path) + pdb_name = os.path.splitext(os.path.basename(pdb_path))[0] + structure = PDBStructure(f"contact_atoms_{pdb_name}") + try: atom_indexes = interface.get_contact_atoms( cutoff=interaction_radius, chain1=chain_ids[0], chain2=chain_ids[1], ) - rows = interface.get( - "x,y,z,name,element,altLoc,occ,chainID,resSeq,resName,iCode", - rowID=atom_indexes[chain_ids[0]] + atom_indexes[chain_ids[1]] - ) + pdb_rowID = atom_indexes[chain_ids[0]] + atom_indexes[chain_ids[1]] + _add_atom_data_to_structure(structure, interface, rowID=pdb_rowID) finally: interface._close() # pylint: disable=protected-access - pdb_name = os.path.splitext(os.path.basename(pdb_path))[0] - structure = PDBStructure(f"contact_atoms_{pdb_name}") - - for row in rows: - ( - x, - y, - z, - atom_name, - element_name, - altloc, - occupancy, - chain_id, - residue_number, - residue_name, - insertion_code - ) = row - - _add_atom_data_to_structure(structure, - x, y, z, - atom_name, - altloc, occupancy, - element_name, - chain_id, - residue_number, - residue_name, - insertion_code) - return 
structure.get_atoms() def get_residue_contact_pairs( # pylint: disable=too-many-locals