Skip to content

Commit

Permalink
Merge pull request #498 from DeepRank/54_docking_metrics_gcroci2
Browse files Browse the repository at this point in the history
docs: clarify ppi scoring metrics and add doc strings and tests
  • Loading branch information
gcroci2 authored Sep 21, 2023
2 parents ddd04ad + 0e3022d commit ada1790
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 42 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Main features:
- All features' documentation is available [here](https://deeprank2.readthedocs.io/en/latest/features.html)
- Predefined target types
- binary class, CAPRI categories, DockQ, RMSD, and FNAT
- Detailed docking scores documentation is available [here](https://deeprank2.readthedocs.io/en/latest/docking.html)
- Flexible definition of both new features and targets
- Features generation for both graphs and grids
- Efficient data storage in HDF5 format
Expand Down
28 changes: 12 additions & 16 deletions deeprank2/tools/target.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,24 +83,20 @@ def add_target(graph_path: Union[str, List[str]], target_name: str, target_list:
print(f"no graph for {hdf5}")


def compute_targets(pdb_path: str, reference_pdb_path: str) -> Dict[str, Union[float, int]]:
def compute_ppi_scores(pdb_path: str, reference_pdb_path: str) -> Dict[str, Union[float, int]]:

"""
Compute targets and outputs them as a dictionary.
For classification:
- binary (scalar value is expected to be either 0 or 1)
- capri_classes (scalar integer values are expected)
For regression:
- irmsd
- lrmsd
- fnat
- dockq
"""Compute structure similarity scores for the input docking model and return them as a dictionary.
The computed scores are: `lrmsd` (ligand root mean square deviation), `irmsd` (interface rmsd),
`fnat` (fraction of native contacts), `dockq` (docking model quality), `binary` (True - high quality,
False - low quality), `capri_class` (capri classification, 1 - high quality, 2 - medium, 3 - acceptable,
4 - incorrect). See https://deeprank2.readthedocs.io/en/latest/docking.html for more details about the scores.
Args:
pdb_path (str): Path to the scored pdb structure.
reference_pdb_path (str): Path to the reference structure required to compute the different target.
pdb_path (str): Path to the decoy.
reference_pdb_path (str): Path to the reference (native) structure.
Returns: a dictionary containing values for lrmsd, irmsd, fnat, dockq, binary, capri_class
Returns: a dictionary containing values for lrmsd, irmsd, fnat, dockq, binary, capri_class.
"""

ref_name = os.path.splitext(os.path.basename(reference_pdb_path))[0]
Expand Down Expand Up @@ -128,8 +124,8 @@ def compute_targets(pdb_path: str, reference_pdb_path: str) -> Dict[str, Union[f
)
scores[targets.BINARY] = scores[targets.IRMSD] < 4.0

scores[targets.CAPRI] = 5
for thr, val in zip([6.0, 4.0, 2.0, 1.0], [4, 3, 2, 1]):
scores[targets.CAPRI] = 4
for thr, val in zip([4.0, 2.0, 1.0], [3, 2, 1]):
if scores[targets.IRMSD] < thr:
scores[targets.CAPRI] = val

Expand Down
45 changes: 45 additions & 0 deletions docs/docking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Docking scores

The following scores have been developed for evaluating the quality of the protein-protein models produced by computational methods (docking models), and all of them compare the structural similarity between the decoys (computationally generated structures) and the experimentally solved native structures. To calculate these measures, the interface between the two interacting protein molecules is defined as any pair of heavy atoms from the two molecules within 5Å of each other.

- `lrmsd` (ligand root mean square deviation) is a float value calculated for the backbone of the shorter chain (ligand) of the model after superposition of the longer chain (receptor). Lower scores represent better matching than higher scores.
- `irmsd` (interface rmsd) is a float value calculated for the backbone atoms of the interface residues (atomic contact cutoff of 10Å) after superposition of their equivalents in the predicted complex (model). Lower scores represent better matching than higher scores.
- `fnat` (fraction of native contacts) is the fraction of native interfacial contacts preserved in the interface of the predicted complex. The score is a float in the range [0, 1], where higher values represent higher quality.
- `dockq` (docking model quality) is a continuous quality measure for docking models, intended to be used instead of classifying models into discrete quality groups. It combines fnat, lrmsd, and irmsd and yields a float score in the range [0, 1], where higher values represent higher quality.
- `binary` (bool): True if the irmsd is lower than 4.0, meaning that the decoy is considered a high-quality docking model, otherwise False.
- `capri_class` (int): refers to the Critical Assessment of PRedicted Interactions (CAPRI) classification, in which the possible values are: 1 (high quality, irmsd < 1.0), 2 (medium, irmsd < 2.0), 3 (acceptable, irmsd < 4.0), 4 (incorrect, irmsd >= 4.0).

See https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.10393 for more details about `capri_class`, `lrmsd`, `irmsd`, and `fnat`. See https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0161879 for more details about `dockq`.

## Compute and add docking scores

The following code snippet shows an example of how to use deeprank2 to compute the docking scores for a given docking model, and how to add one of the scores (e.g., `dockq`) as a target to the already processed data.

```python
from deeprank2.tools.target import add_target, compute_ppi_scores

docking_models = [
"<path_to_docking_model1.pdb>",
"<path_to_docking_model2.pdb>"
]
ref_models = [
"<path_to_ref_model1.pdb>",
"<path_to_ref_model2.pdb>"
]

target_list = ""
for idx, _ in enumerate(docking_models):
scores = compute_ppi_scores(
docking_models[idx],
ref_models[idx])
dockq = scores['dockq']
target_list += f"query_id_model{idx} {dockq}\n"

with open("<path_to_target_list.lst>", "w", encoding="utf-8") as f:
f.write(target_list)

add_target("<path_to_hdf5_file.hdf5>", "dockq", "<path_to_target_list.lst>")

```

After having run the above code snippet, each processed data point within the indicated HDF5 file will contain a new Dataset called "dockq", containing the value computed through `compute_ppi_scores`.
11 changes: 8 additions & 3 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D re

Main features:

* Predefined atom-level and residue-level feature types (e.g. atom/residue type, charge, size, potential energy, all features' documentation is available `here`_)
* Predefined target types (binary class, CAPRI categories, DockQ, RMSD, and FNAT)
* Predefined atom-level and residue-level feature types (e.g. atom/residue type, charge, size, potential energy, all features' documentation is available under `Features`_ notes)
* Predefined target types (binary class, CAPRI categories, DockQ, RMSD, and FNAT; detailed docking scores documentation is available under `Docking scores`_ notes)
* Flexible definition of both new features and targets
* Features generation for both graphs and grids
* Efficient data storage in HDF5 format
Expand All @@ -20,7 +20,8 @@ Main features:
.. _DeepRank-Mut: https://github.com/DeepRank/DeepRank-Mut
.. _convolutional neural networks: https://en.wikipedia.org/wiki/Convolutional_neural_network
.. _graph neural networks: https://en.wikipedia.org/wiki/Graph_neural_network
.. _here: https://deeprank2.readthedocs.io/en/latest/features.html
.. _Features: https://deeprank2.readthedocs.io/en/latest/features.html
.. _Docking scores: https://deeprank2.readthedocs.io/en/latest/docking.html
.. _PyTorch: https://pytorch.org/docs/stable/index.html
.. _PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/

Expand Down Expand Up @@ -49,10 +50,14 @@ Notes
:hidden:

features
docking

:doc:`features`
Get a detailed overview about nodes' and edges' features implemented in the package.

:doc:`docking`
Get a detailed overview about PPIs' docking metrics implemented in the package.

Package reference
===========

Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,6 @@ Changelog = "https://github.com/DeepRank/deeprank2/blob/main/CHANGELOG.md"
branch = true
source = ["deeprank2"]

[tool.setuptools]
packages = ["deeprank2"]
[tool.setuptools.packages.find]
include = ["deeprank2*"]
exclude = ["tests*"]
8 changes: 4 additions & 4 deletions tests/data/hdf5/_generate_testdata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
" ProteinProteinInterfaceResidueQuery,\n",
" SingleResidueVariantResidueQuery,\n",
" ProteinProteinInterfaceAtomicQuery)\n",
"from deeprank2.tools.target import compute_targets\n",
"from deeprank2.tools.target import compute_ppi_scores\n",
"from deeprank2.dataset import save_hdf5_keys\n",
"from deeprank2.domain.aminoacidlist import alanine, phenylalanine\n",
"import glob\n",
Expand Down Expand Up @@ -64,7 +64,7 @@
"\n",
" for pdb_path in pdb_paths:\n",
" # Append data points\n",
" targets = compute_targets(pdb_path, ref_path)\n",
" targets = compute_ppi_scores(pdb_path, ref_path)\n",
" queries.add(ProteinProteinInterfaceResidueQuery(\n",
" pdb_path = pdb_path,\n",
" chain_id1 = chain_id1,\n",
Expand Down Expand Up @@ -219,7 +219,7 @@
"count_queries = 5\n",
"pdb_path = str(PATH_TEST / \"data/pdb/3C8P/3C8P.pdb\")\n",
"ref_path = str(PATH_TEST / \"data/ref/3C8P/3C8P.pdb\")\n",
"targets = compute_targets(pdb_path, ref_path)\n",
"targets = compute_ppi_scores(pdb_path, ref_path)\n",
"queries = QueryCollection()\n",
"\n",
"for number in range(1, count_queries + 1):\n",
Expand Down Expand Up @@ -269,7 +269,7 @@
"\n",
"for pdb_path in pdb_paths:\n",
" # Append data points\n",
" targets = compute_targets(pdb_path, ref_path)\n",
" targets = compute_ppi_scores(pdb_path, ref_path)\n",
" queries.add(ProteinProteinInterfaceAtomicQuery(\n",
" pdb_path = pdb_path,\n",
" chain_id1 = chain_id1,\n",
Expand Down
16 changes: 8 additions & 8 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@
from tempfile import mkdtemp

import h5py

from deeprank2.dataset import GraphDataset, GridDataset
from deeprank2.domain import edgestorage as Efeat
from deeprank2.domain import nodestorage as Nfeat
from deeprank2.domain import targetstorage as targets
from deeprank2.neuralnets.cnn.model3d import CnnClassification
from deeprank2.neuralnets.gnn.ginet import GINet
from deeprank2.query import (ProteinProteinInterfaceResidueQuery,
QueryCollection)
from deeprank2.tools.target import compute_targets
from deeprank2.tools.target import compute_ppi_scores
from deeprank2.trainer import Trainer
from deeprank2.utils.exporters import HDF5OutputExporter
from deeprank2.utils.grid import GridSettings, MapMethod

from deeprank2.domain import edgestorage as Efeat
from deeprank2.domain import nodestorage as Nfeat
from deeprank2.domain import targetstorage as targets

pdb_path = str("tests/data/pdb/3C8P/3C8P.pdb")
ref_path = str("tests/data/ref/3C8P/3C8P.pdb")
pssm_path1 = str("tests/data/pssm/3C8P/3C8P.A.pdb.pssm")
Expand All @@ -41,10 +41,10 @@ def test_cnn(): # pylint: disable=too-many-locals

prefix = os.path.join(hdf5_directory, "test-queries-process")

all_targets = compute_targets(pdb_path, ref_path)
all_targets = compute_ppi_scores(pdb_path, ref_path)

try:
all_targets = compute_targets(pdb_path, ref_path)
all_targets = compute_ppi_scores(pdb_path, ref_path)

queries = QueryCollection()
for _ in range(count_queries):
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_gnn(): # pylint: disable=too-many-locals
prefix = os.path.join(hdf5_directory, "test-queries-process")

try:
all_targets = compute_targets(pdb_path, ref_path)
all_targets = compute_ppi_scores(pdb_path, ref_path)

queries = QueryCollection()
for _ in range(count_queries):
Expand Down
10 changes: 5 additions & 5 deletions tests/test_querycollection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

import h5py
import pytest
from deeprank2.domain.aminoacidlist import alanine, phenylalanine
from deeprank2.query import (ProteinProteinInterfaceResidueQuery, Query,
QueryCollection, SingleResidueVariantResidueQuery)
from deeprank2.tools.target import compute_targets

from deeprank2.domain import edgestorage as Efeat
from deeprank2.domain import nodestorage as Nfeat
from deeprank2.domain.aminoacidlist import alanine, phenylalanine
from deeprank2.features import components, contact, surfacearea
from deeprank2.query import (ProteinProteinInterfaceResidueQuery, Query,
QueryCollection, SingleResidueVariantResidueQuery)
from deeprank2.tools.target import compute_ppi_scores


def _querycollection_tester( # pylint: disable = too-many-locals, dangerous-default-value
Expand Down Expand Up @@ -242,7 +242,7 @@ def test_querycollection_duplicates_add():

for pdb_path in pdb_paths:
# Append data points
targets = compute_targets(pdb_path, ref_path)
targets = compute_ppi_scores(pdb_path, ref_path)
queries.add(ProteinProteinInterfaceResidueQuery(
pdb_path = pdb_path,
chain_id1 = chain_id1,
Expand Down
41 changes: 37 additions & 4 deletions tests/tools/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import tempfile
import unittest

from deeprank2.tools.target import add_target, compute_targets
from pdb2sql import StructureSimilarity

from deeprank2.tools.target import add_target, compute_ppi_scores


class TestTools(unittest.TestCase):
def setUp(self):
self.pdb_path = "./tests/data/pdb/1ATN/"
self.pssm_path = "./tests/data/pssm/1ATN/1ATN.A.pdb.pssm"
self.ref = "./tests/data/ref/1ATN/"
self.h5_train_ref = "tests/data/train_ref/train_data.hdf5"
self.h5_graphs = "tests/data/hdf5/1ATN_ppi.hdf5"

def test_add_target(self):
Expand All @@ -33,10 +34,42 @@ def test_add_target(self):
os.remove(graph_path)


def test_compute_targets(self):
compute_targets("tests/data/pdb/1ATN/1ATN_1w.pdb", "tests/data/ref/1ATN/1ATN.pdb")
def test_compute_ppi_scores(self):
scores = compute_ppi_scores(
os.path.join(self.pdb_path, "1ATN_1w.pdb"),
os.path.join(self.ref, "1ATN.pdb"))

sim = StructureSimilarity(
os.path.join(self.pdb_path, "1ATN_1w.pdb"),
os.path.join(self.ref, "1ATN.pdb"), enforce_residue_matching=False)
lrmsd = sim.compute_lrmsd_fast(method="svd")
irmsd = sim.compute_irmsd_fast(method="svd")
fnat = sim.compute_fnat_fast()
dockq = sim.compute_DockQScore(fnat, lrmsd, irmsd)
binary = irmsd < 4.0
capri = 4
for thr, val in zip([6.0, 4.0, 2.0, 1.0], [4, 3, 2, 1]):
if irmsd < thr:
capri = val

assert scores['irmsd'] == irmsd
assert scores['lrmsd'] == lrmsd
assert scores['fnat'] == fnat
assert scores['dockq'] == dockq
assert scores['binary'] == binary
assert scores['capri_class'] == capri

def test_compute_ppi_scores_same_struct(self):
scores = compute_ppi_scores(
os.path.join(self.pdb_path, "1ATN_1w.pdb"),
os.path.join(self.pdb_path, "1ATN_1w.pdb"))

assert scores['irmsd'] == 0.0
assert scores['lrmsd'] == 0.0
assert scores['fnat'] == 1.0
assert scores['dockq'] == 1.0
assert scores['binary'] # True
assert scores['capri_class'] == 1


if __name__ == "__main__":
Expand Down

0 comments on commit ada1790

Please sign in to comment.