Skip to content

Commit

Permalink
Merge pull request #493 from DeepRank/perf_table_gcroci2
Browse files Browse the repository at this point in the history
docs: add performances table for deeprank2
  • Loading branch information
gcroci2 authored Sep 22, 2023
2 parents ada1790 + 0ac2e71 commit 9b50219
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ DeepRank2 extensive documentation can be found [here](https://deeprank2.rtfd.io/
- [GraphDataset](#graphdataset)
- [GridDataset](#griddataset)
- [Training](#training)
- [Computational performances](#computational-performances)
- [Package development](#package-development)

## Installation
Expand Down Expand Up @@ -313,6 +314,16 @@ trainer.test()

```

## Computational performances

We measured the efficiency of data generation in DeepRank2 using the tutorials' [PDB files](https://zenodo.org/record/8187806) (~100 data points per data set). Results are averaged over runs on an Apple M1 Pro, using a single CPU.
Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Å, radius (for SRV only) of 10 Å. The [features modules](https://deeprank2.readthedocs.io/en/latest/features.html) used were `components`, `contact`, `exposure`, `irc`, `secondary_structure`, `surfacearea`, for a total of 33 features for PPIs and 26 for SRVs (the latter do not use `irc` features).

| | Data processing speed <br />[seconds/structure] | Memory <br />[megabyte/structure] |
|------|:--------------------------------------------------------:|:--------------------------------------------------------:|
| PPIs | graph only: **2.99** (std 0.23) <br />graph+grid: **11.35** (std 1.30) | graph only: **0.54** (std 0.07) <br />graph+grid: **16.09** (std 0.44) |
| SRVs | graph only: **2.20** (std 0.08) <br />graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01) <br />graph+grid: **17.52** (std 0.59) |

## Package development

- Branching
Expand Down
94 changes: 94 additions & 0 deletions tests/perf/ppi_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# This script can be used for performance testing of the DeepRank2 package, using the PPI query classes.
import glob
import os
import time
from os import listdir
from os.path import isfile, join

import numpy
import pandas as pd

from deeprank2.features import (components, contact, exposure, irc,
secondary_structure, surfacearea)
from deeprank2.query import ProteinProteinInterfaceAtomicQuery, QueryCollection
from deeprank2.utils.grid import GridSettings, MapMethod

#################### PARAMETERS ####################
# Max distance in Å between two interacting residues/atoms of two proteins.
interface_distance_cutoff = 5.5
# Set both grid_settings and grid_map_method to None if you don't want grids
# (graphs only).
grid_settings = GridSettings(
    # the number of points on the x, y, z edges of the cube
    points_counts=[35, 30, 30],
    # x, y, z sizes of the box in Å
    sizes=[1.0, 1.0, 1.0])
grid_map_method = MapMethod.GAUSSIAN
# grid_settings = None
# grid_map_method = None
feature_modules = [components, contact, exposure, irc, secondary_structure, surfacearea]
cpu_count = 1
####################################################

data_path = os.path.join("data_raw", "ppi")
processed_data_path = os.path.join("data_processed", "ppi")

# Create the output directory if it does not exist yet (exist_ok avoids the
# separate os.path.exists check and is race-free).
os.makedirs(os.path.join(processed_data_path, "atomic"), exist_ok=True)

def get_pdb_files_and_target_data(data_path):
    """Collect PDB files and their binding-affinity (BA) target values.

    Reads ``BA_values.csv`` (columns: ``ID``, ``measurement_value``) from
    `data_path` and aligns each value with the PDB files found in
    ``data_path/pdb`` by PDB ID (file name without extension).

    Args:
        data_path: Folder containing ``BA_values.csv`` and a ``pdb`` subfolder.

    Returns:
        Tuple of (sorted list of PDB file paths, list of BA values in the
        same order).
    """
    csv_data = pd.read_csv(os.path.join(data_path, "BA_values.csv"))
    pdb_files = glob.glob(os.path.join(data_path, "pdb", '*.pdb'))
    pdb_files.sort()
    # os.path.basename/splitext instead of splitting on '/' keeps this
    # portable across OSes, consistently with the SRV performance script.
    pdb_ids_csv = [os.path.splitext(os.path.basename(pdb_file))[0] for pdb_file in pdb_files]
    csv_data_indexed = csv_data.set_index('ID')
    # Reorder the CSV rows so they are aligned with the sorted PDB files.
    csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]
    bas = csv_data_indexed.measurement_value.values.tolist()
    return pdb_files, bas


if __name__ == '__main__':
    # Performance measurement: process one structure at a time so that
    # per-structure timings can be collected.
    timings = []
    pdb_files, bas = get_pdb_files_and_target_data(data_path)

    for i, pdb_file in enumerate(pdb_files):
        queries = QueryCollection()
        queries.add(
            ProteinProteinInterfaceAtomicQuery(
                pdb_path=pdb_file,
                chain_id1="M",
                chain_id2="P",
                distance_cutoff=interface_distance_cutoff,
                targets={
                    'binary': int(float(bas[i]) <= 500),  # binary target value
                    'BA': bas[i],  # continuous target value
                }))

        # Time only the processing step (feature computation and, when grid
        # settings are given, grid mapping).
        start = time.perf_counter()
        queries.process(
            prefix=os.path.join(processed_data_path, "atomic", "proc"),
            feature_modules=feature_modules,
            cpu_count=cpu_count,
            combine_output=False,
            grid_settings=grid_settings,
            grid_map_method=grid_map_method)
        end = time.perf_counter()
        elapsed = end - start
        timings.append(elapsed)
        print(f'Elapsed time: {elapsed:.6f} seconds.\n')

    timings = numpy.array(timings)
    print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, "atomic")}.')
    print(f'Avg: {numpy.mean(timings):.6f} seconds.')
    print(f'Std: {numpy.std(timings):.6f} seconds.\n')

    # Report the on-disk size of each generated HDF5 file, in MB.
    proc_files_path = os.path.join(processed_data_path, "atomic")
    proc_files = [f for f in listdir(proc_files_path) if isfile(join(proc_files_path, f))]
    mem_sizes = []
    for proc_file in proc_files:
        file_size = os.path.getsize(os.path.join(proc_files_path, proc_file))
        mb_file_size = file_size / (10**6)
        print(f'Size of {proc_file}: {mb_file_size} MB.\n')
        mem_sizes.append(mb_file_size)
    mem_sizes = numpy.array(mem_sizes)
    print(f'Avg: {numpy.mean(mem_sizes):.6f} MB.')
    print(f'Std: {numpy.std(mem_sizes):.6f} MB.')
120 changes: 120 additions & 0 deletions tests/perf/srv_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# This script can be used for performance testing of the DeepRank2 package, using the SRV query classes.
import glob
import os
import time
from os import listdir
from os.path import isfile, join

import numpy
import pandas as pd

from deeprank2.domain.aminoacidlist import (alanine, arginine, asparagine,
aspartate, cysteine, glutamate,
glutamine, glycine, histidine,
isoleucine, leucine, lysine,
methionine, phenylalanine, proline,
serine, threonine, tryptophan,
tyrosine, valine)
from deeprank2.features import (components, contact, exposure, irc,
secondary_structure, surfacearea)
from deeprank2.query import QueryCollection, SingleResidueVariantResidueQuery
from deeprank2.utils.grid import GridSettings, MapMethod

# Map three-letter residue codes (as found in the target CSV) to the
# deeprank2 amino-acid objects.
aa_dict = {"ALA": alanine, "CYS": cysteine, "ASP": aspartate,
           "GLU": glutamate, "PHE": phenylalanine, "GLY": glycine,
           "HIS": histidine, "ILE": isoleucine, "LYS": lysine,
           "LEU": leucine, "MET": methionine, "ASN": asparagine,
           "PRO": proline, "GLN": glutamine, "ARG": arginine,
           "SER": serine, "THR": threonine, "VAL": valine,
           "TRP": tryptophan, "TYR": tyrosine
           }

#################### PARAMETERS ####################
# Radius in Å of the region around the variant residue included in the query.
radius = 10.0
# Max distance in Å between two interacting residues/atoms.
distance_cutoff = 5.5
# Set both grid_settings and grid_map_method to None if you don't want grids
# (graphs only).
grid_settings = GridSettings(
    # the number of points on the x, y, z edges of the cube
    points_counts=[35, 30, 30],
    # x, y, z sizes of the box in Å
    sizes=[1.0, 1.0, 1.0])
grid_map_method = MapMethod.GAUSSIAN
# grid_settings = None
# grid_map_method = None
feature_modules = [components, contact, exposure, irc, surfacearea, secondary_structure]
cpu_count = 1
####################################################

data_path = os.path.join("data_raw", "srv")
processed_data_path = os.path.join("data_processed", "srv")

# Create the output directory if it does not exist yet (exist_ok avoids the
# separate os.path.exists check and is race-free).
os.makedirs(os.path.join(processed_data_path, "atomic"), exist_ok=True)

def get_pdb_files_and_target_data(data_path):
    """Collect PDB files and single-residue-variant (SRV) target data.

    Reads ``srv_target_values.csv`` (columns: ``pdb_file``, ``res_number``,
    ``res_wildtype``, ``res_variant``, ``target``) from `data_path` and
    aligns its rows with the PDB files found in ``data_path/pdb``.

    Args:
        data_path: Folder containing ``srv_target_values.csv`` and a ``pdb``
            subfolder.

    Returns:
        Tuple of (PDB file paths, residue numbers, wild-type residue codes,
        variant residue codes, target values), all aligned by structure.
    """
    csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv"))
    # before running this script change .ent to .pdb
    pdb_files = glob.glob(os.path.join(data_path, "pdb", '*.pdb'))
    pdb_files.sort()
    pdb_id = [os.path.basename(pdb_file).split('.')[0] for pdb_file in pdb_files]
    csv_data['pdb_id'] = csv_data['pdb_file'].apply(lambda x: x.split('.')[0])
    csv_data_indexed = csv_data.set_index('pdb_id')
    # Reorder the CSV rows so they are aligned with the sorted PDB files.
    csv_data_indexed = csv_data_indexed.loc[pdb_id]
    res_numbers = csv_data_indexed.res_number.values.tolist()
    res_wildtypes = csv_data_indexed.res_wildtype.values.tolist()
    res_variants = csv_data_indexed.res_variant.values.tolist()
    targets = csv_data_indexed.target.values.tolist()
    pdb_names = csv_data_indexed.index.values.tolist()
    # Rebuild the paths with os.path.join (portable) instead of string
    # concatenation with '/'.
    pdb_files = [os.path.join(data_path, "pdb", f"{pdb_name}.pdb") for pdb_name in pdb_names]
    return pdb_files, res_numbers, res_wildtypes, res_variants, targets


if __name__ == '__main__':
    # Performance measurement: process one structure at a time so that
    # per-structure timings can be collected.
    timings = []
    pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)

    for i, pdb_file in enumerate(pdb_files):
        queries = QueryCollection()
        queries.add(
            SingleResidueVariantResidueQuery(
                pdb_path=pdb_file,
                chain_id="A",
                residue_number=res_numbers[i],
                insertion_code=None,
                wildtype_amino_acid=aa_dict[res_wildtypes[i]],
                variant_amino_acid=aa_dict[res_variants[i]],
                targets={'binary': targets[i]},
                radius=radius,
                distance_cutoff=distance_cutoff,
            ))

        # Time only the processing step (feature computation and, when grid
        # settings are given, grid mapping).
        start = time.perf_counter()
        queries.process(
            prefix=os.path.join(processed_data_path, "atomic", "proc"),
            feature_modules=feature_modules,
            cpu_count=cpu_count,
            combine_output=False,
            grid_settings=grid_settings,
            grid_map_method=grid_map_method)
        end = time.perf_counter()
        elapsed = end - start
        timings.append(elapsed)
        print(f'Elapsed time: {elapsed:.6f} seconds.\n')

    timings = numpy.array(timings)
    print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, "atomic")}.')
    print(f'Avg: {numpy.mean(timings):.6f} seconds.')
    print(f'Std: {numpy.std(timings):.6f} seconds.\n')

    # Report the on-disk size of each generated HDF5 file, in MB.
    proc_files_path = os.path.join(processed_data_path, "atomic")
    proc_files = [f for f in listdir(proc_files_path) if isfile(join(proc_files_path, f))]
    mem_sizes = []
    for proc_file in proc_files:
        file_size = os.path.getsize(os.path.join(proc_files_path, proc_file))
        mb_file_size = file_size / (10**6)
        print(f'Size of {proc_file}: {mb_file_size} MB.\n')
        mem_sizes.append(mb_file_size)
    mem_sizes = numpy.array(mem_sizes)
    print(f'Avg: {numpy.mean(mem_sizes):.6f} MB.')
    print(f'Std: {numpy.std(mem_sizes):.6f} MB.')

0 comments on commit 9b50219

Please sign in to comment.