Skip to content

Commit

Permalink
Merge pull request #493 from DeepRank/perf_table_gcroci2
Browse files Browse the repository at this point in the history
docs: add performances table for deeprank2
  • Loading branch information
gcroci2 authored Sep 22, 2023
2 parents ada1790 + 0ac2e71 commit 9b50219
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ DeepRank2 extensive documentation can be found [here](https://deeprank2.rtfd.io/
- [GraphDataset](#graphdataset)
- [GridDataset](#griddataset)
- [Training](#training)
- [Computational performances](#computational-performances)
- [Package development](#package-development)

## Installation
Expand Down Expand Up @@ -313,6 +314,16 @@ trainer.test()

```

## Computational performances

We measured the efficiency of data generation in DeepRank2 using the tutorials' [PDB files](https://zenodo.org/record/8187806) (~100 data points per data set). Results are averaged over runs on an Apple M1 Pro, using a single CPU.
Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Å, radius (for SRV only) of 10 Å. The [features modules](https://deeprank2.readthedocs.io/en/latest/features.html) used were `components`, `contact`, `exposure`, `irc`, `secondary_structure`, `surfacearea`, for a total of 33 features for PPIs and 26 for SRVs (the latter do not use `irc` features).

| | Data processing speed <br />[seconds/structure] | Memory <br />[megabyte/structure] |
|------|:--------------------------------------------------------:|:--------------------------------------------------------:|
| PPIs | graph only: **2.99** (std 0.23) <br />graph+grid: **11.35** (std 1.30) | graph only: **0.54** (std 0.07) <br />graph+grid: **16.09** (std 0.44) |
| SRVs | graph only: **2.20** (std 0.08) <br />graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01) <br />graph+grid: **17.52** (std 0.59) |

## Package development

- Branching
Expand Down
94 changes: 94 additions & 0 deletions tests/perf/ppi_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# This script can be used for performance testing of the DeepRank2 package, using the PPI query classes.
import glob
import os
import time
from os import listdir
from os.path import isfile, join

import numpy
import pandas as pd

from deeprank2.features import (components, contact, exposure, irc,
secondary_structure, surfacearea)
from deeprank2.query import ProteinProteinInterfaceAtomicQuery, QueryCollection
from deeprank2.utils.grid import GridSettings, MapMethod

#################### PARAMETERS ####################
# Max distance in Å between two interacting residues/atoms of two proteins.
interface_distance_cutoff = 5.5
# Set both grid_settings and grid_map_method to None if you don't want grids
# (graphs only).
grid_settings = GridSettings(
    # the number of points on the x, y, z edges of the cube
    points_counts=[35, 30, 30],
    # x, y, z sizes of the box in Å
    sizes=[1.0, 1.0, 1.0])
grid_map_method = MapMethod.GAUSSIAN
# grid_settings = None
# grid_map_method = None
feature_modules = [components, contact, exposure, irc, secondary_structure, surfacearea]
cpu_count = 1
####################################################

data_path = os.path.join("data_raw", "ppi")
processed_data_path = os.path.join("data_processed", "ppi")

# Create the output directory if it does not exist yet (exist_ok avoids the
# separate os.path.exists check and is race-free).
os.makedirs(os.path.join(processed_data_path, "atomic"), exist_ok=True)

def get_pdb_files_and_target_data(data_path):
    """Collect PDB files and their binding-affinity (BA) target values.

    Reads ``BA_values.csv`` (columns: ``ID``, ``measurement_value``) from
    `data_path` and aligns each value with the PDB files found in
    ``data_path/pdb`` by PDB ID (file name without extension).

    Args:
        data_path: Folder containing ``BA_values.csv`` and a ``pdb`` subfolder.

    Returns:
        Tuple of (sorted list of PDB file paths, list of BA values in the
        same order).
    """
    csv_data = pd.read_csv(os.path.join(data_path, "BA_values.csv"))
    pdb_files = glob.glob(os.path.join(data_path, "pdb", '*.pdb'))
    pdb_files.sort()
    # os.path.basename/splitext instead of splitting on '/' keeps this
    # portable across OSes, consistently with the SRV performance script.
    pdb_ids_csv = [os.path.splitext(os.path.basename(pdb_file))[0] for pdb_file in pdb_files]
    csv_data_indexed = csv_data.set_index('ID')
    # Reorder the CSV rows so they are aligned with the sorted PDB files.
    csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]
    bas = csv_data_indexed.measurement_value.values.tolist()
    return pdb_files, bas


if __name__ == '__main__':
    # Performance measurement: process one structure at a time so that
    # per-structure timings can be collected.
    timings = []
    pdb_files, bas = get_pdb_files_and_target_data(data_path)

    for i, pdb_file in enumerate(pdb_files):
        queries = QueryCollection()
        queries.add(
            ProteinProteinInterfaceAtomicQuery(
                pdb_path=pdb_file,
                chain_id1="M",
                chain_id2="P",
                distance_cutoff=interface_distance_cutoff,
                targets={
                    'binary': int(float(bas[i]) <= 500),  # binary target value
                    'BA': bas[i],  # continuous target value
                }))

        # Time only the processing step (feature computation and, when grid
        # settings are given, grid mapping).
        start = time.perf_counter()
        queries.process(
            prefix=os.path.join(processed_data_path, "atomic", "proc"),
            feature_modules=feature_modules,
            cpu_count=cpu_count,
            combine_output=False,
            grid_settings=grid_settings,
            grid_map_method=grid_map_method)
        end = time.perf_counter()
        elapsed = end - start
        timings.append(elapsed)
        print(f'Elapsed time: {elapsed:.6f} seconds.\n')

    timings = numpy.array(timings)
    print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, "atomic")}.')
    print(f'Avg: {numpy.mean(timings):.6f} seconds.')
    print(f'Std: {numpy.std(timings):.6f} seconds.\n')

    # Report the on-disk size of each generated HDF5 file, in MB.
    proc_files_path = os.path.join(processed_data_path, "atomic")
    proc_files = [f for f in listdir(proc_files_path) if isfile(join(proc_files_path, f))]
    mem_sizes = []
    for proc_file in proc_files:
        file_size = os.path.getsize(os.path.join(proc_files_path, proc_file))
        mb_file_size = file_size / (10**6)
        print(f'Size of {proc_file}: {mb_file_size} MB.\n')
        mem_sizes.append(mb_file_size)
    mem_sizes = numpy.array(mem_sizes)
    print(f'Avg: {numpy.mean(mem_sizes):.6f} MB.')
    print(f'Std: {numpy.std(mem_sizes):.6f} MB.')
120 changes: 120 additions & 0 deletions tests/perf/srv_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# This script can be used for performance testing of the DeepRank2 package, using the SRV query classes.
import glob
import os
import time
from os import listdir
from os.path import isfile, join

import numpy
import pandas as pd

from deeprank2.domain.aminoacidlist import (alanine, arginine, asparagine,
aspartate, cysteine, glutamate,
glutamine, glycine, histidine,
isoleucine, leucine, lysine,
methionine, phenylalanine, proline,
serine, threonine, tryptophan,
tyrosine, valine)
from deeprank2.features import (components, contact, exposure, irc,
secondary_structure, surfacearea)
from deeprank2.query import QueryCollection, SingleResidueVariantResidueQuery
from deeprank2.utils.grid import GridSettings, MapMethod

# Map three-letter residue codes (as found in the target CSV) to the
# deeprank2 amino-acid objects.
aa_dict = {"ALA": alanine, "CYS": cysteine, "ASP": aspartate,
           "GLU": glutamate, "PHE": phenylalanine, "GLY": glycine,
           "HIS": histidine, "ILE": isoleucine, "LYS": lysine,
           "LEU": leucine, "MET": methionine, "ASN": asparagine,
           "PRO": proline, "GLN": glutamine, "ARG": arginine,
           "SER": serine, "THR": threonine, "VAL": valine,
           "TRP": tryptophan, "TYR": tyrosine
           }

#################### PARAMETERS ####################
# Radius in Å of the region around the variant residue included in the query.
radius = 10.0
# Max distance in Å between two interacting residues/atoms.
distance_cutoff = 5.5
# Set both grid_settings and grid_map_method to None if you don't want grids
# (graphs only).
grid_settings = GridSettings(
    # the number of points on the x, y, z edges of the cube
    points_counts=[35, 30, 30],
    # x, y, z sizes of the box in Å
    sizes=[1.0, 1.0, 1.0])
grid_map_method = MapMethod.GAUSSIAN
# grid_settings = None
# grid_map_method = None
feature_modules = [components, contact, exposure, irc, surfacearea, secondary_structure]
cpu_count = 1
####################################################

data_path = os.path.join("data_raw", "srv")
processed_data_path = os.path.join("data_processed", "srv")

# Create the output directory if it does not exist yet (exist_ok avoids the
# separate os.path.exists check and is race-free).
os.makedirs(os.path.join(processed_data_path, "atomic"), exist_ok=True)

def get_pdb_files_and_target_data(data_path):
    """Collect PDB files and single-residue-variant (SRV) target data.

    Reads ``srv_target_values.csv`` (columns: ``pdb_file``, ``res_number``,
    ``res_wildtype``, ``res_variant``, ``target``) from `data_path` and
    aligns its rows with the PDB files found in ``data_path/pdb``.

    Args:
        data_path: Folder containing ``srv_target_values.csv`` and a ``pdb``
            subfolder.

    Returns:
        Tuple of (PDB file paths, residue numbers, wild-type residue codes,
        variant residue codes, target values), all aligned by structure.
    """
    csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv"))
    # before running this script change .ent to .pdb
    pdb_files = glob.glob(os.path.join(data_path, "pdb", '*.pdb'))
    pdb_files.sort()
    pdb_id = [os.path.basename(pdb_file).split('.')[0] for pdb_file in pdb_files]
    csv_data['pdb_id'] = csv_data['pdb_file'].apply(lambda x: x.split('.')[0])
    csv_data_indexed = csv_data.set_index('pdb_id')
    # Reorder the CSV rows so they are aligned with the sorted PDB files.
    csv_data_indexed = csv_data_indexed.loc[pdb_id]
    res_numbers = csv_data_indexed.res_number.values.tolist()
    res_wildtypes = csv_data_indexed.res_wildtype.values.tolist()
    res_variants = csv_data_indexed.res_variant.values.tolist()
    targets = csv_data_indexed.target.values.tolist()
    pdb_names = csv_data_indexed.index.values.tolist()
    # Rebuild the paths with os.path.join (portable) instead of string
    # concatenation with '/'.
    pdb_files = [os.path.join(data_path, "pdb", f"{pdb_name}.pdb") for pdb_name in pdb_names]
    return pdb_files, res_numbers, res_wildtypes, res_variants, targets


if __name__ == '__main__':
    # Performance measurement: process one structure at a time so that
    # per-structure timings can be collected.
    timings = []
    pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)

    for i, pdb_file in enumerate(pdb_files):
        queries = QueryCollection()
        queries.add(
            SingleResidueVariantResidueQuery(
                pdb_path=pdb_file,
                chain_id="A",
                residue_number=res_numbers[i],
                insertion_code=None,
                wildtype_amino_acid=aa_dict[res_wildtypes[i]],
                variant_amino_acid=aa_dict[res_variants[i]],
                targets={'binary': targets[i]},
                radius=radius,
                distance_cutoff=distance_cutoff,
            ))

        # Time only the processing step (feature computation and, when grid
        # settings are given, grid mapping).
        start = time.perf_counter()
        queries.process(
            prefix=os.path.join(processed_data_path, "atomic", "proc"),
            feature_modules=feature_modules,
            cpu_count=cpu_count,
            combine_output=False,
            grid_settings=grid_settings,
            grid_map_method=grid_map_method)
        end = time.perf_counter()
        elapsed = end - start
        timings.append(elapsed)
        print(f'Elapsed time: {elapsed:.6f} seconds.\n')

    timings = numpy.array(timings)
    print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, "atomic")}.')
    print(f'Avg: {numpy.mean(timings):.6f} seconds.')
    print(f'Std: {numpy.std(timings):.6f} seconds.\n')

    # Report the on-disk size of each generated HDF5 file, in MB.
    proc_files_path = os.path.join(processed_data_path, "atomic")
    proc_files = [f for f in listdir(proc_files_path) if isfile(join(proc_files_path, f))]
    mem_sizes = []
    for proc_file in proc_files:
        file_size = os.path.getsize(os.path.join(proc_files_path, proc_file))
        mb_file_size = file_size / (10**6)
        print(f'Size of {proc_file}: {mb_file_size} MB.\n')
        mem_sizes.append(mb_file_size)
    mem_sizes = numpy.array(mem_sizes)
    print(f'Avg: {numpy.mean(mem_sizes):.6f} MB.')
    print(f'Std: {numpy.std(mem_sizes):.6f} MB.')

0 comments on commit 9b50219

Please sign in to comment.