From fe4ebf8d6975670ff5c2d6c1205e4d373e211f1f Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 22 Sep 2023 02:45:39 +0200 Subject: [PATCH] update notebooks --- tests/data/hdf5/_generate_testdata.ipynb | 6 ++++-- tutorials/data_generation_ppi.ipynb | 15 ++++++++++----- tutorials/data_generation_srv.ipynb | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/data/hdf5/_generate_testdata.ipynb b/tests/data/hdf5/_generate_testdata.ipynb index 65a58720a..50e07f156 100644 --- a/tests/data/hdf5/_generate_testdata.ipynb +++ b/tests/data/hdf5/_generate_testdata.ipynb @@ -102,7 +102,8 @@ "models_folder_name = 'exp_nmers_all_HLA_quantitative'\n", "data = 'pMHCI'\n", "resolution = 'residue' # either 'residue' or 'atom'\n", - "distance_cutoff = 15 # max distance in Å between two interacting residues/atoms of two proteins\n", + "interaction_radius = 15 # max distance in Å between two interacting residues/atoms of two proteins\n", + "max_edge_distance = 15 # max distance in Å between two nodes to create an edge\n", "\n", "csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'\n", "models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'\n", @@ -130,7 +131,8 @@ " pdb_path = pdb_files[i],\n", " resolution = \"residue\",\n", " chain_ids = [\"M\", \"P\"],\n", - " distance_cutoff = distance_cutoff,\n", + " interaction_radius = interaction_radius,\n", + " max_edge_distance = max_edge_distance,\n", " targets = {\n", " 'binary': int(float(bas[i]) <= 500), # binary target value\n", " 'BA': bas[i], # continuous target value\n", diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 40a765eb9..f73800d57 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -162,8 +162,9 @@ "- A `.pdb` file, representing the protein-protein structural complex.\n", "- The resolution (`\"residue\"` or `\"atom\"`), i.e. 
whether each node should represent an amino acid residue or an atom.\n", "- The ids of the two chains composing the complex. In our use case, \"M\" indicates the MHC protein chain and \"P\" the peptide chain.\n", - "- The distance cutoff, which represents the maximum distance in Ångström between two interacting residues/atoms of the two proteins.\n", + "- The interaction radius, which determines the threshold distance (in Ångström) for residues/atoms surrounding the interface that will be included in the graph.\n", "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add two targets: \"BA\" and \"binary\". The first represents the actual BA value of the complex in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) a binding one.\n", + "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), in the form of .pssm files. PSSMs are optional and will not be used in this tutorial." 
] }, @@ -183,7 +184,8 @@ "source": [ "queries = QueryCollection()\n", "\n", - "interface_distance_cutoff = 8 # max distance in Å between two interacting residues/atoms of two proteins\n", + "interaction_radius = 8 # max distance in Å between two interacting residues/atoms of two proteins\n", + "max_edge_distance = 8\n", "\n", "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", "count = 0\n", @@ -193,7 +195,8 @@ "\t\t\tpdb_path = pdb_files[i],\n", "\t\t\tresolution = \"residue\",\n", "\t\t\tchain_ids = [\"M\", \"P\"],\n", - "\t\t\tdistance_cutoff = interface_distance_cutoff,\n", + "\t\t\tinteraction_radius = interaction_radius,\n", + "\t\t\tmax_edge_distance = max_edge_distance,\n", "\t\t\ttargets = {\n", "\t\t\t\t'binary': int(float(bas[i]) <= 500), # binary target value\n", "\t\t\t\t'BA': bas[i], # continuous target value\n", @@ -416,7 +419,8 @@ "source": [ "queries = QueryCollection()\n", "\n", - "interface_distance_cutoff = 5 # max distance in Å between two interacting residues/atoms of two proteins\n", + "interaction_radius = 5 # max distance in Å between two interacting residues/atoms of two proteins\n", + "max_edge_distance = 5\n", "\n", "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", "count = 0\n", @@ -426,7 +430,8 @@ "\t\t\tpdb_path = pdb_files[i],\n", "\t\t\tresolution = \"atom\",\n", "\t\t\tchain_ids = [\"M\",\"P\"],\n", - "\t\t\tdistance_cutoff = interface_distance_cutoff,\n", + "\t\t\tinteraction_radius = interaction_radius,\n", + "\t\t\tmax_edge_distance = max_edge_distance,\n", "\t\t\ttargets = {\n", "\t\t\t\t'binary': int(float(bas[i]) <= 500), # binary target value\n", "\t\t\t\t'BA': bas[i], # continuous target value\n", diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 1fa693011..423e8a0c4 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -186,9 +186,9 @@ "- The insertion code, used when two residues have the same 
numbering. The combination of residue numbering and insertion code defines the unique residue.\n", "- The wildtype amino acid. \n", "- The variant amino acid. \n", - "- The radius, which determines the threshold distance (in Ångström) for residues/atoms surrounding the mutation that will be included in the graph.\n", - "- The distance cutoff, which represents the maximum distance in Ångström between two interacting residues/atoms.\n", + "- The interaction radius, which determines the threshold distance (in Ångström) for residues/atoms surrounding the mutation that will be included in the graph.\n", "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add a 0 if the SRV belongs to the benign class, and 1 if it belongs to the pathogenic one. \n", + "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), per chain identifier, in the form of .pssm files. PSSMs are optional and will not be used in this tutorial." 
] }, @@ -208,8 +208,8 @@ "source": [ "queries = QueryCollection()\n", "\n", - "radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "distance_cutoff = 4.5 # ??\n", + "interaction_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_distance = 4.5 # max distance in Å between two nodes to generate an edge\n", "\n", "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", "count = 0\n", @@ -223,8 +223,8 @@ "\t\twildtype_amino_acid = aa_dict[res_wildtypes[i]],\n", "\t\tvariant_amino_acid = aa_dict[res_variants[i]],\n", "\t\ttargets = {'binary': targets[i]},\n", - "\t\tradius = radius,\n", - "\t\tdistance_cutoff = distance_cutoff,\n", + "\t\tinteraction_radius = interaction_radius,\n", + "\t\tmax_edge_distance = max_edge_distance,\n", "\t\t))\n", "\tcount +=1\n", "\tif count % 20 == 0:\n", @@ -452,8 +452,8 @@ "source": [ "queries = QueryCollection()\n", "\n", - "radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "distance_cutoff = 4.5 # ??\n", + "interaction_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_distance = 4.5 # max distance in Å between two nodes to generate an edge\n", "\n", "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", "count = 0\n", @@ -467,8 +467,8 @@ "\t\twildtype_amino_acid = aa_dict[res_wildtypes[i]],\n", "\t\tvariant_amino_acid = aa_dict[res_variants[i]],\n", "\t\ttargets = {'binary': targets[i]},\n", - "\t\tradius = radius,\n", - "\t\tdistance_cutoff = distance_cutoff,\n", + "\t\tinteraction_radius = interaction_radius,\n", + "\t\tmax_edge_distance = max_edge_distance,\n", "\t\t))\n", "\tcount +=1\n", "\tif count % 20 == 0:\n",