From 9222f2a040933ace2672eb07e9e993108acb9a4e Mon Sep 17 00:00:00 2001 From: Carles Perez Date: Tue, 19 Feb 2019 18:55:10 +0100 Subject: [PATCH] selectOnPlot.py added to analysis package --- AdaptivePELE/analysis/selectOnPlot.py | 314 ++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 AdaptivePELE/analysis/selectOnPlot.py diff --git a/AdaptivePELE/analysis/selectOnPlot.py b/AdaptivePELE/analysis/selectOnPlot.py new file mode 100644 index 00000000..59107ee0 --- /dev/null +++ b/AdaptivePELE/analysis/selectOnPlot.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python +#title :selectOnPlot.py +#description :Generates a scatterplot where you can draw and select specific dots. +#author :Carles Perez Lopez +#date :20190219 +#python_version :3.6.5 +#============================================================================== + +from __future__ import absolute_import, division, print_function, unicode_literals +import os +import argparse +import glob +import multiprocessing as mp +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import shutil +from matplotlib.widgets import LassoSelector +from matplotlib.path import Path +from AdaptivePELE.atomset import RMSDCalculator, atomset +import AdaptivePELE.utilities.utilities as adapt_tools + + +def parseArguments(): + """ + Parse command line arguments + :returns: str, str str, str, str, bool, int, str, str, str, -- path to adaptive's results, + column to plot in the X axis, column to plot in the Y axis, column to plot in the Z axis, + path to the output's folder, whether to use a summary.csv already created, number of + processors, report prefix, trajectory prefix, separator of csvs + + """ + desc = "Generates a scatterplot of Adaptive's results given two or three columns (X, Y, and Z if set).\n" \ + "This plot allows the selection of desired points by drawing. Structures will be selected and \n" \ + "stored into an output folder. Additionally, a report file of this selected structures will be created. \n" \ + "To be run for example like: \n" \ + "\">python selectOnPlot.py /home/usr/adaptiveresults -xcol 'Binding Energy' -ycol epoch\"" + parser = argparse.ArgumentParser(description=desc) + required_named = parser.add_argument_group('required arguments') + required_named.add_argument("res_path", type=str, + help="Path to Adaptive results.") + parser.add_argument("-xcol", type=str, default="epoch", + help="Column name of the report file that will be used in the X axis.") + parser.add_argument("-ycol", type=str, default="Binding Energy", + help="Column name of the report file that will be used in the Y axis.") + parser.add_argument("-zcol", type=str, default=None, + help="If set, column name of the report file that will be used in the Z axis (colorbar).") + parser.add_argument("-outfol", type=str, default=None, + help="If set, path to the output's folder. By default it will be created in the Adaptive's \n" + "results path. WARNING: Take into account that if the folder already exists it will be \n" + "overwritten!!!") + parser.add_argument("-done", action="store_true", + help="If this is not the first time that you run this script, it is strongly recommended to \n" + "set this parameter on. If it is set, instead of looking all the reports and create a new\n" + "one, the script will use the summary csv of previous usages, saving computational time.") + parser.add_argument("-cpus", type=int, default=4, + help="Number of processors that you want to use in order to save time.") + parser.add_argument("-report", type=str, default="report_", + help="PELE's report prefix.") + parser.add_argument("-traj", type=str, default="trajectory_", + help="Adaptive's trajectory prefix.") + parser.add_argument("-sep", type=str, default=";", + help="Separator string that will be used in the CSV files.") + + args = parser.parse_args() + + return args.res_path, args.xcol, args.ycol, args.zcol, args.outfol, args.done, args.cpus, args.report, args.traj, \ + args.sep + + +class SelectFromCollection(object): + """Select indices from a matplotlib collection using `LassoSelector`. + + Selected indices are saved in the `ind` attribute. This tool fades out the + points that are not part of the selection (i.e., reduces their alpha + values). If your collection has alpha < 1, this tool will permanently + alter the alpha values. + + Note that this tool selects collection objects based on their *origins* + (i.e., `offsets`). + + Parameters + ---------- + ax : :class:`~matplotlib.axes.Axes` + Axes to interact with. + + collection : :class:`matplotlib.collections.Collection` subclass + Collection you want to select from. + + alpha_other : 0 <= float <= 1 + To highlight a selection, this tool sets all selected points to an + alpha value of 1 and non-selected points to `alpha_other`. + """ + + def __init__(self, ax, collection, alpha_other=0.1): + self.canvas = ax.figure.canvas + self.collection = collection + self.alpha_other = alpha_other + + self.xys = collection.get_offsets() + self.Npts = len(self.xys) + + # Ensure that we have separate colors for each object + self.fc = collection.get_facecolors() + if len(self.fc) == 0: + raise ValueError('Collection must have a facecolor') + elif len(self.fc) == 1: + self.fc = np.tile(self.fc, (self.Npts, 1)) + + self.lasso = LassoSelector(ax, onselect=self.onselect) + self.ind = [] + + def onselect(self, verts): + path = Path(verts) + self.ind = np.nonzero(path.contains_points(self.xys))[0] + self.fc[:, -1] = self.alpha_other + self.fc[self.ind, -1] = 1 + self.collection.set_facecolors(self.fc) + self.canvas.draw_idle() + + def disconnect(self): + self.lasso.disconnect_events() + self.fc[:, -1] = 1 + self.collection.set_facecolors(self.fc) + self.canvas.draw_idle() + + +def concat_reports_in_csv(adaptive_results_path, output_file_path, report_prefix="report_", + trajectory_prefix="trajectory_", separator_out=";"): + """ + It search report files in Adaptive's result folder and creates a csv file with everything concatenated, adding the + epoch and trajectory information. + :param adaptive_results_path: Path to the results folder of Adaptive. + :type adaptive_results_path: str + :param output_file_path: Path of the output file. + :type output_file_path: str + :param report_prefix: Prefix of PELE's reports. + :type report_prefix: str + :param trajectory_prefix: Prefix of PELE's trajectories. + :type trajectory_prefix: str + :param separator_out: Separator string used in the csv file. + :type separator_out: str + :return: Creates a csv file. + """ + dataframe_lists = [] + for adaptive_epoch in range(0, 2000): + folder = os.path.join(adaptive_results_path, str(adaptive_epoch)) + if os.path.exists(folder): + report_list = glob.glob("{}/*{}*".format(folder, report_prefix)) + report_list = sorted(report_list, key=lambda x: int(x.split("_")[-1])) + for n, report in enumerate(report_list): + pandas_df = pd.read_csv(report, sep=" ", engine="python", index_col=False, header=0) + pandas_df["epoch"] = adaptive_epoch + pandas_df["trajectory"] = glob.glob("{}/{}/*{}{}.*".format(adaptive_results_path, adaptive_epoch, + trajectory_prefix, n + 1))[0] + dataframe_lists.append(pandas_df) + else: + break + dataframe = pd.concat(dataframe_lists, ignore_index=True) + dataframe.to_csv(output_file_path, sep=separator_out, index=False) + + +def trajectory_and_snapshot_to_pdb(trajectory_path, snapshot, output_path): + """ + Given an absolute path to a trajectory of Adaptive and a snapshot (MODEL) in xtc format, the function transform it + into a PDB format. + :param trajectory_path: Absolute path to a trajectory from Adaptive, in xtc format. + :type trajectory_path:str + :param snapshot: model of a trajectory that you want to transform. + :type snapshot: int + :param output_path: output path of the new pdb file. + :type output_path: str + :return: Creates a PDB file. + """ + topology_path_splited = trajectory_path.split("/")[0:-2] + topology_path = os.path.join("/".join(topology_path_splited), "topology.pdb") + topology_contents = adapt_tools.getTopologyFile(topology_path) + trajectory = adapt_tools.getSnapshots(trajectory_path, topology=topology_path) + try: + single_model = trajectory[snapshot] + PDB = atomset.PDB() + PDB.initialise(single_model, topology=topology_contents) + except IndexError: + exit("You are selecting the model {} for a trajectory that has {} models, please, reselect the model index " + "(starting from 0).".format(snapshot, len(trajectory))) + with open(output_path, "w") as fw: + fw.write("MODEL %4d\n" % (snapshot + 1)) + fw.write(PDB.pdb) + fw.write("ENDMDL\n") + fw.write("END\n") + + +def get_pdb_from_xtc(row, pdbs_output_path, column_file="trajectory"): + """ + Given a row of a dataframe (expected to come from a csv report) and a column name (that must contain the path to + its correspondent trajectory), this function extract the file in PDB format in an output file. + :param row: row of a dataframe (Pandas object). + :type row: pandas.DataFrame + :param pdbs_output_path: output path for the PDB file. + :type pdbs_output_path: str + :param column_file: Column name of the dataframe that contains the path to the trajectory file. + :type column_file: str + :return: + """ + foldername = row[column_file] + filepath = glob.glob(foldername)[0] + epoch = filepath.split("/")[-2] + snapshot = row["numberOfAcceptedPeleSteps"] + new_file_name = os.path.basename(foldername.split("/")[-1]) + new_file_name = new_file_name.split(".")[0] + trajectory_and_snapshot_to_pdb(filepath, snapshot, os.path.join(pdbs_output_path, "{}_epoch_{}_snap_{}.pdb".format( + new_file_name, epoch, snapshot) + )) + print(os.path.join(pdbs_output_path, "{}_epoch_{}_snap_{}.pdb".format(new_file_name, epoch, snapshot))) + + +def get_pdbs_from_df_in_xtc(df, pdbs_output_path, processors=4, column_file="trajectory"): + """ + It uses the function "get_pdb_from_xtc" for a whole dataframe using multiprocessing. + :param df: Dataframe object (Pandas) + :type df: pandas.DataFrame + :param pdbs_output_path: Output path for PDB files. + :type pdbs_output_path: str + :param processors: Number of processes to do with multiprocessing. + :type processors: int + :param column_file: Column name of the dataframe that contains the path to the trajectory file. + :type column_file: str + :return: + """ + pool = mp.Pool(processes=processors) + multiprocessing_list = [] + for index, row in df.iterrows(): + multiprocessing_list.append(pool.apply_async(get_pdb_from_xtc, + (row, pdbs_output_path, column_file))) + for process in multiprocessing_list: + process.get() + + +def main(adaptive_results_folder, column_to_x="epoch", column_to_y="Binding Energy", column_to_z=None, + output_selection_folder=None, summary_done=False, processors=4, report_pref="report_", + trajectory_pref="trajectory_", separator=";", column_file="trajectory"): + """ + Generates a scatterplot of Adaptive's results given two or three columns (X, Y, and Z if set). + This plot allows the selection of desired points by drawing. Structures will be selected and + stored into an output folder. Additionally, a report file of this selected structures + will be created. + :param adaptive_results_folder: Path to Adaptive results. + :type adaptive_results_folder: str + :param column_to_x: Column name of the report file that will be used in the X axis. + :type column_to_x: str + :param column_to_y: Column name of the report file that will be used in the Y axis. + :type column_to_y: str + :param column_to_z: If set, column name of the report file that will be used in the Z axis (colorbar). + :type column_to_z: str + :param output_selection_folder: If set, path to the output's folder. By default it will be created in the + Adaptive's results path. WARNING: Take into account that if the folder already exists it will be overwritten!!! + :type output_selection_folder: str + :param summary_done: If it is set, instead of looking all the reports and create a new one, the script will use + the summary csv of previous usages, saving computational time." + :type summary_done: bool + :param processors: Number of processors that you want to use in order to save time. + :type processors: int + :param report_pref: PELE's report prefix. + :type report_pref: str + :param trajectory_pref: Adaptive's trajectory prefix. + :type trajectory_pref: str + :param separator: Separator string that will be used in the CSV files. + :type separator: str + :param column_file: Column name of the dataframe that contains the path to the trajectory file. + :type column_file: str + :return: + """ + summary_csv_filename = os.path.join(adaptive_results_folder, "summary.csv") + if not summary_done: + concat_reports_in_csv(adaptive_results_path=adaptive_results_folder, output_file_path=summary_csv_filename, + report_prefix=report_pref, trajectory_prefix=trajectory_pref, separator_out=separator) + dataframe = pd.read_csv(summary_csv_filename, sep=separator, engine='python', header=0) + fig, ax = plt.subplots() + if column_to_z: + pts = ax.scatter(dataframe[column_to_x], dataframe[column_to_y], c=dataframe[column_to_z], s=20) + else: + pts = ax.scatter(dataframe[column_to_x], dataframe[column_to_y], s=20) + selector = SelectFromCollection(ax, pts) + + def accept(event, output_selection_folder=output_selection_folder): + if event.key == "enter": + print("Selected points:") + df_select = dataframe.loc[selector.ind] + print(df_select) + if not output_selection_folder: + output_selection_folder = os.path.join(adaptive_results_folder, "selected_from_plot") + if not os.path.exists(output_selection_folder): + os.mkdir(output_selection_folder) + else: + shutil.rmtree(output_selection_folder) + os.mkdir(output_selection_folder) + df_select.to_csv(os.path.join(output_selection_folder, "selection_report.csv"), sep=separator, index=False) + get_pdbs_from_df_in_xtc(df_select, output_selection_folder, processors=processors, column_file=column_file) + selector.disconnect() + ax.set_title("") + fig.canvas.draw() + + fig.canvas.mpl_connect("key_press_event", accept) + ax.set_title("Press enter to accept selected points.") + ax.set_xlabel(column_to_x) + ax.set_ylabel(column_to_y) + plt.show() + + +if __name__ == '__main__': + res_path, xcol, ycol, zcol, outfol, done, cpus, report, traj, sep = parseArguments() + main(adaptive_results_folder=res_path, column_to_x=xcol, column_to_y=ycol, column_to_z=zcol, + output_selection_folder=outfol, summary_done=done, processors=cpus, report_pref=report, + trajectory_pref=traj, separator=sep)