From 0651eb22d34e9420daba9dc2330f9b162fcb828d Mon Sep 17 00:00:00 2001 From: mdbarnesUCSD Date: Thu, 25 Jan 2024 09:24:39 -0800 Subject: [PATCH 1/4] Apply black automatic formatting --- SigProfilerExtractor/__init__.py | 2 +- .../estimate_best_solution.py | 190 +- SigProfilerExtractor/nmf_cpu.py | 158 +- SigProfilerExtractor/nmf_gpu.py | 150 +- SigProfilerExtractor/sigpro.py | 1289 ++++++---- SigProfilerExtractor/subroutines.py | 2164 +++++++++++------ install_genome.py | 10 +- setup.py | 128 +- test.py | 141 +- 9 files changed, 2710 insertions(+), 1522 deletions(-) diff --git a/SigProfilerExtractor/__init__.py b/SigProfilerExtractor/__init__.py index a66e36c..8f32b37 100644 --- a/SigProfilerExtractor/__init__.py +++ b/SigProfilerExtractor/__init__.py @@ -1 +1 @@ -from .version import short_version as __version__ \ No newline at end of file +from .version import short_version as __version__ diff --git a/SigProfilerExtractor/estimate_best_solution.py b/SigProfilerExtractor/estimate_best_solution.py index 7a885e5..f8a02a2 100644 --- a/SigProfilerExtractor/estimate_best_solution.py +++ b/SigProfilerExtractor/estimate_best_solution.py @@ -13,97 +13,127 @@ from SigProfilerExtractor import subroutines as sub -def estimate_solution(base_csvfile="All_solutions_stat.csv", - All_solution="All_Solutions", - genomes="Samples.txt", - output="results", - title="Selection_Plot", - stability=0.8, - min_stability=0.2, - combined_stability=1.0, - statistics=True, - select=None, - exome=False, - allow_stability_drop=False): - - - +def estimate_solution( + base_csvfile="All_solutions_stat.csv", + All_solution="All_Solutions", + genomes="Samples.txt", + output="results", + title="Selection_Plot", + stability=0.8, + min_stability=0.2, + combined_stability=1.0, + statistics=True, + select=None, + exome=False, + allow_stability_drop=False, +): base_csvfile = pd.read_csv(base_csvfile, sep=",", index_col=0) - signatures=list(base_csvfile.index) - genomes=pd.read_csv(genomes, sep="\t", index_col=0) - colnames=genomes.columns - genomes=np.array(genomes) - all_similarities_list=[] - layer_directory=output - - if genomes.shape[0]==78: - mtype="DBS78" - elif genomes.shape[0]==83: - mtype="ID83" - elif genomes.shape[0]==48: - mtype="CNV48" + signatures = list(base_csvfile.index) + genomes = pd.read_csv(genomes, sep="\t", index_col=0) + colnames = genomes.columns + genomes = np.array(genomes) + all_similarities_list = [] + layer_directory = output + + if genomes.shape[0] == 78: + mtype = "DBS78" + elif genomes.shape[0] == 83: + mtype = "ID83" + elif genomes.shape[0] == 48: + mtype = "CNV48" else: - mtype="SBS"+str(genomes.shape[0]) + mtype = "SBS" + str(genomes.shape[0]) - #set the squence type ("genome" or "exome") for selection criteria - if exome==False: - sequence="genome" - if exome==True: - sequence="exome" + # set the squence type ("genome" or "exome") for selection criteria + if exome == False: + sequence = "genome" + if exome == True: + sequence = "exome" - #prepare the csvfile - csvfile=np.zeros([len(signatures),4]) - csvfile=csvfile + # prepare the csvfile + csvfile = np.zeros([len(signatures), 4]) + csvfile = csvfile for i in range(len(signatures)): - if base_csvfile.shape[1]!=3: - signatures[i]=signatures[i].rstrip("*") - fnorm=base_csvfile.iloc[i,4].rstrip("%") - csvfile[i,[1,3]]=base_csvfile.iloc[i,[1,0]] - elif base_csvfile.shape[1]==3: - signatures[i]=str(signatures[i]) - fnorm=base_csvfile.iloc[i,1].astype(float) - fnorm=fnorm*100 - csvfile[i,[1,3]]=base_csvfile.iloc[i,[0,2]] - csvfile[i,0]=signatures[i] 
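# A minimal usage sketch of the estimate_solution API reformatted in this hunk.
# The result paths below are hypothetical placeholders for an earlier
# SigProfilerExtractor run; only the keyword names come from the signature above.
from SigProfilerExtractor import estimate_best_solution as ebs

best = ebs.estimate_solution(
    base_csvfile="results/SBS96/All_solutions_stat.csv",  # hypothetical path
    All_solution="results/SBS96/All_Solutions",           # hypothetical path
    genomes="Samples.txt",
    output="estimate_results",
    stability=0.8,
    min_stability=0.2,
    combined_stability=1.0,
    allow_stability_drop=False,
)
print("Selected solution:", best)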
- csvfile[i,2]=fnorm - w=pd.read_csv(All_solution+"/"+mtype+"_"+signatures[i]+"_Signatures/Signatures/"+mtype+"_S"+signatures[i]+"_Signatures.txt", sep="\t", index_col=0) - h=pd.read_csv(All_solution+"/"+mtype+"_"+signatures[i]+"_Signatures/Activities/"+mtype+"_S"+signatures[i]+"_NMF_Activities.txt", sep="\t", index_col=0) + if base_csvfile.shape[1] != 3: + signatures[i] = signatures[i].rstrip("*") + fnorm = base_csvfile.iloc[i, 4].rstrip("%") + csvfile[i, [1, 3]] = base_csvfile.iloc[i, [1, 0]] + elif base_csvfile.shape[1] == 3: + signatures[i] = str(signatures[i]) + fnorm = base_csvfile.iloc[i, 1].astype(float) + fnorm = fnorm * 100 + csvfile[i, [1, 3]] = base_csvfile.iloc[i, [0, 2]] + csvfile[i, 0] = signatures[i] + csvfile[i, 2] = fnorm + w = pd.read_csv( + All_solution + + "/" + + mtype + + "_" + + signatures[i] + + "_Signatures/Signatures/" + + mtype + + "_S" + + signatures[i] + + "_Signatures.txt", + sep="\t", + index_col=0, + ) + h = pd.read_csv( + All_solution + + "/" + + mtype + + "_" + + signatures[i] + + "_Signatures/Activities/" + + mtype + + "_S" + + signatures[i] + + "_NMF_Activities.txt", + sep="\t", + index_col=0, + ) w = np.array(w) h = np.array(h).T - est_genomes=np.dot(w,h) - all_similarities, cosine_similarities = sub.calculate_similarities(genomes, est_genomes, colnames) + est_genomes = np.dot(w, h) + all_similarities, cosine_similarities = sub.calculate_similarities( + genomes, est_genomes, colnames + ) all_similarities_list.append(all_similarities) - - csvfile=pd.DataFrame(csvfile) - csvfile.columns=["Total Signatures", "Stability", "Matrix Frobenius%", "avgStability"] - + + csvfile = pd.DataFrame(csvfile) + csvfile.columns = [ + "Total Signatures", + "Stability", + "Matrix Frobenius%", + "avgStability", + ] + try: if not os.path.exists(layer_directory): os.makedirs(layer_directory) - except: - print ("The {} folder could not be created".format("output")) - - solution, all_stats= sub.stabVsRError(csvfile, - layer_directory, - title, all_similarities_list, - input_type="dataframe", - stability=stability, - min_stability=min_stability, - combined_stability=combined_stability, - mtype=mtype, - statistics=statistics, - select=select, - sequence=sequence, - allow_stability_drop=allow_stability_drop) - - all_stats.insert(1, 'Stability (Avg Silhouette)', csvfile["avgStability"]) - all_stats=all_stats.set_index(["Signatures"]) - all_stats.to_csv(layer_directory+"/All_solutions_stat.csv", sep = ",") - #print("\nSelected Solution: ", solution) - - - - return(solution) + except: + print("The {} folder could not be created".format("output")) + solution, all_stats = sub.stabVsRError( + csvfile, + layer_directory, + title, + all_similarities_list, + input_type="dataframe", + stability=stability, + min_stability=min_stability, + combined_stability=combined_stability, + mtype=mtype, + statistics=statistics, + select=select, + sequence=sequence, + allow_stability_drop=allow_stability_drop, + ) + all_stats.insert(1, "Stability (Avg Silhouette)", csvfile["avgStability"]) + all_stats = all_stats.set_index(["Signatures"]) + all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",") + # print("\nSelected Solution: ", solution) + return solution diff --git a/SigProfilerExtractor/nmf_cpu.py b/SigProfilerExtractor/nmf_cpu.py index 25b6b32..eef6e2d 100644 --- a/SigProfilerExtractor/nmf_cpu.py +++ b/SigProfilerExtractor/nmf_cpu.py @@ -19,9 +19,19 @@ class NMF: - def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=1000, gpu_id=0, generator=None, - 
init_method='nndsvd', floating_point_precision='single', min_iterations=2000): - + def __init__( + self, + V, + rank, + max_iterations=200000, + tolerance=1e-8, + test_conv=1000, + gpu_id=0, + generator=None, + init_method="nndsvd", + floating_point_precision="single", + min_iterations=2000, + ): """ Run non-negative matrix factorisation using GPU. Uses beta-divergence. @@ -43,16 +53,16 @@ def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=100 min_iterations: the minimum number of iterations to execute before termination. Useful when using fp32 tensors as convergence can happen too early. """ - #torch.cuda.set_device(gpu_id) + # torch.cuda.set_device(gpu_id) - if floating_point_precision == 'single': + if floating_point_precision == "single": self._tensor_type = torch.FloatTensor self._np_dtype = np.float32 - elif floating_point_precision == 'double': + elif floating_point_precision == "double": self._tensor_type = torch.DoubleTensor self._np_dtype = np.float64 else: - raise ValueError("Precision needs to be either 'single' or 'double'." ) + raise ValueError("Precision needs to be either 'single' or 'double'.") self.max_iterations = max_iterations self.min_iterations = min_iterations @@ -62,12 +72,12 @@ def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=100 V = V[None, :, :] self._V = V.type(self._tensor_type) - self._fix_neg = nn.Threshold(0., 1e-8) + self._fix_neg = nn.Threshold(0.0, 1e-8) self._tolerance = tolerance self._prev_loss = None self._iter = 0 self._test_conv = test_conv - #self._gpu_id = gpu_id + # self._gpu_id = gpu_id self._rank = rank self._generator = generator self._W, self._H = self._initialise_wh(init_method) @@ -76,52 +86,72 @@ def _initialise_wh(self, init_method): """ Initialise basis and coefficient matrices according to `init_method` """ - if init_method == 'random': - W = torch.unsqueeze(torch.from_numpy(self._generator.random((self._V.shape[1],self._rank), dtype=np.float64)),0) - H = torch.unsqueeze(torch.from_numpy(self._generator.random((self._rank, self._V.shape[2]), dtype=np.float64)),0) + if init_method == "random": + W = torch.unsqueeze( + torch.from_numpy( + self._generator.random( + (self._V.shape[1], self._rank), dtype=np.float64 + ) + ), + 0, + ) + H = torch.unsqueeze( + torch.from_numpy( + self._generator.random( + (self._rank, self._V.shape[2]), dtype=np.float64 + ) + ), + 0, + ) if self._np_dtype is np.float32: W = W.float() H = H.float() return W, H - elif init_method == 'nndsvd': + elif init_method == "nndsvd": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 0}) - - elif init_method == 'nndsvda': + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 0} + ) + + elif init_method == "nndsvda": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 1}) + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 1} + ) - elif init_method == 'nndsvdar': + elif init_method == "nndsvdar": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, 
self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 2}) - elif init_method =='nndsvd_min': - W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) - H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) - nv = nndsvd.Nndsvd() - for i in range(self._V.shape[0]): - vin = np.mat(self._V.cpu().numpy()[i]) - w, h = nv.initialize(vin, self._rank, options={'flag': 2}) - min_X = np.min(vin[vin>0]) - h[h <= min_X] = min_X - w[w <= min_X] = min_X - #W= np.expand_dims(W, axis=0) - #H = np.expand_dims(H, axis=0) - W[i,:,:]=w - H[i,:,:]=h - #W,H=initialize_nm(vin, nfactors, init=init, eps=1e-6,random_state=None) + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 2} + ) + elif init_method == "nndsvd_min": + W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) + H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) + nv = nndsvd.Nndsvd() + for i in range(self._V.shape[0]): + vin = np.mat(self._V.cpu().numpy()[i]) + w, h = nv.initialize(vin, self._rank, options={"flag": 2}) + min_X = np.min(vin[vin > 0]) + h[h <= min_X] = min_X + w[w <= min_X] = min_X + # W= np.expand_dims(W, axis=0) + # H = np.expand_dims(H, axis=0) + W[i, :, :] = w + H[i, :, :] = h + # W,H=initialize_nm(vin, nfactors, init=init, eps=1e-6,random_state=None) W = torch.from_numpy(W).type(self._tensor_type) H = torch.from_numpy(H).type(self._tensor_type) return W, H @@ -142,13 +172,17 @@ def H(self): def conv(self): try: return self._conv - except: + except: return 0 @property def _kl_loss(self): # calculate kl_loss in double precision for better convergence criteria - return (self._V * (self._V / self.reconstruction).log()).sum(dtype=torch.float64) - self._V.sum(dtype=torch.float64) + self.reconstruction.sum(dtype=torch.float64) + return ( + (self._V * (self._V / self.reconstruction).log()).sum(dtype=torch.float64) + - self._V.sum(dtype=torch.float64) + + self.reconstruction.sum(dtype=torch.float64) + ) @property def generator(self): @@ -177,22 +211,33 @@ def fit(self, beta=1): beta == 0 => Itakura-Saito updates """ with torch.no_grad(): + def stop_iterations(): - stop = (self._V.shape[0] == 1) and \ - (self._iter % self._test_conv == 0) and \ - self._loss_converged and \ - (self._iter > self.min_iterations) + stop = ( + (self._V.shape[0] == 1) + and (self._iter % self._test_conv == 0) + and self._loss_converged + and (self._iter > self.min_iterations) + ) if stop: pass - #print("loss converged with {} iterations".format(self._iter)) - return [stop, self._iter] + # print("loss converged with {} iterations".format(self._iter)) + return [stop, self._iter] if beta == 2: for self._iter in range(self.max_iterations): - self._H = self.H * (self.W.transpose(1, 2) @ self._V) / (self.W.transpose(1, 2) @ (self.W @ self.H)) - self._W = self.W * (self._V @ self.H.transpose(1, 2)) / (self.W @ (self.H @ self.H.transpose(1, 2))) + self._H = ( + self.H + * (self.W.transpose(1, 2) @ self._V) + / (self.W.transpose(1, 2) @ (self.W @ self.H)) + ) + self._W = ( + self.W + * (self._V @ self.H.transpose(1, 2)) + / (self.W @ (self.H @ self.H.transpose(1, 2))) + ) if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break # Optimisations for the (common) beta=1 (KL) case. 
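The fit() method reformatted above applies the standard multiplicative-update rules for beta-divergence NMF: the beta == 2 branch is the Frobenius/Euclidean update, and the beta == 1 branch (continued in the next hunk) is the generalized Kullback-Leibler update whose objective _kl_loss monitors. For reference, a minimal NumPy sketch of the same two update rules on a single 2-D matrix, independent of the torch implementation and with illustrative names:

import numpy as np

def multiplicative_updates(V, rank, n_iter=200, beta=2, eps=1e-8, seed=0):
    # beta == 2 -> Frobenius updates, beta == 1 -> generalized KL updates,
    # mirroring the two branches of fit() above on plain NumPy arrays.
    rng = np.random.default_rng(seed)
    n, m = V.shape
    W = rng.random((n, rank)) + eps
    H = rng.random((rank, m)) + eps
    ones = np.ones_like(V)
    for _ in range(n_iter):
        if beta == 2:
            H *= (W.T @ V) / (W.T @ W @ H + eps)
            W *= (V @ H.T) / (W @ H @ H.T + eps)
        else:  # beta == 1 (KL)
            W *= ((V / (W @ H + eps)) @ H.T) / (ones @ H.T + eps)
            H *= (W.T @ (V / (W @ H + eps))) / (W.T @ ones + eps)
    return W, H

# Toy example: a 96 x 20 Poisson "mutation catalogue" factorised into 3 parts.
V = np.random.default_rng(1).poisson(10.0, size=(96, 20)).astype(float)
W, H = multiplicative_updates(V, rank=3, beta=1)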
@@ -210,16 +255,25 @@ def stop_iterations(): denomenator = wt @ ones self._H *= numerator / denomenator if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break else: for self._iter in range(self.max_iterations): - self._H = self.H * ((self.W.transpose(1, 2) @ (((self.W @ self.H) ** (beta - 2)) * self._V)) / - (self.W.transpose(1, 2) @ ((self.W @ self.H)**(beta-1)))) - self._W = self.W * ((((self.W@self.H)**(beta-2) * self._V) @ self.H.transpose(1, 2)) / - (((self.W @ self.H) ** (beta - 1)) @ self.H.transpose(1, 2))) + self._H = self.H * ( + ( + self.W.transpose(1, 2) + @ (((self.W @ self.H) ** (beta - 2)) * self._V) + ) + / (self.W.transpose(1, 2) @ ((self.W @ self.H) ** (beta - 1))) + ) + self._W = self.W * ( + ( + ((self.W @ self.H) ** (beta - 2) * self._V) + @ self.H.transpose(1, 2) + ) + / (((self.W @ self.H) ** (beta - 1)) @ self.H.transpose(1, 2)) + ) if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break - diff --git a/SigProfilerExtractor/nmf_gpu.py b/SigProfilerExtractor/nmf_gpu.py index 1884f7f..10f2394 100644 --- a/SigProfilerExtractor/nmf_gpu.py +++ b/SigProfilerExtractor/nmf_gpu.py @@ -12,9 +12,19 @@ class NMF: - def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=1000, gpu_id=0, generator=None, - init_method='nndsvd', floating_point_precision='single', min_iterations=2000): - + def __init__( + self, + V, + rank, + max_iterations=200000, + tolerance=1e-8, + test_conv=1000, + gpu_id=0, + generator=None, + init_method="nndsvd", + floating_point_precision="single", + min_iterations=2000, + ): """ Run non-negative matrix factorisation using GPU. Uses beta-divergence. @@ -38,15 +48,14 @@ def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=100 """ torch.cuda.set_device(gpu_id) - - if floating_point_precision == 'single': + if floating_point_precision == "single": self._tensor_type = torch.FloatTensor self._np_dtype = np.float32 - elif floating_point_precision == 'double': + elif floating_point_precision == "double": self._tensor_type = torch.DoubleTensor self._np_dtype = np.float64 else: - raise ValueError("Precision needs to be either 'single' or 'double'." 
) + raise ValueError("Precision needs to be either 'single' or 'double'.") self.max_iterations = max_iterations self.min_iterations = min_iterations @@ -56,7 +65,7 @@ def __init__(self, V, rank, max_iterations=200000, tolerance=1e-8, test_conv=100 V = V[None, :, :] self._V = V.type(self._tensor_type).cuda() - self._fix_neg = nn.Threshold(0., 1e-8) + self._fix_neg = nn.Threshold(0.0, 1e-8) self._tolerance = tolerance self._prev_loss = None self._iter = 0 @@ -70,52 +79,66 @@ def _initialise_wh(self, init_method): """ Initialise basis and coefficient matrices according to `init_method` """ - if init_method == 'random': - W = torch.from_numpy(self._generator.random((self._V.shape[0], self._V.shape[1],self._rank), dtype=np.float64)).cuda() - H = torch.from_numpy(self._generator.random((self._V.shape[0], self._rank, self._V.shape[2]), dtype=np.float64)).cuda() + if init_method == "random": + W = torch.from_numpy( + self._generator.random( + (self._V.shape[0], self._V.shape[1], self._rank), dtype=np.float64 + ) + ).cuda() + H = torch.from_numpy( + self._generator.random( + (self._V.shape[0], self._rank, self._V.shape[2]), dtype=np.float64 + ) + ).cuda() if self._np_dtype is np.float32: W = W.float() H = H.float() return W, H - elif init_method == 'nndsvd': + elif init_method == "nndsvd": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 0}) - - elif init_method == 'nndsvda': + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 0} + ) + + elif init_method == "nndsvda": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 1}) + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 1} + ) - elif init_method == 'nndsvdar': + elif init_method == "nndsvdar": + W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) + H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) + nv = nndsvd.Nndsvd() + for i in range(self._V.shape[0]): + vin = np.mat(self._V.cpu().numpy()[i]) + W[i, :, :], H[i, :, :] = nv.initialize( + vin, self._rank, options={"flag": 2} + ) + elif init_method == "nndsvd_min": W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) nv = nndsvd.Nndsvd() for i in range(self._V.shape[0]): vin = np.mat(self._V.cpu().numpy()[i]) - W[i,:,:], H[i,:,:] = nv.initialize(vin, self._rank, options={'flag': 2}) - elif init_method =='nndsvd_min': - W = np.zeros([self._V.shape[0], self._V.shape[1], self._rank]) - H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]]) - nv = nndsvd.Nndsvd() - for i in range(self._V.shape[0]): - vin = np.mat(self._V.cpu().numpy()[i]) - w, h = nv.initialize(vin, self._rank, options={'flag': 2}) - min_X = np.min(vin[vin>0]) - h[h <= min_X] = min_X - w[w <= min_X] = min_X - #W= np.expand_dims(W, axis=0) - #H = np.expand_dims(H, axis=0) - W[i,:,:]=w - H[i,:,:]=h - #W,H=initialize_nm(vin, nfactors, init=init, eps=1e-6,random_state=None) + w, h = nv.initialize(vin, self._rank, options={"flag": 2}) + min_X = np.min(vin[vin > 0]) + h[h <= min_X] = min_X + w[w <= min_X] = min_X 
+ # W= np.expand_dims(W, axis=0) + # H = np.expand_dims(H, axis=0) + W[i, :, :] = w + H[i, :, :] = h + # W,H=initialize_nm(vin, nfactors, init=init, eps=1e-6,random_state=None) W = torch.from_numpy(W).type(self._tensor_type).cuda(self._gpu_id) H = torch.from_numpy(H).type(self._tensor_type).cuda(self._gpu_id) return W, H @@ -131,7 +154,7 @@ def W(self): @property def H(self): return self._H - + @property def conv(self): try: @@ -146,7 +169,11 @@ def generator(self): @property def _kl_loss(self): # calculate kl_loss in double precision for better convergence criteria - return (self._V * (self._V / self.reconstruction).log()).sum(dtype=torch.float64) - self._V.sum(dtype=torch.float64) + self.reconstruction.sum(dtype=torch.float64) + return ( + (self._V * (self._V / self.reconstruction).log()).sum(dtype=torch.float64) + - self._V.sum(dtype=torch.float64) + + self.reconstruction.sum(dtype=torch.float64) + ) @property def _loss_converged(self): @@ -171,27 +198,40 @@ def fit(self, beta=1): beta == 0 => Itakura-Saito updates """ with torch.no_grad(): + def stop_iterations(): - stop = (self._V.shape[0] == 1) and \ - (self._iter % self._test_conv == 0) and \ - self._loss_converged and \ - (self._iter > self.min_iterations) + stop = ( + (self._V.shape[0] == 1) + and (self._iter % self._test_conv == 0) + and self._loss_converged + and (self._iter > self.min_iterations) + ) if stop: pass - #print("loss converged with {} iterations".format(self._iter)) + # print("loss converged with {} iterations".format(self._iter)) return [stop, self._iter] if beta == 2: for self._iter in range(self.max_iterations): - self._H = self.H * (self.W.transpose(1, 2) @ self._V) / (self.W.transpose(1, 2) @ (self.W @ self.H)) - self._W = self.W * (self._V @ self.H.transpose(1, 2)) / (self.W @ (self.H @ self.H.transpose(1, 2))) + self._H = ( + self.H + * (self.W.transpose(1, 2) @ self._V) + / (self.W.transpose(1, 2) @ (self.W @ self.H)) + ) + self._W = ( + self.W + * (self._V @ self.H.transpose(1, 2)) + / (self.W @ (self.H @ self.H.transpose(1, 2))) + ) if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break # Optimisations for the (common) beta=1 (KL) case. 
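# A minimal sketch (assumed, not taken verbatim from this patch) of the CUDA
# setup this GPU variant relies on: pick a device id, confirm a GPU exists (the
# same check sigpro.py performs before using this class), then keep V and the
# ones tensor of the beta == 1 branch on that device, as in the code above.
import torch

gpu_id = 0
if torch.cuda.device_count() == 0:
    raise RuntimeError("GPU not available!")
torch.cuda.set_device(gpu_id)
V = torch.rand(1, 96, 20).type(torch.FloatTensor).cuda(gpu_id)
ones = torch.ones(V.shape).type(torch.FloatTensor).cuda(gpu_id)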
elif beta == 1: - ones = torch.ones(self._V.shape).type(self._tensor_type).cuda(self._gpu_id) + ones = ( + torch.ones(self._V.shape).type(self._tensor_type).cuda(self._gpu_id) + ) for self._iter in range(self.max_iterations): ht = self.H.transpose(1, 2) numerator = (self._V / (self.W @ self.H)) @ ht @@ -204,15 +244,25 @@ def stop_iterations(): denomenator = wt @ ones self._H *= numerator / denomenator if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break else: for self._iter in range(self.max_iterations): - self._H = self.H * ((self.W.transpose(1, 2) @ (((self.W @ self.H) ** (beta - 2)) * self._V)) / - (self.W.transpose(1, 2) @ ((self.W @ self.H)**(beta-1)))) - self._W = self.W * ((((self.W@self.H)**(beta-2) * self._V) @ self.H.transpose(1, 2)) / - (((self.W @ self.H) ** (beta - 1)) @ self.H.transpose(1, 2))) + self._H = self.H * ( + ( + self.W.transpose(1, 2) + @ (((self.W @ self.H) ** (beta - 2)) * self._V) + ) + / (self.W.transpose(1, 2) @ ((self.W @ self.H) ** (beta - 1))) + ) + self._W = self.W * ( + ( + ((self.W @ self.H) ** (beta - 2) * self._V) + @ self.H.transpose(1, 2) + ) + / (((self.W @ self.H) ** (beta - 1)) @ self.H.transpose(1, 2)) + ) if stop_iterations()[0]: - self._conv=stop_iterations()[1] + self._conv = stop_iterations()[1] break diff --git a/SigProfilerExtractor/sigpro.py b/SigProfilerExtractor/sigpro.py index 71fac19..21d330d 100644 --- a/SigProfilerExtractor/sigpro.py +++ b/SigProfilerExtractor/sigpro.py @@ -18,12 +18,14 @@ """ import os -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" -os.environ["OMP_NUM_THREADS"] = "1" + +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" +os.environ["OMP_NUM_THREADS"] = "1" import matplotlib.pyplot as plt -plt.switch_backend('agg') + +plt.switch_backend("agg") import scipy from scipy import io as sio @@ -36,11 +38,13 @@ import datetime import psutil import copy -import sigProfilerPlotting +import sigProfilerPlotting import multiprocessing from SigProfilerExtractor import subroutines as sub import SigProfilerMatrixGenerator -from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as datadump +from SigProfilerMatrixGenerator.scripts import ( + SigProfilerMatrixGeneratorFunc as datadump, +) from SigProfilerMatrixGenerator.scripts import SVMatrixGenerator as sv from SigProfilerMatrixGenerator.scripts import CNVMatrixGenerator as scna import multiprocessing as mp @@ -49,440 +53,549 @@ from SigProfilerAssignment import single_sample as spasub from SigProfilerAssignment import decomposition as decomp from numpy.random import SeedSequence + MUTTYPE = "MutationType" + def memory_usage(): pid = os.getpid() py = psutil.Process(pid) - memoryUse1 = py.memory_info()[0]/2.**30 # memory use in GB...I think - print('\n************** Reported Current Memory Use: '+ str(round(memoryUse1,2))+" GB *****************\n") + memoryUse1 = py.memory_info()[0] / 2.0**30 # memory use in GB...I think + print( + "\n************** Reported Current Memory Use: " + + str(round(memoryUse1, 2)) + + " GB *****************\n" + ) + def importdata(datatype="matrix"): - """ Imports the path of example data. - + parameters ---------- - + datatype: A string. Type of data. The type of data should be one of the following: - "vcf": used for vcf format data. - - "matrix": used for text format data. This format represents the catalog of mutations seperated by tab. + - "matrix": used for text format data. 
This format represents the catalog of mutations seperated by tab. - "matobj": used for matlab object format data. - - - + + + Returns: ------- The path of the example data. - Example: + Example: ------- >>> from SigProfilerExtractor import sigpro as sig >>> data = sig.importdata("table") - + This "data" variable can be used as a parameter of the "project" argument of the sigProfilerExtractor function - + """ - + paths = cosmic.__path__[0] - if datatype=="matobj": - data = paths+"/data/MatObjInput/21_breast_WGS_substitutions.mat" - elif datatype=="text" or datatype=="table" or datatype=="matrix": - data = paths+"/data/TextInput/Samples_SBS.txt" - elif datatype=="matrix_DBS": - data = paths+"/data/TextInput/Samples_DBS.txt" - elif datatype=="matrix_ID": - data = paths+"/data/TextInput/Samples_ID.txt" - elif datatype=="matrix_CNV": - data = paths+"/data/TextInput/Samples_CNV.txt" - elif datatype=="csv": - data = paths+"/data/CSVInput/csv_example.csv" - elif datatype=="seg:BATTENBERG": - data = paths+"/data/CNVInput/Battenberg_test.tsv" - elif datatype=="matrix_SV": - data = paths+"/data/TextInput/Samples_SV.txt" - elif datatype=="vcf": - data = paths+"/data/VCFInput/" + if datatype == "matobj": + data = paths + "/data/MatObjInput/21_breast_WGS_substitutions.mat" + elif datatype == "text" or datatype == "table" or datatype == "matrix": + data = paths + "/data/TextInput/Samples_SBS.txt" + elif datatype == "matrix_DBS": + data = paths + "/data/TextInput/Samples_DBS.txt" + elif datatype == "matrix_ID": + data = paths + "/data/TextInput/Samples_ID.txt" + elif datatype == "matrix_CNV": + data = paths + "/data/TextInput/Samples_CNV.txt" + elif datatype == "csv": + data = paths + "/data/CSVInput/csv_example.csv" + elif datatype == "seg:BATTENBERG": + data = paths + "/data/CNVInput/Battenberg_test.tsv" + elif datatype == "matrix_SV": + data = paths + "/data/TextInput/Samples_SV.txt" + elif datatype == "vcf": + data = paths + "/data/VCFInput/" return data - + + def record_parameters(sysdata, execution_parameters, start_time): """ Extracts mutational signatures from an array of samples. 
- + """ sysdata.write("\n--------------EXECUTION PARAMETERS--------------\n") sysdata.write("INPUT DATA\n") sysdata.write("\tinput_type: {}\n".format(execution_parameters["input_type"])) sysdata.write("\toutput: {}\n".format(execution_parameters["output"])) sysdata.write("\tinput_data: {}\n".format(execution_parameters["input_data"])) - sysdata.write("\treference_genome: {}\n".format(execution_parameters["reference_genome"])) + sysdata.write( + "\treference_genome: {}\n".format(execution_parameters["reference_genome"]) + ) sysdata.write("\tcontext_types: {}\n".format(execution_parameters["context_type"])) sysdata.write("\texome: {}\n".format(execution_parameters["exome"])) sysdata.write("NMF REPLICATES\n") - sysdata.write("\tminimum_signatures: {}\n".format(execution_parameters["minimum_signatures"])) - sysdata.write("\tmaximum_signatures: {}\n".format(execution_parameters["maximum_signatures"])) - sysdata.write("\tNMF_replicates: {}\n".format(execution_parameters["NMF_replicates"])) + sysdata.write( + "\tminimum_signatures: {}\n".format(execution_parameters["minimum_signatures"]) + ) + sysdata.write( + "\tmaximum_signatures: {}\n".format(execution_parameters["maximum_signatures"]) + ) + sysdata.write( + "\tNMF_replicates: {}\n".format(execution_parameters["NMF_replicates"]) + ) sysdata.write("NMF ENGINE\n") sysdata.write("\tNMF_init: {}\n".format(execution_parameters["NMF_init"])) sysdata.write("\tprecision: {}\n".format(execution_parameters["precision"])) - sysdata.write("\tmatrix_normalization: {}\n".format(execution_parameters["matrix_normalization"])) + sysdata.write( + "\tmatrix_normalization: {}\n".format( + execution_parameters["matrix_normalization"] + ) + ) sysdata.write("\tresample: {}\n".format(execution_parameters["resample"])) sysdata.write("\tseeds: {}\n".format(execution_parameters["seeds"])) - sysdata.write("\tmin_NMF_iterations: {}\n".format(format(execution_parameters["min_NMF_iterations"],',d'))) - sysdata.write("\tmax_NMF_iterations: {}\n".format(format(execution_parameters["max_NMF_iterations"], ',d'))) - sysdata.write("\tNMF_test_conv: {}\n".format(format(execution_parameters["NMF_test_conv"],',d'))) + sysdata.write( + "\tmin_NMF_iterations: {}\n".format( + format(execution_parameters["min_NMF_iterations"], ",d") + ) + ) + sysdata.write( + "\tmax_NMF_iterations: {}\n".format( + format(execution_parameters["max_NMF_iterations"], ",d") + ) + ) + sysdata.write( + "\tNMF_test_conv: {}\n".format( + format(execution_parameters["NMF_test_conv"], ",d") + ) + ) sysdata.write("\tNMF_tolerance: {}\n".format(execution_parameters["NMF_tolerance"])) sysdata.write("CLUSTERING\n") sysdata.write("\tclustering_distance: {}\n".format(execution_parameters["dist"])) sysdata.write("EXECUTION\n") - if execution_parameters["cpu"]==-1: - sysdata.write("\tcpu: {}; Maximum number of CPU is {}\n".format(multiprocessing.cpu_count(), multiprocessing.cpu_count())) + if execution_parameters["cpu"] == -1: + sysdata.write( + "\tcpu: {}; Maximum number of CPU is {}\n".format( + multiprocessing.cpu_count(), multiprocessing.cpu_count() + ) + ) else: - sysdata.write("\tcpu: {}; Maximum number of CPU is {}\n".format(execution_parameters["cpu"], multiprocessing.cpu_count())) + sysdata.write( + "\tcpu: {}; Maximum number of CPU is {}\n".format( + execution_parameters["cpu"], multiprocessing.cpu_count() + ) + ) sysdata.write("\tgpu: {}\n".format(execution_parameters["gpu"])) sysdata.write("Solution Estimation\n") sysdata.write("\tstability: {}\n".format(execution_parameters["stability"])) 
sysdata.write("\tmin_stability: {}\n".format(execution_parameters["min_stability"])) - sysdata.write("\tcombined_stability: {}\n".format(execution_parameters["combined_stability"])) - sysdata.write("\tallow_stability_drop: {}\n".format(execution_parameters["allow_stability_drop"])) - + sysdata.write( + "\tcombined_stability: {}\n".format(execution_parameters["combined_stability"]) + ) + sysdata.write( + "\tallow_stability_drop: {}\n".format( + execution_parameters["allow_stability_drop"] + ) + ) + sysdata.write("COSMIC MATCH\n") - sysdata.write("\topportunity_genome: {}\n".format(execution_parameters["opportunity_genome"])) - sysdata.write("\tcosmic_version: {}\n".format(execution_parameters["cosmic_version"])) - sysdata.write("\tnnls_add_penalty: {}\n".format(execution_parameters["nnls_add_penalty"])) - sysdata.write("\tnnls_remove_penalty: {}\n".format(execution_parameters["nnls_remove_penalty"])) - sysdata.write("\tinitial_remove_penalty: {}\n".format(execution_parameters["initial_remove_penalty"])) - sysdata.write("\texport_probabilities: {}\n".format(execution_parameters["export_probabilities"])) - sysdata.write("\tcollapse_to_SBS96: {}\n".format(execution_parameters["collapse_to_SBS96"])) - + sysdata.write( + "\topportunity_genome: {}\n".format(execution_parameters["opportunity_genome"]) + ) + sysdata.write( + "\tcosmic_version: {}\n".format(execution_parameters["cosmic_version"]) + ) + sysdata.write( + "\tnnls_add_penalty: {}\n".format(execution_parameters["nnls_add_penalty"]) + ) + sysdata.write( + "\tnnls_remove_penalty: {}\n".format( + execution_parameters["nnls_remove_penalty"] + ) + ) + sysdata.write( + "\tinitial_remove_penalty: {}\n".format( + execution_parameters["initial_remove_penalty"] + ) + ) + sysdata.write( + "\texport_probabilities: {}\n".format( + execution_parameters["export_probabilities"] + ) + ) + sysdata.write( + "\tcollapse_to_SBS96: {}\n".format(execution_parameters["collapse_to_SBS96"]) + ) + sysdata.write("\n-------Analysis Progress------- \n") sysdata.write("[{}] Analysis started: \n".format(str(start_time).split(".")[0])) - -def sigProfilerExtractor(input_type, - output, - input_data, - reference_genome="GRCh37", - opportunity_genome = "GRCh37", - cosmic_version=3.4, - context_type = "default", - exome = False, - minimum_signatures=1, - maximum_signatures=25, - nmf_replicates=100, - resample = True, - batch_size=1, - cpu=-1, - gpu=False, - nmf_init="random", - precision= "single", - matrix_normalization= "gmm", - seeds= "random", - min_nmf_iterations= 10000, - max_nmf_iterations=1000000, - nmf_test_conv= 10000, - nmf_tolerance= 1e-15, - nnls_add_penalty=0.05, - nnls_remove_penalty=0.01, - initial_remove_penalty=0.05, - collapse_to_SBS96=True, - clustering_distance="cosine", - export_probabilities=True, - make_decomposition_plots=True, - stability=0.8, - min_stability=0.2, - combined_stability=1.0, - allow_stability_drop=False, - get_all_signature_matrices= False): + + +def sigProfilerExtractor( + input_type, + output, + input_data, + reference_genome="GRCh37", + opportunity_genome="GRCh37", + cosmic_version=3.4, + context_type="default", + exome=False, + minimum_signatures=1, + maximum_signatures=25, + nmf_replicates=100, + resample=True, + batch_size=1, + cpu=-1, + gpu=False, + nmf_init="random", + precision="single", + matrix_normalization="gmm", + seeds="random", + min_nmf_iterations=10000, + max_nmf_iterations=1000000, + nmf_test_conv=10000, + nmf_tolerance=1e-15, + nnls_add_penalty=0.05, + nnls_remove_penalty=0.01, + initial_remove_penalty=0.05, + 
collapse_to_SBS96=True, + clustering_distance="cosine", + export_probabilities=True, + make_decomposition_plots=True, + stability=0.8, + min_stability=0.2, + combined_stability=1.0, + allow_stability_drop=False, + get_all_signature_matrices=False, +): """ Extracts mutational signatures from an array of samples. - - + + Parameters ---------- - + INPUT DATA:- - + input_type: A string. Type of input. The type of input should be one of the following: - "vcf": used for vcf format inputs. - "matrix": used for table format inputs using a tab seperated file. - - - output: A string. The name of the output folder. The output folder will be generated in the current working directory. - - input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "table" type input). The project file or folder should be inside the current working directory. For the "vcf" type input,the project has to be a folder which will contain the vcf files in vcf format or text formats. The "text"type projects have to be a file. - + + + output: A string. The name of the output folder. The output folder will be generated in the current working directory. + + input_data: A string. Name of the input folder (in case of "vcf" type input) or the input file (in case of "table" type input). The project file or folder should be inside the current working directory. For the "vcf" type input,the project has to be a folder which will contain the vcf files in vcf format or text formats. The "text"type projects have to be a file. + reference_genome: A string, optional. The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf". - + opportunity_genome: The build or version of the reference genome for the reference signatures. The default opportunity genome is GRCh37. If the input_type is "vcf", the opportunity_genome automatically matches the input reference genome value. Only the genomes available in COSMIC are supported (GRCh37, GRCh38, mm9, mm10 and rn6). If a different opportunity genome is selected, the default genome GRCh37 will be used. - - context_type: A list of strings, optional. The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is "SBS96,DBS78,ID83". - + + context_type: A list of strings, optional. The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is "SBS96,DBS78,ID83". + exome: Boolean, optional. Defines if the exomes will be extracted. The default value is "False". - - + + NMF RUNS:- - - minimum_signature: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1 - + + minimum_signature: A positive integer, optional. The minimum number of signatures to be extracted. The default value is 1 + maximum_signatures: A positive integer, optional. The maximum number of signatures to be extracted. The default value is 10 - + nmf_replicates: A positive integer, optional. The number of iteration to be performed to extract each number signature. The default value is 100 - - resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling. - + + resample: Boolean, optional. Default is True. If True, add poisson noise to samples by resampling. + seeds: Boolean. Default is "random". If random, then the seeds for resampling will be random for different analysis. 
- If not random, then seeds will be obtained from a given path of a .txt file that contains a list of seed. - + If not random, then seeds will be obtained from a given path of a .txt file that contains a list of seed. + NMF RUNS:- - - matrix_normalization: A string. Method of normalizing the genome matrix before it is analyzed by NMF. Default is "log2". Other options are "gmm", "100X" or "no_normalization". - + + matrix_normalization: A string. Method of normalizing the genome matrix before it is analyzed by NMF. Default is "log2". Other options are "gmm", "100X" or "no_normalization". + nmf_init: A String. The initialization algorithm for W and H matrix of NMF. Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min' Default is 'nndsvd_min'. - + precision: A string. Values should be single or double. Default is single. - + min_nmf_iterations: An integer. Value defines the minimum number of iterations to be completed before NMF converges. Default is 2000. - + max_nmf_iterations: An integer. Value defines the maximum number of iterations to be completed before NMF converges. Default is 200000 - + nmf_test_conv: An integer. Value definer the number number of iterations to done between checking next convergence. - - nmf_tolerance: A float. Value defines the tolerance to achieve to converge. - - + + nmf_tolerance: A float. Value defines the tolerance to achieve to converge. + + EXECUTION:- - - cpu: An integer, optional. The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. - - gpu:Boolean, optional. Defines if the GPU resource will used if available. Default is False. If True, the GPU resource + + cpu: An integer, optional. The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. + + gpu:Boolean, optional. Defines if the GPU resource will used if available. Default is False. If True, the GPU resource will be used in the computation. batch_size: An integer. Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing. Default is 1. - - + + SOLUTION ESTIMATION THRESH-HOLDS:- - stability: A float. Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. + stability: A float. Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. - min_stability: A float. Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. + min_stability: A float. Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. + + combined_stability: A float. Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. - combined_stability: A float. Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. - allow_stability_drop: Boolean, optional. Default is False. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. 
- + DECOMPOSITION:- - - nnls_add_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the strong (add) thresh-hold cutoff to be assigned signatures to a sample. - + + nnls_add_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the strong (add) thresh-hold cutoff to be assigned signatures to a sample. + nnls_remove_penalty: Float, optional. Takes any positive float. Default is 0.01. Defines the weak (remove) thresh-hold cutoff to be assigned signatures to a sample. - + initial_remove_penalty: Float, optional. Takes any positive float. Default is 0.05. Defines the initial weak (remove) thresh-hold cutoff to be assigned COSMIC signatures to a sample. - + refit_denovo_signatures: Boolean, optional. Default is False. If True, then refit the denovo signatures with nnls. - + make_decomposition_plots: Boolean, optional. Defualt is True. If True, Denovo to Cosmic sigantures decompostion plots will be created as a part the results. - + OTHERS:- - + get_all_signature_matrices: A Boolean. If true, the Ws and Hs from all the NMF iterations are generated in the output. - + export_probabilities: A Boolean. Defualt is True. If False, then doesn't create the probability matrix. - - + + Returns ------- To learn about the output, please visit https://osf.io/t6j7u/wiki/home/ - - + + Examples -------- - + Examples -------- >>> from SigProfilerExtractor import sigpro as sig - + # to get input from vcf files >>> path_to_example_folder_containing_vcf_files = sig.importdata("vcf") >>> data = path_to_example_folder_containing_vcf_files # you can put the path to your folder containing the vcf samples >>> sig.sigProfilerExtractor("vcf", "example_output", data, minimum_signatures=1, maximum_signatures=3) - + Wait untill the excecution is finished. The process may a couple of hours based on the size of the data. Check the current working directory for the "example_output" folder. - + # to get input from table format (mutation catalog matrix) >>> path_to_example_table = sig.importdata("matrix") >>> data = path_to_example_table # you can put the path to your tab delimited file containing the mutational catalog matrix/table >>> sig.sigProfilerExtractor("matrix", "example_output", data, opportunity_genome="GRCh38", minimum_signatures=1, maximum_signatures=3) - + Wait untill the excecution is finished. The process may a couple of hours based on the size of the data. Check the results in the "example_output" folder. 
""" memory_usage() - #record the start time + # record the start time start_time = datetime.datetime.now() - - #set the output variable - out_put = output - + + # set the output variable + out_put = output + if gpu == True: import torch - + if gpu and (torch.cuda.device_count() == 0): raise RuntimeError("GPU not available!") - - + #################################### At first create the system data file #################################### if not os.path.exists(out_put): os.makedirs(out_put) - sysdata = open(out_put+"/JOB_METADATA.txt", "w") + sysdata = open(out_put + "/JOB_METADATA.txt", "w") sysdata.write("THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME\n\n\n") sysdata.write("-------System Info-------\n") - sysdata.write("Operating System Name: "+ platform.uname()[0]+"\n"+"Nodename: "+platform.uname()[1]+"\n"+"Release: "+platform.uname()[2]+"\n"+"Version: "+platform.uname()[3]+"\n") + sysdata.write( + "Operating System Name: " + + platform.uname()[0] + + "\n" + + "Nodename: " + + platform.uname()[1] + + "\n" + + "Release: " + + platform.uname()[2] + + "\n" + + "Version: " + + platform.uname()[3] + + "\n" + ) sysdata.write("\n-------Python and Package Versions------- \n") - sysdata.write("Python Version: "+str(platform.sys.version_info.major)+"."+str(platform.sys.version_info.minor)+"."+str(platform.sys.version_info.micro)+"\n") - sysdata.write("SigProfilerExtractor Version: "+cosmic.__version__+"\n") - sysdata.write("SigProfilerPlotting Version: "+sigProfilerPlotting.__version__+"\n") - sysdata.write("SigProfilerMatrixGenerator Version: "+SigProfilerMatrixGenerator.__version__+"\n") - sysdata.write("SigProfilerAssignment Version: "+spa.__version__+"\n") - sysdata.write("Pandas version: "+pd.__version__+"\n") - sysdata.write("Numpy version: "+np.__version__+"\n") - sysdata.write("Scipy version: "+scipy.__version__+"\n") - sysdata.write("Scikit-learn version: "+sklearn.__version__+"\n") - - #format the project_name first: - project = input_data #will use this variable as the parameter for project argument in SigprofilerMatrixGenerator + sysdata.write( + "Python Version: " + + str(platform.sys.version_info.major) + + "." + + str(platform.sys.version_info.minor) + + "." 
+ + str(platform.sys.version_info.micro) + + "\n" + ) + sysdata.write("SigProfilerExtractor Version: " + cosmic.__version__ + "\n") + sysdata.write( + "SigProfilerPlotting Version: " + sigProfilerPlotting.__version__ + "\n" + ) + sysdata.write( + "SigProfilerMatrixGenerator Version: " + + SigProfilerMatrixGenerator.__version__ + + "\n" + ) + sysdata.write("SigProfilerAssignment Version: " + spa.__version__ + "\n") + sysdata.write("Pandas version: " + pd.__version__ + "\n") + sysdata.write("Numpy version: " + np.__version__ + "\n") + sysdata.write("Scipy version: " + scipy.__version__ + "\n") + sysdata.write("Scikit-learn version: " + sklearn.__version__ + "\n") + + # format the project_name first: + project = input_data # will use this variable as the parameter for project argument in SigprofilerMatrixGenerator try: if project[-1] != "/": - project_name = project.split("/")[-1] #will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator + project_name = project.split("/")[ + -1 + ] # will use this variable as the parameter for project_name argument in SigprofilerMatrixGenerator else: project_name = project.split("/")[-2] except: project_name = "Input from DataFrame" - execution_parameters= {"input_type":input_type, - "output":output, - "input_data":input_data, - "reference_genome":reference_genome, - "opportunity_genome":opportunity_genome, - "cosmic_version":cosmic_version, - "context_type":context_type, - "exome":exome, - "minimum_signatures":minimum_signatures, - "maximum_signatures":maximum_signatures, - "NMF_replicates":nmf_replicates, - "cpu":cpu, - "gpu":gpu, - "batch_size":batch_size, - "NMF_init":nmf_init, - "precision":precision, - "matrix_normalization":matrix_normalization, - "resample":resample, - "seeds":seeds, - "min_NMF_iterations":min_nmf_iterations, - "max_NMF_iterations":max_nmf_iterations, - "NMF_test_conv": nmf_test_conv, - "NMF_tolerance": nmf_tolerance, - "nnls_add_penalty":nnls_add_penalty, - "nnls_remove_penalty":nnls_remove_penalty, - "initial_remove_penalty":initial_remove_penalty, - "collapse_to_SBS96":collapse_to_SBS96, - "dist":clustering_distance, - "export_probabilities":export_probabilities, - "make_decompostion_plots":make_decomposition_plots, - "stability":stability, - "min_stability":min_stability, - "combined_stability":combined_stability, - "allow_stability_drop":allow_stability_drop, - "get_all_signature_matrices":get_all_signature_matrices} - + execution_parameters = { + "input_type": input_type, + "output": output, + "input_data": input_data, + "reference_genome": reference_genome, + "opportunity_genome": opportunity_genome, + "cosmic_version": cosmic_version, + "context_type": context_type, + "exome": exome, + "minimum_signatures": minimum_signatures, + "maximum_signatures": maximum_signatures, + "NMF_replicates": nmf_replicates, + "cpu": cpu, + "gpu": gpu, + "batch_size": batch_size, + "NMF_init": nmf_init, + "precision": precision, + "matrix_normalization": matrix_normalization, + "resample": resample, + "seeds": seeds, + "min_NMF_iterations": min_nmf_iterations, + "max_NMF_iterations": max_nmf_iterations, + "NMF_test_conv": nmf_test_conv, + "NMF_tolerance": nmf_tolerance, + "nnls_add_penalty": nnls_add_penalty, + "nnls_remove_penalty": nnls_remove_penalty, + "initial_remove_penalty": initial_remove_penalty, + "collapse_to_SBS96": collapse_to_SBS96, + "dist": clustering_distance, + "export_probabilities": export_probabilities, + "make_decompostion_plots": make_decomposition_plots, + "stability": stability, + 
"min_stability": min_stability, + "combined_stability": combined_stability, + "allow_stability_drop": allow_stability_drop, + "get_all_signature_matrices": get_all_signature_matrices, + } + ################################ take the inputs from the general optional arguments #################################### - startProcess = minimum_signatures - endProcess = maximum_signatures - mtype = context_type - wall = get_all_signature_matrices - add_penalty = nnls_add_penalty - remove_penalty= nnls_remove_penalty - genome_build = opportunity_genome - refgen = reference_genome - - #set the sequence type ("genome" or "exome") for selection criteria and tmb plot inside the make_final_solution function - if exome==False: - sequence="genome" - if exome==True: - sequence="exome" - + startProcess = minimum_signatures + endProcess = maximum_signatures + mtype = context_type + wall = get_all_signature_matrices + add_penalty = nnls_add_penalty + remove_penalty = nnls_remove_penalty + genome_build = opportunity_genome + refgen = reference_genome + + # set the sequence type ("genome" or "exome") for selection criteria and tmb plot inside the make_final_solution function + if exome == False: + sequence = "genome" + if exome == True: + sequence = "exome" + # Use a SeedSequence to create generators for random number generation - if seeds=="random": + if seeds == "random": execution_parameters["seeds"] = seeds tmp_seed = SeedSequence().entropy - seed = np.array(tmp_seed) - seeds = pd.DataFrame([tmp_seed], columns=["Seed"]) - seeds.to_csv(out_put+"/Seeds.txt", sep="\t", quoting=None) + seed = np.array(tmp_seed) + seeds = pd.DataFrame([tmp_seed], columns=["Seed"]) + seeds.to_csv(out_put + "/Seeds.txt", sep="\t", quoting=None) else: try: execution_parameters["seeds"] = seeds - seeds = pd.read_csv(seeds,sep="\t", index_col=0) - seeds.to_csv(out_put+"/Seeds.txt", sep="\t") - seed = np.array(seeds["Seed"]) + seeds = pd.read_csv(seeds, sep="\t", index_col=0) + seeds.to_csv(out_put + "/Seeds.txt", sep="\t") + seed = np.array(seeds["Seed"]) except: raise ValueError("Please set valid seeds") - - if input_type=="text" or input_type =="table" or input_type=="matrix": - + + if input_type == "text" or input_type == "table" or input_type == "matrix": ################################### For text input files ###################################################### text_file = project - title = "" # set the title for plotting - - if type(text_file)!=str: - data=text_file - execution_parameters["input_data"]="Matrix["+str(data.shape[0])+" rows X "+str(data.shape[1])+ " columns]" + title = "" # set the title for plotting + + if type(text_file) != str: + data = text_file + execution_parameters["input_data"] = ( + "Matrix[" + + str(data.shape[0]) + + " rows X " + + str(data.shape[1]) + + " columns]" + ) else: - data = pd.read_csv(text_file, sep="\t").iloc[:,:] - - if data.shape[0]==48: + data = pd.read_csv(text_file, sep="\t").iloc[:, :] + + if data.shape[0] == 48: paths = cosmic.__path__[0] - feature_map=pd.read_csv(paths + "/data/ReferenceFiles/" + \ - "CN_classes_dictionary.txt", sep="\t", header=None) - feature_order=pd.read_csv(paths + "/data/ReferenceFiles/" + \ - "CNV_features.tsv", sep="\t", header=None) - if list(data.iloc[:,0])==list(feature_order[0]): + feature_map = pd.read_csv( + paths + "/data/ReferenceFiles/" + "CN_classes_dictionary.txt", + sep="\t", + header=None, + ) + feature_order = pd.read_csv( + paths + "/data/ReferenceFiles/" + "CNV_features.tsv", + sep="\t", + header=None, + ) + if list(data.iloc[:, 0]) == 
list(feature_order[0]): pass else: - orderlist1=list(feature_map[0]) - orderlist2=list(feature_order[0]) - #sort the MutationType first step - data["MutationType"]= pd.Categorical(data["MutationType"], orderlist1) + orderlist1 = list(feature_map[0]) + orderlist2 = list(feature_order[0]) + # sort the MutationType first step + data["MutationType"] = pd.Categorical(data["MutationType"], orderlist1) data = data.sort_values("MutationType") data = data.reset_index() - data = data.drop(columns='index') - #sort the MutationType second step - data["MutationType"]=feature_map[1] - data["MutationType"]= pd.Categorical(data["MutationType"], orderlist2) + data = data.drop(columns="index") + # sort the MutationType second step + data["MutationType"] = feature_map[1] + data["MutationType"] = pd.Categorical(data["MutationType"], orderlist2) data = data.sort_values("MutationType") - + data = data.dropna(axis=1, inplace=False) data = data.loc[:, (data != 0).any(axis=0)] - genomes = data.iloc[:,1:] + genomes = data.iloc[:, 1:] genomes = np.array(genomes) - allgenomes = genomes.copy() # save the allgenomes for the final results - #Contruct the indeces of the matrix - #setting index and columns names of processAvg and exposureAvg - index = data.iloc[:,0] - colnames = data.columns[1:] - allcolnames = colnames.copy() # save the allcolnames for the final results - - #creating list of mutational type to sync with the vcf type input + allgenomes = genomes.copy() # save the allgenomes for the final results + # Contruct the indeces of the matrix + # setting index and columns names of processAvg and exposureAvg + index = data.iloc[:, 0] + colnames = data.columns[1:] + allcolnames = colnames.copy() # save the allcolnames for the final results + + # creating list of mutational type to sync with the vcf type input mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": mtypes = ["DBS78"] @@ -490,20 +603,20 @@ def sigProfilerExtractor(input_type, mtypes = ["ID83"] elif mtypes[0] == "48": mtypes = ["CNV48"] - elif mtypes[0]=="32": + elif mtypes[0] == "32": mtypes = ["SV32"] - elif mtypes[0]=="96" or "288" or "384" or "1536" or "4608": - mtypes = ["SBS"+mtypes[0]] + elif mtypes[0] == "96" or "288" or "384" or "1536" or "4608": + mtypes = ["SBS" + mtypes[0]] else: - mtypes = ["CH"+mtypes[0]] + mtypes = ["CH" + mtypes[0]] - elif input_type=="csv": + elif input_type == "csv": ################################# For CSV input files ####################################################### filename = project - title = "" # set the title for plotting - genomes, index, colnames, mtypes = sub.read_csv(filename) + title = "" # set the title for plotting + genomes, index, colnames, mtypes = sub.read_csv(filename) allgenomes = genomes.copy() - allcolnames = colnames.copy() + allcolnames = colnames.copy() # Define the mtypes mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": @@ -511,50 +624,59 @@ def sigProfilerExtractor(input_type, elif mtypes[0] == "83": mtypes = ["ID"] - elif input_type=="matobj": + elif input_type == "matobj": ################################# For matlab input files ####################################################### mat_file = project - title = "" # set the title for plotting - mat = sio.loadmat(mat_file) - mat = sub.extract_input(mat) - genomes = mat[1] - allgenomes = genomes.copy() # save the allgenomes for the final results - - #Contruct the indeces of the matrix - #setting index and columns names of processAvg and exposureAvg + title = "" # set the title for plotting + mat = sio.loadmat(mat_file) + mat = 
sub.extract_input(mat) + genomes = mat[1] + allgenomes = genomes.copy() # save the allgenomes for the final results + + # Contruct the indeces of the matrix + # setting index and columns names of processAvg and exposureAvg index1 = mat[3] index2 = mat[4] - index = [] + index = [] for i, j in zip(index1, index2): - index.append(i[0]+"["+j+"]"+i[2]) - colnames = np.array(pd.Series(mat[2])) - allcolnames = colnames.copy() # save the allcolnames for the final results - index = np.array(pd.Series(index)) - - #creating list of mutational type to sync with the vcf type input + index.append(i[0] + "[" + j + "]" + i[2]) + colnames = np.array(pd.Series(mat[2])) + allcolnames = colnames.copy() # save the allcolnames for the final results + index = np.array(pd.Series(index)) + + # creating list of mutational type to sync with the vcf type input mtypes = [str(genomes.shape[0])] if mtypes[0] == "78": mtypes = ["DINUC"] elif mtypes[0] == "83": mtypes = ["ID"] - elif input_type=="vcf": + elif input_type == "vcf": ################################# For vcf input files ####################################################### - title = project # set the title for plotting - data = datadump.SigProfilerMatrixGeneratorFunc(project_name, refgen, project, exome=exome, bed_file=None, chrom_based=False, plot=False, gs=False) + title = project # set the title for plotting + data = datadump.SigProfilerMatrixGeneratorFunc( + project_name, + refgen, + project, + exome=exome, + bed_file=None, + chrom_based=False, + plot=False, + gs=False, + ) # Selecting the MutationType if mtype == ["default"]: - mtypes = ["SBS96", "DBS78", "ID83"] + mtypes = ["SBS96", "DBS78", "ID83"] elif mtype == "default": - mtypes = ["SBS96", "DBS78", "ID83"] + mtypes = ["SBS96", "DBS78", "ID83"] else: - #mkeys = data.keys() + # mkeys = data.keys() mtype = mtype.upper() mtype = mtype.replace(" ", "") mtypes = mtype.split(",") - #set the genome_build - genome_build=refgen - elif input_type.lower()=="bedpe": + # set the genome_build + genome_build = refgen + elif input_type.lower() == "bedpe": ##################### For SV's bedpe input files ##################### # create a directory to write the output matrices to title = project @@ -567,7 +689,7 @@ def sigProfilerExtractor(input_type, colnames = genomes.columns allcolnames = colnames.copy() allgenomes = genomes.copy() - elif input_type.split(":")[0].lower()=="seg": # seg + elif input_type.split(":")[0].lower() == "seg": # seg ################# For CNV's segmentation input files ################# title = project mtypes = ["CNV48"] @@ -575,237 +697,363 @@ def sigProfilerExtractor(input_type, # cnv_outputs = os.path.join(os.path.split(input_data)[0], "CNV_Matrices") # SV input processing, execution parameters - #project needs to be something NOT a file - genomes = scna.generateCNVMatrix(cnv_file_type, input_data, cnv_file_type, output) + # project needs to be something NOT a file + genomes = scna.generateCNVMatrix( + cnv_file_type, input_data, cnv_file_type, output + ) if MUTTYPE in genomes.columns: - genomes = genomes.set_index('MutationType') + genomes = genomes.set_index("MutationType") index = genomes.index.values colnames = genomes.columns allcolnames = colnames.copy() allgenomes = genomes.copy() else: - raise ValueError("Please provide a correct input_type. Check help for more details") - - #recording context types - execution_parameters["context_type"]=",".join(mtypes) + raise ValueError( + "Please provide a correct input_type. 
Check help for more details" + ) + + # recording context types + execution_parameters["context_type"] = ",".join(mtypes) record_parameters(sysdata, execution_parameters, start_time) - sysdata.close() - ########################################################################################################################################################################################### - + sysdata.close() + ########################################################################################################################################################################################### + for m in mtypes: # we need to rename the m because users input could be SBS96, SBS1536, DBS78, ID83 etc if m.startswith("SBS"): - m = m[3:] #removing "SBS" + m = m[3:] # removing "SBS" elif m.startswith("DBS"): m = "DINUC" elif m.startswith("ID"): m = "ID" elif m.startswith("CNV"): - m="CNV" + m = "CNV" elif m.startswith("SV"): - m="SV" - + m = "SV" + # Determine the types of mutation which will be needed for exporting and copying the files - if not (m=="DINUC" or m.startswith("DBS") or m.startswith("ID") or m.startswith("CNV") or m.startswith("SV")): - + if not ( + m == "DINUC" + or m.startswith("DBS") + or m.startswith("ID") + or m.startswith("CNV") + or m.startswith("SV") + ): if m.startswith("SBS"): mutation_type = m - elif m in ["96","288","384","1536", "4608"]: - mutation_type="SBS"+m - elif m.startswith("78"): - mutation_type="DBS78" + elif m in ["96", "288", "384", "1536", "4608"]: + mutation_type = "SBS" + m + elif m.startswith("78"): + mutation_type = "DBS78" elif m.startswith("83"): - mutation_type="ID83" + mutation_type = "ID83" elif m.startswith("48"): - mutation_type="CNV48" + mutation_type = "CNV48" elif m.startswith("32"): - mutation_type="SV32" + mutation_type = "SV32" else: - mutation_type = "CH"+m - + mutation_type = "CH" + m + else: if m == "DINUC" or m.startswith("DBS"): mutation_type = "DBS78" - elif m== "ID" or m.startswith("ID"): + elif m == "ID" or m.startswith("ID"): mutation_type = "ID83" - elif m== "CNV" or m.startswith("CNV"): + elif m == "CNV" or m.startswith("CNV"): mutation_type = "CNV48" - elif m== "SV" or m.startswith("SV"): + elif m == "SV" or m.startswith("SV"): mutation_type = "SV32" - - if input_type=="vcf": - try: + + if input_type == "vcf": + try: genomes = pd.DataFrame(data[m]) - except KeyError: - sysdata = open(out_put+"/JOB_METADATA.txt", "a") - sysdata.write("Context {} is not available in the current vcf files".format(m)+"\n") + except KeyError: + sysdata = open(out_put + "/JOB_METADATA.txt", "a") + sysdata.write( + "Context {} is not available in the current vcf files".format(m) + + "\n" + ) print("Context {} is not available in the current vcf files".format(m)) sysdata.close() continue - #check if the genome is a nonzero matrix - if genomes.shape == (0,0): - sysdata = open(out_put+"/JOB_METADATA.txt", "a") - sysdata.write("Sample is not a nonzero matrix for the mutation context "+ m+"\n") - print("Sample is not a nozero matrix for the mutation context "+ m) + # check if the genome is a nonzero matrix + if genomes.shape == (0, 0): + sysdata = open(out_put + "/JOB_METADATA.txt", "a") + sysdata.write( + "Sample is not a nonzero matrix for the mutation context " + + m + + "\n" + ) + print("Sample is not a nozero matrix for the mutation context " + m) sysdata.close() continue genomes = genomes.loc[:, (genomes != 0).any(axis=0)] - allgenomes = genomes.copy() # save the allgenomes for the final results + allgenomes = genomes.copy() # save the allgenomes for 
the final results index = genomes.index.values - colnames = genomes.columns - allcolnames = colnames.copy() # save the allcolnames for the final results - - #check if start and end processes are bigger than the number of samples + colnames = genomes.columns + allcolnames = colnames.copy() # save the allcolnames for the final results + + # check if start and end processes are bigger than the number of samples startProcess = min(startProcess, genomes.shape[1]) - endProcess = min(endProcess, genomes.shape[1]) - - #in the plotting funciton "ID" is used as "INDEL" - if m=="ID": - m="INDEL" #for plotting - - #create output directories to store all the results - output = out_put+"/"+mutation_type - est_genomes = np.zeros([1,1]) + endProcess = min(endProcess, genomes.shape[1]) + + # in the plotting funciton "ID" is used as "INDEL" + if m == "ID": + m = "INDEL" # for plotting + + # create output directories to store all the results + output = out_put + "/" + mutation_type + est_genomes = np.zeros([1, 1]) genomes = np.array(genomes) - information =[] + information = [] layer_directory = output try: if not os.path.exists(layer_directory): os.makedirs(layer_directory) - except: - print ("The {} folder could not be created".format("output")) - - fh = open(layer_directory+"/All_solutions_stat.csv", "w") - fh.write("Total Signatures,Stability,Matrix Frobenius%,avgStability\n") + except: + print("The {} folder could not be created".format("output")) + + fh = open(layer_directory + "/All_solutions_stat.csv", "w") + fh.write("Total Signatures,Stability,Matrix Frobenius%,avgStability\n") fh.close() # The following for loop operates to extract data from each number of signature - - all_similirities_list = [] #this list is going to store the dataframes of different similirieties as items - minimum_stabilities = [] - # get the cutoff for normatization to handle the hypermutators - - normalization_cutoff = sub.get_normalization_cutoff(genomes, manual_cutoff=100*genomes.shape[0]) + all_similirities_list = ( + [] + ) # this list is going to store the dataframes of different similirieties as items + minimum_stabilities = [] + + # get the cutoff for normatization to handle the hypermutators + + normalization_cutoff = sub.get_normalization_cutoff( + genomes, manual_cutoff=100 * genomes.shape[0] + ) execution_parameters["normalization_cutoff"] = normalization_cutoff - - #pass the seed values to inner funtions: + + # pass the seed values to inner funtions: execution_parameters["seeds"] = seed - if genomes.shape[1] -1.0: - stic = time.time() + if avgSilhouetteCoefficients > -1.0: + stic = time.time() if cpu > 0: pool = mp.Pool(processes=cpu) else: pool = mp.Pool() - results = [pool.apply_async(spasub.fit_signatures_pool, args=(genomes,processAvg,x,)) for x in range(genomes.shape[1])] + results = [ + pool.apply_async( + spasub.fit_signatures_pool, + args=( + genomes, + processAvg, + x, + ), + ) + for x in range(genomes.shape[1]) + ] pooloutput = [p.get() for p in results] pool.close() - + for i in range(len(pooloutput)): - exposureAvg[:,i]=pooloutput[i][0] + exposureAvg[:, i] = pooloutput[i][0] stoc = time.time() - print ("Optimization time is {} seconds".format(stoc-stic)) - #Get total mutationation for each signature in reverse order and order the signatures from high to low mutation barden - signature_total_mutations = np.sum(exposureAvg, axis =1).astype(int) + print("Optimization time is {} seconds".format(stoc - stic)) + # Get total mutationation for each signature in reverse order and order the signatures from high 
to low mutation barden + signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int) sorted_idx = np.argsort(-signature_total_mutations) processAvg = np.take(processAvg, sorted_idx, axis=1) exposureAvg = np.take(exposureAvg, sorted_idx, axis=0) - signature_total_mutations = np.sum(exposureAvg, axis =1).astype(int) - processStd=np.take(processStd, sorted_idx, axis=1) - exposureStd=np.take(exposureStd, sorted_idx, axis=0) - clusterSilhouetteCoefficients=np.take(clusterSilhouetteCoefficients, sorted_idx, axis=0) - signature_stats = pd.DataFrame({"Stability": clusterSilhouetteCoefficients, "Total Mutations": signature_total_mutations}) - minimum_stabilities.append(round(np.mean(clusterSilhouetteCoefficients),2)) #here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + signature_total_mutations = np.sum(exposureAvg, axis=1).astype(int) + processStd = np.take(processStd, sorted_idx, axis=1) + exposureStd = np.take(exposureStd, sorted_idx, axis=0) + clusterSilhouetteCoefficients = np.take( + clusterSilhouetteCoefficients, sorted_idx, axis=0 + ) + signature_stats = pd.DataFrame( + { + "Stability": clusterSilhouetteCoefficients, + "Total Mutations": signature_total_mutations, + } + ) + minimum_stabilities.append( + round(np.mean(clusterSilhouetteCoefficients), 2) + ) # here minimum stability is the average stability !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # Compute the estimated genome from the processAvg and exposureAvg - est_genomes = np.dot(processAvg, exposureAvg) - #check the similarities between the original and estimated genome for each number of signatures - all_similarities, cosine_similarities = sub.calculate_similarities(genomes, est_genomes, colnames) + est_genomes = np.dot(processAvg, exposureAvg) + # check the similarities between the original and estimated genome for each number of signatures + all_similarities, cosine_similarities = sub.calculate_similarities( + genomes, est_genomes, colnames + ) ########################################################################################################################################################################## - # store the resutls of the loop. Here, processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS. - loopResults = [genomes, processAvg, exposureAvg, processStd, exposureStd, avgSilhouetteCoefficients, clusterSilhouetteCoefficients, signature_total_mutations, all_similarities, signature_stats, reconstruction_error, finalgenomeErrors, finalgenomesReconstructed, converge_information, finalWall, finalHall, processes] - information.append([processAvg, exposureAvg, processStd, exposureStd, clusterSilhouetteCoefficients, signature_total_mutations, signature_stats, all_similarities]) #Will be used during hierarchycal approach - - ################################# Export the results ########################################################### - sub.export_information(loopResults, mutation_type, layer_directory, index, colnames, wall=wall, sequence=sequence) + # store the resutls of the loop. Here, processStd and exposureStd are standard Errors, NOT STANDARD DEVIATIONS. 
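# Illustrative aside (not part of the patch): a minimal, self-contained sketch of the
# distinction the comment above draws between a standard deviation and a standard error
# across NMF replicates. `replicate_Ws` is a hypothetical list of W matrices, one per
# replicate; the extractor's own internals may differ.
import numpy as np

replicate_Ws = [np.random.rand(96, 3) for _ in range(10)]   # placeholder replicate signatures
stacked = np.stack(replicate_Ws)                             # shape (n_replicates, 96, 3)
process_avg = stacked.mean(axis=0)                           # averaged signature matrix
process_sd = stacked.std(axis=0, ddof=1)                     # standard deviation across replicates
process_se = process_sd / np.sqrt(stacked.shape[0])          # standard error, i.e. the "Std" stored below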
+ loopResults = [ + genomes, + processAvg, + exposureAvg, + processStd, + exposureStd, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + signature_total_mutations, + all_similarities, + signature_stats, + reconstruction_error, + finalgenomeErrors, + finalgenomesReconstructed, + converge_information, + finalWall, + finalHall, + processes, + ] + information.append( + [ + processAvg, + exposureAvg, + processStd, + exposureStd, + clusterSilhouetteCoefficients, + signature_total_mutations, + signature_stats, + all_similarities, + ] + ) # Will be used during hierarchycal approach + + ################################# Export the results ########################################################### + sub.export_information( + loopResults, + mutation_type, + layer_directory, + index, + colnames, + wall=wall, + sequence=sequence, + ) all_similirities_list.append(all_similarities) current_time_end = datetime.datetime.now() - sysdata = open(out_put+"/JOB_METADATA.txt", "a") - sysdata.write("\n[{}] {} de novo extraction completed for a total of {} signatures! \nExecution time:{}\n". \ - format(str(datetime.datetime.now()).split(".")[0],mutation_type,processes,str(current_time_end-current_time_start).split(".")[0], current_time_end)) + sysdata = open(out_put + "/JOB_METADATA.txt", "a") + sysdata.write( + "\n[{}] {} de novo extraction completed for a total of {} signatures! \nExecution time:{}\n".format( + str(datetime.datetime.now()).split(".")[0], + mutation_type, + processes, + str(current_time_end - current_time_start).split(".")[0], + current_time_end, + ) + ) sysdata.close() - ########################################## Plot Stabiltity vs Reconstruction Error ############################# + ########################################## Plot Stabiltity vs Reconstruction Error ############################# # Print the Stabiltity vs Reconstruction Error as get the solution as well - solution, all_stats = sub.stabVsRError(layer_directory+"/All_solutions_stat.csv", layer_directory, title, all_similirities_list, mtype=mutation_type, stability=stability, min_stability=min_stability, combined_stability=combined_stability, sequence=sequence, allow_stability_drop=allow_stability_drop) - all_stats.insert(1, 'Stability (Avg Silhouette)', minimum_stabilities) #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability - all_stats=all_stats.set_index(["Signatures"]) - all_stats.to_csv(layer_directory+"/All_solutions_stat.csv", sep = ",") + solution, all_stats = sub.stabVsRError( + layer_directory + "/All_solutions_stat.csv", + layer_directory, + title, + all_similirities_list, + mtype=mutation_type, + stability=stability, + min_stability=min_stability, + combined_stability=combined_stability, + sequence=sequence, + allow_stability_drop=allow_stability_drop, + ) + all_stats.insert( + 1, "Stability (Avg Silhouette)", minimum_stabilities + ) #!!!!!!!!!!!!!!!!1 here minimum stability is avg stability + all_stats = all_stats.set_index(["Signatures"]) + all_stats.to_csv(layer_directory + "/All_solutions_stat.csv", sep=",") # write the name of Samples and Matrix participating in each Layer. 
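# Illustrative aside (not part of the patch): a rough sketch of the selection idea behind
# sub.stabVsRError called above, under the simplifying assumption that the suggested
# solution is the largest signature count whose average stability stays at or above the
# `stability` threshold. The real routine also weighs reconstruction error,
# min_stability/combined_stability and allow_stability_drop, so this is only a toy model.
candidate_stats = {2: 0.95, 3: 0.91, 4: 0.78, 5: 0.66}   # hypothetical {signatures: avg stability}
stability_threshold = 0.8
passing = [k for k, s in candidate_stats.items() if s >= stability_threshold]
suggested_solution = max(passing) if passing else min(candidate_stats)
print(suggested_solution)  # -> 3 for this toy input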
layer_genome = pd.DataFrame(genomes) @@ -814,60 +1062,109 @@ def sigProfilerExtractor(input_type, layer_genome = layer_genome.rename_axis("MutationType", axis="columns") # record the samples - layer_genome.to_csv(output+"/Samples.txt", sep = "\t", index_label=[layer_genome.columns.name]) - #similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t") + layer_genome.to_csv( + output + "/Samples.txt", sep="\t", index_label=[layer_genome.columns.name] + ) + # similarity_dataframe.to_csv(data_stat_folder+"/Similatiry_Data_All_Sigs"+str(H_iteration)+".text", sep = "\t") del layer_genome ################################### Decompose the new signatures into global signatures ######################### - processAvg = information[solution-startProcess][0] - exposureAvg = information[solution-startProcess][1] - processSTE = information[solution-startProcess][2] - signature_stabilities = information[solution-startProcess][4] - signature_total_mutations = information[solution-startProcess][5] - signature_stats = information[solution-startProcess][6] - all_similarities = information[solution-startProcess][7] - - signature_stabilities = sub.signature_plotting_text(signature_stabilities, "Stability", "float") - signature_total_mutations = sub.signature_plotting_text(signature_total_mutations, "Total Mutations", "integer") - listOfSignatures = sub.make_letter_ids(idlenth = processAvg.shape[1], mtype=mutation_type) - - layer_directory1 = output+"/Suggested_Solution/"+mutation_type+"_De-Novo_Solution" - layer_directory2 = output+"/Suggested_Solution/COSMIC_"+mutation_type+"_Decomposed_Solution" - devopts={} - devopts['denovo_outpath'] = layer_directory1 - devopts['decompose_outpath'] = layer_directory2 - devopts['Assignment_outpath'] =layer_directory2 - devopts['signature_stabilities']=signature_stabilities - devopts['signature_total_mutations']=signature_total_mutations - devopts['listOfSignatures']=listOfSignatures - devopts['index']=index - devopts['colnames']=allcolnames - devopts['signature_stats']=signature_stats - devopts['sequence']=sequence - devopts['processSTE']=processSTE - devopts['sequence']=sequence - devopts['make_decomposition_plots']=make_decomposition_plots + processAvg = information[solution - startProcess][0] + exposureAvg = information[solution - startProcess][1] + processSTE = information[solution - startProcess][2] + signature_stabilities = information[solution - startProcess][4] + signature_total_mutations = information[solution - startProcess][5] + signature_stats = information[solution - startProcess][6] + all_similarities = information[solution - startProcess][7] + + signature_stabilities = sub.signature_plotting_text( + signature_stabilities, "Stability", "float" + ) + signature_total_mutations = sub.signature_plotting_text( + signature_total_mutations, "Total Mutations", "integer" + ) + listOfSignatures = sub.make_letter_ids( + idlenth=processAvg.shape[1], mtype=mutation_type + ) + layer_directory1 = ( + output + "/Suggested_Solution/" + mutation_type + "_De-Novo_Solution" + ) + layer_directory2 = ( + output + + "/Suggested_Solution/COSMIC_" + + mutation_type + + "_Decomposed_Solution" + ) + devopts = {} + devopts["denovo_outpath"] = layer_directory1 + devopts["decompose_outpath"] = layer_directory2 + devopts["Assignment_outpath"] = layer_directory2 + devopts["signature_stabilities"] = signature_stabilities + devopts["signature_total_mutations"] = signature_total_mutations + devopts["listOfSignatures"] = listOfSignatures + 
devopts["index"] = index + devopts["colnames"] = allcolnames + devopts["signature_stats"] = signature_stats + devopts["sequence"] = sequence + devopts["processSTE"] = processSTE + devopts["sequence"] = sequence + devopts["make_decomposition_plots"] = make_decomposition_plots # Check if genome_build is available in COSMIC, if not reset to GRCh37 - if genome_build == "GRCh37" or genome_build == "GRCh38" or genome_build == "mm9" or genome_build == "mm10" or genome_build == "rn6": + if ( + genome_build == "GRCh37" + or genome_build == "GRCh38" + or genome_build == "mm9" + or genome_build == "mm10" + or genome_build == "rn6" + ): genome_build = genome_build else: - sysdata = open(out_put+"/JOB_METADATA.txt", "a") - sysdata.write("\n[{}] The selected opportunity genome is {}. COSMIC signatures are available only for GRCh37/38, mm9/10 and rn6 genomes. So, the opportunity genome is reset to GRCh37.\n". \ - format(str(datetime.datetime.now()).split(".")[0], str(genome_build))) - print("The selected opportunity genome is "+str(genome_build)+". COSMIC signatures are available only for GRCh37/38, mm9/10 and rn6 genomes. So, the opportunity genome is reset to GRCh37.") + sysdata = open(out_put + "/JOB_METADATA.txt", "a") + sysdata.write( + "\n[{}] The selected opportunity genome is {}. COSMIC signatures are available only for GRCh37/38, mm9/10 and rn6 genomes. So, the opportunity genome is reset to GRCh37.\n".format( + str(datetime.datetime.now()).split(".")[0], str(genome_build) + ) + ) + print( + "The selected opportunity genome is " + + str(genome_build) + + ". COSMIC signatures are available only for GRCh37/38, mm9/10 and rn6 genomes. So, the opportunity genome is reset to GRCh37." + ) sysdata.close() genome_build = "GRCh37" - decomp.spa_analyze(allgenomes, output, signatures=processAvg, genome_build=genome_build, cosmic_version=cosmic_version, exome=exome, verbose=False, - decompose_fit_option= True, denovo_refit_option=True, cosmic_fit_option=False, export_probabilities=export_probabilities, devopts=devopts, make_metadata=False) + decomp.spa_analyze( + allgenomes, + output, + signatures=processAvg, + genome_build=genome_build, + cosmic_version=cosmic_version, + exome=exome, + verbose=False, + decompose_fit_option=True, + denovo_refit_option=True, + cosmic_fit_option=False, + export_probabilities=export_probabilities, + devopts=devopts, + make_metadata=False, + ) - - sysdata = open(out_put+"/JOB_METADATA.txt", "a") + sysdata = open(out_put + "/JOB_METADATA.txt", "a") end_time = datetime.datetime.now() sysdata.write("\n[{}] Analysis ended: \n".format(str(end_time).split(".")[0])) sysdata.write("\n-------Job Status------- \n") - sysdata.write("Analysis of mutational signatures completed successfully! \nTotal execution time: "+str(end_time-start_time).split(".")[0]+" \nResults can be found in: "+" "+out_put+ " " +" folder") + sysdata.write( + "Analysis of mutational signatures completed successfully! \nTotal execution time: " + + str(end_time - start_time).split(".")[0] + + " \nResults can be found in: " + + " " + + out_put + + " " + + " folder" + ) sysdata.close() - print("\n\n \nYour Job Is Successfully Completed! Thank You For Using SigProfilerExtractor.\n ") + print( + "\n\n \nYour Job Is Successfully Completed! 
Thank You For Using SigProfilerExtractor.\n " + ) diff --git a/SigProfilerExtractor/subroutines.py b/SigProfilerExtractor/subroutines.py index a8d29e9..0930e92 100644 --- a/SigProfilerExtractor/subroutines.py +++ b/SigProfilerExtractor/subroutines.py @@ -7,7 +7,8 @@ """ from scipy.cluster import hierarchy import matplotlib.pyplot as plt -plt.switch_backend('agg') + +plt.switch_backend("agg") from matplotlib.backends.backend_pdf import PdfPages import numpy as np import pandas as pd @@ -20,15 +21,17 @@ import sigProfilerPlotting as plot from sigProfilerPlotting import plotActivity as plot_ac from sigProfilerPlotting import tmbplot as tmb -import string +import string import PyPDF2 -import os,sys +import os, sys import scipy -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" + +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" os.environ["OMP_NUM_THREADS"] = "1" # import SigProfilerExtractor as cosmic -from scipy.stats import ranksums +from scipy.stats import ranksums + # from SigProfilerExtractor import single_sample as ss from SigProfilerExtractor import nmf_cpu from sklearn import mixture @@ -36,9 +39,11 @@ from scipy.spatial.distance import correlation as cor from scipy.optimize import linear_sum_assignment from scipy.optimize import nnls + # import shutil # from PyPDF2 import PdfFileMerger import warnings as _warnings + _warnings.filterwarnings("ignore") from numpy.random import Generator, PCG64DXSM, SeedSequence @@ -47,39 +52,39 @@ import torch from . import nmf_gpu except ImportError: - _warnings.warn('Cannot pytorch - GPU unavailable') + _warnings.warn("Cannot pytorch - GPU unavailable") -multiprocessing.set_start_method('spawn', force=True) +multiprocessing.set_start_method("spawn", force=True) ################################################################## Vivid Functions ############################# ############################################################## FUNCTION ONE ########################################## -def make_letter_ids(idlenth = 10, mtype = "SBS96"): - + +def make_letter_ids(idlenth=10, mtype="SBS96"): listOfSignatures = [] letters = list(string.ascii_uppercase) - letters.extend([i+b for i in letters for b in letters]) + letters.extend([i + b for i in letters for b in letters]) letters = letters[0:idlenth] - - for j,l in zip(range(idlenth),letters): - listOfSignatures.append(mtype+l) + + for j, l in zip(range(idlenth), letters): + listOfSignatures.append(mtype + l) listOfSignatures = np.array(listOfSignatures) return listOfSignatures + def union(a, b): - """ return the union of two lists """ + """return the union of two lists""" return list(set(a) | set(b)) def get_indeces(a, b): - - """ + """ Extracts the indices multiple items in a list. - + Parameters: a: list. where we want to get the index of the items. - b: list. the items we want to get index of. - #example: + b: list. the items we want to get index of. + #example: x = ['SBS1', 'SBS2', 'SBS3', 'SBS5', 'SBS8', 'SBS13', 'SBS40'] y = ['SBS1', 'SBS5'] get_indeces(x, y) @@ -89,17 +94,17 @@ def get_indeces(a, b): indeces = [] for i in b: - try: + try: idx = a.index(i) indeces.append(idx) - except: + except: next return indeces - -def get_items_from_index(x,y): - """ decipher the values of items in a list from their indices. 
- """ + + +def get_items_from_index(x, y): + """decipher the values of items in a list from their indices.""" z = [] for i in y: try: @@ -108,198 +113,246 @@ def get_items_from_index(x,y): pass return z -############################################################## FUNCTION ONE ########################################## + +############################################################## FUNCTION ONE ########################################## def signature_plotting_text(value, text, Type): name = text + ": " - name_list =[] - total=np.sum(np.array(value)) + name_list = [] + total = np.sum(np.array(value)) for i in value: - - if Type=="integer": + if Type == "integer": i = int(i) - p=round(i/total*100,1) - i = format(i, ',d') - tail = str(i)+'/'+str(p)+'%' - name_list.append(name+tail) - elif Type=="float": - i = round(i,2) + p = round(i / total * 100, 1) + i = format(i, ",d") + tail = str(i) + "/" + str(p) + "%" + name_list.append(name + tail) + elif Type == "float": + i = round(i, 2) name_list.append(name + str(i)) - return(name_list) + return name_list + def split_list(lst, splitlenth): newlst = [] for i in range(0, len(lst), splitlenth): - try: - newlst.append([lst[i], lst[i+1]]) - - except: + try: + newlst.append([lst[i], lst[i + 1]]) + + except: newlst.append([lst[i]]) - + return newlst +############################################ Functions for modifications of Sample Matrices ############### -############################################ Functions for modifications of Sample Matrices ############### def get_normalization_cutoff(data, manual_cutoff=9600): - col_sums = np.array(np.sum(data, axis=0)) - # continue the loop if the differece the means is larger than the 2*2*STD of the larger cluster + # continue the loop if the differece the means is larger than the 2*2*STD of the larger cluster while True: - try: # doing Kmean clustering - col_sums_for_cluster = col_sums.reshape(-1,1) - - #separate distributions using mixture model - clf = mixture.GaussianMixture(n_components=2, covariance_type='full') + col_sums_for_cluster = col_sums.reshape(-1, 1) + + # separate distributions using mixture model + clf = mixture.GaussianMixture(n_components=2, covariance_type="full") clf.fit(col_sums_for_cluster) labels = clf.predict(col_sums_for_cluster) - + unique, counts = np.unique(labels, return_counts=True) - if len(unique)==1: + if len(unique) == 1: break bigger_cluster = unique[np.argmax(counts)] smaller_cluster = unique[np.argmin(counts)] - + # estimating the magnitute of discripancy better the clusters - bigger_cluster__dist = col_sums[labels==bigger_cluster] - smaller_cluster__dist = col_sums[labels==smaller_cluster] - bigger_cluster__dist_mean = np.mean(bigger_cluster__dist) - bigger_cluster__dist_std = np.std(bigger_cluster__dist) - smaller_cluster__dist_mean = np.mean(smaller_cluster__dist) - # continue the loop if the differece the means is larger than the 2*STD of the larger cluster - - if abs(bigger_cluster__dist_mean-smaller_cluster__dist_mean)< 2*2*bigger_cluster__dist_std: + bigger_cluster__dist = col_sums[labels == bigger_cluster] + smaller_cluster__dist = col_sums[labels == smaller_cluster] + bigger_cluster__dist_mean = np.mean(bigger_cluster__dist) + bigger_cluster__dist_std = np.std(bigger_cluster__dist) + smaller_cluster__dist_mean = np.mean(smaller_cluster__dist) + # continue the loop if the differece the means is larger than the 2*STD of the larger cluster + + if ( + abs(bigger_cluster__dist_mean - smaller_cluster__dist_mean) + < 2 * 2 * bigger_cluster__dist_std + ): break - # 
col_sums will be equal to bigger_cluster__dist for the next iteration + # col_sums will be equal to bigger_cluster__dist for the next iteration col_sums = bigger_cluster__dist except: break - mean = np.mean(col_sums) + mean = np.mean(col_sums) std = np.std(col_sums) - cutoff = (mean + 2*(std)).astype(int) - - if cutoffnormalization_cutoff)[0] - norm_genome = bootstrapGenomes[:,list(indices)]/totalMutations[list(indices)][:,np.newaxis].T*normalization_cutoff - bootstrapGenomes[:,list(indices)] = norm_genome + indices = np.where(totalMutations > normalization_cutoff)[0] + norm_genome = ( + bootstrapGenomes[:, list(indices)] + / totalMutations[list(indices)][:, np.newaxis].T + * normalization_cutoff + ) + bootstrapGenomes[:, list(indices)] = norm_genome bootstrapGenomes = pd.DataFrame(bootstrapGenomes) elif norm == "100X": bootstrapGenomes = np.array(bootstrapGenomes) rows = bootstrapGenomes.shape[0] - indices = np.where(totalMutations>(rows*100))[0] - norm_genome = bootstrapGenomes[:,list(indices)]/totalMutations[list(indices)][:,np.newaxis].T*(rows*100) - bootstrapGenomes[:,list(indices)] = norm_genome + indices = np.where(totalMutations > (rows * 100))[0] + norm_genome = ( + bootstrapGenomes[:, list(indices)] + / totalMutations[list(indices)][:, np.newaxis].T + * (rows * 100) + ) + bootstrapGenomes[:, list(indices)] = norm_genome bootstrapGenomes = pd.DataFrame(bootstrapGenomes) elif norm == "log2": log2_of_tM = np.log2(totalMutations) - bootstrapGenomes = bootstrapGenomes/totalMutations*log2_of_tM + bootstrapGenomes = bootstrapGenomes / totalMutations * log2_of_tM elif norm == "none": pass else: try: bootstrapGenomes = np.array(bootstrapGenomes) rows = bootstrapGenomes.shape[0] - indices = np.where(totalMutations>int(norm))[0] - norm_genome = bootstrapGenomes[:,list(indices)]/totalMutations[list(indices)][:,np.newaxis].T*(int(norm)) - bootstrapGenomes[:,list(indices)] = norm_genome + indices = np.where(totalMutations > int(norm))[0] + norm_genome = ( + bootstrapGenomes[:, list(indices)] + / totalMutations[list(indices)][:, np.newaxis].T + * (int(norm)) + ) + bootstrapGenomes[:, list(indices)] = norm_genome bootstrapGenomes = pd.DataFrame(bootstrapGenomes) except: pass return bootstrapGenomes -def split_samples(samples, intervals, rescaled_items, colnames): +def split_samples(samples, intervals, rescaled_items, colnames): colnames = np.array(colnames) - total_mutations = samples.sum(axis =0) - max_total = np.max(total_mutations)+1 - intervals = np.array(intervals ) + total_mutations = samples.sum(axis=0) + max_total = np.max(total_mutations) + 1 + intervals = np.array(intervals) rescaled_items = np.array(rescaled_items) - selected_indices, = np.where(intervals<=max_total) - intervals=intervals[selected_indices] - rescaled_items=rescaled_items[selected_indices] + (selected_indices,) = np.where(intervals <= max_total) + intervals = intervals[selected_indices] + rescaled_items = rescaled_items[selected_indices] rescaled_items = list(rescaled_items) - + sample_list = [] sample_total = [] rescale_list = [] rescale_values = [] colnames_list = [] - ranges = list(intervals)+[max_total] - for i in range(len(ranges)-1): - sub_sample, = np.where((total_mutations<=ranges[i+1]) & (total_mutations>ranges[i])) - if samples[:,sub_sample].shape[1]>0: - sample_list.append(samples[:,sub_sample]) + ranges = list(intervals) + [max_total] + for i in range(len(ranges) - 1): + (sub_sample,) = np.where( + (total_mutations <= ranges[i + 1]) & (total_mutations > ranges[i]) + ) + if samples[:, sub_sample].shape[1] > 
0: + sample_list.append(samples[:, sub_sample]) colnames_list.append(list(colnames[sub_sample])) - sample_total.append(np.sum(samples[:,sub_sample], axis=0)) + sample_total.append(np.sum(samples[:, sub_sample], axis=0)) rescale_list.append(rescaled_items[i]) rescale_values.append(ranges[i]) - return(sample_list, colnames_list, sample_total, rescale_list, rescale_values) + return (sample_list, colnames_list, sample_total, rescale_list, rescale_values) + def denormalize_samples(genomes, original_totals, normalization_value=30000): normalized_totals = np.sum(genomes, axis=0) original_totals = np.array(original_totals) - results = genomes/normalized_totals*original_totals - results = np.round(results,0) + results = genomes / normalized_totals * original_totals + results = np.round(results, 0) results = results.astype(int) - return results + return results -def nnmf_cpu(genomes, nfactors, init="nndsvd", execution_parameters=None, generator=None): - + +def nnmf_cpu( + genomes, nfactors, init="nndsvd", execution_parameters=None, generator=None +): genomes = torch.from_numpy(genomes).float() - min_iterations=execution_parameters["min_NMF_iterations"] - max_iterations=execution_parameters["max_NMF_iterations"] - tolerance=execution_parameters["NMF_tolerance"] - test_conv=execution_parameters["NMF_test_conv"] - precision=execution_parameters["precision"] - net = nmf_cpu.NMF(genomes,rank=nfactors, min_iterations=min_iterations, max_iterations=max_iterations, tolerance=tolerance,test_conv=test_conv, init_method=init, generator=generator, floating_point_precision=precision) + min_iterations = execution_parameters["min_NMF_iterations"] + max_iterations = execution_parameters["max_NMF_iterations"] + tolerance = execution_parameters["NMF_tolerance"] + test_conv = execution_parameters["NMF_test_conv"] + precision = execution_parameters["precision"] + net = nmf_cpu.NMF( + genomes, + rank=nfactors, + min_iterations=min_iterations, + max_iterations=max_iterations, + tolerance=tolerance, + test_conv=test_conv, + init_method=init, + generator=generator, + floating_point_precision=precision, + ) net.fit() Ws = [] Hs = [] - + for H in net.H.detach().cpu().numpy(): Hs.append(np.matrix(H)) for W in net.W.detach().cpu().numpy(): Ws.append(np.matrix(W)) - + convergence = int(net.conv) - + W = Ws[0] H = Hs[0] - #calculate L1, L2 and KL for the solution - + # calculate L1, L2 and KL for the solution + est_genome = np.array(np.dot(W, H)) genomes = np.array(genomes) - similarities = calculate_similarities(genomes, est_genome, sample_names=False)[0].iloc[:,2:] + similarities = calculate_similarities(genomes, est_genome, sample_names=False)[ + 0 + ].iloc[:, 2:] similarities = np.array(np.mean(similarities, axis=0)).T similarities = np.append(similarities, convergence) return W, H, similarities -def nnmf_gpu(genomes, nfactors, init="nndsvd",execution_parameters=None, generator=None): + +def nnmf_gpu( + genomes, nfactors, init="nndsvd", execution_parameters=None, generator=None +): p = current_process() identity = p._identity[0] gpu_id = identity % torch.cuda.device_count() genomes = torch.from_numpy(genomes).float().cuda(gpu_id) - min_iterations=execution_parameters["min_NMF_iterations"] - max_iterations=execution_parameters["max_NMF_iterations"] - tolerance=execution_parameters["NMF_tolerance"] - test_conv=execution_parameters["NMF_test_conv"] - precision=execution_parameters["precision"] - net = nmf_gpu.NMF(genomes,rank=nfactors,min_iterations=min_iterations,max_iterations=max_iterations, 
tolerance=tolerance,test_conv=test_conv, gpu_id=gpu_id, init_method=init, generator=generator, floating_point_precision=precision) + min_iterations = execution_parameters["min_NMF_iterations"] + max_iterations = execution_parameters["max_NMF_iterations"] + tolerance = execution_parameters["NMF_tolerance"] + test_conv = execution_parameters["NMF_test_conv"] + precision = execution_parameters["precision"] + net = nmf_gpu.NMF( + genomes, + rank=nfactors, + min_iterations=min_iterations, + max_iterations=max_iterations, + tolerance=tolerance, + test_conv=test_conv, + gpu_id=gpu_id, + init_method=init, + generator=generator, + floating_point_precision=precision, + ) net.fit() Ws = [] Hs = [] @@ -307,19 +360,19 @@ def nnmf_gpu(genomes, nfactors, init="nndsvd",execution_parameters=None, generat Hs.append(np.matrix(H)) for W in net.W.detach().cpu().numpy(): Ws.append(np.matrix(W)) - - if len(Ws)==1: + + if len(Ws) == 1: convergence = int(net.conv) else: convergence = int(max_iterations) - - convergence=[convergence]*len(Ws) + + convergence = [convergence] * len(Ws) return Ws, Hs, convergence + def BootstrapCancerGenomes(genomes, seed=None): - - ''' + """ index = genomes.index cols = genomes.columns genomes = np.array(genomes) @@ -331,119 +384,167 @@ def BootstrapCancerGenomes(genomes, seed=None): dataframe = pd.DataFrame(genomes) dataframe.index = index - dataframe.columns = cols - ''' - - #normalize the data - a = pd.DataFrame(genomes.sum(0)) #performing the sum of each column. variable a store the sum - a = a.transpose() #transfose the value so that if gets a proper dimention to normalize the data - repmat = pd.concat([a]*genomes.shape[0]) #normalize the data to get the probility of each datapoint to perform the "random.multinomial" operation - normGenomes = genomes/np.array(repmat) #part of normalizing step - - - #Get the normGenomes as Matlab/ alternation of the "mnrnd" fuction in Matlap - #do the Boostraping - dataframe = pd.DataFrame() #declare an empty variable to populate with the Bootstrapped data - for i in range(0,normGenomes.shape[1]): - dataframe[i]=list(seed.multinomial(a.iloc[:,i], normGenomes.iloc[:,i], 1)[0]) - - + dataframe.columns = cols + """ + + # normalize the data + a = pd.DataFrame( + genomes.sum(0) + ) # performing the sum of each column. 
variable a store the sum + a = ( + a.transpose() + ) # transfose the value so that if gets a proper dimention to normalize the data + repmat = pd.concat( + [a] * genomes.shape[0] + ) # normalize the data to get the probility of each datapoint to perform the "random.multinomial" operation + normGenomes = genomes / np.array(repmat) # part of normalizing step + + # Get the normGenomes as Matlab/ alternation of the "mnrnd" fuction in Matlap + # do the Boostraping + dataframe = ( + pd.DataFrame() + ) # declare an empty variable to populate with the Bootstrapped data + for i in range(0, normGenomes.shape[1]): + dataframe[i] = list( + seed.multinomial(a.iloc[:, i], normGenomes.iloc[:, i], 1)[0] + ) + return dataframe + # NMF version for the multiprocessing library -def pnmf(batch_generator_pair=[1,None], genomes=1, totalProcesses=1, resample=True, init="nndsvd", normalization_cutoff=10000000, norm="log2", gpu=False, execution_parameters=None): +def pnmf( + batch_generator_pair=[1, None], + genomes=1, + totalProcesses=1, + resample=True, + init="nndsvd", + normalization_cutoff=10000000, + norm="log2", + gpu=False, + execution_parameters=None, +): tic = time.time() - totalMutations = np.sum(genomes, axis =0) - genomes = pd.DataFrame(genomes) #creating/loading a dataframe/matrix + totalMutations = np.sum(genomes, axis=0) + genomes = pd.DataFrame(genomes) # creating/loading a dataframe/matrix # generators used for noise and matrix initialization - poisson_generator=batch_generator_pair[1][0] - rep_generator=batch_generator_pair[1][1] + poisson_generator = batch_generator_pair[1][0] + rep_generator = batch_generator_pair[1][1] rand_rng = Generator(PCG64DXSM(rep_generator)) poisson_rng = Generator(PCG64DXSM(poisson_generator)) - if gpu: - batch_size=batch_generator_pair[0] + batch_size = batch_generator_pair[0] nmf_fn = nnmf_gpu results = [] genome_list = [] for b in range(batch_size): if resample == True: - bootstrapGenomes= BootstrapCancerGenomes(genomes, seed=poisson_rng) - else: - bootstrapGenomes=genomes - - bootstrapGenomes[bootstrapGenomes<0.0001]= 0.0001 + bootstrapGenomes = BootstrapCancerGenomes(genomes, seed=poisson_rng) + else: + bootstrapGenomes = genomes + + bootstrapGenomes[bootstrapGenomes < 0.0001] = 0.0001 totalMutations = np.sum(bootstrapGenomes, axis=0) - bootstrapGenomes=normalize_samples(bootstrapGenomes,totalMutations,norm=norm, normalization_cutoff=normalization_cutoff) + bootstrapGenomes = normalize_samples( + bootstrapGenomes, + totalMutations, + norm=norm, + normalization_cutoff=normalization_cutoff, + ) genome_list.append(bootstrapGenomes.values) - + g = np.array(genome_list) - - W, H, Conv = nmf_fn(g, totalProcesses, init=init, execution_parameters=execution_parameters, generator=rand_rng) + + W, H, Conv = nmf_fn( + g, + totalProcesses, + init=init, + execution_parameters=execution_parameters, + generator=rand_rng, + ) for i in range(len(W)): - _W = np.array(W[i]) _H = np.array(H[i]) total = _W.sum(axis=0)[np.newaxis] - _W = _W/total - _H = _H*total.T - _H=denormalize_samples(_H, totalMutations) - _conv=Conv[i] + _W = _W / total + _H = _H * total.T + _H = denormalize_samples(_H, totalMutations) + _conv = Conv[i] results.append((_W, _H, _conv)) - print ("process " +str(totalProcesses)+" continues please wait... ") - print ("execution time: {} seconds \n".format(round(time.time()-tic), 2)) + print("process " + str(totalProcesses) + " continues please wait... 
") + print("execution time: {} seconds \n".format(round(time.time() - tic), 2)) return results else: nmf_fn = nnmf_cpu if resample == True: - bootstrapGenomes= BootstrapCancerGenomes(genomes, seed=poisson_rng) + bootstrapGenomes = BootstrapCancerGenomes(genomes, seed=poisson_rng) else: - bootstrapGenomes=genomes - - bootstrapGenomes[bootstrapGenomes<0.0001]= 0.0001 + bootstrapGenomes = genomes + + bootstrapGenomes[bootstrapGenomes < 0.0001] = 0.0001 bootstrapGenomes = bootstrapGenomes.astype(float) # normalize the samples to handle the hypermutators - + totalMutations = np.sum(bootstrapGenomes, axis=0) - bootstrapGenomes=normalize_samples(bootstrapGenomes,totalMutations,norm=norm, normalization_cutoff=normalization_cutoff) + bootstrapGenomes = normalize_samples( + bootstrapGenomes, + totalMutations, + norm=norm, + normalization_cutoff=normalization_cutoff, + ) - bootstrapGenomes=np.array(bootstrapGenomes) + bootstrapGenomes = np.array(bootstrapGenomes) + + W, H, kl = nmf_fn( + bootstrapGenomes, + totalProcesses, + init=init, + execution_parameters=execution_parameters, + generator=rand_rng, + ) # uses custom function nnmf - W, H, kl = nmf_fn(bootstrapGenomes,totalProcesses, init=init, execution_parameters=execution_parameters, generator=rand_rng) #uses custom function nnmf - - W = np.array(W) H = np.array(H) total = W.sum(axis=0)[np.newaxis] - W = W/total - H = H*total.T + W = W / total + H = H * total.T # denormalize H - H = denormalize_samples(H, totalMutations) - print ("process " +str(totalProcesses)+" continues please wait... ") - print ("execution time: {} seconds \n".format(round(time.time()-tic), 2)) + H = denormalize_samples(H, totalMutations) + print("process " + str(totalProcesses) + " continues please wait... ") + print("execution time: {} seconds \n".format(round(time.time() - tic), 2)) return W, H, kl -def parallel_runs(execution_parameters, genomes=1, totalProcesses=1, verbose = False, replicate_generators=None): +def parallel_runs( + execution_parameters, + genomes=1, + totalProcesses=1, + verbose=False, + replicate_generators=None, +): iterations = execution_parameters["NMF_replicates"] - init=execution_parameters["NMF_init"] - normalization_cutoff=execution_parameters["normalization_cutoff"] - n_cpu=execution_parameters["cpu"] - resample=execution_parameters["resample"] - norm=execution_parameters["matrix_normalization"] - gpu=execution_parameters["gpu"] - batch_size=execution_parameters["batch_size"] - + init = execution_parameters["NMF_init"] + normalization_cutoff = execution_parameters["normalization_cutoff"] + n_cpu = execution_parameters["cpu"] + resample = execution_parameters["resample"] + norm = execution_parameters["matrix_normalization"] + gpu = execution_parameters["gpu"] + batch_size = execution_parameters["batch_size"] + if verbose: - print ("Process "+str(totalProcesses)+ " is in progress\n===================================>") - if n_cpu==-1: + print( + "Process " + + str(totalProcesses) + + " is in progress\n===================================>" + ) + if n_cpu == -1: pool = multiprocessing.Pool() else: pool = multiprocessing.Pool(processes=n_cpu) @@ -452,331 +553,440 @@ def parallel_runs(execution_parameters, genomes=1, totalProcesses=1, verbose = F last_batch_size = iterations % batch_size # generators used for noise and matrix initialization - poisson_generator=replicate_generators[0] - rep_generator=replicate_generators[1] + poisson_generator = replicate_generators[0] + rep_generator = replicate_generators[1] # spawn "iterations" number of generators 
for poisson - poisson_rand_list=poisson_generator[0].spawn(int(iterations)) + poisson_rand_list = poisson_generator[0].spawn(int(iterations)) # spawn "iterations" number of generators for W,H sub_rand_generators = rep_generator.spawn(int(iterations)) generator_pair_list = [] for i, j in zip(poisson_rand_list, sub_rand_generators): - generator_pair_list.append([i,j]) + generator_pair_list.append([i, j]) batches = [batch_size for _ in range(num_full_batches)] if last_batch_size != 0: batches.append(last_batch_size) - + batch_generator_pair = [] # There will be nmf_replicate number of batch_generator_pair elements - for i,j in zip(batches, generator_pair_list): - batch_generator_pair.append([i,j]) + for i, j in zip(batches, generator_pair_list): + batch_generator_pair.append([i, j]) - if gpu==True: + if gpu == True: # submit jobs for parallel processing - pool_nmf=partial(pnmf, genomes=genomes, totalProcesses=totalProcesses, \ - resample=resample, init=init, normalization_cutoff=normalization_cutoff, \ - norm=norm, gpu=gpu, execution_parameters=execution_parameters) + pool_nmf = partial( + pnmf, + genomes=genomes, + totalProcesses=totalProcesses, + resample=resample, + init=init, + normalization_cutoff=normalization_cutoff, + norm=norm, + gpu=gpu, + execution_parameters=execution_parameters, + ) result_list = pool.map(pool_nmf, batch_generator_pair) pool.close() pool.join() flat_list = [item for sublist in result_list for item in sublist] else: - pool_nmf=partial(pnmf, genomes=genomes, totalProcesses=totalProcesses, \ - resample=resample, init=init, normalization_cutoff=normalization_cutoff, \ - norm=norm, gpu=gpu, execution_parameters=execution_parameters) + pool_nmf = partial( + pnmf, + genomes=genomes, + totalProcesses=totalProcesses, + resample=resample, + init=init, + normalization_cutoff=normalization_cutoff, + norm=norm, + gpu=gpu, + execution_parameters=execution_parameters, + ) result_list = pool.map(pool_nmf, batch_generator_pair) pool.close() pool.join() flat_list = result_list return flat_list + #################################### Decipher Signatures ################################################### -def decipher_signatures(execution_parameters, genomes=[0], i=1, totalIterations=1, cpu=-1, mut_context="96", noise_rep_pair=None): - #m = mut_context +def decipher_signatures( + execution_parameters, + genomes=[0], + i=1, + totalIterations=1, + cpu=-1, + mut_context="96", + noise_rep_pair=None, +): + # m = mut_context tic = time.time() - # The initial values accumute the results for each number of + # The initial values accumute the results for each number of totalMutationTypes = genomes.shape[0] totalGenomes = genomes.shape[1] totalProcesses = i - totalIterations=execution_parameters["NMF_replicates"] - gpu=execution_parameters["gpu"] - dist=execution_parameters["dist"] + totalIterations = execution_parameters["NMF_replicates"] + gpu = execution_parameters["gpu"] + dist = execution_parameters["dist"] norm = execution_parameters["matrix_normalization"] - normalization_cutoff=execution_parameters["normalization_cutoff"] + normalization_cutoff = execution_parameters["normalization_cutoff"] # poisson_generator is index 0, and random_generator is index 1 replicate_generators = noise_rep_pair[:2] cluster_rand_seq = noise_rep_pair[2] - print ("Extracting signature {} for mutation type {}".format(totalProcesses, mut_context)) # m is for the mutation context - - if norm=="gmm": + print( + "Extracting signature {} for mutation type {}".format( + totalProcesses, mut_context + ) + ) # m is 
for the mutation context + + if norm == "gmm": print("The matrix normalizing cutoff is {}\n\n".format(normalization_cutoff)) else: print("The matrix normalizing cutoff is set for {}\n\n".format(norm)) - - ############################################################################################################################################################################## - ############################################################# The parallel processing takes place here ####################################################################### - ############################################################################################################################################################################## - if gpu==True: + + ############################################################################################################################################################################## + ############################################################# The parallel processing takes place here ####################################################################### + ############################################################################################################################################################################## + if gpu == True: results = [] - flat_list = parallel_runs(execution_parameters, genomes=genomes, totalProcesses=totalProcesses, \ - verbose = False, replicate_generators=replicate_generators) - + flat_list = parallel_runs( + execution_parameters, + genomes=genomes, + totalProcesses=totalProcesses, + verbose=False, + replicate_generators=replicate_generators, + ) + for items in range(len(flat_list)): - W = flat_list[items][0] H = flat_list[items][1] - conv=flat_list[items][2] - #calculate L1, L2 and KL for the solution + conv = flat_list[items][2] + # calculate L1, L2 and KL for the solution est_genome = np.array(np.dot(W, H)) - - similarities = calculate_similarities(genomes, est_genome, sample_names=False)[0].iloc[:,2:] + + similarities = calculate_similarities( + genomes, est_genome, sample_names=False + )[0].iloc[:, 2:] similarities = np.array(np.mean(similarities, axis=0)).T similarities = np.append(similarities, conv) - - results.append([W,H,similarities]) + + results.append([W, H, similarities]) else: - results = parallel_runs(execution_parameters, genomes=genomes, totalProcesses=totalProcesses, \ - verbose = False, replicate_generators=replicate_generators) + results = parallel_runs( + execution_parameters, + genomes=genomes, + totalProcesses=totalProcesses, + verbose=False, + replicate_generators=replicate_generators, + ) toc = time.time() - print ("Time taken to collect {} iterations for {} signatures is {} seconds".format(totalIterations , totalProcesses, round(toc-tic, 2))) - ############################################################################################################################################################################## - ######################################################### The parallel processing ends here ################################################################################## - ############################################################################################################################################################################## - - - ################### Achieve the best clustering by shuffling results list using a few iterations ########## + print( + "Time taken to collect {} iterations for {} signatures is {} seconds".format( + totalIterations, 
totalProcesses, round(toc - tic, 2) + ) + ) + ############################################################################################################################################################################## + ######################################################### The parallel processing ends here ################################################################################## + ############################################################################################################################################################################## + + ################### Achieve the best clustering by shuffling results list using a few iterations ########## Wall = np.zeros((totalMutationTypes, totalProcesses * totalIterations)) Hall = np.zeros((totalProcesses * totalIterations, totalGenomes)) converge_information = np.zeros((totalIterations, 7)) - + finalgenomeErrors = np.zeros((totalMutationTypes, totalGenomes, totalIterations)) - finalgenomesReconstructed = np.zeros((totalMutationTypes, totalGenomes, totalIterations)) - - processCount=0 + finalgenomesReconstructed = np.zeros( + (totalMutationTypes, totalGenomes, totalIterations) + ) + + processCount = 0 for j in range(len(results)): W = results[j][0] H = results[j][1] - converge_information[j,:] = results[j][2][:] - finalgenomeErrors[:, :, j] = genomes - np.dot(W,H) - finalgenomesReconstructed[:, :, j] = np.dot(W,H) - Wall[ :, processCount : (processCount + totalProcesses) ] = W - Hall[ processCount : (processCount + totalProcesses), : ] = H + converge_information[j, :] = results[j][2][:] + finalgenomeErrors[:, :, j] = genomes - np.dot(W, H) + finalgenomesReconstructed[:, :, j] = np.dot(W, H) + Wall[:, processCount : (processCount + totalProcesses)] = W + Hall[processCount : (processCount + totalProcesses), :] = H processCount = processCount + totalProcesses - - - processes=i #renamed the i as "processes" - processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients = \ - cluster_converge_outerloop(Wall, Hall, processes, dist=dist, gpu=gpu, - cluster_rand_seq=cluster_rand_seq, n_cpu=execution_parameters["cpu"]) - reconstruction_error = round(LA.norm(genomes-np.dot(processAvg, exposureAvg), 'fro')/LA.norm(genomes, 'fro'), 2) - - - return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, \ - np.round(clusterSilhouetteCoefficients,3), finalgenomeErrors, finalgenomesReconstructed, \ - Wall, Hall, converge_information, reconstruction_error, processes + processes = i # renamed the i as "processes" + ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) = cluster_converge_outerloop( + Wall, + Hall, + processes, + dist=dist, + gpu=gpu, + cluster_rand_seq=cluster_rand_seq, + n_cpu=execution_parameters["cpu"], + ) + reconstruction_error = round( + LA.norm(genomes - np.dot(processAvg, exposureAvg), "fro") + / LA.norm(genomes, "fro"), + 2, + ) + + return ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + np.round(clusterSilhouetteCoefficients, 3), + finalgenomeErrors, + finalgenomesReconstructed, + Wall, + Hall, + converge_information, + reconstruction_error, + processes, + ) ############################################################### FUNCTIONS TO CALCULATE DISTANCES BETWEEN VECTORS ################################################## ################################################################### FUNCTION ONE 
################################################################### -#function to calculate the cosine similarity +# function to calculate the cosine similarity def cos_sim(a, b): - - - """Takes 2 vectors a, b and returns the cosine similarity according + """Takes 2 vectors a, b and returns the cosine similarity according to the definition of the dot product - - Dependencies: - *Requires numpy library. + + Dependencies: + *Requires numpy library. *Does not require any custom function (constructed by me) - + Required by: * pairwise_cluster_raw - """ - if np.sum(a)==0 or np.sum(b) == 0: - return 0.0 + """ + if np.sum(a) == 0 or np.sum(b) == 0: + return 0.0 dot_product = np.dot(a, b) norm_a = np.linalg.norm(a) norm_b = np.linalg.norm(b) return dot_product / (norm_a * norm_b) + def cor_sim(a, b): - - - """Takes 2 vectors a, b and returns the corrilation similarity according + """Takes 2 vectors a, b and returns the corrilation similarity according to the definition of the dot product - - Dependencies: - *Requires numpy library. + + Dependencies: + *Requires numpy library. *Does not require any custom function (constructed by me) - + Required by: * pairwise_cluster_raw - """ - if np.sum(a)==0 or np.sum(b) == 0: - return 0.0 - corr =1-cor(a, b) + """ + if np.sum(a) == 0 or np.sum(b) == 0: + return 0.0 + corr = 1 - cor(a, b) return corr - ################################################################### FUNCTION ONE ################################################################### -#function to calculate multiple similarities/distances +# function to calculate multiple similarities/distances def calculate_similarities(genomes, est_genomes, sample_names=False): from numpy import inf - - if sample_names is False: - sample_names = ["None"]*genomes.shape[1] - + + if sample_names is False: + sample_names = ["None"] * genomes.shape[1] + cosine_similarity_list = [] kl_divergence_list = [] - correlation_list=[] + correlation_list = [] l1_norm_list = [] l2_norm_list = [] total_mutations_list = [] relative_l1_list = [] relative_l2_list = [] - + for i in range(genomes.shape[1]): - p_i = genomes[:,i] + p_i = genomes[:, i] q_i = est_genomes[:, i] - cosine_similarity_list.append(round(cos_sim(p_i,q_i ),3)) - kl_divergence_list.append(round(scipy.stats.entropy(p_i,q_i),5)) - correlation_list.append(round(scipy.stats.pearsonr(p_i,q_i)[0],3)) - l1_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=1),3)) - relative_l1_list.append(round((l1_norm_list[-1]/np.linalg.norm(p_i, ord=1))*100,3)) - l2_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=2),3)) - relative_l2_list.append(round((l2_norm_list[-1]/np.linalg.norm(p_i, ord=2))*100,3)) + cosine_similarity_list.append(round(cos_sim(p_i, q_i), 3)) + kl_divergence_list.append(round(scipy.stats.entropy(p_i, q_i), 5)) + correlation_list.append(round(scipy.stats.pearsonr(p_i, q_i)[0], 3)) + l1_norm_list.append(round(np.linalg.norm(p_i - q_i, ord=1), 3)) + relative_l1_list.append( + round((l1_norm_list[-1] / np.linalg.norm(p_i, ord=1)) * 100, 3) + ) + l2_norm_list.append(round(np.linalg.norm(p_i - q_i, ord=2), 3)) + relative_l2_list.append( + round((l2_norm_list[-1] / np.linalg.norm(p_i, ord=2)) * 100, 3) + ) total_mutations_list.append(np.sum(p_i)) kl_divergence_list = np.array(kl_divergence_list) - kl_divergence_list[kl_divergence_list == inf] =1000 - similarities_dataframe = pd.DataFrame({"Sample Names": sample_names, \ - "Total Mutations":total_mutations_list, \ - "Cosine Similarity": cosine_similarity_list, \ - "L1 Norm": l1_norm_list, \ - 
"L1_Norm_%":relative_l1_list, \ - "L2 Norm": l2_norm_list, \ - "L2_Norm_%": relative_l2_list, \ - "KL Divergence": kl_divergence_list, \ - "Correlation": correlation_list }) + kl_divergence_list[kl_divergence_list == inf] = 1000 + similarities_dataframe = pd.DataFrame( + { + "Sample Names": sample_names, + "Total Mutations": total_mutations_list, + "Cosine Similarity": cosine_similarity_list, + "L1 Norm": l1_norm_list, + "L1_Norm_%": relative_l1_list, + "L2 Norm": l2_norm_list, + "L2_Norm_%": relative_l2_list, + "KL Divergence": kl_divergence_list, + "Correlation": correlation_list, + } + ) similarities_dataframe = similarities_dataframe.set_index("Sample Names") return [similarities_dataframe, cosine_similarity_list] - - ################################################################ CLUSTERING FUNCTIONS ################################################################ ################################################################### FUNCTION ONE ################################################################### # function to calculate the centroids + ################################################################### FUNCTION ################################################################### -def pairwise_cluster_raw(mat1=([0]), mat2=([0]), dist="cosine"): # the matrices (mat1 and mat2) are used to calculate the clusters and the lsts will be used to store the members of clusters - - ''' Takes a pair of matrices mat1 and mat2 as arguments. Both of the matrices should have the - equal shapes. The function makes a partition based clustering (the number of clusters is equal - to the number of colums of the matrices, and not more column is assigned into a cluster from - a single matrix). It return the list of clusters as "lstCluster" and the list of clusters - based on their indeces of the vectors in their original matrices. Please run the - test of function/example code provided below the for better understanding. - +def pairwise_cluster_raw( + mat1=([0]), mat2=([0]), dist="cosine" +): # the matrices (mat1 and mat2) are used to calculate the clusters and the lsts will be used to store the members of clusters + """Takes a pair of matrices mat1 and mat2 as arguments. Both of the matrices should have the + equal shapes. The function makes a partition based clustering (the number of clusters is equal + to the number of colums of the matrices, and not more column is assigned into a cluster from + a single matrix). It return the list of clusters as "lstCluster" and the list of clusters + based on their indeces of the vectors in their original matrices. Please run the + test of function/example code provided below the for better understanding. 
+ Dependencies: *cos_sim - + Required by: - *pairwise_cluster_init + *pairwise_cluster_init *pairwise_cluster_elong - - - ''' - if dist=="cosine": + + """ + + if dist == "cosine": con_mat = cdist(mat1.T, mat2.T, "cosine") - elif dist=="correlation": + elif dist == "correlation": con_mat = cdist(mat1.T, mat2.T, "correlation") row_ind, col_ind = linear_sum_assignment(con_mat) - idxPair=[] + idxPair = [] for i, j in zip(row_ind, col_ind): - idxPair.append([i,j]) - + idxPair.append([i, j]) + return idxPair + ################################################################### FUNCTION ################################################################### -def reclustering(tempWall=0, tempHall=0, processAvg=0, exposureAvg=0, dist="cosine",gpu=False): +def reclustering( + tempWall=0, tempHall=0, processAvg=0, exposureAvg=0, dist="cosine", gpu=False +): # exposureAvg is not important here. It can be any matrix with the same size of a single exposure matrix - iterations = int(tempWall.shape[1]/processAvg.shape[1]) - processes = processAvg.shape[1] + iterations = int(tempWall.shape[1] / processAvg.shape[1]) + processes = processAvg.shape[1] idxIter = list(range(0, tempWall.shape[1], processes)) - + processes3D = np.zeros([processAvg.shape[1], processAvg.shape[0], iterations]) exposure3D = np.zeros([exposureAvg.shape[0], iterations, exposureAvg.shape[1]]) - - for iteration_number in range(len(idxIter)): - + + for iteration_number in range(len(idxIter)): statidx = idxIter[iteration_number] - loopidx = list(range(statidx, statidx+processes)) - idxPair= pairwise_cluster_raw(mat1=processAvg, mat2=tempWall[:, loopidx], dist=dist) - + loopidx = list(range(statidx, statidx + processes)) + idxPair = pairwise_cluster_raw( + mat1=processAvg, mat2=tempWall[:, loopidx], dist=dist + ) + for cluster_items in idxPair: cluster_number = cluster_items[0] query_idx = cluster_items[1] - processes3D[cluster_number,:,iteration_number]=tempWall[:,statidx+query_idx] - exposure3D[cluster_number, iteration_number, :] = tempHall[statidx+query_idx,:] + processes3D[cluster_number, :, iteration_number] = tempWall[ + :, statidx + query_idx + ] + exposure3D[cluster_number, iteration_number, :] = tempHall[ + statidx + query_idx, : + ] count = 0 - labels=[] + labels = [] clusters = pd.DataFrame() for cluster_id in range(processes3D.shape[0]): - cluster_vectors = pd.DataFrame(processes3D[cluster_id,:,:]) + cluster_vectors = pd.DataFrame(processes3D[cluster_id, :, :]) clusters = clusters.append(cluster_vectors.T) for k in range(cluster_vectors.shape[1]): labels.append(count) - count= count+1 + count = count + 1 try: - if dist=="cosine": - SilhouetteCoefficients = metrics.silhouette_samples(clusters, labels, metric='cosine') - if dist=="correlation": - SilhouetteCoefficients = metrics.silhouette_samples(clusters, labels, metric='correlation') - + if dist == "cosine": + SilhouetteCoefficients = metrics.silhouette_samples( + clusters, labels, metric="cosine" + ) + if dist == "correlation": + SilhouetteCoefficients = metrics.silhouette_samples( + clusters, labels, metric="correlation" + ) + except: - SilhouetteCoefficients = np.ones((len(labels),1)) + SilhouetteCoefficients = np.ones((len(labels), 1)) avgSilhouetteCoefficients = np.mean(SilhouetteCoefficients) - - #clusterSilhouetteCoefficients + + # clusterSilhouetteCoefficients splitByCluster = np.array_split(SilhouetteCoefficients, processes3D.shape[0]) clusterSilhouetteCoefficients = np.array([]) for i in splitByCluster: - - 
clusterSilhouetteCoefficients=np.append(clusterSilhouetteCoefficients, np.mean(i)) - + clusterSilhouetteCoefficients = np.append( + clusterSilhouetteCoefficients, np.mean(i) + ) + processAvg = np.mean(processes3D, axis=2).T processSTE = scipy.stats.sem(processes3D, axis=2, ddof=1).T - exposureAvg = np.mean(exposure3D, axis=1) + exposureAvg = np.mean(exposure3D, axis=1) exposureSTE = scipy.stats.sem(exposure3D, axis=1, ddof=1) - - - return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients -def cluster_converge_innerloop(Wall, Hall, totalprocess, iteration_generator_pair, iteration=1, dist="cosine", gpu=False): - + return ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) + + +def cluster_converge_innerloop( + Wall, + Hall, + totalprocess, + iteration_generator_pair, + iteration=1, + dist="cosine", + gpu=False, +): rng_generator = Generator(PCG64DXSM(iteration_generator_pair[1])) - processAvg = rng_generator.random((Wall.shape[0],totalprocess)) + processAvg = rng_generator.random((Wall.shape[0], totalprocess)) exposureAvg = rng_generator.random((totalprocess, Hall.shape[1])) - + result = 0 convergence_count = 0 while True: - processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients = reclustering(Wall, Hall, processAvg, exposureAvg, dist=dist, gpu=gpu) - + ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) = reclustering(Wall, Hall, processAvg, exposureAvg, dist=dist, gpu=gpu) + if result == avgSilhouetteCoefficients: break elif convergence_count == 10: @@ -784,14 +994,28 @@ def cluster_converge_innerloop(Wall, Hall, totalprocess, iteration_generator_pai else: result = avgSilhouetteCoefficients convergence_count = convergence_count + 1 - - return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients - - -def parallel_clustering(Wall, Hall, totalProcesses, iterations=50, n_cpu=-1, dist= "cosine", gpu=False, cluster_rand_seq=None): - - if n_cpu==-1: + return ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) + + +def parallel_clustering( + Wall, + Hall, + totalProcesses, + iterations=50, + n_cpu=-1, + dist="cosine", + gpu=False, + cluster_rand_seq=None, +): + if n_cpu == -1: pool = multiprocessing.Pool() else: pool = multiprocessing.Pool(processes=n_cpu) @@ -800,317 +1024,649 @@ def parallel_clustering(Wall, Hall, totalProcesses, iterations=50, n_cpu=-1, di sub_rand_generator = cluster_rand_seq.spawn(iterations) iteration_generator_pairs = [] # pair generator with an interation - for i,j in zip(range(iterations), sub_rand_generator): - iteration_generator_pairs.append([i,j]) + for i, j in zip(range(iterations), sub_rand_generator): + iteration_generator_pairs.append([i, j]) - pool_nmf=partial(cluster_converge_innerloop, Wall, Hall, totalProcesses, dist=dist, gpu=gpu) - result_list = pool.map(pool_nmf, iteration_generator_pairs) + pool_nmf = partial( + cluster_converge_innerloop, Wall, Hall, totalProcesses, dist=dist, gpu=gpu + ) + result_list = pool.map(pool_nmf, iteration_generator_pairs) pool.close() pool.join() return result_list + # To select the best clustering converge of the cluster_converge_innerloop -def cluster_converge_outerloop(Wall, Hall, totalprocess, dist="cosine", - gpu=False, cluster_rand_seq=None, 
n_cpu=-1): - - avgSilhouetteCoefficients = -1 # intial avgSilhouetteCoefficients - - #do the parallel clustering - result_list = parallel_clustering(Wall, Hall, totalprocess, iterations=50, - n_cpu=n_cpu, dist=dist, gpu=gpu, - cluster_rand_seq=cluster_rand_seq) - - for i in range(50): # using 10 iterations to get the best clustering - - temp_processAvg, temp_exposureAvg, temp_processSTE, temp_exposureSTE, temp_avgSilhouetteCoefficients, temp_clusterSilhouetteCoefficients = result_list[i][0], result_list[i][1], result_list[i][2], result_list[i][3], result_list[i][4], result_list[i][5] - +def cluster_converge_outerloop( + Wall, Hall, totalprocess, dist="cosine", gpu=False, cluster_rand_seq=None, n_cpu=-1 +): + avgSilhouetteCoefficients = -1 # intial avgSilhouetteCoefficients + + # do the parallel clustering + result_list = parallel_clustering( + Wall, + Hall, + totalprocess, + iterations=50, + n_cpu=n_cpu, + dist=dist, + gpu=gpu, + cluster_rand_seq=cluster_rand_seq, + ) + + for i in range(50): # using 10 iterations to get the best clustering + ( + temp_processAvg, + temp_exposureAvg, + temp_processSTE, + temp_exposureSTE, + temp_avgSilhouetteCoefficients, + temp_clusterSilhouetteCoefficients, + ) = ( + result_list[i][0], + result_list[i][1], + result_list[i][2], + result_list[i][3], + result_list[i][4], + result_list[i][5], + ) + if avgSilhouetteCoefficients < temp_avgSilhouetteCoefficients: - processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients = temp_processAvg, temp_exposureAvg, temp_processSTE, temp_exposureSTE, temp_avgSilhouetteCoefficients, temp_clusterSilhouetteCoefficients - - - return processAvg, exposureAvg, processSTE, exposureSTE, avgSilhouetteCoefficients, clusterSilhouetteCoefficients + ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) = ( + temp_processAvg, + temp_exposureAvg, + temp_processSTE, + temp_exposureSTE, + temp_avgSilhouetteCoefficients, + temp_clusterSilhouetteCoefficients, + ) + + return ( + processAvg, + exposureAvg, + processSTE, + exposureSTE, + avgSilhouetteCoefficients, + clusterSilhouetteCoefficients, + ) ################################################### Generation of probabilities for each processes given to A mutation type ############################################ -def probabilities(W, H, index, allsigids, allcolnames): - - # setting up the indices +def probabilities(W, H, index, allsigids, allcolnames): + # setting up the indices rows = index cols = allcolnames sigs = allsigids - + W = np.array(W) - H= np.array(H) - # rebuild the original matrix from the estimated W and H - genomes = np.dot(W,H) - - + H = np.array(H) + # rebuild the original matrix from the estimated W and H + genomes = np.dot(W, H) + result = 0 - for i in range(H.shape[1]): #here H.shape is the number of sample - - M = genomes[:,i][np.newaxis] - probs = W*H[:,i]/M.T + for i in range(H.shape[1]): # here H.shape is the number of sample + M = genomes[:, i][np.newaxis] + probs = W * H[:, i] / M.T probs = pd.DataFrame(probs) probs.columns = sigs - col1 = [cols[i]]*len(rows) - probs.insert(loc=0, column='Sample Names', value=col1) - probs.insert(loc=1, column='MutationType', value = rows) - if i!=0: + col1 = [cols[i]] * len(rows) + probs.insert(loc=0, column="Sample Names", value=col1) + probs.insert(loc=1, column="MutationType", value=rows) + if i != 0: result = pd.concat([result, probs], axis=0) else: result = probs - - + return result - + + 
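# A minimal, self-contained sketch (not part of this patch) of the normalization that
# probabilities() above performs for a single sample: the signature contributions
# W * H[:, i] are divided by the reconstructed column (W @ H)[:, i], so that for every
# mutation type the probabilities sum to one across signatures. The toy shapes and the
# random seed below are illustrative assumptions only.
import numpy as np

rng = np.random.default_rng(42)
W = rng.random((96, 3))   # signatures: 96 mutation types x 3 signatures
H = rng.random((3, 10))   # activities: 3 signatures x 10 samples

i = 0                                  # pick one sample
M = np.dot(W, H)[:, i][:, np.newaxis]  # reconstructed counts for sample i, shape (96, 1)
probs = W * H[:, i] / M                # per-mutation-type signature probabilities
assert np.allclose(probs.sum(axis=1), 1.0)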
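# A small standalone sketch (not part of this patch) of the matching step used by
# pairwise_cluster_raw() and evaluation() above: columns of two signature matrices are
# paired one-to-one by solving a linear assignment problem over their pairwise cosine
# distances. The matrices mat1 and mat2 below are illustrative assumptions only.
import numpy as np
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
mat1 = rng.random((96, 4))  # e.g. averaged signatures (mutation types x signatures)
mat2 = rng.random((96, 4))  # e.g. signatures from one NMF replicate

con_mat = cdist(mat1.T, mat2.T, "cosine")          # pairwise cosine distances between columns
row_ind, col_ind = linear_sum_assignment(con_mat)  # optimal one-to-one pairing
idxPair = [[i, j] for i, j in zip(row_ind, col_ind)]  # column i of mat1 matched to column j of mat2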
########################################################################################################################################################################## #################################################################### Data loading and Result Exporting ################################################################################### ########################################################################################################################################################################## ########################################################### Functions to load the MATLAB OBJECTS ################### + def extract_arrays(data, field, index=True): accumulator = list() - if index==False: + if index == False: for i in data[field]: accumulator.append(i) - return accumulator - else: + return accumulator + else: for i in data[field]: accumulator.append(i[0][0]) - return accumulator - -def extract_input(data): - originalGenome=data['originalGenomes'] - cancerType = extract_arrays(data, 'cancerType', False) - sampleNames = extract_arrays(data, 'sampleNames', True) - subTypes = extract_arrays(data, 'subtypes', True) - types = extract_arrays(data, 'types', True) + return accumulator + + +def extract_input(data): + originalGenome = data["originalGenomes"] + cancerType = extract_arrays(data, "cancerType", False) + sampleNames = extract_arrays(data, "sampleNames", True) + subTypes = extract_arrays(data, "subtypes", True) + types = extract_arrays(data, "types", True) allFields = [cancerType, originalGenome, sampleNames, subTypes, types] return allFields + ##################################### Get Input From CSV Files ############################################## -def read_csv(filename, folder = False): - if folder==False: + +def read_csv(filename, folder=False): + if folder == False: if type(filename) == str: - genomes = pd.read_csv(filename, sep=",").iloc[:, :] + genomes = pd.read_csv(filename, sep=",").iloc[:, :] else: genomes = filename else: - count = 1 for file in os.listdir(filename): - #print(count) - df = pd.read_csv(filename+"/"+file) - if count==1: - genomes = df + # print(count) + df = pd.read_csv(filename + "/" + file) + if count == 1: + genomes = df else: - genomes = pd.merge(genomes, df, on=["Mutation type", "Trinucleotide"]) + genomes = pd.merge(genomes, df, on=["Mutation type", "Trinucleotide"]) count += 1 mtypes = [str(genomes.shape[0])] - - if mtypes == ['96']: + + if mtypes == ["96"]: # create the list to sort the mutation types - orderlist = ['A[C>A]A', 'A[C>A]C', 'A[C>A]G', 'A[C>A]T', 'A[C>G]A', 'A[C>G]C', 'A[C>G]G', 'A[C>G]T', 'A[C>T]A', 'A[C>T]C', 'A[C>T]G', 'A[C>T]T', - 'A[T>A]A', 'A[T>A]C', 'A[T>A]G', 'A[T>A]T', 'A[T>C]A', 'A[T>C]C', 'A[T>C]G', 'A[T>C]T', 'A[T>G]A', 'A[T>G]C', 'A[T>G]G', 'A[T>G]T', - 'C[C>A]A', 'C[C>A]C', 'C[C>A]G', 'C[C>A]T', 'C[C>G]A', 'C[C>G]C', 'C[C>G]G', 'C[C>G]T', 'C[C>T]A', 'C[C>T]C', 'C[C>T]G', 'C[C>T]T', - 'C[T>A]A', 'C[T>A]C', 'C[T>A]G', 'C[T>A]T', 'C[T>C]A', 'C[T>C]C', 'C[T>C]G', 'C[T>C]T', 'C[T>G]A', 'C[T>G]C', 'C[T>G]G', 'C[T>G]T', - 'G[C>A]A', 'G[C>A]C', 'G[C>A]G', 'G[C>A]T', 'G[C>G]A', 'G[C>G]C', 'G[C>G]G', 'G[C>G]T', 'G[C>T]A', 'G[C>T]C', 'G[C>T]G', 'G[C>T]T', - 'G[T>A]A', 'G[T>A]C', 'G[T>A]G', 'G[T>A]T', 'G[T>C]A', 'G[T>C]C', 'G[T>C]G', 'G[T>C]T', 'G[T>G]A', 'G[T>G]C', 'G[T>G]G', 'G[T>G]T', - 'T[C>A]A', 'T[C>A]C', 'T[C>A]G', 'T[C>A]T', 'T[C>G]A', 'T[C>G]C', 'T[C>G]G', 'T[C>G]T', 'T[C>T]A', 'T[C>T]C', 'T[C>T]G', 'T[C>T]T', - 'T[T>A]A', 'T[T>A]C', 'T[T>A]G', 'T[T>A]T', 'T[T>C]A', 'T[T>C]C', 'T[T>C]G', 
'T[T>C]T', 'T[T>G]A', 'T[T>G]C', 'T[T>G]G', 'T[T>G]T'] - #Contruct the indeces of the matrix - #setting index and columns names of processAvg and exposureAvg - index1 = genomes.iloc[:,1] - index2 = genomes.iloc[:,0] + orderlist = [ + "A[C>A]A", + "A[C>A]C", + "A[C>A]G", + "A[C>A]T", + "A[C>G]A", + "A[C>G]C", + "A[C>G]G", + "A[C>G]T", + "A[C>T]A", + "A[C>T]C", + "A[C>T]G", + "A[C>T]T", + "A[T>A]A", + "A[T>A]C", + "A[T>A]G", + "A[T>A]T", + "A[T>C]A", + "A[T>C]C", + "A[T>C]G", + "A[T>C]T", + "A[T>G]A", + "A[T>G]C", + "A[T>G]G", + "A[T>G]T", + "C[C>A]A", + "C[C>A]C", + "C[C>A]G", + "C[C>A]T", + "C[C>G]A", + "C[C>G]C", + "C[C>G]G", + "C[C>G]T", + "C[C>T]A", + "C[C>T]C", + "C[C>T]G", + "C[C>T]T", + "C[T>A]A", + "C[T>A]C", + "C[T>A]G", + "C[T>A]T", + "C[T>C]A", + "C[T>C]C", + "C[T>C]G", + "C[T>C]T", + "C[T>G]A", + "C[T>G]C", + "C[T>G]G", + "C[T>G]T", + "G[C>A]A", + "G[C>A]C", + "G[C>A]G", + "G[C>A]T", + "G[C>G]A", + "G[C>G]C", + "G[C>G]G", + "G[C>G]T", + "G[C>T]A", + "G[C>T]C", + "G[C>T]G", + "G[C>T]T", + "G[T>A]A", + "G[T>A]C", + "G[T>A]G", + "G[T>A]T", + "G[T>C]A", + "G[T>C]C", + "G[T>C]G", + "G[T>C]T", + "G[T>G]A", + "G[T>G]C", + "G[T>G]G", + "G[T>G]T", + "T[C>A]A", + "T[C>A]C", + "T[C>A]G", + "T[C>A]T", + "T[C>G]A", + "T[C>G]C", + "T[C>G]G", + "T[C>G]T", + "T[C>T]A", + "T[C>T]C", + "T[C>T]G", + "T[C>T]T", + "T[T>A]A", + "T[T>A]C", + "T[T>A]G", + "T[T>A]T", + "T[T>C]A", + "T[T>C]C", + "T[T>C]G", + "T[T>C]T", + "T[T>G]A", + "T[T>G]C", + "T[T>G]G", + "T[T>G]T", + ] + # Contruct the indeces of the matrix + # setting index and columns names of processAvg and exposureAvg + index1 = genomes.iloc[:, 1] + index2 = genomes.iloc[:, 0] index = [] for i, j in zip(index1, index2): - index.append(i[0]+"["+j+"]"+i[2]) - - + index.append(i[0] + "[" + j + "]" + i[2]) + index = np.array(pd.Series(index)) genomes["index"] = index - - - #sort the mutationa types - genomes["index"]= pd.Categorical(genomes["index"], orderlist) + + # sort the mutationa types + genomes["index"] = pd.Categorical(genomes["index"], orderlist) genomes = genomes.sort_values("index") - - - genomes = genomes.iloc[:,2:genomes.shape[1]] - - - - # set the index + + genomes = genomes.iloc[:, 2 : genomes.shape[1]] + + # set the index genomes = genomes.set_index("index") - + # prepare the index and colnames variables index = np.array(orderlist) colnames = np.array(pd.Series(genomes.columns.tolist())) - + else: - - index = np.array(genomes.iloc[:,0]) - genomes = genomes.iloc[:,1:] + index = np.array(genomes.iloc[:, 0]) + genomes = genomes.iloc[:, 1:] genomes = genomes.loc[:, (genomes != 0).any(axis=0)] colnames = genomes.columns - - - - - - - return(genomes, index, colnames, mtypes) + + return (genomes, index, colnames, mtypes) ###################################################################################### Export Results ########################################### -def export_information(loopResults, mutation_type, output, index, colnames, sequence="genome", wall=False): - +def export_information( + loopResults, mutation_type, output, index, colnames, sequence="genome", wall=False +): # get the number of processes i = loopResults[-1] - #get Wall and Hall + # get Wall and Hall Wall = loopResults[-3] Hall = loopResults[-2] - - - - # get the mutational contexts - #print ("The mutaion type is", mutation_type) + + # get the mutational contexts + # print ("The mutaion type is", mutation_type) m = mutation_type # Create the neccessary directories - subdirectory = output+"/All_Solutions/"+mutation_type+"_"+str(i)+"_Signatures" - 
signature_subdirectory=subdirectory+"/Signatures" - activity_subdirectory=subdirectory+"/Activities" - stats_subdirectory=subdirectory+"/Solution_Stats" + subdirectory = ( + output + "/All_Solutions/" + mutation_type + "_" + str(i) + "_Signatures" + ) + signature_subdirectory = subdirectory + "/Signatures" + activity_subdirectory = subdirectory + "/Activities" + stats_subdirectory = subdirectory + "/Solution_Stats" if not os.path.exists(subdirectory): os.makedirs(subdirectory) - os.makedirs(signature_subdirectory) #processes, processSTE and SBS Plots will go here - os.makedirs(activity_subdirectory) #exposures, exposureSTE, TMB plot and activity plot - os.makedirs(stats_subdirectory) #all others - - #preparing the column and row indeces for the Average processes and exposures: + os.makedirs( + signature_subdirectory + ) # processes, processSTE and SBS Plots will go here + os.makedirs( + activity_subdirectory + ) # exposures, exposureSTE, TMB plot and activity plot + os.makedirs(stats_subdirectory) # all others + + # preparing the column and row indeces for the Average processes and exposures: listOfSignatures = [] letters = list(string.ascii_uppercase) - letters.extend([i+b for i in letters for b in letters]) + letters.extend([i + b for i in letters for b in letters]) letters = letters[0:i] - - for j,l in zip(range(i),letters) : - listOfSignatures.append(mutation_type+l) + + for j, l in zip(range(i), letters): + listOfSignatures.append(mutation_type + l) listOfSignatures = np.array(listOfSignatures) - - #Extract the genomes, processAVG, processStabityAvg - genome= loopResults[0] - processAvg= (loopResults[1]) - exposureAvg= (loopResults[2]) + + # Extract the genomes, processAVG, processStabityAvg + genome = loopResults[0] + processAvg = loopResults[1] + exposureAvg = loopResults[2] process_stabililities = np.array(loopResults[6]) - minProcessStability= round(np.min(process_stabililities), 2) + minProcessStability = round(np.min(process_stabililities), 2) meanProcessStability = round(np.mean(process_stabililities), 2) # Calculating and listing the reconstruction error, process stability and signares to make a csv file at the end - reconstruction_error = round(LA.norm(genome-np.dot(processAvg, exposureAvg), 'fro')/LA.norm(genome, 'fro'), 4) - - #First exporting the Average of the processes - processAvg= pd.DataFrame(processAvg) + reconstruction_error = round( + LA.norm(genome - np.dot(processAvg, exposureAvg), "fro") + / LA.norm(genome, "fro"), + 4, + ) + + # First exporting the Average of the processes + processAvg = pd.DataFrame(processAvg) processes = processAvg.set_index(index) processes.columns = listOfSignatures processes = processes.rename_axis("MutationType", axis="columns") - #print(processes) - #print("process are ok", processes) - processes.to_csv(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", "\t", index_label=[processes.columns.name]) - - #Second exporting the Average of the exposures + # print(processes) + # print("process are ok", processes) + processes.to_csv( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + "\t", + index_label=[processes.columns.name], + ) + + # Second exporting the Average of the exposures exposureAvg = pd.DataFrame(exposureAvg.astype(int)) exposures = exposureAvg.set_index(listOfSignatures) exposures.columns = colnames exposures = exposures.T exposures = exposures.rename_axis("Samples", axis="columns") - #print("exposures are ok", exposures) - 
exposures.to_csv(activity_subdirectory+"/"+mutation_type+"_S"+str(i)+"_NMF_Activities.txt", "\t", index_label=[exposures.columns.name]) - - #plt tmb + # print("exposures are ok", exposures) + exposures.to_csv( + activity_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_NMF_Activities.txt", + "\t", + index_label=[exposures.columns.name], + ) + + # plt tmb tmb_exposures = pd.melt(exposures) - tmb.plotTMB(tmb_exposures, scale=sequence, Yrange="adapt", output= activity_subdirectory+"/"+mutation_type+"_S"+str(i)+"_"+"TMB_NMF_plot.pdf") + tmb.plotTMB( + tmb_exposures, + scale=sequence, + Yrange="adapt", + output=activity_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_" + + "TMB_NMF_plot.pdf", + ) del tmb_exposures - - #plot activities - plot_ac.plotActivity(activity_subdirectory+"/"+mutation_type+"_S"+str(i)+"_NMF_Activities.txt", output_file = activity_subdirectory+"/"+mutation_type+"_S"+str(i)+"_"+"NMF_Activity_Plots.pdf", bin_size = 50, log = False) - + + # plot activities + plot_ac.plotActivity( + activity_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_NMF_Activities.txt", + output_file=activity_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_" + + "NMF_Activity_Plots.pdf", + bin_size=50, + log=False, + ) + # get the standard errors of the processes - processSTE = loopResults[3] - #export the processStd file + processSTE = loopResults[3] + # export the processStd file processSTE = pd.DataFrame(processSTE) processSTE = processSTE.set_index(index) processSTE.columns = listOfSignatures processSTE = processSTE.rename_axis("MutationType", axis="columns") - processSTE.to_csv(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures_SEM_Error"+".txt", "\t", float_format='%.2E', index_label=[processes.columns.name]) - - # get the standard errors of the exposures - exposureSTE = loopResults[4] - #export the exposureStd file + processSTE.to_csv( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures_SEM_Error" + + ".txt", + "\t", + float_format="%.2E", + index_label=[processes.columns.name], + ) + + # get the standard errors of the exposures + exposureSTE = loopResults[4] + # export the exposureStd file exposureSTE = pd.DataFrame(exposureSTE) exposureSTE = exposureSTE.set_index(listOfSignatures) exposureSTE.columns = colnames exposureSTE = exposureSTE.T exposureSTE = exposureSTE.rename_axis("Samples", axis="columns") - exposureSTE.to_csv(activity_subdirectory+"/"+mutation_type+"_S"+str(i)+"_NMF_Activities_SEM_Error.txt", "\t", float_format='%.2E', index_label=[exposures.columns.name]) - - all_similarities = loopResults[8].copy() - all_similarities['L1_Norm_%'] = all_similarities['L1_Norm_%'].astype(str) + '%' - all_similarities['L2_Norm_%'] = all_similarities['L2_Norm_%'].astype(str) + '%' - all_similarities.to_csv(stats_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Samples_stats.txt", sep="\t") - - signature_stats = loopResults[9] + exposureSTE.to_csv( + activity_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_NMF_Activities_SEM_Error.txt", + "\t", + float_format="%.2E", + index_label=[exposures.columns.name], + ) + + all_similarities = loopResults[8].copy() + all_similarities["L1_Norm_%"] = all_similarities["L1_Norm_%"].astype(str) + "%" + all_similarities["L2_Norm_%"] = all_similarities["L2_Norm_%"].astype(str) + "%" + all_similarities.to_csv( + stats_subdirectory + "/" + mutation_type + "_S" + str(i) + "_Samples_stats.txt", + sep="\t", + ) + + signature_stats = 
loopResults[9] signature_stats = signature_stats.set_index(listOfSignatures) signature_stats = signature_stats.rename_axis("Signatures", axis="columns") - signature_stats.to_csv(stats_subdirectory+"/"+mutation_type+"_S"+str(i)+"_"+"Signatures_stats.txt", "\t", index_label="Signatures") - - #export convergence information + signature_stats.to_csv( + stats_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_" + + "Signatures_stats.txt", + "\t", + index_label="Signatures", + ) + + # export convergence information converge_information = loopResults[13] converge_information = pd.DataFrame(np.around(converge_information, decimals=3)) - conv_index = list(range(1,len(converge_information)+1)) - colmetrices = ['L1', 'L1 %', 'L2', 'L2 %', 'KL Divergence', "Correlation", "Convergence Iterations"] - converge_information.index = conv_index + conv_index = list(range(1, len(converge_information) + 1)) + colmetrices = [ + "L1", + "L1 %", + "L2", + "L2 %", + "KL Divergence", + "Correlation", + "Convergence Iterations", + ] + converge_information.index = conv_index converge_information.columns = colmetrices - converge_information.to_csv(stats_subdirectory+"/"+mutation_type+"_S"+str(i)+"_"+"NMF_Convergence_Information.txt", "\t", index_label="NMF_Replicate") - + converge_information.to_csv( + stats_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_" + + "NMF_Convergence_Information.txt", + "\t", + index_label="NMF_Replicate", + ) + # export Wall and Hall if "wall" argument is true - if wall==True: - - np.savetxt(stats_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Wall.txt", Wall, delimiter="\t") - np.savetxt(stats_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Hall.txt", Hall, delimiter="\t") - - fh = open(output+"/All_solutions_stat.csv", "a") - print ('The reconstruction error is {}, average process stability is {} and \nthe minimum process stability is {} for {} signatures\n\n'.format(reconstruction_error, round(meanProcessStability,2), round(minProcessStability,2), i)) - fh.write('{}, {}, {}, {}\n'.format(i , round(minProcessStability, 2), round(reconstruction_error, 4), round(meanProcessStability,2))) + if wall == True: + np.savetxt( + stats_subdirectory + "/" + mutation_type + "_S" + str(i) + "_Wall.txt", + Wall, + delimiter="\t", + ) + np.savetxt( + stats_subdirectory + "/" + mutation_type + "_S" + str(i) + "_Hall.txt", + Hall, + delimiter="\t", + ) + + fh = open(output + "/All_solutions_stat.csv", "a") + print( + "The reconstruction error is {}, average process stability is {} and \nthe minimum process stability is {} for {} signatures\n\n".format( + reconstruction_error, + round(meanProcessStability, 2), + round(minProcessStability, 2), + i, + ) + ) + fh.write( + "{}, {}, {}, {}\n".format( + i, + round(minProcessStability, 2), + round(reconstruction_error, 4), + round(meanProcessStability, 2), + ) + ) fh.close() - - - + ########################################### PLOT THE SIGNATURES ################################################ - #prepare the texts lists: + # prepare the texts lists: stability_list = signature_plotting_text(loopResults[6], "Stability", "float") - total_mutation_list = signature_plotting_text(loopResults[7], "Sig. 
Mutations", "integer") - - - - if m=="DBS78": - plot.plotDBS(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/" , "S"+str(i), "78", True, custom_text_upper=stability_list, custom_text_middle=total_mutation_list) - elif m=="ID83": - plot.plotID(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/" , "S"+str(i), "83", True, custom_text_upper=stability_list, custom_text_middle=total_mutation_list) - elif m=="CNV48": - plot.plotCNV(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/" , "S"+str(i), percentage=True, aggregate=False) - elif m=="SV32": - plot.plotSV(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/" , "S"+str(i), percentage=True, aggregate=False) - elif m=="SBS96" or m=="SBS288" or m=="SBS384" or m=="SBS1536" or m=="SBS4608": + total_mutation_list = signature_plotting_text( + loopResults[7], "Sig. Mutations", "integer" + ) + + if m == "DBS78": + plot.plotDBS( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + "78", + True, + custom_text_upper=stability_list, + custom_text_middle=total_mutation_list, + ) + elif m == "ID83": + plot.plotID( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + "83", + True, + custom_text_upper=stability_list, + custom_text_middle=total_mutation_list, + ) + elif m == "CNV48": + plot.plotCNV( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + percentage=True, + aggregate=False, + ) + elif m == "SV32": + plot.plotSV( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + percentage=True, + aggregate=False, + ) + elif ( + m == "SBS96" + or m == "SBS288" + or m == "SBS384" + or m == "SBS1536" + or m == "SBS4608" + ): # parse 'm' to be accepted by the plotSBS function tmp_m = m if m.startswith("SBS"): tmp_m = m[3:] - if m=="SBS96" or m=="SBS288" or m=="SBS384" or m=="SBS1536": - plot.plotSBS(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/", "S"+str(i), tmp_m, True, custom_text_upper=stability_list, custom_text_middle=total_mutation_list) - elif m=="SBS4608": - plot.plotSBS(signature_subdirectory+"/"+mutation_type+"_S"+str(i)+"_Signatures"+".txt", signature_subdirectory+"/Signature_plot/", "S"+str(i), tmp_m, True) + if m == "SBS96" or m == "SBS288" or m == "SBS384" or m == "SBS1536": + plot.plotSBS( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + tmp_m, + True, + custom_text_upper=stability_list, + custom_text_middle=total_mutation_list, + ) + elif m == "SBS4608": + plot.plotSBS( + signature_subdirectory + + "/" + + mutation_type + + "_S" + + str(i) + + "_Signatures" + + ".txt", + signature_subdirectory + "/Signature_plot/", + "S" + str(i), + tmp_m, + True, + ) else: custom_signatures_plot(processes, signature_subdirectory) @@ -1125,273 +1681,359 @@ def dendrogram(data, 
threshold, layer_directory): colnames = data.columns data = np.array(data) - Z = hierarchy.linkage(data.T, 'single', 'cosine') + Z = hierarchy.linkage(data.T, "single", "cosine") plt.figure(figsize=(15, 9)) - dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold) - plt.title("Clustering of Samples Based on Mutational Signatures" ) + dn = hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold) + plt.title("Clustering of Samples Based on Mutational Signatures") plt.ylabel("Cosine Distance") plt.xlabel("Sample IDs") - #plt.ylim((0,1)) - plt.savefig(layer_directory+'/dendrogram.pdf',figsize=(10, 8), dpi=300) + # plt.ylim((0,1)) + plt.savefig(layer_directory + "/dendrogram.pdf", figsize=(10, 8), dpi=300) # which datapoints goes to which cluster - # The indices of the datapoints will be displayed as the ids - Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None) - dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)}) + # The indices of the datapoints will be displayed as the ids + Y = hierarchy.fcluster(Z, threshold, criterion="distance", R=None, monocrit=None) + dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)}) dataframe = dataframe.set_index("Sample Names") - #print(dataframe) - dictionary = {"clusters":Y, "informations":dn} - - return dataframe + # print(dataframe) + dictionary = {"clusters": Y, "informations": dn} + + return dataframe -######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### -def stabVsRError(csvfile, output, title, all_similarities_list, input_type="csvfile", stability=0.8, min_stability=0.2, combined_stability=1.0, mtype= "", statistics=True, select=None, sequence="genome", allow_stability_drop=False): - - if input_type=="csvfile": +######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### +def stabVsRError( + csvfile, + output, + title, + all_similarities_list, + input_type="csvfile", + stability=0.8, + min_stability=0.2, + combined_stability=1.0, + mtype="", + statistics=True, + select=None, + sequence="genome", + allow_stability_drop=False, +): + if input_type == "csvfile": data = pd.read_csv(csvfile, sep=",") - elif input_type=="dataframe": - data=csvfile - - #exracting and preparing other similiry matrices from the all_similarities_list - mean_cosine_dist=[]; max_cosine_dist=[]; mean_l1 = []; maximum_l1 = []; mean_l2 = []; maximum_l2 = []; mean_kl = []; maximum_kl = []; mean_correlation=[]; minimum_correlation=[]; #all_mean_l2=[]; - #median_l1= Median L1 , maximum _l1 = Maximum L1, median_l2= Median L2 , maximum _l2 = Maximum L2, median_kl= Median KL , maximum _kl = Maximum KL, wilcoxontest = Wilcoxontest significance (True or False); all_med_l2=[] = list of all Median L2 - #statistical test we need to set a previous L2 (pre_l2) which have a same length as the sample number - - selection_data=data.copy() - selection_data["total_stab"]=data["Stability"]+data['avgStability'] - + elif input_type == "dataframe": + data = csvfile + + # exracting and preparing other similiry matrices from the all_similarities_list + mean_cosine_dist = [] + max_cosine_dist = [] + mean_l1 = [] + maximum_l1 = [] + mean_l2 = [] + maximum_l2 = [] + mean_kl = [] + maximum_kl = [] + mean_correlation = [] + minimum_correlation = [] + # all_mean_l2=[]; + # 
median_l1= Median L1 , maximum _l1 = Maximum L1, median_l2= Median L2 , maximum _l2 = Maximum L2, median_kl= Median KL , maximum _kl = Maximum KL, wilcoxontest = Wilcoxontest significance (True or False); all_med_l2=[] = list of all Median L2 + # statistical test we need to set a previous L2 (pre_l2) which have a same length as the sample number + + selection_data = data.copy() + selection_data["total_stab"] = data["Stability"] + data["avgStability"] + try: - selection_data_to_sort=selection_data[selection_data["total_stab"]>=combined_stability] - selection_data_to_sort=selection_data_to_sort[selection_data_to_sort["Stability"]>=min_stability] - selection_data_to_sort=selection_data_to_sort[selection_data_to_sort['avgStability']>=stability] + selection_data_to_sort = selection_data[ + selection_data["total_stab"] >= combined_stability + ] + selection_data_to_sort = selection_data_to_sort[ + selection_data_to_sort["Stability"] >= min_stability + ] + selection_data_to_sort = selection_data_to_sort[ + selection_data_to_sort["avgStability"] >= stability + ] highest_stable_idx = selection_data_to_sort.index[-1] - except: #if there is no solution over thresh-hold - selection_data_to_sort=selection_data + except: # if there is no solution over thresh-hold + selection_data_to_sort = selection_data highest_stable_idx = selection_data.index[0] - print("There is no signature over the thresh-hold stability. We are selecting the lowest possible number of signatures.") - - highest_stable_signature=list(selection_data["Total Signatures"])[highest_stable_idx] - selection_data=selection_data_to_sort.sort_values(by=['avgStability', 'Total Signatures'], ascending=[False, True]) - resorted_idx=list(selection_data.index) - default_idx=resorted_idx.index(highest_stable_idx) + print( + "There is no signature over the thresh-hold stability. We are selecting the lowest possible number of signatures." 
+ ) + + highest_stable_signature = list(selection_data["Total Signatures"])[ + highest_stable_idx + ] + selection_data = selection_data_to_sort.sort_values( + by=["avgStability", "Total Signatures"], ascending=[False, True] + ) + resorted_idx = list(selection_data.index) + default_idx = resorted_idx.index(highest_stable_idx) if allow_stability_drop == False: - selected_resorted_idx=resorted_idx[0:default_idx+1] + selected_resorted_idx = resorted_idx[0 : default_idx + 1] else: - selected_resorted_idx=resorted_idx + selected_resorted_idx = resorted_idx selected_resorted_idx.sort() - idx_within_thresh_hold=highest_stable_idx - signatures_within_thresh_hold=highest_stable_signature - #create probability list - probabilities=["N/A"]*len(all_similarities_list) - stable_solutions=["NO"]*len(all_similarities_list) - - for values in range(len(all_similarities_list)): # loop through the opposite direction - all_similarities = all_similarities_list[values].iloc[:,[1,3,5,6,7]] - - cosine_distance=1-all_similarities["Cosine Similarity"] - mean_cosine_dist.append(round(cosine_distance.mean(),3)) - max_cosine_dist.append(round(cosine_distance.max(),3)) - mean_l1.append(round(all_similarities["L1_Norm_%"].median(),2)) - maximum_l1.append(round(all_similarities["L1_Norm_%"].max(),2)) - mean_l2.append(round(all_similarities["L2_Norm_%"].median(),2)) - maximum_l2.append(round(all_similarities["L2_Norm_%"].max(),2)) - mean_kl.append(round(all_similarities["KL Divergence"].mean(), 4)) - maximum_kl.append(round(all_similarities["KL Divergence"].max(), 4)) - mean_correlation.append(round(all_similarities["Correlation"].mean(), 3)) - minimum_correlation.append(round(all_similarities["Correlation"].min(), 3)) - - - - all_similarities = all_similarities_list[idx_within_thresh_hold].iloc[:,[1,3,5,6,7]] - #record the statistical test between the l2_of the current and previous signatures first + idx_within_thresh_hold = highest_stable_idx + signatures_within_thresh_hold = highest_stable_signature + # create probability list + probabilities = ["N/A"] * len(all_similarities_list) + stable_solutions = ["NO"] * len(all_similarities_list) + + for values in range( + len(all_similarities_list) + ): # loop through the opposite direction + all_similarities = all_similarities_list[values].iloc[:, [1, 3, 5, 6, 7]] + + cosine_distance = 1 - all_similarities["Cosine Similarity"] + mean_cosine_dist.append(round(cosine_distance.mean(), 3)) + max_cosine_dist.append(round(cosine_distance.max(), 3)) + mean_l1.append(round(all_similarities["L1_Norm_%"].median(), 2)) + maximum_l1.append(round(all_similarities["L1_Norm_%"].max(), 2)) + mean_l2.append(round(all_similarities["L2_Norm_%"].median(), 2)) + maximum_l2.append(round(all_similarities["L2_Norm_%"].max(), 2)) + mean_kl.append(round(all_similarities["KL Divergence"].mean(), 4)) + maximum_kl.append(round(all_similarities["KL Divergence"].max(), 4)) + mean_correlation.append(round(all_similarities["Correlation"].mean(), 3)) + minimum_correlation.append(round(all_similarities["Correlation"].min(), 3)) + + all_similarities = all_similarities_list[idx_within_thresh_hold].iloc[ + :, [1, 3, 5, 6, 7] + ] + # record the statistical test between the l2_of the current and previous signatures first init_l2_dist = all_similarities["L2_Norm_%"] init_mean = all_similarities["L2_Norm_%"].median() - probabilities[idx_within_thresh_hold]="Most Stab Sigs" - stable_solutions[idx_within_thresh_hold]="YES" - #select the final solution. 
Don't go to the while loop if there is not more than 1 signatures over thresh-hold
-    list_of_idx_over_thresh_hold=selected_resorted_idx
-    strating_idx=len(list_of_idx_over_thresh_hold)-1
-    if len(list_of_idx_over_thresh_hold)>1 and statistics==True:
+    probabilities[idx_within_thresh_hold] = "Most Stab Sigs"
+    stable_solutions[idx_within_thresh_hold] = "YES"
+    # select the final solution. Don't go to the while loop if there is not more than 1 signatures over thresh-hold
+    list_of_idx_over_thresh_hold = selected_resorted_idx
+    strating_idx = len(list_of_idx_over_thresh_hold) - 1
+    if len(list_of_idx_over_thresh_hold) > 1 and statistics == True:
         while True:
-            idx_within_thresh_hold=list_of_idx_over_thresh_hold[strating_idx-1]
-            signatures_within_thresh_hold =signatures_within_thresh_hold-(list_of_idx_over_thresh_hold[strating_idx]-list_of_idx_over_thresh_hold[strating_idx-1]) #get the difference between current and previous stable signatures
-            all_similarities = all_similarities_list[idx_within_thresh_hold].iloc[:,[1,3,5,6,7]]
+            idx_within_thresh_hold = list_of_idx_over_thresh_hold[strating_idx - 1]
+            signatures_within_thresh_hold = signatures_within_thresh_hold - (
+                list_of_idx_over_thresh_hold[strating_idx]
+                - list_of_idx_over_thresh_hold[strating_idx - 1]
+            )  # get the difference between current and previous stable signatures
+            all_similarities = all_similarities_list[idx_within_thresh_hold].iloc[
+                :, [1, 3, 5, 6, 7]
+            ]
             current_l2_dist = all_similarities["L2_Norm_%"]
             current_mean = all_similarities["L2_Norm_%"].median()
             wiltest = ranksums(np.array(init_l2_dist), np.array(current_l2_dist))[1]
             # p_value threshold for wiltest
-            if sequence=="exome":
+            if sequence == "exome":
                 wiltest_thr = 1.0
             else:
                 wiltest_thr = 0.05
-            probabilities[idx_within_thresh_hold]="{:.2e}".format(wiltest)
-            stable_solutions[idx_within_thresh_hold]="YES"
-            if (wiltest<wiltest_thr) and (current_mean-init_mean>0) or idx_within_thresh_hold==0:
-                final_solution=signatures_within_thresh_hold+(list_of_idx_over_thresh_hold[strating_idx]-list_of_idx_over_thresh_hold[strating_idx-1]) #select the previous stable signatures
+            probabilities[idx_within_thresh_hold] = "{:.2e}".format(wiltest)
+            stable_solutions[idx_within_thresh_hold] = "YES"
+            if (
+                (wiltest < wiltest_thr)
+                and (current_mean - init_mean > 0)
+                or idx_within_thresh_hold == 0
+            ):
+                final_solution = signatures_within_thresh_hold + (
+                    list_of_idx_over_thresh_hold[strating_idx]
+                    - list_of_idx_over_thresh_hold[strating_idx - 1]
+                )  # select the previous stable signatures
                 break
-            strating_idx=strating_idx-1
-
+            strating_idx = strating_idx - 1
+
     else:
-        final_solution=signatures_within_thresh_hold
-
-    if type(select)!=type(None):
-        final_solution=select
-
-    data.iloc[:,2] = np.round(data.iloc[:,2]*100, 2)
-    data = data.assign(**{'Mean Sample L1%': mean_l1,
-                          'Maximum Sample L1%': maximum_l1,
-                          'Mean Sample L2%': mean_l2,
-                          'Maximum Sample L2%': maximum_l2,
-                          'Mean Sample KL': mean_kl,
-                          'Maximum Sample KL': maximum_kl,
-                          "Mean Cosine Distance":mean_cosine_dist,
-                          "Max Cosine Distance":max_cosine_dist,
-                          "Mean Correlation":mean_correlation,
-                          "Minimum Correlation":minimum_correlation})
-
-
-    data=data.rename(columns = {'Stability': 'Minimum Stability'})
+        final_solution = signatures_within_thresh_hold
+
+    if type(select) != type(None):
+        final_solution = select
+
+    data.iloc[:, 2] = np.round(data.iloc[:, 2] * 100, 2)
+    data = data.assign(
+        **{
+            "Mean Sample L1%": mean_l1,
+            "Maximum Sample L1%": maximum_l1,
+            "Mean Sample L2%": mean_l2,
+            "Maximum Sample L2%": maximum_l2,
+            "Mean Sample KL": mean_kl,
+            
"Maximum Sample KL": maximum_kl, + "Mean Cosine Distance": mean_cosine_dist, + "Max Cosine Distance": max_cosine_dist, + "Mean Correlation": mean_correlation, + "Minimum Correlation": minimum_correlation, + } + ) + + data = data.rename(columns={"Stability": "Minimum Stability"}) data = data.set_index("Total Signatures") - - - #get the solution + + # get the solution probable_solutions = data.copy() - avg_stability = data.iloc[:,2] + avg_stability = data.iloc[:, 2] data = data.drop(columns="avgStability") - try: + try: alternative_solution = int(final_solution) except: alternative_solution = int(data.index[0]) - print("There is no solution over the thresh-hold minimum stability. We are selecting the minimum number of signature which could be wrong.") -#---------------------------- alternative solution end -------------------------------------------------# + print( + "There is no solution over the thresh-hold minimum stability. We are selecting the minimum number of signature which could be wrong." + ) + # ---------------------------- alternative solution end -------------------------------------------------# # Create some mock data - + t = np.array(data.index).astype(int) - - #data1 = np.array(data.iloc[:,2]) #reconstruction error + + # data1 = np.array(data.iloc[:,2]) #reconstruction error data1 = np.array(mean_cosine_dist) - data2 = np.array(avg_stability) #process stability - shadow_alternative_start = alternative_solution-0.2 - shadow_alternative_end=alternative_solution+0.2 - fig, ax1 = plt.subplots(num=None, figsize=(10, 6), dpi=300, facecolor='w', edgecolor='k') - color = 'tab:red' - ax1.set_xlabel('Total Signatures') - ax1.set_ylabel('Mean Sample Cosine Distance', color=color) + data2 = np.array(avg_stability) # process stability + shadow_alternative_start = alternative_solution - 0.2 + shadow_alternative_end = alternative_solution + 0.2 + fig, ax1 = plt.subplots( + num=None, figsize=(10, 6), dpi=300, facecolor="w", edgecolor="k" + ) + color = "tab:red" + ax1.set_xlabel("Total Signatures") + ax1.set_ylabel("Mean Sample Cosine Distance", color=color) ax1.set_title(title) - lns1 = ax1.plot(t, data1, marker='o', linestyle=":", color=color, label = 'Mean Sample Cosine Distance') - ax1.tick_params(axis='y', labelcolor=color) - ax1.xaxis.set_ticks(np.arange(min(t), max(t)+1, 1)) - ax1.axvspan(shadow_alternative_start, shadow_alternative_end, alpha=0.20, color='#696969') - # manipulate the y-axis values into percentage + lns1 = ax1.plot( + t, + data1, + marker="o", + linestyle=":", + color=color, + label="Mean Sample Cosine Distance", + ) + ax1.tick_params(axis="y", labelcolor=color) + ax1.xaxis.set_ticks(np.arange(min(t), max(t) + 1, 1)) + ax1.axvspan( + shadow_alternative_start, shadow_alternative_end, alpha=0.20, color="#696969" + ) + # manipulate the y-axis values into percentage vals = ax1.get_yticks() # ax1.set_xticklabels(np.arange(min(t), max(t)+1, 1),list(), rotation=30) - ax1.set_xticklabels(list(np.arange(min(t), max(t)+1, 1)), rotation=30) + ax1.set_xticklabels(list(np.arange(min(t), max(t) + 1, 1)), rotation=30) ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis - color = 'tab:blue' - ax2.set_ylabel('Avg Stability', color=color) # we already handled the x-label with ax1 - lns2 = ax2.plot(t, data2, marker='s', linestyle="-.", color=color, label = 'Avg Stability') - ax2.tick_params(axis='y', labelcolor=color) + color = "tab:blue" + ax2.set_ylabel( + "Avg Stability", color=color + ) # we already handled the x-label with ax1 + lns2 = ax2.plot( + t, data2, 
marker="s", linestyle="-.", color=color, label="Avg Stability" + ) + ax2.tick_params(axis="y", labelcolor=color) fig.tight_layout() # otherwise the right y-label is slightly clipped - #plt.show() - lns = lns1+lns2 + # plt.show() + lns = lns1 + lns2 labs = [l.get_label() for l in lns] - ax1.legend(lns, labs, loc=0) - plt.savefig(output+'/'+mtype+'_selection_plot.pdf') + ax1.legend(lns, labs, loc=0) + plt.savefig(output + "/" + mtype + "_selection_plot.pdf") plt.close() - #put * in the selected solution + # put * in the selected solution index = data.index.astype(int) index = list(index.astype(str)) solution = get_indeces(index, [str(alternative_solution)])[0] - index[solution] = index[solution]+"*" + index[solution] = index[solution] + "*" data.index = index - - + # add % signs - data.insert(1,'Considerable Solution', stable_solutions) - data.insert(2,'P-value', probabilities) - data.iloc[:,3:7] = data.iloc[:,3:7].astype(str) + '%' + data.insert(1, "Considerable Solution", stable_solutions) + data.insert(2, "P-value", probabilities) + data.iloc[:, 3:7] = data.iloc[:, 3:7].astype(str) + "%" data = data.reset_index() - columns_list=list(data.columns) - columns_list[0]="Signatures" - data.columns=columns_list + columns_list = list(data.columns) + columns_list[0] = "Signatures" + data.columns = columns_list return alternative_solution, data + + ######################################## Plot Samples #################################################### -def plot_csv_sbs_samples(filename, output_path, project, mtype="96", percentage=False, custom_text_upper=" " ): - data, index, colnames, _ = read_csv(filename, folder = False) - data.index.names= ["MutationType"] +def plot_csv_sbs_samples( + filename, output_path, project, mtype="96", percentage=False, custom_text_upper=" " +): + data, index, colnames, _ = read_csv(filename, folder=False) + data.index.names = ["MutationType"] data.to_csv("new_file.text", sep="\t") - plot.plotSBS("new_file.text", output_path, project, mtype, False, custom_text_upper=" ") + plot.plotSBS( + "new_file.text", output_path, project, mtype, False, custom_text_upper=" " + ) os.remove("new_file.text") - -def evaluation(true_sigs, est_sigs, cutoff = 0.9, dist="cos", verbose=False): - if true_sigs.shape[1]>=est_sigs.shape[1]: + + +def evaluation(true_sigs, est_sigs, cutoff=0.9, dist="cos", verbose=False): + if true_sigs.shape[1] >= est_sigs.shape[1]: mat1 = est_sigs mat2 = true_sigs else: mat1 = true_sigs mat2 = est_sigs - - if dist=="cos": + + if dist == "cos": con_mat = cdist(mat1.T, mat2.T, "cosine") - elif dist=="cor": + elif dist == "cor": con_mat = cdist(mat1.T, mat2.T, "correlation") row_ind, col_ind = linear_sum_assignment(con_mat) - #print(con_mat[row_ind, col_ind].sum()) - con_mat=1-con_mat - idxPair= {} - true_positives=0 - for x,y in zip(row_ind, col_ind): - idxPair[x]=y - if con_mat[x,y]>=cutoff: - true_positives+=1 - - computedFalsePositives=mat1.shape[1]-true_positives - #print(computedFalsePositives) - computedFalseNegatives=computedFalsePositives - #print(computedFalseNegatives) - if true_sigs.shape[1]>=est_sigs.shape[1]: - baseFalsePositives=0 - baseFalseNegatives=true_sigs.shape[1]-est_sigs.shape[1] - - elif est_sigs.shape[1]>true_sigs.shape[1]: - baseFalsePositives=est_sigs.shape[1]-true_sigs.shape[1] - baseFalseNegatives=0 - - false_positives=baseFalsePositives+computedFalsePositives - false_negatives=baseFalseNegatives+computedFalseNegatives + # print(con_mat[row_ind, col_ind].sum()) + con_mat = 1 - con_mat + idxPair = {} + true_positives = 0 + for x, y 
in zip(row_ind, col_ind): + idxPair[x] = y + if con_mat[x, y] >= cutoff: + true_positives += 1 + + computedFalsePositives = mat1.shape[1] - true_positives + # print(computedFalsePositives) + computedFalseNegatives = computedFalsePositives + # print(computedFalseNegatives) + if true_sigs.shape[1] >= est_sigs.shape[1]: + baseFalsePositives = 0 + baseFalseNegatives = true_sigs.shape[1] - est_sigs.shape[1] + + elif est_sigs.shape[1] > true_sigs.shape[1]: + baseFalsePositives = est_sigs.shape[1] - true_sigs.shape[1] + baseFalseNegatives = 0 + + false_positives = baseFalsePositives + computedFalsePositives + false_negatives = baseFalseNegatives + computedFalseNegatives number_of_ground_truth_signatures = true_sigs.shape[1] number_of_detected_signature = est_sigs.shape[1] try: - precision = round(true_positives/(true_positives+false_positives),2) - recall = round(true_positives/(true_positives+false_negatives),2) - f1_score = round(2*precision*recall/(precision+recall),2) + precision = round(true_positives / (true_positives + false_positives), 2) + recall = round(true_positives / (true_positives + false_negatives), 2) + f1_score = round(2 * precision * recall / (precision + recall), 2) except: precision = 0 recall = 0 f1_score = 0 - - return number_of_ground_truth_signatures, number_of_detected_signature, true_positives, false_positives, false_negatives, precision, recall, f1_score, idxPair + + return ( + number_of_ground_truth_signatures, + number_of_detected_signature, + true_positives, + false_positives, + false_negatives, + precision, + recall, + f1_score, + idxPair, + ) + def custom_signatures_plot(signatures, output): - with PdfPages(output+'/Custom_Signature_Plots.pdf') as pdf: + with PdfPages(output + "/Custom_Signature_Plots.pdf") as pdf: plt.figure(figsize=(10, 3)) - plt.bar(list(range(1,1+len(signatures.iloc[:,0]))),signatures.iloc[:,0]) - plt.title('Custom Signature {}'.format(0+1)) + plt.bar(list(range(1, 1 + len(signatures.iloc[:, 0]))), signatures.iloc[:, 0]) + plt.title("Custom Signature {}".format(0 + 1)) plt.xticks([]) plt.xlabel("Mutation Types") plt.ylabel("Probabilities") pdf.savefig() # saves the current figure into a pdf page plt.close() - for i in range(1,signatures.shape[1]): + for i in range(1, signatures.shape[1]): # if LaTeX is not installed or error caught, change to `usetex=False` - plt.rc('text', usetex=False) + plt.rc("text", usetex=False) plt.figure(figsize=(10, 3)) - plt.bar(list(range(1, 1+len(signatures.iloc[:,i]))),signatures.iloc[:,i]) - plt.title('Custom Signature {}'.format(i+1)) + plt.bar( + list(range(1, 1 + len(signatures.iloc[:, i]))), signatures.iloc[:, i] + ) + plt.title("Custom Signature {}".format(i + 1)) plt.xticks([]) plt.xlabel("Mutation Types") plt.ylabel("Probabilities") - pdf.attach_note("signature plots") + pdf.attach_note("signature plots") pdf.savefig() plt.close() - diff --git a/install_genome.py b/install_genome.py index b3c56be..1124b4a 100644 --- a/install_genome.py +++ b/install_genome.py @@ -2,9 +2,11 @@ import sys from SigProfilerMatrixGenerator import install as genInstall + def install_ref(ref_path): - genInstall.install('GRCh37', offline_files_path=ref_path) + genInstall.install("GRCh37", offline_files_path=ref_path) + -if __name__=="__main__": - ref_path=sys.argv[1] - install_ref(ref_path) \ No newline at end of file +if __name__ == "__main__": + ref_path = sys.argv[1] + install_ref(ref_path) diff --git a/setup.py b/setup.py index b78f188..4f21105 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,18 @@ import sys import subprocess 
-#remove the dist folder first if exists +# remove the dist folder first if exists if os.path.exists("dist"): shutil.rmtree("dist") -VERSION = '1.1.23' +VERSION = "1.1.23" -with open('README.md') as f: - long_description = f.read() +with open("README.md") as f: + long_description = f.read() -def write_version_py(filename='SigProfilerExtractor/version.py'): + +def write_version_py(filename="SigProfilerExtractor/version.py"): # Copied from numpy setup.py cnt = """ # THIS FILE IS GENERATED FROM SIGPROFILEREXTRACTOR SETUP.PY @@ -23,59 +24,88 @@ def write_version_py(filename='SigProfilerExtractor/version.py'): Update = 'Upgrade v1.1.23: Update to use the latest SigProfilerAssignment with COSMIC v3.4 signatures.' """ - fh = open(filename, 'w') - fh.write(cnt % {'version': VERSION,}) + fh = open(filename, "w") + fh.write( + cnt + % { + "version": VERSION, + } + ) fh.close() -requirements=[ - 'scipy>=1.6.3', - 'torch>=1.8.1', - 'numpy>=1.21.2', - 'pandas>=1.2.4', - 'nimfa>=1.1.0', - 'SigProfilerMatrixGenerator>=1.2.17', - 'sigProfilerPlotting>=1.3.16', - 'SigProfilerAssignment>=0.1.0', - 'pillow', - 'statsmodels>=0.9.0', - 'scikit-learn>=0.24.2', - 'psutil>=5.6.1', - 'reportlab>=3.5.42', - 'PyPDF2>=1.26.0' - ] -operating_system = sys.platform + +requirements = [ + "scipy>=1.6.3", + "torch>=1.8.1", + "numpy>=1.21.2", + "pandas>=1.2.4", + "nimfa>=1.1.0", + "SigProfilerMatrixGenerator>=1.2.17", + "sigProfilerPlotting>=1.3.16", + "SigProfilerAssignment>=0.1.0", + "pillow", + "statsmodels>=0.9.0", + "scikit-learn>=0.24.2", + "psutil>=5.6.1", + "reportlab>=3.5.42", + "PyPDF2>=1.26.0", +] + +operating_system = sys.platform print(operating_system) -if operating_system in ['win32','cygwin','windows']: - requirements.remove('matplotlib>=3.3.0') - requirements.remove('torch==1.5.1') - print('Trying to install pytorch!') +if operating_system in ["win32", "cygwin", "windows"]: + requirements.remove("matplotlib>=3.3.0") + requirements.remove("torch==1.5.1") + print("Trying to install pytorch!") code = 1 try: - code = subprocess.call(['pip', 'install', 'torch===1.5.1+cpu', '-f', 'https://download.pytorch.org/whl/torch_stable.html']) + code = subprocess.call( + [ + "pip", + "install", + "torch===1.5.1+cpu", + "-f", + "https://download.pytorch.org/whl/torch_stable.html", + ] + ) if code != 0: - raise Exception('Torch instalation failed !') + raise Exception("Torch instalation failed !") except: try: - code = subprocess.call(['pip3', 'install', 'torch===1.5.1+cpu', '-f', 'https://download.pytorch.org/whl/torch_stable.html']) + code = subprocess.call( + [ + "pip3", + "install", + "torch===1.5.1+cpu", + "-f", + "https://download.pytorch.org/whl/torch_stable.html", + ] + ) if code != 0: - raise Exception('Torch instalation failed !') + raise Exception("Torch instalation failed !") except: - print('Failed to install pytorch, please install pytorch manually be following the simple instructions over at: https://pytorch.org/get-started/locally/') + print( + "Failed to install pytorch, please install pytorch manually be following the simple instructions over at: https://pytorch.org/get-started/locally/" + ) if code == 0: - print('Successfully installed pytorch version! (If you need the GPU version, please install it manually, checkout the mindsdb docs and the pytroch docs if you need help)') - - + print( + "Successfully installed pytorch version! 
(If you need the GPU version, please install it manually, checkout the mindsdb docs and the pytroch docs if you need help)" + ) + + write_version_py() -setup(name='SigProfilerExtractor', - version=VERSION, - description='Extracts mutational signatures from mutational catalogues', - long_description=long_description, - long_description_content_type='text/markdown', # This is important! - url="https://github.com/AlexandrovLab/SigProfilerExtractor.git", - author='S Mishu Ashiqul Islam', - author_email='m0islam@ucsd.edu', - license='UCSD', - packages=['SigProfilerExtractor'], - install_requires=requirements, - include_package_data=True, - zip_safe=False) +setup( + name="SigProfilerExtractor", + version=VERSION, + description="Extracts mutational signatures from mutational catalogues", + long_description=long_description, + long_description_content_type="text/markdown", # This is important! + url="https://github.com/AlexandrovLab/SigProfilerExtractor.git", + author="S Mishu Ashiqul Islam", + author_email="m0islam@ucsd.edu", + license="UCSD", + packages=["SigProfilerExtractor"], + install_requires=requirements, + include_package_data=True, + zip_safe=False, +) diff --git a/test.py b/test.py index 2fbb941..07afe3b 100755 --- a/test.py +++ b/test.py @@ -9,62 +9,145 @@ import SigProfilerExtractor as spe_mod import os + def run_matrix_96(): data = sig.importdata("matrix") - sig.sigProfilerExtractor("matrix", "test_matrix_96_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matrix", + "test_matrix_96_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_matrix_78(): data = sig.importdata("matrix_DBS") - sig.sigProfilerExtractor("matrix", "test_matrix_78_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matrix", + "test_matrix_78_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_matrix_83(): data = sig.importdata("matrix_ID") - sig.sigProfilerExtractor("matrix", "test_matrix_83_output", data, exome=True, reference_genome="GRCh38", - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matrix", + "test_matrix_83_output", + data, + exome=True, + reference_genome="GRCh38", + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_vcf(): vcf_data = sig.importdata("vcf") - sig.sigProfilerExtractor("vcf", "test_vcf_output", vcf_data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "vcf", + "test_vcf_output", + vcf_data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_matrix_48(): data = sig.importdata("matrix_CNV") - sig.sigProfilerExtractor("matrix", "test_matrix_48_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, 
max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matrix", + "test_matrix_48_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_seg_48(): data = sig.importdata("seg:BATTENBERG") - sig.sigProfilerExtractor("seg:BATTENBERG", "test_segCNV_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "seg:BATTENBERG", + "test_segCNV_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_matrix_32(): data = sig.importdata("matrix_SV") - sig.sigProfilerExtractor("matrix", "test_matrix_32_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matrix", + "test_matrix_32_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_matobj(): data = sig.importdata("matobj") - sig.sigProfilerExtractor("matobj", "test_matobj_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "matobj", + "test_matobj_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) + def run_csv(): data = sig.importdata("csv") - sig.sigProfilerExtractor("csv", "test_csv_output", data, - minimum_signatures=3, maximum_signatures=3, nmf_replicates=5, - min_nmf_iterations=100, max_nmf_iterations=1000, nmf_test_conv=100) + sig.sigProfilerExtractor( + "csv", + "test_csv_output", + data, + minimum_signatures=3, + maximum_signatures=3, + nmf_replicates=5, + min_nmf_iterations=100, + max_nmf_iterations=1000, + nmf_test_conv=100, + ) -if __name__ == '__main__': +if __name__ == "__main__": run_matrix_96() run_matrix_78() run_matrix_83() @@ -73,4 +156,4 @@ def run_csv(): run_seg_48() run_vcf() # run_matobj() - # run_csv() \ No newline at end of file + # run_csv() From 9009a1c096cd68ae2f2ee9905af04d79202d2da0 Mon Sep 17 00:00:00 2001 From: Mousumy Kundu Date: Mon, 15 Apr 2024 10:41:39 -0700 Subject: [PATCH 2/4] added process_input for input handling --- SigProfilerExtractor/sigpro.py | 58 +++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/SigProfilerExtractor/sigpro.py b/SigProfilerExtractor/sigpro.py index 21d330d..3f65a19 100644 --- a/SigProfilerExtractor/sigpro.py +++ b/SigProfilerExtractor/sigpro.py @@ -53,6 +53,7 @@ from SigProfilerAssignment import single_sample as spasub from SigProfilerAssignment import decomposition as decomp from numpy.random import SeedSequence +from sigProfilerPlotting import sigProfilerPlotting as sigPlot MUTTYPE = "MutationType" @@ -557,35 +558,40 @@ def sigProfilerExtractor( else: data = pd.read_csv(text_file, sep="\t").iloc[:, :] - if data.shape[0] == 48: - paths = cosmic.__path__[0] - feature_map = pd.read_csv( - paths + "/data/ReferenceFiles/" + "CN_classes_dictionary.txt", - sep="\t", - header=None, - ) - feature_order = pd.read_csv( - paths + "/data/ReferenceFiles/" + "CNV_features.tsv", - sep="\t", - header=None, - ) - if 
list(data.iloc[:, 0]) == list(feature_order[0]): - pass - else: - orderlist1 = list(feature_map[0]) - orderlist2 = list(feature_order[0]) - # sort the MutationType first step - data["MutationType"] = pd.Categorical(data["MutationType"], orderlist1) - data = data.sort_values("MutationType") - data = data.reset_index() - data = data.drop(columns="index") - # sort the MutationType second step - data["MutationType"] = feature_map[1] - data["MutationType"] = pd.Categorical(data["MutationType"], orderlist2) - data = data.sort_values("MutationType") + # if data.shape[0] == 48: + # paths = cosmic.__path__[0] + # feature_map = pd.read_csv( + # paths + "/data/ReferenceFiles/" + "CN_classes_dictionary.txt", + # sep="\t", + # header=None, + # ) + # feature_order = pd.read_csv( + # paths + "/data/ReferenceFiles/" + "CNV_features.tsv", + # sep="\t", + # header=None, + # ) + # if list(data.iloc[:, 0]) == list(feature_order[0]): + # pass + # else: + # orderlist1 = list(feature_map[0]) + # orderlist2 = list(feature_order[0]) + # # sort the MutationType first step + # data["MutationType"] = pd.Categorical(data["MutationType"], orderlist1) + # data = data.sort_values("MutationType") + # data = data.reset_index() + # data = data.drop(columns="index") + # # sort the MutationType second step + # data["MutationType"] = feature_map[1] + # data["MutationType"] = pd.Categorical(data["MutationType"], orderlist2) + # data = data.sort_values("MutationType") data = data.dropna(axis=1, inplace=False) data = data.loc[:, (data != 0).any(axis=0)] + # printing the number of mutations + mutation_number = str(data.shape[0]) + # Re-indexing the input matrix file by using process_input function from SigProfilePlotting + data = sigPlot.process_input(data, mutation_number) + data.reset_index(inplace=True) genomes = data.iloc[:, 1:] genomes = np.array(genomes) allgenomes = genomes.copy() # save the allgenomes for the final results From dddf8b010cd76f540dc62b19e51b9c42f163fdd0 Mon Sep 17 00:00:00 2001 From: mdbarnesUCSD Date: Fri, 19 Apr 2024 13:25:55 -0700 Subject: [PATCH 3/4] Upgrade to v1.1.24 --- SigProfilerExtractor/sigpro.py | 27 --------------------------- setup.py | 10 +++++----- 2 files changed, 5 insertions(+), 32 deletions(-) diff --git a/SigProfilerExtractor/sigpro.py b/SigProfilerExtractor/sigpro.py index 3f65a19..40a99a4 100644 --- a/SigProfilerExtractor/sigpro.py +++ b/SigProfilerExtractor/sigpro.py @@ -558,33 +558,6 @@ def sigProfilerExtractor( else: data = pd.read_csv(text_file, sep="\t").iloc[:, :] - # if data.shape[0] == 48: - # paths = cosmic.__path__[0] - # feature_map = pd.read_csv( - # paths + "/data/ReferenceFiles/" + "CN_classes_dictionary.txt", - # sep="\t", - # header=None, - # ) - # feature_order = pd.read_csv( - # paths + "/data/ReferenceFiles/" + "CNV_features.tsv", - # sep="\t", - # header=None, - # ) - # if list(data.iloc[:, 0]) == list(feature_order[0]): - # pass - # else: - # orderlist1 = list(feature_map[0]) - # orderlist2 = list(feature_order[0]) - # # sort the MutationType first step - # data["MutationType"] = pd.Categorical(data["MutationType"], orderlist1) - # data = data.sort_values("MutationType") - # data = data.reset_index() - # data = data.drop(columns="index") - # # sort the MutationType second step - # data["MutationType"] = feature_map[1] - # data["MutationType"] = pd.Categorical(data["MutationType"], orderlist2) - # data = data.sort_values("MutationType") - data = data.dropna(axis=1, inplace=False) data = data.loc[:, (data != 0).any(axis=0)] # printing the number of mutations diff 
--git a/setup.py b/setup.py index 4f21105..282d7c0 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ if os.path.exists("dist"): shutil.rmtree("dist") -VERSION = "1.1.23" +VERSION = "1.1.24" with open("README.md") as f: @@ -21,7 +21,7 @@ def write_version_py(filename="SigProfilerExtractor/version.py"): # THIS FILE IS GENERATED FROM SIGPROFILEREXTRACTOR SETUP.PY short_version = '%(version)s' version = '%(version)s' -Update = 'Upgrade v1.1.23: Update to use the latest SigProfilerAssignment with COSMIC v3.4 signatures.' +Update = 'Upgrade v1.1.24: Add input processing to SigProfilerExtractor" """ fh = open(filename, "w") @@ -40,9 +40,9 @@ def write_version_py(filename="SigProfilerExtractor/version.py"): "numpy>=1.21.2", "pandas>=1.2.4", "nimfa>=1.1.0", - "SigProfilerMatrixGenerator>=1.2.17", - "sigProfilerPlotting>=1.3.16", - "SigProfilerAssignment>=0.1.0", + "SigProfilerMatrixGenerator>=1.2.25", + "sigProfilerPlotting>=1.3.22", + "SigProfilerAssignment>=0.1.4", "pillow", "statsmodels>=0.9.0", "scikit-learn>=0.24.2", From 2222a6c08c646dd108efb9012726fdd9277a415b Mon Sep 17 00:00:00 2001 From: mdbarnesUCSD Date: Fri, 19 Apr 2024 14:03:10 -0700 Subject: [PATCH 4/4] Fix update message --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 282d7c0..a336f28 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def write_version_py(filename="SigProfilerExtractor/version.py"): # THIS FILE IS GENERATED FROM SIGPROFILEREXTRACTOR SETUP.PY short_version = '%(version)s' version = '%(version)s' -Update = 'Upgrade v1.1.24: Add input processing to SigProfilerExtractor" +Update = 'Upgrade v1.1.24: Add input processing to SigProfilerExtractor' """ fh = open(filename, "w")