#! /usr/bin/env python3
##################################################
## SMAC
## Author: Stefano Pirro, PhD (aka wynstep)
## Version: 2.0
## Maintainer: Stefano Pirro, PhD (aka wynstep)
## Email: stefano.pirro@uniroma2.it, s.pirro@mir-nat.com
## Status: Active
##################################################
"""
Description of the script:
This downloads a list of Papers and relative informations from PubMed.
It also downloads a series of GEO codes, in order to allow the analysis
of molecular data in live. This will be performed by the final user, just on
selected publications.
PLEASE NOTE: This code is an updated version of SMAC (https://github.com/wynstep/SMAC),
published on Scientific Reports in 2019 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6642118/)
"""
## Importing libraries, vars and functions
import os, sys, argparse, datetime, re, gc, subprocess # subprocess is used below to clean up failed GEO downloads
import pandas as pd
import numpy as np
from func_timeout import func_timeout, FunctionTimedOut
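# func_timeout(timeout, func, args=...) runs func with a hard time limit and raises
# FunctionTimedOut once the limit is exceeded; the retrieval loops below rely on this
# to avoid hanging on blocked IPs or stalled NCBI connections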
from vars import *
from functions import *
## Reading arguments for the analysis
parser = argparse.ArgumentParser()
parser.add_argument('-t','--queryTerm', help='Term for starting a PubMed search query', required=True)
parser.add_argument('-l','--PMIDsLimit', help='Max number of PMIDs to retrieve (default = all)', type=int, default=None, required=False)
parser.add_argument('-a','--analysis', help='Analyses to perform (comma-separated) [pca,receptor_status,molecular_classification,tumour_purity,gene_expression,functional_enrichment]', default=defaultAnalyses, required=False)
parser.add_argument('-s','--skipDownload', help='Skip the download of molecular data? <yes,no>', default="no", required=False)
parser.add_argument('-g','--nGenes', help='Number of top differentially-expressed genes to plot in the gene expression analysis', default=100, type=int, required=False)
parser.add_argument('-n','--nCores', help='Number of cores for the analysis', default=1, type=int, required=False)
parser.add_argument('-e','--email', help='A valid email address, required by NCBI to avoid blocking', required=True)
args = parser.parse_args()
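## Example invocation (illustrative values; substitute your own query and email address):
##   python3 main-term.py -t "breast cancer" -l 500 -n 4 -e user@example.com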
## Initialising the analysis code -- the queryTerm stripped of spaces and non-alphanumeric characters
## (e.g. "breast cancer" becomes "breastcancer")
anCode = "".join(character for character in args.queryTerm if character.isalnum())
currentYear = datetime.datetime.now().year # Current year (needed for the MeSH download)
## Initialising directories
outRootFolder = outRootFolder.format("DATA", anCode)
outGraphDir = outGraphDir.format(outRootFolder)
outGSEDir = outGSEDir.format(outRootFolder)
os.makedirs(outRootFolder, exist_ok = True)
os.makedirs(outGraphDir, exist_ok = True)
os.makedirs(outGSEDir, exist_ok = True)
## Initialising logFile
logFile = logFile.format(outRootFolder, anCode)
## Initialise recoverFlag
recFlag = 0
##---------------------------------------##
## Extracting list of PMIDs from PubMed  ##
##---------------------------------------##
# We aim at recovering any previous analysis that is already present
pmidFile = pmidFile.format(outRootFolder, anCode)
pPMIDs = [] # PMIDs collected by a previous run (if any)
# Loading PMIDs from the previous run
if os.path.isfile(pmidFile):
    with open(pmidFile, "r") as pmidData:
        for line in pmidData:
            pPMIDs.append(line.rstrip())
    # Downloading the current PMID list and calculating the difference with the previous one
    allPMIDs = RetrievePMIDs(args.queryTerm, args.PMIDsLimit, args.email, logFile)
    PMIDs = list(set(allPMIDs) - set(pPMIDs))
    ## At this point, the recovery flag can be 1 (new papers found) or 2 (no new papers)
    if len(PMIDs) > 0:
        recFlag = 1
        KeepLog(logFile, "[MESSAGE] Recovery mode ON: New PMIDs: {0}\n".format(len(PMIDs)))
    else:
        recFlag = 2
        sys.exit("[MESSAGE] Recovery mode ON: New PMIDs: {0} -- Analysis will quit NOW".format(len(PMIDs)))
else: # No previous run has been found
    PMIDs = RetrievePMIDs(args.queryTerm, args.PMIDsLimit, args.email, logFile)
# Append/write the new PMIDs to the previous file
SaveIntoDF(PMIDs, pmidFile, "list")
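## NOTE: pmidFile accumulates one PMID per line across runs; the recovery block
## above reads it back to detect which papers are new since the last execution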
##--------------------------------------------------------------##
##        Splitting PMIDs into chunks to avoid NCBI blocks      ##
##--------------------------------------------------------------##
PMIDsChunks = list(chunkPMIDs(PMIDs, sizeQuery["chunkPMIDs"]))
# From this point on we no longer need the PMIDs variable, so we delete it to save memory
# This is very useful on low-memory machines
del PMIDs
gc.collect() # Release garbage memory
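## For reference, chunkPMIDs (imported from functions.py) is assumed to yield
## successive fixed-size slices of the PMID list, along these lines:
##   def chunkPMIDs(pmids, size):
##       for i in range(0, len(pmids), size):
##           yield pmids[i:i + size]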
##--------------------------------------------------------------##
##           Extracting GEO datasets linked to PMIDs            ##
##--------------------------------------------------------------##
if args.skipDownload == "no":
    # Initialise the file where the PMID2GSE info is stored
    geoInfoFile = geoInfoFile.format(outRootFolder, anCode)
    # Initialise the pandas dataframe that stores the data
    PMID2GSE = pd.DataFrame(columns=["PMID","gseCodes","platforms","ftpLinks"])
    # Iterating over each block of PMIDs to retrieve the data (easier for backup purposes)
    for index, pmids in enumerate(PMIDsChunks):
        KeepLog(logFile, "Retrieve GSE codes for batch {0}/{1} \n".format(index+1, len(PMIDsChunks)))
        # We need a timeout on this call, otherwise (with a blocked IP or a stalled connection) it would hang forever
        try:
            tmpPMID2GSE = func_timeout(timoutThr, RetrieveGeoInfoFromPMID, args=(pmids, logFile))
            SaveIntoDF(tmpPMID2GSE, geoInfoFile, "pandas")
            PMID2GSE = pd.concat([PMID2GSE, tmpPMID2GSE], ignore_index=True) # DataFrame.append was removed in pandas 2.x
        except FunctionTimedOut:
            KeepLog(logFile, "Sorry, timeout reached for {0}/{1} \n".format(index+1, len(PMIDsChunks)))
            continue # skip this batch if RetrieveGeoInfoFromPMID exceeds the timeout
    # Deleting the tmpPMID2GSE and geoInfoFile variables and releasing garbage memory
    try:
        del tmpPMID2GSE, geoInfoFile
    except:
        KeepLog(logFile, "[WARNING] No need to clear VAR\n")
    gc.collect()
##--------------------------------------------------------------##
##               Extracting Genes linked to PMIDs               ##
##--------------------------------------------------------------##
# Initialise the file where the PMID2Gene info is stored
geneInfoFile = geneInfoFile.format(outRootFolder, anCode)
# Initialise the pandas dataframe that stores the data
PMID2Gene = pd.DataFrame(columns=["PMID","geneName","geneScore"])
# Iterating over each block of PMIDs to retrieve the data (easier for backup purposes)
for index, pmids in enumerate(PMIDsChunks):
    KeepLog(logFile, "Retrieve GENES for batch {0}/{1} \n".format(index+1, len(PMIDsChunks)))
    # We need a timeout on this call, otherwise (with a blocked IP or a stalled connection) it would hang forever
    try:
        tmpPMID2Gene = func_timeout(timoutThr+100, LinkPMID2Gene, args=(pmids, args.email, logFile))
        SaveIntoDF(tmpPMID2Gene, geneInfoFile, "pandas")
        PMID2Gene = pd.concat([PMID2Gene, tmpPMID2Gene], ignore_index=True)
    except FunctionTimedOut:
        KeepLog(logFile, "Sorry, timeout reached for {0}/{1} \n".format(index+1, len(PMIDsChunks)))
        continue # skip this batch if LinkPMID2Gene exceeds the timeout
##--------------------------------------------------------------##
##            Create PPI and functional enrichment              ##
##--------------------------------------------------------------##
if PMID2Gene.shape[0] > 0: # at least one PMID-to-gene association was retrieved
    PerformAnalysis("gene_network", [geneInfoFile, args.queryTerm, args.nCores, outGraphDir], logFile)
    PerformAnalysis("functional_enrichment_pmid", [geneInfoFile, args.queryTerm, args.nCores, outGraphDir], logFile)
# Deleting the tmpPMID2Gene and geneInfoFile variables and releasing garbage memory
try:
    del tmpPMID2Gene, geneInfoFile
except:
    KeepLog(logFile, "[WARNING] No need to clear VAR\n")
gc.collect()
##--------------------------------------------------------------##
##                    Extracting Paper Info                     ##
##--------------------------------------------------------------##
# Initialising the files for storing the papers' info
papersInfoFile = papersInfoFile.format(outRootFolder, anCode)
meshTermsFile = meshTermsFile.format(outRootFolder, anCode)
# Initialise the pandas dataframe and the list that store the data
papersInfo = pd.DataFrame(columns=["PMID","author","journal","title","abstract","pubDate","meshHeadings"])
meshInfo = list()
# Iterating over each block of PMIDs to retrieve the data (easier for backup purposes)
for index, pmids in enumerate(PMIDsChunks):
    KeepLog(logFile, "Retrieve INFO for batch {0}/{1} \n".format(index+1, len(PMIDsChunks)))
    # We need a timeout on this call, otherwise (with a blocked IP or a stalled connection) it would hang forever
    try:
        tmpPapersInfo = func_timeout(timoutThr, RetrievePMIDInfo, args=(pmids, args.email, logFile))
        papersInfo = pd.concat([papersInfo, tmpPapersInfo[0]], ignore_index=True)
        meshInfo.extend(tmpPapersInfo[1])
        SaveIntoDF(tmpPapersInfo[0], papersInfoFile, "pandas")
        SaveIntoDF(tmpPapersInfo[1], meshTermsFile, "list")
    except FunctionTimedOut:
        KeepLog(logFile, "Sorry, timeout reached for {0}/{1} \n".format(index+1, len(PMIDsChunks)))
        continue # skip this batch if RetrievePMIDInfo exceeds the timeout
# Deleting the tmpPapersInfo and papersInfoFile variables and releasing garbage memory
try:
    del tmpPapersInfo, papersInfoFile
except:
    KeepLog(logFile, "[WARNING] No need to clear VAR\n")
gc.collect()
##--------------------------------------------------------------##
##  Extracting MeSH terms metrics -- if there are NEW MeSH terms ##
##--------------------------------------------------------------##
if len(meshInfo) > 0:
    # Initialising the files for storing the MeSH statistics
    meshLAboundanceFile = meshLAboundanceFile.format(outRootFolder, anCode)
    meshGAboundanceFile = meshGAboundanceFile.format(outRootFolder, anCode)
    meshLevelFile = meshLevelFile.format(outRootFolder, anCode)
    meshTermsDetailsFile = meshTermsDetailsFile.format(outRootFolder, currentYear, anCode)
    ### We first try to download and parse the latest version of the official MeSH terms
    ### These are stored into a pandas dataframe
    if not os.path.isfile(meshTermsDetailsFile): # no need to parse again if the file is already present
        officialMeshTerms = DownloadOfficialMesh(currentYear, srcFolder, logFile)
        officialMeshTerms.to_pickle(meshTermsDetailsFile)
    else:
        officialMeshTerms = pd.read_pickle(meshTermsDetailsFile)
    # Load the latest version of meshInfo from the generated file (the with-statement closes it automatically)
    with open(meshTermsFile, 'r') as meshIO:
        meshInfo = meshIO.read().splitlines()
    # Calculation of the local abundance
    lAboundance = RetrieveMeshInfo(meshInfo, "lAboundance", args.email, logFile)
    # Keep official MeSH terms ONLY
    lAboundanceSelected = lAboundance[lAboundance['mesh'].isin(officialMeshTerms['mesh'].tolist())]
    SaveIntoDF(lAboundanceSelected, meshLAboundanceFile, "pandas")
    # Calculating the MeSH level for the terms present in the papers
    meshLevel = officialMeshTerms[officialMeshTerms['mesh'].isin(lAboundanceSelected['mesh'].tolist())]
    SaveIntoDF(meshLevel, meshLevelFile, "pandas")
    # Calculating the global abundance (ONLY for terms that have not been calculated yet)
    gAboundance = pd.DataFrame(columns=['mesh', 'gAboundance'])
    if os.path.isfile(meshGAboundanceFile): # the file exists
        meshGAboundanceTerms = pd.read_csv(meshGAboundanceFile, sep="\t")
        # Here we select the terms that have not been parsed previously (Series.isin has no invert option, so we negate with ~)
        meshGAboundanceTermsSelected = lAboundanceSelected[~lAboundanceSelected['mesh'].isin(meshGAboundanceTerms['mesh'].tolist())]
    else: # the file does not exist
        meshGAboundanceTermsSelected = lAboundanceSelected
    for j, m in enumerate(meshGAboundanceTermsSelected["mesh"].tolist()):
        KeepLog(logFile, "Retrieve MeSH global abundance for {0}/{1} -- {2}\n".format(j+1, len(meshGAboundanceTermsSelected["mesh"].tolist()), m))
        # We need a timeout on this call, otherwise (with a blocked IP or a stalled connection) it would hang forever
        try:
            tmpAboundance = func_timeout(timoutThr, RetrieveMeshInfo, args=(m, "gAboundance", args.email, logFile))
            gAboundance.loc[len(gAboundance)] = tmpAboundance
        except FunctionTimedOut:
            KeepLog(logFile, "Sorry, timeout reached for {0}/{1} \n".format(j+1, len(meshGAboundanceTermsSelected["mesh"].tolist())))
            continue # skip this term if RetrieveMeshInfo exceeds the timeout
    SaveIntoDF(gAboundance, meshGAboundanceFile, "pandas")
    ##--------------------------------------------------------------##
    ##          Perform a Robust Ranking Aggregation on MeSH        ##
    ##--------------------------------------------------------------##
    PerformAnalysis("rra", [meshLevelFile, meshGAboundanceFile, meshLAboundanceFile, \
                            args.queryTerm, args.nCores, outGraphDir, outRootFolder], logFile)
    # Here we clean some variables before downloading and processing the GSE data, since that step can be memory expensive
    try:
        del meshGAboundanceTerms
        del officialMeshTerms, meshInfo
        del lAboundance, lAboundanceSelected
        del meshLevel
        del gAboundance, meshGAboundanceTermsSelected, tmpAboundance
    except:
        KeepLog(logFile, "[WARNING] No need to clear VAR\n")
    gc.collect()
##--------------------------------------------------------------##
##                       Download GEO data                      ##
##--------------------------------------------------------------##
if args.skipDownload == "no":
    # Iterate over the PMID/GSE dataframe and retrieve the data
    for index, row in PMID2GSE.iterrows():
        pmid = row.PMID
        # gseCodes may be separated by commas and/or semicolons, e.g. "123; 456,789" -> ["123", "456", "789"]
        gseCodes = [s.strip() for s in row.gseCodes.replace(',', ';').split(';')]
        for gse in gseCodes:
            try:
                gse = "GSE{0}".format(gse)
                gseDir = "{0}/{1}/{2}".format(outGSEDir, pmid, gse)
                KeepLog(logFile, "[MESSAGE] Analysing {0}/{1}\n".format(pmid, gse))
                exprDataFiles, metaDataFile = func_timeout(timoutThr+1000, DownloadGEODataset, args=(gse, gseDir))
            except FunctionTimedOut:
                KeepLog(logFile, "[WARNING] Sorry, timeout reached for {0}/{1}\n".format(pmid, gse))
                subprocess.Popen("rm -rf {0}".format(gseDir), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                continue # skip this GSE if DownloadGEODataset exceeds the timeout
            except:
                KeepLog(logFile, "[WARNING] Sorry, cannot process {0}/{1}\n".format(pmid, gse))
                subprocess.Popen("rm -rf {0}".format(gseDir), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                continue
            ##--------------------------------------------------------------##
            ##           Perform analyses on downloaded datasets            ##
            ##--------------------------------------------------------------##
            if exprDataFiles is not None and metaDataFile is not None: # we perform the analyses ONLY if the GEO dataset has been created
                # Iterate over the generated expression data files
                for index, eDataFile in enumerate(exprDataFiles):
                    for an in args.analysis.split(","):
                        PerformAnalysis(an, [eDataFile, metaDataFile[index], index, args.nCores, args.nGenes, gseDir], logFile)
##--------------------------------------------------------------##
## Cleaning empty directories ##
##--------------------------------------------------------------##
RemoveEmptyDirs("DATA/")
KeepLog(logFile, "SMAC for {0} COMPLETED. Thanks for using it!\n".format(anCode))