-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglycan_detect.py
148 lines (116 loc) · 3.76 KB
/
glycan_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 18:16:13 2020
Description: detection of glycosylated residues (positions) in a protein sequence
or in a given epitope within a protein.
Input: epitope sequence, parental protein sequence & glycosylated positions.
Output: epitope glycosylation: YES or NO.
@author: rocfarriolduran
"""
# imports
import sys
import csv
import os.path
from pathlib import Path
# Help text printed by fileExist() when the user answers "-h" at the file
# prompt (or passes "-h" as the first command-line argument).
h = '''
Usage of this script:
$ python3 glycan_detect.py
The input file has to be provided by stating "location/file_name.csv"
to the input questions that appear in the console.
<file_name> should be a .csv separated by ";" with a header row and the
columns: Id;Sequence;Peptide
OPTIONAL you can provide comma separated glycan positions
(zero is first position)
For example: 0,16,32,64
If you provide the glycans, an extra "onGoal" column lists the glycan
positions that fall inside each peptide (-1 if there are none).
The script returns <file_name>_Out.csv as output (same extension as the input).
You can also do the following (if using Mac/Linux OS):
$ chmod +x glycan_detect.py
$ ./glycan_detect.py <location/file_name.csv> <commaSeparatedGlycans>
You need python 3 installed in your computer!!!
'''
def fileExist(file):
    """Return True if *file* names an existing regular file, else False.

    Side effects: prints the module help text ``h`` when *file* is ``'-h'``,
    and prints a "not accessible" message for any other non-empty value that
    is not a file. An empty value is rejected without printing anything, so
    the caller can simply prompt again.
    """
    if not file:
        # Nothing provided yet (initial state before the first prompt).
        return False
    if Path(file).is_file():
        return True
    if file == '-h':
        print(h)
    else:
        print("\n>>>>>>> "+file + " File not exist or is not accessible\n")
    # Both the help request and the bad path mean "no usable file".
    return False
# --- Collect inputs: command-line arguments first, console prompts otherwise.
# argv[1] (optional): path of the ";"-separated input file.
# argv[2] (optional): comma separated glycan positions.
data_file = ""
inputGlycans = ""
glycans = []
num_args = len(sys.argv)
if num_args >= 2:
    data_file = sys.argv[1]
if num_args >= 3:
    # ">=" so that extra trailing arguments do not silently drop the glycans.
    inputGlycans = sys.argv[2]

# Keep asking until an existing file is given; fileExist() prints the help
# text when the answer is "-h" and an error message for a bad path.
while not fileExist(data_file):
    data_file = input(
        " Provide location and name of the file dataset.\n"
        "For example: data/file_test.csv\n"
        "(-h for help)\n"
        "Here: "
    )
print("Input file: " + data_file)

# Output name is "<input without extension>_Out<extension>". splitext is used
# instead of slicing by len(extension): slicing with [:-0] returns an empty
# string when the input file has no extension.
stem, extension = os.path.splitext(data_file)
nameOutFile = stem + "_Out" + extension

if num_args < 3:
    inputGlycans = input(
        " OPTIONAL Provide coma separated glycans.\n"
        "(zero is first position)\n"
        "For example: 0,16,32,64\n"
        "Here: "
    )

# Tokens are kept as strings (isOnGoal converts them with int()); blanks such
# as the one produced by a trailing comma are dropped so they cannot crash
# the int() conversion later.
glycans = [token.strip() for token in inputGlycans.split(",") if token.strip()]
if glycans:
    print("glycans: " + inputGlycans)
else:
    print("No glycans provided")
def isOnGoal(start, llpep, positions=None):
    """Return the glycan positions that fall inside a peptide.

    Args:
        start: zero-based index of the peptide's first residue in the
            parent sequence.
        llpep: length of the peptide.
        positions: iterable of glycan positions (strings or ints). When
            omitted, the module-level ``glycans`` list is used.

    Returns:
        A comma separated string with every position ``p`` such that
        ``start <= p <= start + llpep - 1``, in input order, or the integer
        ``-1`` when no position falls inside the peptide.

    Raises:
        ValueError: if a non-blank position cannot be converted with int().
    """
    if positions is None:
        positions = glycans
    last = start + llpep - 1
    hits = [
        str(goal)
        for goal in positions
        # Blank tokens (e.g. from a trailing comma) are ignored.
        if str(goal).strip() and start <= int(goal) <= last
    ]
    return ",".join(hits) if hits else -1
def _flanking_regions(seq, start, pep_len, width=30):
    """Return the (pre, post) windows of up to *width* residues around a peptide.

    *start* is the zero-based index of the peptide in *seq* and *pep_len* its
    length. Indices are clamped at 0 so that a peptide starting exactly at
    position ``width`` does not wrap around through a negative slice index.
    """
    # NOTE(review): as in the original slicing, both windows leave a
    # one-residue gap next to the peptide (seq[start-1] and
    # seq[start+pep_len] are not included) - confirm this is intended.
    pre_end = max(0, start - 1)
    pre = seq[max(0, pre_end - width):pre_end]
    post_start = start + pep_len + 1
    post = seq[post_start:post_start + width]
    return pre, post


# --- Main processing: one output line per data row of the input file.
has_glycans = len(glycans) > 0
header = ["Id", "Sequence", "Peptide", "Position",
          "Pre", "lenPre", "Post", "lenPost"]
if has_glycans:
    header.append("onGoal")

# Both files are managed by "with" so they are closed even if a row fails.
with open(nameOutFile, "w") as out_file, \
        open(data_file, "r", encoding='utf-8-sig') as f:
    out_file.write(";".join(header) + "\n")
    inputFile = csv.reader(f, delimiter=';')
    next(inputFile, None)  # the first row of the input is a header
    for reg in inputFile:
        if len(reg) < 3:
            # Blank or truncated line: there is no Id/Sequence/Peptide triple.
            continue
        rec_id, seq, pep = reg[0], reg[1], reg[2]
        llpep = len(pep)
        start = seq.find(pep)  # -1 when the peptide is not in the sequence
        pre = ""
        post = ""
        # Reset per row so a missing peptide never reuses the previous row's
        # value (or hits an undefined name on the first row).
        onGoal = -1
        if start != -1:
            pre, post = _flanking_regions(seq, start, llpep)
            if has_glycans:
                onGoal = isOnGoal(start, llpep)
        fields = [rec_id, seq, pep, str(start),
                  pre, str(len(pre)), post, str(len(post))]
        if has_glycans:
            fields.append(str(onGoal))
        out_file.write(";".join(fields) + "\n")

print("Output file: "+nameOutFile)