This repository has been archived by the owner on Oct 28, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reshape_outputs.py
167 lines (139 loc) · 3.89 KB
/
reshape_outputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
# This module works like a set of simple parsers, but only for result pages the user has already saved on the computer.
# For each wrapped program it collects two lists: the begin positions and the end positions of the genes.
import re
# Function for Glimmer output. It works, but it is not fully refined: the regular expression needs further work before raw Glimmer files can be used.
def glimmer(files, typ):
    """Return the begin or end positions of the genes in a Glimmer table.

    Only lines that start with ``orf`` (followed by at least one more
    character) are used.  Each such line is split on whitespace; the second
    field is taken as the begin position and the third as the end position.
    Only the prediction table is supported, not raw Glimmer files.

    files -- path of the Glimmer output file to read.
    typ -- "begin" to get the begin positions, "end" to get the end ones.

    Returns a list of strings in file order, or None when typ is neither
    "begin" nor "end".  Raises IndexError when an ``orf`` line is too short
    to hold both positions.
    """
    # Raw strings keep the backslashes for the regex engine; compiled once,
    # outside the loop.
    orf_line = re.compile(r"orf.+")
    whitespace = re.compile(r"\s+")
    begin = []
    end = []
    # The context manager closes the file even if a malformed line raises.
    with open(files) as data:
        for line in data:
            if not orf_line.match(line):
                continue
            fields = whitespace.split(line)
            begin.append(fields[1])
            end.append(fields[2])
    # Hand back only the list that was asked for.
    if typ == "begin":
        return begin
    if typ == "end":
        return end
    return None
# Function to parse the output of Prodigal.
def prodigal(files, typ="begin"):
    """Return the begin or end positions of the genes in a Prodigal output.

    Only lines made of leading whitespace, the word ``CDS``, whitespace and
    a location are used.  The location is the third whitespace-separated
    field, e.g. ``337..2799`` or ``complement(337..2799)``.  The text
    before ``..`` (without a ``complement(`` prefix) is the begin position
    and the text after it (up to a closing parenthesis) is the end position.

    A location without ``..`` is treated as a single position and is
    reported as both begin and end, so the two lists always stay aligned.

    files -- path of the Prodigal output file to read.
    typ -- "begin" (default) for the begin positions, "end" for the end ones.

    Returns a list of strings in file order, or None when typ is neither
    "begin" nor "end".
    """
    cds_line = re.compile(r"[\s]+CDS[\s]+.+")
    whitespace = re.compile(r"\s+")
    begin = []
    end = []
    # The context manager closes the file even if parsing raises.
    with open(files) as data:
        for line in data:
            found = cds_line.match(line.rstrip("\n"))
            if not found:
                continue
            # Field 0 is the empty string before the leading whitespace,
            # field 1 is "CDS", field 2 is the location.
            location = whitespace.split(found.group(0))[2]
            parts = location.split("..")
            # Drop the "complement(" wrapper when the gene is on the
            # reverse strand.
            pieces = parts[0].split("complement(")
            start = pieces[1] if len(pieces) > 1 else pieces[0]
            if len(parts) > 1:
                stop = parts[1].split(")")[0]
            else:
                # No ".." in the location: use the single coordinate for
                # both ends instead of reusing the first gene's end.
                start = start.split(")")[0]
                stop = start
            begin.append(start)
            end.append(stop)
    if typ == "begin":
        return begin
    if typ == "end":
        return end
    return None
# Function to parse the output of the EasyGene program.
def easygene(files, typ):
    """Return the begin or end positions of the genes in an EasyGene output.

    Only lines that start with ``gi`` are used.  Each such line is split on
    tab characters; the fourth column is taken as the begin position and
    the fifth column as the end position.

    files -- path of the EasyGene output file to read.
    typ -- "begin" to get the begin positions, "end" to get the end ones.

    Returns a list of strings in file order, or None when typ is neither
    "begin" nor "end".  Raises IndexError when a ``gi`` line has fewer than
    five tab-separated columns.
    """
    begin = []
    end = []
    # The context manager closes the file even if a short line raises.
    with open(files) as data:
        for line in data:
            # A plain prefix test is enough; no regular expression needed.
            if not line.startswith("gi"):
                continue
            # Strip the line terminator so it cannot end up in a column.
            columns = line.rstrip("\n").split("\t")
            begin.append(columns[3])
            end.append(columns[4])
    if typ == "begin":
        return begin
    if typ == "end":
        return end
    return None
# Function to parse the output of the GeneMark program.
def genemark(files, typ):
    """Return the begin or end positions of the genes in a GeneMark output.

    Only lines made of leading whitespace, a number, whitespace and further
    text are used.  Each such line is split on whitespace; because of the
    leading whitespace the first field is empty, so the begin position is
    the field at index 3 and the end position is the field at index 4.

    files -- path of the GeneMark output file to read.
    typ -- "begin" to get the begin positions, "end" to get the end ones.

    Returns a list of strings in file order, or None when typ is neither
    "begin" nor "end".  Raises IndexError when a matching line is too short
    to hold both positions.
    """
    # Raw strings keep the backslashes for the regex engine; both patterns
    # are compiled once, outside the loop.
    gene_row = re.compile(r"^[\s]+[0-9]+[\s]+.+")
    whitespace = re.compile(r"\s+")
    begin = []
    end = []
    # The context manager closes the file even if a short row raises.
    with open(files) as data:
        for line in data:
            found = gene_row.match(line.rstrip("\n"))
            if not found:
                continue
            fields = whitespace.split(found.group(0))
            begin.append(fields[3])
            end.append(fields[4])
    if typ == "begin":
        return begin
    if typ == "end":
        return end
    return None
# Function to parse the output of the Augustus program.
def augustus(files, typ):
    """Return the begin or end positions of the gene lines in an Augustus output.

    Each line is split on whitespace.  A line is a gene line when the field
    at index 2 is exactly ``gene``; the field at index 3 is then the begin
    position and the field at index 4 the end position.

    Comment lines (first non-blank character ``#``) and lines with fewer
    than five fields are skipped.  Without that check a comment such as
    ``# start gene g1`` passes the ``gene`` test and then fails on the
    missing end field.

    files -- path of the Augustus output file to read.
    typ -- "begin" to get the begin positions, "end" to get the end ones.

    Returns a list of strings in file order, or None when typ is neither
    "begin" nor "end".
    """
    whitespace = re.compile(r"\s+")
    begin = []
    end = []
    # The context manager closes the file when parsing is done.
    with open(files) as data:
        for line in data:
            line = line.rstrip("\n")
            if line.lstrip().startswith("#"):
                continue
            fields = whitespace.split(line)
            # Require both coordinate fields so begin and end stay aligned.
            if len(fields) >= 5 and fields[2] == "gene":
                begin.append(fields[3])
                end.append(fields[4])
    if typ == "begin":
        return begin
    if typ == "end":
        return end
    return None
# Given NCBI gene data in multi-FASTA format, this function gets the beginning and the end of the genes.
def features(files, typ):
    """Return the begin or end positions found in an NCBI gene features file.

    The whole file is scanned for spans of the form ``<digits>..<digits>``.
    For every span, the number before the two dots is a begin position and
    the number after them is an end position.

    files -- path of the features file (multi-FASTA) to read.
    typ -- "begin" to get the begin positions, "end" to get the end ones.

    Returns a list of strings in order of appearance, or None when typ is
    neither "begin" nor "end".
    """
    # The context manager closes the file as soon as it has been read.
    with open(files) as data:
        info = data.read()
    # Capture groups return each span already split into its two numbers.
    pairs = re.findall(r"(\d+)\.\.(\d+)", info)
    if typ == "begin":
        return [first for first, _ in pairs]
    if typ == "end":
        return [last for _, last in pairs]
    return None