-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_ref.py
78 lines (62 loc) · 1.65 KB
/
get_ref.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sys
import xml.etree.ElementTree as ET
if len(sys.argv)!=2:
print sys.argv[0], "xml_filename"
sys.exit(1)
xml_filename=sys.argv[1]
tree = ET.parse(xml_filename)
root = tree.getroot()
dna_samples={}
for c in root.findall("./Result"):
r=c.findall("./analyte_code")
r2=c.findall("./center_name")
r3=c.findall("./sample_type")
if len(r)==1 and len(r2)==1 and len(r3)==1:
sample_type=int(r3[0].text)
center_name=r2[0].text
if center_name not in dna_samples:
dna_samples[center_name]={}
if r[0].text=="D":
dna_samples[center_name][sample_type]=c
else:
print r
continue
if len(dna_samples)==0:
sys.exit(1)
largest=0
largest_pair=[]
#trim by type
for center in dna_samples:
samples=dna_samples[center]
if len(samples)>2:
#get the smallest id lower then 10
#get the smallest id larger or equal to 10
tumor_id=100
normal_id=100
for k in samples.keys():
if k<10 and k<tumor_id:
tumor_id=k
if k>=10 and k<normal_id:
normal_id=k
samples={tumor_id:samples[tumor_id],normal_id:samples[normal_id]}
size=0
for sid in samples:
y=samples[sid]
size+=sum(map(lambda x : int(x.text), y.findall('./files/file/filesize')))
if size>largest:
largest=size
largest_pair=samples
if len(largest_pair)==2:
for sid in largest_pair:
#guess the ref
refstr=largest_pair[sid].findall('./refassem_short_name')[0].text.lower()
ref=""
if refstr.find('hg19')>=0:
ref='hg19'
elif refstr.find('hg18')>=0:
ref='hg18'
elif refstr.find('36')>=0:
ref='hg18'
elif refstr.find('37')>=0:
ref='hg19'
print largest_pair[sid].findall('./analysis_id')[0].text, largest_pair[sid].findall('./refassem_short_name')[0].text,ref