-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_biolink_stats.sh
executable file
·98 lines (89 loc) · 2.66 KB
/
get_biolink_stats.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash
# Get transformation stats for BioPortal ontologies,
# as counts of nodes and edges per ontology
# and as counts of nodes/edges per Biolink class.
# Output is TSV with ontologies in rows
# and count types in columns.
#
# Usage: get_biolink_stats.sh [TRANSFORMED_ONTOLOGIES_DIR]
#   TRANSFORMED_ONTOLOGIES_DIR defaults to $TX_PATH. Every entry directly
#   inside it yields one output row; its *_nodes.tsv / *_edges.tsv files are
#   searched for recursively below that entry.
# Diagnostics go to stderr so stdout stays valid TSV.

# Default location of the per-ontology transform output.
TX_PATH="./transformed/ontologies/"

all_nodetypes=(
  "biolink:NamedThing"
  "biolink:OntologyClass"
  "biolink:BiologicalProcess"
  "biolink:Cell"
  "biolink:CellularComponent"
  "biolink:ChemicalSubstance"
  "biolink:Disease"
  "biolink:Event"
  "biolink:ExposureEvent"
  "biolink:Gene"
  "biolink:MolecularActivity"
  "biolink:OrganismalEntity"
  "biolink:Pathway"
  "biolink:PhenotypicFeature"
  "biolink:Protein"
  "biolink:SequenceFeature"
  "biolink:SexQualifier"
  "biolink:Source"
  "biolink:TaxonomicRank"
  "biolink:Unit"
  "biolink:AnatomicalEntity"
)
all_edgetypes=(
  "biolink:related_to"
  "biolink:subclass_of"
  "biolink:part_of"
  "biolink:inverseOf"
  "biolink:subPropertyOf"
  "biolink:has_part"
  "biolink:has_participant"
  "biolink:has_unit"
  "biolink:preceded_by"
  "biolink:has_attribute"
  "biolink:positively_regulates"
  "biolink:negatively_regulates"
)
# Column order of the per-class counts: node types first, then edge types.
# Plain assignment (not +=) so an inherited variable cannot leak into the header.
all_classes=("${all_nodetypes[@]}" "${all_edgetypes[@]}")

#######################################
# Print the arguments as a single tab-separated line.
# Arguments: the cell values
# Outputs:   one TSV line on stdout
#######################################
join_tsv() {
  local IFS=$'\t'
  printf '%s\n' "$*"
}

#######################################
# Locate the single table file below a directory whose name ends in a suffix.
# Arguments: $1 - directory to search (recursively)
#            $2 - file name suffix, e.g. _nodes.tsv
# Outputs:   the path on stdout when exactly one regular file matches;
#            nothing otherwise (a warning goes to stderr when several match)
#######################################
find_table() {
  local dir=$1 suffix=$2
  local -a matches=()
  local path
  # Search only below this entry instead of rescanning the whole working tree.
  # -H follows a symlinked entry; -mindepth 1 keeps the entry itself from
  # matching; NUL delimiters keep unusual file names intact.
  while IFS= read -r -d '' path; do
    matches+=("$path")
  done < <(find -H "$dir" -mindepth 1 -name "*${suffix}" -print0)

  if (( ${#matches[@]} > 1 )); then
    # Counts stay at zero in this case, as before; the warning makes it visible.
    printf 'warning: %s: %d files match *%s; reporting zero counts\n' \
      "$dir" "${#matches[@]}" "$suffix" >&2
  elif (( ${#matches[@]} == 1 )) && [[ -f "${matches[0]}" ]]; then
    printf '%s\n' "${matches[0]}"
  fi
}

#######################################
# Count data rows and per-type rows of one TSV table in a single pass.
# A line counts towards a type when one of its tab-separated fields, split
# on "|", contains a value exactly equal to that type, so "biolink:Cell"
# no longer also counts "biolink:CellularComponent" lines.
# NOTE(review): "|" is assumed to be the multi-value delimiter inside a
# field (KGX TSV convention) - confirm against the transform output.
# Arguments: $1    - table path; empty means "no table" (all counts are 0)
#            $2... - type names to count (must not contain whitespace)
# Outputs:   one TSV line: <line count minus header> <count per type...>
#######################################
table_stats() {
  local file=$1
  shift
  if [[ -n "$file" && ! -r "$file" ]]; then
    printf 'warning: cannot read %s; reporting zero counts\n' "$file" >&2
    file=""
  fi
  local IFS=' '
  # Input arrives via redirection so a path containing "=" is never taken
  # for an awk variable assignment.
  awk -F '\t' -v types="$*" '
    BEGIN {
      n = split(types, wanted, " ")
      for (i = 1; i <= n; i++) is_wanted[wanted[i]] = 1
    }
    {
      sub(/\r$/, "")        # tolerate CRLF line endings
      split("", seen)       # portable way to empty an array
      for (f = 1; f <= NF; f++) {
        m = split($f, tokens, "[|]")
        for (t = 1; t <= m; t++) {
          tok = tokens[t]
          if ((tok in is_wanted) && !(tok in seen)) {
            seen[tok] = 1   # count each line at most once per type
            count[tok]++
          }
        }
      }
    }
    END {
      # The first line is the header; never report a negative count.
      printf "%d", (NR > 0 ? NR - 1 : 0)
      for (i = 1; i <= n; i++) printf "\t%d", count[wanted[i]]
      printf "\n"
    }
  ' < "${file:-/dev/null}"
}

#######################################
# Print the stats table for every entry of the ontologies directory.
# Globals:   TX_PATH, all_nodetypes, all_edgetypes, all_classes (read)
# Arguments: $1 - optional ontologies directory (default: $TX_PATH)
# Outputs:   TSV on stdout, warnings on stderr
# Returns:   0
#######################################
main() {
  local tx_path=${1:-$TX_PATH}
  tx_path="${tx_path%/}/" # the glob below relies on the trailing slash

  join_tsv "Ontology" "NodeCount" "EdgeCount" "${all_classes[@]}"

  if [[ ! -d "$tx_path" ]]; then
    printf 'warning: %s is not a directory; no ontologies to report\n' \
      "$tx_path" >&2
    return 0
  fi

  local entry nodefile edgefile
  local -a node_stats edge_stats
  for entry in "$tx_path"*; do
    # An unmatched glob stays literal; skip it rather than print a "*" row.
    [[ -e "$entry" || -L "$entry" ]] || continue

    nodefile=$(find_table "$entry" "_nodes.tsv")
    edgefile=$(find_table "$entry" "_edges.tsv")
    # Element 0 is the row count, the remaining elements are per-type counts.
    IFS=$'\t' read -r -a node_stats \
      < <(table_stats "$nodefile" "${all_nodetypes[@]}")
    IFS=$'\t' read -r -a edge_stats \
      < <(table_stats "$edgefile" "${all_edgetypes[@]}")

    join_tsv "${entry##*/}" "${node_stats[0]}" "${edge_stats[0]}" \
      "${node_stats[@]:1}" "${edge_stats[@]:1}"
  done
}

main "$@"