'''
data_processing.py

usage for a directory:   data_processing.py input_directory output.csv
usage for a single file: data_processing.py input.csv output.csv

Combines all csv files generated by lascanopy and removes lines with
missing values. Can also be used to remove lines with missing values
from a single csv file.

The directory containing the csv files to be combined has to be present
in the same directory as this file. The script then uses
dominant_trees.csv to turn the result into data suitable for learning
algorithms.
'''
import os
import csv
from sys import argv

# Acquire source and destination from the command line.
dirname = argv[1]
try:
    outputfile = argv[2]
except IndexError:
    print("no output file given, output saved to 'combined_canopy.csv'")
    outputfile = "combined_canopy.csv"
# Extract and store a single header and every data line from all files.
lines = []
header = None
if dirname.endswith('.csv'):
    with open(dirname, 'r') as f:
        for i, line in enumerate(f):
            if i != 0:
                lines.append(line)
            else:
                header = line
else:
    for file in os.listdir(dirname):
        if file.endswith(".csv"):
            with open(os.path.join(dirname, file), 'r') as f:
                for i, line in enumerate(f):
                    if i != 0:
                        lines.append(line)
                    else:
                        header = line
# Remove lines with missing values (marked '-') and write the rest to the
# output file, preceded by the single header.
with open(outputfile, 'w') as f:
    f.write(header)
    for line in lines:
        # Strip the newline so a missing value in the last column is
        # also detected.
        fields = line.rstrip('\n').split(',')
        if "-" not in fields:
            f.write(line)
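# The membership test compares whole fields, so only a field that is
# exactly '-' marks a line as incomplete; a '-' inside a value (e.g. a
# negative number such as -1.5) does not.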
# Remove any grown polygons whose values deviate too much from the
# original polygon.
with open(outputfile) as f:
    with open('cleaned_canopy.csv', 'a') as result_file:
        reader = csv.reader(f, delimiter=',')
        o = []
        polygonID = None
        for line in reader:
            if line[0] == 'index':
                # Copy the header line as-is.
                result_file.write(','.join(line) + '\n')
            elif line[1] != polygonID:
                # First row of a new polygon: keep it as the reference.
                polygonID = line[1]
                o = line
                result_file.write(','.join(line) + '\n')
            else:
                # Keep a grown row only if every statistic (columns 6
                # onward) stays within 20% of the reference value, or
                # within an absolute margin of 2.
                write = True
                for i in range(6, len(o)):
                    if abs(float(o[i]) - float(line[i])) > \
                            max(0.2 * abs(float(o[i])), 2):
                        write = False
                if write:
                    result_file.write(','.join(line) + '\n')
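# Worked example of the tolerance rule: with a reference value of 10.0 the
# allowed deviation is max(0.2 * 10.0, 2) == 2.0, so grown values between
# 8.0 and 12.0 are kept; with a reference of 5.0 the absolute floor of 2
# dominates, keeping values between 3.0 and 7.0.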
# Combine cleaned LAScanopy data with corresponding vegetation database labels.
with open('cleaned_canopy.csv') as f1:
    with open('dominant_trees.csv') as f2:
        with open('learning_data.csv', 'a') as result_file:
            canopy = csv.reader(f1, delimiter=',')
            trees = csv.reader(f2, delimiter=',')
            latindict = {}
            # Map each polygon ID (column 1 of dominant_trees.csv) to the
            # Latin species name in its last column.
            for line in trees:
                latindict[line[1]] = line[-1]
            # Use the dictionary to look up the tree name and append it
            # to the line.
            for line in canopy:
                if line[0] == 'index':
                    line.append('latinname')
                else:
                    line.append(latindict[line[1]])
                result_file.write(','.join(line) + '\n')
# Add a numerical index to each datapoint to indicate its species, meant
# for use with an SVM or a neural network.
species_seen = []
with open('learning_data.csv', 'r') as f:
    for i, line in enumerate(f):
        if i > 0:
            species = line.split(',')[-1].rstrip('\n')
            if species not in species_seen:
                species_seen.append(species)
with open('indexed_learning_data.csv', 'w') as f2:
    with open('learning_data.csv', 'r') as f:
        for line in f:
            species = line.split(',')[-1].rstrip('\n')
            if species in species_seen:
                write_str = line.rstrip('\n') + ',' + \
                    str(species_seen.index(species)) + '\n'
            else:
                # Only the header row lacks a known species name; label
                # the new column there instead.
                write_str = line.rstrip('\n') + ',species_index' + '\n'
            f2.write(write_str)
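# Species are indexed in order of first appearance: the species on the
# first data row gets index 0, the next previously unseen species gets 1,
# and so on.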
# For every tree species, copy its rows to the csv file only if that
# species has at least 50 rows of data in the dataset.
with open("indexed_learning_data.csv") as f1:
    with open("common_learning_data.csv", 'w') as f2:
        reader = csv.reader(f1, delimiter=",")
        data = list(reader)
        seen = []
        for line1 in data:
            if line1 == data[0]:
                # Always keep the header row.
                f2.write(','.join(line1) + '\n')
                continue
            c = line1[-1]
            if c not in seen:
                # Count how often this species index occurs in total.
                counter = 0
                for line2 in data:
                    if line2[-1] == c:
                        counter += 1
                if counter >= 50:
                    seen.append(c)
                    f2.write(','.join(line1) + '\n')
            else:
                f2.write(','.join(line1) + '\n')
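# Example: a species with 49 rows is dropped entirely, since none of its
# rows ever passes the >= 50 count; rare species are never added to 'seen',
# so their count is recomputed per row, which is quadratic but correct.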
# Transform the P05 to P90 variables to a fraction of the maximum height
# (column 7), to remove the differences between smaller trees of the same
# species.
with open("common_learning_data.csv") as f:
    with open("relative_learning_data.csv", 'w') as result_file:
        reader = csv.reader(f, delimiter=",")
        data = list(reader)
        result_file.write(','.join(data[0]) + '\n')
        # Columns 13-18 hold the P05..P90 height percentiles.
        locations = [13, 14, 15, 16, 17, 18]
        for line in data[1:]:
            newline = ''
            for count, element in enumerate(line):
                if count in locations:
                    newline += str(float(element) / float(line[7])) + ','
                else:
                    newline += str(element) + ','
            result_file.write(newline[:-1] + '\n')
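# Worked example: with a maximum height of 12.0 in column 7, a P50 value
# of 6.0 becomes 6.0 / 12.0 = 0.5.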