forked from JorisJoBo/treedentifier
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDecisionTree_Dependent.py
110 lines (92 loc) · 3.14 KB
/
DecisionTree_Dependent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from sklearn import tree
import csv
import numpy as np
import graphviz
import matplotlib.pylab as plt
# Converts relative_learning_data.csv to a list of rows (header row first).
# The file is opened with newline='' as the csv module documentation asks,
# so line endings inside quoted fields are left for the reader to handle.
# Completely blank rows are skipped; they would otherwise fail with an
# IndexError when the columns are filtered below.
data = []
with open('relative_learning_data.csv', newline='') as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        if row:
            data.append(row)

features = data[0]
# Filled by decisiontree() with every class label it encounters.
classes = []
data = data[1:]

# IDs of columns that aren't useful for decision trees.
removed_features = [0, 1, 2, 3, 4, 5]

# Removes these columns from the feature names and the dataset.
kept_columns = [
    index for index in range(len(features))
    if index not in removed_features
]
cl_features = [features[index] for index in kept_columns]
cl_data = [[row[index] for index in kept_columns] for row in data]

features = cl_features
data = cl_data
def decisiontree(data):
    """Fit a decision tree on a random 90/10 split of ``data``.

    Each row of ``data`` holds the feature values followed by the class
    label in the last position. ``data`` is shuffled in place. The split is
    redrawn until every label in the validation part also occurs in the
    training part of that same split, so the classifier is never scored on
    a class it was not trained on.

    Labels of the accepted training part that are not yet in the
    module-level ``classes`` list are appended to it.

    Returns:
        A tuple ``(clf, Xt, Yt, Xv, Yv)``: the fitted classifier, the
        training features and labels, and the validation features and
        labels.

    Raises:
        ValueError: if ``data`` contains more distinct labels than there
            are rows in the training part, so no valid split exists.
    """
    # 90% of the data goes to the training set, 10% to the validation set.
    trainingsize = int(0.9 * len(data))

    # Without this guard the retry loop below could never terminate.
    distinct_labels = {line[-1] for line in data}
    if len(distinct_labels) > trainingsize:
        raise ValueError(
            'not enough rows to put every class in the training set')

    while True:
        np.random.shuffle(data)
        training = data[:trainingsize]
        validation = data[trainingsize:]
        Yt = [line[-1] for line in training]
        Yv = [line[-1] for line in validation]
        # Compare against the labels of THIS training split. The global
        # ``classes`` list accumulates over calls and therefore cannot tell
        # whether the current training part covers the validation labels.
        if set(Yv) <= set(Yt):
            break

    # Creates the X parts of the training and validation sets.
    Xt = [line[0:-1] for line in training]
    Xv = [line[0:-1] for line in validation]

    # Records every tree species (class) that has not been seen before.
    for label in Yt:
        if label not in classes:
            classes.append(label)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(Xt, Yt)
    return clf, Xt, Yt, Xv, Yv
# Trains a single tree; this is the one that gets visualised below.
clf, Xt, Yt, Xv, Yv = decisiontree(data)

# class_names needs the species in alphabetical order, so sort them in place.
classes.sort()

# Builds the graphviz description of the tree and exports it as a PDF.
# The last entry of features is the class column, hence features[:-1].
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=features[:-1],
    class_names=classes,
    special_characters=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.render('tree', view=True)
# Averages the validation score of n freshly trained decision trees.
def avgcost(data, n):
    """Return the mean validation score over ``n`` runs of decisiontree().

    Every run calls ``decisiontree(data)`` for a new split and a new
    classifier, then scores that classifier on the run's validation part.
    """
    score_sum = 0
    for _run in range(n):
        model, _train_x, _train_y, val_x, val_y = decisiontree(data)
        score_sum += model.score(val_x, val_y)
    return score_sum / n
# Reports the validation score averaged over 500 trained trees.
average_correctness = avgcost(data, 500)
print('Average Correctness: ' + str(average_correctness))
# This calculates the usage (/importance) for all features in the decisiontree.
def avgimportance(data, n, features):
    """Print the importance of each feature averaged over ``n`` trees.

    A new tree is trained with ``decisiontree(data)`` for every run and its
    ``feature_importances_`` are summed. Afterwards one line
    ``<feature name>: <average importance>`` is printed per feature.

    Args:
        data: rows passed on to ``decisiontree``.
        n: number of trees to train; must be positive.
        features: feature names, in the same order as the feature columns.
            Extra trailing names (such as the class column) are ignored.

    Raises:
        ValueError: if ``n`` is not positive, because there would be
            nothing to average.
    """
    if n <= 0:
        raise ValueError('n must be a positive number of runs')

    totalimportance = None
    for _run in range(n):
        clf = decisiontree(data)[0]
        importance = clf.feature_importances_
        if totalimportance is None:
            totalimportance = importance
        else:
            # Element-wise array sum; builds a new array every time, so the
            # classifier's own importance array is never modified.
            totalimportance = totalimportance + importance

    # zip stops at the shorter input, i.e. at the number of importances.
    for name, total in zip(features, totalimportance):
        print(str(name) + ': ' + str(total / n))
# Prints the per-feature importance averaged over 500 trained trees.
avgimportance(data, n=500, features=features)