1 """
2 This module holds functions that are responsible for creating a new
3 decision tree and for using the tree for data classification.
4 """
5 from id3 import frequency
6
def majority_value(data, target_attr):
    """
    Return the most common value of target_attr among the records in data.

    Builds a histogram of target-attribute values and returns the value
    with the highest count; returns None when data is empty.

    :param data: list of record dicts.
    :param target_attr: key of the attribute to tally.
    :return: the most frequent value, or None for empty data.
    """
    # NOTE(review): assumes frequency() returns a {value: count} mapping —
    # confirm against the id3 module.
    histogram = frequency(data, target_attr)
    highest_freq = 0
    most_freq = None
    for value, count in histogram.items():
        # The first entry always wins, so even an all-zero histogram
        # yields some value rather than None.
        if most_freq is None or count > highest_freq:
            most_freq = value
            highest_freq = count
    return most_freq
21
def unique(lst):
    """
    Return a list of the unique values found in lst, preserving the order
    of first appearance.

    Uses list membership (not a set) deliberately, so unhashable items
    such as dicts are also accepted.

    :param lst: iterable of values, possibly with duplicates.
    :return: new list with duplicates removed.
    """
    seen = []
    for item in lst:
        if item not in seen:
            seen.append(item)
    return seen
34
def get_values(data, attr):
    """
    Return the distinct values of attr across all records in data.

    Records whose attr value is None (missing) are skipped entirely.

    :param data: list of record dicts.
    :param attr: key of the attribute to collect.
    :return: list of unique non-None values, in first-seen order.
    """
    return unique([record[attr] for record in data
                   if record[attr] is not None])
42
def choose_attribute(data, attributes, target_attr, fitness):
    """
    Return the attribute with the highest information gain (or lowest
    entropy), as scored by the fitness function.

    :param data: list of record dicts.
    :param attributes: candidate attribute names (target_attr is skipped).
    :param target_attr: attribute being predicted; never selected.
    :param fitness: callable(data, attr, target_attr) -> numeric score.
    :return: best-scoring attribute, or None if there are no candidates.
    """
    best_gain = 0.0
    best_attr = None
    for attr in attributes:
        if attr == target_attr:
            continue
        gain = fitness(data, attr, target_attr)
        # ">=" means ties are broken in favor of the attribute listed last.
        if best_attr is None or gain >= best_gain:
            best_gain = gain
            best_attr = attr
    return best_attr
60
def get_examples(data, attr, value):
    """
    Return a list of all the records in data whose attr matches value.

    Records whose attr is None (missing) match every value, so they fall
    into every branch during tree construction. Returns an explicit list
    (not a lazy filter object) because callers take len() of the result,
    which would fail on Python 3's filter iterator.

    :param data: list of record dicts.
    :param attr: attribute key to compare.
    :param value: value to match against.
    :return: list of matching records.
    """
    return [record for record in data
            if record[attr] == value or record[attr] is None]
67
def get_classification(record, tree):
    """
    Recursively traverse the decision tree and return a classification
    for the given record.

    Leaf nodes are bare classification values; internal nodes are
    single-key dicts of the form {attribute: {value: subtree}}.

    :param record: dict mapping attribute names to values.
    :param tree: decision tree (nested dicts) or a leaf value.
    :return: the classification stored at the reached leaf.
    :raises KeyError: if the record's attribute value has no branch.
    """
    # isinstance dispatch replaces the old type(tree) == type("string")
    # check, so non-string leaf values are also handled correctly.
    if not isinstance(tree, dict):
        return tree
    # next(iter(...)) replaces tree.keys()[0], which raises TypeError on
    # Python 3 where dict views are not subscriptable.
    attr = next(iter(tree))
    subtree = tree[attr][record[attr]]
    return get_classification(record, subtree)
83
def classify(tree, data):
    """
    Return a list of classifications for each of the records in data,
    as determined by the given decision tree.

    :param tree: decision tree produced by create_decision_tree.
    :param data: list of record dicts to classify.
    :return: list of classifications, in the same order as data.
    """
    return [get_classification(record, tree) for record in data]
97
def create_decision_tree(data, attributes, target_attr, fitness_func):
    """
    Build a new decision tree (ID3-style) from the examples given.

    The result is either a bare classification value (a leaf) or a nested
    dict of the form {best_attr: {value: subtree}}.

    :param data: list of record dicts (training examples).
    :param attributes: attribute names still available, including target_attr.
    :param target_attr: attribute being predicted.
    :param fitness_func: callable(data, attr, target_attr) -> numeric score,
        used to pick the best splitting attribute.
    :return: decision tree, or a default classification value.
    """
    vals = get_values(data, target_attr)
    default = majority_value(data, target_attr)

    # No data, or only the target attribute left: fall back to the
    # majority classification.
    if not data or len(attributes) <= 1:
        return default
    # All remaining records agree on the target: return that value.
    if len(vals) == 1:
        return vals[0]

    best = choose_attribute(data, attributes, target_attr, fitness_func)
    tree = {best: {}}
    subattributes = [attr for attr in attributes if attr != best]

    # One subtree per observed value of the best attribute; records with a
    # missing (None) value for best fall into every branch (see get_examples).
    for val in get_values(data, best):
        subdata = get_examples(data, best, val)
        tree[best][val] = create_decision_tree(subdata, subattributes,
                                               target_attr, fitness_func)

    # If best had no concrete values (all None), no branches were built;
    # the emptiness check used to sit inside the loop, where it was
    # unreachable (a branch is assigned right above it). Retry without
    # best so the recursion can still make progress.
    if not tree[best]:
        return create_decision_tree(data, subattributes,
                                    target_attr, fitness_func)
    return tree
142