Package teamwork :: Package math :: Module dtree
[hide private]
[frames] | [no frames]

Source Code for Module teamwork.math.dtree

  1  """ 
  2  This module holds functions that are responsible for creating a new 
  3  decision tree and for using the tree for data classification. 
  4  """ 
  5  from id3 import frequency 
  6   
def majority_value(data, target_attr):
    """
    Return the value of target_attr that occurs most often across the
    records in data (None when the histogram is empty).
    """
    histogram = frequency(data, target_attr)
    best_value = None
    best_count = 0
    for candidate, count in histogram.items():
        # Keep the first value seen, then only replace on a strictly
        # higher count, matching the original tie-breaking behavior.
        if best_value is None or count > best_count:
            best_value, best_count = candidate, count
    return best_value
21
def unique(lst):
    """
    Return the distinct values of lst, preserving first-seen order.
    """
    # Linear membership scan keeps support for unhashable elements,
    # at the cost of O(n^2) in the worst case.
    seen = []
    for element in lst:
        if element not in seen:
            seen.append(element)
    return seen
34
def get_values(data, attr):
    """
    Return the distinct, non-None values stored under attr across the
    records in data.
    """
    present = [record[attr] for record in data if record[attr] is not None]
    return unique(present)
42
def choose_attribute(data, attributes, target_attr, fitness):
    """
    Return the attribute (other than target_attr) with the highest
    fitness score; ties are resolved in favor of the later attribute.
    """
    best_attr = None
    best_gain = 0.0
    for candidate in attributes:
        if candidate == target_attr:
            continue
        gain = fitness(data, candidate, target_attr)
        # >= keeps the original behavior of preferring later candidates
        # on equal gain.
        if best_attr is None or gain >= best_gain:
            best_attr, best_gain = candidate, gain
    return best_attr
60
def get_examples(data, attr, value):
    """
    Filter data down to the records whose attr equals value; records
    whose attr is None (missing) are also kept.
    """
    def matches(record):
        return record[attr] == value or record[attr] is None
    return filter(matches, data)
67
def get_classification(record, tree):
    """
    Recursively traverse the decision tree and return the classification
    (a string leaf) for the given record.
    """
    # A string node is a leaf: it is the answer itself.
    if isinstance(tree, str):
        return tree

    # Interior node: a single-key dict {attribute: {value: subtree}}.
    # next(iter(tree)) replaces the Python-2-only tree.keys()[0].
    attr = next(iter(tree))
    subtree = tree[attr][record[attr]]
    return get_classification(record, subtree)
83
def classify(tree, data):
    """
    Return the classification the decision tree assigns to each record
    in data, in the same order as the records.
    """
    return [get_classification(record, tree) for record in data]
97
def create_decision_tree(data, attributes, target_attr, fitness_func):
    """
    Return a new decision tree built from the given examples.

    The tree is either a classification string (a leaf) or a nested dict
    of the form {attribute: {value: subtree}}.
    """
    vals = get_values(data, target_attr)
    default = majority_value(data, target_attr)
    # If the dataset is empty or the attributes list is empty, return the
    # default value. When checking the attributes list for emptiness, we
    # need to subtract 1 to account for the target attribute.
    if not data or len(attributes) <= 1:
        return default
    # If all the records in the dataset have the same classification,
    # return that classification.
    elif len(vals) == 1:
        return vals[0]
    else:
        # Choose the next best attribute to best classify our data.
        best = choose_attribute(data, attributes, target_attr,
                                fitness_func)
        # New decision node: the best attribute mapping each of its
        # values to a subtree, filled in below.
        tree = {best: {}}

        # Build a subtree for each value the best attribute takes on.
        subattributes = [attr for attr in attributes if attr != best]
        for val in get_values(data, best):
            # Records matching this value (plus records missing it).
            subdata = get_examples(data, best, val)
            subtree = create_decision_tree(subdata, subattributes,
                                           target_attr, fitness_func)
            tree[best][val] = subtree
        if len(tree[best]) == 0:
            # Not sure whether this should ever happen: the best
            # attribute had no usable values, so retry without it.
            return create_decision_tree(data, subattributes,
                                        target_attr, fitness_func)
        return tree
142