1 """
2 This module contains the functions for calculating the information gain of a
3 dataset as defined by the ID3 (Information Theoretic) heuristic.
4 """
5 import copy
6 import math
7
def frequency(data, attr):
    """Compute the histogram of values for the given attribute.

    @param data: the records to be analyzed
    @type data: list of dict
    @param attr: the attribute (key) to be analyzed
    @return: a dictionary of frequency counts, indexed by attribute values
    @rtype: dict
    @warning: assumes all fields have the same number of possible values
    """
    val_freq = {}
    # First pass: register every concrete (non-None) value observed for attr.
    for record in data:
        value = record[attr]
        if value is not None:
            val_freq[value] = 0.
    # Second pass: accumulate weighted counts.  A record's weight grows with
    # the number of missing (None) fields it contains; a record whose attr
    # itself is missing spreads its weight evenly over all known values.
    for record in data:
        # dict.values() has no .count() in Python 3; materialize to a list.
        missing = list(record.values()).count(None)
        weight = pow(float(len(val_freq)), missing)
        if record[attr] is None:
            for value in val_freq.keys():
                val_freq[value] += weight / float(len(val_freq))
        else:
            val_freq[record[attr]] += weight
    return val_freq
32
def entropy(data, target_attr):
    """Calculate the entropy of the given data set for the target attribute.

    @param data: the records to be analyzed
    @type data: list of dict
    @param target_attr: the attribute whose value distribution is measured
    @return: the Shannon entropy (base 2) of target_attr's (possibly
        weighted, via frequency()) value distribution
    @rtype: float
    """
    val_freq = frequency(data, target_attr)
    data_entropy = 0.0
    total = float(sum(val_freq.values()))
    for freq in val_freq.values():
        prob = freq / total
        # Standard Shannon entropy term; frequency() never yields a
        # zero count for a registered value, so log(prob) is defined.
        data_entropy -= prob * math.log(prob, 2)

    return data_entropy
45
def gain(data, attr, target_attr):
    """Calculate the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).

    @param data: the records to be analyzed
    @type data: list of dict
    @param attr: the candidate attribute to split on
    @param target_attr: the classification attribute
    @return: entropy(data) minus the weighted entropy of the subsets
        induced by attr's values
    @rtype: float
    """
    val_freq = frequency(data, attr)
    subset_entropy = 0.0

    total = float(sum(val_freq.values()))
    for val in val_freq.keys():
        val_prob = val_freq[val] / total
        data_subset = []
        for record in data:
            if record[attr] == val:
                data_subset.append(record)
            elif record[attr] is None:
                # Impute the missing value: a shallow copy of the record is
                # treated as having this value, so records with a missing
                # attr contribute to every subset.
                datum = copy.copy(record)
                datum[attr] = val
                data_subset.append(datum)
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    return (entropy(data, target_attr) - subset_entropy)
72