1 """
2 This module contains the functions for calculating the information gain of a
3 dataset as defined by the ID3 (Information Theoretic) heuristic.
4 """
5 import copy
6 import math
7
def frequency(data, attr):
    """Compute the histogram of values for the given attribute.

    @param data: the records to be analyzed
    @type data: list of dict
    @param attr: the attribute (key) to be analyzed
    @return: a dictionary of frequency counts, indexed by attribute values
    @rtype: dict
    @warning: assumes all fields have the same number of possible values
    """
    val_freq = {}
    # First pass: register every concrete (non-None) value observed for attr.
    for record in data:
        value = record[attr]
        if value is not None:
            val_freq[value] = 0.
    # Second pass: accumulate weighted counts.  A record's weight grows with
    # the number of missing (None) fields it contains; a record whose attr
    # itself is missing spreads its weight evenly over all known values.
    for record in data:
        # dict.values() has no .count() in Python 3; materialize to a list.
        missing = list(record.values()).count(None)
        weight = pow(float(len(val_freq)), missing)
        if record[attr] is None:
            for value in val_freq.keys():
                val_freq[value] += weight / float(len(val_freq))
        else:
            val_freq[record[attr]] += weight
    return val_freq
32
def entropy(data, target_attr):
    """Calculate the entropy of the given data set for the target attribute.

    @param data: the records to be analyzed
    @type data: list of dict
    @param target_attr: the attribute whose value distribution is measured
    @return: the Shannon entropy (base 2) of target_attr's (possibly
        weighted, via frequency()) value distribution
    @rtype: float
    """
    val_freq = frequency(data, target_attr)
    data_entropy = 0.0
    total = float(sum(val_freq.values()))
    for freq in val_freq.values():
        prob = freq / total
        # Standard Shannon entropy term; frequency() never yields a
        # zero count for a registered value, so log(prob) is defined.
        data_entropy -= prob * math.log(prob, 2)

    return data_entropy
45
def gain(data, attr, target_attr):
    """Calculate the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).

    @param data: the records to be analyzed
    @type data: list of dict
    @param attr: the candidate attribute to split on
    @param target_attr: the classification attribute
    @return: entropy(data) minus the weighted entropy of the subsets
        induced by attr's values
    @rtype: float
    """
    val_freq = frequency(data, attr)
    subset_entropy = 0.0

    total = float(sum(val_freq.values()))
    for val in val_freq.keys():
        val_prob = val_freq[val] / total
        data_subset = []
        for record in data:
            if record[attr] == val:
                data_subset.append(record)
            elif record[attr] is None:
                # Impute the missing value: a shallow copy of the record is
                # treated as having this value, so records with a missing
                # attr contribute to every subset.
                datum = copy.copy(record)
                datum[attr] = val
                data_subset.append(datum)
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    return (entropy(data, target_attr) - subset_entropy)
72