1
2 import math
3 import sys
4 from copy import copy, deepcopy
5 from time import time
6 from ruleset import *
7 from rule import *
8 from utilities import *
9 from rules_adhoc import RDRLearner
10
11 __docformat__='epytext'
12
14 """
15 Class for learning ripple-down rules from data using the minimum description
16 length principle.
17 """
18
def learn(self, data_file, features_file, max_depth, c, pos_symbol,
          neg_symbol, min_length, max_length, regenerate):
    """
    Learn ripple-down rules from data using MDL and display the results.

    @type data_file: string
    @param data_file: file containing the training data.
    @type features_file: string
    @param features_file: file containing the features or "all" if all
        possible substrings of given lengths are to be generated.
    @type max_depth: integer
    @param max_depth: maximum depth of rules.
    @type c: float
    @param c: the value of parameter c in MDL.
    @type pos_symbol: string
    @param pos_symbol: symbol signifying positive example in data.
    @type neg_symbol: string
    @param neg_symbol: symbol signifying negative example in data.
    @type min_length: integer
    @param min_length: minimum length for substrings to be generated.
    @type max_length: integer
    @param max_length: maximum length for substrings to be generated.
    @type regenerate: boolean
    @param regenerate: whether to regenerate possible rules after each
        time a rule is added to the ruleset.
    """
    data = get_data(data_file, pos_symbol, neg_symbol)
    if len(data) == 0:
        # Nothing to learn from; abort with a non-zero exit status.
        print("No data points found in "+data_file+"!")
        sys.exit(2)
    # Store the learning parameters on the instance; the recursive
    # helpers (find_rules and greedy_set_cover_mdl) read them from self.
    self.MAX_DEPTH = max_depth
    self.C = c
    self.REGEN = regenerate
    if features_file == "all":
        features = gen_features(data, min_length, max_length)
    else:
        features = get_features(features_file)
    if len(features) == 0:
        print("No features found in "+features_file+"!")
        sys.exit(2)
    rules = self.find_rules(data, features, max_depth)
    Ruleset.print_rule_tree(rules, 0, pos_symbol, neg_symbol)
62
def greedy_set_cover_mdl(self, data, pos_data, ruleset, possible_rules,
                         datapoint_bits, covered_data):
    """
    Find greedy set cover using MDL.

    Repeatedly re-scores the candidate rules against the data still
    uncovered and appends the best one to the ruleset, until no
    remaining candidate improves the ruleset's description length.

    @type data: dict
    @param data: data points and their classifications.
    @type pos_data: dict
    @param pos_data: data points to be classified by the rules and their
        classifications.
    @type ruleset: L(Ruleset)
    @param ruleset: rules already found.
    @type possible_rules: list
    @param possible_rules: possible rules that can be used for
        classification.
    @type datapoint_bits: float
    @param datapoint_bits: bits needed to encode one data point.
    @type covered_data: dict
    @param covered_data: data points already covered by a rule.
    @rtype: Ruleset
    @return: Ruleset that was found.
    """
    # Number of positive data points not yet covered by any rule.
    pos_length = float(len(dict_difference(pos_data, covered_data)))
    best_rule = Rule()
    # NOTE(review): this initial baseline is scaled by self.C while every
    # later re-initialisation below uses datapoint_bits -- confirm which
    # scaling is intended.
    best_rule.lenEH = pos_length * self.C
    while (len(possible_rules) > 0):
        # Re-score every candidate against the current cover and keep
        # the one with the shortest description length.
        for i, rule in enumerate(possible_rules):
            rule.additional_confidence = \
                float(len(dict_difference(rule.sub_pos_data, covered_data)))
            rule.lenEH = (pos_length - rule.additional_confidence + \
                rule.error) * datapoint_bits
            if (rule.shorter_than(best_rule, self.C)):
                best_rule = deepcopy(rule)
                best_rule_index = i

        if (best_rule.name is not None and ruleset.improved_by(best_rule, datapoint_bits, self.C)):
            ruleset.append(best_rule)
            del possible_rules[best_rule_index]
            # Mark the newly covered points and update the remaining
            # positive count.
            dict_sum(covered_data, best_rule.sub_data)
            pos_length = float(len(dict_difference(pos_data, covered_data)))

            ruleset.lenEH = (ruleset.error + pos_length) * datapoint_bits
            ruleset.coverage += best_rule.additional_coverage

            # Fresh baseline for the next round of comparisons.
            best_rule = Rule()
            best_rule.lenEH = pos_length * datapoint_bits

        else:
            # No remaining candidate improves the ruleset; stop.
            break

    # Points still uncovered count as errors of the final ruleset.
    ruleset.error += pos_length
    return ruleset
116
def find_possible_rules(self, features, data, depth, covered_data,
                        pos_length, datapoint_bits):
    """
    Find possible rules given a set of data points and features.

    @type features: list
    @param features: possible features for classification; temporarily
        mutated (element removed, then re-added) while exceptions for
        that element are learned.
    @type data: dict
    @param data: data points and their classifications.
    @type depth: integer
    @param depth: maximum depth for exceptions.
    @type covered_data: dict
    @param covered_data: data points already covered by a rule.
    @type pos_length: integer
    @param pos_length: number of unclassified elements.
    @type datapoint_bits: float
    @param datapoint_bits: bits needed to encode one data point.
    @rtype: (list of Rules, Rule, integer)
    @return: Possible rules, best rule and its index.
    """
    possible_rules = []
    i = 0
    best_rule_index = None
    best_rule = Rule()
    # Iterate over a sorted copy so the caller's list order is preserved
    # and removal below does not disturb the iteration.
    feat_copy = deepcopy(features)
    feat_copy.sort()
    for element in feat_copy:
        rule = Rule(element)

        sub_data = self.data_for_rule(data, rule)
        sub_pos_data = self.pos_data_for_depth(sub_data, depth)

        rule.sub_data = sub_data
        rule.sub_pos_data = sub_pos_data
        rule.confidence = float(len(sub_pos_data))
        rule.coverage = float(len(sub_data))
        rule.error = rule.coverage - rule.confidence
        # Confidence/coverage counted only over points not yet covered
        # by previously chosen rules.
        rule.additional_confidence = \
            float(len(dict_difference(sub_pos_data,covered_data)))
        rule.additional_coverage = \
            float(len(dict_difference(sub_data,covered_data)))
        rule.length = len(element)
        rule.lenEH = (pos_length - rule.additional_confidence + \
            (rule.coverage - rule.confidence)) * datapoint_bits

        # Remove the element so the recursive call cannot reuse it as
        # its own exception feature.
        features.remove(element)
        exceptions = self.find_rules(sub_data, features, depth - 1)
        # Description length of the rule together with its exceptions.
        sublength = (len(element) + 1 + exceptions.length) * self.C + \
            (pos_length - rule.additional_confidence + exceptions.error) * datapoint_bits

        if sublength <= self.C * rule.length + rule.lenEH:
            # Exceptions shorten the description: attach them.
            rule.sub_data = sub_data
            rule.set_exceptions(exceptions)
            rule.lenEH = (pos_length - rule.additional_confidence + exceptions.error) * datapoint_bits
        # Restore the feature for subsequent candidates.
        # NOTE(review): indentation was lost in this file; the restore is
        # placed outside the if so features is always left intact --
        # confirm against the original layout.
        features.append(element)

        # Skip rules that cover nothing new or are too erroneous.
        if rule.additional_confidence == 0.0:
            continue
        if rule.get_error() >= 50:
            continue

        possible_rules.append(rule)
        if(rule.shorter_than(best_rule, self.C)):
            best_rule = deepcopy(rule)
            best_rule_index = len(possible_rules) - 1
    return (possible_rules, best_rule, best_rule_index)
182
184 """
185 Find rules from data.
186
187 @type data: dict
188 @param data: data points and their classifications.
189 @type features: list
190 @param features: possible features for classification
191 @type depth: integer
192 @param depth: maximum depth for exceptions for the rules to be found.
193 @rtype: Ruleset
194 @return: Ruleset that was found.
195 """
196 ruleset = Ruleset()
197 if depth == self.MAX_DEPTH:
198 rule = Rule("default")
199 rule.set_exceptions(self.find_rules(data, features, depth - 1))
200 pos_data = self.pos_data_for_depth(data, depth)
201 rule.confidence = float(len(pos_data))
202 rule.coverage = float(len(data))
203 ruleset.append(rule)
204 return ruleset
205 if(self.check_data(data, depth)==False):
206
207 return ruleset
208 if(depth < 0):
209 ruleset.length = 1.0
210 ruleset.error = self.count_errors(data, depth)
211 return ruleset
212 covered_data={}
213 best_rule = Rule()
214 pos_data = self.pos_data_for_depth(data, depth)
215 pos_length = float(len(pos_data))
216 data_length = float(len(data))
217
218 feat_copy = deepcopy(features)
219
220 datapoint_bits = log2(data_length)
221 ruleset.lenEH = pos_length * datapoint_bits
222 best_rule.lenEH = pos_length * datapoint_bits
223
224 while (pos_length>0):
225 (possible_rules, best_rule, best_rule_index) = \
226 self.find_possible_rules(feat_copy, data, depth,
227 covered_data, pos_length, datapoint_bits)
228
229 if (best_rule.name is not None and
230 ruleset.improved_by(best_rule, datapoint_bits, self.C)):
231 del possible_rules[best_rule_index]
232 ruleset.append(best_rule)
233
234 dict_sum(covered_data,best_rule.sub_data)
235 data_length = float(len(dict_difference(data, covered_data)))
236 pos_length = float(len(dict_difference(pos_data, covered_data)))
237 ruleset.lenEH = (ruleset.error + pos_length) * datapoint_bits
238 ruleset.coverage += best_rule.additional_coverage
239 best_rule = Rule()
240 best_rule.lenEH = pos_length
241 else:
242 break
243
244 if self.REGEN is False:
245 return self.greedy_set_cover_mdl(data, pos_data, ruleset, possible_rules, datapoint_bits, covered_data)
246
247 ruleset.error += pos_length
248 return ruleset
249
250
251 if __name__ == "__main__":
252 arguments = parse_arguments(sys.argv)
253 s = RDRLearnerMDL()
254 s.learn(*arguments)
255