import sys
from copy import deepcopy

from ruleset import *
from rule import *
from utilities import *

__docformat__ = 'epytext'
13 """
14 Class for learning ripple-down rules from data using the greedy set covering.
15 """

    def learn(self, data_file, features_file, max_depth, cutoff, pos_symbol,
              neg_symbol, min_length, max_length, regenerate):
19 """
20 Learn RDR from data using greedy set covering and display the results.
21
22 @type data_file: string
23 @param data_file: file containing the training data.
24 @type features_file: string
25 @param features_file: file containing the features or "all" if all possible
26 substrings of given lengths are to be generated.
27 @type max_depth: integer
28 @param max_depth: maximum depth of rules.
29 @type cutoff: float
30 @param cutoff: cutoff value for cost of sets to be added to the covering.
31 @type pos_symbol: string
32 @param pos_symbol: symbol signifying positive example in data.
33 @type neg_symbol: string
34 @param neg_symbol: symbol signifying negative example in data.
35 @type min_length: integer
36 @param min_length: minimum length for substrings to be generated.
37 @type max_length: integer
38 @param max_length: maximum length for substrings to be generated.
39 @type regenerate: boolean
40 @param regenerate: whether to regenerate possible rules after
41 each time a rule is added to the ruleset
42 """
        data = get_data(data_file, pos_symbol, neg_symbol)
        if len(data) == 0:
            print "No data points found in " + data_file + "!"
            sys.exit(2)
        self.MAX_DEPTH = max_depth
        self.COST_CUTOFF = cutoff
        self.REGEN = regenerate
        if features_file == "all":
            features = gen_substrings(data, min_length, max_length)
        else:
            features = get_features(features_file)
            if len(features) == 0:
                print "No features found in " + features_file + "!"
                sys.exit(2)
        rules = self.find_rules(data, features, max_depth)
        Ruleset.print_rule_tree(rules, 0, pos_symbol, neg_symbol)

    def check_data(self, data, depth):
        """
        Check whether any data points still need to be classified at a
        particular depth.

        @type data: dict
        @param data: data points and their classifications.
        @type depth: integer
        @param depth: depth to look at.
        @rtype: boolean
        @return: True if there exists a data point that is currently
            classified incorrectly.
        """
        if len(data) == 0:
            return False
        # Ripple-down exceptions flip the target class at each level, so the
        # label sought alternates in sign with the depth.
        seeking = (-1) ** (self.MAX_DEPTH - depth + 1)
        for value in data.values():
            if value == seeking:
                return True
        return False
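        # E.g. with MAX_DEPTH == 2, a call at depth 2 seeks (-1) ** 1 == -1
        # and a call at depth 1 seeks (-1) ** 2 == 1: the sought label flips
        # at every exception level (labels are assumed to be encoded as
        # +1/-1).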

    def count_errors(self, data, depth):
        """
        Count the number of elements wrongly classified at a given depth.

        @type data: dict
        @param data: data points and their classifications.
        @type depth: integer
        @param depth: depth to look at.
        @rtype: integer
        @return: Number of wrongly classified elements.
        """
        if len(data) == 0:
            return 0
        err = 0
        seeking = (-1) ** (self.MAX_DEPTH - depth + 1)
        for value in data.values():
            if value == seeking:
                err += 1
        return err

    def pos_data_for_depth(self, data, depth):
        """
        Find the data points to be classified at a particular depth.

        @type data: dict
        @param data: data points and their classifications.
        @type depth: integer
        @param depth: depth to look at.
        @rtype: dict
        @return: Data points to be classified at the given depth.
        """
        pos_data = {}
        seeking = (-1) ** (self.MAX_DEPTH - depth + 1)
        for (item, value) in data.items():
            if value == seeking:
                pos_data[item] = value
        return pos_data
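        # Illustrative sketch (again assuming +1/-1 labels): if seeking == 1
        # at this depth, {"abc": 1, "xyz": -1} filters down to {"abc": 1}.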

    def data_for_rule(self, data, rule):
        """
        Find the data points covered by a rule.

        @type data: dict
        @param data: data points to look through.
        @type rule: L{Rule}
        @param rule: rule whose covered data points are collected.
        @rtype: dict
        @return: Data points covered by the rule.
        """
        new_data = {}
        for (item, value) in data.items():
            if item.find(rule.name) != -1:
                new_data[item] = value
        return new_data
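        # Note that coverage above is a plain substring test: a rule named
        # "ab" covers every data point whose string contains "ab" anywhere.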

    def greedy_set_cover(self, data, pos_data, ruleset, possible_rules,
                         covered_data):
        """
        Find a greedy set covering.

        @type data: dict
        @param data: data points and their classifications.
        @type pos_data: dict
        @param pos_data: data points to be classified by the rules and their
            classifications.
        @type ruleset: L{Ruleset}
        @param ruleset: rules already found.
        @type possible_rules: list
        @param possible_rules: possible rules that can be used for
            classification.
        @type covered_data: dict
        @param covered_data: data points already covered by a rule.
        @rtype: Ruleset
        @return: Ruleset that was found.
        """
        pos_length = float(len(dict_difference(pos_data, covered_data)))
        best_rule = Rule()
        while possible_rules:
            # Re-score every candidate against the still-uncovered data and
            # remember the cheapest one.
            for i, rule in enumerate(possible_rules):
                rule.additional_confidence = \
                    float(len(dict_difference(rule.sub_pos_data, covered_data)))
                if rule.cost() < best_rule.cost():
                    best_rule = deepcopy(rule)
                    best_rule_index = i

            if (best_rule.name is not None and
                    best_rule.cost() <= self.COST_CUTOFF):
                ruleset.append(best_rule)
                del possible_rules[best_rule_index]
                dict_sum(covered_data, best_rule.sub_data)
                pos_length = float(len(dict_difference(pos_data, covered_data)))
                ruleset.coverage += best_rule.additional_coverage
                best_rule = Rule()
            else:
                # No remaining rule is cheap enough; stop covering.
                break

        # Whatever is still uncovered at this depth counts as error.
        ruleset.error += pos_length
        return ruleset
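        # dict_difference and dict_sum come from utilities; from their use
        # here they are assumed to act as set difference and in-place union
        # over the dictionaries' keys.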

    def find_possible_rules(self, features, data, depth, covered_data,
                            pos_length):
        """
        Find possible rules given a set of data points and features.

        @type features: list
        @param features: possible features for classification.
        @type data: dict
        @param data: data points and their classifications.
        @type depth: integer
        @param depth: maximum depth for exceptions.
        @type covered_data: dict
        @param covered_data: data points already covered by a rule.
        @type pos_length: integer
        @param pos_length: number of unclassified elements.
        @rtype: (list of Rules, Rule, integer)
        @return: Possible rules, the best rule and its index.
        """
        possible_rules = []
        best_rule_index = None
        best_rule = Rule()
        feat_copy = deepcopy(features)
        feat_copy.sort()
        for element in feat_copy:
            rule = Rule(element)

            sub_data = self.data_for_rule(data, rule)
            sub_pos_data = self.pos_data_for_depth(sub_data, depth)

            rule.sub_data = sub_data
            rule.sub_pos_data = sub_pos_data
            rule.confidence = float(len(sub_pos_data))
            rule.coverage = float(len(sub_data))
            rule.error = rule.coverage - rule.confidence
            rule.additional_confidence = \
                float(len(dict_difference(sub_pos_data, covered_data)))
            rule.additional_coverage = \
                float(len(dict_difference(sub_data, covered_data)))
            rule.length = len(element)

            # Withhold the feature from itself while learning its exceptions
            # one level deeper.
            features.remove(element)
            exceptions = self.find_rules(sub_data, features, depth - 1)
            rule.set_exceptions(exceptions)
            features.append(element)

            # Discard rules that classify nothing new or whose error
            # (apparently a percentage) is too high.
            if rule.additional_confidence == 0.0:
                continue
            if rule.get_error() >= 50:
                continue

            possible_rules.append(rule)

            if rule.cost() < best_rule.cost():
                best_rule = deepcopy(rule)
                best_rule_index = len(possible_rules) - 1
        return (possible_rules, best_rule, best_rule_index)

    def find_rules(self, data, features, depth):
        """
        Find rules from data.

        @type data: dict
        @param data: data points and their classifications.
        @type features: list
        @param features: possible features for classification.
        @type depth: integer
        @param depth: maximum depth of exceptions for the rules to be found.
        @rtype: Ruleset
        @return: Ruleset that was found.
        """
        ruleset = Ruleset()
        if depth == self.MAX_DEPTH:
            # The root of the tree is a single default rule; the real work
            # happens in its exceptions.
            rule = Rule("default")
            rule.set_exceptions(self.find_rules(data, features, depth - 1))
            pos_data = self.pos_data_for_depth(data, depth)
            rule.confidence = float(len(pos_data))
            rule.coverage = float(len(data))
            ruleset.append(rule)
            return ruleset
        if not self.check_data(data, depth):
            # Nothing left to classify at this depth.
            return ruleset
        if depth < 0:
            # Depth exhausted: everything still misclassified here is error.
            ruleset.length = 1.0
            ruleset.error = self.count_errors(data, depth)
            return ruleset

        covered_data = {}
        pos_data = self.pos_data_for_depth(data, depth)
        pos_length = float(len(pos_data))
        feat_copy = deepcopy(features)
        while pos_length > 0:
            (possible_rules, best_rule, best_rule_index) = \
                self.find_possible_rules(feat_copy, data, depth,
                                         covered_data, pos_length)

            if (best_rule.name is not None and
                    best_rule.cost() <= self.COST_CUTOFF):
                del possible_rules[best_rule_index]
                ruleset.append(best_rule)
                dict_sum(covered_data, best_rule.sub_data)
                pos_length = float(len(dict_difference(pos_data, covered_data)))
                ruleset.coverage += best_rule.additional_coverage
            else:
                break

            if self.REGEN is False:
                # Without regeneration, hand the once-generated candidate
                # list over to the plain greedy covering.
                return self.greedy_set_cover(data, pos_data, ruleset,
                                             possible_rules, covered_data)

        ruleset.error += pos_length
        return ruleset

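# Entry point: parse_arguments (from utilities) is expected to return the
# nine arguments of learn() in order; they are unpacked directly below.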
if __name__ == "__main__":
    arguments = parse_arguments(sys.argv, False)
    s = RDRLearner()
    s.learn(*arguments)