Module learnrdr_adhoc
[hide private]
[frames] | [no frames]

Source Code for Module learnrdr_adhoc

  1  #!/usr/bin/env python 
  2  import math 
  3  import sys 
  4  from copy import copy, deepcopy 
  5  from time import time 
  6  from ruleset import * 
  7  from rule import * 
  8  from utilities import * 
  9   
 10  __docformat__='epytext'  
 11   
class RDRLearner:
    """
    Class for learning ripple-down rules from data using greedy set covering.
    """
17 - def learn(self, data_file, features_file, max_depth, cutoff, pos_symbol, 18 neg_symbol, min_length, max_length, regenerate):
19 """ 20 Learn RDR from data using greedy set covering and display the results. 21 22 @type data_file: string 23 @param data_file: file containing the training data. 24 @type features_file: string 25 @param features_file: file containing the features or "all" if all possible 26 substrings of given lengths are to be generated. 27 @type max_depth: integer 28 @param max_depth: maximum depth of rules. 29 @type cutoff: float 30 @param cutoff: cutoff value for cost of sets to be added to the covering. 31 @type pos_symbol: string 32 @param pos_symbol: symbol signifying positive example in data. 33 @type neg_symbol: string 34 @param neg_symbol: symbol signifying negative example in data. 35 @type min_length: integer 36 @param min_length: minimum length for substrings to be generated. 37 @type max_length: integer 38 @param max_length: maximum length for substrings to be generated. 39 @type regenerate: boolean 40 @param regenerate: whether to regenerate possible rules after 41 each time a rule is added to the ruleset 42 """ 43 44 data = get_data(data_file, pos_symbol, neg_symbol) 45 if len(data) == 0: 46 print "No data points found in "+data_file+"!" 47 sys.exit(2) 48 self.MAX_DEPTH = max_depth 49 self.COST_CUTOFF = cutoff 50 self.REGEN = regenerate 51 if features_file == "all": 52 features = gen_substrings(data, min_length, max_length) 53 else: 54 features = get_features(features_file) 55 if len(features) == 0: 56 print "No features found in "+features_file+"!" 57 sys.exit(2) 58 rules = self.find_rules(data, features, max_depth) 59 Ruleset.print_rule_tree(rules, 0, pos_symbol, neg_symbol)
60
61 - def check_data(self, data, depth):
62 """ 63 Check if any of the data points need to be classified at a particular depth. 64 65 @type data: dict 66 @param data: data points and their classifications. 67 @type depth: integer 68 @param depth: depth to look at. 69 @rtype: boolean 70 @return: True, if there exists a data point currently incorrectly classified 71 """ 72 if len(data) == 0: 73 return False 74 seeking = (-1) ** (self.MAX_DEPTH - depth + 1) 75 for value in data.values(): 76 if value == seeking: 77 return True 78 return False
79
80 - def count_errors(self, data, depth):
81 """ 82 Count the number of elements wrongly classified at a given depth. 83 84 @type data: dict 85 @param data: data points and their classifications. 86 @type depth: integer 87 @param depth: depth to look at. 88 @rtype: integer 89 @return: Number of wrongly classified elements. 90 """ 91 if len(data) == 0: 92 return 0 93 err = 0 94 seeking = (-1) ** (self.MAX_DEPTH - depth + 1) 95 for value in data.values(): 96 if value == seeking: 97 err += 1 98 return err
99
100 - def pos_data_for_depth(self, data, depth):
101 """ 102 Find data points to be classified at a particular depth. 103 104 @type data: dict 105 @param data: data points and their classifications. 106 @type depth: integer 107 @param depth: depth to look at. 108 @rtype: dict 109 @return: Data points to be classified at a particular depth. 110 """ 111 pos_data = {} 112 seeking = (-1) ** (self.MAX_DEPTH - depth + 1) 113 for (item, value) in data.items(): 114 if value == seeking: 115 pos_data[item]=value 116 return pos_data
117
118 - def data_for_rule(self, data, rule):
119 """ 120 Find data points covered by rule. 121 122 @type data: dict 123 @param data: data points to look from. 124 @type rule: L(Rule) 125 @rtype: dict 126 @return: Data points covered by the rule. 127 """ 128 new_data = {} 129 for (item, value) in data.items(): 130 if item.find(rule.name) != -1: 131 new_data[item] = value 132 return new_data
133
134 - def greedy_set_cover(self, data, pos_data, ruleset, possible_rules, 135 covered_data):
136 """ 137 Find greedy set covering. 138 139 @type data: dict 140 @param data: data points and their classifications. 141 @type pos_data: dict 142 @param pos_data: data points to be classified by the rules and their classifications. 143 @type ruleset: L(Ruleset) 144 @param ruleset: rules already found. 145 @type possible_rules: list 146 @param possible_rules: possible rules that can be used for classification. 147 @type covered_data: dict 148 @param covered_data: data points already covered by a rule. 149 @rtype: Ruleset 150 @return: Ruleset that was found. 151 """ 152 pos_length = float(len(dict_difference(pos_data, covered_data))) 153 best_rule = Rule() 154 while (len(possible_rules)>0): 155 i=0 156 for rule in possible_rules: 157 rule.additional_confidence = \ 158 float(len(dict_difference(rule.sub_pos_data,covered_data))) 159 if(rule.cost() < best_rule.cost()): 160 best_rule = deepcopy(rule) 161 best_rule_index = i 162 i+=1 163 164 #if cost per element for the set larger than the cutoff value 165 #then do not add it to the covering. For comparability with MDL 166 if (best_rule.name is not None and best_rule.cost() <= self.COST_CUTOFF): 167 ruleset.append(best_rule) 168 del possible_rules[best_rule_index] 169 dict_sum(covered_data,best_rule.sub_data) 170 datalen = float(len(dict_difference(data, covered_data))) 171 pos_length = float(len(dict_difference(pos_data, covered_data))) 172 173 ruleset.coverage += best_rule.additional_coverage 174 175 best_rule = Rule() 176 else: #no additional rule found 177 break 178 179 ruleset.error += pos_length 180 return ruleset
181
182 - def find_possible_rules(self, features, data, depth, covered_data, 183 pos_length):
184 """ 185 Find possible rules given a set of data points and features. 186 187 @type features: list 188 @type data: dict 189 @param data: data points and their classifications. 190 @type depth: integer 191 @param depth: maximum depth for exceptions. 192 @type covered_data: dict 193 @param covered_data: data points already covered by a rule. 194 @type pos_length: integer 195 @param pos_length: number of unclassified elements. 196 @rtype: (list of Rules, Rule, integer) 197 @return: Possible rules, best rule and its index. 198 """ 199 possible_rules = [] 200 i = 0 201 best_rule_index = None 202 best_rule = Rule() 203 feat_copy = deepcopy(features) 204 feat_copy.sort() 205 for element in feat_copy: 206 rule = Rule(element) 207 208 sub_data = self.data_for_rule(data, rule) 209 sub_pos_data = self.pos_data_for_depth(sub_data, depth) 210 211 rule.sub_data = sub_data 212 rule.sub_pos_data = sub_pos_data 213 rule.confidence = float(len(sub_pos_data)) 214 rule.coverage = float(len(sub_data)) 215 rule.error = rule.coverage - rule.confidence 216 rule.additional_confidence = \ 217 float(len(dict_difference(sub_pos_data,covered_data))) 218 rule.additional_coverage = \ 219 float(len(dict_difference(sub_data,covered_data))) 220 rule.length = len(element) 221 222 features.remove(element) 223 exceptions = self.find_rules(sub_data, features, depth - 1) 224 225 #if sublength <= self.C * rule.length + rule.lenEH: 226 rule.sub_data = sub_data 227 rule.set_exceptions(exceptions) 228 # rule.lenEH = (pos_length - rule.additional_confidence + exceptions.error) * datapoint_bits 229 features.append(element) 230 231 if rule.additional_confidence == 0.0: 232 continue 233 if rule.get_error() >= 50: 234 continue 235 236 possible_rules.append(rule) 237 #if depth == self.MAX_DEPTH -1: 238 # print rule.cost(), best_rule.cost() 239 if(rule.cost() < best_rule.cost()): 240 best_rule = deepcopy(rule) 241 best_rule_index = len(possible_rules) - 1 242 return (possible_rules, best_rule, best_rule_index)
243
    def find_rules(self, data, features, depth):
        """
        Find rules from data.

        Recursive entry point: at MAX_DEPTH it wraps everything in a
        "default" rule, below that it greedily adds rules (optionally
        regenerating candidates after each addition, per self.REGEN).

        @type data: dict
        @param data: data points and their classifications.
        @type features: list
        @param features: possible features for classification.
        @type depth: integer
        @param depth: maximum depth for exceptions for the rules to be found.
        @rtype: Ruleset
        @return: Ruleset that was found.
        """
        ruleset = Ruleset()
        if depth == self.MAX_DEPTH:
            # Top level: a catch-all "default" rule whose exceptions carry
            # the real classification work.
            rule = Rule("default")
            rule.set_exceptions(self.find_rules(data, features, depth - 1))
            pos_data = self.pos_data_for_depth(data, depth)
            rule.confidence = float(len(pos_data))
            rule.coverage = float(len(data))
            ruleset.append(rule)
            return ruleset
        if(self.check_data(data, depth)==False):
            #return empty ruleset
            return ruleset
        if(depth < 0):
            # Recursion floor: no further exceptions allowed; just report
            # the remaining misclassifications as error.
            ruleset.length = 1.0
            ruleset.error = self.count_errors(data, depth)
            return ruleset
        covered_data={}
        best_rule = Rule()
        pos_data = self.pos_data_for_depth(data, depth)
        pos_length = float(len(pos_data))
        data_length = float(len(data))

        feat_copy = deepcopy(features)
        # Greedy loop: regenerate candidate rules each round and take the
        # cheapest one, until nothing within the cost cutoff remains.
        while (pos_length>0):
            (possible_rules, best_rule, best_rule_index) = \
                self.find_possible_rules(feat_copy, data, depth,
                                         covered_data, pos_length)

            if (best_rule.name is not None and
                best_rule.cost() <= self.COST_CUTOFF):
                del possible_rules[best_rule_index]
                ruleset.append(best_rule)

                dict_sum(covered_data,best_rule.sub_data)
                data_length = float(len(dict_difference(data, covered_data)))
                pos_length = float(len(dict_difference(pos_data, covered_data)))
                ruleset.coverage += best_rule.additional_coverage
                best_rule = Rule()

            else: #no additional rule found
                break

        # NOTE(review): if the while loop body never ran, possible_rules is
        # unbound here; presumably unreachable because check_data() above
        # guarantees pos_length > 0 -- confirm.
        if self.REGEN is False:
            # Non-regenerating mode: finish the covering from the candidate
            # list computed in the last round.
            return self.greedy_set_cover(data, pos_data, ruleset, possible_rules, covered_data)

        ruleset.error += pos_length
        return ruleset


if __name__ == "__main__":
    # Parse command-line options (second argument False: presumably selects
    # this learner's mode -- confirm against utilities.parse_arguments)
    # and run the learner with them.
    arguments = parse_arguments(sys.argv, False)
    s = RDRLearner()
    s.learn(*arguments)