Module learnrdr
[hide private]
[frames] | [no frames]

Source Code for Module learnrdr

  1  #!/usr/bin/env python 
  2  import math 
  3  import sys 
  4  from copy import copy, deepcopy 
  5  from time import time 
  6  from ruleset import * 
  7  from rule import * 
  8  from utilities import * 
  9  from rules_adhoc import RDRLearner 
 10   
 11  __docformat__='epytext'  
 12   
class RDRLearnerMDL (RDRLearner):
    """
    Class for learning ripple-down rules from data using the minimum
    description length (MDL) principle.

    Extends L{RDRLearner} with MDL-based rule selection: candidate rules are
    scored by the number of bits needed to encode the exceptions they leave
    unexplained (C{lenEH}), weighted by the parameter C{c}.
    """
19 - def learn(self, data_file, features_file, max_depth, c, pos_symbol, 20 neg_symbol, min_length, max_length, regenerate):
21 """ 22 Learn RDR from data using MDL and display the results. 23 24 @type data_file: string 25 @param data_file: file containing the training data. 26 @type features_file: string 27 @param features_file: file containing the features or "all" if all possible 28 substrings of given lengths are to be generated. 29 @type max_depth: integer 30 @param max_depth: maximum depth of rules. 31 @type c: float 32 @param c: the value of parameter c in MDL. 33 @type pos_symbol: string 34 @param pos_symbol: symbol signifying positive example in data. 35 @type neg_symbol: string 36 @param neg_symbol: symbol signifying negative example in data. 37 @type min_length: integer 38 @param min_length: minimum length for substrings to be generated. 39 @type max_length: integer 40 @param max_length: maximum length for substrings to be generated. 41 @type regenerate: boolean 42 @param regenerate: whether to regenerate possible rules after 43 each time a rule is added to the ruleset 44 """ 45 46 data = get_data(data_file, pos_symbol, neg_symbol) 47 if len(data) == 0: 48 print "No data points found in "+data_file+"!" 49 sys.exit(2) 50 self.MAX_DEPTH = max_depth 51 self.C = c 52 self.REGEN = regenerate 53 if features_file == "all": 54 features = gen_features(data, min_length, max_length) 55 else: 56 features = get_features(features_file) 57 if len(features) == 0: 58 print "No features found in "+features_file+"!" 59 sys.exit(2) 60 rules = self.find_rules(data, features, max_depth) 61 Ruleset.print_rule_tree(rules, 0, pos_symbol, neg_symbol)
62
63 - def greedy_set_cover_mdl(self, data, pos_data, ruleset, possible_rules, 64 datapoint_bits, covered_data):
65 """ 66 Find greedy set cover using MDL. 67 68 @type data: dict 69 @param data: data points and their classifications. 70 @type pos_data: dict 71 @param pos_data: data points to be classified by the rules and their classifications. 72 @type ruleset: L(Ruleset) 73 @param ruleset: rules already found. 74 @type possible_rules: list 75 @param possible_rules: possible rules that can be used for classification. 76 @type datapoint_bits: float 77 @param datapoint_bits: bits needed to encode one data point. 78 @type covered_data: dict 79 @param covered_data: data points already covered by a rule. 80 @rtype: Ruleset 81 @return: Ruleset that was found. 82 """ 83 pos_length = float(len(dict_difference(pos_data, covered_data))) 84 best_rule = Rule() 85 best_rule.lenEH = pos_length * self.C 86 while (len(possible_rules)>0): 87 i=0 88 for rule in possible_rules: 89 rule.additional_confidence = \ 90 float(len(dict_difference(rule.sub_pos_data,covered_data))) 91 rule.lenEH = (pos_length - rule.additional_confidence + \ 92 rule.error) * datapoint_bits 93 if(rule.shorter_than(best_rule, self.C)): 94 best_rule = deepcopy(rule) 95 best_rule_index = i 96 i+=1 97 98 if (best_rule.name is not None and ruleset.improved_by(best_rule, datapoint_bits, self.C)): 99 ruleset.append(best_rule) 100 del possible_rules[best_rule_index] 101 dict_sum(covered_data,best_rule.sub_data) 102 datalen = float(len(dict_difference(data, covered_data))) 103 pos_length = float(len(dict_difference(pos_data, covered_data))) 104 105 ruleset.lenEH = (ruleset.error + pos_length) * datapoint_bits 106 ruleset.coverage += best_rule.additional_coverage 107 108 best_rule = Rule() 109 best_rule.lenEH = pos_length * datapoint_bits 110 111 else: #no additional rule found 112 break 113 114 ruleset.error += pos_length 115 return ruleset
116
    def find_possible_rules(self, features, data, depth, covered_data,
                            pos_length, datapoint_bits):
        """
        Find possible rules given a set of data points and features.

        Builds one candidate L{Rule} per feature, scores it with MDL, and
        recursively searches for exception rules for it.

        @type features: list
        @param features: candidate features; temporarily mutated (the current
            feature is removed around the recursive call) but restored before
            returning.
        @type data: dict
        @param data: data points and their classifications.
        @type depth: integer
        @param depth: maximum depth for exceptions.
        @type covered_data: dict
        @param covered_data: data points already covered by a rule.
        @type pos_length: integer
        @param pos_length: number of unclassified elements.
        @type datapoint_bits: float
        @param datapoint_bits: bits needed to encode one data point.
        @rtype: (list of Rules, Rule, integer)
        @return: Possible rules, best rule and its index.
        """
        possible_rules = []
        # NOTE(review): i is assigned but never read in this method
        # (the index is derived from len(possible_rules) instead).
        i = 0
        best_rule_index = None
        best_rule = Rule()
        # Iterate over a sorted copy so the mutation of `features` below
        # cannot disturb the iteration order.
        feat_copy = deepcopy(features)
        feat_copy.sort()
        for element in feat_copy:
            rule = Rule(element)

            sub_data = self.data_for_rule(data, rule)
            sub_pos_data = self.pos_data_for_depth(sub_data, depth)

            rule.sub_data = sub_data
            rule.sub_pos_data = sub_pos_data
            rule.confidence = float(len(sub_pos_data))
            rule.coverage = float(len(sub_data))
            rule.error = rule.coverage - rule.confidence
            rule.additional_confidence = \
                float(len(dict_difference(sub_pos_data,covered_data)))
            rule.additional_coverage = \
                float(len(dict_difference(sub_data,covered_data)))
            rule.length = len(element)
            # Code length if the rule is used without exceptions: remaining
            # positives minus what this rule newly explains, plus its errors.
            rule.lenEH = (pos_length - rule.additional_confidence + \
                (rule.coverage - rule.confidence)) * datapoint_bits

            # Exclude the current feature from its own exception search,
            # then restore it afterwards.
            features.remove(element)
            exceptions = self.find_rules(sub_data, features, depth - 1)
            # Code length if the rule is used WITH its exception subtree.
            sublength = (len(element) + 1 + exceptions.length) * self.C + \
                (pos_length - rule.additional_confidence + exceptions.error) * datapoint_bits

            if sublength <= self.C * rule.length + rule.lenEH:
                # The exception subtree pays for itself: attach it.
                rule.sub_data = sub_data
                rule.set_exceptions(exceptions)
                rule.lenEH = (pos_length - rule.additional_confidence + exceptions.error) * datapoint_bits
            features.append(element)

            # Skip rules that explain nothing new.
            if rule.additional_confidence == 0.0:
                continue
            # NOTE(review): 50 looks like an error-rate threshold (percent?)
            # -- confirm against Rule.get_error().
            if rule.get_error() >= 50:
                continue

            possible_rules.append(rule)
            if(rule.shorter_than(best_rule, self.C)):
                best_rule = deepcopy(rule)
                best_rule_index = len(possible_rules) - 1
        return (possible_rules, best_rule, best_rule_index)
183 - def find_rules(self, data, features, depth):
184 """ 185 Find rules from data. 186 187 @type data: dict 188 @param data: data points and their classifications. 189 @type features: list 190 @param features: possible features for classification 191 @type depth: integer 192 @param depth: maximum depth for exceptions for the rules to be found. 193 @rtype: Ruleset 194 @return: Ruleset that was found. 195 """ 196 ruleset = Ruleset() 197 if depth == self.MAX_DEPTH: 198 rule = Rule("default") 199 rule.set_exceptions(self.find_rules(data, features, depth - 1)) 200 pos_data = self.pos_data_for_depth(data, depth) 201 rule.confidence = float(len(pos_data)) 202 rule.coverage = float(len(data)) 203 ruleset.append(rule) 204 return ruleset 205 if(self.check_data(data, depth)==False): 206 #return empty ruleset 207 return ruleset 208 if(depth < 0): 209 ruleset.length = 1.0 210 ruleset.error = self.count_errors(data, depth) 211 return ruleset 212 covered_data={} 213 best_rule = Rule() 214 pos_data = self.pos_data_for_depth(data, depth) 215 pos_length = float(len(pos_data)) 216 data_length = float(len(data)) 217 218 feat_copy = deepcopy(features) 219 220 datapoint_bits = log2(data_length) 221 ruleset.lenEH = pos_length * datapoint_bits 222 best_rule.lenEH = pos_length * datapoint_bits 223 224 while (pos_length>0): 225 (possible_rules, best_rule, best_rule_index) = \ 226 self.find_possible_rules(feat_copy, data, depth, 227 covered_data, pos_length, datapoint_bits) 228 229 if (best_rule.name is not None and 230 ruleset.improved_by(best_rule, datapoint_bits, self.C)): 231 del possible_rules[best_rule_index] 232 ruleset.append(best_rule) 233 234 dict_sum(covered_data,best_rule.sub_data) 235 data_length = float(len(dict_difference(data, covered_data))) 236 pos_length = float(len(dict_difference(pos_data, covered_data))) 237 ruleset.lenEH = (ruleset.error + pos_length) * datapoint_bits 238 ruleset.coverage += best_rule.additional_coverage 239 best_rule = Rule() 240 best_rule.lenEH = pos_length 241 else: #no additional rule found 242 
break 243 244 if self.REGEN is False: 245 return self.greedy_set_cover_mdl(data, pos_data, ruleset, possible_rules, datapoint_bits, covered_data) 246 247 ruleset.error += pos_length 248 return ruleset
# Script entry point: parse the command line, then run the MDL learner.
if __name__ == "__main__":
    cli_args = parse_arguments(sys.argv)
    learner = RDRLearnerMDL()
    learner.learn(*cli_args)