Module utilities
[hide private]
[frames | no frames]

Source Code for Module utilities

  1  import math 
  2  import sys 
  3  from getopt import getopt, GetoptError 
  4   
  5  __docformat__='epytext' 
  6   
def get_classifier(depth, pos_symbol, neg_symbol):
    """
    Pick the class symbol predicted by a rule at the given depth.

    Rules at even depths (0, 2, ...) yield the negative symbol, rules at
    odd depths yield the positive symbol.

    @type depth: integer
    @param depth: Depth of the rule to classify.
    @type pos_symbol: string
    @param pos_symbol: Symbol signifying positive example in data.
    @type neg_symbol: string
    @param neg_symbol: Symbol signifying negative example in data.
    @rtype: string
    @return: Correct symbol for rule at given depth.
    """
    depth_is_odd = depth % 2 != 0
    if depth_is_odd:
        return pos_symbol
    return neg_symbol
24
def dict_difference(dict1, dict2):
    """
    Difference of two dictionaries.

    Neither argument is modified; the result is always a new dictionary.

    @type dict1: dict
    @param dict1: Dictionary whose entries are filtered.
    @type dict2: dict
    @param dict2: Dictionary whose keys are excluded from the result.
    @rtype: dict
    @return: data points with their classifications that are contained
        in dict1, but not in dict2 (values are taken from dict1).
    """
    # The ``in`` operator replaces the deprecated dict.has_key(), and the
    # result no longer lives in a local that shadows the builtin ``dict``.
    return dict((key, value) for (key, value) in dict1.items()
                if key not in dict2)
40
def dict_sum(dict1, dict2):
    """
    Sum (union) of two dictionaries.

    dict1 is updated in place: every key of dict2 that is missing from
    dict1 is copied over with its value. Keys already present in dict1
    keep their dict1 value. dict2 is not modified.

    @type dict1: dict
    @param dict1: Dictionary that receives the missing entries.
    @type dict2: dict
    @param dict2: Dictionary supplying the additional entries.
    @rtype: dict
    @return: dict1 itself, now holding the data points with their
        classifications that are contained in at least one of the
        dictionaries.
    """
    for (key, value) in dict2.items():
        # setdefault only stores the value when the key is absent, so
        # existing classifications in dict1 are never overwritten.
        dict1.setdefault(key, value)
    # Return the merged dictionary as the documented @return promises;
    # callers that rely only on the in-place update are unaffected.
    return dict1
54
def open_file(file):
    """
    Open a file for reading in text mode.

    If the file cannot be opened, a short message naming the file is
    printed and the original IOError is re-raised to the caller.

    @type file: string
    @param file: Name of file to open.
    @raise IOError: if file cannot be opened.
    @rtype: file handle
    @return: The opened file; the caller is responsible for closing it.
    """
    try:
        handle = open(file, 'r')
    except IOError:
        # Report which file failed, then let the error propagate unchanged.
        print(' '.join(('cannot open', file)))
        raise
    return handle
72
def get_data(file, pos_symbol, neg_symbol):
    """
    Read data from a file.

    Each non-blank line must consist of exactly two whitespace-separated
    fields: a data point and its class symbol. Lines whose symbol is
    neither pos_symbol nor neg_symbol are ignored. If a data point occurs
    more than once, its last recognised classification wins.

    @type file: string
    @param file: Name of file to read data from.
    @type pos_symbol: string
    @param pos_symbol: Symbol signifying positive example in data.
    @type neg_symbol: string
    @param neg_symbol: Symbol signifying negative example in data.
    @raise IOError: if file cannot be opened.
    @raise ValueError: if a non-blank line does not have exactly two fields.
    @rtype: dict
    @return: data points with their classifications (1 for positive,
        -1 for negative).
    """
    f = open_file(file)
    data = {}
    # try/finally guarantees the handle is closed even when a malformed
    # line makes the unpacking below raise.
    try:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) hold no data
                # point; skip them instead of failing on the unpack.
                continue
            (point, symbol) = fields
            if symbol == pos_symbol:
                data[point] = 1
            elif symbol == neg_symbol:
                data[point] = -1
    finally:
        f.close()
    return data
96 97
def get_features(file):
    """
    Read features from a file.

    Every line of the file becomes one feature, with leading and trailing
    whitespace stripped. Line order is preserved.

    @type file: string
    @param file: Name of file to read features from.
    @raise IOError: if file cannot be opened.
    @rtype: list
    @return: features.
    """
    f = open_file(file)
    # try/finally closes the handle even if reading the file raises.
    try:
        return [line.strip() for line in f]
    finally:
        f.close()
113
def gen_substrings(data, min_len, max_len):
    """
    Generate all substrings of given lengths found in data.

    Prints the number of distinct substrings found before returning.

    @type data: dictionary
    @param data: Examples with their classifications; only the keys (the
        example strings) are used.
    @type min_len: integer
    @param min_len: Minimum length for substrings to be generated.
    @type max_len: integer
    @param max_len: Maximum length for substrings to be generated.
    @rtype: list
    @return: distinct substrings of given lengths, sorted.
    """
    features = set()
    for item in data:
        length = len(item)
        for start in range(length - min_len + 1):
            # Clamp the end index to the string length: slicing past the
            # end would only reproduce item[start:length], which is
            # already generated, so the extra iterations are pure waste.
            last_end = min(start + max_len, length)
            for end in range(start + min_len, last_end + 1):
                features.add(item[start:end])
    print("Number of features: %d" % len(features))
    return sorted(features)
140
def log2(x):
    """
    Binary (base 2) logarithm of x.

    @type x: number
    @param x: Value whose logarithm is taken.
    @rtype: number
    @return: Logarithm of x to base 2, as computed by math.log.
    """
    base = 2
    return math.log(x, base)
148
def signum(x):
    """
    Two-valued sign of x.

    Note that zero is mapped to 1, so the result is never 0.

    @type x: number
    @param x: Value to take the sign of.
    @rtype: number
    @return: 1 if x >= 0, otherwise -1.
    """
    sign = -1
    if x >= 0:
        sign = 1
    return sign
159
def parse_arguments(argv, mdl=True):
    """
    Parse command line arguments.

    On any parsing error the error and the usage text are printed and the
    program exits via parse_error. With -h/--help the usage text is
    printed and the program exits normally.

    @type argv: L{list}
    @param argv: Command line arguments; argv[0] (the program name) is
        skipped.
    @type mdl: boolean
    @param mdl: True, if the arguments are for the MDL-using program.
        Selects the default of parameter c (2 if True, 1000000 otherwise)
        and the usage text that is shown.
    @rtype: 9-L{tuple}
    @return: parameters' values, in the order (data_file, features_file,
        max_depth, c, pos_symbol, neg_symbol, min_length, max_length,
        regenerate).
    """
    try:
        opts, args = getopt(argv[1:], "hf:d:p:n:l:x:rc:",
                            ["help", "features=", "maxdepth=", "possymbol=",
                             "negsymbol=", "minlength=", "maxlength=",
                             "regenerate"])
    except GetoptError as err:
        parse_error(err, mdl)

    # Defaults, overridden below by whatever options were given.
    features_file = "all"
    max_depth = 2
    if mdl:
        c = 2
    else:
        c = 1000000
    pos_symbol = "+"
    neg_symbol = "-"
    min_length = 1
    max_length = 6
    regenerate = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            # usage() takes only the mdl flag; no error object exists on
            # this path, so nothing else may be passed.
            usage(mdl)
            sys.exit()
        elif opt in ("-f", "--features"):
            features_file = arg
        elif opt in ("-d", "--maxdepth"):
            try:
                max_depth = int(arg)
            except ValueError:
                parse_error("d should be integer!", mdl)
        elif opt in ("-p", "--possymbol"):
            pos_symbol = arg
        elif opt in ("-n", "--negsymbol"):
            neg_symbol = arg
        elif opt in ("-l", "--minlength"):
            try:
                min_length = int(arg)
            except ValueError:
                parse_error("minlength should be integer!", mdl)
        elif opt in ("-x", "--maxlength"):
            try:
                max_length = int(arg)
            except ValueError:
                parse_error("maxlength should be integer!", mdl)
        elif opt in ("-r", "--regenerate"):
            regenerate = True
        elif opt == "-c":
            try:
                c = float(arg)
            except ValueError:
                parse_error("c should be a number!", mdl)

    # The data file is the first (mandatory) positional argument.
    try:
        data_file = args[0]
    except IndexError:
        parse_error("datafile not specified!", mdl)
    return (data_file, features_file, max_depth, c, pos_symbol, neg_symbol,
            min_length, max_length, regenerate)
228
def usage(mdl):
    """
    Print program usage information.

    The usage line and the description of the -c parameter depend on
    which program the information is printed for.

    @type mdl: boolean
    @param mdl: True, if the info is about the MDL-using program.
    """
    if mdl:
        print("Usage: learnrdr.py [parameters] datafile")
    else:
        print("Usage: learnrdr_adhoc.py [parameters] datafile")
    print("""
datafile is a text file with a data point followed by whitespace and a class symbol (POS or NEG) on one line.

Parameters:
    -h, --help
        Show this help.

    -f FEATURES, --features=FEATURES
        Load the set of features from a text file FEATURES with each feature on a separate line.
        If FEATURES = "all" then uses all possible substrings of lengths MINLENGTH to MAXLENGTH as features.
        Default is "all"

    -l MINLENGTH, --minlength=MINLENGTH
        Set minimum length for features to MINLENGTH.
        Ignored if features are loaded from a file.
        Default is 1.

    -x MAXLENGTH, --maxlength=MAXLENGTH
        Set maximum length for features to MAXLENGTH.
        Ignored if features are loaded from a file.
        Default is 6.

    -d DEPTH, --maxdepth=DEPTH
        Set maximum rule depth to DEPTH
        Default is 2.

    -p POS, --possymbol=POS
        Set the symbol for positive data points to POS, used in reading data from file and displaying rules.
        Default is "+".

    -n NEG, --negsymbol=NEG
        Set the symbol for negative data points to NEG, used in reading data from file and displaying rules.
        Default is "-".

    -r, --regenerate
        Regenerate possible rules after each time a rule is added to the ruleset.
""")
    if mdl:
        print("""    -c VALUE
        Set the value of parameter c in MDL to VALUE.
        Default is 2 (An alphabet of A, C, G, T consists of 4 letters, it takes log2(4)=2 bits to encode each letter).
""")
    else:
        print("""    -c VALUE
        Set the value cutoff cost to VALUE.
        Default is 1000000.
""")
288 -def parse_error(err, mdl):
289 """ 290 Print given parsing error, usage of program and exit. 291 292 @type mdl: boolean 293 @param mdl: True, if the error is generated by the MDL-using program. 294 """ 295 print str(err) 296 usage(mdl) 297 sys.exit(2)
298