1 import math
2 import sys
3 from getopt import getopt, GetoptError
4
5 __docformat__='epytext'
6
8 """
9 Find the symbol for the classifier at given depth.
10
11 @type depth: integer
12 @param depth: Depth of rules to classify.
13 @type pos_symbol: string
14 @param pos_symbol: Symbol signifying positive example in data.
15 @type neg_symbol: string
16 @param neg_symbol: Symbol signifying negative example in data.
17 @rtype: string
18 @return: Correct symbol for rule at given depth.
19 """
20 if depth % 2 == 0:
21 return neg_symbol
22 else:
23 return pos_symbol
24
26 """
27 Difference of two dictionaries.
28
29 @type dict1: dict
30 @type dict2: dict
31 @rtype: dict
32 @return: data points with their classifications that are contained
33 in dict1, but not in dict2.
34 """
35 dict = {}
36 for key in dict1.keys():
37 if not dict2.has_key(key):
38 dict[key] = dict1[key]
39 return dict
40
42 """
43 Sum of two dictionaries.
44
45 @type dict1: dict
46 @type dict2: dict
47 @rtype: dict
48 @return: data points with their classifications that are contained
49 in at least one of the dictionaries.
50 """
51 for (key,value) in dict2.items():
52 if not dict1.has_key(key):
53 dict1[key] = value
54
def open_file(file):
    """
    Open a file for reading.

    On failure a short 'cannot open <file>' message is printed to standard
    output and the original exception is re-raised.

    @type file: string
    @param file: Name of file to open.
    @raise IOError: if file cannot be opened.
    @rtype: file handle
    @return: the file opened in text read mode; the caller must close it.
    """
    try:
        return open(file, 'r')
    except IOError:
        # Single-argument print with %-formatting produces the same text
        # under both the print statement and the print function.
        print('cannot open %s' % (file,))
        raise
72
def get_data(file, pos_symbol, neg_symbol):
    """
    Read data from a file.

    Every non-blank line is split on whitespace into a data point and a
    class symbol. Points labelled pos_symbol are stored with value 1,
    points labelled neg_symbol with value -1; lines carrying any other
    symbol are ignored. Blank lines are skipped.

    @type file: string
    @param file: Name of file to read data from.
    @type pos_symbol: string
    @param pos_symbol: Symbol signifying positive example in data.
    @type neg_symbol: string
    @param neg_symbol: Symbol signifying negative example in data.
    @raise ValueError: if a non-blank line does not consist of exactly
        two whitespace-separated fields.
    @rtype: dict
    @return: data points with their classifications.
    """
    f = open_file(file)
    data = {}
    try:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate empty lines (e.g. a trailing newline at EOF).
                continue
            (point, symbol) = fields
            if symbol == pos_symbol:
                data[point] = 1
            elif symbol == neg_symbol:
                data[point] = -1
    finally:
        # Release the handle even when a malformed line aborts parsing.
        f.close()
    return data
96
97
99 """
100 Read features from a file.
101
102 @type file: string
103 @param file: Name of file to read features from.
104 @rtype: list
105 @return: features.
106 """
107 f = open_file(file)
108 features = []
109 for line in f:
110 features.append(line.strip())
111 f.close()
112 return features
113
115 """
116 Generate all substrings of given lengths found in data.
117
118 @type data: dictionary
119 @param data: Examples with their classifications.
120 @type min_len: integer
121 @param min_len: Minimum length for substrings to be generated.
122 @type max_len: integer
123 @param max_len: Maximum length for substrings to be generated.
124 @rtype: list
125 @return: substrings of given lengths.
126 """
127
128 features = set()
129 for (item,value) in data.items():
130 length = len(item)
131 for i in range(length-min_len+1):
132 for j in range(i+min_len,i+max_len+1):
133
134
135 features.add(item[i:j])
136 ft = list(features)
137 ft.sort()
138 print "Number of features:", len(features)
139 return ft
140
142 """
143 Logarithm base 2.
144
145 @rtype: number
146 """
147 return math.log(x,2)
148
150 """
151 Signum.
152
153 @rtype: number
154 """
155 if x >= 0:
156 return 1
157 else:
158 return -1
159
161 """
162 Parse command line arguments.
163
164 @type argv: L{list}
165 @param argv: Command line arguments.
166 @type mdl: boolean
167 @param mdl: True, if the arguments are for the MDL-using program.
168 @rtype: 9-L{tuple}
169 @return: parameters' values
170 """
171 argv = argv[1:]
172 try:
173 opts, args = getopt(argv, "hf:d:p:n:l:x:rc:",
174 ["help", "features=", "maxdepth=", "possymbol=", "negsymbol=",
175 "minlength=", "maxlength=", "regenerate"])
176 except GetoptError, err:
177 parse_error(err, mdl)
178
179 features_file = "all"
180 max_depth = 2
181 if mdl:
182 c = 2
183 else:
184 c = 1000000
185 pos_symbol = "+"
186 neg_symbol = "-"
187 min_length = 1
188 max_length = 6
189 regenerate = False
190
191 for opt, arg in opts:
192 if opt in ("-h", "--help"):
193 usage(mdl, err)
194 sys.exit()
195 if opt in ("-f", "--features"):
196 features_file = arg
197 if opt in ("-d", "--maxdepth"):
198 try:
199 max_depth = int(arg)
200 except ValueError:
201 parse_error("d should be integer!", mdl)
202 if opt in ("-p", "--possymbol"):
203 pos_symbol = arg
204 if opt in ("-n", "--negsymbol"):
205 neg_symbol = arg
206 if opt in ("-l", "--minlength"):
207 try:
208 min_length = int(arg)
209 except ValueError:
210 parse_error("minlength should be integer!", mdl)
211 if opt in ("-x", "--maxlength"):
212 try:
213 max_length = int(arg)
214 except ValueError:
215 parse_error("maxlength should be integer!", mdl)
216 if opt in ("-r", "--regenerate"):
217 regenerate = True
218 if opt == "-c":
219 try:
220 c = float(arg)
221 except ValueError:
222 parse_error("c should be a number!", mdl)
223 try:
224 data_file = args[0]
225 except Exception:
226 parse_error("datafile not specified!", mdl)
227 return (data_file, features_file, max_depth, c, pos_symbol, neg_symbol, min_length, max_length, regenerate)
228
def usage(mdl):
    """
    Print program usage information.

    Writes the usage line, the description of all command line parameters
    and the program-specific description of the -c parameter to standard
    output.

    @type mdl: boolean
    @param mdl: True, if the info is about the MDL-using program.
    """
    # Every print below takes a single string argument so the output is the
    # same under the print statement and the print function.
    if mdl:
        print("Usage: learnrdr.py [parameters] datafile")
    else:
        print("Usage: learnrdr_adhoc.py [parameters] datafile")
    print("""
datafile is a text file with a data point followed by whitespace and a class symbol (POS or NEG) on one line.

Parameters:
    -h, --help
        Show this help.

    -f FEATURES, --features=FEATURES
        Load the set of features from a text file FEATURES with each feature on a separate line.
        If FEATURES = "all" then uses all possible substrings of lengths MINLENGTH to MAXLENGTH as features.
        Default is "all"

    -l MINLENGTH, --minlength=MINLENGTH
        Set minimum length for features to MINLENGTH.
        Ignored if features are loaded from a file.
        Default is 1.

    -x MAXLENGTH, --maxlength=MAXLENGTH
        Set maximum length for features to MAXLENGTH.
        Ignored if features are loaded from a file.
        Default is 6.

    -d DEPTH, --maxdepth=DEPTH
        Set maximum rule depth to DEPTH
        Default is 2.

    -p POS, --possymbol=POS
        Set the symbol for positive data points to POS, used in reading data from file and displaying rules.
        Default is "+".

    -n NEG, --negsymbol=NEG
        Set the symbol for negative data points to NEG, used in reading data from file and displaying rules.
        Default is "-".

    -r, --regenerate
        Regenerate possible rules after each time a rule is added to the ruleset.
""")
    # The meaning and default of -c differ between the two programs.
    if mdl:
        print("""    -c VALUE
        Set the value of parameter c in MDL to VALUE.
        Default is 2 (An alphabet of A, C, G, T consists of 4 letters, it takes log2(4)=2 bits to encode each letter).
""")
    else:
        print("""    -c VALUE
        Set the value cutoff cost to VALUE.
        Default is 1000000.
""")
287
def parse_error(err, mdl):
    """
    Print given parsing error, usage of program and exit.

    The process is terminated with exit status 2; this function does not
    return.

    @type err: string or exception
    @param err: The error to report; it is converted with str() before
        printing.
    @type mdl: boolean
    @param mdl: True, if the error is generated by the MDL-using program.
    """
    # Single-argument print: same output under Python 2 and Python 3.
    print(str(err))
    usage(mdl)
    sys.exit(2)
298