1 import sys
2 import random
3 from utilities import open_file
4
__docformat__='epytext'


def read_sequences(lines, skip=0):
    """
    Collect the distinct sequences found in FASTA-style text.

    A line starting with ">" opens a new record. The stripped lines that
    follow it, up to the next ">" line or the end of the input, are joined
    into that record's sequence. Lines before the first ">" are ignored.

    @param lines: iterable of text lines (for example an open file).
    @param skip: number of leading characters dropped from every sequence.
    @return: set of sequence strings; duplicate sequences collapse.
    """
    sequences = set()
    parts = None    # None until the first ">" header has been seen
    for line in lines:
        if line.startswith(">"):
            if parts is not None:
                sequences.add("".join(parts)[skip:])
            parts = []
        elif parts is not None:
            parts.append(line.strip())
    # The final record has no following header, so it must be flushed here;
    # otherwise the last sequence of every file would be lost.
    if parts is not None:
        sequences.add("".join(parts)[skip:])
    return sequences


def _load_sequences(path, skip=0):
    """
    Read the distinct sequences stored in the file at C{path}.

    The file is opened with C{open_file} and always closed again, even when
    reading fails.

    @param path: location of the sequence file.
    @param skip: number of leading characters dropped from every sequence.
    @return: set of sequence strings.
    """
    infile = open_file(path)
    try:
        return read_sequences(infile, skip)
    finally:
        infile.close()


def convert_data(argv=None):
    """
    Converts the data from upstream and downstream files to a format that can
    be used by learnrdr.

    File retrieve-seq.2009_04_20.154301.res contains upstream sequences and
    retrieve-seq.2009_04_20.154355.res downstream sequences.

    A command line argument (N) specifies the size of data files to create.
    Files upstreamN.txt, downstreamN.txt and randomN.txt are created.

    For each, N/2 sequences are randomly chosen from
    retrieve-seq.2009_04_20.154301.res and N/2 from
    retrieve-seq.2009_04_20.154355.res with sequences from
    retrieve-seq.2009_04_20.154301.res labelled "1" in upstreamN.txt and "-1"
    in downstreamN.txt and vice versa for retrieve-seq.2009_04_20.154355.res.
    The sequences in randomN.txt are labelled "1" or "-1" randomly.

    N/2 is rounded down, so an odd N yields N-1 lines per file.

    @param argv: argument list to use instead of C{sys.argv}; N is read from
        position 1.
    @raise SystemExit: if N is missing, is not an integer or is negative.
    @raise ValueError: if N/2 exceeds the number of distinct sequences
        available in either input file.
    """
    if argv is None:
        argv = sys.argv
    try:
        size_arg = argv[1]
        count = int(size_arg)
    except (IndexError, ValueError):
        # IndexError: no argument given; ValueError: argument is not a number.
        print("Enter N")
        sys.exit(1)
    if count < 0:
        print("Enter N")
        sys.exit(1)
    half = count // 2

    updata = _load_sequences("../data/retrieve-seq.2009_04_20.154301.res")
    # Downstream sequences lose their first three characters, as in the
    # original script (presumably a leading codon -- TODO confirm).
    downdata = _load_sequences("../data/retrieve-seq.2009_04_20.154355.res",
                               skip=3)

    # Sample before touching the output files so that a failure here does not
    # leave truncated files behind.
    up_sample = random.sample(list(updata), half)
    down_sample = random.sample(list(downdata), half)

    # The output names embed N exactly as it was typed on the command line.
    with open("../data/upstream" + size_arg + ".txt", "w") as out_upstream, \
            open("../data/downstream" + size_arg + ".txt", "w") as out_downstream, \
            open("../data/random" + size_arg + ".txt", "w") as out_random:
        for up_seq, down_seq in zip(up_sample, down_sample):
            out_upstream.write(up_seq + " 1 \n")
            out_upstream.write(down_seq + " -1 \n")
            out_downstream.write(up_seq + " -1 \n")
            out_downstream.write(down_seq + " 1 \n")
            out_random.write(up_seq + " " + str(random.choice([1, -1])) + " \n")
            out_random.write(down_seq + " " + str(random.choice([1, -1])) + " \n")


if __name__ == "__main__":
    convert_data()
80