Module convert_data
[hide private]
[frames | no frames]

Source Code for Module convert_data

 1  import sys 
 2  import random 
 3  from utilities import open_file 
 4   
 5  __docformat__='epytext' 
 6   
def _read_sequences(path, skip=0):
    """
    Collect the distinct sequences stored in a ">"-delimited sequence file.

    Every line starting with ">" opens a new record; the other lines of a
    record are stripped and concatenated into a single sequence string.
    Lines that appear before the first ">" header are ignored.

    @param path: path of the file to read; it is opened with C{open_file}.
    @param skip: number of leading characters to drop from every sequence.
    @return: the set of sequence strings found in the file.
    """
    sequences = set()
    infile = open_file(path)
    try:
        parts = None  # None until the first header has been seen
        for line in infile:
            if line.startswith(">"):
                if parts is not None:
                    sequences.add("".join(parts)[skip:])
                parts = []
            elif parts is not None:
                parts.append(line.strip())
        # The final record has no following header to flush it.
        if parts is not None:
            sequences.add("".join(parts)[skip:])
    finally:
        infile.close()
    return sequences


def _write_lines(path, lines):
    """Write the given lines to C{path}, replacing any existing file."""
    with open(path, "w") as outfile:
        outfile.writelines(lines)


def convert_data():
    """
    Converts the data from upstream and downstream files to a format that can
    be used by learnrdr.

    File retrieve-seq.2009_04_20.154301.res contains upstream sequences and
    retrieve-seq.2009_04_20.154355.res downstream sequences.

    A command line argument (N) specifies the size of data files to create.
    Files upstreamN.txt, downstreamN.txt and randomN.txt are created in
    ../data.

    N/2 distinct sequences are randomly chosen from
    retrieve-seq.2009_04_20.154301.res and N/2 from
    retrieve-seq.2009_04_20.154355.res; the same selection is written to all
    three files. Sequences from retrieve-seq.2009_04_20.154301.res are
    labelled "1" in upstreamN.txt and "-1" in downstreamN.txt and vice versa
    for retrieve-seq.2009_04_20.154355.res. The sequences in randomN.txt are
    labelled "1" or "-1" randomly.

    If N is missing or is not an integer, "Enter N" is printed and the
    program exits with status 1.

    @raise ValueError: if N/2 is negative or exceeds the number of distinct
        sequences available in either input file.
    """
    try:
        N = sys.argv[1]
        countN = int(N)
    except (IndexError, ValueError):
        # IndexError: no argument given; ValueError: argument is not an int.
        print("Enter N")
        sys.exit(1)

    # Floor division keeps the sample size an int on Python 2 and Python 3.
    half = countN // 2

    updata = _read_sequences("../data/retrieve-seq.2009_04_20.154301.res")
    # NOTE(review): the first three characters of every downstream sequence
    # are dropped, as in the original code; the reason is not visible here
    # (presumably a leading codon) - confirm.
    downdata = _read_sequences(
        "../data/retrieve-seq.2009_04_20.154355.res", skip=3)

    if half < 0 or half > min(len(updata), len(downdata)):
        raise ValueError(
            "cannot draw %d sequences per file: %d upstream and %d "
            "downstream distinct sequences are available"
            % (half, len(updata), len(downdata)))

    up_sample = random.sample(list(updata), half)
    down_sample = random.sample(list(downdata), half)

    upstream_lines = []
    downstream_lines = []
    random_lines = []
    for up_seq, down_seq in zip(up_sample, down_sample):
        upstream_lines.append(up_seq + " 1 \n")
        upstream_lines.append(down_seq + " -1 \n")
        downstream_lines.append(up_seq + " -1 \n")
        downstream_lines.append(down_seq + " 1 \n")
        random_lines.append(
            up_seq + " " + str(random.choice([1, -1])) + " \n")
        random_lines.append(
            down_seq + " " + str(random.choice([1, -1])) + " \n")

    # Outputs are only created once all input has been read and sampled, so
    # a failure above does not leave empty, truncated data files behind.
    _write_lines("../data/upstream" + N + ".txt", upstream_lines)
    _write_lines("../data/downstream" + N + ".txt", downstream_lines)
    _write_lines("../data/random" + N + ".txt", random_lines)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    convert_data()