#!/usr/bin/python

####################################################
#                                                  #
# SEQUENCE EXTRACTION FROM SUBJECT AND QUERY FILES #
#       WITHOUT STOP CODONS WITHIN SEQUECE         #
#                                                  #
#      INPUT - TWO FILES WITH DNA SEQUENCES        # 
#              (QUERY AND SUBJECT)                 #
#                       AND                        #
#    PARSED BLAST REPORT BY tcl_blast_parser_123   #
#                                                  #
#     NOTE! BLAST REPORT CORRESPONDS TO BLASTX     #
#    (TRANSLATED DNA VERSUS PROTEIN SEQUENCES)     #
# IT MEANS THAT SUBJECT DNA FILE SHOULD CORRESPOND #
# EXACTLY TO PROTEIN QUERY FILE FOR PROPER RESULTS #
#                                                  #
####################################################

####################################################
#                                                  #
#       COPYRIGHT, ALEXANDER KOZIK, 2003           #
#                                                  #
####################################################

def Seqs_Extractor(in_name1, in_name2, in_name3, out_name, out_dir):

	global stop_codon_status

	print "INPUT FILE 1 (QR_SEQS) :   " + in_name1
	print "INPUT FILE 2 (DB_SEQS) :   " + in_name2
	print "INPUT FILE 3 (BLASTX)  :   " + in_name3
	print "OUTPUT FILE (SEQ FRAGM):   " + out_name
	print "OUTPUT DIR             :   " + out_dir

	cod_name  = out_name + '.triplets'
	hit_name  = out_name + '.subj_seq'
	info_name = out_name + '.overlap_info'
	cons_codn = out_name + '.x_triplets'
	kska_name = out_name + '.x_kska'

	in_file1  = open(in_name1, "rb")
	in_file2  = open(in_name2, "rb")
	in_file3  = open(in_name3, "rb")
	# out_file  = open(out_name, "wb")
	out_file  = open(out_name + '.query_seq', "wb")
	out_file2 = open(hit_name, "wb")
	cod_file  = open(cod_name, "wb")
	kska_file = open(kska_name, "wb")
	info_file = open(info_name, "wb")
	xcdn_file = open(cons_codn, "wb")

	os.mkdir(out_dir)
	group_number = 1

	seq_list  = []
	hit_list  = []
	seq_array = {}
	hit_array = {}

	##########################################
	# READ SEQUENCE DATA INTO ARRAY : STEP 1 #
	#         QUERY DNA FASTA FILE           #
	##########################################
	line_counter = 0
	#################################
	while 1:
		### LAST SEQUENCE IN FASTA FILE ###
		t = in_file1.readline()
		if t == '':
			###  SUB_SEQ FUNCTION  ###
			have_seqs = "".join(my_seqs)
			seqs_len = len(have_seqs)
			seq_array[proper_id] = have_seqs
			####  END OF SUB_SEQ  ####
			break
		### SEQ PROCESSING ###
		if '\n' in t:
			t = t[:-1]
		if '\r' in t:
			t = t[:-1]

		fasta_match = t[0:1]
		if fasta_match == ">":
			gi_test = t[0:4]
			if gi_test == ">gi|":
				# print gi_test
				descr_line = t
				descr_line = re.sub("^>gi\|", "", descr_line)
				descr_line = re.sub("\|", '\t', descr_line, 1)
				# print line_counter
				line_counter += 1
			else:
				descr_line = t
				descr_line = re.sub("^>", "", descr_line)
				descr_line = re.sub(" ", '\t', descr_line, 1)
				# print line_counter
				line_counter += 1
			good_head = string.split(descr_line, '\t')[0]
			try:
				long_tail = string.split(descr_line, '\t')[1]
			except:
				long_tail = "no description found"
			if good_head in seq_list:
				running_text = "\
\n\n  Ooops... ID duplication  \n\n  check input for duplications  \n\n\n ID: " + good_head + "\n\n\n"
				print running_text
				###
				### INSERT TKINTER TEXT MESSAGE BOX
				###
				break
			seq_list.append(good_head)
			print `line_counter` + '\t' + good_head
			if line_counter != 1:
				###  SUB_SEQ FUNCTION  ###
				have_seqs = "".join(my_seqs)
				seqs_len = len(have_seqs)
				seq_array[proper_id] = have_seqs
				#print good_head
				#print have_seqs
				####  END OF SUB_SEQ  ####
			have_seqs = ""
			my_seqs = []
		if fasta_match != ">" and fasta_match != "":
			proper_id = good_head
			good_name = long_tail
			my_seqs.append(t)
	#################################
	###       END OF STEP 1         #
	#################################

	seq_list2  = []
	seq_array2 = {}

	##########################################
	# READ SEQUENCE DATA INTO ARRAY : STEP 2 #
	#        SUBJECT DNA FASTA FILE          #
	##########################################
	line_counter = 0
	#################################
	while 1:
		### LAST SEQUENCE IN FASTA FILE ###
		t = in_file2.readline()
		if t == '':
			###  SUB_SEQ FUNCTION  ###
			have_seqs = "".join(my_seqs)
			seqs_len = len(have_seqs)
			seq_array2[proper_id] = have_seqs
			####  END OF SUB_SEQ  ####
			break
		### SEQ PROCESSING ###
		if '\n' in t:
			t = t[:-1]
		if '\r' in t:
			t = t[:-1]

		fasta_match = t[0:1]
		if fasta_match == ">":
			gi_test = t[0:4]
			if gi_test == ">gi|":
				# print gi_test
				descr_line = t
				descr_line = re.sub("^>gi\|", "", descr_line)
				descr_line = re.sub("\|", '\t', descr_line, 1)
				# print line_counter
				line_counter += 1
			else:
				descr_line = t
				descr_line = re.sub("^>", "", descr_line)
				descr_line = re.sub(" ", '\t', descr_line, 1)
				# print line_counter
				line_counter += 1
			good_head = string.split(descr_line, '\t')[0]
			try:
				long_tail = string.split(descr_line, '\t')[1]
			except:
				long_tail = "no description found"
			if good_head in seq_list2:
				running_text = "\
\n\n  Ooops... ID duplication  \n\n  check input for duplications  \n\n\n ID: " + good_head + "\n\n\n"
				print running_text
				###
				### INSERT TKINTER TEXT MESSAGE BOX
				###
				break
			seq_list2.append(good_head)
			print `line_counter` + '\t' + good_head
			if line_counter != 1:
				###  SUB_SEQ FUNCTION  ###
				have_seqs = "".join(my_seqs)
				seqs_len = len(have_seqs)
				seq_array2[proper_id] = have_seqs
				#print good_head
				#print have_seqs
				####  END OF SUB_SEQ  ####
			have_seqs = ""
			my_seqs = []
		if fasta_match != ">" and fasta_match != "":
			proper_id = good_head
			good_name = long_tail
			my_seqs.append(t)
	#################################
	#        END OF STEP 2          #
	#################################

	#################################
	hit_count = 0
	#################################
	#       READ BLASTX DATA        #
	#################################
	while 1:
		t = in_file3.readline()
		if t == '':
			break
		if '\n' in t:
			t = t[:-1]
		if '\r' in t:
			t = t[:-1]
		t = t.split('\t')
		#################
		id  = t[0]
		# print 'id' + '\t' + id
		hit = t[1]
		# print 'hit' + '\t' + hit
		ds  = t[2]
		# print 'ds' + '\t' + ds
		if ds != 'no_hits_found':
			aln_idn = t[4]
			hnb = t[7]
			hnb = int(hnb)
			# print 'hnb' + '\t' + hnb
			fr  = t[8]
			# print 'fr' + '\t' + fr
			qst = t[9]
			qst = int(qst)
			# print 'qst' + '\t' + qst
			qen = t[10]
			qen = int(qen)
			# print 'qen' + '\t' + qen
			hst = t[11]
			hst = int(hst)
			# print 'hst' + '\t' + hst
			hen = t[12]
			hen = int(hen)
			# print 'hen' + '\t' + hen
			gaps = t[14]
			#################################
			#    SUBSEQUENCE EXTRACTION     #
			#################################
			if hnb == 1:
				hit_count += 1
				print `hit_count` + '\t' + id
				########################
				sequence = seq_array[id]
				# print sequence
				seq_len  = len(sequence)
				if qen > qst:
					direction = 'forward'
					sub_seq  = sequence[(qst-1):(qen-0)]
					sub_len  = len(sub_seq)
				if qen < qst:
					direction = 'reverse'
					t_r = list(sequence)	# string -> list of chars
					t_r.reverse()	# inplace reverse the list
					t_r = string.join(t_r, '')	# list of strings -> string
					t_r = string.upper(t_r)		# all uppercase
					t_r = re.sub("A", "t", t_r)	# A -> t
					t_r = re.sub("T", "a", t_r)	# T -> a
					t_r = re.sub("G", "c", t_r)	# G -> c
					t_r = re.sub("C", "g", t_r)	# C -> g
					t_r = string.upper(t_r)		# back to uppercase
					sequence = t_r
					sub_seq  = sequence[(seq_len-qst):(seq_len-qen + 1)]
					sub_len  = len(sub_seq)
				# print sub_seq
				mod_3 = math.fmod(sub_len, 3)
				if mod_3 != 0:
					print "MOD IS NOT 3 !!!"
					time.sleep(1)
				################################
				sequence2 = seq_array2[hit]
				# print sequence2
				seq_len2  = len(sequence2)
				## PROTEIN POSITION INTO DNA POSITION ###
				hst = hst*3 - 2
				hen = hen*3
				direction2 = 'forward'
				sub_seq2  = sequence2[(hst-1):(hen-0)]
				sub_len2  = len(sub_seq2)

				mod_3s = math.fmod(sub_len2, 3)
				if mod_3s != 0:
					print "MOD IS NOT 3 !!!"
					time.sleep(1)

				n_find  = sub_seq.rfind('N')
				n_find2 = sub_seq2.rfind('N')

				if n_find == -1 and n_find2 == -1:
				################################
					Count_Triplets(id, hit, sub_len, sub_seq, sub_len2, sub_seq2, kska_file, gaps)
				################################
					if stop_codon_status == 'GOOD':
						out_file.write('>' + id + ' length: ' + `sub_len` + ' ' + 'mod: ' + `mod_3` + ' ' + \
						direction + ' ' + `qst` + '-' + `qen` + \
						' [' + hit + ' ' + `hst` + '-' + `hen` + ']' + '\n' + sub_seq + '\n')
				################################
						out_file2.write('>' + hit + ' length: ' + `sub_len2` + ' ' + 'mod: ' + `mod_3s` + ' ' + \
						direction2 + ' ' + `hst` + '-' + `hen` + ' subsequence ' + \
						'\n' + sub_seq2 + '\n')
				################################
						info_file.write(id + '\t' + `qst` + '\t' + `qen` + '\t' \
								+ hit + '\t' + `hst` + '\t' + `hen` + '\t' + gaps + '\t' + aln_idn + '\n')
				################################
						if sub_len == sub_len2 and gaps == "NO_GAPS":
							current_group = "group_A_" + `group_number` + ".fasta"
							current_group = out_dir + "/" + current_group
							current_group_file = open(current_group, "wb")
							current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \
									'>' + hit + '\n' + sub_seq2 + '\n')
						if sub_len == sub_len2 and gaps != "NO_GAPS":
							current_group = "group_B_" + `group_number` + ".fasta"
							current_group = out_dir + "/" + current_group
							current_group_file = open(current_group, "wb")
							current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \
									'>' + hit + '\n' + sub_seq2 + '\n')
						if sub_len != sub_len2:
							current_group = "group_C_" + `group_number` + ".fasta"
							current_group = out_dir + "/" + current_group
							current_group_file = open(current_group, "wb")
							current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \
									'>' + hit + '\n' + sub_seq2 + '\n')
				################################
						group_number += 1
						current_group_file.close()
				################################

	### TRIPLET SUMMARY ###
	Triplet_Summary(cod_file, xcdn_file)

	in_file1.close()
	in_file2.close()
	in_file3.close()
	cod_file.close()
	out_file.close()
	kska_file.close()
	info_file.close()
	xcdn_file.close()

def Triplet_Summary(cod_file, xcdn_file):

	global atgc_array
	global atgc_list
	global atgc_array2
	global atgc_list2
	global aa_array
	global cons_atgc_array
	global cons_atgc_array2

	global aa_count_array
	global cons_aa_count_array
	global aa_count_array2
	global cons_aa_count_array2
	global amac_list

	test1 = 0
	test2 = 0
	test1a = 0
	test2a = 0
	test1c = 0
	test2c = 0
	test1ca = 0
	test2ca = 0

	### ALL DATA ###
	for item in atgc_list:
		fraction  = (atgc_array[item]*100000.00)/atgc_array['atgc']
		test1 = test1 + atgc_array[item]
		fraction2 = (atgc_array2[item]*100000.00)/atgc_array2['atgc']
		test2 = test2 + atgc_array2[item]
		fraction = round(fraction)
		fraction2 = round(fraction2)
		diff = fraction - fraction2
		if fraction2 != 0:
			diff100 = ((fraction*1.00)/fraction2)*100
			diff100 = round(diff100)
		if fraction2 == 0:
			diff100 = 0.0
		am_ac = aa_array[item]
		#### PROPORTION OF AM_AC ####
		total_amac = aa_count_array[am_ac]
		total_amac2 = aa_count_array2[am_ac]
		tot_amac_fr = (total_amac*100.00)/aa_count_array['ALL']
		tot_amac_fr2 = (total_amac2*100.00)/aa_count_array2['ALL']
		if total_amac != 0:
		   fract_amac = (atgc_array[item]*100.00)/total_amac
		if total_amac == 0:
		   fract_amac = 0
		if total_amac2 != 0:
		   fract_amac2 = (atgc_array2[item]*100.00)/total_amac2
		if total_amac2 == 0:
		   fract_amac2 = 0
		fract_amac = round(fract_amac,2)
		fract_amac2 = round(fract_amac2,2)
		tot_amac_fr = round(tot_amac_fr,2)
		tot_amac_fr2 = round(tot_amac_fr2,2)
		tot_amac_diff = tot_amac_fr - tot_amac_fr2
		diff_amac = fract_amac - fract_amac2
		fract_amac = str(fract_amac)
		fract_amac2 = str(fract_amac2)
		diff_amac = str(diff_amac)
		tot_amac_fr = str(tot_amac_fr)
		tot_amac_fr2 = str(tot_amac_fr2)
		tot_amac_diff = str(tot_amac_diff)
		#############################
		cod_file.write(am_ac + '\t' + item + '\t' + `fraction` + '\t' + `fraction2` + '\t' + \
					`diff` + '\t' + `diff100` + '\t' + \
					`atgc_array[item]` + '\t'  + `atgc_array['atgc']` + '\t' + \
					`atgc_array2[item]` + '\t' + `atgc_array2['atgc']` + '\t' + \
					fract_amac + '\t' + fract_amac2 + '\t' + diff_amac + '\t' + \
					'***' + '\t' + tot_amac_fr + '\t' + tot_amac_fr2 + '\t' + tot_amac_diff + '\n')

	######################
	print '************************'
	print '  COUNT TEST 1:  '
	print `test1` + '\t' + `atgc_array['atgc']` + '\t' + 'QUERY TEST'
	print `test2` + '\t' + `atgc_array2['atgc']` + '\t' + 'SUBJECT TEST'
	for item in amac_list:
		test1a = test1a + aa_count_array[item]
		test2a = test2a + aa_count_array2[item]
	print `test1a` + '\t' + `aa_count_array['ALL']` + '\t' + 'AA QUERY TEST'
	print `test2a` + '\t' + `aa_count_array2['ALL']` + '\t' + 'AA SUBJECT TEST'
	print '************************'

	### CONSERVED DATA ###
	for item in atgc_list:
		fraction  = (cons_atgc_array[item]*100000.00)/cons_atgc_array['atgc']
		test1c = test1c + cons_atgc_array[item]
		fraction2 = (cons_atgc_array2[item]*100000.00)/cons_atgc_array2['atgc']
		test2c = test2c + cons_atgc_array2[item]
		fraction = round(fraction)
		fraction2 = round(fraction2)
		diff = fraction - fraction2
		if fraction2 != 0:
			diff100 = ((fraction*1.00)/fraction2)*100
			diff100 = round(diff100)
		if fraction2 == 0:
			diff100 = 0.0
		am_ac = aa_array[item]
		#### PROPORTION OF AM_AC ####
		total_amac = cons_aa_count_array[am_ac]
		total_amac2 = cons_aa_count_array2[am_ac]
		tot_amac_fr = (total_amac*100.00)/cons_aa_count_array['ALL']
		tot_amac_fr2 = (total_amac2*100.00)/cons_aa_count_array2['ALL']
		if total_amac != 0:
		   fract_amac = (cons_atgc_array[item]*100.00)/total_amac
		if total_amac == 0:
		   fract_amac = 0
		if total_amac2 != 0:
		   fract_amac2 = (cons_atgc_array2[item]*100.00)/total_amac2
		if total_amac2 == 0:
		   fract_amac2 = 0
		fract_amac = round(fract_amac,2)
		fract_amac2 = round(fract_amac2,2)
		tot_amac_fr = round(tot_amac_fr,2)
		tot_amac_fr2 = round(tot_amac_fr2,2)
		tot_amac_diff = tot_amac_fr - tot_amac_fr2
		diff_amac = fract_amac - fract_amac2
		fract_amac = str(fract_amac)
		fract_amac2 = str(fract_amac2)
		diff_amac = str(diff_amac)
		tot_amac_fr = str(tot_amac_fr)
		tot_amac_fr2 = str(tot_amac_fr2)
		tot_amac_diff = str(tot_amac_diff)
		#############################
		xcdn_file.write(am_ac + '\t' + item + '\t' + `fraction` + '\t' + `fraction2` + '\t' + \
					`diff` + '\t' + `diff100` + '\t' + \
					`cons_atgc_array[item]` + '\t'  + `cons_atgc_array['atgc']` + '\t' + \
					`cons_atgc_array2[item]` + '\t' + `cons_atgc_array2['atgc']` + '\t' + \
					fract_amac + '\t' + fract_amac2 + '\t' + diff_amac + '\t' + \
					'***' + '\t' + tot_amac_fr + '\t' + tot_amac_fr2 + '\t' + tot_amac_diff + '\n')

	######################
	print ''
	print '************************'
	print '  COUNT TEST 2:  '
	print `test1c` + '\t' + `cons_atgc_array['atgc']` + '\t' + 'QUERY TEST'
	print `test2c` + '\t' + `cons_atgc_array2['atgc']` + '\t' + 'SUBJECT TEST'
	for item in amac_list:
		test1ca = test1ca + cons_aa_count_array[item]
		test2ca = test2ca + cons_aa_count_array2[item]
	print `test1ca` + '\t' + `cons_aa_count_array['ALL']` + '\t' + 'AA QUERY TEST'
	print `test2ca` + '\t' + `cons_aa_count_array2['ALL']` + '\t' + 'AA SUBJECT TEST'
	print '************************'


def Count_Triplets(id, hit, sub_len, sub_seq, sub_len2, sub_seq2, kska_file, gaps):

	global atgc_array
	global atgc_list
	global atgc_array2
	global atgc_list2
	global aa_array
	global cons_atgc_array
	global cons_atgc_array2

	global aa_count_array
	global cons_aa_count_array
	global aa_count_array2
	global cons_aa_count_array2
	global amac_list

	global stop_codon_status
	stop_codon_status = 'GOOD'

	triplet_array = {}
	triplet_array2 = {}
	amac_array = {}
	amac_array2 = {}

	k = 0
	while k < (sub_len - 3):
		triplet = sub_seq[k:(k+3)]
		if triplet == 'TAA' or triplet == 'TAG' or triplet == 'TGA':
			stop_codon_status = 'BAD'
			print 'BAD CODON QUERY'
		k = k + 3
	k = 0
	while k < (sub_len2 - 3):
		triplet = sub_seq2[k:(k+3)]
		if triplet == 'TAA' or triplet == 'TAG' or triplet == 'TGA':
			stop_codon_status = 'BAD'
			print 'BAD CODON SUBJECT'
		k = k + 3

	count_cons = "FALSE"
	if stop_codon_status == 'GOOD':
		if sub_len == sub_len2 and gaps == "NO_GAPS":
			#####          CONSERVATIVE CASE         #####
			##### NO GAPS, IDENTICAL SEQUENCE LENGTH #####
			k = 0
			q = 1
			while k < sub_len:
				triplet = sub_seq[k:(k+3)]
				triplet2 = sub_seq2[k:(k+3)]
				#######################
				triplet = string.lower(triplet)
				triplet2 = string.lower(triplet2)
				if triplet in atgc_list and triplet2 in atgc_list:
					triplet_array[q] = triplet
					triplet_array2[q] = triplet2
					amac = aa_array[triplet]
					amac2 = aa_array[triplet2]
					cons_atgc_array[triplet] += 1
					cons_atgc_array2[triplet2] += 1
					cons_atgc_array['atgc'] += 1
					cons_atgc_array2['atgc'] += 1
					cons_aa_count_array[amac] += 1
					cons_aa_count_array2[amac2] += 1
					cons_aa_count_array['ALL'] += 1
					cons_aa_count_array2['ALL'] += 1
					amac_array[q] = amac
					amac_array2[q] = amac2
					count_cons = "TRUE"
				#######################
				k = k + 3
				q = q + 1

		##### ALL CASES #####

		#### QUERY SEQUENCE ####
		k = 0
		q = 1
		while k < sub_len:
			triplet = sub_seq[k:(k+3)]
			#######################
			triplet = string.lower(triplet)
			if triplet in atgc_list:
				am_ac = aa_array[triplet]
				aa_count_array[am_ac] += 1
				atgc_array[triplet] += 1
				atgc_array['atgc'] += 1
				aa_count_array['ALL'] += 1
			#######################
			k = k + 3
			q = q + 1

		#### SUBJECT SEQUENCE ####
		k = 0
		q = 1
		while k < sub_len2:
			triplet = sub_seq2[k:(k+3)]
			#######################
			triplet = string.lower(triplet)
			if triplet in atgc_list2:
				am_ac = aa_array[triplet]
				aa_count_array2[am_ac] += 1
				atgc_array2[triplet] += 1
				atgc_array2['atgc'] += 1
				aa_count_array2['ALL'] += 1
			#######################
			k = k + 3
			q = q + 1

		#######################
		if count_cons == "TRUE":
		###    COUNT KsKa   ###
			p = 1
			ks = 0
			ka = 0
			pm = 0
			am = 0

			while p < q:
				try:
					oops1 = triplet_array[p]
					oops2 = triplet_array2[p]
					test_pair = "GOOD"
				except:
					test_pair = "BAD TRIPLET"
					print test_pair
				if test_pair == "GOOD":
					if triplet_array[p] == triplet_array2[p]:
						pm = pm + 1
						am = am + 1
					if triplet_array[p] != triplet_array2[p] and amac_array[p] != amac_array2[p]:
						ka = ka + 1
					if triplet_array[p] != triplet_array2[p] and amac_array[p] == amac_array2[p]:
						ks = ks + 1
						am = am + 1
				p = p + 1

			prot_len = sub_len/3
			kska_diff = ks - ka
			# kska_frct = (pm*100.0)/am # FRACTION OF CODON MATCHES WITHIN AMINO ACIDS MATCHES
			kska_frct = ((am - ks)*100.0)/am # FRACTION OF CODON MATCHES WITHIN AMINO ACIDS MATCHES
			kska_frct = round(kska_frct)
			identity = (am*1.0/prot_len)*100
			identity = round(identity)
			kska_file.write(id + '\t' + hit + '\t' + `ks` + '\t' + `ka` + '\t' + `kska_diff` + \
					'\t' + '***' + '\t' + `pm` + '\t' + `am` + '\t' +`prot_len` + '\t' + \
					'***' + '\t' + `identity` + '\t' + `kska_frct` + '\n')


def Set_Zero_Values():

	global atgc_array
	global atgc_list
	global aa_array
	global cons_atgc_array
	global cons_atgc_array2
	global aa_count_array
	global cons_aa_count_array
	global aa_count_array2
	global cons_aa_count_array2
	global amac_list

	atgc_array = {}
	aa_array = {}
	cons_atgc_array = {}
	cons_atgc_array2 = {}
	aa_count_array = {}
	cons_aa_count_array = {}
	aa_count_array2 = {}
	cons_aa_count_array2 = {}

	aa_array['aaa'] = 'Lys'
	aa_array['aat'] = 'Asn'
	aa_array['aag'] = 'Lys'
	aa_array['aac'] = 'Asn'
	aa_array['ata'] = 'Ile'
	aa_array['att'] = 'Ile'
	aa_array['atg'] = 'Met'
	aa_array['atc'] = 'Ile'
	aa_array['aga'] = 'Arg'
	aa_array['agt'] = 'Ser'
	aa_array['agg'] = 'Arg'
	aa_array['agc'] = 'Ser'
	aa_array['aca'] = 'Thr'
	aa_array['act'] = 'Thr'
	aa_array['acg'] = 'Thr'
	aa_array['acc'] = 'Thr'

	aa_array['taa'] = 'End'
	aa_array['tat'] = 'Tyr'
	aa_array['tag'] = 'End'
	aa_array['tac'] = 'Tyr'
	aa_array['tta'] = 'Leu'
	aa_array['ttt'] = 'Phe'
	aa_array['ttg'] = 'Leu'
	aa_array['ttc'] = 'Phe'
	aa_array['tga'] = 'End'
	aa_array['tgt'] = 'Cys'
	aa_array['tgg'] = 'Trp'
	aa_array['tgc'] = 'Cys'
	aa_array['tca'] = 'Ser'
	aa_array['tct'] = 'Ser'
	aa_array['tcg'] = 'Ser'
	aa_array['tcc'] = 'Ser'

	aa_array['gaa'] = 'Glu'
	aa_array['gat'] = 'Asp'
	aa_array['gag'] = 'Glu'
	aa_array['gac'] = 'Asp'
	aa_array['gta'] = 'Val'
	aa_array['gtt'] = 'Val'
	aa_array['gtg'] = 'Val'
	aa_array['gtc'] = 'Val'
	aa_array['gga'] = 'Gly'
	aa_array['ggt'] = 'Gly'
	aa_array['ggg'] = 'Gly'
	aa_array['ggc'] = 'Gly'
	aa_array['gca'] = 'Ala'
	aa_array['gct'] = 'Ala'
	aa_array['gcg'] = 'Ala'
	aa_array['gcc'] = 'Ala'

	aa_array['caa'] = 'Gln'
	aa_array['cat'] = 'His'
	aa_array['cag'] = 'Gln'
	aa_array['cac'] = 'His'
	aa_array['cta'] = 'Leu'
	aa_array['ctt'] = 'Leu'
	aa_array['ctg'] = 'Leu'
	aa_array['ctc'] = 'Leu'
	aa_array['cga'] = 'Arg'
	aa_array['cgt'] = 'Arg'
	aa_array['cgg'] = 'Arg'
	aa_array['cgc'] = 'Arg'
	aa_array['cca'] = 'Pro'
	aa_array['cct'] = 'Pro'
	aa_array['ccg'] = 'Pro'
	aa_array['ccc'] = 'Pro'

	amac_list  =   ['Ala', 'Asn', 'Asp', 'Arg', 'Cys', 'Glu', \
		       	'Phe', 'Gly', 'His', 'Ile', 'Lys', 'Leu', \
		       	'Met', 'Pro', 'Gln', 'Ser', 'Thr', 'Val', \
		       	'Trp', 'Tyr', 'End']

	for item in amac_list:
		aa_count_array[item] = 0
		aa_count_array2[item] = 0
		cons_aa_count_array[item] = 0
		cons_aa_count_array2[item] = 0

	aa_count_array['ALL'] = 0
	aa_count_array2['ALL'] = 0
	cons_aa_count_array['ALL'] = 0
	cons_aa_count_array2['ALL'] = 0

	atgc_list  =   ['aaa', 'aat', 'aag', 'aac', \
			'ata', 'att', 'atg', 'atc', \
			'aga', 'agt', 'agg', 'agc', \
			'aca', 'act', 'acg', 'acc', \
			'taa', 'tat', 'tag', 'tac', \
			'tta', 'ttt', 'ttg', 'ttc', \
			'tga', 'tgt', 'tgg', 'tgc', \
			'tca', 'tct', 'tcg', 'tcc', \
			'gaa', 'gat', 'gag', 'gac', \
			'gta', 'gtt', 'gtg', 'gtc', \
			'gga', 'ggt', 'ggg', 'ggc', \
			'gca', 'gct', 'gcg', 'gcc', \
			'caa', 'cat', 'cag', 'cac', \
			'cta', 'ctt', 'ctg', 'ctc', \
			'cga', 'cgt', 'cgg', 'cgc', \
			'cca', 'cct', 'ccg', 'ccc']

	for item in atgc_list:
		atgc_array[item] = 0

	atgc_array['atgc'] = 0
	#################
	global atgc_array2
	global atgc_list2
	atgc_array2 = {}
	atgc_list2  = atgc_list

	for item in atgc_list2:
		atgc_array2[item] = 0

	atgc_array2['atgc'] = 0
	#################
	for item in atgc_list:
		cons_atgc_array[item] = 0

	cons_atgc_array['atgc'] = 0

	for item in atgc_list2:
		cons_atgc_array2[item] = 0

	cons_atgc_array2['atgc'] = 0
	#################

##################
#                #
#   MAIN BODY    #
#                #
##################

import math
import re
import sys
import string
import time
import os

if __name__ == "__main__":
	if len(sys.argv) <= 5 or len(sys.argv) > 6:
		print "Program usage: "
		print "input_file1(query_seqs) input_file2(db_seqs) input_file3(blastx.info2) output_file output_dir"
		exit
	if len(sys.argv) == 6:
		in_name1  = sys.argv[1]
		in_name2  = sys.argv[2]
		in_name3  = sys.argv[3]
		out_name  = sys.argv[4]
		out_dir   = sys.argv[5]
		Set_Zero_Values()
		Seqs_Extractor(in_name1, in_name2, in_name3, out_name, out_dir)
### THE END ###