#!/usr/bin/python

def Seqs_Drobilka(in_name, out_name, seq_type):

	print in_name + ' ' + out_name

	in_file  = open(in_name,  "rb")
	out_file = open(out_name, "wb")
	out_stat = open(out_name + '.stat', "wb")
	out_tab  = open(out_name + '.tab', "wb")

	if seq_type == "DNA":
		out_stat.write("#SEQ_ID" + '\t' + "LENGTH" + '\t' + " A " + '\t' + " T " + '\t' + \
				" G " + '\t' + " C " + '\t' + "AT" + '\t' + "GC" + '\t' \
				+ "ATGC" + '\t' + " N " + '\t' + " X " + '\n')
	if seq_type == "prot":
		out_stat.write("#SEQ_ID" + '\t' + "LENGTH" + '\t' + \
				" A " + '\t' + " B " + '\t' + \
				" C " + '\t' + " D " + '\t' + \
				" E " + '\t' + " F " + '\t' + \
				" G " + '\t' + " H " + '\t' + \
				" I " + '\t' + " J " + '\t' + \
				" K " + '\t' + " L " + '\t' + \
				" M " + '\t' + " N " + '\t' + \
				" O " + '\t' + " P " + '\t' + \
				" Q " + '\t' + " R " + '\t' + \
				" S " + '\t' + " T " + '\t' + \
				" U " + '\t' + " V " + '\t' + \
				" W " + '\t' + " X " + '\t' + \
				" Y " + '\t' + " Z " + '\n')

	fasta_id_array = []
	line_counter = 0
	have_seqs = ""
	proper_id = ""
	my_seqs = []
	tot_len = 0
	a_tot = 0
	b_tot = 0
	c_tot = 0
	d_tot = 0
	e_tot = 0
	f_tot = 0
	g_tot = 0
	h_tot = 0
	i_tot = 0
	j_tot = 0
	k_tot = 0
	l_tot = 0
	m_tot = 0
	n_tot = 0
	o_tot = 0
	p_tot = 0
	q_tot = 0
	r_tot = 0
	s_tot = 0
	t_tot = 0
	u_tot = 0
	v_tot = 0
	w_tot = 0
	x_tot = 0
	y_tot = 0
	z_tot = 0

	while 1:
		t = in_file.readline()
		if t == '':
			###  SUB_SEQ FUNCTION  ###
			have_seqs = "".join(my_seqs)
			seqs_len = len(have_seqs)
			###   STRING PROCESSING   ###
			if seqs_len != 0:
				have_seqs = "".join(my_seqs)
				seqs_len = len(have_seqs)
				tot_len = tot_len + seqs_len
				###   STRING PROCESSING   ###
				abc_up = have_seqs.upper()
				###
				a_count = abc_up.count("A")
				a_tot = a_tot + a_count
				###
				b_count = abc_up.count("B")
				b_tot = b_tot + b_count
				###
				c_count = abc_up.count("C")
				c_tot = c_tot + c_count
				###
				d_count = abc_up.count("D")
				d_tot = d_tot + d_count
				###
				e_count = abc_up.count("E")
				e_tot = e_tot + e_count
				###
				f_count = abc_up.count("F")
				f_tot = f_tot + f_count
				###
				g_count = abc_up.count("G")
				g_tot = g_tot + g_count
				###
				h_count = abc_up.count("H")
				h_tot = h_tot + h_count
				###
				i_count = abc_up.count("I")
				i_tot = i_tot + i_count
				###
				j_count = abc_up.count("J")
				j_tot = j_tot + j_count
				###
				k_count = abc_up.count("K")
				k_tot = k_tot + k_count
				###
				l_count = abc_up.count("L")
				l_tot = l_tot + l_count
				###
				m_count = abc_up.count("M")
				m_tot = m_tot + m_count
				###
				n_count = abc_up.count("N")
				n_tot = n_tot + n_count
				###
				o_count = abc_up.count("O")
				o_tot = o_tot + o_count
				###
				p_count = abc_up.count("P")
				p_tot = p_tot + p_count
				###
				q_count = abc_up.count("Q")
				q_tot = q_tot + q_count
				###
				r_count = abc_up.count("R")
				r_tot = r_tot + r_count
				###
				s_count = abc_up.count("S")
				s_tot = s_tot + s_count
				###
				t_count = abc_up.count("T")
				t_tot = t_tot + t_count
				###
				u_count = abc_up.count("U")
				u_tot = u_tot + u_count
				###
				v_count = abc_up.count("V")
				v_tot = v_tot + v_count
				###
				w_count = abc_up.count("W")
				w_tot = w_tot + w_count
				###
				x_count = abc_up.count("X")
				x_tot = x_tot + x_count
				###
				y_count = abc_up.count("Y")
				y_tot = y_tot + y_count
				###
				z_count = abc_up.count("Z")
				z_tot = z_tot + z_count
				###
				a_fract = round(a_count*100.00/seqs_len)
				b_fract = round(b_count*100.00/seqs_len)
				c_fract = round(c_count*100.00/seqs_len)
				d_fract = round(d_count*100.00/seqs_len)
				e_fract = round(e_count*100.00/seqs_len)
				f_fract = round(f_count*100.00/seqs_len)
				g_fract = round(g_count*100.00/seqs_len)
				h_fract = round(h_count*100.00/seqs_len)
				i_fract = round(i_count*100.00/seqs_len)
				j_fract = round(j_count*100.00/seqs_len)
				k_fract = round(k_count*100.00/seqs_len)
				l_fract = round(l_count*100.00/seqs_len)
				m_fract = round(m_count*100.00/seqs_len)
				n_fract = round(n_count*100.00/seqs_len)
				o_fract = round(o_count*100.00/seqs_len)
				p_fract = round(p_count*100.00/seqs_len)
				q_fract = round(q_count*100.00/seqs_len)
				r_fract = round(r_count*100.00/seqs_len)
				s_fract = round(s_count*100.00/seqs_len)
				t_fract = round(t_count*100.00/seqs_len)
				u_fract = round(u_count*100.00/seqs_len)
				v_fract = round(v_count*100.00/seqs_len)
				w_fract = round(w_count*100.00/seqs_len)
				x_fract = round(x_count*100.00/seqs_len)
				y_fract = round(y_count*100.00/seqs_len)
				z_fract = round(z_count*100.00/seqs_len)
				###
				at_fract = a_fract + t_fract
				gc_fract = g_fract + c_fract
				atgc_fract = a_fract + t_fract + g_fract + c_fract
				###
				if seq_type == "DNA":
					out_stat.write(proper_id + '\t' + `seqs_len` + '\t' + `a_fract` + '\t' + `t_fract` + '\t' + \
							`g_fract` + '\t' + `c_fract` + '\t' + `at_fract` + '\t' + `gc_fract` + '\t' \
							+ `atgc_fract` + '\t' + 'N: ' + `n_fract` + '\t' + 'X: ' + `x_fract` + '\n')
				if seq_type == "prot":
					out_stat.write(proper_id + '\t' + `seqs_len` + '\t' + \
							`a_fract` + '\t' + `b_fract` + '\t' + \
							`c_fract` + '\t' + `d_fract` + '\t' + \
							`e_fract` + '\t' + `f_fract` + '\t' + \
							`g_fract` + '\t' + `h_fract` + '\t' + \
							`i_fract` + '\t' + `j_fract` + '\t' + \
							`k_fract` + '\t' + `l_fract` + '\t' + \
							`m_fract` + '\t' + `n_fract` + '\t' + \
							`o_fract` + '\t' + `p_fract` + '\t' + \
							`q_fract` + '\t' + `r_fract` + '\t' + \
							`s_fract` + '\t' + `t_fract` + '\t' + \
							`u_fract` + '\t' + `v_fract` + '\t' + \
							`w_fract` + '\t' + `x_fract` + '\t' + \
							`y_fract` + '\t' + `z_fract` + '\n')
			### STRING PROCESSING END ###
			# print seqs_len
			out_file.write('>' + proper_id + ' ' + good_name + '\n')
			out_file.write(have_seqs + '\n')
			####  END OF SUB_SEQ  ####
			out_tab.write(proper_id + '\t' + `seqs_len` + '\t' + have_seqs + '\n')
			##########################
			break
		if '\n' in t:
			t = t[:-1]
		if '\r' in t:
			t = t[:-1]

		fasta_match = t[0:1]
		if fasta_match == ">":
			gi_test = t[0:4]
			if gi_test == ">gi|":
				# print gi_test
				descr_line = t
				descr_line = re.sub("^>gi\|", "", descr_line)
				descr_line = re.sub("\|", '\t', descr_line, 1)
				# print line_counter
				line_counter += 1
			else:
				descr_line = t
				descr_line = re.sub("^>", "", descr_line)
				descr_line = re.sub(" ", '\t', descr_line, 1)
				# print line_counter
				line_counter = line_counter + 1
			good_head = string.split(descr_line, '\t')[0]
			try:
				long_tail = string.split(descr_line, '\t')[1]
			except:
				long_tail = ""
			if good_head in fasta_id_array:
				running_text = "\
\n\n  Ooops... ID duplication  \n\n  check input for duplications  \n\n\n ID: " + good_head + "\n\n\n"
				print running_text
				###
				### INSERT TKINTER TEXT MESSAGE BOX
				###
				break
			fasta_id_array.append(good_head)
			print `line_counter` + '\t' + good_head
			if line_counter != 1:
				###  SUB_SEQ FUNCTION  ###
				have_seqs = "".join(my_seqs)
				seqs_len = len(have_seqs)
				tot_len = tot_len + seqs_len
				###   STRING PROCESSING   ###
				abc_up = have_seqs.upper()
				###
				a_count = abc_up.count("A")
				a_tot = a_tot + a_count
				###
				b_count = abc_up.count("B")
				b_tot = b_tot + b_count
				###
				c_count = abc_up.count("C")
				c_tot = c_tot + c_count
				###
				d_count = abc_up.count("D")
				d_tot = d_tot + d_count
				###
				e_count = abc_up.count("E")
				e_tot = e_tot + e_count
				###
				f_count = abc_up.count("F")
				f_tot = f_tot + f_count
				###
				g_count = abc_up.count("G")
				g_tot = g_tot + g_count
				###
				h_count = abc_up.count("H")
				h_tot = h_tot + h_count
				###
				i_count = abc_up.count("I")
				i_tot = i_tot + i_count
				###
				j_count = abc_up.count("J")
				j_tot = j_tot + j_count
				###
				k_count = abc_up.count("K")
				k_tot = k_tot + k_count
				###
				l_count = abc_up.count("L")
				l_tot = l_tot + l_count
				###
				m_count = abc_up.count("M")
				m_tot = m_tot + m_count
				###
				n_count = abc_up.count("N")
				n_tot = n_tot + n_count
				###
				o_count = abc_up.count("O")
				o_tot = o_tot + o_count
				###
				p_count = abc_up.count("P")
				p_tot = p_tot + p_count
				###
				q_count = abc_up.count("Q")
				q_tot = q_tot + q_count
				###
				r_count = abc_up.count("R")
				r_tot = r_tot + r_count
				###
				s_count = abc_up.count("S")
				s_tot = s_tot + s_count
				###
				t_count = abc_up.count("T")
				t_tot = t_tot + t_count
				###
				u_count = abc_up.count("U")
				u_tot = u_tot + u_count
				###
				v_count = abc_up.count("V")
				v_tot = v_tot + v_count
				###
				w_count = abc_up.count("W")
				w_tot = w_tot + w_count
				###
				x_count = abc_up.count("X")
				x_tot = x_tot + x_count
				###
				y_count = abc_up.count("Y")
				y_tot = y_tot + y_count
				###
				z_count = abc_up.count("Z")
				z_tot = z_tot + z_count
				###
				a_fract = round(a_count*100.00/seqs_len)
				b_fract = round(b_count*100.00/seqs_len)
				c_fract = round(c_count*100.00/seqs_len)
				d_fract = round(d_count*100.00/seqs_len)
				e_fract = round(e_count*100.00/seqs_len)
				f_fract = round(f_count*100.00/seqs_len)
				g_fract = round(g_count*100.00/seqs_len)
				h_fract = round(h_count*100.00/seqs_len)
				i_fract = round(i_count*100.00/seqs_len)
				j_fract = round(j_count*100.00/seqs_len)
				k_fract = round(k_count*100.00/seqs_len)
				l_fract = round(l_count*100.00/seqs_len)
				m_fract = round(m_count*100.00/seqs_len)
				n_fract = round(n_count*100.00/seqs_len)
				o_fract = round(o_count*100.00/seqs_len)
				p_fract = round(p_count*100.00/seqs_len)
				q_fract = round(q_count*100.00/seqs_len)
				r_fract = round(r_count*100.00/seqs_len)
				s_fract = round(s_count*100.00/seqs_len)
				t_fract = round(t_count*100.00/seqs_len)
				u_fract = round(u_count*100.00/seqs_len)
				v_fract = round(v_count*100.00/seqs_len)
				w_fract = round(w_count*100.00/seqs_len)
				x_fract = round(x_count*100.00/seqs_len)
				y_fract = round(y_count*100.00/seqs_len)
				z_fract = round(z_count*100.00/seqs_len)
				###
				at_fract = a_fract + t_fract
				gc_fract = g_fract + c_fract
				atgc_fract = a_fract + t_fract + g_fract + c_fract
				###
				if seq_type == "DNA":
					out_stat.write(proper_id + '\t' + `seqs_len` + '\t' + `a_fract` + '\t' + `t_fract` + '\t' + \
							`g_fract` + '\t' + `c_fract` + '\t' + `at_fract` + '\t' + `gc_fract` + '\t' \
							+ `atgc_fract` + '\t' + 'N: ' + `n_fract` + '\t' + 'X: ' + `x_fract` + '\n')
				if seq_type == "prot":
					out_stat.write(proper_id + '\t' + `seqs_len` + '\t' + \
							`a_fract` + '\t' + `b_fract` + '\t' + \
							`c_fract` + '\t' + `d_fract` + '\t' + \
							`e_fract` + '\t' + `f_fract` + '\t' + \
							`g_fract` + '\t' + `h_fract` + '\t' + \
							`i_fract` + '\t' + `j_fract` + '\t' + \
							`k_fract` + '\t' + `l_fract` + '\t' + \
							`m_fract` + '\t' + `n_fract` + '\t' + \
							`o_fract` + '\t' + `p_fract` + '\t' + \
							`q_fract` + '\t' + `r_fract` + '\t' + \
							`s_fract` + '\t' + `t_fract` + '\t' + \
							`u_fract` + '\t' + `v_fract` + '\t' + \
							`w_fract` + '\t' + `x_fract` + '\t' + \
							`y_fract` + '\t' + `z_fract` + '\n')
				### STRING PROCESSING END ###
				# print seqs_len
				out_file.write('>' + proper_id + ' ' + good_name + '\n')
				out_file.write(have_seqs + '\n')
				####  END OF SUB_SEQ  ####
				out_tab.write(proper_id + '\t' + `seqs_len` + '\t' + have_seqs + '\n')
				##########################
			# out_file.write('>' + good_head + ' ' + `line_counter` + ' ' + long_tail + '\n')
			have_seqs = ""
			my_seqs = []
		if fasta_match != ">" and fasta_match != "":
			proper_id = good_head
			good_name = long_tail
			# have_seqs += t
			my_seqs.append(t)

	a_fract_tot = round(a_tot*100.00/tot_len)
	b_fract_tot = round(b_tot*100.00/tot_len)
	c_fract_tot = round(c_tot*100.00/tot_len)
	d_fract_tot = round(d_tot*100.00/tot_len)
	e_fract_tot = round(e_tot*100.00/tot_len)
	f_fract_tot = round(f_tot*100.00/tot_len)
	g_fract_tot = round(g_tot*100.00/tot_len)
	h_fract_tot = round(h_tot*100.00/tot_len)
	i_fract_tot = round(i_tot*100.00/tot_len)
	j_fract_tot = round(j_tot*100.00/tot_len)
	k_fract_tot = round(k_tot*100.00/tot_len)
	l_fract_tot = round(l_tot*100.00/tot_len)
	m_fract_tot = round(m_tot*100.00/tot_len)
	n_fract_tot = round(n_tot*100.00/tot_len)
	o_fract_tot = round(o_tot*100.00/tot_len)
	p_fract_tot = round(p_tot*100.00/tot_len)
	q_fract_tot = round(q_tot*100.00/tot_len)
	r_fract_tot = round(r_tot*100.00/tot_len)
	s_fract_tot = round(s_tot*100.00/tot_len)
	t_fract_tot = round(t_tot*100.00/tot_len)
	u_fract_tot = round(u_tot*100.00/tot_len)
	v_fract_tot = round(v_tot*100.00/tot_len)
	w_fract_tot = round(w_tot*100.00/tot_len)
	x_fract_tot = round(x_tot*100.00/tot_len)
	y_fract_tot = round(y_tot*100.00/tot_len)
	z_fract_tot = round(z_tot*100.00/tot_len)

	###
	at_fract_tot = a_fract_tot + t_fract_tot
	gc_fract_tot = g_fract_tot + c_fract_tot
	atgc_fract_tot = a_fract_tot + t_fract_tot + g_fract_tot + c_fract_tot
	###
	if seq_type == "DNA":
		out_stat.write("#TOTAL" + '\t' + `tot_len` + '\t' + `a_fract_tot` + '\t' + `t_fract_tot` + '\t' + \
				`g_fract_tot` + '\t' + `c_fract_tot` + '\t' + `at_fract_tot` + '\t' + `gc_fract_tot` + '\t' \
				+ `atgc_fract_tot` + '\t' + 'N: ' + `n_fract_tot` + '\t' + 'X: ' + `x_fract_tot` + '\n')
	if seq_type == "prot":
		out_stat.write("#TOTAL" + '\t' + `tot_len` + '\t' + \
				`a_fract_tot` + '\t' + `b_fract_tot` + '\t' + \
				`c_fract_tot` + '\t' + `d_fract_tot` + '\t' + \
				`e_fract_tot` + '\t' + `f_fract_tot` + '\t' + \
				`g_fract_tot` + '\t' + `h_fract_tot` + '\t' + \
				`i_fract_tot` + '\t' + `j_fract_tot` + '\t' + \
				`k_fract_tot` + '\t' + `l_fract_tot` + '\t' + \
				`m_fract_tot` + '\t' + `n_fract_tot` + '\t' + \
				`o_fract_tot` + '\t' + `p_fract_tot` + '\t' + \
				`q_fract_tot` + '\t' + `r_fract_tot` + '\t' + \
				`s_fract_tot` + '\t' + `t_fract_tot` + '\t' + \
				`u_fract_tot` + '\t' + `v_fract_tot` + '\t' + \
				`w_fract_tot` + '\t' + `x_fract_tot` + '\t' + \
				`y_fract_tot` + '\t' + `z_fract_tot` + '\n')

	in_file.close()
	out_file.close()
	out_stat.close()
	out_tab.close()

import math
import re
import sys
import string
if __name__ == "__main__":
	if len(sys.argv) <= 3 or len(sys.argv) > 4:
		print "Program usage: "
		print "input_file output_file DNA/prot"
		print "Script counts \"ATGC\" content in FASTA file"
		print "and generates tab-delimited file for mySQL db"
		exit
	if len(sys.argv) == 4:
		in_name  = sys.argv[1]
		out_name = sys.argv[2]
		seq_type = sys.argv[3]
		if in_name != out_name:
			Seqs_Drobilka(in_name, out_name, seq_type)
		else:
			print "Output should have different name than Input"
			exit