#!/usr/bin/python

####################################################
#                                                  #
#            AFFY PROBE SET SORTER                 #
#                                                  #
#       COPYRIGHT, ALEXANDER KOZIK, 2005           #
#                                                  #
####################################################

class _ProbeSetStats(object):
    """Running tallies for all probes that share one sequence ID."""

    __slots__ = ("count", "bad", "mod", "good", "perf",
                 "total", "total_bad", "total_good")

    def __init__(self):
        self.count = 0         # probes seen for this ID
        self.bad = 0           # probes with score <  first cutoff
        self.mod = 0           # probes with first cutoff < score < second cutoff
        self.good = 0          # probes with score >= first cutoff
        self.perf = 0          # probes with score >= second cutoff
        self.total = 0.0       # sum of all scores
        self.total_bad = 0.0   # sum of the "bad" scores
        self.total_good = 0.0  # sum of the "good" scores


def _decode_line(raw):
    """Return one line read in binary mode as native text, without its EOL.

    On Python 2 the line already is a ``str``; on Python 3 the bytes are
    decoded as latin-1, which maps every byte to one character and so can
    never fail.  Only a *trailing* "\\n" and a *trailing* "\\r" are removed.
    """
    if not isinstance(raw, str):
        raw = raw.decode("latin-1")
    if raw.endswith('\n'):
        raw = raw[:-1]
    if raw.endswith('\r'):
        raw = raw[:-1]
    return raw


def _to_bytes(text):
    """Return text in the form a file opened with mode "wb" accepts."""
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this is a no-op there.
    if isinstance(text, bytes):
        return text
    return text.encode("latin-1")


def _average(total, count):
    """Return total/count, or -1.0 as the "no data" marker when count is 0."""
    if count == 0:
        return -1.0
    return total / float(count)


def _good_stat(n_good):
    """Map the number of good probes of one ID to its Q_TEST star rating."""
    if n_good < 50:
        return "__*__"
    if n_good < 100:
        return "_*_*_"
    if n_good < 200:
        return "_***_"
    return "*****"


def _progress_line(lines_read, last_id):
    """Format the progress message shown while and after reading the input."""
    if last_id is None:
        last_id = "N/A"
    return str(lines_read) + "   -=- lines were processed -=-   " + "Seq ID: " + last_id


def _scan_probes(in_file, affy_cut, affy_cut2):
    """Read every line of in_file and tally the probe scores per sequence ID.

    Each non-blank line is split on tabs; field 1 is the sequence ID and
    field 8 is the probe score (both 0-based).  Blank lines are skipped.

    Returns a tuple ``(seq_list, stats, totals, lines_read, last_id)``:
    IDs in first-seen order, a dict ID -> _ProbeSetStats, a dict with the
    file-wide "good"/"bad"/"mod"/"perf" counts, the number of lines read and
    the ID of the last data line (None when there was none).
    """
    seq_list = []
    stats = {}
    totals = {"good": 0, "bad": 0, "mod": 0, "perf": 0}
    lines_read = 0
    last_id = None

    for raw in in_file:
        lines_read += 1
        line = _decode_line(raw)

        if line.strip():
            fields = line.split('\t')
            seq_id = fields[1]
            ps = float(fields[8])
            last_id = seq_id

            rec = stats.get(seq_id)
            if rec is None:
                rec = stats[seq_id] = _ProbeSetStats()
                seq_list.append(seq_id)

            rec.count += 1
            rec.total += ps

            if ps < affy_cut:
                rec.bad += 1
                rec.total_bad += ps
                totals["bad"] += 1
            if ps >= affy_cut:
                rec.good += 1
                rec.total_good += ps
                totals["good"] += 1
            # A score exactly equal to the first cutoff is "good" but is
            # counted neither as "mod" nor (unless >= second cutoff) "perf".
            if ps < affy_cut2 and ps > affy_cut:
                rec.mod += 1
                totals["mod"] += 1
            if ps >= affy_cut2:
                rec.perf += 1
                totals["perf"] += 1

        if lines_read % 100000 == 0:
            print(_progress_line(lines_read, last_id))

    return seq_list, stats, totals, lines_read, last_id


def _write_report(out_file, seq_list, stats):
    """Write the header and one summary row per ID; echo each ID to stdout."""
    columns = ("SEQUENCE_ID", "ALL", "BAD", "MOD", "GOOD", "PERF",
               "G_FR", "Q_TEST", "AVER", "AV_BAD", "AV_GOOD")
    out_file.write(_to_bytes('\t'.join(columns) + '\n'))

    for p, seq_id in enumerate(seq_list):
        rec = stats[seq_id]

        # Sanity check: every probe must have been classed as bad or good.
        if rec.count == rec.bad + rec.good:
            test_seq = "__OK__"
        else:
            test_seq = "ERROR"

        row = (
            seq_id,
            str(rec.count),
            str(rec.bad),
            str(rec.mod),
            str(rec.good),
            str(rec.perf),
            str(round(rec.good * 1.00 / rec.count, 2)),
            _good_stat(rec.good),
            str(round(_average(rec.total, rec.count), 2)),
            str(round(_average(rec.total_bad, rec.bad), 2)),
            str(round(_average(rec.total_good, rec.good), 2)),
        )
        out_file.write(_to_bytes('\t'.join(row) + '\n'))

        print(seq_id + "    N= " + str(rec.count) + "  -=-  " + str(p) + "     " + test_seq)


def Seqs_Extractor(in_name, out_name, affy_cut, affy_cut2, pause=3):
    """Summarise per-probe scores of a tab-delimited file by sequence ID.

    The input is read line by line; on each line field 1 (0-based) is the
    sequence ID and field 8 is the probe score.  No header line is expected:
    every non-blank line must have at least 9 fields and a numeric score.
    The summary is written to ``out_name + ".affy.tab"`` with one row per ID
    (in first-seen order) and the columns

        SEQUENCE_ID  ALL  BAD  MOD  GOOD  PERF  G_FR  Q_TEST  AVER  AV_BAD  AV_GOOD

    where BAD is score < affy_cut, GOOD is score >= affy_cut, MOD is
    affy_cut < score < affy_cut2, PERF is score >= affy_cut2, G_FR is
    GOOD/ALL, Q_TEST is a star rating of the GOOD count, and the three
    averages are rounded to 2 decimals (-1.0 when there is nothing to
    average).  Progress and totals are printed to stdout.

    Parameters:
        in_name   -- path of the input file.
        out_name  -- output path prefix; ".affy.tab" is appended.
        affy_cut  -- first cutoff (string or number convertible by float()).
        affy_cut2 -- second cutoff (string or number convertible by float()).
        pause     -- seconds to sleep at each of the four pauses that leave
                     the console output readable; 0 disables them.

    Raises:
        IOError/OSError -- if a file cannot be opened.
        ValueError      -- if a cutoff or a score is not numeric.
        IndexError      -- if a non-blank line has fewer than 9 fields.
    """
    banner = "====================================="
    print(banner)
    print("INPUT FILE (AFFY)    :   " + in_name)
    print("OUTPUT FILE          :   " + out_name)
    print("CUTOFF VALUE STRING  :   " + str(affy_cut))
    print("SECOND CUTOFF IS     :   " + str(affy_cut2))
    print(banner)

    affy_cut = float(affy_cut)
    affy_cut2 = float(affy_cut2)
    print("CUTOFF VALUE FLOAT   :   " + repr(affy_cut))
    print("SECOND CUTOFF FLOAT  :   " + repr(affy_cut2))
    print(banner)

    time.sleep(pause)

    # The input is opened first so that a missing input file does not leave
    # an empty output file behind.  Binary mode keeps the bytes untouched on
    # every platform; the "with" blocks close both files even on errors.
    with open(in_name, "rb") as in_file:
        with open(out_name + ".affy.tab", "wb") as out_file:
            seq_list, stats, totals, lines_read, last_id = _scan_probes(
                in_file, affy_cut, affy_cut2)

            print("")
            print("=================== LAST LINE =============================")
            print(_progress_line(lines_read, last_id))
            print("===========================================================")
            print("")
            time.sleep(pause)

            print("THERE ARE " + str(len(seq_list)) + " IDs IN SEQ LIST")
            print(" GOOD PROBES:  " + str(totals["good"]))
            print(" BAD  PROBES:  " + str(totals["bad"]))
            print("")
            time.sleep(pause)

            _write_report(out_file, seq_list, stats)

            print("THERE ARE " + str(len(seq_list)) + " IDs IN SEQ LIST")
            print(" GOOD PROBES:  " + str(totals["good"]))
            print(" BAD  PROBES:  " + str(totals["bad"]))
            print(" MOD  PROBES:  " + str(totals["mod"]))
            print(" PERF PROBES:  " + str(totals["perf"]))
            print("")
            time.sleep(pause)

##################
#                #
#   MAIN BODY    #
#                #
##################

import math
import re
import sys
import string
import time
import os

# Command-line entry point: expects exactly four arguments
# (input file, output prefix, first cutoff, second cutoff).
if __name__ == "__main__":
    if len(sys.argv) != 5:
        # Single-argument print(...) calls produce the same output on
        # Python 2 and Python 3.
        print("Program usage: ")
        print("input_file(AFFY) output_file cut_off(0.25) cut_off2(0.30)")
        # Non-zero status so that calling scripts can detect the usage error.
        sys.exit(1)
    in_name, out_name, affy_cut, affy_cut2 = sys.argv[1:5]
    Seqs_Extractor(in_name, out_name, affy_cut, affy_cut2)
### THE END ###

