#!/usr/bin/python
###########################################
# Author: Brian Chan (birdchan@ucdavis.edu)
# Supervisor: Alexander Kozik (akozik@atgc.org)
# Date: February 27 2004
# Description:
#
#
# This python script extracts data from the contigs input
#    data file. It can produce output file(s) in two ways.
#
# Notation: [something]* means "something" can appear many times.
#      {something}* means the same.
#
# ====================================================
#
# Input file format:
#
#    [comment lines]*
#    {
#     ********* Contig i **********
#     seqA
#           [seq1 is in seqA]*
#     seqB
#           [seq5 is in seqB]*
#    }*
#
#    DETAILED DISPLAY OF CONTIGS
#    {
#     ********** Contig i **********
#      {
#                  .    :    .    :    .     :
#       [seq_j     ATGAAATGACCCAGATATGGGGGGGCC...]*
#       -------------------------------------------
#       consensus  ATGAAATGACCCAGATATGGGGGGGCC...
#      }*
#    }*
#
# ====================================================
#
#  Input file Example
#
#    ******** Contig 1 ************
#    seqA
#         seq1 is in seqA
#         seq2 is in seqA
#    seqB
#         seq5 is in seqB
#         seq6 is in seqB
#         seq7 is in seqB
#
#    DETAILED DISPLAY OF CONTIGS
#
#    ********* Contig 1 ***********
#                .    :    .    :    .
#    seq1        ATGAAAAAAAAAAAAAAAAAGTTT
#    seq2        ATGAAAAAAAAAAAAAAAAAGTTTT
#    -------------------------------------
#    consensus   ATGAAAAAAAAAAAAAAAAAGTTTT
#
#    ********* Contig 2 ***********
#                .    :    .    :    .
#    seq5        GGGGGGGGGATTTTTCCCCC
#    seq6                 ATTTTTCCCCCCCCCA
#    -------------------------------------
#    consensus   GGGGGGGGGATTTTTCCCCCCCCCA
#
#                .    :    .    :    .
#    seq6        AAAATTTTTTTT
#    seq7          AATTTTTTTTTTTGGGG
#    --------------------------------------
#    consensus   AAAATTTTTTTTTTTGGGG
#
#
# ====================================================
#
#  Notice: In this script, everything before the line
#    "DETAILED DISPLAY OF CONTIGS" will be skipped.
#
# ====================================================
#
#  First output option:
#    One output file will be generated with the following format
#
#    [Contig_i   numOfSeq   [seqName(start_index,end_index)]*  ]*
#
#    The seqName's are separated by "|"
#
#    From the sample input file, the output would be
#
#    Contig1  2   seq1(1,24)|seq2(1,26)
#    Contig2  3   seq5(1,20)|seq6(10,37)|seq7(28,44)
#
# ====================================================
#
#  Second output option:
#    Files will be generated according to each contig. Users are
#      asked to provide file prefix, file extension, and the directory
#      name which all the files will be stored under.
#
#    Suppose the user inputs "con" for file prefix, "align" for
#      file extension, and "tmp_dir" for directory name.
#
#    After running this script with the sample input file, these are the
#      files under directory "tmp_dir"
#
#    con1.align
#    con2.align
#
#    In con1.align,
#
#                .    :    .    :    .
#    seq1        ATGAAAAAAAAAAAAAAAAAGTTT
#    seq2        ATGAAAAAAAAAAAAAAAAAGTTTT
#    -------------------------------------
#    Contig_1    ATGAAAAAAAAAAAAAAAAAGTTTT
#
#    In con2.align,
#                .    :    .    :    .    :    .     :     .
#    seq5        GGGGGGGGGATTTTTCCCCC
#    seq6                 ATTTTTCCCCCCCCCAAAAATTTTTTTT
#    seq7                                   AATTTTTTTTTTTGGGG
#    --------------------------------------------------------
#    Contig_2    GGGGGGGGGATTTTTCCCCCCCCCAAAAATTTTTTTTTTTGGGG
#
#    Notice the name "consensus" becomes "Contig_i". This "i" is the
#      same as the one in the file name.
#
#
############################################

import sys
import string
from string import *
from string import rstrip
import re
from os import makedirs
from os.path import isdir
from os import chdir

############################################
# global
seqNameMaxLength = 22

############################################

class seqClass:

	def __init__(self, name, start):
		self.name = name
		self.start = start
		self.seq = ""
	def append(self, start, subSeq):
		# I assume "start" is valid... (the seq is not broken)
		self.seq = self.seq + subSeq
	def getName(self):
		return self.name
	def print_info(self):
		print self.name, self.start, self.seq
	def output_info_1(self, bar):
		output_string(self.name)
		startStr = "%d" % (self.start + 1)  # since index starts at 1
		endStr = "%d" % (self.start + len(self.seq))
		if bar == 1:
			output_string("(" + startStr + "," + endStr + ")|")
		else:
			output_string("(" + startStr + "," + endStr + ")")
	def output_info_2_consensus(self, contig_ID):
		self.output_info_2_x(contig_ID)
	def output_info_2_uncond(self):
		self.output_info_2_x(" ")
	def output_info_2_x(self, contig_ID):
		name = ""
		if contig_ID == " ":
			name = self.name
		else:
			# name = "Contig_" + contig_ID
			name = "Contig" + contig_ID
		output_string(name)
		i = len(name)
		global seqNameMaxLength
		while i < self.start + seqNameMaxLength:
			output_string(" ")
			i = i + 1
		output_string(self.seq + "\n")
	def output_info_2(self):
		if find(self.name, "consensus") == -1:
			self.output_info_2_uncond()

#------------------------------

class seqHeapClass:

	def __init__(self):
		self.seqList = []
		self.seqNum = 0
	def set_contig_ID(self, lines, i):
		line = get_line(lines, i)
		star1, contig_str, ID, start2 = split(line)
		self.contig_ID = ID
	def get_contig_ID(self):
		return self.contig_ID
	def find_seq_index(self, name):
		for i in range(len(self.seqList)):
			if self.seqList[i].getName() == name:
				return i
		return -1
	def readinSeq(self, name, start, subSeq):
		i = self.find_seq_index(name)
		if i != -1:   # if seq exists
			# append it to the existing seq
			subSeqStr = split(subSeq)
			subSeqStr = subSeqStr[0]       # take out the white sp
			self.seqList[i].append(start, subSeqStr)
		else:
			# insert a new seq
			newSeq = seqClass(name, start) # make a new empty seq
			subSeqStr = split(subSeq)
			subSeqStr = subSeqStr[0]       # take out the white sp
			newSeq.append(start, subSeqStr)   # put stuff into it
			self.seqList.append(newSeq)
			self.seqNum = self.seqNum + 1
	def clear(self):
		self.seqList = []
		self.seqNum = 0
	def print_info(self):
		print self.get_contig_ID()
		for l in self.seqList:
			l.print_info()
	def output_info_1(self):
		output_string("Contig" + self.get_contig_ID())
		numOfSeqStr = "%d" % (self.seqNum - 1)
		output_string("\t" + numOfSeqStr + "\t")
		myList = []    # a list without the consensus seq
		for l in range(len(self.seqList)):  # build this list
			if find(self.seqList[l].getName(), "consensus") == -1:
				myList.append(self.seqList[l])
		for l in range(len(myList)):  # output this list
			if l != len(myList) - 1:
				myList[l].output_info_1(1)
			else:
				myList[l].output_info_1(0)
		output_string("\n")
	def output_dot_line(self, con_i):
		global seqNameMaxLength
		dotLen = len(self.seqList[con_i].seq)
		for i in range(seqNameMaxLength):
			output_string(" ")
		for i in range(dotLen):
			if (i+1) % 10 == 0:
				output_string(":")
			elif (i+1) % 5 == 0:
				output_string(".")
			else:
				output_string(" ")
		output_string("\n")
	def output_underscored_line_and_consensus(self, con_i):
		# output the underscored line
		global seqNameMaxLength
		conLen = len(self.seqList[con_i].seq)
		for i in range(seqNameMaxLength):
			output_string(" ")
		for i in range(conLen):
			output_string("-")
		output_string("\n")
		# output the consensus line
		self.seqList[con_i].output_info_2_consensus( self.get_contig_ID() )
	def output_info_2(self):
		global contig_prefix
		global output_suffix
		output_filename = contig_prefix + self.contig_ID + "." + output_suffix
		global fp_out
		fp_out = open(output_filename, "wb")
		# find where consensus is
		con_i = -1   # index of consensus
		for i in range(self.seqNum):
			if find(self.seqList[i].name, "consensus") != -1:
				con_i = i
		if con_i == -1:
			print "No consensus found in", self.get_contig_ID()
			sys.exit(0)
		# output the dot line
		self.output_dot_line(con_i)
		# output the seq's
		for i in range(self.seqNum):
			self.seqList[i].output_info_2()
		# output the underscored line and the consensus seq
		self.output_underscored_line_and_consensus(con_i)
		fp_out.close()


############################################
# global

debug_01 = 1       # print out line info

fp_in = 0          # file pointer for input
fp_out = 0         # file pointer for output
output_format = 0  # what kind of output

seqRow = 0         # which row are we at in each contig
seqNameLength = 22 # constant, length of a seq name
seqLength = 60     # constant, length of a seq line excluding the seq name
lineLength = seqNameLength + seqLength  # constant, length of a seq line
seqHeap = seqHeapClass()  # the obj that stores everything within a contig

contig_dirname = "CAP3_Alignments"
contig_prefix = "Contig"
output_suffix = "ali"

############################################

def read_contigs_info():
	global fp_in
	lines = fp_in.readlines()
	if not lines:
		print "No input in the input file"
		sys.exit(0)   # if no input, exit
	i = 0  # the line index

	# jump to the start line
	i = line_num_string(lines, i, "DETAILED DISPLAY OF CONTIGS")

	# jump to the next contig section
	i = line_num_string(lines, i, "Contig")
	next_contig_index = line_num_string(lines, i+1, "Contig")

	while i < len(lines)-1:

		global seqRow
		global seqHeap
		seqRow = 0
		seqHeap.clear()
		seqHeap.set_contig_ID(lines, i)

		while i < next_contig_index:
			i = i + 1   # the line after the contig line
			line = get_line(lines, i)
			if is_seq_data_line(line) == 1:
				if debug_01 == 1:
					print "line ", i, " is a seq_data_line"
				do_seq_data(line)
			elif is_consensus_line(line) == 1:
				if debug_01 == 1:
					print "line ", i, " is a consensus_line"
				do_consensus(line)
			elif is_blank_line(line) == 1:
				if debug_01 == 1:
					print "line ", i, " is a blank line"

		# show what we got
		#seqHeap.print_info()

		# output data into output file
		if output_format == 1:
			seqHeap.output_info_1()
		elif output_format == 2:
			seqHeap.output_info_2()

		# jump to the next contig section
		i = line_num_string(lines, i, "Contig")
		next_contig_index = line_num_string(lines, i+1, "*****")


# -----------------------------------------

def get_line(lines, index):
	line = lines[index]
	line = rstrip(line)   # get rid of \n
	return line

def line_num_string(lines, index, pattern):   # stop at the matched line
	i = index
	while 1:
		if i >= len(lines):
			return len(lines)-1
		line = get_line(lines, i)
		if find(line, pattern) != -1:
			break
		i = i + 1
	return i

def output_string(str):
	global fp_out
	fp_out.write(str)

def is_x_line(line, x):
	if find(line, x) != -1:
		return 1
	return 0

def is_seq_data_line(line):
	if not is_consensus_line(line) and not is_x_line(line, "*****") and len(line) > 0 and line[0] != " ":
		return 1
	else:
		return 0

def is_consensus_line(line):
	return is_x_line(line, "consensus")

def is_blank_line(line):
	if len(line) == 0:
		return 1
	return 0

def get_start_of_seq_index(seq):
	for i in range(len(seq)):
		if seq[i] != " ":
			return i
	print "Error in get_start_of_seq_index, i =", i
	sys.exit(0)

def do_seq_data(line):
	global seqHeap
	# the index factors row-wise
	global seqRow
	global lineLength
	global seqLength
	# the seq name and the whole seq
	global seqNameMaxLength
	seqName = line[0:seqNameMaxLength]
	seqName = split(seqName)
	seqName = seqName[0]
	seq = line[seqNameMaxLength:lineLength]
	# the index factor column-wise
	j = get_start_of_seq_index(seq)
	seqHeap.readinSeq(seqName, seqRow * seqLength + j, seq)

def do_consensus(line):
	# increment the seqRow counter
	global seqRow
	seqRow = seqRow + 1
	# read consensus line
	global seqHeap
	global seqNameMaxLength
	seqName = line[0:seqNameMaxLength]   # better be "consensus" ...
	seq = line[seqNameMaxLength:lineLength]
	seqHeap.readinSeq(seqName, 0, seq)


############################################
# main

# what output?
s = raw_input("What type of output do you want? (1/2): ")
if s != "1" and s!= "2":
	print "Unknown output style"
	sys.exit(0)
if s == "1":
	output_format = 1
if s == "2":
	output_format = 2

# ask for input
s = raw_input("Enter the SOURCE file name: ")
input_filename = s

if output_format == 1:
	s = raw_input("Enter the DESTINATION file name: ")
	output_filename = s

	# checking input
	if input_filename == output_filename:
		print "input file is the same as output file"
		sys.exit(0)

if output_format == 2:
	# s = sys.argv[1]
	# if s != "":
	# contig_dirname = s
	# else:
	# contig_dirname = "CAP3_Alignments"
	contig_dirname = raw_input("Enter the DESTINATION directory with alignments: ")

	print "Default contig file extension is", output_suffix
	s = raw_input("Enter the contig file extension :")
	if s != "":
		output_suffix = s
	else:
		output_suffix = "ali"

	print "Default contig file name prefix is", contig_prefix
	s = raw_input("Enter the contig file name prefix:")
	if s != "":
		contig_prefix = s
	else:
		contig_prefix = "Contig"


# open the input and output file
print "read data from the input file", input_filename, "..."
fp_in = open(input_filename, "rb")
if output_format == 1:
	fp_out = open(output_filename, "wb")
else:
	if not isdir(contig_prefix):
		makedirs(contig_dirname, 0755)   # for the output files
	chdir(contig_dirname)    # go into the output dir
read_contigs_info()
fp_in.close()
if output_format == 1:
	fp_out.close()
else:
	chdir("..")