#!/usr/bin/python

#######################################################################
#                                                                     #
# Author:      Elena Kochetkova and Alexander Kozik                   #
# Supervisor:                                                         #
# Date:        March 17, 2006                                         #
# Description: This program extracts specified substrings of a given  #
#              dna sequence and translates those to proteins...       #
#                                                                     #
####################################################################### 

###################################################
import os
import sys
import string
import re
import copy
from string import *
import time
import dna_to_prot
##################################################

global dummy_debug
global input_filename, output_filename
global chrom_prefix
global spec_prefix
global note_extract

dummy_debug = "FALSE"
# dummy_debug = "TRUE"

def ask_note_extract():
   global note_extract

   note_extract = "FALSE"
   yn_note_extract = strip(raw_input("\n\nDo you want to extract NOTE field from CDS [Y/N] :  "))
   if yn_note_extract == "Y" or yn_note_extract == "y":
	note_extract = "TRUE"
   return note_extract

def ask_chrom_prefix():
   global chrom_prefix

   chrom_prefix = strip(raw_input("\n\nEnter the [CHROMOSOME-SPECIES] PREFIX: "))
   return chrom_prefix

def ask_spec_prefix():
   global spec_prefix

   spec_prefix = strip(raw_input("\n\nEnter the SPECIES NAME: "))
   return spec_prefix

def ask_filename():
   global input_filename
   
   input = strip(raw_input("\n\nEnter the SOURCE file name: "))
   return input
 
def file_open_err(file_name):
   print "Error opening file ","\"",file_name,"\""

def would_you_like_prompt(prompt_line): 
   ans =""
   ans = lstrip(raw_input(prompt_line))
   while ans == "":
       print "\nEnter y or n."             
       ans = lstrip(raw_input(prompt_line))
   return ans

def fileExists(f):
   try:
      file = open (f)
   except IOError:
      exists = 0
   else:
      exists = 1
   return exists
 
def open_inputfile():
   global input_filename
   proper_name = 1   
   while proper_name != 0: 
     try:        
        # ask for input file name
        input_filename = ask_filename()
        # open the input file
        fp_in = open(input_filename, "rb")
        proper_name = 0
     except:
        file_open_err(input_filename) 
        prompt_line = "\nWould you like to continue (y/n)? "   
        ans = would_you_like_prompt(prompt_line)
        if ans[0] == "n" or ans[0] == "N":
           sys.exit(0)
   return fp_in  
 
def open_outputfile(ext):
   global input_filename, output_filename
   return_value_arr = []
   
   output_filename = input_filename + ext 
   print "Your default destination file is: ", output_filename 
   # open the output file
   if fileExists(output_filename) == 1:
      print  "\nFile",output_filename,"already exists.\n"
      prompt_line = "\nWould you like to overwrite it (y/n)? "
      ans = would_you_like_prompt(prompt_line)
      print "\n\n"
      if ans[0] == "n" or ans[0] == "N":  
          output_filename = lstrip(raw_input("\n\nEnter the DESTINATION file name: "))
          fp_out = open(output_filename, "wb")
      else:
          fp_out = open(output_filename, "wb") 
   else:
      try: 
         fp_out = open(output_filename, "wb")
      except:
         file_open_err(output_filename)
         print "\nTry again later...\n\n"
         sys.exit (1)  
   return_value_arr.append (fp_out) 
   return_value_arr.append (output_filename)
   return return_value_arr

def get_posns(str):
#   print str
   if str[0].isdigit():
      return str
   else:
     open_parenths_idx = str.find("(")
     if (open_parenths_idx != -1):
        close_parenths_idx = str.rfind(")")
        return get_posns(str[open_parenths_idx + 1:close_parenths_idx])
     else:
        return get_posns(str[1:])

def get_dir_pos(data_str):
  return_arr = []
  direction = ""
  dir_c = "" 

#  print data_str + '\n'

  direction = "forward"
  if re.match (".*complement\(", data_str) and not re.match (".*join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
     direction = "reverse"
  elif not re.match (".*complement\(", data_str) and re.match (".*join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
  elif re.match (".*complement.*\(join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
     direction = "reverse"
  else:
     positions = get_posns(data_str)
#     print positions + '\n'
  return direction, positions

def revcomp(dna):
   #reverse complement of a DNA sequence
   comp = dna.translate(maketrans("ATGCatgc", "TACGtacg"))
   lcomp = list(comp)
   lcomp.reverse()
   return join(lcomp, "")

def dna_translate(cdna, code = dna_to_prot.standard):
   """ translate a cDNA sequence to a protein """ 
   return "".join([ code.get(cdna[i:i+3], "X") for i in xrange(0,len(cdna),3) ])

def choose_format():
   print("\n\n")
   print("****************************************")    
   print("*   Please ascribe the order to the    *")
   print("*      following header elements:      *")
   print("*           gene_name (1 2 3)          *")
   print("*        protein_name (1 2 3)          *")  
   print("*          genbank_id (1 2 3)          *")
   print("****************************************")

   order1 = 0
   order2 = 0
   order3 = 0

   name = "GENE_NAME"
   order1 = valid_int_input(name, order2, order3)
    
   name = "PROTEIN_NAME"
   order2 = valid_int_input(name, order1, order3)
  
   name = "GENEBANK_ID"
   order3 = valid_int_input(name, order1, order2)

   order_list = []
   order_list.insert(0, int(order1)-1)  
   order_list.insert(1, int(order2)-1)
   order_list.insert(2, int(order3)-1) 
   return order_list 

def make_header(order_list, gene_name, protein_name, genbank_id):
   header_order_list = []
   for i in range(len(order_list)):
      header_order_list.insert(i, " ")
   header_order_list[int(order_list[0])]= gene_name  
   header_order_list[int(order_list[1])]= protein_name
   header_order_list[int(order_list[2])]= genbank_id

   header = ""
   for j in range(len(header_order_list)):
      header = header + header_order_list[j]+ " "
   return header, header_order_list[0]

def make_header_tab(order_list, gene_name, protein_name, genbank_id):
   header_order_list = []
   for i in range(len(order_list)):
      header_order_list.insert(i, " ")
   header_order_list[int(order_list[0])]= gene_name  
   header_order_list[int(order_list[1])]= protein_name
   header_order_list[int(order_list[2])]= genbank_id

   header_tab = ""
   for j in range(len(header_order_list)):
      header_tab = header_tab + header_order_list[j] + "\t"
   return header_tab

def valid_int_input(name, num1, num2):
   int_input = 1
   
   while int_input != 0: 
      order = strip(raw_input("\n\nEnter the sequence order for " + name + ": "))
      try:          
         test = string.atoi(order)
         if (int(order) > 3) or (int(order) < 1):
            print ("Enter number 1-3...")
         else:
            if (order == num1) or (order == num2):
               print("Number was previously selected...")
            else: 
               int_input = 0
      except ValueError:
         print "\nNot a number...\n" 
   return order


def duplication_option_menu():
   print("\n\n")
   print("****************************************")    
   print("*   In case of duplicated sequences,   *")
   print("*      would you like to extract       *")
   print("*                                      *")
   print("*       1. longest sequence            *")
   print("*       2. first sequence              *")  
   print("*                                      *")
   print("****************************************")
   int_input = 1

   while int_input != 0: 
      answer = strip(raw_input("\n\nEnter your answer: "))
      try:
         test = string.atoi(answer)
         if (int(answer) > 2) or (int(answer) < 1):
            print ("Enter either 1 or 2...")
         else: 
            int_input = 0
      except ValueError:
         print "\nNot a number...\n" 
   return answer


def get_data(fp_in, fp_out):
   
   dna_beg_pos = 0; data_lines = 0; format_cntr = 0
  
   data_array = []; dna_array = []; pos_array = []
  
   while 1:
      data  = fp_in.readline()    
      data_array.append(data)
      # if re.match ("^( {5}(((CDS)|(mRNA)|(tRNA)|(gene)|(misc_feature)|(misc_RNA)|(snoRNA)|(snRNA)|(repeat_region)).*)|ORIGIN)", data_array[data_lines]):
      if re.match ("^( {5}((([a-z])|([A-Z])).*)|ORIGIN)", data_array[data_lines]):
         pos_array.append(data_lines)
      if format_cntr == 0 or format_cntr == 1000:
         print "READING:     Processing line "+str(data_lines)+"... Please wait..." 
         format_cntr = 0
      format_cntr +=1  

      # match the beginning of dna sequence
      if re.match("^ORIGIN", data):
         dna_beg_pos = data_lines + 1
      
      if dna_beg_pos > 0 and data_lines >= dna_beg_pos:
         new_line = string.translate(data_array[data_lines],id,nonAZStr)
         dna_array.append(new_line)
         
      # match the end of dna sequence 
      if re.match("//", data):
         break
      data_lines += 1

      if not data:   # end of file   
         break
      
   if dna_beg_pos == 0:
      fp_out.write('WARNING --- No "ORIGIN" found!') 
      sys.exit(0)

   del data_array[dna_beg_pos:]  
   return dna_array, data_array, pos_array


def defRegionOfInterest(data_array, pos_array):
   region_id_array = []
   CDS_array = []
   gene_array = []
   format_cntr = 0
  
   for count in range(len(pos_array)):
      if count > 0:
         if re.match ("^ {5}(CDS)", data_array [pos_array[count-1]]):
            CDS_str = ''.join (data_array[pos_array[count-1]: pos_array[count]])
            ### TEST FOR LOCUS_TAG ###
            test_cds_string = re.sub("\n", " |N| ", CDS_str)
            test_cds_string = re.sub("\r", " |R| ", test_cds_string)
            if not re.match (".*/locus_tag=", test_cds_string):
               print "++++++++++++++++++++++++++++++++++++++++++"
               print "         CDS REGION IGNORED : \n"
               print CDS_str
               print ""
               print "     IT DOES NOT HAVE LOCUS_TAG FIELD     "
               print "++++++++++++++++++++++++++++++++++++++++++"
               time.sleep(3)
            if re.match (".*/locus_tag=", test_cds_string):
               CDS_array.append(CDS_str)
         if re.match ("^ {5}(gene)", data_array [pos_array[count-1]]):
            gene_str = ''.join (data_array[pos_array[count-1]: pos_array[count]])
            gene_array.append(gene_str)
         
      if format_cntr == 0 or format_cntr == 1000:
         print "SCANNING REGIONS:     Processing line "+str(count)+"... Please wait..." 
         format_cntr = 0
      format_cntr +=1
   del data_array[0:]
   del pos_array[0:]
   return CDS_array, gene_array

def get_gene_name_desc(gene_array):

   global note_extract
   gene_info_array = []
   format_cntr = 0
   substr = []
   first_warning = 0
   
   id = string.maketrans("","")
   
   for k in range (len(gene_array)):
      gname_flag = 0
      gene_info = []
      warning_msg = []
      
      fp_out9.write(gene_array[k])

      substr = string.split(gene_array[k], '  /')
      for k1 in range (len(substr)):

         ### TEST FOR locus_tag PATTERN ###
         gene_array_test = re.sub("\n", " |N| ", gene_array[k])
         gene_array_test = re.sub("\r", " |R| ", gene_array_test)
         if not re.match (".*/locus_tag=",gene_array_test):
            print ""
            print "LOCUS_TAG WAS NOT FOUND AT:  " + "\n" + gene_array_test
            print " ... CAN NOT ... "
            print "  ..  EXIT ..    "
            sys.exit()
            

         ### EXTRACTION OF locus_tag DATA ###
         if re.match ("locus_tag=", substr[k1]):
            gn_g = string.split(substr[k1], '\"')
            gene_name_gene = gn_g[1]
         
            if len(gene_info_array) > 0:
               if gene_name_gene == gene_info_array[-1][0]:
                  warning_msg.append("WARNING: " + gene_name_gene + " duplicated gene name in GENE region\n")

            temp = string.split(substr[0], "gene") 
            fixed =  strip(string.translate(temp[1],id,'\n''>''<'))
            
            if (get_posns(fixed)):
               positions = get_posns(fixed)
            pos_bounds = string.split(positions, "..")

            ### IGNORE IT - WE DO NOT EXTRACT DESCRIPTION HERE ##
            if note_extract == "TRUE":
               desc = " "
            if note_extract == "FALSE":
               desc = " "

            ### NEVER MIND AND IGNORE IT ###
            at_name_gene = gene_name_gene

            gene_info.append(gene_name_gene)
            gene_info.append(at_name_gene)
            gene_info.append(desc)
            gene_info.append(pos_bounds[0])
            gene_info.append(pos_bounds[-1])

            if dummy_debug == "TRUE":
               print ""
               print "    K == " + `k`
               print "======================================================================"
               print substr
               print " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
               print gene_name_gene + "   -=- gene_name_gene "
               print at_name_gene + "   -=- AT name gene "
               print desc + "    .... DESCRIPTION .... "
               print positions + " _____POSITIONS______  "
               print "======================================================================"
               print ""
               time.sleep(0) 

      msg_str = ''.join(warning_msg)
      if len(msg_str) > 0:
         if first_warning == 0:
            fp_out8.write("************************\n")
            fp_out8.write("* GENE REGION WARNINGS *\n")
            fp_out8.write("************************\n\n") 
         fp_out8.write(msg_str)
         first_warning = 1
            
      if len(gene_info)>0: 
         gene_info_array.append(gene_info)


      if format_cntr == 0 or format_cntr == 1000:
         print "GETTING INFO ON GENE REGION:     Processing line " + str(k) + "... Please wait..." 
         format_cntr = 0
      format_cntr +=1
   if first_warning == 1:  
      fp_out8.write('\n\n\n')
   del gene_array[0:]
   return gene_info_array

   
def put100charsPerLine (length, string, file):

   global input_filename

   line_begin  = 0
   line_end    = 100
   count = 0
   format_cntr = 0

   fp_out1.write(">" + input_filename + "\n")

   while length > 0:
      substr = string [line_begin:line_end]
      file.write(substr+'\n')
      line_begin += 100
      line_end   += 100
      length -=100
      if format_cntr == 0 or format_cntr == 1000:
         print "WRITING:     Processing line " + str(count) + "... Please wait..."
         format_cntr = 0
      count += 1
      format_cntr += 1

def get_CDS_region_desc(CDS_array, duplication_option):
   
   cds_info_array = []
   dupl_array = []
   dupl_count = 0
   unique = 0
   format_cntr = 0

   id = string.maketrans("","")

   for l in range (len(CDS_array)):
      fp_out4.write(CDS_array[l])
      ## desc = ""
      gene_name_cds = "DUMMY_GENE_NAME"
      protein_id    = "DUMMY_PROTEIN_ID"
      genbank_id    = "DUMMY_GENBANK_ID"
      desc          = "NO_DESCRIPTION_FOUND "
      substr = []
      cds_join_list = []
      substr = string.split(CDS_array[l], '  /')
      for l1 in range (len(substr)):

         ### TEST FOR locus_tag PATTERN   ###
         CDS_array_test = re.sub("\n", " |N| ", CDS_array[l])
         CDS_array_test = re.sub("\r", " |R| ", CDS_array_test)
         if not re.match (".*/locus_tag=", CDS_array_test):
            print ""
            print "LOCUS TAG WAS NOT FOUND : " + "\n" + CDS_array_test
            print "  ... CAN NOT ...  "
            print "  ___ EXIT ____    "
            sys.exit()

         if not re.match (".*/protein_id=", CDS_array_test):
            print "PROTEIN ID WAS NOT FOUND FOR : " + "\n" + CDS_array_test
            protein_id = "DUMMY_PROTEIN_ID"
            time.sleep(3)
         if not re.match (".*/db_xref=\"GI:", CDS_array_test):
            print "NCBI GI ID WAS NOT FOUND FOR : " + "\n" + CDS_array_test
            genbank_id = "DUMMY_GENBANK_ID"
            time.sleep(3)

         ### EXTRACTION OF locus_tag DATA ###
         if re.match ("locus_tag=", substr[l1]):
            gnc = string.split(substr[l1], '\"')
            gene_name_cds = gnc[1]
            # print substr
            print gene_name_cds + "  +++ LOCUS NAME FROM CDS"
         if re.match ("protein_id=", substr[l1]):
            prot_id = string.split(substr[l1], '\"')
            protein_id = prot_id[1]
         if re.match ("db_xref=\"GI:", substr[l1]):
            gb_id1 = string.split(substr[l1], ':')
            gb_id2 = string.split(gb_id1[1], '\"')
            genbank_id = gb_id2[0]
         if re.match ("note=", substr[l1]): 
            cds_desc = string.split(substr[l1], 'note="')
            desc = strip(string.split(cds_desc[1], '  /')[0])
            desc = re.sub(' {1,}', ' ', desc)
            desc = string.translate(desc,id,'\n''\r')
            desc = re.sub (";.*", "", desc)
            if desc[-1] == "\"":
               desc = desc[:-1]
            desc = desc + " " 
         # print substr
         # print gene_name_cds
      cds_join_list.append (substr[0])
   
      # join broken strings to retrieve numbers within the brackets
      id = string.maketrans("","")
      joined = string.translate(''.join(cds_join_list), id, '\n')
 
      direction = ""
      pos_line  = ""
      warning_msg = []

      direction, pos_line = get_dir_pos(joined)
      pos_subline = string.translate(pos_line, id, ' ' '\r')
      # print pos_subline

      positions = re.sub ("\.\.", "-", pos_subline) 
      positions = re.sub (",", "|", positions)

      if re.match(".*>", positions) or re.match("<.*", positions):
         # WARNING ABOUT > and <
         print "============================================="
         print "  WEIRD COORDINATES -=- CASE 1 :             "
         print positions
         print "============================================="
         time.sleep(3)
         warning_msg.append ("WARNING: " + positions +"contains > or <\n")

      positions = re.sub (">", "", positions)
      positions = re.sub ("<", "", positions)

      pos_pairs = string.split (pos_subline, ",")
      pos_arr = []
      pos_array = []
   
      for m in range (len(pos_pairs)):
         if re.match (".*\.\.", pos_pairs[m]): 
            pair_split = string.split (pos_pairs[m], "..")
            if len(pair_split) == 2:
               pos_arr.append(pair_split[0])
               pos_arr.append(pair_split[1])
         else:
            pos_arr.append(pos_pairs[m])
            pos_arr.append(pos_pairs[m])
            warning_msg.append("WARNING: " + pos_pairs[m] + " does not have a pair\n")
            
      for l2 in range(len(pos_arr)):
         # if re.match(".*<", pos_arr[l2])or re.match(".*>", pos_arr[l2]):    
         if re.match(".*<", pos_arr[l2]) or re.match(".*>", pos_arr[l2]):
            # WARNING ABOUT > and <
            print "==============================================="
            print pos_arr[l2] + "  WEIRD COORDINATES -=- CASE 2 : "
            print positions
            print "==============================================="
            time.sleep(3)
            # warning_msg.append ("WARNING: " + pos_arr[l2] + "contains > or <\n")
            warning_msg.append ("WARNING: " + positions + "contains > or <\n")
            #get rid of > and <
            pos_array.append(int(string.translate(pos_arr[l2], id, '>' '<')))
         else:
            pos_array.append(int(pos_arr[l2]))

      cds_info = []
      cds_info.append(gene_name_cds)
      cds_info.append(protein_id)
      cds_info.append(genbank_id)
      cds_info.append(direction)
      cds_info.append(pos_array)
      cds_info.append(positions)
      cds_info.append(warning_msg)
      cds_info.append(desc)
 
      if l == 0:
         cds_info_array.append(cds_info)
            
      if l > 0:
         if len(dupl_array) > 0:
            temp_gene_name = dupl_array[-1][0]
         else:
            temp_gene_name = cds_info_array[len(cds_info_array)-1][0]
         if (gene_name_cds == temp_gene_name ):
            unique = 1   # not unique
            if dupl_count == 0:
               warning_msg.append("WARNING: Gene name " + gene_name_cds + " is not UNIQUE\n")
               dupl_array.append(cds_info_array[l-1])
               del cds_info_array[-1]
               dupl_array.append(cds_info)
            else:
               dupl_array.append(cds_info)
            dupl_count += 1
         else:
            if unique == 1:
               dupl_count = 0
               if int(duplication_option) == 1: 
                  idx = find_longest(dupl_array)
               else:
                  idx = 0
               for k in range(len(dupl_array)):
                  if k == idx:
                     dupl_array[k].append("UNIQUE_SET")
                     
                  else:
                     dupl_array[k].append("DUPLIC_SET")
                  cds_info_array.append(dupl_array[k])
               del dupl_array[0:]
               unique = 0
               cds_info_array.append(cds_info)
            else:
               cds_info_array.append(cds_info)
               
      if format_cntr == 0 or format_cntr == 1000:
         print "GETTING INFO ON CDS  REGION:     Processing line " + str(l) + "... Please wait..." 
         format_cntr = 0
      format_cntr +=1
        
   del CDS_array[0:]
   return cds_info_array


def find_longest(dupl_array):
   
   length_array =  []
   pos_array = []

   for i in range (len(dupl_array)):
      pos_array.append(dupl_array[i][4])
      j = 1
      length = 0
      while j < len(pos_array[i]):
         length += (int(pos_array[i][j]) + 1) - int(pos_array[i][j-1])
         j +=2
      length_array.append(length)
      if i == 0:
         longest = length_array[i]
         longest_idx = i 
      if i > 0:
         if length_array[i] > longest:
            longest = length_array[i]
            longest_idx = i
   return longest_idx

         

###################################################
#                    main                         #
###################################################

try:
   chr_id = sys.argv[1]
except:
   chr_id = "CHR"

try:
   prfx_id = sys.argv[2]
except:
   prfx_id = ""

fp_in = open_inputfile() 

spec_prefix  = ask_spec_prefix()
chrom_prefix = ask_chrom_prefix()
note_extract = ask_note_extract()

print "PREFIX   =  " + chrom_prefix
print "EXTRACT DESCRIPTION  =  " + note_extract

print "\n\n\nreading data from the input file" , input_filename, "...\n\n"

fd1 = open_outputfile(".out")
fp_out1 = fd1[0]
fd2 = open_outputfile(".join")
fp_out2 = fd2[0]
fd2C = open_outputfile(".xclean.cds_dna.fasta")
fp_out2C = fd2C[0]
fd3 = open_outputfile(".protein")
fp_out3 = fd3[0]
fd3C = open_outputfile(".xclean.protein.fasta")
fp_out3C = fd3C[0]
fd4 = open_outputfile(".cds")
fp_out4 = fd4[0]
fd5 = open_outputfile(".position")
fp_out5 = fd5[0]
fd6 = open_outputfile(".500")
fp_out6 = fd6[0]
fd7 = open_outputfile(".inbetween")
fp_out7 = fd7[0]
fd8 = open_outputfile(".msg")
fp_out8 = fd8[0]
fd9 = open_outputfile(".gene")
fp_out9 = fd9[0]
fd10 = open_outputfile(".protein.dupl")
fp_out10 = fd10[0]
# fd12 = open_outputfile(".position.gi")
# fp_out12 = fd12[0]

fd77 = open_outputfile(".xtest")
fp_out77 = fd77[0]

if prfx_id != "":
   fd13 = open_outputfile(".transl.fasta")
   fp_out13 = fd13[0]

#test
output11 = input_filename + ".test"
fp_out11 = open(output11, "wb")

order_list = choose_format()
# 1 - longest, 2 - first
duplication_option = duplication_option_menu()

id = string.maketrans("","") # No actual translation
nonAZList = [chr(x) for x in range(256) if not ("a"<=chr(x)<="z" or "A"<=chr(x)<="Z")]
nonAZStr="".join(nonAZList)

dna_array  = []; data_array = []; pos_array =[]

#dna_array contains only dna sequence
#data_array contains information about all regions (CDS, gene, mRNA)
dna_array, data_array, pos_array = get_data(fp_in, fp_out1)

fp_in.close() # close the input file

CDS_array = []; gene_array = []
CDS_array, gene_array = defRegionOfInterest(data_array, pos_array)

dna_str = ''.join (dna_array) 
#fp_out11.write (dna_str)

### NNNNNNNNNNNNNNNNNN ###
non_atgc_list = [ 'B', 'D', 'E', 'F', 'H', 'I', 'J', 'K', 'L', 'M', \
                  'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z' ]
dna_str = string.upper(dna_str)
for dummy_letter in non_atgc_list:
   dna_str = re.sub(dummy_letter, "N", dna_str)
dna_str = string.lower(dna_str)
##########################

# retrieving information from gene region
# retrieved is a list of tuples
# first element is gene name, second - at_name_gene,third - description
# fourth - beginning of gene interval, fifth - the end
gene_info_array = []
gene_info_array = get_gene_name_desc(gene_array)

CDS_info_array = []
CDS_dupl_array = []

# CDS_info_array is a list of lists:
# 1st element is gene_name_cds, then - protein_id, genbank_id,
# direction, pos_arr, positions, warning, [duplication_desc]
CDS_info_array = get_CDS_region_desc(CDS_array, duplication_option)

dna_modif = []
last_from_before = 0
first_warning = 0
format_cntr = 0
m = 0
at_gname_arr = []

# processing information from CDS region
for l in range(len(CDS_info_array)):
   desc = "NO_DESCRIPTION_FOUND"
   warning_msg = []
   gene_name_cds = CDS_info_array[l][0]
   protein_id = CDS_info_array[l][1]
   genbank_id = CDS_info_array[l][2]
   direction = CDS_info_array[l][3]
   pos_arr = CDS_info_array[l][4]
   positions = CDS_info_array[l][5]
   warning = ''.join(CDS_info_array[l][6])
   if warning != "":
      warning_msg.append(warning)
   desc_cds = CDS_info_array[l][7]

   dupl_status = "SINGLE_SPLICE_VAR"

   if len(CDS_info_array[l]) == 8:
      unique = 0 
      dupl_desc = 'UNIQUE_SET'
   elif len(CDS_info_array[l]) == 9:
      dupl_status = "MULTIPLE_SPLICE_VAR"
      dupl_desc = CDS_info_array[l][8]
      if dupl_desc == 'UNIQUE_SET':
          unique = 0
      else:
          unique = 1

   # check if gene name in gene regions matches the one in CDS region
   gene_name_gene = gene_info_array[m][0]
   if gene_info_array[m][1] != "":
      at_name_gene = gene_info_array[m][1]
   else:
      at_name_gene = gene_info_array[m][0]   
   desc = desc_cds
   if note_extract == "FALSE":
       desc = " "
   if desc[-1] == "\"":
       desc = desc[:-1]
   gene_beg_interval = int(gene_info_array[m][3])
   gene_end_interval = int(gene_info_array[m][4])
   gene_name_match = 0

   header, first_in_header = make_header(order_list, gene_name_cds, protein_id, genbank_id)
   header_tab = make_header_tab(order_list, gene_name_cds, protein_id, genbank_id)
   
   if ((pos_arr[0] >= gene_beg_interval) and (pos_arr[len(pos_arr)-1 ] <= gene_end_interval)) \
      and (gene_name_gene == gene_name_cds):
      fp_out11.write("*" + gene_name_gene + " == " + gene_name_cds+"\n")
      gene_name_match = 1
      
   else :
      fp_out11.write("WARNING: gene name/interval mismatch: "+gene_name_gene+" (in gene region) and "+gene_name_cds+" (in CDS region)\n")
      temp_m = m
      while ((m > 0)) and gene_name_match != 1:
         m -= 1
         fp_out11.write("m counts down m = "+str(m)+"\n")
         if (gene_info_array[m][0] == gene_name_cds):
            if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])):
               fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds + "\n")
               gene_name_gene = gene_info_array[m][0]
               if gene_info_array[m][1] != "":
                  at_name_gene = gene_info_array[m][1]
               else:
                  at_name_gene = gene_info_array[m][0]
               # desc = gene_info_array[m][2]
               gene_name_match = 1
      if gene_name_match == 0:
         m = temp_m
         while ((m < len(gene_info_array)-1)) and gene_name_match != 1:
            m += 1
            fp_out11.write("m counts up m = "+str(m)+ "\n")
            if (gene_info_array[m][0] == gene_name_cds):               
              if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])):
                  fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds+ "\n")
                  gene_name_gene = gene_info_array[m][0]
                  if gene_info_array[m][1] != "":
                     at_name_gene = gene_info_array[m][1]
                  else:
                     at_name_gene = gene_info_array[m][0]
                  # desc = gene_info_array[m][2]
                  gene_name_match = 1
      if gene_name_match == 0:
         m = temp_m
         gene_name_gene = "_None_"
         at_name_gene = "_None_"
   m += 1
   fp_out11.write("*********************************\n")

   if unique == 0:
       at_gname_arr.append(first_in_header)
   
   l3 = 1
   l4 = 0
   beg_dna_seq    = []
   end_dna_seq    = []
   subseq_dna     = []
   beg_in_between = []
   end_in_between = []
   dna_str_modif  = []
   temp_dna_str   = ''
   substr_arr     = [] 

   # while not last position in a position array
   while l3 <= len(pos_arr):
      beg_dna_seq.append (pos_arr[l3-1])
      end_dna_seq.append (pos_arr[l3])
      # capitalized substring corresponding to specified positions -> .join
      subseq_dna.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))

      #  get 500 characters before the very first valid subsequence in a whole DNA sequence
      if (l4 == 0) and (l == 0):
         substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1])
         substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         dna_str_modif_zero = dna_str[:int(beg_dna_seq[l4])-1]
         dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         
      #lowercase sequence in between 2 genes 
      #(extact -500 before first valid subsequense in each CDS -> substr_arr) 
      elif (l4 == 0) and (l != 0):
         substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1])
         substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         if unique == 0:
            #if end position of one gene overlaps with the beginning position of another
            if last_from_before >= int(beg_dna_seq[0]):
               warning_msg.append ("WARNING: " + CDS_info_array[l-1][0] + "-" + gene_name_cds + ":  overlapping sequence fragment\n")
               dna_str_modif.append(string.upper(dna_str[last_from_before+1:int(end_dna_seq[l4])]))
            else:
               dna_str_modif.append(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1])
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
  
            fp_out7.write('>' + at_gname_arr[0]+ '_' + first_in_header + ' [ ' + CDS_info_array[l-1][0] + '_' + gene_name_cds + ' ] '\
                          + string.upper(CDS_info_array[l-1][3][0]) + string.upper(direction[0]) + '\n')
            fp_out7.write(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1] +'\n')
            del at_gname_arr[0]
            last_from_before = 0
      #lowercase sequence in between 2 position pairs
      elif (l4 > 0):
               substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1])         
               substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
               dna_str_modif.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1])
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))

      else:   
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
               substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
      l4 = l4+1
      l3 = l3+2

   # extact +500 after last valid subsequense in each CDS -> substr_arr
   substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(end_dna_seq[l4-1])+500])
   #data for .500 file
   join_substr = ''.join (substr_arr)
   if (l == len(CDS_info_array)-1):
         dna_str_modif_last = dna_str[int(end_dna_seq[l4-1]):]

   if unique == 0:
      last_from_before = int(end_dna_seq[l4-1])-1
      # temp_dna_str = ''.join (dna_str_modif)
      dna_modif.append(''.join(dna_str_modif))

      if direction == "reverse":  
          rev_join_substr = revcomp(join_substr)
          if len(CDS_info_array[l]) == 9:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + \
                               dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          else:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + \
                               dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          fp_out6.write(rev_join_substr + '\n') 
      else:
          if len(CDS_info_array[l]) == 9:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + \
                               dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          else:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + \
                               dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          fp_out6.write(join_substr + '\n')
   else:
      if direction == "reverse":  
          rev_join_substr = revcomp(join_substr)
          fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] "+ \
                            dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          fp_out6.write(rev_join_substr + '\n') 
      else:
          fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + \
                            dupl_desc + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
          fp_out6.write(join_substr + '\n')

   #data for .join file
   dna_substr = ''.join(subseq_dna)

   if direction == "reverse":   
      rev_comp_dna = revcomp(dna_substr) 
      prot_str = dna_translate (rev_comp_dna)  
   else:
      prot_str = dna_translate (dna_substr)  

   fasta_file_comments = ""

   if len(dna_substr)%3 != 0:
      warning_msg.append ("WARNING: String length is not divisible by 3"+'\n')
   stop_cod_num = string.count(prot_str, "*")	          # count stop codons	           
   if stop_cod_num == 0:
      warning_msg.append ("WARNING: NO STOP CODON"+'\n')
      fasta_file_comments = " { WARNING: NO_STOP_CODON } "
   if (stop_cod_num == 1) and (prot_str[len(prot_str)-1] != '*') :
      warning_msg.append ("WARNING: PREMATURE STOP CODON"+'\n')
      fasta_file_comments = " { WARNING: PREMATURE_STOP_CODON } "
   if stop_cod_num >=2 :
      warning_msg.append ("WARNING: >1 STOP CODONS"+'\n')
      fasta_file_comments = " { WARNING: MULTIPLE_STOP_CODONS } "

   #find middle position
   mid_pos = int((int(beg_dna_seq[0])+ int(end_dna_seq[len(end_dna_seq)-1]))/2)
   chrom_pos_start = str(int(beg_dna_seq[0]))
   chrom_pos_end   = str(int(end_dna_seq[len(end_dna_seq)-1]))
   chrom_pos_mid   = str(mid_pos)

   if unique == 0:
      # fp_out5.write(chrom_prefix+'\t'+gene_name_cds +'\t'+protein_id +'\t'+'CDS'+'\t'+direction+'\t'+ \
      #              genbank_id+'\t'+str(mid_pos)+'\t'+positions+'\n')
      fp_out5.write(spec_prefix + '\t' + chrom_prefix + '\t' + header_tab +'CDS' + '\t' + direction + '\t' + \
                    str(mid_pos) + '\t' + positions + '\n')
      if prfx_id != "":
         spacer_id = "_"
      if prfx_id == "":
         spacer_id = ""
      # fp_out12.write(chrom_prefix + '\t' + genbank_id + '\t' + gene_name_cds + '\t' +direction+'\t' + \
      #              str(mid_pos) + '\t' + positions + '\n')
      fp_out3.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                        + fasta_file_comments + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
      fp_out3.write(prot_str +"\n")

      if fasta_file_comments == "":
         fp_out3C.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                            + dupl_status \
                            + " { CHROM_POS_START:" + chrom_pos_start + " CHROM_POS_END:" + chrom_pos_end \
                            + " } -=| MIDDLE_POINT:" + chrom_pos_mid + " |=| " + "CHROM_ID:" + chrom_prefix \
                            + " |=| [( " + spec_prefix + " )] " + "|=- \n")
         fp_out3C.write(prot_str + "\n")

      # if prfx_id != "":
      #   fp_out13.write(">" + prfx_id + "_" + genbank_id + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
      #   fp_out13.write(prot_str + "\n")

      if len(CDS_info_array[l]) == 9:
         fp_out10.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                            + dupl_desc + fasta_file_comments + " " + dupl_status + " [( " + spec_prefix + " )] " + '\n')
         fp_out10.write(prot_str +"\n")
         
      if direction == "reverse":   
         rev_dna_substr = revcomp(dna_substr)
         fp_out2.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                           + fasta_file_comments + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
         fp_out2.write(rev_dna_substr +'\n')
         if fasta_file_comments == "":
            fp_out2C.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                               + dupl_status \
                               + " { CHROM_POS_START:" + chrom_pos_start + " CHROM_POS_END:" + chrom_pos_end \
                               + " } -=| MIDDLE_POINT:" + chrom_pos_mid + " |=| " + "CHROM_ID:" + chrom_prefix \
                               + " |=| [( " + spec_prefix + " )] " + "|=- \n")
            fp_out2C.write(rev_dna_substr + '\n') 
      else:
         fp_out2.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                           + fasta_file_comments + " " + dupl_status + " [( " + spec_prefix + " )] " + "\n")
         fp_out2.write(dna_substr + '\n')
         if fasta_file_comments == "":
            fp_out2C.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                               + dupl_status \
                               + " { CHROM_POS_START:" + chrom_pos_start + " CHROM_POS_END:" + chrom_pos_end \
                               + " } -=| MIDDLE_POINT:" + chrom_pos_mid + " |=| " + "CHROM_ID:" + chrom_prefix \
                               + " |=| [( " + spec_prefix + " )] " + "|=- \n")
            fp_out2C.write(dna_substr + '\n') 
   else:     
      #write translated protein sequence into fp_out10
      fp_out10.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ] " \
                         + dupl_desc + fasta_file_comments + " " + dupl_status + " [( " + spec_prefix + " )] " + '\n')
      fp_out10.write(prot_str + "\n")

   msg_str = ''.join(warning_msg)
   if len(msg_str) > 0:
      if first_warning == 0:
         fp_out8.write("***********************\n")
         fp_out8.write("* CDS REGION WARNINGS *\n")
         fp_out8.write("***********************\n\n") 
      fp_out8.write(">" + first_in_header + " " + desc + "[ " + header + "CDS " + direction + " ]\n")
      fp_out8.write(msg_str)
      first_warning = 1
   if  format_cntr == 0 or format_cntr == 1000:
      print "TRANSLATING: Processing line " + str(l) + "... Please wait..."
      format_cntr = 0
   format_cntr += 1

if prfx_id != "":
   fp_out13.close()

# fp_out12.close()  # close the output12 file
fp_out11.close()  # close the output11 file
fp_out10.close()  # close the output10 file
fp_out9.close()   # close the output9 file 
fp_out8.close()   # close the output8 file         
fp_out7.close()   # close the output7 file  
fp_out6.close()   # close the output6 file  
fp_out5.close()   # close the output5 file  
fp_out4.close()   # close the output4 file  
fp_out3.close()   # close the output3 file
fp_out3C.close()
fp_out2.close()   # close the output2 file  
fp_out2C.close()
fp_in.close()     # close the input file

fp_out77.close()

dna_modif.insert (0, dna_str_modif_zero)
dna_modif.insert (len(dna_modif), dna_str_modif_last)
modif_dna_str = ''.join (dna_modif)
put100charsPerLine (len(modif_dna_str), modif_dna_str, fp_out1)
fp_out1.close()  # close the output1 file

print "DNA length: " + str(len(dna_str))

print "\n\nModified file",fd1[1],"is created..." 
print     "Modified file",fd2[1],"is created..." 
print     "Modified file",fd3[1],"is created..." 
print     "Modified file",fd4[1],"is created..." 
print     "Modified file",fd5[1],"is created..." 
print     "Modified file",fd6[1],"is created..." 
print     "Modified file",fd7[1],"is created..." 
print     "Modified file",fd8[1],"is created..."
print     "Modified file",fd9[1],"is created...\n\n" 

### THE END ###
