  #!/usr/bin/python

#######################################################################
#                                                                     #
# Author:      Elena Kochetkova                                       #
# Supervisor:  Alexander Kozik                                        #
# Date:        FEbruary 25, 2004                                      #
# Description: This program extracts specified substrings of a given  #
#              dna sequence and translates those to proteins...       #
#                                                                     #
####################################################################### 

###################################################
import os
import sys
import string
import re
import copy
from string import *
import dna_to_prot
##################################################

class _cls:
    def __init__(self):
        if sys.platform in ('linux-i386', 'linux2'):
            self._command = 'clear'
        elif sys.platform in ('win32', 'dos') or \
             sys.platform.startswith('ms-dos'):
            self._command = 'cls'
        else:
            self._command = None

    def __call__(self):
        """Clears the screen."""
        if self._command:
            os.system(self._command)

clear_screen = _cls()

global input_filename, output_filename

def ask_filename():
   global input_filename
   
   input = strip(raw_input("\n\nEnter the SOURCE file name: "))
   return input
 
def file_open_err(file_name):
   print "Error opening file ","\"",file_name,"\""

def would_you_like_prompt(prompt_line): 
   ans =""
   ans = lstrip(raw_input(prompt_line))
   while ans == "":
       print "\nEnter y or n."             
       ans = lstrip(raw_input(prompt_line))
   return ans

def fileExists(f):
   try:
      file = open (f)
   except IOError:
      exists = 0
   else:
      exists = 1
   return exists
 
def open_inputfile():
   global input_filename
   proper_name = 1   
   while proper_name != 0: 
     try:        
        # ask for input file name
        input_filename = ask_filename()
        # open the input file
        fp_in = open(input_filename, "rb")
        proper_name = 0
     except:
        file_open_err(input_filename) 
        prompt_line = "\nWould you like to continue (y/n)? "   
        ans = would_you_like_prompt(prompt_line)
        if ans[0] == "n" or ans[0] == "N":
           sys.exit(0)
   return fp_in  
 
def open_outputfile(ext):
   global input_filename, output_filename
   return_value_arr = []
   
   output_filename = input_filename + ext 
   print "Your default destination file is: ", output_filename 
   # open the output file
   if fileExists(output_filename) == 1:
      print  "\nFile",output_filename,"already exists.\n"
      prompt_line = "\nWould you like to overwrite it (y/n)? "
      ans = would_you_like_prompt(prompt_line)
      print "\n\n"
      if ans[0] == "n" or ans[0] == "N":  
          output_filename = lstrip(raw_input("\n\nEnter the DESTINATION file name: "))
          fp_out = open(output_filename, "wb")
      else:
          fp_out = open(output_filename, "wb") 
   else:
      try: 
         fp_out = open(output_filename, "wb")
      except:
         file_open_err(output_filename)
         print "\nTry again later...\n\n"
         sys.exit (1)  
   return_value_arr.append (fp_out) 
   return_value_arr.append (output_filename)
   return return_value_arr

def get_posns(str):
#   print str
   if str[0].isdigit():
      return str
   else:
     open_parenths_idx = str.find("(")
     if (open_parenths_idx != -1):
        close_parenths_idx = str.rfind(")")
        return get_posns(str[open_parenths_idx + 1:close_parenths_idx])
     else:
        return get_posns(str[1:])

def get_dir_pos(data_str):
  return_arr = []
  direction = ""
  dir_c = "" 

#  print data_str + '\n'

  direction = "forward"
  if re.match (".*complement\(", data_str) and not re.match (".*join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
     direction = "reverse"
  elif not re.match (".*complement\(", data_str) and re.match (".*join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
  elif re.match (".*complement.*\(join\(", data_str):
     positions = get_posns(data_str)
#     print positions + '\n'
     direction = "reverse"
  else:
     positions = get_posns(data_str)
#     print positions + '\n'
  return direction, positions

def revcomp(dna):
   #reverse complement of a DNA sequence
   comp = dna.translate(maketrans("ATGCatgc", "TACGtacg"))
   lcomp = list(comp)
   lcomp.reverse()
   return join(lcomp, "")

def dna_translate(cdna, code = dna_to_prot.standard):
   """ translate a cDNA sequence to a protein """ 
   return "".join([ code.get(cdna[i:i+3], "X") for i in xrange(0,len(cdna),3) ])

def choose_format():
   clear_screen ()
   print("\n\n")
   print("****************************************")    
   print("*   Please ascribe the order to the    *")
   print("*      following header elements:      *")
   print("*           gene_name (1 2 3)          *")
   print("*        protein_name (1 2 3)          *")  
   print("*          genbank_id (1 2 3)          *")
   print("****************************************")

   order1 = 0
   order2 = 0
   order3 = 0

   name = "GENE_NAME"
   order1 = valid_int_input(name, order2, order3)
    
   name = "PROTEIN_NAME"
   order2 = valid_int_input(name, order1, order3)
  
   name = "GENEBANK_ID"
   order3 = valid_int_input(name, order1, order2)

   order_list = []
   order_list.insert(0, int(order1)-1)  
   order_list.insert(1, int(order2)-1)
   order_list.insert(2, int(order3)-1) 
   return order_list 

def make_header(order_list, gene_name, protein_name, genbank_id):
   header_order_list = []
   for i in range(len(order_list)):
      header_order_list.insert(i, " ")
   header_order_list[int(order_list[0])]= gene_name  
   header_order_list[int(order_list[1])]= protein_name
   header_order_list[int(order_list[2])]= genbank_id

   header = ""
   for j in range(len(header_order_list)):
      header = header + header_order_list[j]+ " "
   return header, header_order_list[0]

def valid_int_input(name, num1, num2):
   int_input = 1
   
   while int_input != 0: 
      order = strip(raw_input("\n\nEnter the sequence order for " + name + ": "))
      try:          
         test = string.atoi(order)
         if (int(order) > 3) or (int(order) < 1):
            print ("Enter number 1-3...")
         else:
            if (order == num1) or (order == num2):
               print("Number was previously selected...")
            else: 
               int_input = 0
      except ValueError:
         print "\nNot a number...\n" 
   return order


def duplication_option_menu():
   clear_screen ()
   print("\n\n")
   print("****************************************")    
   print("*   In case of duplicated sequences,   *")
   print("*      would you like to extract       *")
   print("*                                      *")
   print("*       1. longest sequence            *")
   print("*       2. first sequence              *")  
   print("*                                      *")
   print("****************************************")
   int_input = 1

   while int_input != 0: 
      answer = strip(raw_input("\n\nEnter your answer: "))
      try:
         test = string.atoi(answer)
         if (int(answer) > 2) or (int(answer) < 1):
            print ("Enter either 1 or 2...")
         else: 
            int_input = 0
      except ValueError:
         print "\nNot a number...\n" 
   return answer


def get_data(fp_in, fp_out):
   
   dna_beg_pos = 0; data_lines = 0; format_cntr = 0
  
   data_array = []; dna_array = []; pos_array = []
  
   while 1:
      data  = fp_in.readline()    
      data_array.append(data)
      if re.match ("^( {5}(((CDS)|(mRNA)|(tRNA)|(gene)|(misc_feature)).*)|ORIGIN)", data_array[data_lines]):
         pos_array.append(data_lines)
      if format_cntr == 0 or format_cntr == 1000:
         clear_screen()
         print "READING:     Processing line "+str(data_lines)+"... Please wait..." 
         format_cntr = 0
      format_cntr +=1  

      # match the beginning of dna sequence
      if re.match("^ORIGIN", data):
         dna_beg_pos = data_lines + 1
      
      if dna_beg_pos > 0 and data_lines >= dna_beg_pos:
         new_line = string.translate(data_array[data_lines],id,nonAZStr)
         dna_array.append(new_line)
         
      # match the end of dna sequence 
      if re.match("//", data):
         break
      data_lines += 1

      if not data:   # end of file   
         break
      
   if dna_beg_pos == 0:
      fp_out.write('WARNING --- No "ORIGIN" found!') 
      sys.exit(0)

   del data_array[dna_beg_pos:]  
   return dna_array, data_array, pos_array


def defRegionOfInterest(data_array, pos_array):
   region_id_array = []
   CDS_array = []
   gene_array = []
   format_cntr = 0
  
   for count in range(len(pos_array)):
      if count > 0:
         if re.match ("^ {5}(CDS)", data_array [pos_array[count-1]]):
            CDS_str = ''.join (data_array[pos_array[count-1]: pos_array[count]])
            CDS_array.append(CDS_str)
         if re.match ("^ {5}(gene)", data_array [pos_array[count-1]]):
            gene_str = ''.join (data_array[pos_array[count-1]: pos_array[count]])
            gene_array.append(gene_str)
         
      if format_cntr == 0 or format_cntr == 1000:
         clear_screen ()
         print "SCANNING REGIONS:     Processing line "+str(count)+"... Please wait..." 
         format_cntr = 0
      format_cntr +=1
   del data_array[0:]
   del pos_array[0:]
   return CDS_array, gene_array

def get_gene_name_desc(gene_array):
   gene_info_array = []
   format_cntr = 0
   substr = []
   first_warning = 0
   
   id = string.maketrans("","")
   
   for k in range (len(gene_array)):
      gname_flag = 0
      gene_info = []
      warning_msg = []
      
      fp_out9.write(gene_array[k])
      temp_str = string.translate(gene_array[k], id,'\n''\r')
      temp_str = re.sub(' {1,}', ' ', temp_str)
      
      if not re.match(".*/note=", temp_str):
         if re.match(".*/gene=", temp_str):
            gene_name_gene = strip(string.split(string.split(temp_str, "gene=\"")[1], "\"")[0])
         if re.match(".*/locus_tag=", temp_str)and not re.match(".*/gene=", temp_str):
            gene_name_gene = strip(string.split(string.split(temp_str, "locus_tag=\"")[1], "\"")[0])
         warning_msg.append("WARNING: " + gene_name_gene + " does not have \"note\" field\n")
         temp = string.split(string.split(temp_str, "gene")[1], "/")
         fixed = strip(string.translate(temp[0],id,'\n''>''<'))
            
         if (get_posns(fixed)):
            positions = get_posns(fixed)
         pos_bounds = string.split(positions, "..")

         gene_info.append(gene_name_gene)
         gene_info.append("")        #gene at name
         gene_info.append("")        #description
         gene_info.append(pos_bounds[0])
         gene_info.append(pos_bounds[-1])
           
      substr = string.split(gene_array[k], '  /')
      for k1 in range (len(substr)):
         if re.match ("gene=", substr[k1]):
            gn_g = string.split(substr[k1], '\"')
            if re.match("At\dg", gn_g[1]):
               gname_flag = 1
               at_name_gene = gn_g[1]
            gene_name_gene = gn_g[1]
         
            if len(gene_info_array) > 0:
               if gene_name_gene == gene_info_array[-1][0]:
                  warning_msg.append("WARNING: " + gene_name_gene + " duplicated gene name in GENE region\n")
         if re.match ("note=", substr[k1]):
            # if gene name does not start with At, get At name
            if re.match (".*locus_tag:", substr[k1]):
                if gname_flag == 0:
                   # get description
                   if (substr[k1].find(';') == -1):
                      at_gene_info = string.split(string.split(substr[k1], "locus_tag:")[1], "\"")
                      at_name_gene =  strip(at_gene_info[0])
                      desc = ""
                   else:
                      at_gene_info = string.split(substr[k1], "locus_tag:")
                      semicol_idx = at_gene_info[1].find(';')  
                      at_gene_desc = strip(at_gene_info[1][semicol_idx+1:])
                      at_name_gene =  strip(string.split(at_gene_info[1], ';')[0])
                      desc = re.sub(' {1,}', ' ', at_gene_desc)
                      desc = string.translate(desc,id,'\n''\r')
                      desc = desc[0:-1] + " "   
            elif not re.match (".*locus_tag:", substr[k1]):
                at_name_gene = ""
                # get description
                idx  = substr[k1].find('=')
                desc = strip(substr[k1][idx+1:])
                desc = re.sub(' {1,}', ' ', desc)
                desc = string.translate(desc,id,'\n''\r')
                desc = desc[1:-1] + " "   
            temp = string.split(substr[0], "gene") 
            fixed =  strip(string.translate(temp[1],id,'\n''>''<'))
            
            if (get_posns(fixed)):
               positions = get_posns(fixed)
            pos_bounds = string.split(positions, "..")

            gene_info.append(gene_name_gene)
            gene_info.append(at_name_gene)
            gene_info.append(desc)
            gene_info.append(pos_bounds[0])
            gene_info.append(pos_bounds[-1])

      msg_str = ''.join(warning_msg)
      if len(msg_str) > 0:
         if first_warning == 0:
            fp_out8.write("************************\n")
            fp_out8.write("* GENE REGION WARNINGS *\n")
            fp_out8.write("************************\n\n") 
         fp_out8.write(msg_str)
         first_warning = 1
            
      if len(gene_info)>0: 
         gene_info_array.append(gene_info)


      if format_cntr == 0 or format_cntr == 1000:
         clear_screen ()
         print "GETTING INFO ON GENE REGION:     Processing line " + str(k) + "... Please wait..." 
         format_cntr = 0
      format_cntr +=1
   if first_warning == 1:  
      fp_out8.write('\n\n\n')
   del gene_array[0:]
   return gene_info_array

   
def put100charsPerLine (length, string, file):

   global input_filename

   line_begin  = 0
   line_end    = 100
   count = 0
   format_cntr = 0

   fp_out1.write(">" + input_filename + "\n")

   while length > 0:
      substr = string [line_begin:line_end]
      file.write(substr+'\n')
      line_begin += 100
      line_end   += 100
      length -=100
      if format_cntr == 0 or format_cntr == 1000:
         clear_screen ()
         print "WRITING:     Processing line " + str(count) + "... Please wait..."
         format_cntr = 0
      count += 1
      format_cntr += 1

def get_CDS_region_desc(CDS_array, duplication_option):
   
   cds_info_array = []
   dupl_array = []
   dupl_count = 0
   unique = 0
   format_cntr = 0

   id = string.maketrans("","")

   for l in range (len(CDS_array)):
      fp_out4.write(CDS_array[l])
      desc = ""
      substr = []
      cds_join_list = []
      substr = string.split(CDS_array[l], '  /')
      for l1 in range (len(substr)):
         if re.match ("gene=", substr[l1]):
            gnc = string.split(substr[l1], '\"')
            gene_name_cds = gnc[1]
         if re.match ("locus_tag=", substr[l1]) and not re.match ("gene=", substr[l1]):
            gnc = string.split(substr[l1], '\"')
            gene_name_cds = gnc[1]
         if re.match ("protein_id=", substr[l1]):
            prot_id = string.split(substr[l1], '\"')
            protein_id = prot_id[1]
         if re.match ("db_xref=\"GI:", substr[l1]):
            gb_id1 = string.split(substr[l1], ':')
            gb_id2 = string.split(gb_id1[1], '\"')
            genbank_id = gb_id2[0]
         if re.match ("note=", substr[l1]): 
            cds_desc = string.split(substr[l1], 'note="')
            desc = strip(string.split(cds_desc[1], '  /')[0])
            desc = re.sub(' {1,}', ' ', desc)
            desc = string.translate(desc,id,'\n''\r')
            desc = desc[0:-1] + " "   
      cds_join_list.append (substr[0])
   
      # join broken strings to retrieve numbers within the brackets
      id = string.maketrans("","")
      joined = string.translate(''.join(cds_join_list), id, '\n')
 
      direction = ""
      pos_line  = ""
      warning_msg = []

      direction, pos_line = get_dir_pos(joined)
      pos_subline = string.translate(pos_line, id, ' ' '\r')
#      print pos_subline

      positions = re.sub ("\.\.", "-", pos_subline) 
      positions = re.sub (",", "|", positions)
      positions = re.sub (">", "", positions)
      positions = re.sub ("<", "", positions)

      pos_pairs = string.split (pos_subline, ",")
      pos_arr = []
      pos_array = []
   
      for m in range (len(pos_pairs)):
         if re.match (".*\.\.", pos_pairs[m]): 
            pair_split = string.split (pos_pairs[m], "..")
            if len(pair_split) == 2:
               pos_arr.append(pair_split[0])
               pos_arr.append(pair_split[1])
         else:
            pos_arr.append(pos_pairs[m])
            pos_arr.append(pos_pairs[m])
            warning_msg.append("WARNING: " + pos_pairs[m] + " does not have a pair\n")
            
      for l2 in range(len(pos_arr)):
         if re.match(".*<", pos_arr[l2])or re.match(".*>", pos_arr[l2]):    
            # WARNING ABOUT > and <
            warning_msg.append ("WARNING: " + pos_arr[l2]+"contains > or <\n")
            #get rid of > and <
            pos_array.append(int(string.translate(pos_arr[l2], id, '>' '<')))
         else:
            pos_array.append(int(pos_arr[l2]))

      cds_info = []
      cds_info.append(gene_name_cds)
      cds_info.append(protein_id)
      cds_info.append(genbank_id)
      cds_info.append(direction)
      cds_info.append(pos_array)
      cds_info.append(positions)
      cds_info.append(warning_msg)
      cds_info.append(desc)
 
      if l == 0:
         cds_info_array.append(cds_info)
            
      if l > 0:
         if len(dupl_array) > 0:
            temp_gene_name = dupl_array[-1][0]
         else:
            temp_gene_name = cds_info_array[len(cds_info_array)-1][0]
         if (gene_name_cds == temp_gene_name ):
            unique = 1   # not unique
            if dupl_count == 0:
               warning_msg.append("WARNING: Gene name " + gene_name_cds + " is not UNIQUE\n")
               dupl_array.append(cds_info_array[l-1])
               del cds_info_array[-1]
               dupl_array.append(cds_info)
            else:
               dupl_array.append(cds_info)
            dupl_count += 1
         else:
            if unique == 1:
               dupl_count = 0
               if int(duplication_option) == 1: 
                  idx = find_longest(dupl_array)
               else:
                  idx = 0
               for k in range(len(dupl_array)):
                  if k == idx:
                     dupl_array[k].append("UNIQUE_SET")
                     
                  else:
                     dupl_array[k].append("DUPLIC_SET")
                  cds_info_array.append(dupl_array[k])
               del dupl_array[0:]
               unique = 0
               cds_info_array.append(cds_info)
            else:
               cds_info_array.append(cds_info)
               
      if format_cntr == 0 or format_cntr == 1000:
         clear_screen ()
         print "GETTING INFO ON CDS  REGION:     Processing line " + str(l) + "... Please wait..." 
         format_cntr = 0
      format_cntr +=1
        
   del CDS_array[0:]
   return cds_info_array


def find_longest(dupl_array):
   
   length_array =  []
   pos_array = []

   for i in range (len(dupl_array)):
      pos_array.append(dupl_array[i][4])
      j = 1
      length = 0
      while j < len(pos_array[i]):
         length += (int(pos_array[i][j]) + 1) - int(pos_array[i][j-1])
         j +=2
      length_array.append(length)
      if i == 0:
         longest = length_array[i]
         longest_idx = i 
      if i > 0:
         if length_array[i] > longest:
            longest = length_array[i]
            longest_idx = i
   return longest_idx

         

###################################################
#                    main                         #
###################################################

clear_screen ()
fp_in = open_inputfile() 

print "\n\n\nreading data from the input file" , input_filename, "...\n\n"

fd1 = open_outputfile(".out")
fp_out1 = fd1[0]
fd2 = open_outputfile(".join")
fp_out2 = fd2[0]
fd3 = open_outputfile(".protein")
fp_out3 = fd3[0]
fd4 = open_outputfile(".cds")
fp_out4 = fd4[0]
fd5 = open_outputfile(".position")
fp_out5 = fd5[0]
fd6 = open_outputfile(".500")
fp_out6 = fd6[0]
fd7 = open_outputfile(".inbetween")
fp_out7 = fd7[0]
fd8 = open_outputfile(".msg")
fp_out8 = fd8[0]
fd9 = open_outputfile(".gene")
fp_out9 = fd9[0]
fd10 = open_outputfile(".protein.dupl")
fp_out10 = fd10[0]

#test
output11 = input_filename + ".test"
fp_out11 = open(output11, "wb")

order_list = choose_format()
# 1 - longest, 2 - first
duplication_option = duplication_option_menu()

id = string.maketrans("","") # No actual translation
nonAZList = [chr(x) for x in range(256) if not ("a"<=chr(x)<="z" or "A"<=chr(x)<="Z")]
nonAZStr="".join(nonAZList)

dna_array  = []; data_array = []; pos_array =[]

#dna_array contains only dna sequence
#data_array contains information about all regions (CDS, gene, mRNA)
dna_array, data_array, pos_array = get_data(fp_in, fp_out1)

fp_in.close() # close the input file

CDS_array = []; gene_array = []
CDS_array, gene_array = defRegionOfInterest(data_array, pos_array)

dna_str = ''.join (dna_array) 
#fp_out11.write (dna_str)

# retrieving information from gene region
# retrieved is a list of tuples
# first element is gene name, second - at_name_gene,third - description
# fourth - beginning of gene interval, fifth - the end
gene_info_array = []
gene_info_array = get_gene_name_desc(gene_array)

CDS_info_array = []
CDS_dupl_array = []

# CDS_info_array is a list of lists:
# 1st element is gene_name_cds, then - protein_id, genbank_id,
# direction, pos_arr, positions, warning, [duplication_desc]
CDS_info_array = get_CDS_region_desc(CDS_array, duplication_option)

dna_modif = []
last_from_before = 0
first_warning = 0
format_cntr = 0
m = 0
at_gname_arr = []

# processing information from CDS region
for l in range(len(CDS_info_array)):
   warning_msg = []
   gene_name_cds = CDS_info_array[l][0]
   protein_id = CDS_info_array[l][1]
   genbank_id = CDS_info_array[l][2]
   direction = CDS_info_array[l][3]
   pos_arr = CDS_info_array[l][4]
   positions = CDS_info_array[l][5]
   warning = ''.join(CDS_info_array[l][6])
   if warning != "":
      warning_msg.append(warning)
   desc_cds = CDS_info_array[l][7]

   if len(CDS_info_array[l]) == 8:
      unique = 0 
   elif len(CDS_info_array[l]) == 9:
      dupl_desc = CDS_info_array[l][8]
      if dupl_desc =='UNIQUE_SET':
          unique = 0
      else:
          unique = 1

   # check if gene name in gene regions matches the one in CDS region
   gene_name_gene = gene_info_array[m][0]
   if gene_info_array[m][1] != "":
      at_name_gene = gene_info_array[m][1]
   else:
      at_name_gene = gene_info_array[m][0]   
   if gene_info_array[m][2] != "":    
       desc = gene_info_array[m][2]
   elif desc_cds != "":
       desc = desc_cds
   else:
       desc = "NO DESCRIPTION FOUND"
   
   gene_beg_interval = int(gene_info_array[m][3])
   gene_end_interval = int(gene_info_array[m][4])
   gene_name_match = 0

   header, first_in_header = make_header(order_list, gene_name_cds, protein_id, genbank_id)
   
   if ((pos_arr[0] >= gene_beg_interval) and (pos_arr[len(pos_arr)-1 ] <= gene_end_interval)) \
      and (gene_name_gene == gene_name_cds):
      fp_out11.write("*" + gene_name_gene + " == " + gene_name_cds+"\n")
      gene_name_match = 1
      
   else :
      fp_out11.write("WARNING: gene name/interval mismatch: "+gene_name_gene+" (in gene region) and "+gene_name_cds+" (in CDS region)\n")
      temp_m = m
      while ((m > 0)) and gene_name_match != 1:
         m -= 1
         fp_out11.write("m counts down m = "+str(m)+"\n")
         if (gene_info_array[m][0] == gene_name_cds):
            if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])):
               fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds + "\n")
               gene_name_gene = gene_info_array[m][0]
               if gene_info_array[m][1] != "":
                  at_name_gene = gene_info_array[m][1]
               else:
                  at_name_gene = gene_info_array[m][0]
               desc = gene_info_array[m][2]
               gene_name_match = 1
      if gene_name_match == 0:
         m = temp_m
         while ((m < len(gene_info_array)-1)) and gene_name_match != 1:
            m += 1
            fp_out11.write("m counts up m = "+str(m)+ "\n")
            if (gene_info_array[m][0] == gene_name_cds):               
              if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])):
                  fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds+ "\n")
                  gene_name_gene = gene_info_array[m][0]
                  if gene_info_array[m][1] != "":
                     at_name_gene = gene_info_array[m][1]
                  else:
                     at_name_gene = gene_info_array[m][0]
                  desc = gene_info_array[m][2]
                  gene_name_match = 1
      if gene_name_match == 0:
         m = temp_m
         gene_name_gene = "None"
         at_name_gene = "None"
   m += 1
   fp_out11.write("*********************************\n")

   if unique == 0:
       at_gname_arr.append(first_in_header)
   
   l3 = 1
   l4 = 0
   beg_dna_seq    = []
   end_dna_seq    = []
   subseq_dna     = []
   beg_in_between = []
   end_in_between = []
   dna_str_modif  = []
   temp_dna_str   = ''
   substr_arr     = [] 

   # while not last position in a position array
   while l3 <= len(pos_arr):
      beg_dna_seq.append (pos_arr[l3-1])
      end_dna_seq.append (pos_arr[l3])
      # capitalized substring corresponding to specified positions -> .join
      subseq_dna.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))

      #  get 500 characters before the very first valid subsequence in a whole DNA sequence
      if (l4 == 0) and (l == 0):
         substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1])
         substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         dna_str_modif_zero = dna_str[:int(beg_dna_seq[l4])-1]
         dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         
      #lowercase sequence in between 2 genes 
      #(extact -500 before first valid subsequense in each CDS -> substr_arr) 
      elif (l4 == 0) and (l != 0):
         substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1])
         substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
         if unique == 0:
            #if end position of one gene overlaps with the beginning position of another
            if last_from_before >= int(beg_dna_seq[0]):
               warning_msg.append ("WARNING: "+CDS_info_array[l-1][0]+"-"+gene_name_cds+":  overlapping sequence fragment\n")
               dna_str_modif.append(string.upper(dna_str[last_from_before+1:int(end_dna_seq[l4])]))
            else:
               dna_str_modif.append(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1])
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
  
            fp_out7.write('>'+at_gname_arr[0]+'_'+first_in_header+' [ '+CDS_info_array[l-1][0]+'_'+gene_name_cds+' ] '\
                          +string.upper(CDS_info_array[l-1][3][0])+ string.upper(direction[0])+'\n')
            fp_out7.write(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1] +'\n')
            del at_gname_arr[0]
            last_from_before = 0
      #lowercase sequence in between 2 position pairs
      elif (l4 > 0):
               substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1])         
               substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
               dna_str_modif.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1])
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))

      else:   
               dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
               substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])]))
      l4 = l4+1
      l3 = l3+2

   # extact +500 after last valid subsequense in each CDS -> substr_arr
   substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(end_dna_seq[l4-1])+500])
   #data for .500 file
   join_substr = ''.join (substr_arr)
   if (l == len(CDS_info_array)-1):
         dna_str_modif_last = dna_str[int(end_dna_seq[l4-1]):]

   if unique == 0:
      last_from_before = int(end_dna_seq[l4-1])-1
#      temp_dna_str = ''.join (dna_str_modif)
      dna_modif.append(''.join(dna_str_modif))

      if direction == "reverse":  
          rev_join_substr = revcomp(join_substr)
          if len(CDS_info_array[l]) == 9:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] "+ dupl_desc +"\n")
          else:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
          fp_out6.write(rev_join_substr + '\n') 
      else:
          if len(CDS_info_array[l]) == 9:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n")
          else:
             fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
          fp_out6.write(join_substr + '\n')
   else:
      if direction == "reverse":  
          rev_join_substr = revcomp(join_substr)
          fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n")
          fp_out6.write(rev_join_substr + '\n') 
      else:
          fp_out6.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n")
          fp_out6.write(join_substr + '\n')

   #data for .join file
   dna_substr = ''.join(subseq_dna)

   if direction == "reverse":   
      rev_comp_dna = revcomp(dna_substr) 
      prot_str = dna_translate (rev_comp_dna)  
   else:
      prot_str = dna_translate (dna_substr)  


   #find middle position
   mid_pos = int((int(beg_dna_seq[0])+ int(end_dna_seq[len(end_dna_seq)-1]))/2)

   if unique == 0:
      fp_out5.write(at_name_gene+'\t'+gene_name_cds +'\t'+protein_id +'\t'+'CDS'+'\t'+direction+'\t'+ \
                    genbank_id+'\t'+str(mid_pos)+'\t'+positions+'\n')
      fp_out3.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
      fp_out3.write(prot_str +"\n")

      if len(CDS_info_array[l]) == 9:
         fp_out10.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + dupl_desc + '\n')
         fp_out10.write(prot_str +"\n")
         
      if direction == "reverse":   
         rev_dna_substr = revcomp(dna_substr)
         fp_out2.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
         fp_out2.write(rev_dna_substr +'\n') 
      else:
         fp_out2.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
         fp_out2.write(dna_substr +'\n')
   else:     
      #write translated protein sequence into fp_out10
      fp_out10.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ] " + dupl_desc + '\n')
      fp_out10.write(prot_str +"\n")


   if len(dna_substr)%3 != 0:
      warning_msg.append ("WARNING: String length is not divisible by 3"+'\n')
   stop_cod_num = string.count(prot_str, "*")	          # count stop codons	           
   if stop_cod_num == 0:
      warning_msg.append ("WARNING: NO STOP CODON"+'\n')
   if (stop_cod_num == 1) and (prot_str[len(prot_str)-1] != '*') :
      warning_msg.append ("WARNING: PREMATURE STOP CODON"+'\n')
   if stop_cod_num >=2 :
      warning_msg.append ("WARNING: >1 STOP CODONS"+'\n')  
 

   msg_str = ''.join(warning_msg)
   if len(msg_str) > 0:
      if first_warning == 0:
         fp_out8.write("***********************\n")
         fp_out8.write("* CDS REGION WARNINGS *\n")
         fp_out8.write("***********************\n\n") 
      fp_out8.write(">" + first_in_header + " " + desc + "[ "+ header + "CDS " + direction + " ]\n")
      fp_out8.write(msg_str)
      first_warning = 1
   if  format_cntr == 0 or format_cntr == 1000:
      clear_screen ()
      print "TRANSLATING: Processing line " + str(l) + "... Please wait..."
      format_cntr = 0
   format_cntr += 1

fp_out11.close()  # close the output11 file
fp_out10.close()  # close the output10 file
fp_out9.close()   # close the output9 file 
fp_out8.close()   # close the output8 file         
fp_out7.close()   # close the output7 file  
fp_out6.close()   # close the output6 file  
fp_out5.close()   # close the output5 file  
fp_out4.close()   # close the output4 file  
fp_out3.close()   # close the output3 file
fp_out2.close()   # close the output2 file  
fp_in.close()     # close the input file


dna_modif.insert (0, dna_str_modif_zero)
dna_modif.insert (len(dna_modif), dna_str_modif_last)
modif_dna_str = ''.join (dna_modif)
put100charsPerLine (len(modif_dna_str), modif_dna_str, fp_out1)
fp_out1.close()  # close the output1 file

print "DNA length: " + str(len(dna_str))

print "\n\nModified file",fd1[1],"is created..." 
print     "Modified file",fd2[1],"is created..." 
print     "Modified file",fd3[1],"is created..." 
print     "Modified file",fd4[1],"is created..." 
print     "Modified file",fd5[1],"is created..." 
print     "Modified file",fd6[1],"is created..." 
print     "Modified file",fd7[1],"is created..." 
print     "Modified file",fd8[1],"is created..."
print     "Modified file",fd9[1],"is created...\n\n" 


