# Letters used by the ".quality2" encoding: score 0 -> 'a' ... 25 -> 'z',
# 26 -> 'A' ... 51 -> 'Z'.  Scores above 51 are clamped to 'Z'.
_QUALITY_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def _clean_fasta_text(text):
    """Return *text* with its whitespace normalised for record parsing.

    Tabs become spaces, carriage returns become newlines, runs of spaces
    collapse to one space, blank lines are dropped and the text is stripped
    at both ends.  A single trailing newline is then appended so that every
    header line is guaranteed to be newline-terminated.

    Raises ValueError when nothing is left after cleaning (empty file).
    """
    text = text.replace('\t', ' ').replace('\r', '\n')
    while '  ' in text:
        text = text.replace('  ', ' ')
    # After collapsing, a blank line is either empty or a single space.
    kept = [row for row in text.split('\n') if row not in ('', ' ')]
    text = '\n'.join(kept).strip()
    if not text:
        raise ValueError("file contains no data")
    return text + '\n'


def _parse_fasta_records(text):
    """Split cleaned FASTA-style *text* into parallel id and body lists.

    A record starts at every '>' found at or after the end of the previous
    header line.  The id runs from just after '>' to the nearest space or
    newline; the body is everything between the header's newline and the
    next '>' (or the end of the text).  Progress ("k  of  n") is printed for
    each record, where n is the number of '>' characters in the text.
    """
    ids = []
    bodies = []
    total = text.count('>')
    marker = text.find('>')
    while marker >= 0:
        print("%d  of  %d" % (len(ids) + 1, total))
        header_end = text.find('\n', marker)
        space = text.find(' ', marker)
        # A space only terminates the id when it lies on the header line.
        id_stop = space if 0 <= space < header_end else header_end
        ids.append(text[marker + 1:id_stop])
        marker = text.find('>', header_end)
        # The last record stops before the newline appended during cleaning.
        body_stop = marker if marker >= 0 else len(text) - 1
        bodies.append(text[header_end + 1:body_stop])
    return ids, bodies


def _load_fasta(path, kind, scan_label):
    """Read, clean and parse one input file.

    *kind* ("masked", "original" or "quality") and *scan_label* are only
    used to build the console messages.  Returns ``(ids, bodies)`` on
    success, or None after printing a diagnostic when the file cannot be
    opened, cannot be read, or contains no data.
    """
    try:
        handle = open(path)
    except (IOError, OSError):
        print("DID NOT FIND FILE:  " + path)
        print(sys.exc_info()[0])
        return None
    # The with-block guarantees the handle is closed on every path.
    with handle:
        try:
            text = handle.read()
        except (IOError, OSError, ValueError):
            print(kind.upper() + " FILE COULD NOT BE READ, CHECK FILE FORMAT")
            print(sys.exc_info()[0])
            return None

    started = timeit.default_timer()
    try:
        records = _parse_fasta_records(_clean_fasta_text(text))
    except ValueError:
        print("Error occured while reading data from " + kind +
              " file, Script probably encountered an unexpected format error.")
        print("Error encountered was %s" % (sys.exc_info()[0],))
        return None
    print("%s file was scanned in %.2f seconds\n"
          % (scan_label, timeit.default_timer() - started))
    return records


def _parse_scores(body):
    """Convert a whitespace-separated score body into a list of ints.

    An empty body yields an empty list.  Raises ValueError for any token
    that is not made of digits only.
    """
    scores = []
    for token in body.split():
        if not token.isdigit():
            raise ValueError("invalid quality score: %r" % (token,))
        scores.append(int(token))
    return scores


def _duplicated_ids(ids):
    """Return the ids occurring more than once, in first-repeat order."""
    seen = set()
    reported = set()
    repeated = []
    for ident in ids:
        if ident in seen and ident not in reported:
            reported.add(ident)
            repeated.append(ident)
        seen.add(ident)
    return repeated


def _write_log(path, messages):
    """Write one message per line to *path*; return False if it can't be opened."""
    try:
        log_output = open(path, "w")
    except (IOError, OSError):
        return False
    with log_output:
        for message in messages:
            log_output.write(message + "\n")
    return True


def _select_scores(mask_ids, mask_seqs, original_by_id, scores_by_id):
    """Locate each masked sequence in its original and slice out its scores.

    Returns ``(outputs, problems)``: *outputs* is a list of
    ``(id, masked_sequence, scores)`` tuples, *problems* the log messages for
    every masked record that had to be skipped.  Each problem is also
    reported on the console.
    """
    skip_note = "This ID will be skipped and the program will continue.\n"
    prefix = "The sequence which corresponds to this ID "
    outputs = []
    problems = []
    for ident, fragment in zip(mask_ids, mask_seqs):
        if ident not in original_by_id:
            problem = "This ID " + ident + " was not found in the original file."
            print(problem)
            print(skip_note)
            problems.append(problem)
            continue

        full = original_by_id[ident]
        start = full.find(fragment)
        if start == -1:
            problem = prefix + ident + " was not found in the original file."
            print(problem)
            print(skip_note)
            problems.append(problem)
            continue
        # An ambiguous match makes the score window undefined.
        if full.find(fragment, start + 1) > -1:
            problem = prefix + ident + " was found in the original file more than once."
            print(problem)
            print(skip_note)
            problems.append(problem)
            continue

        if ident not in scores_by_id:
            problem = "This ID " + ident + " was not found in the quality file."
            print(problem)
            print(skip_note)
            problems.append(problem)
            continue

        scores = scores_by_id[ident]
        stop = start + len(fragment)
        if len(scores) < stop:
            print(prefix + ident + " has a data error.")
            print("The length of the sequence seems to be larger than the length of the scores available.")
            print(skip_note)
            problems.append(prefix + ident + " seems to be larger than the length of the scores available.")
            continue

        outputs.append((ident, fragment, scores[start:stop]))
    return outputs, problems


def pfqsep(masked_file, original_file, quality_file, output_file):
    """Extract the quality scores that belong to masked (trimmed) sequences.

    Each of the three inputs is a FASTA-style text file ('>' header lines
    followed by data):

    masked_file   -- sequences that are sub-strings of the originals
    original_file -- the untrimmed sequences; ids must be unique
    quality_file  -- space separated integer scores per id; ids must be unique
    output_file   -- path prefix for the files written

    For every masked record the sequence is located inside the original
    sequence with the same id, and the scores at the matching positions are
    written to ``output_file + ".quality1"`` (decimal scores, each followed
    by a space) and ``output_file + ".quality2"`` (one letter per score, see
    ``_QUALITY_LETTERS``).  Records that cannot be resolved are skipped and
    listed in ``output_file + ".log"``.  Duplicate ids in the original or
    quality file abort the run: only the log file is written.

    Output is written in text mode, so the platform's native line ending is
    used.  Progress and errors are reported on stdout; the function always
    returns None and reports failures by printing rather than raising.
    """
    begin = timeit.default_timer()

    # ---- Data extraction -------------------------------------------------
    print("\nOpening and Checking Masked File\n")
    masked = _load_fasta(masked_file, "masked", "Mask")
    if masked is None:
        return

    print("Opening and Checking Original File\n")
    original = _load_fasta(original_file, "original", "Original")
    if original is None:
        return

    print("Opening and Checking Quality File\n")
    quality = _load_fasta(quality_file, "quality", "Quality")
    if quality is None:
        return

    mask_ids, mask_bodies = masked
    original_ids, original_bodies = original
    quality_ids, quality_bodies = quality

    # ---- List formatting -------------------------------------------------
    # Sequences may be wrapped over several lines; join them into one string.
    mask_seqs = [body.replace(' ', '').replace('\n', '') for body in mask_bodies]
    original_seqs = [body.replace(' ', '').replace('\n', '') for body in original_bodies]
    try:
        quality_scores = [_parse_scores(body) for body in quality_bodies]
    except ValueError:
        print("Error occured while reading data from quality file, "
              "Script probably encountered an unexpected format error.")
        print("Error encountered was %s" % (sys.exc_info()[0],))
        return

    # ---- Error checking: ids must be unique ------------------------------
    problems = []
    for source, ids in (("original", original_ids), ("quality", quality_ids)):
        for ident in _duplicated_ids(ids):
            problem = ("This Id " + ident + " has been found in the " + source +
                       " file more than once, this file connot contain redundant Id's")
            print(problem + "\n")
            problems.append(problem)

    if problems:
        if not _write_log(output_file + ".log", problems):
            print("Could not create log output file %s Check file permissions." % output_file)
            return
        print("Program Exited Unsucessfully With Errors.\nNo Quality Files Were Created.\nCheck log file.")
        return

    # ---- Find and extract scores -----------------------------------------
    # Ids are unique at this point, so dictionaries are a lossless index.
    original_by_id = dict(zip(original_ids, original_seqs))
    scores_by_id = dict(zip(quality_ids, quality_scores))
    outputs, problems = _select_scores(mask_ids, mask_seqs, original_by_id, scores_by_id)

    # ---- Create output files ---------------------------------------------
    try:
        file_output1 = open(output_file + ".quality1", "w")
    except (IOError, OSError):
        print("Could not create quality1 output file %s Check file permissions" % output_file)
        return
    with file_output1:
        try:
            file_output2 = open(output_file + ".quality2", "w")
        except (IOError, OSError):
            print("Could not create quality2 output file %s Check file permissions" % output_file)
            return
        with file_output2:
            for ident, _fragment, scores in outputs:
                file_output1.write(">" + ident + "\n")
                file_output1.write("".join("%d " % score for score in scores) + "\n")

                file_output2.write(">" + ident + "\n")
                # Scores larger than 51 are truncated to the last letter.
                file_output2.write(
                    "".join(_QUALITY_LETTERS[min(score, 51)] for score in scores) + "\n")

    # ---- Log file and summary --------------------------------------------
    if problems:
        if not _write_log(output_file + ".log", problems):
            print("Could not create log output file %s Check file permissions" % output_file)
            return
        print("Total Execution time: %.2f seconds" % (timeit.default_timer() - begin))
        print("Program Exited Sucessfully With Errors.\nCheck log file")
        return

    print("Total Execution time: %.2f seconds" % (timeit.default_timer() - begin))
    print("Program Exited Sucessfully")
    return



import sys
import array
import os
import sys
import string
import timeit
if __name__ == "__main__":
    # Command-line entry point: exactly four positional arguments are
    # required (sys.argv[0] is the script name, hence the expected length 5).
    if len(sys.argv) != 5:
        print("Program usage: ")
        print("Masked_file, Original_file, Quality_file, Output_file")
        # The original bare `exit` was a no-op expression; terminate
        # explicitly and signal the usage error through the exit status.
        sys.exit(1)
    else:
        masked_file = sys.argv[1]
        original_file = sys.argv[2]
        quality_file = sys.argv[3]
        output_file = sys.argv[4]
        pfqsep(masked_file, original_file, quality_file, output_file)
