import sys

class globals:
	"""Holder for script-wide settings, read as class attributes."""

	# Highest quality score that still counts as low quality; only scores
	# strictly greater than this value are treated as good.
	maxTail = 10

def _qualitySpan(values, maxLow):
	"""Locate the outermost scores that are above the low-quality cutoff.

	values -- list of integer quality scores for one sequence
	maxLow -- highest score that is still considered low quality

	Returns (first, last): the index of the first and of the last score
	strictly greater than maxLow.  When no score qualifies, the result is
	(0, len(values) - 1), i.e. the full range, so nothing gets trimmed.
	"""
	lastIndex = len(values) - 1
	first = next((i for i, v in enumerate(values) if v > maxLow), 0)
	last = next((i for i in range(lastIndex, -1, -1) if values[i] > maxLow), lastIndex)
	return first, last


def main():
	"""Trim low-quality ends off sequences and write FASTA and clip reports.

	Prompts for four file names: the quality input, the sequence input, the
	FASTA output and the clip-info output.  Both inputs are split on
	whitespace.  In the quality file the first field is the sequence ID and
	every field from the third onward is a score (the second field is not
	used).  In the sequence file the fields are ID, length and sequence.

	For each sequence, the leading and trailing runs of scores that are
	<= globals.maxTail are clipped.  One line "ID, CLIP_L, CLIP_R, ILQS" is
	written to the clip file, and the same line is used as the FASTA header
	above the trimmed sequence.  ILQS is the number of low-quality scores
	left inside the kept region.

	Sequences with no entry in the quality file, or whose declared length
	differs from their number of scores, are skipped and reported in a file
	named "ErrorLog" in the current directory.  Blank input lines are
	ignored.

	Raises IOError/OSError if a file cannot be opened, and ValueError if a
	length or score field is not an integer.
	"""
	# raw_input only exists on Python 2; Python 3 renamed it to input.
	try:
		ask = raw_input
	except NameError:
		ask = input

	# get input and output file names
	qualityinFileName = ask("Enter the name of the quality file:")
	sequenceinFileName = ask("Enter the name of the sequence file:")
	fastaOutFileName = ask("Enter the name of the fasta output file:")
	clipOutFileName = ask("Enter the name of the clip info output file:")

	# one threshold for the front scan, the back scan and the middle count
	maxLow = globals.maxTail

	# read both inputs completely before any output file is created
	with open(sequenceinFileName, "r") as sequenceFile:
		sequenceInfo = sequenceFile.readlines()

	# index the quality scores by sequence ID for lookup while looping
	# through the sequence records
	qualityScores = {}
	with open(qualityinFileName, "r") as qualityFile:
		for line in qualityFile:
			fields = line.split()
			if fields:  # ignore blank lines
				qualityScores[fields[0]] = fields[2:]

	with open(fastaOutFileName, "w") as fastaFile, \
			open(clipOutFileName, "w") as clipFile, \
			open("ErrorLog", "w") as errorFile:
		for line in sequenceInfo:
			fields = line.split()
			if not fields:
				continue  # ignore blank lines
			seqID = fields[0]
			seqLen = int(fields[1])
			seq = fields[2]

			# error checking: is there a score list for this sequence?
			scores = qualityScores.get(seqID)
			if scores is None:
				errorFile.write("No quality scores found for " + seqID + "\n")
				continue
			# error checking: does the sequence length equal the number of scores?
			if len(scores) != seqLen:
				errorFile.write("Sequence length doesn't match number of scores for " + seqID + "\n")
				continue

			values = [int(score) for score in scores]

			# frontEnd is the index of the first good score and backStart the
			# index of the last good score; both positions are kept.
			# NOTE(review): when no score is above the threshold the span is
			# the whole sequence, so an all-low-quality read is emitted
			# untrimmed with CLIP_L:0 and CLIP_R:0 -- confirm this is intended.
			frontEnd, backStart = _qualitySpan(values, maxLow)
			trimmed = seq[frontEnd:backStart + 1]

			# low-quality scores remaining after the first kept position
			middleCount = sum(1 for v in values[frontEnd + 1:backStart + 1] if v <= maxLow)

			# number of positions removed from the front and from the back
			clip_L = str(frontEnd)
			clip_R = str(seqLen - backStart - 1)

			# create clip info string
			clipInfo = seqID + "\tCLIP_L:" + clip_L + "\tCLIP_R:" + clip_R + "\tILQS:" + str(middleCount)

			# the clip info line doubles as the FASTA header
			clipFile.write(clipInfo + "\n")
			fastaFile.write(">" + clipInfo + "\n")
			fastaFile.write(trimmed + "\n")

 
# Start the interactive trimming run only when this file is executed as a
# script; importing it as a module does nothing.
if __name__ == "__main__":
	main()
