I'm attempting to write a programme that will calculate the GC content in each of a series of sequences (input in fasta format) and then return the name of the sequence with the highest percentage and its GC percentage. As per this Rosalind problem.
I've finally stopped getting error messages but my code doesn't appear to do anything. Does anyone have any idea why that might be?
#Define functions
#Calculate GC percentage
def Percent(sequence):
G_count = sequence.count ('G')
C_count = sequence.count ('C')
Total_count = len(sequence)
GC_Sum = int(G_count) + int(C_count)
Percent_GC = GC_Sum / Total_count
Per_GC = (Percent_GC)*100
return Per_GC
Input = input ("Input Sequence")
#Fasta file into dictionary
fasta_dictionary = {}
sequence_name = ""
for line in Input:
line = line.strip()
if not line:
continue
if line.startswith(">"):
sequence_name = line[1:]
if sequence_name not in fasta_dictionary:
fasta_dictionary[sequence_name] = []
continue
sequence = line
fasta_dictionary[sequence_name].append(sequence)
#Put GC values for each sequence into dictionary
dictionary = dict()
for sequence_name in fasta_dictionary:
dictionary[sequence_name] = float(Percent(sequence))
#Find highest
for sequence_name, sequence in fasta_dictionary.items():
inverse = [(sequence, sequence_name) for sequence_name, sequence in dictionary.items()]
highest_GC = max(inverse)[1]
#Find sequence name
for sequence_name, sequence in fasta_dictionary.items():
if sequence == highest_GC:
print ((sequence_name) + ' ' + (highest_GC))