# Dynamic Programming Global Alignment # # computes the global alignment of 2 sequences # implements the o(mn) dynamic programming algorithm # using single character gap penalties # # by: J. Christopher Bare # date: 2005-5-29 # align two sequences given by strings s and t def align(s, t, score): n = len(s) m = len(t) # initialize the table to n rows by m columns table = [0]*((m+1)*(n+1)) coord = Coord(m+1) # initialize table of back pointers to n x m # each entry will be a list of (i,j) tuples backpointers = [[]]*((m+1)*(n+1)) # fill in zeroth row and column table[coord(0,0)] = 0 for i in range(1,n+1): table[coord(i,0)] = table[coord(i-1,0)] + score(s[i-i],"-") for j in range(1,m+1): table[coord(0,j)] = table[coord(0,j-1)] + score("-",t[j-1]) # fill in table for i in range(1,n+1): for j in range(1,m+1): table[coord(i,j)] = max( table[coord(i-1,j)] + score(s[i-1],"-"), table[coord(i,j-1)] + score("-",t[j-1]), table[coord(i-1,j-1)] + score(s[i-1],t[j-1])) print_table(table, s, t, n, m) a = generate_alignments(table, s, t, n, m, n, m, score) print print_alignments(a) # simple scoring function # note that gaps should cost more than mismatches def simple_score(a,b): if a == b: return 5 if a == "-" or b == "-": return -5 else: return -4 # translate from i,j coordinates to a position in a # one dimensional array class Coord: def __init__(self, cols): self.cols = cols def __call__(self, i, j): return self.cols * i +j # traverses the table and generates alignments # returns a list of tuples of strings. Each tuple # is an alignment between the two strings s and t def generate_alignments(table, s, t, i, j, n, m, score): coord = Coord(m+1) if i==0 and j==0: return [("","")] else: if i==0: return [("-"*j,t[0:j])] elif j==0: return [(s[0:i],"-"*i)] else: max_score = max(table[coord(i-1,j-1)] + score(s[i-1],t[j-1]), table[coord(i,j-1)] + score("-",t[j-1]), table[coord(i-1,j)] + score(s[i-1],"-")) alignments = [] if (table[coord(i-1,j-1)] + score(s[i-1],t[j-1])==max_score): prefixes = generate_alignments(table, s, t, i-1, j-1, n, m, score) for (s1, t1) in prefixes: s1 += s[i-1] t1 += t[j-1] alignments.append( (s1,t1) ) if (table[coord(i,j-1)] + score("-",t[j-1])==max_score): prefixes = generate_alignments(table, s, t, i, j-1, n, m, score) for (s1, t1) in prefixes: s1 += "-" t1 += t[j-1] alignments.append( (s1,t1) ) if (table[coord(i-1,j)] + score(s[i-1],"-")==max_score): prefixes = generate_alignments(table, s, t, i-1, j, n, m, score) for (s1, t1) in prefixes: s1 += s[i-1] t1 += "-" alignments.append( (s1,t1) ) return alignments # print a table with n rows and m cols # for strings s of length n # and t of length m def print_table(table, s, t, n, m): coord = Coord(m+1) w = max([len(str(x)) for x in table]) + 1 format_label = "%" + str(w) + "s" format_entry = "%" + str(w) + "d" # print table header print line = " " + " "*w for j in range(1,m+1): line += format_label % (t[j-1]) print line print "-" * len(line) # print table rows line = " |" for j in range(0,m+1): line += format_entry % (table[coord(0,j)]) print line for i in range(1,n+1): line = "%s|" % (s[i-1]) for j in range(0,m+1): line += format_entry % (table[coord(i,j)]) print line # put a blank line at the end print def print_tab_table(table, s, t, n, m): coord = Coord(m+1) # print table header print line = "\t\t" for j in range(1,m+1): line += "%s\t" % (t[j-1]) print line # print table rows line = "\t" for j in range(0,m+1): line += "%d\t" % (table[coord(0,j)]) print line for i in range(1,n+1): line = "%s\t" % (s[i-1]) for j in range(0,m+1): line += "%d\t" % (table[coord(i,j)]) print line # put a blank line at the end print def print_alignments(alignments): for s1,s2 in alignments: print s1 print s2 print # a scoring function that implements the BLOSUM 62 matrix # with a -5 gap penalty class Blosum62: def __init__(self): self.gap_penalty = -5 self.amino_acids="ARNDCQEGHILKMFPSTWYV"; self.table = [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0, -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3, -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1, -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3, -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3, -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1, -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1, -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1, -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2, 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0, -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3, -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1, 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4] def __call__(self, a, b): try: ia = self.amino_acids.index(a) ib = self.amino_acids.index(b) return self.table[ia*len(self.amino_acids) + ib] except: return self.gap_penalty # generates the alignments in Needleman and Wunsch 1970 # maybe these aren't really amino acid sequences. There is # no amino acid J or O def generate_alignments_from_paper(): align("AJCJNRCKCRBP","ABCNJROCLCRPM", simple_score) # aligns a section of human and mouse insulin def insulin(): align("LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY","PQVEQLELGGSPGDLQTLALEVARQKRGIVDQCCTSICSLYQLENYCN", Blosum62()) def insulin_short(): align("GGPGAGSLQPLALEGSL","GSPGDLQTLALEVAR", Blosum62()) def test_dna(): align("tggatcgata","tgcatgcata", simple_score) def test(): align("ABCDEFGH","ABCXGHZXZX", simple_score) if __name__ == '__main__': insulin_short()