# NeedlemanWunsch # computes the global alignment of 2 sequences using a scoring # system of 1 for a match, 0 for mismatch or gap. # implements the algorithm described in # Needleman, S.B. and Wunsch, C.D. # "A general method applicable to the search for similarities # in the amino-acid sequence of two proteins," # J. Molecular Biology 48 (1970), 443-453. # gap penalties are zero and are omitted # Note that this algorithm runs in O(mn(m+n)), which means it # is slower than necessary. This is because it maximizes over # the previous row and column as in the Needleman and Wunsch # paper. This departure from the O(mn) algorithm could be used # to allow for a gap function that scored a several character # gap function as a single unit. # by: J. Christopher Bare # date: 2005-5-24 # align two sequences given by strings s1 and s2 def align(s1, s2): n = len(s1) m = len(s2) # initialize the table to n x m table = [0]*(m*n) coord = Coord(m) # initialize table of back pointers to n x m # each entry will be a list of (i,j) tuples backpointers = [[]]*(m*n) for i in reversed(range(0,n)): for j in reversed(range(0,m)): if i+1 < n and j+1 < m: max_score = table[coord(i+1,j+1)] max_coords = [(i+1,j+1)] # find max score in previous row # in cols to the right of j+1 for k in range(j+2, m): if table[coord(i+1,k)] > max_score: max_score = table[coord(i+1,k)] max_coords = [(i+1,k)] elif table[coord(i+1,k)] == max_score: max_coords.append((i+1,k)) # find max score in previous col # in rows below i+1 for k in range(i+2, n): if table[coord(k,j+1)] > max_score: max_score = table[coord(k,j+1)] max_coords = [(k,j+1)] elif table[coord(k,j+1)] == max_score: max_coords.append((k,j+1)) table[coord(i,j)] = score(s1[i],s2[j]) + max_score backpointers[coord(i,j)] = max_coords else: table[coord(i,j)] = score(s1[i],s2[j]) print_table(table, s1, s2, n, m) alignments = generate_alignments(backpointers, s1, s2, 0, 0, n, m) print_alignments(alignments) # scoring is 1 for match, and 0 for mismatch. def score(a,b): if a == b: return 1 else: return 0 # translate from i,j coordinates to a position in a # one dimensional array class Coord: def __init__(self, cols): self.cols = cols def __call__(self, i, j): return self.cols * i +j # traverses the DAG formed by the backpointers in bp # to generate all co-optimal alignments of s1 and s2 def generate_alignments(bp, s1, s2, i, j, n, m): coord = Coord(m) backptrs = bp[coord(i,j)] if len(backptrs) == 0: seg1, seg2 = align_segment(s1, s2, i, n, j, m) return [(seg1, seg2)] else: alignments = [] for (i_next,j_next) in backptrs: tails = generate_alignments(bp, s1, s2, i_next, j_next, n, m) seg1, seg2 = align_segment(s1, s2, i, i_next, j, j_next) for t1, t2 in tails: alignments.append( (seg1 + t1, seg2 + t2) ) return alignments # helper function for generate_alignment def align_segment(s1, s2, i_start, i_end, j_start, j_end): seg1 = "" seg2 = "" for i in range(i_start, i_end): seg1 += s1[i] for j in range(j_start, j_end): seg2 += s2[j] seg1 += "-"*(j_end - j_start - 1) seg2 += "-"*(i_end - i_start - 1) return seg1, seg2 # print a table with n rows and m cols # for strings s1 of length n # and s2 of length m def print_table(table, s1, s2, n, m): coord = Coord(m) # print table header print line = " " for j in range(0,m): line += "%3s" % (s2[j]) print line print "-" * len(line) # print table rows for i in range(0,n): line = "%s|" % (s1[i]) for j in range(0,m): line += "%3d" % (table[coord(i,j)]) print line # put a blank line at the end print def print_alignments(alignments): for s1,s2 in alignments: print s1 print s2 print def generate_alignments_from_paper(): align("AJCJNRCKCRBP","ABCNJROCLCRPM") if __name__ == '__main__': generate_alignments_from_paper()