from collections import namedtuple
# -------------------------
# Data: Multiple sequence alignment
# -------------------------
msa = {
"human" : "TCCTGCCTCATCCTATTATTTATCGCACCTAC-GTTCAATATTACAGGCGAA-CATA-CTTACTAAAGTGTGTTAATTAATTAATGCTTGTAG",
"bonobo" : "TCCTGCCCCATTACGTTATTTATCGCACCTAC-GTTCAATATTATTACCTAG-CATGATTTACTAAAGCGTGTTAATTAATTAATGCTTGTAG",
"chimp" : "TCCTGCCCCATTGTATTATTTATCGCACCTAC-GTTCAATATTACGACCTAG-CATA-CCTACTAAAGTGTGTTGATTAATTAATGCTTGCAG",
"gibbon" : "TTCTGACCCATCCTATTGTTGATCGCGCCTAC-GTTCAATATCCCAGCCGAG-CATA-CTTACACTAAGGTGTTAATTAATTCATGCTTGTTG",
"oran-pa" : "TCCTACCTCATGCCATTATTAATCGCGCCTAATATCCAATATCCTAGCCCCACCCTC-AGTGTTTGAAGCTGCTATTTAATTTATGCTAG-AG",
"gorilla" : "TCCTGCCCCATGCTACCATTTATCGCACCTAC-GTTCAATATTACAGCCGAG-CGCA-CAGTGTTCATGGTGTTAATTAATTCATGCTTGTTG",
"oran-pp" : "TCCTGCCCCATGGCGTTATTGATCGCGCCTAACGTCCAATGTTCTAGCGCCC-CCTC-CCTATTGAAAGTTGTTATTTAATTTATGCTAG-AG"
}
# All sequences same length
L = len(next(iter(msa.values())))
# -------------------------
# Fitch algorithm
# -------------------------
def fitch_pass(node, column):
"""Returns (set of states, score) for a node at one column"""
if isinstance(node, str): # leaf
return { msa[node][column] }, 0
left, right = node
Lset, Lscore = fitch_pass(left, column)
Rset, Rscore = fitch_pass(right, column)
inter = Lset & Rset
if inter:
return inter, Lscore + Rscore
else:
return (Lset | Rset), Lscore + Rscore + 1
def fitch_score(tree):
"""Compute total parsimony score for a tree over all columns"""
score = 0
for c in range(L):
_, s = fitch_pass(tree, c)
score += s
return score
# -------------------------
# Define trees as nested tuples
# -------------------------
# Topology 1: ((((human,(chimp,bonobo)),gorilla),(oran-pa,oran-pp)),gibbon)
tree1 = (((( "human", ("chimp","bonobo") ), "gorilla"), ("oran-pa","oran-pp")), "gibbon")
# Topology 2: ((((human,gorilla),(chimp,bonobo)),(oran-pa,oran-pp)),gibbon)
tree2 = (((("human","gorilla"),("chimp","bonobo")),("oran-pa","oran-pp")), "gibbon")
# Topology 3: ((((human,bonobo),gorilla),(oran-pa,oran-pp)),(chimp,gibbon))
tree3 = (((("human","bonobo"),"gorilla"),("oran-pa","oran-pp")),("chimp","gibbon"))
# Topology 4: (human,(((gibbon,(oran-pa,oran-pp)),gorilla),(chimp,bonobo)))
tree4 = ("human",((( "gibbon",("oran-pa","oran-pp") ),"gorilla"),("chimp","bonobo")))
# -------------------------
# Run and print
# -------------------------
for i,tree in enumerate([tree1, tree2, tree3, tree4], start=1):
print(f"Tree {i} score = {fitch_score(tree)}")
ZnJvbSBjb2xsZWN0aW9ucyBpbXBvcnQgbmFtZWR0dXBsZQoKIyAtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCiMgRGF0YTogTXVsdGlwbGUgc2VxdWVuY2UgYWxpZ25tZW50CiMgLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQptc2EgPSB7CiAgICAiaHVtYW4iICAgOiAiVENDVEdDQ1RDQVRDQ1RBVFRBVFRUQVRDR0NBQ0NUQUMtR1RUQ0FBVEFUVEFDQUdHQ0dBQS1DQVRBLUNUVEFDVEFBQUdUR1RHVFRBQVRUQUFUVEFBVEdDVFRHVEFHIiwKICAgICJib25vYm8iICA6ICJUQ0NUR0NDQ0NBVFRBQ0dUVEFUVFRBVENHQ0FDQ1RBQy1HVFRDQUFUQVRUQVRUQUNDVEFHLUNBVEdBVFRUQUNUQUFBR0NHVEdUVEFBVFRBQVRUQUFUR0NUVEdUQUciLAogICAgImNoaW1wIiAgIDogIlRDQ1RHQ0NDQ0FUVEdUQVRUQVRUVEFUQ0dDQUNDVEFDLUdUVENBQVRBVFRBQ0dBQ0NUQUctQ0FUQS1DQ1RBQ1RBQUFHVEdUR1RUR0FUVEFBVFRBQVRHQ1RUR0NBRyIsCiAgICAiZ2liYm9uIiAgOiAiVFRDVEdBQ0NDQVRDQ1RBVFRHVFRHQVRDR0NHQ0NUQUMtR1RUQ0FBVEFUQ0NDQUdDQ0dBRy1DQVRBLUNUVEFDQUNUQUFHR1RHVFRBQVRUQUFUVENBVEdDVFRHVFRHIiwKICAgICJvcmFuLXBhIiA6ICJUQ0NUQUNDVENBVEdDQ0FUVEFUVEFBVENHQ0dDQ1RBQVRBVENDQUFUQVRDQ1RBR0NDQ0NBQ0NDVEMtQUdUR1RUVEdBQUdDVEdDVEFUVFRBQVRUVEFUR0NUQUctQUciLAogICAgImdvcmlsbGEiIDogIlRDQ1RHQ0NDQ0FUR0NUQUNDQVRUVEFUQ0dDQUNDVEFDLUdUVENBQVRBVFRBQ0FHQ0NHQUctQ0dDQS1DQUdUR1RUQ0FUR0dUR1RUQUFUVEFBVFRDQVRHQ1RUR1RURyIsCiAgICAib3Jhbi1wcCIgOiAiVENDVEdDQ0NDQVRHR0NHVFRBVFRHQVRDR0NHQ0NUQUFDR1RDQ0FBVEdUVENUQUdDR0NDQy1DQ1RDLUNDVEFUVEdBQUFHVFRHVFRBVFRUQUFUVFRBVEdDVEFHLUFHIgp9CgojIEFsbCBzZXF1ZW5jZXMgc2FtZSBsZW5ndGgKTCA9IGxlbihuZXh0KGl0ZXIobXNhLnZhbHVlcygpKSkpCgojIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0KIyBGaXRjaCBhbGdvcml0aG0KIyAtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCgpkZWYgZml0Y2hfcGFzcyhub2RlLCBjb2x1bW4pOgogICAgIiIiUmV0dXJucyAoc2V0IG9mIHN0YXRlcywgc2NvcmUpIGZvciBhIG5vZGUgYXQgb25lIGNvbHVtbiIiIgogICAgaWYgaXNpbnN0YW5jZShub2RlLCBzdHIpOiAgIyBsZWFmCiAgICAgICAgcmV0dXJuIHsgbXNhW25vZGVdW2NvbHVtbl0gfSwgMAogICAgCiAgICBsZWZ0LCByaWdodCA9IG5vZGUKICAgIExzZXQsIExzY29yZSA9IGZpdGNoX3Bhc3MobGVmdCwgY29sdW1uKQogICAgUnNldCwgUnNjb3JlID0gZml0Y2hfcGFzcyhyaWdodCwgY29sdW1uKQogICAgaW50ZXIgPSBMc2V0ICYgUnNldAogICAgaWYgaW50ZXI6CiAgICAgICAgcmV0dXJuIGludGVyLCBMc2NvcmUgKyBSc2NvcmUKICAgIGVsc2U6CiAgICAgICAgcmV0dXJuIChMc2V0IHwgUnNldCksIExzY29yZSArIFJzY29yZSArIDEKCgpkZWYgZml0Y2hfc2NvcmUodHJlZSk6CiAgICAiIiJDb21wdXRlIHRvdGFsIHBhcnNpbW9ueSBzY29yZSBmb3IgYSB0cmVlIG92ZXIgYWxsIGNvbHVtbnMiIiIKICAgIHNjb3JlID0gMAogICAgZm9yIGMgaW4gcmFuZ2UoTCk6CiAgICAgICAgXywgcyA9IGZpdGNoX3Bhc3ModHJlZSwgYykKICAgICAgICBzY29yZSArPSBzCiAgICByZXR1cm4gc2NvcmUKCgojIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0KIyBEZWZpbmUgdHJlZXMgYXMgbmVzdGVkIHR1cGxlcwojIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0KIyBUb3BvbG9neSAxOiAoKCgoaHVtYW4sKGNoaW1wLGJvbm9ibykpLGdvcmlsbGEpLChvcmFuLXBhLG9yYW4tcHApKSxnaWJib24pCnRyZWUxID0gKCgoKCAiaHVtYW4iLCAoImNoaW1wIiwiYm9ub2JvIikgKSwgImdvcmlsbGEiKSwgKCJvcmFuLXBhIiwib3Jhbi1wcCIpKSwgImdpYmJvbiIpCgojIFRvcG9sb2d5IDI6ICgoKChodW1hbixnb3JpbGxhKSwoY2hpbXAsYm9ub2JvKSksKG9yYW4tcGEsb3Jhbi1wcCkpLGdpYmJvbikKdHJlZTIgPSAoKCgoImh1bWFuIiwiZ29yaWxsYSIpLCgiY2hpbXAiLCJib25vYm8iKSksKCJvcmFuLXBhIiwib3Jhbi1wcCIpKSwgImdpYmJvbiIpCgojIFRvcG9sb2d5IDM6ICgoKChodW1hbixib25vYm8pLGdvcmlsbGEpLChvcmFuLXBhLG9yYW4tcHApKSwoY2hpbXAsZ2liYm9uKSkKdHJlZTMgPSAoKCgoImh1bWFuIiwiYm9ub2JvIiksImdvcmlsbGEiKSwoIm9yYW4tcGEiLCJvcmFuLXBwIikpLCgiY2hpbXAiLCJnaWJib24iKSkKCiMgVG9wb2xvZ3kgNDogKGh1bWFuLCgoKGdpYmJvbiwob3Jhbi1wYSxvcmFuLXBwKSksZ29yaWxsYSksKGNoaW1wLGJvbm9ibykpKQp0cmVlNCA9ICgiaHVtYW4iLCgoKCAiZ2liYm9uIiwoIm9yYW4tcGEiLCJvcmFuLXBwIikgKSwiZ29yaWxsYSIpLCgiY2hpbXAiLCJib25vYm8iKSkpCgoKIyAtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCiMgUnVuIGFuZCBwcmludAojIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0KZm9yIGksdHJlZSBpbiBlbnVtZXJhdGUoW3RyZWUxLCB0cmVlMiwgdHJlZTMsIHRyZWU0XSwgc3RhcnQ9MSk6CiAgICBwcmludChmIlRyZWUge2l9IHNjb3JlID0ge2ZpdGNoX3Njb3JlKHRyZWUpfSIpCg==