That was a difficult question! This is the path that I have made.
The first observation is that outgroup is always the only node attached to the end of the newick line. Let me call the rest of the groups a group and try to generate all their permutations. Then just add outgroup.
from itertools import permutations def ingroup_generator(species, n): for perm in permutations(species, n): yield tuple([tuple(perm), tuple(s for s in species if s not in perm)]) def format_newick(s, outgroup=''): return '(' + ', '.join('({})'.format(', '.join(p)) for p in s) + ',({}));'.format(outgroup) species = ["A","B","C","D","E"] outgroup = "E" ingroup = [s for s in species if s != outgroup] itertools_newicks= [] for n in range(1, len(ingroup)): for p in ingroup_generator(ingroup, n): itertools_newicks.append(format_newick(p, outgroup)) for newick in itertools_newicks: print newick
This returns 40 new lines:
((A), (B, C, D),(E)); ((B), (A, C, D),(E)); ((C), (A, B, D),(E)); ((D), (A, B, C),(E)); ((A, B), (C, D),(E)); ((A, C), (B, D),(E)); ((A, D), (B, C),(E)); ((B, A), (C, D),(E)); ((B, C), (A, D),(E)); ((B, D), (A, C),(E)); ((C, A), (B, D),(E)); ((C, B), (A, D),(E)); ((C, D), (A, B),(E)); ((D, A), (B, C),(E)); ((D, B), (A, C),(E)); ((D, C), (A, B),(E)); ((A, B, C), (D),(E)); ((A, B, D), (C),(E)); ((A, C, B), (D),(E)); ((A, C, D), (B),(E)); ((A, D, B), (C),(E)); ((A, D, C), (B),(E)); ((B, A, C), (D),(E)); ((B, A, D), (C),(E)); ((B, C, A), (D),(E)); ((B, C, D), (A),(E)); ((B, D, A), (C),(E)); ((B, D, C), (A),(E)); ((C, A, B), (D),(E)); ((C, A, D), (B),(E)); ((C, B, A), (D),(E)); ((C, B, D), (A),(E)); ((C, D, A), (B),(E)); ((C, D, B), (A),(E)); ((D, A, B), (C),(E)); ((D, A, C), (B),(E)); ((D, B, A), (C),(E)); ((D, B, C), (A),(E)); ((D, C, A), (B),(E)); ((D, C, B), (A),(E));
Some of them are duplicates, but later we will remove duplicates.
As bli noted in the comments , (((("A","B"),"C"),"D"),("E")); , and its options should also be considered valid solutions. Comments on BioStar pointed me in the right direction that it is the same as generating all possible binary tree groupings. I found a nice fooobar.com/questions/1145816 / ... :
# A very simple representation for Nodes. Leaves are anything which is not a Node. class Node(object): def __init__(self, left, right): self.left = left self.right = right def __repr__(self): return '(%s, %s)' % (self.left, self.right)
Then
enum_newicks= [] for t in enum_unordered(ingroup): enum_newicks.append('({},({}));'.format(t, outgroup)) for newick in enum_newicks: print newick
delivers the following 15 innovations:
((A, (B, (C, D))),(E)); (((A, B), (C, D)),(E)); ((B, (A, (C, D))),(E)); ((B, ((A, C), D)),(E)); ((B, (C, (A, D))),(E)); ((A, ((B, C), D)),(E)); (((A, (B, C)), D),(E)); ((((A, B), C), D),(E)); (((B, (A, C)), D),(E)); (((B, C), (A, D)),(E)); ((A, (C, (B, D))),(E)); (((A, C), (B, D)),(E)); ((C, (A, (B, D))),(E)); ((C, ((A, B), D)),(E)); ((C, (B, (A, D))),(E));
So, now we already have 40 + 15 = 55 possible new lines, and we have to remove duplicates.
The first dead end I tried was to create a canonical representation of each new line so that I could use them as keys in a dictionary. The idea was to recursively sort the rows in all nodes. But first, I had to grab all the (nested) nodes. I could not use regular expressions because nested structures are by definition not regular .
So, I used the pyparsing package and came up with the following:
from pyparsing import nestedExpr def sort_newick(t): if isinstance(t, str): return sorted(t) else: if all(isinstance(c, str) for c in t): return sorted(t) if all(isinstance(l, list) for l in t): return [sort_newick(l) for l in sorted(t, key=lambda k: sorted(k))] else: return [sort_newick(l) for l in t] def canonical_newick(n): n = n.replace(',', '') p = nestedExpr().parseString(n).asList() s = sort_newick(p) return str(s)
It gave for
from collections import defaultdict all_newicks = itertools_newicks + enum_newicks d = defaultdict(list) for newick in all_newicks: d[canonical_newick(newick)].append(newick) for canonical, newicks in d.items(): print canonical for newick in newicks: print '\t', newick print
Dictionary with 22 keys:
[[[['A'], [['C'], ['B', 'D']]], ['E']]] ((A, (C, (B, D))),(E)); [[[['B'], [['A'], ['C', 'D']]], ['E']]] ((B, (A, (C, D))),(E)); [[[['B'], [['A', 'C'], ['D']]], ['E']]] ((B, ((A, C), D)),(E)); [[['A', 'C', 'D'], ['B'], ['E']]] ((B), (A, C, D),(E)); ((A, C, D), (B),(E)); ((A, D, C), (B),(E)); ((C, A, D), (B),(E)); ((C, D, A), (B),(E)); ((D, A, C), (B),(E)); ((D, C, A), (B),(E)); [[['A', 'B'], ['C', 'D'], ['E']]] ((A, B), (C, D),(E)); ((B, A), (C, D),(E)); ((C, D), (A, B),(E)); ((D, C), (A, B),(E)); [[[[['A'], ['B', 'C']], ['D']], ['E']]] (((A, (B, C)), D),(E)); [[[['A', 'C'], ['B', 'D']], ['E']]] (((A, C), (B, D)),(E)); [[['A'], ['B', 'C', 'D'], ['E']]] ((A), (B, C, D),(E)); ((B, C, D), (A),(E)); ((B, D, C), (A),(E)); ((C, B, D), (A),(E)); ((C, D, B), (A),(E)); ((D, B, C), (A),(E)); ((D, C, B), (A),(E)); [[[['A', 'D'], ['B', 'C']], ['E']]] (((B, C), (A, D)),(E)); [[['A', 'B', 'C'], ['D'], ['E']]] ((D), (A, B, C),(E)); ((A, B, C), (D),(E)); ((A, C, B), (D),(E)); ((B, A, C), (D),(E)); ((B, C, A), (D),(E)); ((C, A, B), (D),(E)); ((C, B, A), (D),(E)); [[['A', 'C'], ['B', 'D'], ['E']]] ((A, C), (B, D),(E)); ((B, D), (A, C),(E)); ((C, A), (B, D),(E)); ((D, B), (A, C),(E)); [[['A', 'B', 'D'], ['C'], ['E']]] ((C), (A, B, D),(E)); ((A, B, D), (C),(E)); ((A, D, B), (C),(E)); ((B, A, D), (C),(E)); ((B, D, A), (C),(E)); ((D, A, B), (C),(E)); ((D, B, A), (C),(E)); [[[['A'], [['B'], ['C', 'D']]], ['E']]] ((A, (B, (C, D))),(E)); [[[[['A', 'B'], ['C']], ['D']], ['E']]] ((((A, B), C), D),(E)); [[[[['B'], ['A', 'C']], ['D']], ['E']]] (((B, (A, C)), D),(E)); [[[['C'], [['B'], ['A', 'D']]], ['E']]] ((C, (B, (A, D))),(E)); [[[['C'], [['A', 'B'], ['D']]], ['E']]] ((C, ((A, B), D)),(E)); [[[['A'], [['B', 'C'], ['D']]], ['E']]] ((A, ((B, C), D)),(E)); [[[['A', 'B'], ['C', 'D']], ['E']]] (((A, B), (C, D)),(E)); [[[['B'], [['C'], ['A', 'D']]], ['E']]] ((B, (C, (A, D))),(E)); [[[['C'], [['A'], ['B', 'D']]], ['E']]] ((C, (A, (B, D))),(E)); [[['A', 'D'], ['B', 'C'], ['E']]] ((A, D), (B, C),(E)); ((B, C), (A, D),(E)); ((C, B), (A, D),(E)); ((D, A), (B, C),(E));
But closer inspection revealed some problems. Let's look, for example, on newicks '(((A, B), (C, D)),(E)); and ((D, C), (A, B),(E)); . In our d dictionary, they have another canonical key, respectively [[[['A', 'B'], ['C', 'D']], ['E']]] and [[['A', 'B'], ['C', 'D'], ['E']]] . But in reality these are duplicate trees. We can confirm this by looking at the Robinson-Fould distance between two trees. If it is zero, the trees are identical.
We use robinson_foulds function from ete3 toolkit
from ete3 import Tree tree1 = Tree('(((A, B), (C, D)),(E));') tree2 = Tree('((D, C), (A, B),(E));') rf, max_parts, common_attrs, edges1, edges2, discard_t1, discard_t2 = tree1.robinson_foulds(tree2, unrooted_trees=True) print rf
OK, so Robinson Folds is the best way to test a new tree, and then my approach to the canonical tree. Allow all newlines to be wrapped in a custom MyTree object, where equality is defined as having a Robinson-Fold distance from zero:
class MyTree(Tree): def __init__(self, *args, **kwargs): super(MyTree, self).__init__(*args, **kwargs) def __eq__(self, other): rf = self.robinson_foulds(other, unrooted_trees=True) return not bool(rf[0]) trees = [MyTree(newick) for newick in all_newicks]
It would be ideal if we could also define a __hash__() function that returns the same value for duplicate trees, then set(trees) will automatically delete all duplicates.
Unfortunately, I could not find a good way to define __hash__() , but with __eq__ in place, I could use index() :
unique_trees = [trees[i] for i in range(len(trees)) if i == trees.index(trees[i])] unique_newicks = [tree.write(format=9) for tree in unique_trees] for unique_newick in unique_newicks: print unique_newick
So here we are at the end of our journey. I cannot fully prove that this is the right solution, but I am sure that the following 19 new elements are all possible different permutations:
((A),(B,C,D),(E)); ((B),(A,C,D),(E)); ((C),(A,B,D),(E)); ((D),(A,B,C),(E)); ((A,B),(C,D),(E)); ((A,C),(B,D),(E)); ((A,D),(B,C),(E)); ((A,(B,(C,D))),(E)); ((B,(A,(C,D))),(E)); ((B,((A,C),D)),(E)); ((B,(C,(A,D))),(E)); ((A,((B,C),D)),(E)); (((A,(B,C)),D),(E)); ((((A,B),C),D),(E)); (((B,(A,C)),D),(E)); ((A,(C,(B,D))),(E)); ((C,(A,(B,D))),(E)); ((C,((A,B),D)),(E)); ((C,(B,(A,D))),(E));
If we compare each newick in pairs with all the other newcomers, we get confirmation that there are more duplicates in this list
from itertools import product for n1, n2 in product(unique_newicks, repeat=2): if n1 != n2: mt1 = MyTree(n1) mt2 = MyTree(n2) assert mt1 != mt2