"""Convert a mark-down text format for phylogenies into other tree formats.

This module can be used to convert a mark-down text file format for
phylogenies into a Clustal-style dendrogram (Newick) file, a Wikipedia
{{clade}} tree, a phyloXML document, or back to canonicalised markdown.

Mark-down
=========

The markdown format is simply a tab-indented list of lists (one tab character
per level of nesting)::

    -Embryophyta
    	-Liverworts
    	-_____
    		-_____
    			-Hornworts
    			-Tracheophytes
    				Euphyllophytes
    					-Spermatophytes
    						-Gymnosperms
    						-Angiosperms
    					-Ferns
    				Lycopods
    		-Mosses

Node names can include only letters, numbers, underscores, full stops, the
space character, an '†' to indicate extinct nodes, and a '*' to indicate
paraphyletic groups (if you must!)

The dashes before each name are optional. Blank lines are ignored.

Unnamed internal nodes must be noted in the markdown using underscore
characters (5 are shown above, but you can use any number).

Output formats of script
========================

* Passing ``-f nwk`` as an option to the script will return a Newick tree.
  This format does not necessarily support internal node labels, but they can
  be included in some extended Newick formats by appending the internal node
  label to its equivalent closing bracket::

    ((Liverworts,((Hornworts,(((Gymnosperms,Angiosperms)Spermatophytes,Ferns)Euphyllophytes,Lycopods)Tracheophytes),Mosses))Embryophyta);

* Passing ``-f nwkstrict`` as an option to the script will return a Newick
  tree with no internal node labels::

    ((Liverworts,((Hornworts,(((Gymnosperms,Angiosperms),Ferns),Lycopods)),Mosses)));

* Passing ``-f wp`` as an option to the script will return Wikipedia markup
  format using the {{clade}} template::

    {{clade
    	|label1=Embryophyta
    	|1={{clade
    		|1=Liverworts
    		|2={{clade
    			|1={{clade
    				|1=Hornworts
    				|label2=Tracheophytes
    				|2={{clade
    					|label1=Euphyllophytes
    					|1={{clade
    						|label1=Spermatophytes
    						|1={{clade
    							|1=Gymnosperms
    							|2=Angiosperms
    						}}
    						|2=Ferns
    					}}
    					|2=Lycopods
    				}}
    			}}
    			|2=Mosses
    		}}
    	}}
    }}

* Passing ``-f md`` as an option to the script will return a canonicalised
  version of the markdown, with leading hyphens and five-underscore
  placeholders::

    -Embryophyta
    	-Liverworts
    	-_____
    		-_____
    			-Hornworts
    			-Tracheophytes
    				-Euphyllophytes
    					-Spermatophytes
    						-Gymnosperms
    						-Angiosperms
    					-Ferns
    				-Lycopods
    		-Mosses

* Passing ``-f xml`` as an option to the script will return a phyloXML
  document made of nested ``<clade>`` elements.

Module
======

For use as a module, you just need::

    from parse_phylogeny import phylo
    filename = 'plant_phylogeny.txt'
    format = 'nwk' # or nwkstrict, md, wp, xml
    formatted_output = phylo(filename, format)
    print(formatted_output)

Script
======

usage: parse_phylogeny.py [-h] [-f [format]] [filename]

Generate phylogeny formats from markdown.

positional arguments:
  filename     file containing simple phylogeny markdown

optional arguments:
  -h, --help   show this help message and exit
  -f [format]  specify output format as nwk, nwkstrict, md, wp or xml
               (default: nwk)

License
=======

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

__version__ = '1.2.0'
__author__ = 'Steve Cook '

import argparse
import os.path
import re
from xml.sax.saxutils import escape as _xml_escape

# One tab per nesting level, an optional leading hyphen, then the node label:
# word characters, spaces, dagger (U+2020), asterisk and full stop.
_TOKEN_RE = re.compile(r'^(\t*)(?:[-])?\s*([\w \u2020\*\.]+)')

# A label consisting only of underscores marks an unnamed node.
_PLACEHOLDER_RE = re.compile(r'^-?\s*_+$')

# Canonical spelling of an unnamed node in markdown output.
_PLACEHOLDER = '_____'


class Node:
    """Internal node of parse tree"""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '{cls}({name!r})'.format(cls=type(self).__name__, name=self.name)


class Terminal:
    """Terminal node of parse tree"""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '{cls}({name!r})'.format(cls=type(self).__name__, name=self.name)


def tokenise(filename):
    """Yield the tokens of a markdown phylogeny file, one at a time.

    Each token is either a node name (``''`` for an underscore placeholder),
    ``'('`` when the indentation deepens by one tab, or ``')'`` for every
    level of indentation that is closed (including the implicit dedents at
    end of file).

    Blank lines are skipped. The file is read as UTF-8; a leading byte-order
    mark is tolerated.

    Raises:
        ValueError: if a line holds no acceptable node label, if the first
            node is indented, or if indentation grows by more than one level.
    """
    depth = 0
    seen_node = False
    # utf-8-sig reads plain UTF-8 unchanged but drops a BOM, which would
    # otherwise stop the first line from matching.
    with open(filename, 'r', encoding='utf-8-sig') as phylogeny:
        for linenum, line in enumerate(phylogeny, start=1):
            if not line.strip():
                continue  # a blank (e.g. trailing) line carries no node
            match = _TOKEN_RE.search(line)
            if not match:
                raise ValueError('Invalid markdown: not an acceptable node label at line {linenum} near {line}'.format(
                    linenum=str(linenum), line=line))
            indent = len(match.group(1))
            name = match.group(2).rstrip()
            if _PLACEHOLDER_RE.match(name):
                name = ''
            if indent > 0 and not seen_node:
                # '(' must always follow the name of the node it opens
                raise ValueError('Invalid markdown: first node must not be indented at line {linenum} near {line}'.format(
                    linenum=str(linenum), line=line))
            if indent > depth + 1:
                raise ValueError('Invalid markdown: Cannot indent more than one level at a time at line {linenum} near {line}'.format(
                    linenum=str(linenum), line=line))
            if indent == depth + 1:
                depth += 1
                yield '('
            else:
                while indent < depth:
                    depth -= 1
                    yield ')'
            seen_node = True
            yield name
    while depth > 0:
        yield ')'  # indented file has 'virtual' dedents at EOF
        depth -= 1


def parse(filename):
    """Construct a parse tree from the token stream of *filename*.

    Returns a list whose items are ``Terminal`` objects (leaves) or nested
    lists (clades). Every nested list starts with the ``Node`` naming the
    clade, followed by its children in file order.
    """
    stack = []
    for token in tokenise(filename):
        if token == '(':
            # the name just read turns out to head a clade, not a leaf
            stack.append(Node(stack.pop().name))
        elif token == ')':
            clade = []
            while not isinstance(stack[-1], Node):
                clade.append(stack.pop())
            clade.append(stack.pop())
            clade.reverse()  # popping produced children last-to-first
            stack.append(clade)
        else:
            stack.append(Terminal(token))
    return stack


def _nwk_clade(items, strict):
    """Return *items* as one bracketed Newick group, labelled unless *strict*."""
    label = ''
    members = []
    for item in items:
        if isinstance(item, Node):
            label = item.name
        elif isinstance(item, Terminal):
            members.append(item.name)
        elif isinstance(item, list):
            members.append(_nwk_clade(item, strict))
    return '(' + ','.join(members) + ')' + ('' if strict else label)


def nwk(tree, strict=False, **kwargs):
    """Recursively walk a parse tree, serialising it to a nwk string.

    ``nwk(tree, strict=True)`` will strip internal node labels;
    ``nwk(tree, strict=False)`` (the default) will retain them, appended to
    the closing bracket of their clade. Other keyword arguments are accepted
    and ignored.
    """
    return _nwk_clade(tree, strict) + ';'


def _clade_label(items):
    """Return the name of the Node heading *items*, or '' if there is none."""
    if items and isinstance(items[0], Node):
        return items[0].name
    return ''


def _wp_lines(items, depth, lines):
    """Append the {{clade}} parameter lines for *items* to *lines*."""
    tabs = '\t' * (depth + 1)
    position = 0  # {{clade}} numbers its children from 1
    for item in items:
        if isinstance(item, Terminal):
            position += 1
            lines.append('{tabs}|{num}={name}'.format(
                tabs=tabs, num=str(position), name=item.name))
        elif isinstance(item, list):
            position += 1
            label = _clade_label(item)
            if label:
                # a named clade gets a |labelN= line ahead of its |N= line
                lines.append('{tabs}|label{num}={name}'.format(
                    tabs=tabs, num=str(position), name=label))
            lines.append('{tabs}|{num}={{{{clade'.format(
                tabs=tabs, num=str(position)))
            _wp_lines(item, depth + 1, lines)
            lines.append('{tabs}}}}}'.format(tabs=tabs))


def wp(tree):
    """Recursively walk a parse tree, serialising it to a wiki clade string"""
    lines = ['{{clade']
    _wp_lines(tree, 0, lines)
    lines.append('}}')
    return '\n'.join(lines) + '\n'


def _md_lines(items, depth, lines):
    """Append the canonical markdown lines for *items* to *lines*."""
    for item in items:
        if isinstance(item, Terminal):
            # an unnamed leaf needs the placeholder too, or the output could
            # not be parsed again
            lines.append('{tabs}-{name}'.format(
                tabs='\t' * (depth + 1), name=item.name or _PLACEHOLDER))
        elif isinstance(item, Node):
            lines.append('{tabs}-{name}'.format(
                tabs='\t' * depth, name=item.name or _PLACEHOLDER))
        elif isinstance(item, list):
            _md_lines(item, depth + 1, lines)


def md(tree):
    """Recursively walk a parse tree, serialising it to a markdown string.

    Every node gets a leading hyphen and every unnamed node is written as
    five underscores. The result has no trailing newline.
    """
    lines = []
    _md_lines(tree, -1, lines)
    return '\n'.join(lines)


def _xml_lines(items, depth, lines):
    """Append the phyloXML <clade> lines for *items* to *lines*."""
    for item in items:
        if isinstance(item, Terminal):
            tabs = '\t' * (depth + 1)
            if item.name:
                lines.append('{tabs}<clade><name>{name}</name></clade>'.format(
                    tabs=tabs, name=_xml_escape(item.name)))
            else:
                lines.append('{tabs}<clade/>'.format(tabs=tabs))
        elif isinstance(item, Node):
            tabs = '\t' * depth
            if item.name:
                lines.append('{tabs}<clade><name>{name}</name>'.format(
                    tabs=tabs, name=_xml_escape(item.name)))
            else:
                lines.append('{tabs}<clade>'.format(tabs=tabs))
        elif isinstance(item, list):
            _xml_lines(item, depth + 1, lines)
            # closes the <clade> opened by the Node heading this list
            lines.append('{tabs}</clade>'.format(tabs='\t' * (depth + 1)))


def xml(tree, filename):
    """Recursively walk a parse tree, serialising it to a phyloXML string.

    The base name of *filename* becomes the ``<name>`` of the phylogeny.
    Text content is XML-escaped.
    """
    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<phyloxml xmlns="http://www.phyloxml.org"'
        ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
        ' xsi:schemaLocation="http://www.phyloxml.org'
        ' http://www.phyloxml.org/1.10/phyloxml.xsd">',
        '<phylogeny rooted="true">',
        '<name>{name}</name>'.format(
            name=_xml_escape(os.path.basename(filename))),
        '<description>Autogenerated phyloXML generated from mark-down</description>',
    ]
    _xml_lines(tree, -1, lines)
    lines.append('</phylogeny>')
    lines.append('</phyloxml>')
    return '\n'.join(lines) + '\n'


def phylo(filename, format):
    """Process markdown in filename, and return output formatted as nwk,
    nwkstrict, wp, md or xml.

    Raises:
        ValueError: if *format* is not one of the supported names, or if the
            markdown is invalid.
    """
    serialisers = {
        'nwk': lambda tree: nwk(tree, strict=False),
        'nwkstrict': lambda tree: nwk(tree, strict=True),
        'wp': wp,
        'md': md,
        'xml': lambda tree: xml(tree, filename),
    }
    serialise = serialisers.get(format)
    if serialise is None:
        raise ValueError('Invalid output format: only nwk, nwkstrict, md, wp and xml are supported')
    return serialise(parse(filename))


def main(argv=None):
    """Run the command-line interface; *argv* defaults to ``sys.argv[1:]``."""
    argparser = argparse.ArgumentParser(description='Generate phylogeny formats from markdown.')
    argparser.add_argument('filename', metavar='filename', type=str, nargs='?',
                           help='file containing simple phylogeny markdown')
    # const makes a bare "-f" fall back to the default instead of None
    argparser.add_argument('-f', metavar='format', type=str, nargs='?', default='nwk', const='nwk',
                           help='specify output format as nwk, nwkstrict, md, wp or xml (default: nwk)')
    args = vars(argparser.parse_args(argv))
    # filename is optional for argparse, so it may be None here
    if args['filename'] is None or not os.path.isfile(args['filename']):
        argparser.error('Invalid input file: must specify an input file that exists')
    try:
        print(phylo(args['filename'], args['f']))
    except ValueError as err:
        argparser.error(str(err))


if __name__ == '__main__':
    main()