diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cf0866d..4dc3e6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: hooks: - id: isort - repo: git@github.com:psf/black.git - rev: 23.12.1 + rev: 24.1.0 hooks: - id: black - repo: git@github.com:pre-commit/pre-commit-hooks.git @@ -15,6 +15,7 @@ repos: - id: check-yaml args: [--allow-multiple-documents] - id: pretty-format-json + exclude: .*.ipynb - id: trailing-whitespace exclude: tests/fixtures/output/haplogroups\..*\.txt|data/variants/isogg\.[0-9]{4}\.[0-9]{2}\.[0-9]{2}\.txt - repo: git@github.com:PyCQA/flake8.git diff --git a/CHANGELOG.md b/CHANGELOG.md index b5b9647..70c5487 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,29 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) -## [2.1.0] - 2024-01 +## [2.1.4] - 2024-01-29 + +### Added +- Support for querying multiple SNPs from the command line +- Support for generating aligned-tip Newick representations of subtrees +- Support for node lookup by SNP-based haplogroup label +- `Node` methods: + - `Node.iter_depth_first` + - `Node.iter_breath_first` + - `Node.remove_children` +- `tree.get_bounded_subtree` function + +### Changed +- `Tree` constructor's `Config` parameter is now optional +- `Node.is_root` and `Node.is_leaf` are now properties + +### Fixed +- Output format of `--snp_query` option + +[2.1.4]: https://github.com/23andMe/yhaplo/compare/2.1.0...2.1.4 + + +## [2.1.0] - 2024-01-12 This release improves haplogroup calling by identifying and correcting various errors in the ISOGG variant metadata and, internally, by pruning poorly performing v5 SNPs. diff --git a/Makefile b/Makefile index 4504c95..7162d76 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,18 @@ # Make variables #---------------------------------------------------------------------- -BOLD_CYAN = \033[1;36m +CYAN := \033[0;36m +GREEN := \033[0;32m +BOLD_CYAN := \033[1;36m BOLD_GREEN := \033[1;32m -GREEN = \033[0;32m -NO_COLOR = \033[0m +NO_COLOR := \033[0m ## General # ---------------------------------------------------------------------- help: ## Print this help message @egrep -h '(\s|^)##\s' $(MAKEFILE_LIST) \ - | sed -E "s/^## (.*)/\n$$(printf "${BOLD_CYAN}")\1$$(printf "${NO_COLOR}")/g" \ - | awk 'BEGIN {FS = ":.*?## "}; {printf "${GREEN}%-25s${NO_COLOR} %s\n", $$1, $$2}' + | sed -E "s/^## (.*)/\n$$(printf "${BOLD_GREEN}")\1$$(printf "${NO_COLOR}")/g" \ + | awk 'BEGIN {FS = ":.*?## "}; {printf "${CYAN}%-25s${NO_COLOR} %s\n", $$1, $$2}' @echo diff --git a/pyproject.toml b/pyproject.toml index 7c3c684..6677ff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,12 +65,14 @@ ignore_missing_imports = true # D107 Missing docstring in __init__ # D202 No blank lines allowed after function docstring # Ignoring allows blank lines after one-line docstrings. +# D203 1 blank line required before class docstring +# Conflicts with black 24. # D211 No blank lines allowed before class docstring # Contradicts D203. # D213 Multi-line docstring summary should start at the second line # Contradicts D212. # https://github.com/PyCQA/pydocstyle/issues/242#issuecomment-288166773 -ignore = "D107,D202,D211,D213" +ignore = "D107,D202,D203,D211,D213" match_dir = "(?!tests|\\.).*" [tool.pytest.ini_options] diff --git a/scripts/validate_yhaplo.sh b/scripts/validate_yhaplo.sh index 1fbf970..f3fc468 100755 --- a/scripts/validate_yhaplo.sh +++ b/scripts/validate_yhaplo.sh @@ -36,10 +36,12 @@ echo -e "\n${BOLD_CYAN}Removed${NO_COLOR}: ${GREEN}${output_dir}\n\n${NO_COLOR}" echo -e "${BOLD_CYAN}Text Input${NO_COLOR}\n" yhaplo --example_text \ + --hg_genos Q \ --breadth_first \ --depth_first \ --depth_first_table \ - --hg_genos Q + --mrca Q J \ + --snp_query L1335,S730,S530,foo echo -e "\n" diff --git a/yhaplo/api/command_line_args.py b/yhaplo/api/command_line_args.py index 59b5bba..c3fea06 100644 --- a/yhaplo/api/command_line_args.py +++ b/yhaplo/api/command_line_args.py @@ -197,14 +197,14 @@ def get_command_line_args(set_defaults: bool = False) -> argparse.Namespace: nargs=2, dest="mrca_haplogroup_list", metavar=("haplogroup1", "haplogroup2"), - help="Output mrca of two haplogroups", + help="Output MRCA of two haplogroups", ) group.add_argument( "-sq", "--snp_query", - dest="query_snp_name", - metavar="snp_name", - help="List phylogenetic path for a query SNP", + dest="query_snp_names", + metavar="snp_names", + help="List phylogenetic path for each SNP in comma-separated list", ) group.add_argument( "-pt", @@ -282,7 +282,6 @@ def get_command_line_arg_defaults() -> argparse.Namespace: class RawTextWithDefaultsHelpFormatter(argparse.RawDescriptionHelpFormatter): - """Help message formatter that retains formatting and adds defaults. Combines argparse.RawTextHelpFormatter and argparse.ArgumentDefaultsHelpFormatter. diff --git a/yhaplo/config.py b/yhaplo/config.py index c92e912..db52175 100644 --- a/yhaplo/config.py +++ b/yhaplo/config.py @@ -22,7 +22,6 @@ class Config: - """Yhaplo configuration class. This class is a container for parameters, constants, filenames, etc. diff --git a/yhaplo/node.py b/yhaplo/node.py index d6b7ed4..c36d8f2 100644 --- a/yhaplo/node.py +++ b/yhaplo/node.py @@ -5,8 +5,11 @@ import argparse import logging from collections import deque +from collections.abc import Iterator from operator import attrgetter -from typing import Optional, TextIO +from typing import Optional, Union + +import numpy as np from yhaplo import sample as sample_module # noqa F401 from yhaplo import snp as snp_module # noqa F401 @@ -17,16 +20,36 @@ class Node: - """Class representing one node of a haplogroup tree. Each node represents the branch that leads to it. - A node knows its: - - Parent - - Depth - - Children - - Diagnostic SNPs + Attributes + ---------- + parent : Node | None + Parent node. None for the root node. + depth : int + Number of edges between this node and the root node. + child_list : list[Node] + List of descendant nodes. + snp_list : list[SNP] + SNPs associated with the branch incident upon this node. + + haplogroup : str + YCC haplogroup name (e.g., "R1b1c"). + label : str + YCC haplogroup name, including alternative names (e.g., "P/K2b2"). + hg_trunc : str + Truncated haplogroup (e.g., "R"). + hg_snp : str + Haplogroup with representative SNP (e.g., "R-V88"). + + dropped_marker_list : list[DroppedMarker] + List of dropped markers. + branch_length : float | None + Branch length. + dfs_rank : int + Rank in depth-first listing of all nodes. """ @@ -37,28 +60,38 @@ class Node: def __init__( self, - parent: Optional[Node], + parent: Union[Node, None], tree: Optional["tree_module.Tree"] = None, ): + """Instantiate Node. + + Parameters + ---------- + parent : Node | None + Parent node. None for the root node. + tree : Tree | None, optional + Required for root node and ignored otherwise. + Tree to which the root node belongs. + + """ self.parent = parent if parent is None: if tree is not None: type(self).set_tree_config_and_args(tree) - self.depth = 0 else: - raise ValueError( - "A tree instance must be supplied when instantiating a root node." - ) + raise ValueError("Root node requires a tree instance") + + self.depth = 0 else: parent.add_child(self) self.depth = parent.depth + 1 if self.depth > type(self).tree.max_depth: type(self).tree.max_depth = self.depth - self.haplogroup: str = "" # YCC haplogroup name (e.g., "R1b1c") - self.label: str = "" # YCC including alt names (e.g., "P/K2b2") - self.hg_trunc: str = "" # Truncated haplogroup (e.g., "R") - self.hg_snp: str = "" # Haplogroup with representative SNP (e.g., "R-V88") + self.haplogroup: str = "" + self.label: str = "" + self.hg_trunc: str = "" + self.hg_snp: str = "" self.child_list: list[Node] = [] self.snp_list: list["snp_module.SNP"] = [] self.dropped_marker_list: list["snp_module.DroppedMarker"] = [] @@ -102,6 +135,24 @@ def str_dot_pipe_depth(self) -> str: # Other properties # ---------------------------------------------------------------------- + @property + def is_root(self) -> bool: + """Whether or not the Node is the root of the tree.""" + + return self.parent is None + + @property + def is_leaf(self) -> bool: + """Whether or not the Node is a leaf.""" + + return self.num_children == 0 + + @property + def num_children(self) -> int: + """Return number of children.""" + + return len(self.child_list) + @property def tree_table_data(self) -> tuple[str, str, str, str, str]: """Return a tuple of data summarizing the node. @@ -171,20 +222,25 @@ def truncate_haplogroup_label(cls, haplogroup: str) -> str: # Setters and mutaters # ---------------------------------------------------------------------- def set_label(self, label: str) -> None: - """Set label, haplogroup, and hg_trunc.""" + """Set label, haplogroup, and hg_trunc. + Also, add tree.haplogroup_to_node entry mapping haplogroup label to node. + + """ self.label = label label_list = label.split("/") - if self.is_root(): - self.haplogroup = self.hg_trunc = self.config.root_haplogroup - type(self).tree.haplogroup_to_node[self.haplogroup] = self + if self.is_root: + label = self.config.root_haplogroup + self.haplogroup = label + self.hg_trunc = label + label_list = [label] else: self.haplogroup = label_list[0] self.hg_trunc = type(self).truncate_haplogroup_label(self.haplogroup) - for key in label_list: - type(self).tree.haplogroup_to_node[key] = self + for label in label_list: + type(self).tree.haplogroup_to_node[label] = self def set_branch_length(self, branch_length: float) -> None: """Set branch length.""" @@ -217,9 +273,11 @@ def priority_sort_snp_list_and_set_hg_snp(self) -> None: The standard form incudes the truncated haplogroup label and the label of a representative SNP, separated by a hyphen (e.g. R-V88). + Also, add tree.haplogroup_to_node entry mapping hg_snp to node. + """ # Root: no markers - if self.is_root(): + if self.is_root: self.hg_snp = self.haplogroup # Normal case @@ -238,7 +296,7 @@ def priority_sort_snp_list_and_set_hg_snp(self) -> None: # No markers to use else: if self.parent is not None and self.parent.hg_snp: - symbol = "*" if self.is_leaf() else "+" + symbol = "*" if self.is_leaf else "+" self.hg_snp = self.parent.hg_snp + symbol # Uniquify if necessary @@ -258,37 +316,46 @@ def priority_sort_snp_list_and_set_hg_snp(self) -> None: self.hg_snp = self.haplogroup type(self).hg_snp_set.add(self.hg_snp) + type(self).tree.haplogroup_to_node[self.hg_snp] = self # Queries # ---------------------------------------------------------------------- - def is_root(self) -> bool: - """Return a Boolean indicating whether or not the Node is root.""" - - return self.parent is None - - def is_leaf(self) -> bool: - """Return a Boolean indicating whether or not the Node is a leaf.""" - - return len(self.child_list) == 0 - def get_branch_length( self, align_tips: bool = False, + subtree_max_depth: Optional[int] = None, platform: Optional[str] = None, ) -> Optional[float]: - """Get branch length.""" + """Get branch length. + + Parameters + ---------- + align_tips : bool, optional + When True, set internal branch lengths to one and leaf branch lengths + in such a manner as to align the tips of the tree. + subtree_max_depth : int | None, optional + Maximum depth of subtree. Used to set leaf branch lengths when aligning tips. + Default to maximum depth of full tree. + platform : str | None, optional + 23andMe platform to use for computing branch length. - if self.branch_length: + Returns + ------- + branch_length : float | None + Branch length. + + """ + if self.branch_length is not None: branch_length = self.branch_length - elif align_tips and self.is_leaf(): - branch_length = type(self).tree.max_depth - self.depth + 1 + elif align_tips and self.is_leaf: + subtree_max_depth = subtree_max_depth or type(self).tree.max_depth + branch_length = subtree_max_depth - self.depth + 1 elif align_tips: branch_length = 1 elif platform: - branch_length = 0 - for snp in self.snp_list: - if snp.is_on_platform(platform): - branch_length += 1 + branch_length = np.sum( + [snp.is_on_platform(platform) for snp in self.snp_list] + ) else: branch_length = None @@ -339,7 +406,7 @@ def assess_genotypes( return anc_snp_list, der_snp_list - # Children + # Children methods # ---------------------------------------------------------------------- def add_child(self, child: Node) -> None: """Append a child to the child list.""" @@ -369,12 +436,6 @@ def serial_split(self, target_haplogroup: str) -> Node: return current_node - @property - def num_children(self) -> int: - """Return number of children.""" - - return len(self.child_list) - def bifurcate(self) -> tuple[Node, Node]: """Split a node and return the two children.""" @@ -399,54 +460,53 @@ def reverse_children(self) -> None: self.child_list.reverse() + def remove_children(self) -> None: + """Make this node a leaf by purging children.""" + + self.child_list.clear() + # Tree traversals # ---------------------------------------------------------------------- - def write_breadth_first_traversal(self, bf_tree_file: TextIO) -> None: - """Write breadth-first traversal.""" + def iter_depth_first(self) -> Iterator[Node]: + """Traverse tree depth first, pre order.""" + + yield self + for child in self.child_list: + for node in child.iter_depth_first(): + yield node + + def iter_breadth_first(self) -> Iterator[Node]: + """Traverse tree breadth first.""" - bf_tree_file.write(self.str_dot_pipe_depth + "\n") + yield self node_deque = deque(self.child_list) while node_deque: node = node_deque.popleft() - bf_tree_file.write(node.str_dot_pipe_depth + "\n") node_deque.extend(node.child_list) - - def get_depth_first_node_list(self) -> list[Node]: - """Conduct depth-first pre-order traversal.""" - - depth_first_node_list = [self] - self.traverse_depth_first_pre_order_recursive(depth_first_node_list) - - return depth_first_node_list - - def traverse_depth_first_pre_order_recursive( - self, - depth_first_node_list: list[Node], - ) -> None: - """Append each node in depth-first pre order, recursively.""" - - for child in self.child_list: - depth_first_node_list.append(child) - child.traverse_depth_first_pre_order_recursive(depth_first_node_list) + yield node def mrca(self, other_node: Node) -> Node: """Return the most recent common ancestor of this node and another.""" - if self.depth < other_node.depth: - higher_node, lower_node = self, other_node + if self.depth > other_node.depth: + lower_node, higher_node = self, other_node else: - higher_node, lower_node = other_node, self + lower_node, higher_node = other_node, self - while higher_node.depth < lower_node.depth: + # Get to same level of tree + while lower_node.depth > higher_node.depth: assert lower_node.parent is not None lower_node = lower_node.parent - while lower_node != higher_node: - assert lower_node.parent is not None - assert higher_node.parent is not None - lower_node = lower_node.parent - higher_node = higher_node.parent - return higher_node + # Move up one branch at a time + node_a, node_b = lower_node, higher_node + while node_a is not node_b: + assert node_a.parent is not None + assert node_b.parent is not None + node_a = node_a.parent + node_b = node_b.parent + + return node_a # Writing tree to file in Newick format # ---------------------------------------------------------------------- @@ -460,15 +520,13 @@ def write_newick( """Write Newick string for the subtree rooted at this node.""" if not type(self).config.suppress_output: + newick = self.build_newick( + use_hg_snp_label=use_hg_snp_label, + align_tips=align_tips, + platform=platform, + ) with open(newick_fp, "w") as out_file: - out_file.write( - self.build_newick_string_recursive( - use_hg_snp_label, - align_tips, - platform, - ) - + ";\n" - ) + out_file.write(newick + "\n") if align_tips: tree_descriptor = "aligned " @@ -487,40 +545,106 @@ def write_newick( f" {newick_fp}\n" ) - def build_newick_string_recursive( + def build_newick( + self, + use_hg_snp_label: bool = False, + align_tips: bool = False, + platform: Optional[str] = None, + ) -> str: + """Build Newick string for the subtree rooted at this node. + + Parameters + ---------- + use_hg_snp_label : bool, optional + Use SNP-based haplogroup labels rather than YCC haplogroup labels. + align_tips : bool, optional + When True, set branch lengths to align the tips of the tree. + platform : str | None, optional + 23andMe platform to use for computing branch lengths. + + Returns + ------- + newick : str + Newick representation of the tree. + + """ + subtree_max_depth = ( + self.tree.max_depth + if self.is_root + else np.max([node.depth for node in self.iter_depth_first()]) + ) + newick = ( + self.build_newick_recursive( + use_hg_snp_label=use_hg_snp_label, + align_tips=align_tips, + subtree_max_depth=subtree_max_depth, + platform=platform, + ) + + ";" + ) + + return newick + + def build_newick_recursive( self, - use_hg_snplabel: bool = False, + use_hg_snp_label: bool = False, align_tips: bool = False, + subtree_max_depth: Optional[int] = None, platform: Optional[str] = None, ) -> str: - """Build Newick string recursively for the subtree rooted at this node.""" + """Build Newick string recursively for the subtree rooted at this node. + + Parameters + ---------- + use_hg_snp_label : bool, optional + Use SNP-based haplogroup labels rather than YCC haplogroup labels. + align_tips : bool, optional + When True, set branch lengths to align the tips of the tree. + subtree_max_depth : int | None, optional + Maximum depth of subtree. Used to set leaf branch lengths when aligning tips. + Default to maximum depth of full tree. + platform : str | None, optional + 23andMe platform to use for computing branch lengths. + + Returns + ------- + tree_string : str + Component of Newick representation of the tree. - if not self.is_leaf(): + """ + subtree_max_depth = subtree_max_depth or type(self).tree.max_depth + + if not self.is_leaf: child_string_list = [] for child in self.child_list[::-1]: - child_string = child.build_newick_string_recursive( - use_hg_snplabel, - align_tips, - platform, + child_string = child.build_newick_recursive( + use_hg_snp_label=use_hg_snp_label, + align_tips=align_tips, + subtree_max_depth=subtree_max_depth, + platform=platform, ) child_string_list.append(child_string) children = ",".join(child_string_list) - tree_string_part_1 = f"({children})" + children_string = f"({children})" else: - tree_string_part_1 = "" + children_string = "" - branch_label = self.hg_snp if use_hg_snplabel else self.label - branch_length = self.get_branch_length(align_tips, platform) + branch_label = self.hg_snp if use_hg_snp_label else self.label + branch_length = self.get_branch_length( + align_tips=align_tips, + subtree_max_depth=subtree_max_depth, + platform=platform, + ) if align_tips: branch_string = f"{branch_label}:{branch_length}" - elif branch_length is None or (self.is_leaf() and branch_length == 0): + elif branch_length is None or (self.is_leaf and branch_length == 0): branch_string = branch_label elif branch_length > 0: branch_string = f"{branch_label}|{branch_length}:{branch_length}" else: branch_string = ":0.5" - tree_string = f"{tree_string_part_1}{branch_string}" + tree_string = f"{children_string}{branch_string}" return tree_string diff --git a/yhaplo/path.py b/yhaplo/path.py index 32df5e9..76c7e15 100644 --- a/yhaplo/path.py +++ b/yhaplo/path.py @@ -11,14 +11,18 @@ class Path: - """Class representing a path through a haplogroup tree. - Instances store: - - The next node to visit - - A list of SNPs observed in the derived state - - The most derived SNP observed - - The number of ancestral alleles encountered. + Attributes + ---------- + node : Node + The next node to visit. + der_snp_list : list[SNP] + List of SNPs observed in the derived state. + most_derived_snp : SNP | None + The most derived SNP observed. + num_ancestral : int + The number of ancestral alleles encountered. """ @@ -26,6 +30,14 @@ def __init__( self, node: "node_module.Node", ): + """Instantiate Path. + + Parameters + ---------- + node : Node + The next node to visit. + + """ self.node = node self.der_snp_list: list["snp_module.SNP"] = [] self.most_derived_snp: Optional["snp_module.SNP"] = None diff --git a/yhaplo/sample.py b/yhaplo/sample.py index a2437a8..c0aaab9 100644 --- a/yhaplo/sample.py +++ b/yhaplo/sample.py @@ -73,12 +73,23 @@ def call_haplogroups_from_config(config: Config) -> pd.DataFrame: class Sample: - """Class representing an individual. - A sample knows its: - - Genotypes - - Haplogroup, once called + Attributes + ---------- + iid : IID_TYPE + Individual identifier. + haplogroup_node : Node | None + Node corresponding to haplogroup, once called. + most_derived_snp : SNP | None + Most derived SNP observed + der_snp_list : list[SNP] + List of SNPs observed in the derived state. + anc_snp_list: list[SNP] + List of SNPs observed in the ancestral state. + anc_der_count_tuples : list[tuple[Node, int, int]] + List of tuples, each of which includes a node and counts of SNPs observed + in the ancestral and derived states. """ @@ -91,8 +102,14 @@ class Sample: sample_list: list[Sample] = [] def __init__(self, iid: IID_TYPE): - """Construct Sample instance and append to `Sample.sample_list`.""" + """Instantiate Sample. + Parameters + ---------- + iid : IID_TYPE + Individual identifier. + + """ self.iid = iid self.haplogroup_node: Optional["node_module.Node"] = None self.most_derived_snp: Optional["snp_module.SNP"] = None @@ -612,13 +629,17 @@ def write_snps_detail( class TextSample(Sample): - """Class representing an individual whose data are in a sample-major text file. Expected input format: - Row 1: Physical coordinates - Column 1: Individual identifiers + Attributes + ---------- + genotypes : list[str] + List of genotypes. + """ position_to_column_index: dict[int, int] @@ -628,8 +649,16 @@ def __init__( iid: IID_TYPE, genotypes: list[str], ): - """Construct TextSample instance.""" + """Instantiate TextSample. + Parameters + ---------- + iid : IID_TYPE + Individual identifier. + genotypes : list[str] + List of genotypes. + + """ super(TextSample, self).__init__(iid) self.genotypes = genotypes @@ -678,12 +707,24 @@ def call_haplogroups(cls, config: Config) -> None: class VCFSample(Sample): + """Class representing an individual whose data are in a VCF/BCF file. - """Class representing an individual whose data are in a VCF/BCF file.""" + Attributes + ---------- + position_to_genotype : dict[int, str] + Maps position to genotype. + + """ def __init__(self, iid: IID_TYPE): - """Construct VCFSample instance.""" + """Instantiate VCFSample. + + Parameters + ---------- + iid : IID_TYPE + Individual identifier. + """ super(VCFSample, self).__init__(iid) self.position_to_genotype: dict[int, str] = {} diff --git a/yhaplo/snp.py b/yhaplo/snp.py index c06eba9..2477545 100644 --- a/yhaplo/snp.py +++ b/yhaplo/snp.py @@ -26,14 +26,27 @@ class SNP: - """Class representing a SNP. - A SNP instance knows its: - - Names - - Haplogroup - - Position - - Ancestral and derived alleles + Attributes + ---------- + name_list : list[str] + SNP names. + haplogroup : str + Haplogroup the SNP is associated with. + position : int + GRCh37 physical position. + ancestral : str + Ancestral allele. + derived : str + Derived allele. + + is_representative : bool + Indicator as to whether this SNP should be used to represent the haplogroup. + allele_set : set[str] + Set of ancestral and derived alleles. + node : Node + Node associated with the haplogroup the SNP is associated with. """ @@ -49,6 +62,22 @@ def __init__( ancestral: str, derived: str, ): + """Instantiate SNP. + + Parameters + ---------- + name : str + SNP name. + haplogroup : str + Haplogroup the SNP is associated with. + position : int + Physical position. + ancestral : str + Ancestral allele. + derived : str + Derived allele. + + """ if type(self).tree is None: raise RuntimeError( "Before instantiating, call: " @@ -326,7 +355,6 @@ def load_position_set( class DroppedMarker: - """Class representing a marker not used for classification. Such a marker may be useful for node labeling. Examples: @@ -334,6 +362,15 @@ class DroppedMarker: - Multiallelic SNPs - SNPs not meeting ISOGG quality guidelines + Parameters + ---------- + name : str + Marker name. + haplogroup : str + Haplogroup the marker is associated with. + tree : Tree + Y-chromosome tree. + """ def __init__( diff --git a/yhaplo/tree.py b/yhaplo/tree.py index 832963a..abcc378 100644 --- a/yhaplo/tree.py +++ b/yhaplo/tree.py @@ -17,19 +17,68 @@ class Tree: - """Class representing a haplogroup tree. - A tree has single Node instance (the root) as well as a dictionary - that maps haplogroup labels to node instances. + Attributes + ---------- + root : Node + Root node. + config : Config + Yhaplo configuration. + args : argparse.Namespace + Command-line arguments or defaults. + haplogroup_to_node : dict[str, Node] + Maps haplogroup labels (YCC or SNP-based) to nodes. + depth_first_node_list : list[Node] + List of nodes in depth-first order. + max_depth : int + Maximum depth of tree. + + snp_dict : dict[Union[str, tuple[str, int], int], SNP] + Maps SNP names, positions, and (haplogroup, position) tuples to SNPs. + snp_list : list[SNP] + List of SNPs + snp_pos_set : set[int] + Set of SNP physical positions. + snp_name_set : set[str] + Set of SNP names. + preferred_snp_name_set : set[str] + Set of preferred SNP names. + representative_snp_name_set : set[str] + Set of names of "representative" SNPs. + + isogg_omit_name_set : set[str] + Set of SNP names to omit from ISOGG variant metadata. + isogg_omit_pos_str_mutation_set : set[tuple[str, str]] + Set of (position_str, mutation) tuples representing SNPs to omit. + isogg_correction_dict : dict[str, tuple[str, str, str]] + Maps SNP name to a tuple of haplogroup, position, and mutation. + Either position or mutation (or both) are corrections from the ISOGG table. + isogg_counts_dict : dict[str, int] + Counts of various categories of ISOGG SNPs. + num_snps_corrected : int + Number of SNPs corrected, as compared to ISOGG raw data. + multiallelic_old_pos_set : set[int] + Set of positions to exclude due to multiple alleles. + multiallelic_new_pos_set : set[int] + Set of positions found to have multiple alleles. + """ def __init__( self, - config: Config, + config: Optional[Config] = None, ): - self.config = config - self.args = config.args + """Instantiate Tree. + + Parameters + ---------- + config : Config | None, optional + Configuration. If None, use default configuration. + + """ + self.config = config if config is not None else Config() + self.args = self.config.args self.max_depth = 0 self.haplogroup_to_node: dict[str, "node_module.Node"] = {} @@ -85,7 +134,7 @@ def set_search_root(self) -> None: def set_depth_first_node_list(self) -> None: """Build Node list from depth-first pre-order traversal.""" - self.depth_first_node_list = self.root.get_depth_first_node_list() + self.depth_first_node_list = [node for node in self.root.iter_depth_first()] for dfs_rank, node in enumerate(self.depth_first_node_list): node.set_dfs_rank(dfs_rank) @@ -100,10 +149,12 @@ def write_optional_traversal_output(self) -> None: self.write_depth_first_pre_order() if self.args.write_tree_table: self.write_tree_table() + if self.args.mrca_haplogroup_list: - self.query_mrca() - if self.args.query_snp_name: - self.query_snp_path() + self.query_mrca(*self.args.mrca_haplogroup_list) + if self.args.query_snp_names: + for query_snp_name in self.args.query_snp_names.split(","): + self.query_snp_path(query_snp_name) def write_breadth_first(self) -> None: """Write bread-first traversal in pipe/dot format.""" @@ -114,7 +165,8 @@ def write_breadth_first(self) -> None: else self.config.bf_tree_fp ) with open(bf_tree_fp, "w") as bf_tree_file: - self.root.write_breadth_first_traversal(bf_tree_file) + for node in self.root.iter_breadth_first(): + bf_tree_file.write(f"{node.str_dot_pipe_depth}\n") logger.info(f"Wrote breadth-first tree traveral:\n {bf_tree_fp}\n") @@ -128,7 +180,7 @@ def write_depth_first_pre_order(self) -> None: ) with open(df_tree_fp, "w") as df_tree_file: for node in self.depth_first_node_list: - df_tree_file.write(node.str_dot_pipe_depth + "\n") + df_tree_file.write(f"{node.str_dot_pipe_depth}\n") logger.info(f"Wrote depth-first tree traveral:\n {df_tree_fp}\n") @@ -144,16 +196,17 @@ def write_tree_table(self) -> None: logger.info(f"Wrote tree table:\n {tree_table_fp}\n") - def query_mrca(self) -> None: - """Write MRCA of two haplogroups.""" + def query_mrca(self, haplogroup1: str, haplogroup2: str) -> None: + """Write MRCA of two haplogroups. - mrca_haplogroup_list = self.args.mrca_haplogroup_list - if not isinstance(mrca_haplogroup_list, list) or len(mrca_haplogroup_list) != 2: - raise ValueError( - f"mrca expects a list of 2 haplogroups, not this: {mrca_haplogroup_list}\n" - ) + Parameters + ---------- + haplogroup1 : str + Haplogroup label. + haplogroup2 : str + Haplogroup label. - haplogroup1, haplogroup2 = mrca_haplogroup_list + """ node1 = self.haplogroup_to_node[haplogroup1] node2 = self.haplogroup_to_node[haplogroup2] mrca = node1.mrca(node2) @@ -164,20 +217,30 @@ def query_mrca(self) -> None: f"MRCA: {mrca.haplogroup}\n" ) - def query_snp_path(self) -> None: - """List phylogenetic path for a query SNP.""" + def query_snp_path(self, query_snp_name: str) -> None: + """List phylogenetic path for a query SNP. - query_name = self.args.query_snp_name - logger.info(f"\nSNP Query: {query_name}\n\n") - snp = self.snp_dict.get(query_name, None) + Parameters + ---------- + query_snp_name : str + Name of SNP to query. + + """ + logger.info(f"\nSNP Query: {query_snp_name}\n") + snp = self.snp_dict.get(query_snp_name) if snp: for node in snp.back_trace_path(): - logger.info(node.str_simple + "\n") - if snp.label != query_name: - logger.info(f"\nNote: {query_name} is an alias of {snp.label}.\n") + logger.info(node.str_simple) + + if query_snp_name != snp.label: + logger.info(f"\nNote: {query_snp_name} is an alias of {snp.label}.\n") + elif not node.hg_snp.endswith(query_snp_name): + logger.info( + f"\nNote: {query_snp_name} is on the {node.hg_snp} branch.\n" + ) else: - logger.info("Not found.\n") + logger.info(f'No SNPs found with the name "{query_snp_name}"\n') logger.info("") @@ -248,7 +311,7 @@ def identify_phylogenetic_path( The stopping condition is a disjunction of three atomic conditions. The first is trivial: - a. node.is_leaf() + a. node.is_leaf We cannot go any further. The following table enumerates possible cases for the other two @@ -293,7 +356,7 @@ def identify_phylogenetic_path( anc_der_count_tuples.append((path.node, num_ancestral, num_derived)) if ( - path.node.is_leaf() + path.node.is_leaf or (num_ancestral > self.config.args.anc_stop_thresh) or ( num_ancestral == self.config.args.anc_stop_thresh @@ -924,3 +987,36 @@ def verify_newick_token(observed: str, expected: str) -> None: f"Expected this token: {expected}\n" f"Got this one: {observed}\n" ) + + +def get_bounded_subtree( + root_haplogroup: str, + target_haplogroup: str, +) -> "node_module.Node": + """Generate a bounded subtree. + + Set the root node to correspond to `root_haplogroup`. + Prune children of the target node and children of all nodes not on a direct path + from the root to the target. + + Parameters + ---------- + root_haplogroup : str + Haplogroup corresponding to the root of the subtree. + target_haplogroup : str + Haplogroup corresponding to the target node. + + Returns + ------- + root_node : Node + Subtree root node. + + """ + tree = Tree() + root_node = tree.haplogroup_to_node[root_haplogroup] + target_node = tree.haplogroup_to_node[target_haplogroup] + for node in root_node.iter_depth_first(): + if node is target_node or node is not node.mrca(target_node): + node.remove_children() + + return root_node diff --git a/yhaplo/utils/loaders.py b/yhaplo/utils/loaders.py index a1f6e9d..1a6f551 100644 --- a/yhaplo/utils/loaders.py +++ b/yhaplo/utils/loaders.py @@ -23,7 +23,6 @@ @dataclass class DataFile: - """Attributes of a yhaplo data file. data_subdir : str @@ -50,7 +49,6 @@ def package(self): class TtamFileNotFoundError(FileNotFoundError): - """Exception indicating that an unfound data file is not publicly available.""" def __init__( @@ -118,7 +116,7 @@ def load_dataframe( """ try: with as_file(files(data_file.package).joinpath(data_file.filename)) as path: - df = pd.read_csv(path, delim_whitespace=True, header=header) + df = pd.read_csv(path, sep=r"\s+", header=header) except (FileNotFoundError, ModuleNotFoundError): if data_file.ttam_only: @@ -157,7 +155,7 @@ def load_haplogroup_df(haplogroup_fp: str) -> pd.DataFrame: """ haplogroup_df = pd.read_csv( haplogroup_fp, - delim_whitespace=True, + sep=r"\s+", names=["iid", "hg_snp_obs", "hg_snp", "ycc_haplogroup"], ).set_index("iid") @@ -183,7 +181,7 @@ def load_yhaplo_unique_snps(unique_fp: str) -> pd.DataFrame: """ unique_df = pd.read_csv( unique_fp, - delim_whitespace=True, + sep=r"\s+", header=None, names=SNP_TABLE_COL_NAMES, ).astype(SNP_TABLE_DTYPE_DICT)