Module `dphon.align`

Classes and types for pairwise match alignment.

Expand source code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Classes and types for pairwise match alignment."""

import logging
from abc import ABC, abstractmethod
from typing import List, Mapping, Optional, Tuple, Union

from lingpy.align.pairwise import sw_align
from spacy.tokens import Span

from .match import Match

# Lingpy scoring matrices: a × b = 1.0 -> { ("a", "b"): 1.0, ("b", "a"): 1.0 }
# Matrix cells are represented as tuples; both a × b and b × a need to exist
Scorer_T = Mapping[Tuple[str, str], float]

# Lingpy aligner input type: tuple of str | list of str | str
# Tuple[str, ...] is a tuple of any length; a bare Tuple[str] would only
# describe a tuple holding exactly one string
Seq_T = Union[Tuple[str, ...], List[str], str]


class Aligner(ABC):
    """Interface for pairwise match aligners.

    Concrete aligners subclass this, provide __call__, and set gap_char to
    the character that stands for a gap in an alignment."""

    # character used to represent alignment gaps; set by subclasses
    gap_char: str

    @abstractmethod
    def __call__(self, match: Match) -> Match:
        """Take a match, align it, and hand back the resulting match."""
        raise NotImplementedError()


class SmithWatermanAligner(Aligner):
    """Local alignment with an optional custom scoring matrix.

    Calling an instance aligns the two sequences of a match with sw_align and
    returns a new match narrowed to the aligned region."""

    # text placed in the output alignment wherever the aligner left a gap
    gap_char: str
    # scoring matrix handed to sw_align unchanged; may be None
    scorer: Optional[Scorer_T]

    def __init__(self, scorer: Scorer_T = None, gap_char: str = "-") -> None:
        """Store the scoring matrix and gap character and log the setup."""
        self.scorer = scorer
        self.gap_char = gap_char
        logging.info(f"using {self.__class__} with gap_char=\"{gap_char}\"")

    def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
        """Get the two sequences to compare: the text of each span."""
        return match.utxt.text, match.vtxt.text

    def __call__(self, match: Match) -> Match:
        """Perform the alignment and use it to build an updated match.

        The updated match uses the values calculated from alignment to adjust
        the start and end points of its sequences, as well as storing the score
        and sequence texts calculated for the alignment.

        Gaps in the alignment take up a column but no token, so the spans of
        the returned match cover only the tokens that were actually aligned.
        Columns where either side is not alphanumeric are trimmed from both
        ends. If no column survives the trimming, the returned match has empty
        spans, empty alignments and a score of 0.0."""

        # compute the alignment; each side comes back as
        # (unaligned prefix, aligned core, unaligned suffix)
        (lu, cu, _ru), (lv, cv, _rv), score = sw_align(*self._get_seqs(match),
                                                        self.scorer)

        # "-" in an aligned core is a gap: it fills a column of the alignment
        # but does not correspond to any token in the document
        u_real = [sym != "-" for sym in cu]
        v_real = [sym != "-" for sym in cv]

        # use lengths of non-aligned regions to move the sequence boundaries
        # [...] ["A", "B", "C"] [...]
        # ---->                <----
        # the end of each span is its start plus the number of real tokens;
        # counting gap columns too would run past the aligned region
        u, v = match.utxt.doc, match.vtxt.doc
        us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv)
        ue, ve = us + sum(u_real), vs + sum(v_real)
        utxt, vtxt = u[us:ue], v[vs:ve]

        # use the gaps in the alignment to construct a new sequence of token
        # texts, inserting gap_char wherever the aligner created a gap
        u_ptr = 0
        v_ptr = 0
        au = []
        av = []
        for u_has_token, v_has_token in zip(u_real, v_real):
            if u_has_token:
                au.append(utxt[u_ptr].text)
                u_ptr += 1
            else:
                au.append(self.gap_char)
            if v_has_token:
                av.append(vtxt[v_ptr].text)
                v_ptr += 1
            else:
                av.append(self.gap_char)

        # trim back the sequence boundaries further to remove any non-alphanum.
        # columns from the start and end of the alignment; a span only loses a
        # token when the trimmed column held a real token on that side
        lo, hi = 0, len(au)
        while lo < hi and not (au[hi - 1].isalnum() and av[hi - 1].isalnum()):
            hi -= 1
            if u_real[hi]:
                ue -= 1
            if v_real[hi]:
                ve -= 1
        while lo < hi and not (au[lo].isalnum() and av[lo].isalnum()):
            if u_real[lo]:
                us += 1
            if v_real[lo]:
                vs += 1
            lo += 1
        au, av = au[lo:hi], av[lo:hi]
        utxt, vtxt = u[us:ue], v[vs:ve]

        # normalize score to length; 1.0 is perfect
        # an alignment trimmed down to nothing scores 0.0 instead of dividing
        # by zero
        length = max(len(au), len(av))
        norm_score = float(score) / length if length else 0.0

        # create a new match with alignment info and adjusted boundaries
        return Match(match.u, match.v, utxt, vtxt, norm_score, au, av)


class SmithWatermanPhoneticAligner(SmithWatermanAligner):
    """Smith-Waterman aligner that compares tokens by their phonemes.

    Relies on the "phonemes" extension supplied by a Phonemes component."""

    def __init__(self, scorer: Scorer_T = None, gap_char: str = "-") -> None:
        """Check that phonetic data is available, then set up as the parent.

        Raises RuntimeError when the "phonemes" extension is not registered
        on Span."""
        phonemes_registered = Span.has_extension("phonemes")
        if not phonemes_registered:
            raise RuntimeError("Phonemes component not available")
        super().__init__(scorer=scorer, gap_char=gap_char)

    def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
        """Build one comparison symbol per token for both sides of a match."""

        def as_symbols(span):
            # one string per token: its phonemes glued together, with missing
            # phonemes contributing nothing; a token left with no phonetic
            # content at all is represented by its own text instead
            symbols = []
            for token in span:
                joined = "".join(p or "" for p in token._.phonemes)
                symbols.append(joined if joined else token.text)
            return symbols

        return as_symbols(match.utxt), as_symbols(match.vtxt)

Classes

class Aligner

Abstract class; implements pairwise alignment.

Override to implement `__call__` and define `gap_char`, the character used to represent alignment gaps.

Expand source code

class Aligner(ABC):
    """Interface for pairwise match aligners.

    Concrete aligners subclass this, provide __call__, and set gap_char to
    the character that stands for a gap in an alignment."""

    # character used to represent alignment gaps; set by subclasses
    gap_char: str

    @abstractmethod
    def __call__(self, match: Match) -> Match:
        """Take a match, align it, and hand back the resulting match."""
        raise NotImplementedError()

Ancestors

abc.ABC

Subclasses

SmithWatermanAligner

Class variables

var gap_char : str

class SmithWatermanAligner (scorer: Mapping[Tuple[str, str], float] = None, gap_char: str = '-')

Local alignment with an optional custom scoring matrix.

Expand source code

class SmithWatermanAligner(Aligner):
    """Local alignment with an optional custom scoring matrix.

    Calling an instance aligns the two sequences of a match with sw_align and
    returns a new match narrowed to the aligned region."""

    # text placed in the output alignment wherever the aligner left a gap
    gap_char: str
    # scoring matrix handed to sw_align unchanged; may be None
    scorer: Optional[Scorer_T]

    def __init__(self, scorer: Scorer_T = None, gap_char: str = "-") -> None:
        """Store the scoring matrix and gap character and log the setup."""
        self.scorer = scorer
        self.gap_char = gap_char
        logging.info(f"using {self.__class__} with gap_char=\"{gap_char}\"")

    def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
        """Get the two sequences to compare: the text of each span."""
        return match.utxt.text, match.vtxt.text

    def __call__(self, match: Match) -> Match:
        """Perform the alignment and use it to build an updated match.

        The updated match uses the values calculated from alignment to adjust
        the start and end points of its sequences, as well as storing the score
        and sequence texts calculated for the alignment.

        Gaps in the alignment take up a column but no token, so the spans of
        the returned match cover only the tokens that were actually aligned.
        Columns where either side is not alphanumeric are trimmed from both
        ends. If no column survives the trimming, the returned match has empty
        spans, empty alignments and a score of 0.0."""

        # compute the alignment; each side comes back as
        # (unaligned prefix, aligned core, unaligned suffix)
        (lu, cu, _ru), (lv, cv, _rv), score = sw_align(*self._get_seqs(match),
                                                        self.scorer)

        # "-" in an aligned core is a gap: it fills a column of the alignment
        # but does not correspond to any token in the document
        u_real = [sym != "-" for sym in cu]
        v_real = [sym != "-" for sym in cv]

        # use lengths of non-aligned regions to move the sequence boundaries
        # [...] ["A", "B", "C"] [...]
        # ---->                <----
        # the end of each span is its start plus the number of real tokens;
        # counting gap columns too would run past the aligned region
        u, v = match.utxt.doc, match.vtxt.doc
        us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv)
        ue, ve = us + sum(u_real), vs + sum(v_real)
        utxt, vtxt = u[us:ue], v[vs:ve]

        # use the gaps in the alignment to construct a new sequence of token
        # texts, inserting gap_char wherever the aligner created a gap
        u_ptr = 0
        v_ptr = 0
        au = []
        av = []
        for u_has_token, v_has_token in zip(u_real, v_real):
            if u_has_token:
                au.append(utxt[u_ptr].text)
                u_ptr += 1
            else:
                au.append(self.gap_char)
            if v_has_token:
                av.append(vtxt[v_ptr].text)
                v_ptr += 1
            else:
                av.append(self.gap_char)

        # trim back the sequence boundaries further to remove any non-alphanum.
        # columns from the start and end of the alignment; a span only loses a
        # token when the trimmed column held a real token on that side
        lo, hi = 0, len(au)
        while lo < hi and not (au[hi - 1].isalnum() and av[hi - 1].isalnum()):
            hi -= 1
            if u_real[hi]:
                ue -= 1
            if v_real[hi]:
                ve -= 1
        while lo < hi and not (au[lo].isalnum() and av[lo].isalnum()):
            if u_real[lo]:
                us += 1
            if v_real[lo]:
                vs += 1
            lo += 1
        au, av = au[lo:hi], av[lo:hi]
        utxt, vtxt = u[us:ue], v[vs:ve]

        # normalize score to length; 1.0 is perfect
        # an alignment trimmed down to nothing scores 0.0 instead of dividing
        # by zero
        length = max(len(au), len(av))
        norm_score = float(score) / length if length else 0.0

        # create a new match with alignment info and adjusted boundaries
        return Match(match.u, match.v, utxt, vtxt, norm_score, au, av)

Ancestors

Aligner
abc.ABC

Subclasses

SmithWatermanPhoneticAligner

Class variables

var gap_char : str
var scorer : Optional[Mapping[Tuple[str, str], float]]

class SmithWatermanPhoneticAligner (scorer: Mapping[Tuple[str, str], float] = None, gap_char: str = '-')

Local alignment using phonetic values provided by a Phonemes instance.

Expand source code

class SmithWatermanPhoneticAligner(SmithWatermanAligner):
    """Smith-Waterman aligner that compares tokens by their phonemes.

    Relies on the "phonemes" extension supplied by a Phonemes component."""

    def __init__(self, scorer: Scorer_T = None, gap_char: str = "-") -> None:
        """Check that phonetic data is available, then set up as the parent.

        Raises RuntimeError when the "phonemes" extension is not registered
        on Span."""
        phonemes_registered = Span.has_extension("phonemes")
        if not phonemes_registered:
            raise RuntimeError("Phonemes component not available")
        super().__init__(scorer=scorer, gap_char=gap_char)

    def _get_seqs(self, match: Match) -> Tuple[Seq_T, Seq_T]:
        """Build one comparison symbol per token for both sides of a match."""

        def as_symbols(span):
            # one string per token: its phonemes glued together, with missing
            # phonemes contributing nothing; a token left with no phonetic
            # content at all is represented by its own text instead
            symbols = []
            for token in span:
                joined = "".join(p or "" for p in token._.phonemes)
                symbols.append(joined if joined else token.text)
            return symbols

        return as_symbols(match.utxt), as_symbols(match.vtxt)

Class variables

var gap_char : str
var scorer : Optional[Mapping[Tuple[str, str], float]]