Module dphon.extend

Abstract base class and implementations for extending Matches.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Abstract base class and implementations for extending Matches."""

import logging
from abc import ABC, abstractmethod
from typing import List

import Levenshtein as Lev
from spacy.tokens import Span

from .match import Match
from .g2p import OOV_PHONEMES


class Extender(ABC):
    """Extenders use heuristics to lengthen and return match sequences."""

    @abstractmethod
    def __call__(self, match: Match) -> Match:
        """Extend the match as far as possible and return it."""
        raise NotImplementedError


class StringDistanceExtender(Extender):
    """Add tokens to sequences while string distance metric is above threshold.

    Subclass and implement the _score() method to implement the desired string
    distance measure, e.g. Levenshtein ratio.
    """

    threshold: float    # if the score falls below this, match gets cut off
    len_limit: int      # only score this many tokens at the end of the match

    def __init__(self, threshold: float, len_limit: int) -> None:
        """Create a new extender."""
        self.threshold = threshold
        self.len_limit = len_limit
        logging.info(
            f"using {self.__class__} with threshold={threshold}, len_limit={len_limit}")

    @abstractmethod
    def _score(self, left: Span, right: Span, rev: bool = False) -> float:
        """Compare the match sequences using a string distance measurement.

        Compares only up to len_limit characters when scoring, to speed up
        calculation and improve accuracy for long sequences. When looking in
        reverse, set rev=True to compare the start of the match instead of the
        ends."""
        raise NotImplementedError

    def _extend_fwd(self, match: Match) -> Match:
        """Return a copy of a match extended in the forward direction."""
        trail = 0
        score = self._score(match.utxt, match.vtxt)
        utxt = match.utxt.doc[match.utxt.start:match.utxt.end]
        vtxt = match.vtxt.doc[match.vtxt.start:match.vtxt.end]
        ulen, vlen = len(match.utxt.doc), len(match.vtxt.doc)

        # extend while score is above threshold and we aren't at the end
        while score >= self.threshold and utxt.end < ulen and vtxt.end < vlen:
            utxt = utxt.doc[utxt.start:utxt.end + 1]
            vtxt = vtxt.doc[vtxt.start:vtxt.end + 1]

            # track the last score increase and how far we've gone past it
            new_score = self._score(utxt, vtxt)
            trail = trail + 1 if new_score < score else 0
            score = new_score

        # return match trimmed back to last increasing score
        return Match(match.u, match.v, utxt.doc[utxt.start:utxt.end - trail],
                     vtxt.doc[vtxt.start:vtxt.end - trail])

    def _extend_rev(self, match: Match) -> Match:
        """Return a copy of a match extended in the reverse direction."""
        trail = 0
        score = self._score(match.utxt, match.vtxt, rev=True)
        utxt = match.utxt.doc[match.utxt.start:match.utxt.end]
        vtxt = match.vtxt.doc[match.vtxt.start:match.vtxt.end]

        # extend while score is above threshold and we aren't at the start
        while score >= self.threshold and utxt.start > 0 and vtxt.start > 0:
            utxt = utxt.doc[(utxt.start-1):utxt.end]
            vtxt = vtxt.doc[(vtxt.start-1):vtxt.end]

            # track the last score increase and how far we've gone past it
            new_score = self._score(utxt, vtxt, rev=True)
            trail = trail + 1 if new_score < score else 0
            score = new_score

        # return match trimmed back to last increasing score
        return Match(match.u, match.v, utxt.doc[utxt.start + trail:utxt.end],
                     vtxt.doc[vtxt.start + trail:vtxt.end])

    def __call__(self, match: Match) -> Match:
        """Extend a match using Levenshtein ratio comparison.

        After extending in both directions, use the results to update the match
        bounds and rescore it."""

        ex_fwd = self._extend_fwd(match)
        ex_rev = self._extend_rev(match)
        utxt = match.utxt.doc[ex_rev.utxt.start:ex_fwd.utxt.end]
        vtxt = match.vtxt.doc[ex_rev.vtxt.start:ex_fwd.vtxt.end]
        score = self._score(utxt, vtxt)
        return Match(match.u, match.v, utxt, vtxt, score)


class LevenshteinExtender(StringDistanceExtender):
    """Add tokens to sequences while Levenshtein ratio is above threshold."""

    def _score(self, utxt: Span, vtxt: Span, rev: bool = False) -> float:
        """Compute the Levenshtein ratio of the match sequences."""

        if rev:
            return Lev.ratio(
                utxt.text[:self.len_limit],
                vtxt.text[:self.len_limit]
            )
        else:
            return Lev.ratio(
                utxt.text[-self.len_limit:],
                vtxt.text[-self.len_limit:]
            )


class LevenshteinPhoneticExtender(StringDistanceExtender):
    """Add tokens to sequences while Levenshtein ratio of phonemes is above
    threshold."""

    def _score(self, utxt: Span, vtxt: Span, rev: bool = False) -> float:
        """Compute the Levenshtein ratio of the match sequence phonemes."""

        # if we encounter any OOV tokens, count it as a mismatch
        if OOV_PHONEMES in utxt._.phonemes or OOV_PHONEMES in vtxt._.phonemes:
            return -1

        # otherwise score based on phonemes
        text1 = "".join(utxt._.phonemes)
        text2 = "".join(vtxt._.phonemes)

        # score in the provided direction
        if rev:
            return Lev.ratio(text1[:self.len_limit], text2[:self.len_limit])
        else:
            return Lev.ratio(text1[-self.len_limit:], text2[-self.len_limit:])


def extend_matches(matches: List[Match], extend: Extender) -> List[Match]:
    """Extend a list of matches using a provided Extender instance."""
    # track working matches in a queue; store finished ones separately
    working: List[Match] = []
    done: List[Match] = []

    # track unprocessed matches in a queue; sort then reverse so we can pop()
    # matches from the queue until it's empty, starting at the front
    todo = list(reversed(sorted(matches)))
    while todo:
        current = todo.pop()

        # if we're not working on any matches yet, or if the current match is
        # outside working point in U, clear and reset the working area
        if not working or current.utxt.start >= working[0].utxt.end:
            done += working
            working = [extend(current)]

        # if we overlap in U, check to see if we also overlap in V for any
        # matches we're working on
        else:
            skip = False
            for match in working:
                # if match is fully internal to one in the working area, skip
                # it, as we no longer need it
                if current.vtxt.start >= match.vtxt.start and \
                   current.vtxt.start <= match.vtxt.end and \
                   current.vtxt.end <= match.vtxt.end:
                    skip = True
                    break
            # if match hits new location in V, extend it and add it to
            # the working area
            if not skip:
                working.append(extend(current))

    # finish any remaining work and return extended matches
    done += working
    return done

Functions

def extend_matches(matches: List[Match], extend: Extender) ‑> List[Match]

Extend a list of matches using a provided Extender instance.

Expand source code
def extend_matches(matches: List[Match], extend: Extender) -> List[Match]:
    """Extend a list of matches using a provided Extender instance."""
    # track working matches in a queue; store finished ones separately
    working: List[Match] = []
    done: List[Match] = []

    # track unprocessed matches in a queue; sort then reverse so we can pop()
    # matches from the queue until it's empty, starting at the front
    todo = list(reversed(sorted(matches)))
    while todo:
        current = todo.pop()

        # if we're not working on any matches yet, or if the current match is
        # outside working point in U, clear and reset the working area
        if not working or current.utxt.start >= working[0].utxt.end:
            done += working
            working = [extend(current)]

        # if we overlap in U, check to see if we also overlap in V for any
        # matches we're working on
        else:
            skip = False
            for match in working:
                # if match is fully internal to one in the working area, skip
                # it, as we no longer need it
                if current.vtxt.start >= match.vtxt.start and \
                   current.vtxt.start <= match.vtxt.end and \
                   current.vtxt.end <= match.vtxt.end:
                    skip = True
                    break
            # if match hits new location in V, extend it and add it to
            # the working area
            if not skip:
                working.append(extend(current))

    # finish any remaining work and return extended matches
    done += working
    return done

Classes

class Extender

Extenders use heuristics to lengthen and return match sequences.

Expand source code
class Extender(ABC):
    """Extenders use heuristics to lengthen and return match sequences."""

    @abstractmethod
    def __call__(self, match: Match) -> Match:
        """Extend the match as far as possible and return it."""
        raise NotImplementedError

Ancestors

  • abc.ABC

Subclasses

class LevenshteinExtender (threshold: float, len_limit: int)

Add tokens to sequences while Levenshtein ratio is above threshold.

Create a new extender.

Expand source code
class LevenshteinExtender(StringDistanceExtender):
    """Add tokens to sequences while Levenshtein ratio is above threshold."""

    def _score(self, utxt: Span, vtxt: Span, rev: bool = False) -> float:
        """Compute the Levenshtein ratio of the match sequences."""

        if rev:
            return Lev.ratio(
                utxt.text[:self.len_limit],
                vtxt.text[:self.len_limit]
            )
        else:
            return Lev.ratio(
                utxt.text[-self.len_limit:],
                vtxt.text[-self.len_limit:]
            )

Ancestors

Class variables

var len_limit : int
var threshold : float
class LevenshteinPhoneticExtender (threshold: float, len_limit: int)

Add tokens to sequences while Levenshtein ratio of phonemes is above threshold.

Create a new extender.

Expand source code
class LevenshteinPhoneticExtender(StringDistanceExtender):
    """Add tokens to sequences while Levenshtein ratio of phonemes is above
    threshold."""

    def _score(self, utxt: Span, vtxt: Span, rev: bool = False) -> float:
        """Compute the Levenshtein ratio of the match sequence phonemes."""

        # if we encounter any OOV tokens, count it as a mismatch
        if OOV_PHONEMES in utxt._.phonemes or OOV_PHONEMES in vtxt._.phonemes:
            return -1

        # otherwise score based on phonemes
        text1 = "".join(utxt._.phonemes)
        text2 = "".join(vtxt._.phonemes)

        # score in the provided direction
        if rev:
            return Lev.ratio(text1[:self.len_limit], text2[:self.len_limit])
        else:
            return Lev.ratio(text1[-self.len_limit:], text2[-self.len_limit:])

Ancestors

Class variables

var len_limit : int
var threshold : float
class StringDistanceExtender (threshold: float, len_limit: int)

Add tokens to sequences while string distance metric is above threshold.

Subclass and implement the _score() method to implement the desired string distance measure, e.g. Levenshtein ratio.

Create a new extender.

Expand source code
class StringDistanceExtender(Extender):
    """Add tokens to sequences while string distance metric is above threshold.

    Subclass and implement the _score() method to implement the desired string
    distance measure, e.g. Levenshtein ratio.
    """

    threshold: float    # if the score falls below this, match gets cut off
    len_limit: int      # only score this many tokens at the end of the match

    def __init__(self, threshold: float, len_limit: int) -> None:
        """Create a new extender."""
        self.threshold = threshold
        self.len_limit = len_limit
        logging.info(
            f"using {self.__class__} with threshold={threshold}, len_limit={len_limit}")

    @abstractmethod
    def _score(self, left: Span, right: Span, rev: bool = False) -> float:
        """Compare the match sequences using a string distance measurement.

        Compares only up to len_limit characters when scoring, to speed up
        calculation and improve accuracy for long sequences. When looking in
        reverse, set rev=True to compare the start of the match instead of the
        ends."""
        raise NotImplementedError

    def _extend_fwd(self, match: Match) -> Match:
        """Return a copy of a match extended in the forward direction."""
        trail = 0
        score = self._score(match.utxt, match.vtxt)
        utxt = match.utxt.doc[match.utxt.start:match.utxt.end]
        vtxt = match.vtxt.doc[match.vtxt.start:match.vtxt.end]
        ulen, vlen = len(match.utxt.doc), len(match.vtxt.doc)

        # extend while score is above threshold and we aren't at the end
        while score >= self.threshold and utxt.end < ulen and vtxt.end < vlen:
            utxt = utxt.doc[utxt.start:utxt.end + 1]
            vtxt = vtxt.doc[vtxt.start:vtxt.end + 1]

            # track the last score increase and how far we've gone past it
            new_score = self._score(utxt, vtxt)
            trail = trail + 1 if new_score < score else 0
            score = new_score

        # return match trimmed back to last increasing score
        return Match(match.u, match.v, utxt.doc[utxt.start:utxt.end - trail],
                     vtxt.doc[vtxt.start:vtxt.end - trail])

    def _extend_rev(self, match: Match) -> Match:
        """Return a copy of a match extended in the reverse direction."""
        trail = 0
        score = self._score(match.utxt, match.vtxt, rev=True)
        utxt = match.utxt.doc[match.utxt.start:match.utxt.end]
        vtxt = match.vtxt.doc[match.vtxt.start:match.vtxt.end]

        # extend while score is above threshold and we aren't at the start
        while score >= self.threshold and utxt.start > 0 and vtxt.start > 0:
            utxt = utxt.doc[(utxt.start-1):utxt.end]
            vtxt = vtxt.doc[(vtxt.start-1):vtxt.end]

            # track the last score increase and how far we've gone past it
            new_score = self._score(utxt, vtxt, rev=True)
            trail = trail + 1 if new_score < score else 0
            score = new_score

        # return match trimmed back to last increasing score
        return Match(match.u, match.v, utxt.doc[utxt.start + trail:utxt.end],
                     vtxt.doc[vtxt.start + trail:vtxt.end])

    def __call__(self, match: Match) -> Match:
        """Extend a match using Levenshtein ratio comparison.

        After extending in both directions, use the results to update the match
        bounds and rescore it."""

        ex_fwd = self._extend_fwd(match)
        ex_rev = self._extend_rev(match)
        utxt = match.utxt.doc[ex_rev.utxt.start:ex_fwd.utxt.end]
        vtxt = match.vtxt.doc[ex_rev.vtxt.start:ex_fwd.vtxt.end]
        score = self._score(utxt, vtxt)
        return Match(match.u, match.v, utxt, vtxt, score)

Ancestors

Subclasses

Class variables

var len_limit : int
var threshold : float