Module `dphon.match`

The Match class for encoding text reuse relationships.

Expand source code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""The Match class for encoding text reuse relationships."""

import math
from typing import Dict, List, NamedTuple

import Levenshtein as Lev
from rich.padding import Padding
from rich.console import Console, ConsoleOptions, RenderResult
from spacy.tokens import Span


class Match(NamedTuple):
    """A match is a pair of similar textual sequences in two documents."""
    u: str
    v: str
    utxt: Span
    vtxt: Span
    weight: float = 0
    au: List[str] = []
    av: List[str] = []

    def __len__(self) -> int:
        """Length of the longer sequence in the match."""
        return max(len(self.utxt), len(self.vtxt))

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        """Format the match for display in console."""
        # get colorized match text and transcription
        su, sv = console.highlighter.format_match(self)     # type: ignore
        pu, pv = console.highlighter.transcription(self)    # type: ignore

        # add left-padding to align with match numbers, and bottom-padding
        # so that there's a space between matches in output
        su, sv, pu = map(lambda t: Padding(t, (0, 0, 0, 4)), (su, sv, pu))
        pv = Padding(pv, (0, 0, 1, 4))

        # return everything as an iterable of renderables
        return (
            f"1.  [white]{self.u}[/white] ({self.utxt.start}–{self.utxt.end-1})：",
            su, pu,
            f"2.  [white]{self.v}[/white] ({self.vtxt.start}–{self.vtxt.end-1})：",
            sv, pv,
        )

    @property
    def weighted_score(self) -> float:
        """Ratio of phonemic similarity to graphic similarity."""
        try:
            return self.weight / Lev.seqratio(self.au, self.av)
        except ZeroDivisionError:
            return math.inf

    def as_dict(self) -> Dict[str, str]:
        """Match with prettier field names for serialization."""
        return {
            "u_id": self.u,
            "v_id": self.v,
            "u_text": self.utxt.text,
            "v_text": self.vtxt.text,
            "u_text_aligned": "".join(self.au),
            "v_text_aligned": "".join(self.av),
            "u_start": self.utxt.start,
            "u_end": self.utxt.end,
            "v_start": self.vtxt.start,
            "v_end": self.vtxt.end,
            "score": str(self.weight),
            "weighted_score": str(self.weighted_score),
        }

Classes

class Match (u: str, v: str, utxt: spacy.tokens.span.Span, vtxt: spacy.tokens.span.Span, weight: float = 0, au: List[str] = [], av: List[str] = [])

A match is a pair of similar textual sequences in two documents.

Expand source code

class Match(NamedTuple):
    """A match is a pair of similar textual sequences in two documents."""
    u: str
    v: str
    utxt: Span
    vtxt: Span
    weight: float = 0
    au: List[str] = []
    av: List[str] = []

    def __len__(self) -> int:
        """Length of the longer sequence in the match."""
        return max(len(self.utxt), len(self.vtxt))

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        """Format the match for display in console."""
        # get colorized match text and transcription
        su, sv = console.highlighter.format_match(self)     # type: ignore
        pu, pv = console.highlighter.transcription(self)    # type: ignore

        # add left-padding to align with match numbers, and bottom-padding
        # so that there's a space between matches in output
        su, sv, pu = map(lambda t: Padding(t, (0, 0, 0, 4)), (su, sv, pu))
        pv = Padding(pv, (0, 0, 1, 4))

        # return everything as an iterable of renderables
        return (
            f"1.  [white]{self.u}[/white] ({self.utxt.start}–{self.utxt.end-1})：",
            su, pu,
            f"2.  [white]{self.v}[/white] ({self.vtxt.start}–{self.vtxt.end-1})：",
            sv, pv,
        )

    @property
    def weighted_score(self) -> float:
        """Ratio of phonemic similarity to graphic similarity."""
        try:
            return self.weight / Lev.seqratio(self.au, self.av)
        except ZeroDivisionError:
            return math.inf

    def as_dict(self) -> Dict[str, str]:
        """Match with prettier field names for serialization."""
        return {
            "u_id": self.u,
            "v_id": self.v,
            "u_text": self.utxt.text,
            "v_text": self.vtxt.text,
            "u_text_aligned": "".join(self.au),
            "v_text_aligned": "".join(self.av),
            "u_start": self.utxt.start,
            "u_end": self.utxt.end,
            "v_start": self.vtxt.start,
            "v_end": self.vtxt.end,
            "score": str(self.weight),
            "weighted_score": str(self.weighted_score),
        }

Ancestors

builtins.tuple

Instance variables

var au : List[str]

Alias for field number 5

var av : List[str]

Alias for field number 6

var u : str

Alias for field number 0

var utxt : spacy.tokens.span.Span

Alias for field number 2

var v : str

Alias for field number 1

var vtxt : spacy.tokens.span.Span

Alias for field number 3

var weight : float

Alias for field number 4

var weighted_score : float

Ratio of phonemic similarity to graphic similarity.

Expand source code

@property
def weighted_score(self) -> float:
    """Ratio of phonemic similarity to graphic similarity."""
    try:
        return self.weight / Lev.seqratio(self.au, self.av)
    except ZeroDivisionError:
        return math.inf

Methods

def as_dict(self) ‑> Dict[str, str]

Match with prettier field names for serialization.

Expand source code

def as_dict(self) -> Dict[str, str]:
    """Match with prettier field names for serialization."""
    return {
        "u_id": self.u,
        "v_id": self.v,
        "u_text": self.utxt.text,
        "v_text": self.vtxt.text,
        "u_text_aligned": "".join(self.au),
        "v_text_aligned": "".join(self.av),
        "u_start": self.utxt.start,
        "u_end": self.utxt.end,
        "v_start": self.vtxt.start,
        "v_end": self.vtxt.end,
        "score": str(self.weight),
        "weighted_score": str(self.weighted_score),
    }