Module dphon.reuse

Classes for analyzing text reuse.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Classes for analyzing text reuse."""

from itertools import combinations
from typing import Callable, Iterable, Iterator, Tuple

from networkx import MultiGraph, create_empty_copy
from rich.progress import Progress, BarColumn, SpinnerColumn
from spacy.tokens import Doc

from .align import Aligner
from .extend import Extender, extend_matches
from .match import Match
from .console import err_console


class MatchGraph:
    """Undirected multigraph of documents (nodes) and matches (edges).

    Each document is stored as a node keyed by its label, with the document
    itself kept in the ``doc`` node attribute. Each match is stored as an edge
    between ``match.u`` and ``match.v`` whose attributes are the fields of the
    match, so any number of matches may connect the same pair of documents.
    """

    _G: MultiGraph

    def __init__(self) -> None:
        """Create an empty graph and the progress display for long tasks."""
        self._G = MultiGraph()
        # the ``u``/``v`` task fields name the pair of documents currently
        # being processed; output goes to ``err_console``
        self.progress = Progress(
            "[progress.description]{task.description}",
            SpinnerColumn(),
            "[progress.description]{task.fields[u]} × {task.fields[v]}",
            BarColumn(bar_width=None),
            "{task.completed}/{task.total}",
            "{task.percentage:>3.1f}%",
            console=err_console,
            transient=True,
        )

    @property
    def matches(self) -> Iterator[Match]:
        """Iterator over all matches in the graph."""
        return (Match(**data) for _u, _v, data in self._G.edges(data=True))

    @property
    def docs(self) -> Iterator[Doc]:
        """Iterator over all docs in the graph."""
        return (doc for _label, doc in self._G.nodes(data="doc"))

    def number_of_matches(self) -> int:
        """Total number of matches in the graph."""
        return self._G.number_of_edges()

    def number_of_docs(self) -> int:
        """Total number of documents in the graph."""
        return self._G.number_of_nodes()

    def add_doc(self, label: str, doc: Doc) -> None:
        """Add a single document to the graph under the given label."""
        self._G.add_node(label, doc=doc)

    def add_docs(self, docs: Iterable[Tuple[str, Doc]]) -> None:
        """Add a collection of ``(label, doc)`` pairs to the graph."""
        self._G.add_nodes_from(((label, {"doc": doc}) for label, doc in docs))

    def add_match(self, match: Match) -> None:
        """Add a single match to the graph as an edge from its u to its v."""
        self._G.add_edge(match.u, match.v, **match._asdict())

    def add_matches(self, matches: Iterable[Match]) -> None:
        """Add a collection of matches to the graph."""
        self._G.add_edges_from([(m.u, m.v, m._asdict()) for m in matches])

    def _rebuild_matches(
        self,
        description: str,
        transform: Callable[[Iterable[Match]], Iterable[Match]],
    ) -> None:
        """Replace the matches of every document pair with ``transform``'s output.

        ``transform`` receives the list of matches shared by one pair of
        documents and returns the matches that should replace them. The graph
        is only swapped for the rebuilt one if every pair was processed without
        an exception; the progress task is removed either way.
        """
        task = self.progress.add_task(
            description, u="", v="", total=self.number_of_matches())

        # start from a copy that keeps the documents but has no matches
        G = create_empty_copy(self._G)
        try:
            with self.progress:
                # Walk only the pairs that actually share matches instead of
                # every combination of two documents: this is linear in the
                # number of matches and also covers a match whose u and v are
                # the same document. dict.fromkeys collapses the parallel
                # edges of a pair into one entry while keeping their order.
                for u, v in dict.fromkeys(self._G.edges()):
                    edges = self._G.get_edge_data(u, v)
                    self.progress.update(task, u=u, v=v)
                    matches = [Match(**data) for data in edges.values()]
                    G.add_edges_from([(m.u, m.v, m._asdict())
                                      for m in transform(matches)])
                    self.progress.update(task, advance=len(edges))
        finally:
            # never leave a stale task behind, even if the strategy raised
            self.progress.remove_task(task)
        self._G = G

    def extend(self, extender: Extender) -> None:
        """Extend all matches in the graph using a provided strategy.

        The matches of each document pair are handed to ``extend_matches``
        together with ``extender``; whatever it returns replaces them.
        """
        self._rebuild_matches(
            "extending", lambda matches: extend_matches(matches, extender))

    def align(self, align: Aligner) -> None:
        """Align all matches in the graph using a provided strategy.

        ``align`` is called once per match and its return value replaces that
        match in the graph.
        """
        self._rebuild_matches(
            "aligning", lambda matches: [align(match) for match in matches])

    def filter(self, predicate: Callable[[Match], bool]) -> None:
        """Keep only the matches for which ``predicate`` returns a true value."""
        G = create_empty_copy(self._G)
        filtered = filter(predicate, self.matches)
        G.add_edges_from([(m.u, m.v, m._asdict()) for m in filtered])
        self._G = G

Classes

class MatchGraph
Expand source code
class MatchGraph:
    """Undirected multigraph of documents (nodes) and matches (edges).

    Each document is stored as a node keyed by its label, with the document
    itself kept in the ``doc`` node attribute. Each match is stored as an edge
    between ``match.u`` and ``match.v`` whose attributes are the fields of the
    match, so any number of matches may connect the same pair of documents.
    """

    _G: MultiGraph

    def __init__(self) -> None:
        """Create an empty graph and the progress display for long tasks."""
        self._G = MultiGraph()
        # the ``u``/``v`` task fields name the pair of documents currently
        # being processed; output goes to ``err_console``
        self.progress = Progress(
            "[progress.description]{task.description}",
            SpinnerColumn(),
            "[progress.description]{task.fields[u]} × {task.fields[v]}",
            BarColumn(bar_width=None),
            "{task.completed}/{task.total}",
            "{task.percentage:>3.1f}%",
            console=err_console,
            transient=True,
        )

    @property
    def matches(self) -> Iterator[Match]:
        """Iterator over all matches in the graph."""
        return (Match(**data) for _u, _v, data in self._G.edges(data=True))

    @property
    def docs(self) -> Iterator[Doc]:
        """Iterator over all docs in the graph."""
        return (doc for _label, doc in self._G.nodes(data="doc"))

    def number_of_matches(self) -> int:
        """Total number of matches in the graph."""
        return self._G.number_of_edges()

    def number_of_docs(self) -> int:
        """Total number of documents in the graph."""
        return self._G.number_of_nodes()

    def add_doc(self, label: str, doc: Doc) -> None:
        """Add a single document to the graph under the given label."""
        self._G.add_node(label, doc=doc)

    def add_docs(self, docs: Iterable[Tuple[str, Doc]]) -> None:
        """Add a collection of ``(label, doc)`` pairs to the graph."""
        self._G.add_nodes_from(((label, {"doc": doc}) for label, doc in docs))

    def add_match(self, match: Match) -> None:
        """Add a single match to the graph as an edge from its u to its v."""
        self._G.add_edge(match.u, match.v, **match._asdict())

    def add_matches(self, matches: Iterable[Match]) -> None:
        """Add a collection of matches to the graph."""
        self._G.add_edges_from([(m.u, m.v, m._asdict()) for m in matches])

    def _rebuild_matches(
        self,
        description: str,
        transform: Callable[[Iterable[Match]], Iterable[Match]],
    ) -> None:
        """Replace the matches of every document pair with ``transform``'s output.

        ``transform`` receives the list of matches shared by one pair of
        documents and returns the matches that should replace them. The graph
        is only swapped for the rebuilt one if every pair was processed without
        an exception; the progress task is removed either way.
        """
        task = self.progress.add_task(
            description, u="", v="", total=self.number_of_matches())

        # start from a copy that keeps the documents but has no matches
        G = create_empty_copy(self._G)
        try:
            with self.progress:
                # Walk only the pairs that actually share matches instead of
                # every combination of two documents: this is linear in the
                # number of matches and also covers a match whose u and v are
                # the same document. dict.fromkeys collapses the parallel
                # edges of a pair into one entry while keeping their order.
                for u, v in dict.fromkeys(self._G.edges()):
                    edges = self._G.get_edge_data(u, v)
                    self.progress.update(task, u=u, v=v)
                    matches = [Match(**data) for data in edges.values()]
                    G.add_edges_from([(m.u, m.v, m._asdict())
                                      for m in transform(matches)])
                    self.progress.update(task, advance=len(edges))
        finally:
            # never leave a stale task behind, even if the strategy raised
            self.progress.remove_task(task)
        self._G = G

    def extend(self, extender: Extender) -> None:
        """Extend all matches in the graph using a provided strategy.

        The matches of each document pair are handed to ``extend_matches``
        together with ``extender``; whatever it returns replaces them.
        """
        self._rebuild_matches(
            "extending", lambda matches: extend_matches(matches, extender))

    def align(self, align: Aligner) -> None:
        """Align all matches in the graph using a provided strategy.

        ``align`` is called once per match and its return value replaces that
        match in the graph.
        """
        self._rebuild_matches(
            "aligning", lambda matches: [align(match) for match in matches])

    def filter(self, predicate: Callable[[Match], bool]) -> None:
        """Keep only the matches for which ``predicate`` returns a true value."""
        G = create_empty_copy(self._G)
        filtered = filter(predicate, self.matches)
        G.add_edges_from([(m.u, m.v, m._asdict()) for m in filtered])
        self._G = G

Instance variables

var docs : Iterator[spacy.tokens.doc.Doc]

Iterator over all docs in the graph.

Expand source code
@property
def docs(self) -> Iterator[Doc]:
    """Lazily yield the ``doc`` attribute stored on each node of the graph."""
    labelled = self._G.nodes(data="doc")
    return (entry[1] for entry in labelled)
var matches : Iterator[Match]

Iterator over all matches in the graph.

Expand source code
@property
def matches(self) -> Iterator[Match]:
    """Lazily rebuild a ``Match`` from the attributes of each edge in the graph."""
    edge_view = self._G.edges(data=True)
    return (Match(**attrs) for _src, _dst, attrs in edge_view)

Methods

def add_doc(self, label: str, doc: spacy.tokens.doc.Doc) ‑> None

Add a single document to the graph.

Expand source code
def add_doc(self, label: str, doc: Doc) -> None:
    """Register ``doc`` as a node of the graph, keyed by ``label``."""
    node_attrs = {"doc": doc}
    self._G.add_node(label, **node_attrs)
def add_docs(self, docs: Iterable[Tuple[str, spacy.tokens.doc.Doc]]) ‑> None

Add a collection of documents to the graph.

Expand source code
def add_docs(self, docs: Iterable[Tuple[str, Doc]]) -> None:
    """Register every ``(label, doc)`` pair in ``docs`` as a node of the graph."""

    def as_node(pair):
        # shape each pair as (node, attribute dict) for add_nodes_from
        label, doc = pair
        return label, {"doc": doc}

    self._G.add_nodes_from(map(as_node, docs))
def add_match(self, match: Match) ‑> None

Add a single match to the graph.

Expand source code
def add_match(self, match: Match) -> None:
    """Store ``match`` as an edge between ``match.u`` and ``match.v``."""
    # the match's own fields become the edge attributes
    fields = match._asdict()
    self._G.add_edge(match.u, match.v, **fields)
def add_matches(self, matches: Iterable[Match]) ‑> None

Add a collection of matches to the graph.

Expand source code
def add_matches(self, matches: Iterable[Match]) -> None:
    """Store every match in ``matches`` as an edge of the graph."""
    edge_triples = []
    for match in matches:
        # (endpoint, endpoint, attribute dict) is the shape add_edges_from takes
        edge_triples.append((match.u, match.v, match._asdict()))
    self._G.add_edges_from(edge_triples)
def align(self, align: Aligner) ‑> None

Align all matches in the graph using a provided strategy.

Expand source code
def align(self, align: Aligner) -> None:
    """Align all matches in the graph using a provided strategy.

    ``align`` is called once per match and its return value replaces that
    match in the graph. The graph is only swapped for the rebuilt one if every
    match was processed without an exception; the progress task is removed
    either way.
    """
    # track progress
    task = self.progress.add_task(
        "aligning", u="", v="", total=self.number_of_matches())

    # create a new graph without matches and add each aligned match to it
    G = create_empty_copy(self._G)
    try:
        with self.progress:
            # Walk only the pairs that actually share matches instead of every
            # combination of two documents: this is linear in the number of
            # matches and also covers a match whose u and v are the same
            # document. dict.fromkeys collapses the parallel edges of a pair
            # into one entry while keeping their order.
            for u, v in dict.fromkeys(self._G.edges()):
                edges = self._G.get_edge_data(u, v)
                self.progress.update(task, u=u, v=v)
                matches = [Match(**data) for data in edges.values()]
                aligned = [align(match) for match in matches]
                G.add_edges_from([(m.u, m.v, m._asdict())
                                  for m in aligned])
                self.progress.update(task, advance=len(edges))
    finally:
        # never leave a stale task behind, even if the aligner raised
        self.progress.remove_task(task)
    self._G = G
def extend(self, extender: Extender) ‑> None

Extend all matches in the graph using a provided strategy.

Expand source code
def extend(self, extender: Extender) -> None:
    """Extend all matches in the graph using a provided strategy.

    The matches of each document pair are handed to ``extend_matches``
    together with ``extender``; whatever it returns replaces them. The graph
    is only swapped for the rebuilt one if every pair was processed without an
    exception; the progress task is removed either way.
    """
    # track progress
    task = self.progress.add_task(
        "extending", u="", v="", total=self.number_of_matches())

    # create a new graph without matches and add each extended match to it
    G = create_empty_copy(self._G)
    try:
        with self.progress:
            # Walk only the pairs that actually share matches instead of every
            # combination of two documents: this is linear in the number of
            # matches and also covers a match whose u and v are the same
            # document. dict.fromkeys collapses the parallel edges of a pair
            # into one entry while keeping their order.
            for u, v in dict.fromkeys(self._G.edges()):
                edges = self._G.get_edge_data(u, v)
                self.progress.update(task, u=u, v=v)
                matches = [Match(**data) for data in edges.values()]
                extended = extend_matches(matches, extender)
                G.add_edges_from([(m.u, m.v, m._asdict())
                                  for m in extended])
                self.progress.update(task, advance=len(edges))
    finally:
        # never leave a stale task behind, even if the extender raised
        self.progress.remove_task(task)
    self._G = G
def filter(self, predicate: Callable[[Match], bool]) ‑> None

Filter all matches in the graph using a provided predicate.

Expand source code
def filter(self, predicate: Callable[[Match], bool]) -> None:
    """Drop every match for which ``predicate`` does not return a true value."""
    # collect the surviving matches as edge triples before rebuilding
    kept = [
        (match.u, match.v, match._asdict())
        for match in self.matches
        if predicate(match)
    ]
    # same documents, no matches; then re-add only the survivors
    pruned = create_empty_copy(self._G)
    pruned.add_edges_from(kept)
    self._G = pruned
def number_of_docs(self) ‑> int

Total number of documents in the graph.

Expand source code
def number_of_docs(self) -> int:
    """Return how many document nodes the graph currently holds."""
    doc_count = self._G.number_of_nodes()
    return doc_count
def number_of_matches(self) ‑> int

Total number of matches in the graph.

Expand source code
def number_of_matches(self) -> int:
    """Return how many match edges the graph currently holds."""
    match_count = self._G.number_of_edges()
    return match_count