Module dphon.ngrams

SpaCy pipeline component for generating Token n-grams from Docs.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""SpaCy pipeline component for generating Token n-grams from Docs."""

import logging
from typing import Iterator

from spacy.language import Language
from spacy.tokens import Doc, Span


class Ngrams():
    """A spaCy pipeline component for generating Token n-grams from Docs.

    Calling the component on a Doc is a no-op. Its purpose is to register a
    ``Doc._.ngrams`` extension whose getter yields every window of ``n``
    consecutive tokens in the Doc as a Span.
    """

    n: int  # number of tokens per n-gram (order)

    def __init__(self, nlp: Language, n: int):
        """Initialize the n-gram component.

        Args:
            nlp: The pipeline the component belongs to (not used here).
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        # A window of zero or negative width cannot form a meaningful n-gram,
        # so reject it up front instead of silently yielding empty spans.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        # NOTE: the extension is only registered when no "ngrams" extension
        # exists yet, so the getter stays bound to whichever instance
        # registered it first (and therefore to that instance's ``n``).
        if not Doc.has_extension("ngrams"):
            Doc.set_extension("ngrams", getter=self.get_doc_ngrams)
        logging.info("using %s with n=%s", self.__class__, self.n)

    def __call__(self, doc: Doc) -> Doc:
        """Return the Doc unmodified."""
        return doc

    def get_doc_ngrams(self, doc: Doc) -> Iterator[Span]:
        """Return an iterator over n-grams in a Doc as Spans.

        An empty Doc produces an empty iterator. Otherwise one span
        ``doc[i:i + n]`` is produced per start position ``i``; when the Doc
        has fewer than ``n`` tokens, exactly one span (``doc[0:n]``) is
        produced rather than none.
        """
        # if empty doc, nothing should happen
        if len(doc) == 0:
            return iter([])
        # max(..., 1) guarantees at least one window for short docs
        starts = range(max(len(doc) - self.n + 1, 1))
        return (doc[i:i + self.n] for i in starts)


@Language.factory("ngrams")
def create_ngrams(nlp: Language, name: str, n: int) -> Ngrams:
    """Build the ``Ngrams`` component registered under the "ngrams" factory.

    Args:
        nlp: The pipeline the component is created for.
        name: The name given to the component instance (not used here).
        n: Number of tokens per n-gram.

    Returns:
        A new ``Ngrams`` component of order ``n``.
    """
    component = Ngrams(nlp, n)
    return component

Functions

def create_ngrams(nlp: spacy.language.Language, name: str, n: int) ‑> Ngrams
Expand source code
@Language.factory("ngrams")
def create_ngrams(nlp: Language, name: str, n: int) -> Ngrams:
    """Build the ``Ngrams`` component registered under the "ngrams" factory.

    Args:
        nlp: The pipeline the component is created for.
        name: The name given to the component instance (not used here).
        n: Number of tokens per n-gram.

    Returns:
        A new ``Ngrams`` component of order ``n``.
    """
    component = Ngrams(nlp, n)
    return component

Classes

class Ngrams (nlp: spacy.language.Language, n: int)

A spaCy pipeline component for generating Token n-grams from Docs.

Initialize the n-gram component.

Expand source code
class Ngrams():
    """A spaCy pipeline component for generating Token n-grams from Docs.

    Calling the component on a Doc is a no-op. Its purpose is to register a
    ``Doc._.ngrams`` extension whose getter yields every window of ``n``
    consecutive tokens in the Doc as a Span.
    """

    n: int  # number of tokens per n-gram (order)

    def __init__(self, nlp: Language, n: int):
        """Initialize the n-gram component.

        Args:
            nlp: The pipeline the component belongs to (not used here).
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        # A window of zero or negative width cannot form a meaningful n-gram,
        # so reject it up front instead of silently yielding empty spans.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        # NOTE: the extension is only registered when no "ngrams" extension
        # exists yet, so the getter stays bound to whichever instance
        # registered it first (and therefore to that instance's ``n``).
        if not Doc.has_extension("ngrams"):
            Doc.set_extension("ngrams", getter=self.get_doc_ngrams)
        logging.info("using %s with n=%s", self.__class__, self.n)

    def __call__(self, doc: Doc) -> Doc:
        """Return the Doc unmodified."""
        return doc

    def get_doc_ngrams(self, doc: Doc) -> Iterator[Span]:
        """Return an iterator over n-grams in a Doc as Spans.

        An empty Doc produces an empty iterator. Otherwise one span
        ``doc[i:i + n]`` is produced per start position ``i``; when the Doc
        has fewer than ``n`` tokens, exactly one span (``doc[0:n]``) is
        produced rather than none.
        """
        # if empty doc, nothing should happen
        if len(doc) == 0:
            return iter([])
        # max(..., 1) guarantees at least one window for short docs
        starts = range(max(len(doc) - self.n + 1, 1))
        return (doc[i:i + self.n] for i in starts)

Class variables

var n : int

Methods

def get_doc_ngrams(self, doc: spacy.tokens.doc.Doc) ‑> Iterator[spacy.tokens.span.Span]

Return an iterator over n-grams in a Doc as Spans.

Expand source code
def get_doc_ngrams(self, doc: Doc) -> Iterator[Span]:
    """Produce the Doc's n-grams lazily, one Span per window start.

    A Doc of length zero gives an empty iterator. A Doc shorter than the
    n-gram order still gives exactly one span, ``doc[0:n]``.
    """
    token_count = len(doc)
    if token_count != 0:
        # always at least one start position, even when the doc is short
        window_starts = range(max(token_count - self.n + 1, 1))
        return (doc[start:start + self.n] for start in window_starts)
    # empty doc: no n-grams at all
    return iter([])