Module dphon.index
Tools for building indices of document data.
Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tools for building indices of document data."""
from dphon.g2p import OOV_PHONEMES
import logging
from abc import ABC, abstractmethod
from typing import Callable, Hashable, Iterable, Iterator, List, Tuple, TypeVar, Generic
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.lookups import Table
K = TypeVar("K") # type for keys stored in the index
V = TypeVar("V") # type for values stored in the index
class Index(ABC, Generic[K, V]):
    """Interface for components that index data extracted from documents.

    Concrete subclasses are meant to be used as spaCy pipeline components:
    every Doc handed to the component is returned to the caller, while data
    derived from it is recorded in an index owned by the component so it can
    be queried later. How data is extracted and stored is left to the
    subclass.

    Args:
        nlp: a spaCy language model.
    """

    def __init__(self, nlp: Language):
        # The base class only announces which concrete index is in use; it
        # does not keep a reference to `nlp`.
        logging.info(f"using {self.__class__}")

    @abstractmethod
    def __call__(self, doc: Doc) -> Doc:
        """Index one spaCy Doc and hand it back; must not mutate the Doc.

        The base implementation simply returns `doc`, so subclasses can end
        their override with `return super().__call__(doc)`.
        """
        return doc

    @abstractmethod
    def __len__(self) -> int:
        """Return how many distinct keys the index holds."""
        raise NotImplementedError

    @property
    @abstractmethod
    def size(self) -> int:
        """Return how many values the index holds across all keys."""
        raise NotImplementedError

    @abstractmethod
    def __contains__(self, key: K) -> bool:
        """Report whether `key` is present in the index."""
        raise NotImplementedError

    @abstractmethod
    def __getitem__(self, key: K) -> Iterable[V]:
        """Return every value stored under `key`."""
        raise NotImplementedError

    @abstractmethod
    def __iter__(self) -> Iterator[Tuple[K, List[V]]]:
        """Iterate over all (key, values) entries in the index."""
        raise NotImplementedError

    @abstractmethod
    def filter(self, fn: Callable[[Tuple[K, List[V]]], bool]) -> Iterator[Tuple[K, List[V]]]:
        """Iterate over the (key, values) entries for which `fn` is true."""
        raise NotImplementedError
class LookupsIndex(Index[Hashable, V], Generic[V]):
    """Index using a `spacy.lookups.Table` to store document data.

    Subclasses must implement `_get_vals()` and `_get_key()` to define how
    data is to be extracted from documents and indexed. `_get_vals()` returns
    an iterable of all values from a document that should be indexed, while
    `_get_key()` returns the key for a single value.

    Data is indexed as a `spacy.lookups.Table`, which is a subclass of
    `collections.OrderedDict` with a bloom filter applied to speed up querying.

    NOTE(review): spaCy's `Table` appears to hash string keys on insertion, so
    the keys yielded by `__iter__`/`filter` may be hashes rather than the
    original key objects -- confirm against the spaCy version in use.

    Args:
        nlp: a spaCy language model. The backing table is registered on
            `nlp.vocab.lookups` under the name "index".
    """

    _table: Table  # uses spaCy's lookup tables (bloom filtered dict)
    _size: int  # tracker for total number of values in index

    def __init__(self, nlp: Language) -> None:
        self._table = nlp.vocab.lookups.add_table("index")
        self._size = 0
        super().__init__(nlp)

    def __call__(self, doc: Doc) -> Doc:
        """Extract values from a doc with _get_vals and index with _get_key.

        Each value is appended to the list stored under its key; the list is
        created the first time a key is seen. The Doc itself is returned
        unchanged.
        """
        for val in self._get_vals(doc):
            key = self._get_key(val)
            # Look the bucket up explicitly instead of relying on an
            # AttributeError from `None.append`: new keys are common, and
            # raising per item is both slow and able to hide unrelated errors.
            bucket = self._table.get(key)
            if bucket is None:
                self._table.set(key, [val])
            else:
                bucket.append(val)
            self._size += 1
        return super().__call__(doc)

    def __len__(self) -> int:
        """Get the total number of keys in the index."""
        return len(self._table)

    def __contains__(self, key: Hashable) -> bool:
        """Check if a key is in the index."""
        return key in self._table

    def __getitem__(self, key: Hashable) -> List[V]:
        """Return a list of all values indexed at a given key.

        The lookup goes through `Table.get`, so a key that was never indexed
        yields that method's default (`None`) rather than raising `KeyError`.
        """
        return self._table.get(key)

    def __iter__(self) -> Iterator[Tuple[Hashable, List[V]]]:
        """Return a (k, v) iterator over all entries in the index."""
        return iter(self._table.items())

    @abstractmethod
    def _get_vals(self, doc: Doc) -> Iterable[V]:
        """Get all the values to be indexed from a Doc."""
        raise NotImplementedError

    @abstractmethod
    def _get_key(self, val: V) -> Hashable:
        """Get the key to index a particular value."""
        raise NotImplementedError

    def filter(
        self, fn: Callable[[Tuple[Hashable, List[V]]], bool]
    ) -> Iterator[Tuple[Hashable, List[V]]]:
        """Return a (k, v) iterator over all entries which match a predicate.

        Args:
            fn: called with each (key, values) entry; entries for which it
                returns a truthy result are yielded.
        """
        return (entry for entry in self if fn(entry))

    @property
    def size(self) -> int:
        """Get the total number of values in the index."""
        return self._size
class NgramPhonemesLookupsIndex(LookupsIndex[Span]):
    """Phonetic n-gram index stored in a `spacy.lookups.Table`.

    An n-gram's key is its phonetic content joined into a single string. The
    values stored under a key are the `spacy.tokens.Span` objects where that
    phonetic content was found.

    - Needs an n-gram component (see `dphon.ngrams`) that splits document text
      into n-grams.
    - Needs a grapheme-to-phoneme model (see `dphon.g2p`) that supplies the
      phonetic content of each n-gram.
    """

    def _get_vals(self, doc: Doc) -> Iterator[Span]:
        """Yield the n-grams of `doc` that are eligible for indexing.

        An n-gram is skipped when its text is not purely alphabetic, or when
        its phonemes include the `OOV_PHONEMES` marker (i.e. the g2p model had
        no phonetic information for part of it).
        """
        for candidate in doc._.ngrams:
            # Check the text first so phonemes are only read for alphabetic
            # n-grams, mirroring the original short-circuit order.
            if not candidate.text.isalpha():
                continue
            if OOV_PHONEMES in candidate._.phonemes:
                continue
            yield candidate

    def _get_key(self, val: Span) -> str:
        """Concatenate an n-gram's phonemes into one string key."""
        phonemes = val._.phonemes
        return "".join(phonemes)
@Language.factory("ngram_phonemes_index")
def create_ngram_phonemes_lookup_index(nlp: Language, name: str) -> NgramPhonemesLookupsIndex:
    """Build the component registered with spaCy as "ngram_phonemes_index".

    Args:
        nlp: the spaCy language model, passed to the index constructor.
        name: the component name given to the factory; not used here.

    Returns:
        A new `NgramPhonemesLookupsIndex` bound to `nlp`.
    """
    index = NgramPhonemesLookupsIndex(nlp)
    return index
Functions
def create_ngram_phonemes_lookup_index(nlp: spacy.language.Language, name: str) ‑> NgramPhonemesLookupsIndex
-
Expand source code
@Language.factory("ngram_phonemes_index") def create_ngram_phonemes_lookup_index(nlp: Language, name: str) -> NgramPhonemesLookupsIndex: return NgramPhonemesLookupsIndex(nlp)
Classes
class Index (nlp: spacy.language.Language)
-
Abstract base class that indexes arbitrary document data.
Subclasses are intended for use as spaCy pipeline components. Docs will be passed through the component unmodified, storing some data in an external index maintained by the component for later use. The particular indexing strategy is implementation-defined.
Args
nlp
- a spaCy language model.
Expand source code
class Index(ABC, Generic[K, V]): """Abstract base class that indexes arbitrary document data. Subclasses are intended for use as spaCy pipeline components. Docs will be passed through the component unmodified, storing some data in an external index maintained by the component for later use. The particular indexing strategy is implementation-defined. Args: nlp: a spaCy language model. """ def __init__(self, nlp: Language): logging.info(f"using {self.__class__}") @abstractmethod def __call__(self, doc: Doc) -> Doc: """Index a single spaCy Doc. Should not mutate the Doc.""" return doc @abstractmethod def __contains__(self, key: K) -> bool: """Check if a key is in the index.""" raise NotImplementedError @abstractmethod def __getitem__(self, key: K) -> Iterable[V]: """Return all values indexed at a given key.""" raise NotImplementedError @abstractmethod def __iter__(self) -> Iterator[Tuple[K, List[V]]]: """Return a (k, v) iterator over all entries in the index.""" raise NotImplementedError @abstractmethod def filter(self, fn: Callable[[Tuple[K, List[V]]], bool]) -> Iterator[Tuple[K, List[V]]]: """Return a (k, v) iterator over all entries which match a predicate.""" raise NotImplementedError @abstractmethod def __len__(self) -> int: """Get the total number of keys in the index.""" raise NotImplementedError @property @abstractmethod def size(self) -> int: """Get the total number of values in the index.""" raise NotImplementedError
Ancestors
- abc.ABC
- typing.Generic
Subclasses
Instance variables
var size : int
-
Get the total number of values in the index.
Expand source code
@property @abstractmethod def size(self) -> int: """Get the total number of values in the index.""" raise NotImplementedError
Methods
def filter(self, fn: Callable[[Tuple[~K, List[~V]]], bool]) ‑> Iterator[Tuple[~K, List[~V]]]
-
Return a (k, v) iterator over all entries which match a predicate.
Expand source code
@abstractmethod def filter(self, fn: Callable[[Tuple[K, List[V]]], bool]) -> Iterator[Tuple[K, List[V]]]: """Return a (k, v) iterator over all entries which match a predicate.""" raise NotImplementedError
class LookupsIndex (nlp: spacy.language.Language)
-
Index using a
spacy.lookups.Table
to store document data. Subclasses must implement
_get_vals()
and _get_key()
to define how data is to be extracted from documents and indexed. _get_vals()
returns an iterable of all values from a document that should be indexed, while _get_key()
returns the key for a single value. Data is indexed as a
spacy.lookups.Table
, which is a subclass of collections.OrderedDict
with a bloom filter applied to speed up querying.
Args
nlp
- a spaCy language model.
Expand source code
class LookupsIndex(Index[Hashable, V], Generic[V]): """Index using a `spacy.lookups.Table` to store document data. Subclasses must implement `_get_vals()` and `_get_keys()` to define how data is to be extracted from documents and indexed. `_get_vals()` returns an iterable of all values from a document that should be indexed, while `_get_keys()` returns the key for a single value. Data is indexed as a `spacy.lookups.Table`, which is a subclass of `collections.OrderedDict` with a bloom filter applied to speed up querying. Args: nlp: a spaCy language model. """ _table: Table # uses spaCy's lookup tables (bloom filtered dict) _size: int # tracker for total number of values in index def __init__(self, nlp: Language) -> None: self._table = nlp.vocab.lookups.add_table("index") self._size = 0 super().__init__(nlp) def __call__(self, doc: Doc) -> Doc: """Extract values from a doc with _get_vals and index with _get_key.""" for val in self._get_vals(doc): key = self._get_key(val) try: self._table.get(key).append(val) except AttributeError: self._table.set(key, [val]) self._size += 1 return super().__call__(doc) def __len__(self) -> int: """Get the total number of keys in the index.""" return len(self._table) def __contains__(self, key: K) -> bool: """Check if a key is in the index.""" return key in self._table def __getitem__(self, key: K) -> List[V]: """Return a list of all values indexed at a given key.""" return self._table.get(key) def __iter__(self) -> Iterator[Tuple[K, List[V]]]: """Return a (k, v) iterator over all entries in the index.""" return (entry for entry in self._table.items()) @abstractmethod def _get_vals(self, doc: Doc) -> Iterable[V]: """Get all the values to be indexed from a Doc.""" raise NotImplementedError @abstractmethod def _get_key(self, val: V) -> Hashable: """Get the key to index a particular value.""" raise NotImplementedError def filter(self, fn: Callable[[Tuple[K, List[V]]], bool]) -> Iterator[Tuple[K, List[V]]]: """Return a (k, v) iterator 
over all entries which match a predicate.""" return (entry for entry in iter(self) if fn(entry)) @property def size(self) -> int: """Get the total number of values in the index.""" return self._size
Ancestors
- Index
- abc.ABC
- typing.Generic
Subclasses
Inherited members
class NgramPhonemesLookupsIndex (nlp: spacy.language.Language)
-
Index of phonetic n-grams using a
spacy.lookups.Table
. Each key in the index is the phonetic content of a unique document n-gram as a string. Values stored at this key are
spacy.tokens.Span
objects representing the document locations where this phonetic content occurred.
- Requires an n-gram component (see
dphon.ngrams
) to break document text into n-grams.
- Requires a grapheme-to-phoneme model (see
dphon.g2p
) to retrieve phonetic content of n-grams.
Expand source code
class NgramPhonemesLookupsIndex(LookupsIndex[Span]): """Index of phonetic n-grams using a `spacy.lookups.Table`. Each key in the index is the phonetic content of a unique document n-gram as a string. Values stored at this key are `spacy.tokens.Span` objects representing the document locations where this phonetic content occurred. - Requires an n-gram component (see `dphon.ngrams`) to break document text into n-grams. - Requires a grapheme-to-phoneme model (see `dphon.g2p`) to retrieve phonetic content of n-grams. """ def _get_vals(self, doc: Doc) -> Iterator[Span]: """Iterator over phonetic ngrams in the doc. Discards any ngrams containing non-voiced content, and any for which the g2p model did not have phonetic information. """ for ngram in doc._.ngrams: if ngram.text.isalpha() and OOV_PHONEMES not in ngram._.phonemes: yield ngram def _get_key(self, val: Span) -> str: """All phonetic content of an ngram as a string.""" return "".join(val._.phonemes)
Ancestors
- LookupsIndex
- Index
- abc.ABC
- typing.Generic
Inherited members
- Requires an n-gram component (see