Module dphon.g2p
Tools for converting graphemes to phonemes.
Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tools for converting graphemes to phonemes."""
import json
import logging
from pathlib import Path
from typing import Iterable, Iterator, Mapping, Optional, Tuple, List
from spacy.language import Language
from spacy.lookups import Table
from spacy.tokens import Doc, Span, Token
from dphon.match import Match
# Placeholder phoneme for out-of-vocabulary (OOV) tokens: U+E000, a private
# use unicode char, so it is distinguishable from any phoneme in a sound table.
OOV_PHONEMES = "\ue000"
# types for sound tables: a reading is a tuple of syllable phonemes (None for
# a part of the syllable that is not used); a sound table maps a string to
# one such reading
Phonemes_T = Tuple[Optional[str], ...]
SoundTable_T = Mapping[str, Phonemes_T]
class GraphemesToPhonemes:
    """Grapheme-to-phoneme conversion using a `spacy.lookups.Table`.

    Intended for use as a spaCy pipeline component. Docs will be passed through
    the component unmodified. Registers several extension attributes that can be
    used elsewhere in a spaCy pipeline:

    - `Doc._.phonemes`: iterator over all phonemes in a `spacy.tokens.Doc`
    - `Span._.phonemes`: iterator over all phonemes in a `spacy.tokens.Span`
    - `Span._.syllables`: list with one joined syllable string per token in a
      `spacy.tokens.Span`
    - `Token._.phonemes`: tuple of phonemes for a `spacy.tokens.Token`
    - `Token._.is_oov`: check whether a token can be converted to phonemes

    Args:
        nlp: a spaCy language model.
        sound_table: grapheme-to-phoneme conversion table.
    """

    table: Table  # uses spaCy's lookup tables (bloom filtered dict)

    def __init__(self, nlp: Language, sound_table: SoundTable_T):
        # infer the syllable segmentation from the first reading and map it to
        # an all-None phoneme set; an empty table yields an empty tuple instead
        # of leaking StopIteration out of the constructor
        first_reading = next(iter(sound_table.values()), ())
        self.empty_phonemes = (None,) * len(first_reading)

        # register extensions on spaCy primitives; anything already registered
        # is left alone, so its getter stays bound to whichever instance
        # registered it first
        if not Doc.has_extension("phonemes"):
            Doc.set_extension("phonemes", getter=self.get_all_phonemes)
        if not Span.has_extension("phonemes"):
            Span.set_extension("phonemes", getter=self.get_all_phonemes)
        if not Span.has_extension("syllables"):
            Span.set_extension("syllables", getter=self._get_syllables)
        if not Token.has_extension("phonemes"):
            Token.set_extension("phonemes", getter=self.get_token_phonemes)
        if not Token.has_extension("is_oov"):
            Token.set_extension("is_oov", getter=self.is_token_oov)

        # store the sound table in the vocab's Lookups
        self.table = nlp.vocab.lookups.add_table("phonemes", sound_table)
        logging.info("using %s", self.__class__)

    def __call__(self, doc: Doc) -> Doc:
        """Return `doc` unchanged; all work happens in the extension getters."""
        return doc

    def is_token_oov(self, token: Token) -> bool:
        """`True` if `token` has no phonetic entry in the sound table.

        Args:
            token: a single `spacy.tokens.Token` to check.
        """
        return token.text not in self.table

    def has_variant(self, match: Match) -> bool:
        """`True` if `match` contains a graphic variant.

        This is designed to be called on matches that are of the same length,
        so that the match doesn't need to be aligned for it to work.

        Args:
            match: a single `dphon.match.Match`, usually output from the early
                seed stage, to check.
        """
        # compare the tokens at each position pairwise; stops at the first
        # position that holds a variant
        return any(
            self.are_graphic_variants(match.utxt[i], match.vtxt[i])
            for i in range(len(match))
        )

    def are_graphic_variants(self, *tokens: Token) -> bool:
        """Check if `tokens` are graphic variants.

        - `False` if any tokens are not in the sound table.
        - `False` if any tokens are non-voiced.
        - `False` if any tokens have identical graphemes.
        - `False` if any tokens have differing phonemes.

        Args:
            tokens: any number of `spacy.tokens.Token` to compare; at least
                one is required, since the first token is the reference.
        """
        # O(n) implementation: compare all tokens against first one
        reference_text = tokens[0].text
        reference_phonemes = self.get_token_phonemes(tokens[0])
        if reference_phonemes == self.empty_phonemes:
            return False
        if reference_phonemes == (OOV_PHONEMES,):
            return False

        for other in tokens[1:]:
            other_phonemes = self.get_token_phonemes(other)
            if other_phonemes == self.empty_phonemes:
                return False
            if other_phonemes == (OOV_PHONEMES,):
                return False
            if other_phonemes != reference_phonemes:
                return False
            if other.text == reference_text:
                return False
        return True

    def get_all_phonemes(self, tokens: Iterable[Token]) -> Iterator[str]:
        """Flattened iterator over all phonemes in `tokens`.

        - Skips parts of the syllable that are not used (stored as None).
        - Skips non-voiced tokens, such as punctuation.
        - Keeps OOV_PHONEMES as an indicator of missing phonetic information.

        Args:
            tokens: iterable of `spacy.tokens.Token` to convert.
        """
        for token in tokens:
            # falsy entries (None or empty strings) carry no sound
            yield from (
                phoneme for phoneme in self.get_token_phonemes(token) if phoneme
            )

    def get_token_phonemes(self, token: Token) -> Phonemes_T:
        """Return `token`'s phonemes as a tuple.

        - If `token` is neither alphabetic nor number-like, returns
          `self.empty_phonemes`, whose elements are all None.
        - If `token` is not in the sound table, returns a 1-tuple holding the
          special marker `OOV_PHONEMES`.
        - Otherwise returns the segments picked by `_select`; parts of the
          syllable that are not present are `None`.
        """
        if not token.is_alpha and not token.like_num:
            return self.empty_phonemes
        if token._.is_oov:
            # lazy %-formatting: this runs once per OOV token
            logging.debug('no phonemes for token: "%s"', token.text)
            return (OOV_PHONEMES,)
        return self._select(self.table[token.text])

    def _get_token_syllable(self, token: Token) -> str:
        """Join all parts of `token`'s reading into one string.

        Returns an empty string when `token` has no entry in the sound table.
        """
        try:
            reading = self.table[token.text]
        except KeyError:
            return ""
        # unused parts of a syllable are stored as None, which str.join rejects
        return "".join(part for part in reading if part)

    def _get_syllables(self, tokens: Iterable[Token]) -> List[str]:
        """Return one joined syllable string per token in `tokens`."""
        return [self._get_token_syllable(token) for token in tokens]

    def _select(self, reading: Phonemes_T) -> Phonemes_T:
        """Filter the syllable to only the segments we're interested in.

        Raises:
            IndexError: if `reading` has fewer than eight elements.
        """
        # NOTE using only initial, nucleus, coda currently
        initial, nucleus, coda = reading[3], reading[6], reading[7]
        return (initial, nucleus, coda)
def get_sound_table_json(path: Path) -> SoundTable_T:
    """Load a sound table as JSON.

    The file must hold one JSON object mapping each grapheme to a list of
    readings. Only the first reading of each grapheme is used, and its final
    two entries (source info) are dropped.

    Args:
        path: location of the JSON sound table.

    Returns:
        Mapping from each grapheme to the tuple of segments of its first
        reading. Graphemes listed without any reading are left out.

    Raises:
        ValueError: if a first reading has fewer than two entries.
    """
    # parse straight from the file handle instead of reading it into a string
    with open(path, encoding="utf8") as file:
        entries = json.load(file)

    sound_table: SoundTable_T = {}
    for char, readings in entries.items():
        if not readings:
            # nothing to convert to; leaving it out of the table lets lookups
            # treat the grapheme like any other unknown one
            logging.warning('no readings for "%s" in sound table; skipping', char)
            continue
        # FIXME just using first reading for now, ignoring multiple
        # NOTE final two entries in current table are source info; ignore
        *reading, _src, _src2 = readings[0]
        sound_table[char] = tuple(reading)  # type: ignore

    # log and return finished table
    logging.info("sound table %s loaded", path.resolve())
    return sound_table
@Language.factory("g2p")
def create_graphemes_to_phonemes(nlp: Language, name: str, sound_table: SoundTable_T) -> GraphemesToPhonemes:
    """Build the component registered under the factory name "g2p".

    Args:
        nlp: a spaCy language model, handed to the component.
        name: accepted as part of the factory signature; not used here.
        sound_table: grapheme-to-phoneme conversion table for the component.
    """
    component = GraphemesToPhonemes(nlp, sound_table)
    return component
Functions
def create_graphemes_to_phonemes(nlp: spacy.language.Language, name: str, sound_table: Mapping[str, Tuple[Optional[str], ...]]) ‑> GraphemesToPhonemes
-
Expand source code
@Language.factory("g2p")
def create_graphemes_to_phonemes(nlp: Language, name: str, sound_table: SoundTable_T) -> GraphemesToPhonemes:
    """Build the component registered under the factory name "g2p".

    Args:
        nlp: a spaCy language model, handed to the component.
        name: accepted as part of the factory signature; not used here.
        sound_table: grapheme-to-phoneme conversion table for the component.
    """
    component = GraphemesToPhonemes(nlp, sound_table)
    return component
def get_sound_table_json(path: pathlib.Path) ‑> Mapping[str, Tuple[Optional[str], ...]]
-
Load a sound table as JSON.
Expand source code
def get_sound_table_json(path: Path) -> SoundTable_T:
    """Load a sound table as JSON.

    The file must hold one JSON object mapping each grapheme to a list of
    readings. Only the first reading of each grapheme is used, and its final
    two entries (source info) are dropped.

    Args:
        path: location of the JSON sound table.

    Returns:
        Mapping from each grapheme to the tuple of segments of its first
        reading. Graphemes listed without any reading are left out.

    Raises:
        ValueError: if a first reading has fewer than two entries.
    """
    # parse straight from the file handle instead of reading it into a string
    with open(path, encoding="utf8") as file:
        entries = json.load(file)

    sound_table: SoundTable_T = {}
    for char, readings in entries.items():
        if not readings:
            # nothing to convert to; leaving it out of the table lets lookups
            # treat the grapheme like any other unknown one
            logging.warning('no readings for "%s" in sound table; skipping', char)
            continue
        # FIXME just using first reading for now, ignoring multiple
        # NOTE final two entries in current table are source info; ignore
        *reading, _src, _src2 = readings[0]
        sound_table[char] = tuple(reading)  # type: ignore

    # log and return finished table
    logging.info("sound table %s loaded", path.resolve())
    return sound_table
Classes
class GraphemesToPhonemes (nlp: spacy.language.Language, sound_table: Mapping[str, Tuple[Optional[str], ...]])
-
Grapheme-to-phoneme conversion using a `spacy.lookups.Table`.
Intended for use as a spaCy pipeline component. Docs will be passed through the component unmodified. Registers several extension attributes that can be used elsewhere in a spaCy pipeline:
- `Doc._.phonemes`: iterator over all phonemes in a `spacy.tokens.Doc`
- `Span._.phonemes`: iterator over all phonemes in a `spacy.tokens.Span`
- `Token._.phonemes`: tuple of phonemes for a `spacy.tokens.Token`
- `Token._.is_oov`: check whether a token can be converted to phonemes
Args
- `nlp`: a spaCy language model.
- `sound_table`: grapheme-to-phoneme conversion table.
Expand source code
class GraphemesToPhonemes:
    """Grapheme-to-phoneme conversion using a `spacy.lookups.Table`.

    Intended for use as a spaCy pipeline component. Docs will be passed through
    the component unmodified. Registers several extension attributes that can be
    used elsewhere in a spaCy pipeline:

    - `Doc._.phonemes`: iterator over all phonemes in a `spacy.tokens.Doc`
    - `Span._.phonemes`: iterator over all phonemes in a `spacy.tokens.Span`
    - `Span._.syllables`: list with one joined syllable string per token in a
      `spacy.tokens.Span`
    - `Token._.phonemes`: tuple of phonemes for a `spacy.tokens.Token`
    - `Token._.is_oov`: check whether a token can be converted to phonemes

    Args:
        nlp: a spaCy language model.
        sound_table: grapheme-to-phoneme conversion table.
    """

    table: Table  # uses spaCy's lookup tables (bloom filtered dict)

    def __init__(self, nlp: Language, sound_table: SoundTable_T):
        # infer the syllable segmentation from the first reading and map it to
        # an all-None phoneme set; an empty table yields an empty tuple instead
        # of leaking StopIteration out of the constructor
        first_reading = next(iter(sound_table.values()), ())
        self.empty_phonemes = (None,) * len(first_reading)

        # register extensions on spaCy primitives; anything already registered
        # is left alone, so its getter stays bound to whichever instance
        # registered it first
        if not Doc.has_extension("phonemes"):
            Doc.set_extension("phonemes", getter=self.get_all_phonemes)
        if not Span.has_extension("phonemes"):
            Span.set_extension("phonemes", getter=self.get_all_phonemes)
        if not Span.has_extension("syllables"):
            Span.set_extension("syllables", getter=self._get_syllables)
        if not Token.has_extension("phonemes"):
            Token.set_extension("phonemes", getter=self.get_token_phonemes)
        if not Token.has_extension("is_oov"):
            Token.set_extension("is_oov", getter=self.is_token_oov)

        # store the sound table in the vocab's Lookups
        self.table = nlp.vocab.lookups.add_table("phonemes", sound_table)
        logging.info("using %s", self.__class__)

    def __call__(self, doc: Doc) -> Doc:
        """Return `doc` unchanged; all work happens in the extension getters."""
        return doc

    def is_token_oov(self, token: Token) -> bool:
        """`True` if `token` has no phonetic entry in the sound table.

        Args:
            token: a single `spacy.tokens.Token` to check.
        """
        return token.text not in self.table

    def has_variant(self, match: Match) -> bool:
        """`True` if `match` contains a graphic variant.

        This is designed to be called on matches that are of the same length,
        so that the match doesn't need to be aligned for it to work.

        Args:
            match: a single `dphon.match.Match`, usually output from the early
                seed stage, to check.
        """
        # compare the tokens at each position pairwise; stops at the first
        # position that holds a variant
        return any(
            self.are_graphic_variants(match.utxt[i], match.vtxt[i])
            for i in range(len(match))
        )

    def are_graphic_variants(self, *tokens: Token) -> bool:
        """Check if `tokens` are graphic variants.

        - `False` if any tokens are not in the sound table.
        - `False` if any tokens are non-voiced.
        - `False` if any tokens have identical graphemes.
        - `False` if any tokens have differing phonemes.

        Args:
            tokens: any number of `spacy.tokens.Token` to compare; at least
                one is required, since the first token is the reference.
        """
        # O(n) implementation: compare all tokens against first one
        reference_text = tokens[0].text
        reference_phonemes = self.get_token_phonemes(tokens[0])
        if reference_phonemes == self.empty_phonemes:
            return False
        if reference_phonemes == (OOV_PHONEMES,):
            return False

        for other in tokens[1:]:
            other_phonemes = self.get_token_phonemes(other)
            if other_phonemes == self.empty_phonemes:
                return False
            if other_phonemes == (OOV_PHONEMES,):
                return False
            if other_phonemes != reference_phonemes:
                return False
            if other.text == reference_text:
                return False
        return True

    def get_all_phonemes(self, tokens: Iterable[Token]) -> Iterator[str]:
        """Flattened iterator over all phonemes in `tokens`.

        - Skips parts of the syllable that are not used (stored as None).
        - Skips non-voiced tokens, such as punctuation.
        - Keeps OOV_PHONEMES as an indicator of missing phonetic information.

        Args:
            tokens: iterable of `spacy.tokens.Token` to convert.
        """
        for token in tokens:
            # falsy entries (None or empty strings) carry no sound
            yield from (
                phoneme for phoneme in self.get_token_phonemes(token) if phoneme
            )

    def get_token_phonemes(self, token: Token) -> Phonemes_T:
        """Return `token`'s phonemes as a tuple.

        - If `token` is neither alphabetic nor number-like, returns
          `self.empty_phonemes`, whose elements are all None.
        - If `token` is not in the sound table, returns a 1-tuple holding the
          special marker `OOV_PHONEMES`.
        - Otherwise returns the segments picked by `_select`; parts of the
          syllable that are not present are `None`.
        """
        if not token.is_alpha and not token.like_num:
            return self.empty_phonemes
        if token._.is_oov:
            # lazy %-formatting: this runs once per OOV token
            logging.debug('no phonemes for token: "%s"', token.text)
            return (OOV_PHONEMES,)
        return self._select(self.table[token.text])

    def _get_token_syllable(self, token: Token) -> str:
        """Join all parts of `token`'s reading into one string.

        Returns an empty string when `token` has no entry in the sound table.
        """
        try:
            reading = self.table[token.text]
        except KeyError:
            return ""
        # unused parts of a syllable are stored as None, which str.join rejects
        return "".join(part for part in reading if part)

    def _get_syllables(self, tokens: Iterable[Token]) -> List[str]:
        """Return one joined syllable string per token in `tokens`."""
        return [self._get_token_syllable(token) for token in tokens]

    def _select(self, reading: Phonemes_T) -> Phonemes_T:
        """Filter the syllable to only the segments we're interested in.

        Raises:
            IndexError: if `reading` has fewer than eight elements.
        """
        # NOTE using only initial, nucleus, coda currently
        initial, nucleus, coda = reading[3], reading[6], reading[7]
        return (initial, nucleus, coda)
Methods
def are_graphic_variants(self, *tokens: spacy.tokens.token.Token) ‑> bool
-
Check if `tokens` are graphic variants.
- `False` if any tokens are not in the sound table.
- `False` if any tokens are non-voiced.
- `False` if any tokens have identical graphemes.
- `False` if any tokens have differing phonemes.
Args
- `tokens`: any number of `spacy.tokens.Token` to compare.
Expand source code
def are_graphic_variants(self, *tokens: Token) -> bool:
    """Check if `tokens` are graphic variants.

    - `False` if any tokens are not in the sound table.
    - `False` if any tokens are non-voiced.
    - `False` if any tokens have identical graphemes.
    - `False` if any tokens have differing phonemes.

    Args:
        tokens: any number of `spacy.tokens.Token` to compare.
    """
    # the first token is the reference every other token is measured against
    reference_text = tokens[0].text
    reference_phonemes = self.get_token_phonemes(tokens[0])
    if reference_phonemes == self.empty_phonemes:
        return False
    if reference_phonemes == (OOV_PHONEMES,):
        return False

    for other in tokens[1:]:
        other_phonemes = self.get_token_phonemes(other)
        if other_phonemes == self.empty_phonemes:
            return False
        if other_phonemes == (OOV_PHONEMES,):
            return False
        if other_phonemes != reference_phonemes:
            return False
        if other.text == reference_text:
            return False
    return True
def get_all_phonemes(self, tokens: Iterable[spacy.tokens.token.Token]) ‑> Iterator[str]
-
Flattened iterator over all phonemes in `tokens`.
- Skips parts of the syllable that are not used (stored as None).
- Skips non-voiced tokens, such as punctuation.
- Keeps OOV_PHONEMES as an indicator of missing phonetic information.
Args
- `tokens`: iterable of `spacy.tokens.Token` to convert.
Expand source code
def get_all_phonemes(self, tokens: Iterable[Token]) -> Iterator[str]:
    """Flattened iterator over all phonemes in `tokens`.

    - Skips parts of the syllable that are not used (stored as None).
    - Skips non-voiced tokens, such as punctuation.
    - Keeps OOV_PHONEMES as an indicator of missing phonetic information.

    Args:
        tokens: iterable of `spacy.tokens.Token` to convert.
    """
    for token in tokens:
        # falsy entries (None or empty strings) carry no sound
        yield from (
            phoneme for phoneme in self.get_token_phonemes(token) if phoneme
        )
def get_token_phonemes(self, token: spacy.tokens.token.Token) ‑> Tuple[Optional[str], ...]
-
Return `token`'s phonemes as a tuple.
- If `token` is non-alphanumeric, all elements of the tuple will be `None`.
- If `token` is not in the sound table, the tuple holds a single special marker (`OOV_PHONEMES`).
- If some parts of the syllable are not present, their corresponding elements in the tuple will be `None`.
Expand source code
def get_token_phonemes(self, token: Token) -> Phonemes_T:
    """Return `token`'s phonemes as a tuple.

    - If `token` is neither alphabetic nor number-like, returns
      `self.empty_phonemes`, whose elements are all None.
    - If `token` is not in the sound table, returns a 1-tuple holding the
      special marker `OOV_PHONEMES`.
    - Otherwise returns the segments picked by `_select`; parts of the
      syllable that are not present are `None`.
    """
    if not token.is_alpha and not token.like_num:
        return self.empty_phonemes
    if token._.is_oov:
        # lazy %-formatting: this runs once per OOV token
        logging.debug('no phonemes for token: "%s"', token.text)
        return (OOV_PHONEMES,)
    return self._select(self.table[token.text])
- If
def has_variant(self, match: Match) ‑> bool
-
`True` if `match` contains a graphic variant.
This is designed to be called on matches that are of the same length, so that the match doesn't need to be aligned for it to work.
Args
- `match`: a single `dphon.match.Match`, usually output from the early seed stage, to check.
Expand source code
def has_variant(self, match: Match) -> bool:
    """`True` if `match` contains a graphic variant.

    This is designed to be called on matches that are of the same length,
    so that the match doesn't need to be aligned for it to work.

    Args:
        match: a single `dphon.match.Match`, usually output from the early
            seed stage, to check.
    """
    # compare the tokens at each position pairwise; stops at the first
    # position that holds a variant
    return any(
        self.are_graphic_variants(match.utxt[i], match.vtxt[i])
        for i in range(len(match))
    )
def is_token_oov(self, token: spacy.tokens.token.Token) ‑> bool
-
`True` if `token` has no phonetic entry in the sound table.
Args
- `token`: a single `spacy.tokens.Token` to check.
Expand source code
def is_token_oov(self, token: Token) -> bool:
    """`True` if `token` has no phonetic entry in the sound table.

    Args:
        token: a single `spacy.tokens.Token` to check.
    """
    in_table = token.text in self.table
    return not in_table