608 lines
24 KiB
Python
608 lines
24 KiB
Python
|
# -*- coding: utf-8 -*-
|
|||
|
# Natural Language Toolkit: ALINE
|
|||
|
#
|
|||
|
# Copyright (C) 2001-2018 NLTK Project
|
|||
|
# Author: Greg Kondrak <gkondrak@ualberta.ca>
|
|||
|
# Geoff Bacon <bacon@berkeley.edu> (Python port)
|
|||
|
# URL: <http://nltk.org/>
|
|||
|
# For license information, see LICENSE.TXT
|
|||
|
|
|||
|
"""
|
|||
|
ALINE
|
|||
|
http://webdocs.cs.ualberta.ca/~kondrak/
|
|||
|
Copyright 2002 by Grzegorz Kondrak.
|
|||
|
|
|||
|
ALINE is an algorithm for aligning phonetic sequences, described in [1].
|
|||
|
This module is a port of Kondrak's (2002) ALINE. It provides functions for
|
|||
|
phonetic sequence alignment and similarity analysis. These are useful in
|
|||
|
historical linguistics, sociolinguistics and synchronic phonology.
|
|||
|
|
|||
|
ALINE has parameters that can be tuned for desired output. These parameters are:
|
|||
|
- C_skip, C_sub, C_exp, C_vwl
|
|||
|
- Salience weights
|
|||
|
- Segmental features
|
|||
|
|
|||
|
In this implementation, some parameters have been changed from their default
|
|||
|
values as described in [1], in order to replicate published results. All changes
|
|||
|
are noted in comments.
|
|||
|
|
|||
|
Example usage
|
|||
|
-------------
|
|||
|
|
|||
|
# Get optimal alignment of two phonetic sequences
|
|||
|
|
|||
|
>>> align('θin', 'tenwis') # doctest: +SKIP
|
|||
|
[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
|
|||
|
|
|||
|
[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
|
|||
|
University of Toronto.
|
|||
|
"""
|
|||
|
|
|||
|
from __future__ import unicode_literals
|
|||
|
|
|||
|
try:
|
|||
|
import numpy as np
|
|||
|
except ImportError:
|
|||
|
np = None
|
|||
|
|
|||
|
# === Constants ===
|
|||
|
|
|||
|
inf = float('inf')

# Default values for maximum similarity scores (Kondrak 2002: 54)
# C_skip is a penalty and must be negative: with a positive value every gap
# raised the score, so "local" alignments always stretched to swallow
# trailing unmatched segments instead of stopping at the matched core.
C_skip = -10  # Indels
C_sub = 35  # Substitutions
C_exp = 45  # Expansions/compressions
C_vwl = 5  # Vowel/consonant relative weight (decreased from 10)
|
|||
|
|
|||
|
# Segments treated as consonants by R() and V(); anything else is scored as
# a vowel.  Every entry must be a single character, because membership is
# tested against individual segments of the input strings.
consonants = ['B', 'N', 'R', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
              'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z', 'ç', 'ð', 'ħ',
              'ŋ', 'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ',
              'ɸ', 'ɹ', 'ɻ', 'ɽ', 'ɾ', 'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʒ',
              'ʔ', 'ʕ', 'ʙ', 'ʝ', 'β', 'θ', 'χ', 'ʐ', 'w']
|
|||
|
|
|||
|
# Feature names consulted when comparing two segments.  R() hands back R_c
# when at least one segment is a consonant and R_v otherwise.
R_c = [
    "aspirated",
    "lateral",
    "manner",
    "nasal",
    "place",
    "retroflex",
    "syllabic",
    "voice",
]
# 'high' taken out of R_v because same as manner
R_v = [
    "back",
    "lateral",
    "long",
    "manner",
    "nasal",
    "place",
    "retroflex",
    "round",
    "syllabic",
    "voice",
]
|
|||
|
|
|||
|
# Flattened feature matrix (Kondrak 2002: 56)
# Maps every feature *value* name to its numeric position on that feature's
# scale; diff() subtracts two of these numbers.
similarity_matrix = {
    # place
    "bilabial": 1.0,
    "labiodental": 0.95,
    "dental": 0.9,
    "alveolar": 0.85,
    "retroflex": 0.8,
    "palato-alveolar": 0.75,
    "palatal": 0.7,
    "velar": 0.6,
    "uvular": 0.5,
    "pharyngeal": 0.3,
    "glottal": 0.1,
    "labiovelar": 1.0,
    "vowel": -1.0,  # added 'vowel'
    # manner
    "stop": 1.0,
    "affricate": 0.9,
    "fricative": 0.85,  # increased fricative from 0.8
    "trill": 0.7,
    "tap": 0.65,
    "approximant": 0.6,
    "high vowel": 0.4,
    "mid vowel": 0.2,
    "low vowel": 0.0,
    "vowel2": 0.5,  # added vowel
    # high
    "high": 1.0,
    "mid": 0.5,
    "low": 0.0,
    # back
    "front": 1.0,
    "central": 0.5,
    "back": 0.0,
    # binary features
    "plus": 1.0,
    "minus": 0.0,
}
|
|||
|
|
|||
|
# Relative weights of phonetic features (Kondrak 2002: 55)
# delta() multiplies each per-feature difference by the weight stored here.
salience = {
    "syllabic": 5,
    "place": 40,
    "manner": 50,
    "voice": 5,  # decreased from 10
    "nasal": 20,  # increased from 10
    "retroflex": 10,
    "lateral": 10,
    "aspirated": 5,
    "long": 0,  # decreased from 1
    "high": 3,  # decreased from 5
    "back": 2,  # decreased from 5
    "round": 2,  # decreased from 5
}
|
|||
|
|
|||
|
# (Kondrak 2002: 59-60)
def _build_feature_matrix():
    """
    Assemble the segment -> {feature name: feature value} table.

    Each consonant row supplies the features that vary between consonants;
    'syllabic' and 'aspirated' are 'minus' for all of them.  Each vowel row
    supplies height, backness, rounding and length; the remaining vowel
    features are the same for every vowel.

    :rtype: dict(str, dict(str, str))
    :return: Mapping from a one-character segment to its feature values
    """
    plus, minus = 'plus', 'minus'

    # (symbol, place, manner, voice, nasal, retroflex, lateral)
    consonant_rows = [
        ('p', 'bilabial', 'stop', minus, minus, minus, minus),
        ('b', 'bilabial', 'stop', plus, minus, minus, minus),
        ('t', 'alveolar', 'stop', minus, minus, minus, minus),
        ('d', 'alveolar', 'stop', plus, minus, minus, minus),
        ('ʈ', 'retroflex', 'stop', minus, minus, plus, minus),
        ('ɖ', 'retroflex', 'stop', plus, minus, plus, minus),
        ('c', 'palatal', 'stop', minus, minus, minus, minus),
        ('ɟ', 'palatal', 'stop', plus, minus, minus, minus),
        ('k', 'velar', 'stop', minus, minus, minus, minus),
        ('g', 'velar', 'stop', plus, minus, minus, minus),
        ('q', 'uvular', 'stop', minus, minus, minus, minus),
        ('ɢ', 'uvular', 'stop', plus, minus, minus, minus),
        ('ʔ', 'glottal', 'stop', minus, minus, minus, minus),
        ('m', 'bilabial', 'stop', plus, plus, minus, minus),
        ('ɱ', 'labiodental', 'stop', plus, plus, minus, minus),
        ('n', 'alveolar', 'stop', plus, plus, minus, minus),
        ('ɳ', 'retroflex', 'stop', plus, plus, plus, minus),
        ('ɲ', 'palatal', 'stop', plus, plus, minus, minus),
        ('ŋ', 'velar', 'stop', plus, plus, minus, minus),
        ('ɴ', 'uvular', 'stop', plus, plus, minus, minus),
        ('N', 'uvular', 'stop', plus, plus, minus, minus),
        ('ʙ', 'bilabial', 'trill', plus, minus, minus, minus),
        ('B', 'bilabial', 'trill', plus, minus, minus, minus),
        ('r', 'alveolar', 'trill', plus, minus, plus, minus),
        ('ʀ', 'uvular', 'trill', plus, minus, minus, minus),
        ('R', 'uvular', 'trill', plus, minus, minus, minus),
        ('ɾ', 'alveolar', 'tap', plus, minus, minus, minus),
        ('ɽ', 'retroflex', 'tap', plus, minus, plus, minus),
        ('ɸ', 'bilabial', 'fricative', minus, minus, minus, minus),
        ('β', 'bilabial', 'fricative', plus, minus, minus, minus),
        ('f', 'labiodental', 'fricative', minus, minus, minus, minus),
        ('v', 'labiodental', 'fricative', plus, minus, minus, minus),
        ('θ', 'dental', 'fricative', minus, minus, minus, minus),
        ('ð', 'dental', 'fricative', plus, minus, minus, minus),
        ('s', 'alveolar', 'fricative', minus, minus, minus, minus),
        ('z', 'alveolar', 'fricative', plus, minus, minus, minus),
        ('ʃ', 'palato-alveolar', 'fricative', minus, minus, minus, minus),
        ('ʒ', 'palato-alveolar', 'fricative', plus, minus, minus, minus),
        ('ʂ', 'retroflex', 'fricative', minus, minus, plus, minus),
        ('ʐ', 'retroflex', 'fricative', plus, minus, plus, minus),
        ('ç', 'palatal', 'fricative', minus, minus, minus, minus),
        ('ʝ', 'palatal', 'fricative', plus, minus, minus, minus),
        ('x', 'velar', 'fricative', minus, minus, minus, minus),
        ('ɣ', 'velar', 'fricative', plus, minus, minus, minus),
        ('χ', 'uvular', 'fricative', minus, minus, minus, minus),
        ('ʁ', 'uvular', 'fricative', plus, minus, minus, minus),
        ('ħ', 'pharyngeal', 'fricative', minus, minus, minus, minus),
        ('ʕ', 'pharyngeal', 'fricative', plus, minus, minus, minus),
        ('h', 'glottal', 'fricative', minus, minus, minus, minus),
        ('ɦ', 'glottal', 'fricative', plus, minus, minus, minus),
        ('ɬ', 'alveolar', 'fricative', minus, minus, minus, plus),
        ('ɮ', 'alveolar', 'fricative', plus, minus, minus, plus),
        ('ʋ', 'labiodental', 'approximant', plus, minus, minus, minus),
        ('ɹ', 'alveolar', 'approximant', plus, minus, minus, minus),
        ('ɻ', 'retroflex', 'approximant', plus, minus, plus, minus),
        ('j', 'palatal', 'approximant', plus, minus, minus, minus),
        ('ɰ', 'velar', 'approximant', plus, minus, minus, minus),
        ('l', 'alveolar', 'approximant', plus, minus, minus, plus),
        ('w', 'labiovelar', 'approximant', plus, minus, minus, minus),
    ]

    # (symbol, high, back, round, long)
    vowel_rows = [
        ('i', 'high', 'front', minus, minus),
        ('y', 'high', 'front', plus, minus),
        ('e', 'mid', 'front', minus, minus),
        ('E', 'mid', 'front', minus, plus),
        ('ø', 'mid', 'front', plus, minus),
        ('ɛ', 'mid', 'front', minus, minus),
        ('œ', 'mid', 'front', plus, minus),
        ('æ', 'low', 'front', minus, minus),
        ('a', 'low', 'front', minus, minus),
        ('A', 'low', 'front', minus, plus),
        ('ɨ', 'high', 'central', minus, minus),
        ('ʉ', 'high', 'central', plus, minus),
        ('ə', 'mid', 'central', minus, minus),
        ('u', 'high', 'back', plus, minus),
        ('U', 'high', 'back', plus, plus),
        ('o', 'mid', 'back', plus, minus),
        ('O', 'mid', 'back', plus, plus),
        ('ɔ', 'mid', 'back', plus, minus),
        ('ɒ', 'low', 'back', minus, minus),
        ('I', 'high', 'front', minus, plus),
    ]

    matrix = {}

    # Consonants
    for symbol, place, manner, voice, nasal, retroflex, lateral in consonant_rows:
        matrix[symbol] = {
            'place': place, 'manner': manner, 'syllabic': minus,
            'voice': voice, 'nasal': nasal, 'retroflex': retroflex,
            'lateral': lateral, 'aspirated': minus,
        }

    # Vowels
    for symbol, high, back, rounded, long_ in vowel_rows:
        matrix[symbol] = {
            'place': 'vowel', 'manner': 'vowel2', 'syllabic': plus,
            'voice': plus, 'nasal': minus, 'retroflex': minus,
            'lateral': minus, 'high': high, 'back': back,
            'round': rounded, 'long': long_, 'aspirated': minus,
        }

    return matrix


feature_matrix = _build_feature_matrix()
|
|||
|
|
|||
|
# === Algorithm ===
|
|||
|
|
|||
|
def align(str1, str2, epsilon=0):
    """
    Compute the alignment of two phonetic strings.

    A score matrix S is filled by dynamic programming, where S[i, j] is the
    best score of an alignment ending at str1[i-1] / str2[j-1].  Every cell
    whose score reaches the threshold (1 - epsilon) * max(S) is then used as
    the end point of one retrieved alignment.

    :type str1, str2: str
    :param str1, str2: Two strings to be aligned
    :type epsilon: float (0.0 to 1.0)
    :param epsilon: Adjusts threshold similarity score for near-optimal alignments

    :rtype: list(list(tuple(str, str)))
    :return: Alignment(s) of str1 and str2; empty if either string is empty
    :raises ImportError: if numpy could not be imported
    :raises AssertionError: if epsilon lies outside [0.0, 1.0]

    (Kondrak 2002: 51)
    """
    if np is None:
        raise ImportError('You need numpy in order to use the align function')

    assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
    m = len(str1)
    n = len(str2)
    # This includes Kondrak's initialization of row 0 and column 0 to all 0s.
    S = np.zeros((m+1, n+1), dtype=float)

    # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense,
    # and breaks array and string indices. Make sure they never get chosen
    # by setting them to -inf.
    for i in range(1, m+1):
        for j in range(1, n+1):
            edit1 = S[i-1, j] + sigma_skip(str1[i-1])    # skip str1[i-1]
            edit2 = S[i, j-1] + sigma_skip(str2[j-1])    # skip str2[j-1]
            edit3 = S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1])
            if i > 1:
                # two segments of str1 against one segment of str2
                edit4 = S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i])
            else:
                edit4 = -inf
            if j > 1:
                # one segment of str1 against two segments of str2
                edit5 = S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j])
            else:
                edit5 = -inf
            # Flooring at 0 lets an alignment start anywhere (local alignment).
            S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)

    T = (1-epsilon)*np.amax(S)  # Threshold score for near-optimal alignments

    alignments = []
    for i in range(1, m+1):
        for j in range(1, n+1):
            if S[i, j] >= T:
                alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
    return alignments
|
|||
|
|
|||
|
def _retrieve(i, j, s, S, T, str1, str2, out):
|
|||
|
"""
|
|||
|
Retrieve the path through the similarity matrix S starting at (i, j).
|
|||
|
|
|||
|
:rtype: list(tuple(str, str))
|
|||
|
:return: Alignment of str1 and str2
|
|||
|
"""
|
|||
|
if S[i, j] == 0:
|
|||
|
return out
|
|||
|
else:
|
|||
|
if j > 1 and S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j]) + s >= T:
|
|||
|
out.insert(0, (str1[i-1], str2[j-2:j]))
|
|||
|
_retrieve(i-1, j-2, s+sigma_exp(str1[i-1], str2[j-2:j]), S, T, str1, str2, out)
|
|||
|
elif i > 1 and S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i]) + s >= T:
|
|||
|
out.insert(0, (str1[i-2:i], str2[j-1]))
|
|||
|
_retrieve(i-2, j-1, s+sigma_exp(str2[j-1], str1[i-2:i]), S, T, str1, str2, out)
|
|||
|
elif S[i, j-1] + sigma_skip(str2[j-1]) + s >= T:
|
|||
|
out.insert(0, ('-', str2[j-1]))
|
|||
|
_retrieve(i, j-1, s+sigma_skip(str2[j-1]), S, T, str1, str2, out)
|
|||
|
elif S[i-1, j] + sigma_skip(str1[i-1]) + s >= T:
|
|||
|
out.insert(0, (str1[i-1], '-'))
|
|||
|
_retrieve(i-1, j, s+sigma_skip(str1[i-1]), S, T, str1, str2, out)
|
|||
|
elif S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1]) + s >= T:
|
|||
|
out.insert(0, (str1[i-1], str2[j-1]))
|
|||
|
_retrieve(i-1, j-1, s+sigma_sub(str1[i-1], str2[j-1]), S, T, str1, str2, out)
|
|||
|
return out
|
|||
|
|
|||
|
def sigma_skip(p):
    """
    Return the score of inserting or deleting the segment P.

    The score is the module constant ``C_skip``; P itself is not inspected.

    (Kondrak 2002: 54)
    """
    indel_score = C_skip
    return indel_score
|
|||
|
|
|||
|
def sigma_sub(p, q):
    """
    Return the score of substituting segment P with segment Q.

    Starts from ``C_sub`` and subtracts the weighted feature distance
    between the two segments, then the vowel weight of each segment.

    (Kondrak 2002: 54)
    """
    score = C_sub
    score = score - delta(p, q)
    score = score - V(p)
    score = score - V(q)
    return score
|
|||
|
|
|||
|
def sigma_exp(p, q):
    """
    Return the score of an expansion/compression: the single segment P
    aligned with the two-segment sequence Q.

    Starts from ``C_exp`` and subtracts the feature distance from P to each
    segment of Q, the vowel weight of P, and the larger of the two vowel
    weights found in Q.

    (Kondrak 2002: 54)
    """
    first, second = q[0], q[1]
    distance_to_first = delta(p, first)
    distance_to_second = delta(p, second)
    single_weight = V(p)
    pair_weight = max(V(first), V(second))
    return (C_exp - distance_to_first - distance_to_second
            - single_weight - pair_weight)
|
|||
|
|
|||
|
def delta(p, q):
    """
    Return the weighted sum of feature differences between P and Q.

    Only the features selected by ``R(p, q)`` take part; each difference is
    multiplied by that feature's entry in ``salience``.

    (Kondrak 2002: 54)
    """
    return sum(diff(p, q, name) * salience[name] for name in R(p, q))
|
|||
|
|
|||
|
def diff(p, q, f):
    """
    Return the absolute difference between segments P and Q on feature F.

    Each segment's value name for F is read from ``feature_matrix`` and
    turned into a number through ``similarity_matrix``.

    (Kondrak 2002: 52, 54)
    """
    features_of_p = feature_matrix[p]
    features_of_q = feature_matrix[q]
    value_p = similarity_matrix[features_of_p[f]]
    value_q = similarity_matrix[features_of_q[f]]
    return abs(value_p - value_q)
|
|||
|
|
|||
|
def R(p, q):
    """
    Return the list of features relevant for comparing segments P and Q.

    ``R_c`` is used as soon as either segment is listed in ``consonants``;
    ``R_v`` is used only when neither is.

    (Kondrak 2002: 54)
    """
    either_is_consonant = p in consonants or q in consonants
    return R_c if either_is_consonant else R_v
|
|||
|
|
|||
|
def V(p):
    """
    Return the vowel weight ``C_vwl`` if P is a vowel, otherwise 0.

    Any segment not listed in ``consonants`` counts as a vowel here.

    (Kondrak 2002: 54)
    """
    return 0 if p in consonants else C_vwl
|
|||
|
|
|||
|
# === Test ===
|
|||
|
|
|||
|
def demo():
    """
    Print the first alignment found for every cognate pair in
    ``cognate_data``, the phonetic sequences used in Kondrak's (2002)
    dissertation.

    Each output line has the form ``word1 ~ word2 : (a, b) (c, d) ...``.
    """
    for line in cognate_data.split('\n'):
        pair = line.split(',')
        first_alignment = align(pair[0], pair[1])[0]
        rendered = ' '.join(
            '({}, {})'.format(a[0], a[1]) for a in first_alignment
        )
        print('{} ~ {} : {}'.format(pair[0], pair[1], rendered))
|
|||
|
|
|||
|
# Word pairs consumed by demo(): one "word1,word2" pair per line, with the
# lines separated by a single newline and no trailing newline.
cognate_data = "\n".join((
    "jo,ʒə", "tu,ty", "nosotros,nu", "kjen,ki", "ke,kwa",
    "todos,tu", "una,ən", "dos,dø", "tres,trwa", "ombre,om",
    "arbol,arbrə", "pluma,plym", "kabeθa,kap", "boka,buʃ", "pje,pje",
    "koraθon,kœr", "ber,vwar", "benir,vənir", "deθir,dir", "pobre,povrə",
    "ðis,dIzes", "ðæt,das", "wat,vas", "nat,nixt", "loŋ,laŋ",
    "mæn,man", "fleʃ,flajʃ", "bləd,blyt", "feðər,fEdər", "hær,hAr",
    "ir,Or", "aj,awgə", "nowz,nAzə", "mawθ,munt", "təŋ,tsuŋə",
    "fut,fys", "nij,knI", "hænd,hant", "hart,herts", "livər,lEbər",
    "ænd,ante", "æt,ad", "blow,flAre", "ir,awris", "ijt,edere",
    "fiʃ,piʃkis", "flow,fluere", "staɾ,stella", "ful,plenus", "græs,gramen",
    "hart,kordis", "horn,korny", "aj,ego", "nij,genU", "məðər,mAter",
    "mawntən,mons", "nejm,nomen", "njuw,nowus", "wən,unus", "rawnd,rotundus",
    "sow,suere", "sit,sedere", "θrij,tres", "tuwθ,dentis", "θin,tenwis",
    "kinwawa,kenuaʔ", "nina,nenah", "napewa,napɛw", "wapimini,wapemen",
    "namesa,namɛʔs", "okimawa,okemaw", "ʃiʃipa,seʔsep", "ahkohkwa,ahkɛh",
    "pematesiweni,pematesewen", "asenja,aʔsɛn",
))

if __name__ == '__main__':
    demo()
|