# -*- coding: utf-8 -*-
# Natural Language Toolkit: ALINE
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Greg Kondrak
#         Geoff Bacon (Python port)
# URL:
# For license information, see LICENSE.TXT

"""
ALINE
http://webdocs.cs.ualberta.ca/~kondrak/
Copyright 2002 by Grzegorz Kondrak.

ALINE is an algorithm for aligning phonetic sequences, described in [1].
This module is a port of Kondrak's (2002) ALINE. It provides functions for
phonetic sequence alignment and similarity analysis. These are useful in
historical linguistics, sociolinguistics and synchronic phonology.

ALINE has parameters that can be tuned for desired output. These parameters
are:
- C_skip, C_sub, C_exp, C_vwl
- Salience weights
- Segmental features

In this implementation, some parameters have been changed from their default
values as described in [1], in order to replicate published results. All
changes are noted in comments.

Example usage
-------------

# Get optimal alignment of two phonetic sequences

>>> align('θin', 'tenwis') # doctest: +SKIP
[[('θ', 't'), ('i', 'e'), ('n', 'n')]]

[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
University of Toronto.
"""

from __future__ import unicode_literals

try:
    import numpy as np
except ImportError:
    # numpy is optional at import time; align() raises if it is missing.
    np = None

# === Constants ===

inf = float('inf')

# Default values for maximum similarity scores (Kondrak 2002: 54)
# C_skip must be negative: an indel is a penalty. A positive value rewards
# skipping, so every cell of the score matrix becomes positive and local
# alignments always swallow trailing unmatched segments.
C_skip = -10  # Indels
C_sub = 35    # Substitutions
C_exp = 45    # Expansions/compressions
C_vwl = 5     # Vowel/consonant relative weight (decreased from 10)

# Segments treated as consonants; anything else is weighted as a vowel by V().
consonants = ['B', 'N', 'R', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
              'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z', 'ç', 'ð', 'ħ', 'ŋ',
              'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɸ',
              'ɹ', 'ɻ', 'ɽ', 'ɾ', 'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʐ', 'ʒ', 'ʔ',
              'ʕ', 'ʙ', 'ʝ', 'β', 'θ', 'χ', 'w']

# Relevant features for comparing consonants and vowels
R_c = ['aspirated', 'lateral', 'manner', 'nasal', 'place', 'retroflex',
       'syllabic', 'voice']
# 'high' taken out of R_v because same as manner
R_v = ['back', 'lateral', 'long', 'manner', 'nasal', 'place',
       'retroflex', 'round', 'syllabic', 'voice']

# Flattened feature matrix (Kondrak 2002: 56)
similarity_matrix = {
    # place
    'bilabial': 1.0, 'labiodental': 0.95, 'dental': 0.9,
    'alveolar': 0.85, 'retroflex': 0.8, 'palato-alveolar': 0.75,
    'palatal': 0.7, 'velar': 0.6, 'uvular': 0.5, 'pharyngeal': 0.3,
    'glottal': 0.1, 'labiovelar': 1.0, 'vowel': -1.0,  # added 'vowel'
    # manner
    'stop': 1.0, 'affricate': 0.9,
    'fricative': 0.85,  # increased fricative from 0.8
    'trill': 0.7, 'tap': 0.65, 'approximant': 0.6, 'high vowel': 0.4,
    'mid vowel': 0.2, 'low vowel': 0.0, 'vowel2': 0.5,  # added vowel
    # high
    'high': 1.0, 'mid': 0.5, 'low': 0.0,
    # back
    'front': 1.0, 'central': 0.5, 'back': 0.0,
    # binary features
    'plus': 1.0, 'minus': 0.0,
}

# Relative weights of phonetic features (Kondrak 2002: 55)
salience = {
    'syllabic': 5,
    'place': 40,
    'manner': 50,
    'voice': 5,  # decreased from 10
    'nasal': 20,  # increased from 10
    'retroflex': 10,
    'lateral': 10,
    'aspirated': 5,
    'long': 0,  # decreased from 1
    'high': 3,  # decreased from 5
    'back': 2,  # decreased from 5
    'round': 2,  # decreased from 5
}

# (Kondrak 2002: 59-60)
feature_matrix = {
    # Consonants
    'p': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'b': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    't': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'd': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʈ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɖ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'c': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɟ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'k': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'g': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'q': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɢ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʔ': {'place': 'glottal', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'm': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɱ': {'place': 'labiodental', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'n': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɳ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɲ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ŋ': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɴ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'N': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'plus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʙ': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'B': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'r': {'place': 'alveolar', 'manner': 'trill', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʀ': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'R': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɾ': {'place': 'alveolar', 'manner': 'tap', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɽ': {'place': 'retroflex', 'manner': 'tap', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɸ': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'β': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'f': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'v': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'θ': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ð': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    's': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'z': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʃ': {'place': 'palato-alveolar', 'manner': 'fricative',
          'syllabic': 'minus', 'voice': 'minus', 'nasal': 'minus',
          'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
    'ʒ': {'place': 'palato-alveolar', 'manner': 'fricative',
          'syllabic': 'minus', 'voice': 'plus', 'nasal': 'minus',
          'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
    'ʂ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʐ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ç': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʝ': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'x': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɣ': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'χ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʁ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ħ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ʕ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'h': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɦ': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɬ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'minus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'plus', 'aspirated': 'minus'},
    'ɮ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'plus', 'aspirated': 'minus'},
    'ʋ': {'place': 'labiodental', 'manner': 'approximant',
          'syllabic': 'minus', 'voice': 'plus', 'nasal': 'minus',
          'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
    'ɹ': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɻ': {'place': 'retroflex', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'plus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'j': {'place': 'palatal', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'ɰ': {'place': 'velar', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},
    'l': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'plus', 'aspirated': 'minus'},
    'w': {'place': 'labiovelar', 'manner': 'approximant', 'syllabic': 'minus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'aspirated': 'minus'},

    # Vowels
    'i': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'front',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'y': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'front',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'e': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'front',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'E': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'front',
          'round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
    'ø': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'front',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'ɛ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'front',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'œ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'front',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'æ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'low', 'back': 'front',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'a': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'low', 'back': 'front',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'A': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'low', 'back': 'front',
          'round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
    'ɨ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'central',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'ʉ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'central',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'ə': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'central',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'u': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'back',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'U': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'back',
          'round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
    'o': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'back',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'O': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'back',
          'round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
    'ɔ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'mid', 'back': 'back',
          'round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
    'ɒ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'low', 'back': 'back',
          'round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
    'I': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus',
          'voice': 'plus', 'nasal': 'minus', 'retroflex': 'minus',
          'lateral': 'minus', 'high': 'high', 'back': 'front',
          'round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
}

# === Algorithm ===


def align(str1, str2, epsilon=0):
    """
    Compute the alignment of two phonetic strings.

    :type str1, str2: str
    :param str1, str2: Two strings to be aligned
    :type epsilon: float (0.0 to 1.0)
    :param epsilon: Adjusts threshold similarity score for near-optimal
        alignments
    :rtype: list(list(tuple(str, str)))
    :return: Alignment(s) of str1 and str2
    :raise ImportError: if numpy is not installed
    :raise KeyError: if a segment is not a key of ``feature_matrix``

    (Kondrak 2002: 51)
    """
    if np is None:
        raise ImportError('You need numpy in order to use the align function')

    assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."

    m = len(str1)
    n = len(str2)
    # This includes Kondrak's initialization of row 0 and column 0 to all 0s.
    S = np.zeros((m+1, n+1), dtype=float)

    # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense,
    # and breaks array and string indices. Make sure they never get chosen
    # by setting them to -inf.
    for i in range(1, m+1):
        for j in range(1, n+1):
            edit1 = S[i-1, j] + sigma_skip(str1[i-1])
            edit2 = S[i, j-1] + sigma_skip(str2[j-1])
            edit3 = S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1])
            if i > 1:
                edit4 = S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i])
            else:
                edit4 = -inf
            if j > 1:
                edit5 = S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j])
            else:
                edit5 = -inf
            # The trailing 0 makes this a local alignment: a cell never
            # scores below an empty alignment.
            S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)

    T = (1-epsilon)*np.amax(S)  # Threshold score for near-optimal alignments

    # Trace back from every cell whose score reaches the threshold.
    alignments = []
    for i in range(1, m+1):
        for j in range(1, n+1):
            if S[i, j] >= T:
                alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
    return alignments


def _retrieve(i, j, s, S, T, str1, str2, out):
    """
    Retrieve the path through the similarity matrix S starting at (i, j).

    ``s`` is the score accumulated so far on the way back from the starting
    cell and ``T`` is the threshold the complete path has to reach. Aligned
    pairs are prepended to ``out``, which is modified in place and returned.
    Moves are tried in a fixed order: expansion, compression, the two indels,
    then substitution; the first one that can still reach ``T`` is taken.

    :rtype: list(tuple(str, str))
    :return: Alignment of str1 and str2
    """
    # A zero cell marks the start of the local alignment.
    if S[i, j] == 0:
        return out

    # Expansion: one segment of str1 against two segments of str2.
    if j > 1:
        score = sigma_exp(str1[i-1], str2[j-2:j])
        if S[i-1, j-2] + score + s >= T:
            out.insert(0, (str1[i-1], str2[j-2:j]))
            _retrieve(i-1, j-2, s+score, S, T, str1, str2, out)
            return out

    # Compression: two segments of str1 against one segment of str2.
    if i > 1:
        score = sigma_exp(str2[j-1], str1[i-2:i])
        if S[i-2, j-1] + score + s >= T:
            out.insert(0, (str1[i-2:i], str2[j-1]))
            _retrieve(i-2, j-1, s+score, S, T, str1, str2, out)
            return out

    # Indel: skip a segment of str2.
    score = sigma_skip(str2[j-1])
    if S[i, j-1] + score + s >= T:
        out.insert(0, ('-', str2[j-1]))
        _retrieve(i, j-1, s+score, S, T, str1, str2, out)
        return out

    # Indel: skip a segment of str1.
    score = sigma_skip(str1[i-1])
    if S[i-1, j] + score + s >= T:
        out.insert(0, (str1[i-1], '-'))
        _retrieve(i-1, j, s+score, S, T, str1, str2, out)
        return out

    # Substitution of one segment for another.
    score = sigma_sub(str1[i-1], str2[j-1])
    if S[i-1, j-1] + score + s >= T:
        out.insert(0, (str1[i-1], str2[j-1]))
        _retrieve(i-1, j-1, s+score, S, T, str1, str2, out)
    return out


def sigma_skip(p):
    """
    Returns score of an indel of P.

    The score is the constant ``C_skip``; ``p`` is not inspected.

    (Kondrak 2002: 54)
    """
    return C_skip


def sigma_sub(p, q):
    """
    Returns score of a substitution of P with Q.

    (Kondrak 2002: 54)
    """
    return C_sub - delta(p, q) - V(p) - V(q)


def sigma_exp(p, q):
    """
    Returns score of an expansion/compression.

    ``p`` is a single segment and ``q`` a sequence of two segments.

    (Kondrak 2002: 54)
    """
    q1 = q[0]
    q2 = q[1]
    return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))


def delta(p, q):
    """
    Return weighted sum of difference between P and Q.

    Each relevant feature's difference is weighted by its ``salience``.

    (Kondrak 2002: 54)
    """
    features = R(p, q)
    total = 0
    for f in features:
        total += diff(p, q, f) * salience[f]
    return total


def diff(p, q, f):
    """
    Returns difference between phonetic segments P and Q for feature F.

    :raise KeyError: if P or Q is not a key of ``feature_matrix``

    (Kondrak 2002: 52, 54)
    """
    p_features, q_features = feature_matrix[p], feature_matrix[q]
    return abs(similarity_matrix[p_features[f]] -
               similarity_matrix[q_features[f]])


def R(p, q):
    """
    Return relevant features for segment comparison.

    The consonant feature set is used as soon as either segment is a
    consonant; the vowel feature set only when both are vowels.

    (Kondrak 2002: 54)
    """
    if p in consonants or q in consonants:
        return R_c
    return R_v


def V(p):
    """
    Return vowel weight if P is vowel.

    Returns 0 for any segment listed in ``consonants`` and ``C_vwl``
    otherwise.

    (Kondrak 2002: 54)
    """
    if p in consonants:
        return 0
    return C_vwl

# === Test ===


def demo():
    """
    A demonstration of the result of aligning phonetic sequences
    used in Kondrak's (2002) dissertation.
    """
    data = [pair.split(',') for pair in cognate_data.split('\n')]
    for pair in data:
        # Only the first alignment returned for each pair is printed.
        alignment = align(pair[0], pair[1])[0]
        alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
        alignment = ' '.join(alignment)
        print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))


# One comma-separated word pair per line, as consumed by demo().
cognate_data = """jo,ʒə
tu,ty
nosotros,nu
kjen,ki
ke,kwa
todos,tu
una,ən
dos,dø
tres,trwa
ombre,om
arbol,arbrə
pluma,plym
kabeθa,kap
boka,buʃ
pje,pje
koraθon,kœr
ber,vwar
benir,vənir
deθir,dir
pobre,povrə
ðis,dIzes
ðæt,das
wat,vas
nat,nixt
loŋ,laŋ
mæn,man
fleʃ,flajʃ
bləd,blyt
feðər,fEdər
hær,hAr
ir,Or
aj,awgə
nowz,nAzə
mawθ,munt
təŋ,tsuŋə
fut,fys
nij,knI
hænd,hant
hart,herts
livər,lEbər
ænd,ante
æt,ad
blow,flAre
ir,awris
ijt,edere
fiʃ,piʃkis
flow,fluere
staɾ,stella
ful,plenus
græs,gramen
hart,kordis
horn,korny
aj,ego
nij,genU
məðər,mAter
mawntən,mons
nejm,nomen
njuw,nowus
wən,unus
rawnd,rotundus
sow,suere
sit,sedere
θrij,tres
tuwθ,dentis
θin,tenwis
kinwawa,kenuaʔ
nina,nenah
napewa,napɛw
wapimini,wapemen
namesa,namɛʔs
okimawa,okemaw
ʃiʃipa,seʔsep
ahkohkwa,ahkɛh
pematesiweni,pematesewen
asenja,aʔsɛn"""

if __name__ == '__main__':
    demo()