generate-dict

#!/usr/bin/python
import os, sys
import subprocess

from voxutils.dictionaries import lookup_words

# Directory holding the phone-map files read by load_conversion().
# It is a relative path, so it is resolved against the current working
# directory at the time the file is opened.
PHONE_MAP_DIR = "phone-maps"
# Directory name for dictionaries; not referenced by the code in this part
# of the file.
DICT_DIR = "dict"

def vocab_gen(fn):
    """Yield every whitespace-separated token of the file *fn*, upper-cased.

    Tokens are produced in file order and duplicates are not removed.
    The file is opened in a ``with`` block so that it is closed even when
    the generator is abandoned before exhaustion or an error interrupts
    the iteration.
    """
    with open(fn) as f:
        for line in f:
            for word in line.split():
                yield word.upper()

def load_conversion(conversion):
    """Load the phone map *conversion* from PHONE_MAP_DIR as a character trie.

    File format, one entry per line:

    * blank lines and lines starting with "###" are skipped;
    * otherwise the first whitespace-separated field is the native token
      and the rest of the line (if any) is its replacement; a token with
      no replacement maps to the empty string;
    * the token "+SPACE+" stands for a single space character.

    When a token occurs more than once, the last occurrence wins.

    Returns nested dicts keyed by successive characters of the UTF-8
    decoded token.  The replacement for a token is stored under the ''
    key of the node reached by walking the token's characters.
    """
    tree = {}
    # The with-block guarantees the map file is closed; the original left
    # the handle open.
    with open(os.path.join(PHONE_MAP_DIR, conversion)) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("###"):
                continue
            fields = line.split(None, 1)
            native = fields[0]
            arpa = fields[1] if len(fields) > 1 else ''
            if native == '+SPACE+':
                native = ' '
            # Insert straight into the trie: re-assigning the '' slot gives
            # the same "last entry wins" result as collecting into a dict.
            current = tree
            for c in native.decode("utf-8"):
                current = current.setdefault(c, {})
            current[''] = arpa
    return tree

def raw_espeak(word):
    """Run espeak on *word* and return its stripped standard output.

    *word* is encoded to UTF-8 before being passed as the final command
    line argument.  subprocess.check_output raises CalledProcessError if
    espeak exits with a non-zero status.
    """
    # The flags request a silent run that prints IPA (see the espeak
    # manual); "-qx" with the unencoded word was the earlier variant.
    argv = ["espeak", "-q", "--ipa", word.encode('utf-8')]
    output = subprocess.check_output(argv)
    return output.strip()


class _MissingPhoneme(Exception):
    """Raised when espeak output contains a sequence absent from the phone map."""


def dict_espeak(vocab):
    """Print a "WORD PHONES" line for every word in *vocab*, using espeak.

    *vocab* is an iterable of UTF-8 encoded byte strings.  Each word is
    transcribed with raw_espeak() and the transcription is converted
    through the "ipa-to-cmudict" trie from load_conversion() by greedily
    consuming the longest run of characters the trie accepts.

    Words starting with "++" are not sent to espeak; their pronunciation
    is the word minus its first and last character ('++UM++' -> '+UM+').

    Dictionary lines are written to stdout as UTF-8.  A word whose
    transcription cannot be converted is reported on stderr (error, word,
    transcription, phones collected so far) and produces no stdout line.
    """
    convertor = load_conversion("ipa-to-cmudict")
    for word in vocab:
        word = word.decode("utf-8")
        if word[:2] == '++':  # '++UM++' -> '+UM+', etc special cases
            print (("%s %s" % (word, word[1:-1])).encode('utf-8'))
            continue

        pron = raw_espeak(word).decode("utf-8")
        arpa = []
        pos = 0
        try:
            while pos < len(pron):
                # Walk the trie from the root for as long as the next
                # character has a child node.
                node = convertor
                while pos < len(pron) and pron[pos] in node:
                    node = node[pron[pos]]
                    pos += 1
                # The '' slot holds the replacement; its absence means the
                # characters consumed so far are not a complete token.
                if node.get('') is None:
                    raise _MissingPhoneme("missing phoneme at %s" % pos)
                arpa.append(node[''])
            # Formatting the unicode word together with the byte-string
            # phones can raise UnicodeDecodeError, hence the handler below.
            print (("%s %s" % (word, ' '.join(arpa))).encode('utf-8'))
        except (UnicodeDecodeError, _MissingPhoneme) as e:
            # Encode explicitly: printing unicode IPA to a stderr without a
            # UTF-8 encoding (e.g. redirected) would raise
            # UnicodeEncodeError from inside this handler.
            report = [str(e), word.encode('utf-8'), pron.encode('utf-8'),
                      repr(arpa)]
            print >> sys.stderr, ' '.join(report)


def lut_dict_factory(dictname):
    """Return a dictionary generator backed by the lookup table *dictname*.

    The returned callable takes a vocabulary and hands it, together with
    *dictname*, to lookup_words().  Every returned line is printed
    upper-cased on stdout, then every missing entry is printed on stderr.
    """
    def emit_from_lookup(vocab):
        found, not_found = lookup_words(vocab, dictname)
        for entry in found:
            print (entry.upper())
        for entry in not_found:
            print >> sys.stderr, entry
    return emit_from_lookup

# Lookup-table generator for the 'cmudict' dictionary.  The "dict_" prefix
# lets main() find it by name when it is called with dictname "cmu".
dict_cmu = lut_dict_factory('cmudict')

def main(dictname, vocab_files):
    """Collect the vocabulary of *vocab_files* and generate its dictionary.

    All tokens produced by vocab_gen() for each file are gathered into one
    set.  The generator is the module-level callable named
    "dict_<dictname>" when one exists; otherwise a lookup-table generator
    built by lut_dict_factory(dictname) is used.
    """
    words = set()
    for path in vocab_files:
        for token in vocab_gen(path):
            words.add(token)

    fallback = lut_dict_factory(dictname)
    generator = globals().get("dict_" + dictname, fallback)
    generator(words)


# Run only when executed as a script, so the module can be imported
# without triggering dictionary generation.
if __name__ == "__main__":
    # The dictionary name is mandatory; vocabulary files may follow.
    if len(sys.argv) < 2:
        sys.exit("usage: %s DICTNAME [VOCAB_FILE ...]" % sys.argv[0])
    main(sys.argv[1], sys.argv[2:])