Jam-Teacher/Content/model/ru/ru/dictionary.py

116 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Converts an accented (stress-marked with "+") vocabulary to a pronunciation dictionary, for example:
#
# абстракцион+истов
# абстр+акцию
# абстр+акция
#
# абстракционистов a0 b s t r a0 k c i0 o0 nj i1 s t o0 v
# абстракцию a0 b s t r a1 k c i0 j u0
# абстракция a0 b s t r a1 k c i0 j a0
#
import sys
# Letters that pallatize() looks for right after a consonant: when one of them
# follows, the consonant gets the "j"-suffixed phone name.
softletters = set(u"яёюиье")

# Symbols after which convert_vowels() emits a separate "j" phone in front of
# я, ю, е, ё ("#" is the boundary marker that convert() wraps around the word).
startsyl = set(u"#ъьаяоёуюэеиы-")

# Symbols that convert() drops from the final phone sequence.
others = set(u"#+-ьъ")

# Consonants with two phone variants: the plain name below, or the same name
# with "j" appended when a letter from softletters follows.
softhard_cons = {
    u"б": u"b", u"в": u"v", u"г": u"g", u"Г": u"g",
    u"д": u"d", u"з": u"z", u"к": u"k", u"л": u"l",
    u"м": u"m", u"н": u"n", u"п": u"p", u"р": u"r",
    u"с": u"s", u"т": u"t", u"ф": u"f", u"х": u"h",
}

# Consonants that always map to one fixed phone name, whatever follows them.
other_cons = {
    u"ж": u"zh", u"ц": u"c", u"ч": u"ch",
    u"ш": u"sh", u"щ": u"sch", u"й": u"j",
}

# Vowel letter -> base phone name; convert_vowels() appends the stress digit.
vowels = {
    u"а": u"a", u"я": u"a",
    u"у": u"u", u"ю": u"u",
    u"о": u"o", u"ё": u"o",
    u"э": u"e", u"е": u"e",
    u"и": u"i", u"ы": u"y",
}
def pallatize(phones):
    """Rewrite consonant letters in *phones* as phone names, in place.

    phones is a list of (symbol, stress) tuples. Every entry except the last
    one is examined:

    * a symbol found in softhard_cons is replaced by its phone name, with "j"
      appended when the symbol of the next entry is in softletters;
    * a symbol found in other_cons is replaced by its fixed phone name.

    Rewritten entries always carry stress 0; all other entries are left as
    they are. The last entry is never rewritten. Returns None.
    """
    # Both slices are copies taken before any rewriting, so the look-ahead
    # always sees the original letter of the next entry.
    neighbours = zip(phones[:-1], phones[1:])
    for pos, (entry, following) in enumerate(neighbours):
        symbol = entry[0]
        if symbol in other_cons:
            phones[pos] = (other_cons[symbol], 0)
        elif symbol in softhard_cons:
            softening = "j" if following[0] in softletters else ""
            phones[pos] = (softhard_cons[symbol] + softening, 0)
def convert_vowels(phones):
    """Turn a list of (symbol, stress) tuples into a flat list of phone strings.

    A symbol found in vowels becomes its base phone name followed by the
    stress value of the entry (for example "a1"); any other symbol is copied
    through unchanged. In front of я, ю, е or ё an extra "j" phone is emitted
    when the preceding symbol is in startsyl.

    Returns the new list; *phones* itself is not modified.
    """
    iotated = set(u"яюеё")
    result = []
    previous = ""
    for entry in phones:
        symbol = entry[0]
        if symbol in iotated and previous in startsyl:
            result.append("j")
        if symbol in vowels:
            result.append(vowels[symbol] + str(entry[1]))
        else:
            result.append(symbol)
        previous = symbol
    return result
def convert(stressword):
    """Transcribe one stress-marked word into a space-separated phone string.

    stressword is the word with a "+" placed in front of the stressed symbol,
    for example "абстр+акция". It may be a UTF-8 encoded byte string or
    already-decoded text.

    Returns the phones joined by single spaces, as a UTF-8 encoded byte
    string.
    """
    # Decode only real byte strings. Calling .decode() on text that is already
    # decoded triggers an implicit ASCII encode on Python 2, which fails for
    # Cyrillic input.
    if isinstance(stressword, bytes):
        stressword = stressword.decode("utf-8")
    # "#" marks both word boundaries; it is filtered out again below.
    symbols = u"#" + stressword + u"#"

    # Attach a stress flag to every symbol: "+" itself is dropped and flags
    # the symbol that comes right after it.
    stress_phones = []
    stress = 0
    for symbol in symbols:
        if symbol == u"+":
            stress = 1
        else:
            stress_phones.append((symbol, stress))
            stress = 0

    # Replace consonant letters with phone names (modifies the list in place).
    pallatize(stress_phones)

    # Replace vowel letters with phone names carrying the stress digit.
    phones = convert_vowels(stress_phones)

    # Drop boundary markers and the signs that produce no phone of their own.
    phones = [x for x in phones if x not in others]

    return u" ".join(phones).encode("utf-8")
for line in open(sys.argv[1]):
stressword = line.strip()
print stressword.replace("+", ""), convert(stressword)