Skip to content

Commit

Permalink
Start migration of "map_fst"
Browse files Browse the repository at this point in the history
Added `map_fst` and necessary imports. Started test. Began
implementation of actual mapping rules.
  • Loading branch information
Kay-Michael Würzner committed Nov 5, 2018
1 parent 9ec9b11 commit 73b4ace
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 9 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
click
pytest
pytest-ordering
http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.0.0.tar.gz#egg=pynini
20 changes: 20 additions & 0 deletions tests/test_fsts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-

import sys
import os
import pytest
import pynini

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../timur')))

from timur import fsts
from timur import helpers

sample_symbols = ["<Base_Stems>", "<NN>", "<base>", "<frei>", "<NMasc_es_e>"]

def test_map_fst():
    '''
    Build the symbol table used by the mapping FSTs and verify that every
    sample symbol was actually registered.
    '''
    syms = helpers.load_alphabet(sample_symbols)
    # The original `assert(True)` was a placeholder that could never fail;
    # check membership of each sample symbol so load_alphabet is exercised.
    for symbol in sample_symbols:
        assert(syms.member(symbol))

if __name__ == '__main__':
    # Bug fix: the original called unittest.main(), but unittest is never
    # imported in this module and the suite is pytest-based. Run pytest on
    # this file instead when executed as a script.
    pytest.main([__file__])
7 changes: 5 additions & 2 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-

import sys, os, pytest

import sys
import os
import pytest
import pynini

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../timur')))
Expand All @@ -11,6 +12,7 @@
sample_symbols = ["<Base_Stems>", "<NN>", "<base>", "<frei>", "<NMasc_es_e>"]
sample_entries = ["<Base_Stems>Anüs<NN><base><frei><NMasc_es_e>"]

@pytest.mark.first
def test_load_alphabet():
'''
Load the sample symbol set and check for membership.
Expand All @@ -19,6 +21,7 @@ def test_load_alphabet():
assert(syms.member("<NN>"))
assert(syms.member("ü"))

@pytest.mark.second
def test_load_lexicon():
'''
Load the sample lexicon and check vor invariance.
Expand Down
23 changes: 23 additions & 0 deletions timur/data/syms.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,35 @@
<V>
<ORD>
<OTHER>
<PREF>
<KSF>
<CARD,DIGCARD,NE>
<ADJ,CARD>
<ADJ,NN>
<CARD,NN>
<CARD,NE>
<ABK,ADJ,NE,NN>
<ADJ,NE,NN>
<ABK,NE,NN>
<NE,NN>
<ABK,CARD,NN>
<ABK,NN>
<ADJ,CARD,NN,V>
<ADJ,NN,V>
<ABK,ADJ,NE,NN,V>
<ADJ,NE,NN,V>
<ADV,NE,NN,V>
<ABK,NE,NN,V>
<NE,NN,V>
<ABK,NN,V>
<NN,V>
# intermediate features
<QUANT>
<Initial>
<Base_Stems>
<Deriv_Stems>
<Kompos_Stems>
<Pref_Stems>
# stem type features
<base>
<deriv>
Expand Down
2 changes: 2 additions & 0 deletions timur/fsts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from .num_fst import num_fst
from .phon_fst import phon_fst
from .map_fst import map_fst_map1
from .map_fst import map_fst_map2
24 changes: 24 additions & 0 deletions timur/fsts/map_fst.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

import pynini

from timur.helpers import union
from timur.helpers import concat

def map_fst_map1(symbol_table):
  '''
  Modifications of lexical entries
  '''

  # lexical features to be omitted from the output
  plain_categories = [
      "<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>",
      "<PRO>", "<V>", "<ORD>", "<OTHER>", "<KSF>",
  ]
  cat = pynini.string_map(
      plain_categories,
      input_token_type=symbol_table,
      output_token_type=symbol_table)

  # categories extended with the multi-category feature symbols
  extended_categories = [
      "<ABK>", "<ADJ>", "<ADV>", "<CARD>", "<DIGCARD>", "<NE>", "<NN>",
      "<PRO>", "<V>", "<ORD>", "<OTHER>", "<CARD,DIGCARD,NE>",
      "<ADJ,CARD>", "<ADJ,NN>", "<CARD,NN>", "<CARD,NE>",
      "<ABK,ADJ,NE,NN>", "<ADJ,NE,NN>", "<ABK,NE,NN>", "<NE,NN>",
      "<ABK,CARD,NN>", "<ABK,NN>", "<ADJ,CARD,NN,V>", "<ADJ,NN,V>",
      "<ABK,ADJ,NE,NN,V>", "<ADJ,NE,NN,V>", "<ADV,NE,NN,V>",
      "<ABK,NE,NN,V>", "<NE,NN,V>", "<ABK,NN,V>", "<NN,V>",
  ]
  cat_ext = pynini.string_map(
      extended_categories,
      input_token_type=symbol_table,
      output_token_type=symbol_table)

  # NOTE(review): mapping rules not migrated yet — the acceptors above are
  # built but unused, and an empty FST is returned for now.
  return pynini.Fst()

def map_fst_map2(symbol_table):
  '''
  Modifications of lexical entries

  NOTE(review): stub — no mapping rules are implemented yet, so this
  implicitly returns None. Callers must not compose the result until the
  migration is finished.
  '''
12 changes: 9 additions & 3 deletions timur/helpers/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
def union(*args, token_type="utf8"):
args_mod = []
for arg in args:
if type(args) == "str":
if isinstance(arg, str):
args_mod.append(pynini.acceptor(arg, token_type=token_type))
else:
args_mod.append(arg)
Expand All @@ -13,8 +13,13 @@ def union(*args, token_type="utf8"):
def concat(*args, token_type="utf8"):
  '''
  Concatenate an arbitrary number of FSTs and/or strings into one FST.

  String arguments are compiled into acceptors using *token_type*; when
  *token_type* is a pynini.SymbolTable, it is also attached to the result
  as its input and output symbol table.
  '''
  # Fix: dropped the unused leftover local `args_mod = []`.
  conc = pynini.Fst()
  # Start from the single-state acceptor of the empty string — the neutral
  # element of concatenation — so an empty argument list yields epsilon.
  conc.set_start(conc.add_state())
  conc.set_final(conc.start())
  if isinstance(token_type, pynini.SymbolTable):
    conc.set_input_symbols(token_type)
    conc.set_output_symbols(token_type)
  for arg in args:
    if isinstance(arg, str):
      arg = pynini.acceptor(arg, token_type=token_type)
    conc = pynini.concat(conc, arg)
  return conc
Expand All @@ -30,7 +35,8 @@ def load_alphabet(source, auto_singletons=True):
if symbol.isprintable() and not symbol.isspace():
syms.add_symbol(symbol)
for symbol in source:
symbol = str(symbol)
if isinstance(symbol, bytes):
symbol = symbol.decode("utf-8")
if symbol.startswith('#'):
continue
syms.add_symbol(symbol.strip())
Expand Down
25 changes: 21 additions & 4 deletions timur/scripts/timur.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,25 @@ def build(lexicon):

lex = helpers.load_lexicon(lexicon, syms)

#phon = phon_fst(syms)
#phon.draw("test.dot")
num_stems = fsts.num_fst(syms)
# add repetitive prefixes
# TODO: move to fst function
print(syms.member("<Pref_Stems>"))
repeatable_prefs = helpers.concat(
"<Pref_Stems>",
helpers.union(
"u r <PREF>",
"v o r <PREF>",
token_type=syms
).closure(1),
"<ADJ,NN> <nativ>",
token_type=syms
)
lex = pynini.union(lex, repeatable_prefs)

map1 = fsts.map_fst_map1(syms)

lex = pynini.compose(map1, lex)
lex.draw("test.dot")

ANY = construct_any(syms)
#phon = phon_fst(syms)
#num_stems = fsts.num_fst(syms)

0 comments on commit 73b4ace

Please sign in to comment.