- różne poprawki w parsowaniu tagsetu

- praca nad parsowaniem reguł zlepiania segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@85 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

- różne poprawki w parsowaniu tagsetu
- praca nad parsowaniem reguł zlepiania segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@85 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent 28f11d57
Showing 18 changed files with 1231 additions and 125 deletions
fsabuilder/.settings/org.eclipse.core.resources.prefs
fsabuilder/.settings/org.eclipse.ltk.core.refactoring.prefs
fsabuilder/morfeuszbuilder/fsa/common.py
fsabuilder/morfeuszbuilder/fsa/test/testConstruction.py
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
fsabuilder/morfeuszbuilder/segrules/segrules.py → fsabuilder/morfeuszbuilder/segrules/rules.py
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
fsabuilder/morfeuszbuilder/segrules/segsfsa.py
fsabuilder/morfeuszbuilder/segrules/test.py
fsabuilder/morfeuszbuilder/segrules/test/__init__.py
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
fsabuilder/morfeuszbuilder/segrules/test/polimorf.tagset
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
fsabuilder/morfeuszbuilder/tagset/__init__.py
fsabuilder/morfeuszbuilder/tagset/segtypes.py
fsabuilder/morfeuszbuilder/tagset/tagset.py
fsabuilder/morfeuszbuilder/utils/configFile.py
 eclipse.preferences.version=1
+encoding//morfeuszbuilder/fsa/test/testConstruction.py=utf-8
 encoding/buildfsa.py=utf-8
+eclipse.preferences.version=1
+org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false
@@ -77,34 +77,3 @@ class Interpretation4Generator(object):
     def __repr__(self):
         return unicode(self)
-
-class Tagset(object):
-    
-    TAGS = 1
-    NAMES = 2
-    SEP = '\t'
-    
-    def __init__(self, filename, encoding='utf8'):
-        self.tag2tagnum = {}
-        self.name2namenum = {}
-        self._doInit(filename, encoding)
-#         print self.tag2tagnum
-#         print self.name2namenum
-    
-    def _doInit(self, filename, encoding):
-        addingTo = None
-        with codecs.open(filename, 'r', encoding) as f:
-            for line in f:
-                line = line.strip('\n')
-                if line == u'[TAGS]':
-                    addingTo = Tagset.TAGS
-                elif line == u'[NAMES]':
-                    addingTo = Tagset.NAMES
-                elif line and not line.startswith(u'#'):
-                    assert addingTo in [Tagset.TAGS, Tagset.NAMES]
-                    res = {Tagset.TAGS: self.tag2tagnum,
-                           Tagset.NAMES: self.name2namenum}[addingTo]
-                    tagNum = line.split(Tagset.SEP)[0]
-                    tag = line.split(Tagset.SEP)[1]
-                    assert tag not in res
-                    res[tag] = int(tagNum)
@@ -6,62 +6,62 @@ Created on Oct 8, 2013
 '''
 import unittest
 import os
-from fsa import fsa, visualizer, encode, buildfsa
-from fsa.serializer import SimpleSerializer
+from morfeuszbuilder.fsa import fsa, visualizer, encode
+from morfeuszbuilder.fsa.serializer import SimpleSerializer
 class Test(unittest.TestCase):
-
-    def testSimpleConstruction(self):
-        a = fsa.FSA(encode.SimpleEncoder())
-        input = sorted([
-                (u'bić', ''),
-                (u'bij', ''),
-                (u'biją', ''),
-                (u'bijcie', ''),
-                (u'bije', ''),
-                (u'bijecie', ''),
-                (u'bijemy', ''),
-                (u'bijesz', ''),
-                (u'biję', ''),
-                (u'bijmy', ''),
-                (u'bili', 'asd'),
-                (u'biliby', ''),
-                (u'bilibyście', ''),
-                (u'bilibyśmy', ''),
-                (u'biliście', 'asdfas'),
-                (u'biliśmy', ''),
-                (u'bił', 'wersadfas'),
-                (u'biła', 'asdfasd'),
-                (u'biłaby', 'asdfa'),
-                (u'biłabym', ''),
-                (u'biłabyś', 'asdfa'),
-                (u'biłam', 'dfas'),
-                (u'biłaś', 'asdfas'),
-                (u'biłby', ''),
-                (u'biłbym', 'asdfa'),
-                (u'biłbyś', ''),
-                (u'biłem', ''),
-                (u'biłeś', 'sadfa'),
-                (u'biły', ''),
-                (u'biłyby', ''),
-                (u'biłybyście', ''),
-                (u'biłybyśmy', ''),
-                (u'biłyście', ''),
-                (u'biłyśmy', ''),
-                ], key=lambda w: bytearray(w[0], 'utf8'))
-        a.feed(input)
-        for w, res in input:
-            recognized = a.tryToRecognize(w)
-            assert recognized == res
-        a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0))
-        visualizer.Visualizer().visualize(a)
-    
-    def testPolimorfConstruction(self):
-        inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab')
-        tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset')
-        fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile)
-        serializer = SimpleSerializer(fsa)
-        serializer.serialize2BinaryFile('/tmp/test0.fsa')
+    pass
+#     def testSimpleConstruction(self):
+#         a = fsa.FSA(encode.SimpleEncoder())
+#         input = sorted([
+#                 (u'bić', ''),
+#                 (u'bij', ''),
+#                 (u'biją', ''),
+#                 (u'bijcie', ''),
+#                 (u'bije', ''),
+#                 (u'bijecie', ''),
+#                 (u'bijemy', ''),
+#                 (u'bijesz', ''),
+#                 (u'biję', ''),
+#                 (u'bijmy', ''),
+#                 (u'bili', 'asd'),
+#                 (u'biliby', ''),
+#                 (u'bilibyście', ''),
+#                 (u'bilibyśmy', ''),
+#                 (u'biliście', 'asdfas'),
+#                 (u'biliśmy', ''),
+#                 (u'bił', 'wersadfas'),
+#                 (u'biła', 'asdfasd'),
+#                 (u'biłaby', 'asdfa'),
+#                 (u'biłabym', ''),
+#                 (u'biłabyś', 'asdfa'),
+#                 (u'biłam', 'dfas'),
+#                 (u'biłaś', 'asdfas'),
+#                 (u'biłby', ''),
+#                 (u'biłbym', 'asdfa'),
+#                 (u'biłbyś', ''),
+#                 (u'biłem', ''),
+#                 (u'biłeś', 'sadfa'),
+#                 (u'biły', ''),
+#                 (u'biłyby', ''),
+#                 (u'biłybyście', ''),
+#                 (u'biłybyśmy', ''),
+#                 (u'biłyście', ''),
+#                 (u'biłyśmy', ''),
+#                 ], key=lambda w: bytearray(w[0], 'utf8'))
+#         a.feed(input)
+#         for w, res in input:
+#             recognized = a.tryToRecognize(w)
+#             assert recognized == res
+#         a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0))
+#         visualizer.Visualizer().visualize(a)
+#     
+#     def testPolimorfConstruction(self):
+#         inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab')
+#         tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset')
+#         fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile)
+#         serializer = SimpleSerializer(fsa)
+#         serializer.serialize2BinaryFile('/tmp/test0.fsa')
 #         visualizer.Visualizer().visualize(fsa)
 if __name__ == "__main__":
@@ -7,6 +7,7 @@ import re
 from pyparsing import *
 identifier = Word(alphas, bodyChars=alphanums+'_')
+token = Word(alphas, bodyChars=alphanums+'_+>')
 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
@@ -64,7 +65,7 @@ def _processLine(line, defines):
         defineInstance = Forward()
         localId = identifier.copy()
-        rule << OneOrMore(localId ^ defineInstance ^ Word('*|+?'))
+        rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')))
         defineInstance << localId + Suppress('(') + rule + Suppress(')')
         rule.setParseAction(lambda s, l, t: ' '.join(t))
@@ -77,25 +78,25 @@ def _processLine(line, defines):
 def preprocess(inputLines, defs):
     defines = {}
     ifdefsStack = []
-    for lineNum, line in enumerate(inputLines, start=1):
+    for lineNum, line in inputLines:
         if line.startswith('#define'):
-            try:
-                parsedDefine = list(define.parseString(line))
-                if len(parsedDefine) == 2:
-                    name, val = parsedDefine
-                    defines[name] = NonArgDefine(name, val)
-                else:
-                    name, arg, val = parsedDefine
-                    localDefines = defines.copy()
-                    localDefines[arg] = NonArgDefine(arg, arg)
-                    val = _processLine(val, localDefines)
-                    defines[name] = ArgDefine(name, arg, val)
-            except:
-                pass
+            parsedDefine = list(define.parseString(line))
+            if len(parsedDefine) == 2:
+                name, val = parsedDefine
+                defines[name] = NonArgDefine(name, val)
+            else:
+                name, arg, val = parsedDefine
+                localDefines = defines.copy()
+                localDefines[arg] = NonArgDefine(arg, arg)
+                val = _processLine(val, localDefines)
+                defines[name] = ArgDefine(name, arg, val)
         elif line.startswith('#ifdef'):
             name = ifdef.parseString(line)[0]
             ifdefsStack.append(name)
         elif line.startswith('#endif'):
             ifdefsStack.pop()
+        elif line.startswith('#'):
+            yield lineNum, line
         elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)):
-            yield _processLine(line, defines)
+            yield lineNum, _processLine(line, defines)
+        
 \ No newline at end of file
@@ -15,38 +15,45 @@ class SegmentRule(object):
         Constructor
         '''
-class SimpleRule(SegmentRule):
+class TagRule(SegmentRule):
-    def __init__(self, name, typeId):
-        self.name = name
-        self.identifier = typeId
+    def __init__(self, tagType, line):
+        self.tagType = tagType
+        self.line = line
+
+class UnaryRule(SegmentRule):
+    
+    def __init__(self, child, line):
+        self.child = child
+        self.line = line
 class ComplexRule(SegmentRule):
-    def __init__(self, children):
+    def __init__(self, children, line):
         self.children = children
+        self.line = line
 class ConcatRule(ComplexRule):
-    def __init__(self, children):
-        super(ConcatRule, self).__init__(children)
+    def __init__(self, children, line):
+        super(ConcatRule, self).__init__(children, line)
 class OrRule(ComplexRule):
-    def __init__(self, children):
-        super(OrRule, self).__init__(children)
-
-class UnaryRule(SegmentRule):
-    
-    def __init__(self, child):
-        self.child = child
+    def __init__(self, children, line):
+        super(OrRule, self).__init__(children, line)
 class ZeroOrMoreRule(UnaryRule):
-    def __init__(self, child):
-        super(ZeroOrMoreRule, self).__init__(child)
+    def __init__(self, child, line):
+        super(ZeroOrMoreRule, self).__init__(child, line)
+        
+class OneOrMoreRule(UnaryRule):
+    
+    def __init__(self, child, line):
+        super(OneOrMoreRule, self).__init__(child, line)
 class IgnoreOrthRule(UnaryRule):
-    def __init__(self, child):
-        super(IgnoreOrthRule, self).__init__(child)
+    def __init__(self, child, line):
+        super(IgnoreOrthRule, self).__init__(child, line)
+
+from pyparsing import *
+from morfeuszbuilder.tagset import segtypes
+from morfeuszbuilder.utils import configFile
+from morfeuszbuilder.segrules import preprocessor
+import codecs
+import re
+
+import itertools
+import logging
+import segsfsa
+
+# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']')
+# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
+# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
+# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
+
+def doprint(toks):
+    # Debugging helper: writes the given parse tokens to stdout using the
+    # Python 2 print statement. Returns None.
+    print toks
+
+class RulesParser(object):
+    '''
+    Reads a segmentation-rules config file and builds one SegmentsFSA per
+    combination of option values declared in its [options] section.
+    '''
+    
+    def __init__(self, tagset):
+        '''
+        tagset -- stored as-is; later handed to segtypes.Segtypes in parse().
+        '''
+        self.tagset = tagset
+    
+    def _getKey2Defs(self, segtypesConfigFile):
+        '''
+        Parse the 'options' section into a dict: option key -> tuple of the
+        names listed after '='.
+
+        Each line must look like "<key> = <name> [<name> ...]", where keys and
+        names consist of alphanumerics and underscores.
+
+        Raises configFile.ConfigFileException (carrying the file name and the
+        line number) when a line cannot be parsed.
+        '''
+        res = {}
+        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
+            # NOTE(review): this grammar does not depend on the loop variables,
+            # yet it is rebuilt for every line; it could be built once.
+            lineToParse = Word(alphanums+'_') + Suppress('=') + Group(OneOrMore(Word(alphanums+'_'))) + LineEnd().suppress()
+            try:
+                key, defs = lineToParse.parseString(line)
+                res[key] = tuple(defs)
+            except Exception as ex:
+                # Any failure is re-raised as a config-file error tagged with
+                # the offending line number.
+                raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
+        return res
+    
+    def parse(self, filename):
+        '''
+        Parse the rules file `filename` and return a list of
+        segsfsa.SegmentsFSA objects, one per element of the cartesian product
+        of the option value tuples from the 'options' section.
+        '''
+        res = []
+        
+        # The config file is opened with these four section names.
+        segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes'])
+        key2Defs = self._getKey2Defs(segtypesConfigFile)
+        segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
+        
+        # Reverse lookup: option value name -> the option key it belongs to.
+        # A name listed under several keys maps to whichever key is seen last.
+        def2Key = {}
+        for key, defs in key2Defs.iteritems():
+            for define in defs:
+                def2Key[define] = key
+        
+        # One pass per combination: pick one value for every option key.
+        for defs in itertools.product(*key2Defs.values()):
+            key2Def = dict([(def2Key[define], define) for define in defs])
+            fsa = segsfsa.SegmentsFSA(key2Def)
+            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
+            # The chosen values are passed to the preprocessor as the set of
+            # active definitions; the result is materialised into a list.
+            combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))
+            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
+                fsa.addSegmentRule(rule)
+            res.append(fsa)
+        return res
+    
+    def _doParse(self, combinationEnumeratedLines, segtypesHelper):
+        '''
+        Generator: for every (lineNum, line) pair whose line does not start
+        with '#', yield the result of _doParseOneLine for that line.
+        '''
+        for lineNum, line in combinationEnumeratedLines:
+            if not line.startswith('#'):
+                yield self._doParseOneLine(lineNum, line, segtypesHelper)
+    
+    def _doParseOneLine(self, lineNum, line, segtypesHelper):
+        '''
+        Build the pyparsing grammar for a single rule line and parse `line`
+        with it, requiring the whole line to match (parseAll=True).
+
+        NOTE(review): in the code visible here the parse result is only bound
+        to `parsedLine`; nothing is returned and `lineNum` / `segtypesHelper`
+        are unused, so callers receive None. Looks like work in progress --
+        confirm against the full file.
+        '''
+        # Grammar, from the innermost element outwards. `^` is pyparsing's Or
+        # (alternation that keeps the longest match).
+        rule = Forward()
+        # A bare name made of alphanumerics and underscores.
+        tagRule = Word(alphanums+'_')
+        # A name followed by '>' (the '>' itself is dropped from the tokens).
+        ignoreOrthRule = tagRule + Suppress('>')
+        # A full rule wrapped in parentheses.
+        parenRule = Suppress('(') + rule + Suppress(')')
+        atomicRule = tagRule ^ ignoreOrthRule ^ parenRule
+        # Atomic element with a '*' or '+' suffix.
+        zeroOrMoreRule = atomicRule + Suppress('*')
+        oneOrMoreRule = atomicRule + Suppress('+')
+        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
+        # Unary elements separated by '|'.
+        oneOfRule = delimitedList(unaryRule, delim='|')
+        complexRule = unaryRule ^ oneOfRule
+        # A rule is a sequence of one or more such elements.
+        concatRule = OneOrMore(complexRule)
+        rule << concatRule
+#         rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule
+        
+#         tagRule.setParseAction(lambda s,l,toks: doprint(toks))
+#         print lineNum, line
+        parsedLine = rule.parseString(line, parseAll=True)
+#         print parsedLine
@@ -14,7 +14,7 @@ class SegmentsFSAState(object):
 class SegmentsFSA(object):
-    def __init__(self):
+    def __init__(self, key2Def={}):
         self.initialState = SegmentsFSAState()
     def addSegmentRule(self, segmentRule):
@@ -23,3 +23,5 @@ class SegmentsFSA(object):
     def serialize(self):
         res = bytearray()
         return res
+    
+    
 \ No newline at end of file
@@ -4,7 +4,7 @@ Created on 24 sty 2014
 @author: mlenart
 '''
-import preprocessor
+from morfeuszbuilder.segrules import preprocessor
 if __name__ == '__main__':
     text = '''
@@ -13,8 +13,8 @@ dupa
 #define X(x) a x b
 #define Y(x) X(x) c
 #define B(x) X(x)
-#define Z(x) Y(X(x)) d
-#define AB(asd) dupa asd dupa
+#define Z(x) Y( X(x) jhg) d
+#define A_B(asd) dupa asd dupa asfda_asdfa
 Y(Z(a) b X(c) Y(d))
 #ifdef extra
 asdfasa
@@ -30,7 +30,7 @@ aaaa asd
 asdfasdfada
 #endif
-AB(x)
+A_B( (x)+ x)
 '''
-    for line in preprocessor.preprocess(text.split('\n'), ['extra', 'superextra']):
+    for line in preprocessor.preprocess(enumerate(text.split('\n')), ['extra', 'superextra']):
         print line
 \ No newline at end of file
+'''
+Created on 18 lut 2014
+
+@author: mlenart
+'''
+import unittest
+import os
+from morfeuszbuilder.segrules import rulesParser
+from morfeuszbuilder.tagset import tagset
+
+class Test(unittest.TestCase):
+    t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
+    parser = rulesParser.RulesParser(t)
+    parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
+
+if __name__ == "__main__":
+    unittest.main()
+#     testParser()
 \ No newline at end of file
+#!MORFEUSZ-TAGSET 0.1
+
+[TAGS]
+
+0	adj:pl:acc:m1.p1:com
+1	adj:pl:acc:m1.p1:pos
+2	adj:pl:acc:m1.p1:sup
+3	adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com
+4	adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos
+5	adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup
+6	adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com
+7	adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
+8	adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
+9	adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com
+10	adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
+11	adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
+12	adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com
+13	adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
+14	adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
+15	adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com
+16	adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
+17	adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
+18	adj:pl:nom.voc:m1.p1:com
+19	adj:pl:nom.voc:m1.p1:pos
+20	adj:pl:nom.voc:m1.p1:sup
+21	adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com
+22	adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos
+23	adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup
+24	adj:pl:nom:m1.p1:pos
+25	adj:pl:nom:m2.m3.f.n1.n2.p2.p3:pos
+26	adj:sg:acc:f:com
+27	adj:sg:acc:f:pos
+28	adj:sg:acc:f:sup
+29	adj:sg:acc:m1.m2:com
+30	adj:sg:acc:m1.m2:pos
+31	adj:sg:acc:m1.m2:sup
+32	adj:sg:acc:m3:com
+33	adj:sg:acc:m3:pos
+34	adj:sg:acc:m3:sup
+35	adj:sg:acc:n1.n2:com
+36	adj:sg:acc:n1.n2:pos
+37	adj:sg:acc:n1.n2:sup
+38	adj:sg:dat:f:com
+39	adj:sg:dat:f:pos
+40	adj:sg:dat:f:sup
+41	adj:sg:dat:m1.m2.m3.n1.n2:com
+42	adj:sg:dat:m1.m2.m3.n1.n2:pos
+43	adj:sg:dat:m1.m2.m3.n1.n2:sup
+44	adj:sg:gen:f:com
+45	adj:sg:gen:f:pos
+46	adj:sg:gen:f:sup
+47	adj:sg:gen:m1.m2.m3.n1.n2:com
+48	adj:sg:gen:m1.m2.m3.n1.n2:pos
+49	adj:sg:gen:m1.m2.m3.n1.n2:sup
+50	adj:sg:inst:f:com
+51	adj:sg:inst:f:pos
+52	adj:sg:inst:f:sup
+53	adj:sg:inst:m1.m2.m3.n1.n2:com
+54	adj:sg:inst:m1.m2.m3.n1.n2:pos
+55	adj:sg:inst:m1.m2.m3.n1.n2:sup
+56	adj:sg:loc:f:com
+57	adj:sg:loc:f:pos
+58	adj:sg:loc:f:sup
+59	adj:sg:loc:m1.m2.m3.n1.n2:com
+60	adj:sg:loc:m1.m2.m3.n1.n2:pos
+61	adj:sg:loc:m1.m2.m3.n1.n2:sup
+62	adj:sg:nom.voc:f:com
+63	adj:sg:nom.voc:f:pos
+64	adj:sg:nom.voc:f:sup
+65	adj:sg:nom.voc:m1.m2.m3:com
+66	adj:sg:nom.voc:m1.m2.m3:pos
+67	adj:sg:nom.voc:m1.m2.m3:sup
+68	adj:sg:nom.voc:n1.n2:com
+69	adj:sg:nom.voc:n1.n2:pos
+70	adj:sg:nom.voc:n1.n2:sup
+71	adj:sg:nom:f:pos
+72	adj:sg:nom:m1.m2.m3:pos
+73	adj:sg:nom:n1.n2:pos
+74	adja
+75	adjc
+76	adjp
+77	adv
+78	adv:com
+79	adv:pos
+80	adv:sup
+81	aglt:pl:pri:imperf:nwok
+82	aglt:pl:pri:imperf:wok
+83	aglt:pl:sec:imperf:nwok
+84	aglt:pl:sec:imperf:wok
+85	aglt:sg:pri:imperf:nwok
+86	aglt:sg:pri:imperf:wok
+87	aglt:sg:sec:imperf:nwok
+88	aglt:sg:sec:imperf:wok
+89	bedzie:pl:pri:imperf
+90	bedzie:pl:sec:imperf
+91	bedzie:pl:ter:imperf
+92	bedzie:sg:pri:imperf
+93	bedzie:sg:sec:imperf
+94	bedzie:sg:ter:imperf
+95	burk
+96	comp
+97	conj
+98	depr:pl:nom:m2
+99	depr:pl:voc:m2
+100	fin:pl:pri:imperf
+101	fin:pl:pri:imperf.perf
+102	fin:pl:pri:perf
+103	fin:pl:sec:imperf
+104	fin:pl:sec:imperf.perf
+105	fin:pl:sec:perf
+106	fin:pl:ter:imperf
+107	fin:pl:ter:imperf.perf
+108	fin:pl:ter:perf
+109	fin:sg:pri:imperf
+110	fin:sg:pri:imperf.perf
+111	fin:sg:pri:perf
+112	fin:sg:sec:imperf
+113	fin:sg:sec:imperf.perf
+114	fin:sg:sec:perf
+115	fin:sg:ter:imperf
+116	fin:sg:ter:imperf.perf
+117	fin:sg:ter:perf
+118	ger:sg:dat.loc:n2:imperf.perf:aff
+119	ger:sg:dat.loc:n2:imperf.perf:neg
+120	ger:sg:dat.loc:n2:imperf:aff
+121	ger:sg:dat.loc:n2:imperf:neg
+122	ger:sg:dat.loc:n2:perf:aff
+123	ger:sg:dat.loc:n2:perf:neg
+124	ger:sg:gen:n2:imperf.perf:aff
+125	ger:sg:gen:n2:imperf.perf:neg
+126	ger:sg:gen:n2:imperf:aff
+127	ger:sg:gen:n2:imperf:neg
+128	ger:sg:gen:n2:perf:aff
+129	ger:sg:gen:n2:perf:neg
+130	ger:sg:inst:n2:imperf.perf:aff
+131	ger:sg:inst:n2:imperf.perf:neg
+132	ger:sg:inst:n2:imperf:aff
+133	ger:sg:inst:n2:imperf:neg
+134	ger:sg:inst:n2:perf:aff
+135	ger:sg:inst:n2:perf:neg
+136	ger:sg:nom.acc:n2:imperf.perf:aff
+137	ger:sg:nom.acc:n2:imperf.perf:neg
+138	ger:sg:nom.acc:n2:imperf:aff
+139	ger:sg:nom.acc:n2:imperf:neg
+140	ger:sg:nom.acc:n2:perf:aff
+141	ger:sg:nom.acc:n2:perf:neg
+142	imps:imperf
+143	imps:imperf.perf
+144	imps:perf
+145	impt:pl:pri:imperf
+146	impt:pl:pri:imperf.perf
+147	impt:pl:pri:perf
+148	impt:pl:sec:imperf
+149	impt:pl:sec:imperf.perf
+150	impt:pl:sec:perf
+151	impt:sg:sec:imperf
+152	impt:sg:sec:imperf.perf
+153	impt:sg:sec:perf
+154	inf:imperf
+155	inf:imperf.perf
+156	inf:perf
+157	interj
+158	num:comp
+159	num:pl:acc:m1:rec
+160	num:pl:dat.loc:n1.p1.p2:congr.rec
+161	num:pl:dat:m1.m2.m3.n2.f:congr
+162	num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr
+163	num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr
+164	num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr
+165	num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr
+166	num:pl:gen.loc:m1.m2.m3.n2.f:congr
+167	num:pl:gen:n1.p1.p2:rec
+168	num:pl:inst:f:congr
+169	num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr
+170	num:pl:inst:m1.m2.m3.f.n2:congr
+171	num:pl:inst:m1.m2.m3.n2.f:congr
+172	num:pl:inst:m1.m2.m3.n2:congr
+173	num:pl:inst:n1.p1.p2:rec
+174	num:pl:nom.acc.voc:f:congr
+175	num:pl:nom.acc.voc:m1:rec
+176	num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec
+177	num:pl:nom.acc.voc:m2.m3.f.n2:rec
+178	num:pl:nom.acc.voc:m2.m3.n2.f:congr
+179	num:pl:nom.acc.voc:m2.m3.n2:congr
+180	num:pl:nom.acc.voc:n1.p1.p2:rec
+181	num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec
+182	num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec
+183	num:pl:nom.voc:m1:congr
+184	num:pl:nom.voc:m1:rec
+185	num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec
+186	num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec
+187	pact:pl:acc:m1.p1:imperf.perf:aff
+188	pact:pl:acc:m1.p1:imperf.perf:neg
+189	pact:pl:acc:m1.p1:imperf:aff
+190	pact:pl:acc:m1.p1:imperf:neg
+191	pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+192	pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+193	pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+194	pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+195	pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+196	pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+197	pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+198	pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+199	pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+200	pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+201	pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+202	pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+203	pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff
+204	pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg
+205	pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff
+206	pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg
+207	pact:pl:nom.voc:m1.p1:imperf.perf:aff
+208	pact:pl:nom.voc:m1.p1:imperf.perf:neg
+209	pact:pl:nom.voc:m1.p1:imperf:aff
+210	pact:pl:nom.voc:m1.p1:imperf:neg
+211	pact:sg:acc.inst:f:imperf.perf:aff
+212	pact:sg:acc.inst:f:imperf.perf:neg
+213	pact:sg:acc.inst:f:imperf:aff
+214	pact:sg:acc.inst:f:imperf:neg
+215	pact:sg:acc:m1.m2:imperf.perf:aff
+216	pact:sg:acc:m1.m2:imperf.perf:neg
+217	pact:sg:acc:m1.m2:imperf:aff
+218	pact:sg:acc:m1.m2:imperf:neg
+219	pact:sg:acc:m3:imperf.perf:aff
+220	pact:sg:acc:m3:imperf.perf:neg
+221	pact:sg:acc:m3:imperf:aff
+222	pact:sg:acc:m3:imperf:neg
+223	pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff
+224	pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg
+225	pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff
+226	pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg
+227	pact:sg:gen.dat.loc:f:imperf.perf:aff
+228	pact:sg:gen.dat.loc:f:imperf.perf:neg
+229	pact:sg:gen.dat.loc:f:imperf:aff
+230	pact:sg:gen.dat.loc:f:imperf:neg
+231	pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff
+232	pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg
+233	pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff
+234	pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg
+235	pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff
+236	pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg
+237	pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff
+238	pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg
+239	pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff
+240	pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg
+241	pact:sg:nom.acc.voc:n1.n2:imperf:aff
+242	pact:sg:nom.acc.voc:n1.n2:imperf:neg
+243	pact:sg:nom.voc:f:imperf.perf:aff
+244	pact:sg:nom.voc:f:imperf.perf:neg
+245	pact:sg:nom.voc:f:imperf:aff
+246	pact:sg:nom.voc:f:imperf:neg
+247	pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff
+248	pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg
+249	pact:sg:nom.voc:m1.m2.m3:imperf:aff
+250	pact:sg:nom.voc:m1.m2.m3:imperf:neg
+251	pant:perf
+252	pcon:imperf
+253	ppas:pl:acc:m1.p1:imperf.perf:aff
+254	ppas:pl:acc:m1.p1:imperf.perf:neg
+255	ppas:pl:acc:m1.p1:imperf:aff
+256	ppas:pl:acc:m1.p1:imperf:neg
+257	ppas:pl:acc:m1.p1:perf:aff
+258	ppas:pl:acc:m1.p1:perf:neg
+259	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+260	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+261	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+262	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+263	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
+264	ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
+265	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+266	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+267	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+268	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+269	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
+270	ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
+271	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
+272	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
+273	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
+274	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
+275	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
+276	ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
+277	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff
+278	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg
+279	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff
+280	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg
+281	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff
+282	ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg
+283	ppas:pl:nom.voc:m1.p1:imperf.perf:aff
+284	ppas:pl:nom.voc:m1.p1:imperf.perf:neg
+285	ppas:pl:nom.voc:m1.p1:imperf:aff
+286	ppas:pl:nom.voc:m1.p1:imperf:neg
+287	ppas:pl:nom.voc:m1.p1:perf:aff
+288	ppas:pl:nom.voc:m1.p1:perf:neg
+289	ppas:sg:acc.inst:f:imperf.perf:aff
+290	ppas:sg:acc.inst:f:imperf.perf:neg
+291	ppas:sg:acc.inst:f:imperf:aff
+292	ppas:sg:acc.inst:f:imperf:neg
+293	ppas:sg:acc.inst:f:perf:aff
+294	ppas:sg:acc.inst:f:perf:neg
+295	ppas:sg:acc:m1.m2:imperf.perf:aff
+296	ppas:sg:acc:m1.m2:imperf.perf:neg
+297	ppas:sg:acc:m1.m2:imperf:aff
+298	ppas:sg:acc:m1.m2:imperf:neg
+299	ppas:sg:acc:m1.m2:perf:aff
+300	ppas:sg:acc:m1.m2:perf:neg
+301	ppas:sg:acc:m3:imperf.perf:aff
+302	ppas:sg:acc:m3:imperf.perf:neg
+303	ppas:sg:acc:m3:imperf:aff
+304	ppas:sg:acc:m3:imperf:neg
+305	ppas:sg:acc:m3:perf:aff
+306	ppas:sg:acc:m3:perf:neg
+307	ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff
+308	ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg
+309	ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff
+310	ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg
+311	ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff
+312	ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg
+313	ppas:sg:gen.dat.loc:f:imperf.perf:aff
+314	ppas:sg:gen.dat.loc:f:imperf.perf:neg
+315	ppas:sg:gen.dat.loc:f:imperf:aff
+316	ppas:sg:gen.dat.loc:f:imperf:neg
+317	ppas:sg:gen.dat.loc:f:perf:aff
+318	ppas:sg:gen.dat.loc:f:perf:neg
+319	ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff
+320	ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg
+321	ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff
+322	ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg
+323	ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff
+324	ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg
+325	ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff
+326	ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg
+327	ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff
+328	ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg
+329	ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff
+330	ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg
+331	ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff
+332	ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg
+333	ppas:sg:nom.acc.voc:n1.n2:imperf:aff
+334	ppas:sg:nom.acc.voc:n1.n2:imperf:neg
+335	ppas:sg:nom.acc.voc:n1.n2:perf:aff
+336	ppas:sg:nom.acc.voc:n1.n2:perf:neg
+337	ppas:sg:nom.voc:f:imperf.perf:aff
+338	ppas:sg:nom.voc:f:imperf.perf:neg
+339	ppas:sg:nom.voc:f:imperf:aff
+340	ppas:sg:nom.voc:f:imperf:neg
+341	ppas:sg:nom.voc:f:perf:aff
+342	ppas:sg:nom.voc:f:perf:neg
+343	ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff
+344	ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg
+345	ppas:sg:nom.voc:m1.m2.m3:imperf:aff
+346	ppas:sg:nom.voc:m1.m2.m3:imperf:neg
+347	ppas:sg:nom.voc:m1.m2.m3:perf:aff
+348	ppas:sg:nom.voc:m1.m2.m3:perf:neg
+349	ppron12:pl:acc:_:pri
+350	ppron12:pl:acc:_:sec
+351	ppron12:pl:dat:_:pri
+352	ppron12:pl:dat:_:sec
+353	ppron12:pl:gen:_:pri
+354	ppron12:pl:gen:_:sec
+355	ppron12:pl:inst:_:pri
+356	ppron12:pl:inst:_:sec
+357	ppron12:pl:loc:_:pri
+358	ppron12:pl:loc:_:sec
+359	ppron12:pl:nom:_:pri
+360	ppron12:pl:nom:_:sec
+361	ppron12:pl:voc:_:pri
+362	ppron12:pl:voc:_:sec
+363	ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc
+364	ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc
+365	ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc
+366	ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc
+367	ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc
+368	ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc
+369	ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc
+370	ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc
+371	ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc
+372	ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc
+373	ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc
+374	ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc
+375	ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri
+376	ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec
+377	ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri
+378	ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec
+379	ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri
+380	ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec
+381	ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri
+382	ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec
+383	ppron3:pl:acc:m1.p1:ter:_:npraep
+384	ppron3:pl:acc:m1.p1:ter:_:praep
+385	ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:npraep
+386	ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:praep
+387	ppron3:pl:dat:_:ter:_:npraep
+388	ppron3:pl:dat:_:ter:_:praep
+389	ppron3:pl:gen:_:ter:_:npraep
+390	ppron3:pl:gen:_:ter:_:praep
+391	ppron3:pl:inst:_:ter:_:_
+392	ppron3:pl:loc:_:ter:_:_
+393	ppron3:pl:nom:m1.p1:ter:_:_
+394	ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:_:_
+395	ppron3:sg:acc:f:ter:_:npraep
+396	ppron3:sg:acc:f:ter:_:praep
+397	ppron3:sg:acc:m1.m2.m3:ter:akc:npraep
+398	ppron3:sg:acc:m1.m2.m3:ter:akc:praep
+399	ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep
+400	ppron3:sg:acc:m1.m2.m3:ter:nakc:praep
+401	ppron3:sg:acc:n1.n2:ter:_:npraep
+402	ppron3:sg:acc:n1.n2:ter:_:praep
+403	ppron3:sg:dat:f:ter:_:npraep
+404	ppron3:sg:dat:f:ter:_:praep
+405	ppron3:sg:dat:m1.m2.m3:ter:_:praep
+406	ppron3:sg:dat:m1.m2.m3:ter:akc:npraep
+407	ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep
+408	ppron3:sg:dat:n1.n2:ter:_:praep
+409	ppron3:sg:dat:n1.n2:ter:akc:npraep
+410	ppron3:sg:dat:n1.n2:ter:nakc:npraep
+411	ppron3:sg:gen:f:ter:_:npraep
+412	ppron3:sg:gen:f:ter:_:praep
+413	ppron3:sg:gen:m1.m2.m3:ter:akc:npraep
+414	ppron3:sg:gen:m1.m2.m3:ter:akc:praep
+415	ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep
+416	ppron3:sg:gen:m1.m2.m3:ter:nakc:praep
+417	ppron3:sg:gen:n1.n2:ter:_:praep
+418	ppron3:sg:gen:n1.n2:ter:akc:npraep
+419	ppron3:sg:gen:n1.n2:ter:nakc:npraep
+420	ppron3:sg:inst:f:ter:_:praep
+421	ppron3:sg:inst:m1.m2.m3:ter:_:_
+422	ppron3:sg:inst:n1.n2:ter:_:_
+423	ppron3:sg:loc:f:ter:_:_
+424	ppron3:sg:loc:m1.m2.m3:ter:_:_
+425	ppron3:sg:loc:n1.n2:ter:_:_
+426	ppron3:sg:nom:f:ter:_:_
+427	ppron3:sg:nom:m1.m2.m3:ter:_:_
+428	ppron3:sg:nom:n1.n2:ter:_:_
+429	praet:pl:m1.p1:imperf
+430	praet:pl:m1.p1:imperf.perf
+431	praet:pl:m1.p1:perf
+432	praet:pl:m2.m3.f.n1.n2.p2.p3:imperf
+433	praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf
+434	praet:pl:m2.m3.f.n1.n2.p2.p3:perf
+435	praet:sg:f:imperf
+436	praet:sg:f:imperf.perf
+437	praet:sg:f:perf
+438	praet:sg:m1.m2.m3:imperf
+439	praet:sg:m1.m2.m3:imperf.perf
+440	praet:sg:m1.m2.m3:imperf:agl
+441	praet:sg:m1.m2.m3:imperf:nagl
+442	praet:sg:m1.m2.m3:perf
+443	praet:sg:m1.m2.m3:perf:agl
+444	praet:sg:m1.m2.m3:perf:nagl
+445	praet:sg:n1.n2:imperf
+446	praet:sg:n1.n2:imperf.perf
+447	praet:sg:n1.n2:perf
+448	pred
+449	prep:acc
+450	prep:acc:nwok
+451	prep:acc:wok
+452	prep:dat
+453	prep:gen
+454	prep:gen:nwok
+455	prep:gen:wok
+456	prep:inst
+457	prep:inst:nwok
+458	prep:inst:wok
+459	prep:loc
+460	prep:loc:nwok
+461	prep:loc:wok
+462	prep:nom
+463	qub
+464	subst:pl:acc:f
+465	subst:pl:acc:m1
+466	subst:pl:acc:m2
+467	subst:pl:acc:m3
+468	subst:pl:acc:n1
+469	subst:pl:acc:n2
+470	subst:pl:acc:p1
+471	subst:pl:acc:p2
+472	subst:pl:acc:p3
+473	subst:pl:dat:f
+474	subst:pl:dat:m1
+475	subst:pl:dat:m2
+476	subst:pl:dat:m3
+477	subst:pl:dat:n1
+478	subst:pl:dat:n2
+479	subst:pl:dat:p1
+480	subst:pl:dat:p2
+481	subst:pl:dat:p3
+482	subst:pl:gen:f
+483	subst:pl:gen:m1
+484	subst:pl:gen:m2
+485	subst:pl:gen:m3
+486	subst:pl:gen:n1
+487	subst:pl:gen:n2
+488	subst:pl:gen:p1
+489	subst:pl:gen:p2
+490	subst:pl:gen:p3
+491	subst:pl:inst:f
+492	subst:pl:inst:m1
+493	subst:pl:inst:m2
+494	subst:pl:inst:m3
+495	subst:pl:inst:n1
+496	subst:pl:inst:n2
+497	subst:pl:inst:p1
+498	subst:pl:inst:p2
+499	subst:pl:inst:p3
+500	subst:pl:loc:f
+501	subst:pl:loc:m1
+502	subst:pl:loc:m2
+503	subst:pl:loc:m3
+504	subst:pl:loc:n1
+505	subst:pl:loc:n2
+506	subst:pl:loc:p1
+507	subst:pl:loc:p2
+508	subst:pl:loc:p3
+509	subst:pl:nom:f
+510	subst:pl:nom:m1
+511	subst:pl:nom:m2
+512	subst:pl:nom:m3
+513	subst:pl:nom:n1
+514	subst:pl:nom:n2
+515	subst:pl:nom:p1
+516	subst:pl:nom:p2
+517	subst:pl:nom:p3
+518	subst:pl:voc:f
+519	subst:pl:voc:m1
+520	subst:pl:voc:m2
+521	subst:pl:voc:m3
+522	subst:pl:voc:n1
+523	subst:pl:voc:n2
+524	subst:pl:voc:p1
+525	subst:pl:voc:p2
+526	subst:pl:voc:p3
+527	subst:sg:acc:f
+528	subst:sg:acc:m1
+529	subst:sg:acc:m2
+530	subst:sg:acc:m3
+531	subst:sg:acc:n1
+532	subst:sg:acc:n2
+533	subst:sg:dat:f
+534	subst:sg:dat:m1
+535	subst:sg:dat:m2
+536	subst:sg:dat:m3
+537	subst:sg:dat:n1
+538	subst:sg:dat:n2
+539	subst:sg:gen:f
+540	subst:sg:gen:m1
+541	subst:sg:gen:m2
+542	subst:sg:gen:m3
+543	subst:sg:gen:n1
+544	subst:sg:gen:n2
+545	subst:sg:inst:f
+546	subst:sg:inst:m1
+547	subst:sg:inst:m2
+548	subst:sg:inst:m3
+549	subst:sg:inst:n1
+550	subst:sg:inst:n2
+551	subst:sg:loc:f
+552	subst:sg:loc:m1
+553	subst:sg:loc:m2
+554	subst:sg:loc:m3
+555	subst:sg:loc:n1
+556	subst:sg:loc:n2
+557	subst:sg:nom:f
+558	subst:sg:nom:m1
+559	subst:sg:nom:m2
+560	subst:sg:nom:m3
+561	subst:sg:nom:n1
+562	subst:sg:nom:n2
+563	subst:sg:voc:f
+564	subst:sg:voc:m1
+565	subst:sg:voc:m2
+566	subst:sg:voc:m3
+567	subst:sg:voc:n1
+568	subst:sg:voc:n2
+569	winien:pl:m1.p1:imperf
+570	winien:pl:m2.m3.f.n1.n2.p2.p3:imperf
+571	winien:sg:f:imperf
+572	winien:sg:m1.m2.m3:imperf
+573	winien:sg:n1.n2:imperf
+
+[NAMES]
+
+0	
+1	etnonim
+2	geograficzna
+3	imię
+4	nazwisko
+5	określenie dodatkowe
+6	organizacja
+7	osoba
+8	pospolita
+9	własna
+10	wydarzenie
+11	wytwór
+
+'''
+Created on 18 lut 2014
+
+@author: mlenart
+'''
+import unittest
+import codecs
+import os
+
+from morfeuszbuilder.segrules import preprocessor
+from morfeuszbuilder.utils import configFile
+
+
+class Test(unittest.TestCase):
+
+
+    def testPreprocess(self):
+        filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat')
+        parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes'])
+        linesEnum = parsedFile.enumerateLinesInSection('combinations')
+        for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
+            print (lineNum, line)
+
+
+# Run the tests in this module when it is executed directly as a script.
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testPreprocess']
+    unittest.main()
 \ No newline at end of file
+[options]
+aggl=permissive strict isolated
+praet=split composite
+
+[combinations]
+(dupa|dupa)
+#define wsz_interp (interp|kropka|dywiz)*
+
+#define moze_interp(segmenty) wsz_interp segmenty wsz_interp
+
+# Segmenty występujące samodzielnie:
+#
+# domyślny typ segmentu samodzielnego:
+moze_interp(samodz)
+
+# segment samotny, który nie dopuszcza nawet znaku interpunkcyjnego po
+# sobie
+samotny
+
+# przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”:
+moze_interp(praet_sg_na)
+
+# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”:
+moze_interp(praet_sg)
+
+# przeszlik mnogi, np. „czytali”:
+moze_interp(praet_pl)
+
+# partykuła „by”:
+moze_interp(by)
+
+# inne segmenty, które dopuszczają po sobie aglutynant,
+# np. „powinna”, „czyżby”:
+moze_interp(z_aglt)
+
+# forma przymiotnikowa (dopuszcza adja):
+moze_interp(adj)
+
+# dywiz (jako samodzielny segment jest tylko błędnym użyciem w funkcji
+# myślnika, ale trzeba to dopuścić):
+dywiz
+
+#ifdef isolated
+adja
+#endif
+
+
+# Połączenia z aglutynantami:
+#
+#ifdef split
+# Czas przeszły:
+# np. „gniotł·am”
+moze_interp( praet_sg_agl aglsg )
+# np. „czytał·em”
+moze_interp(praet_sg aglsg)
+# np. „czytali·ście”
+moze_interp(praet_pl aglpl)
+
+# Tryb warunkowy:
+# np. „gniótł·by”
+moze_interp(praet_sg_na by)
+# np. „czytało·by”
+moze_interp(praet_sg by)
+# np. „gnietli·by”
+moze_interp(praet_pl by)
+# np. „gniótł·by·ś”
+moze_interp(praet_sg_na by aglsg)
+# np. „czytał·by·m”
+moze_interp(praet_sg by aglsg)
+# np. „gnietli·by·śmy”
+moze_interp(praet_pl by aglpl)
+#else
+moze_interp(praetcond)
+#endif
+# np. „by·ś”
+moze_interp(by aglsg)
+# np. „by·ście”
+moze_interp(by aglpl)
+
+# np. „gdyby·m”
+moze_interp(z_aglt aglsg)
+# np. „gdyby·ście”
+moze_interp(z_aglt aglpl)
+
+# To jest dużo za dużo, ale tytułem eksperymentu:
+#ifdef permissive
+moze_interp(samodz aglsg)
+moze_interp(samodz aglpl)
+#endif
+
+# Złożone formy przymiotnikowe
+# np. „biało·-·czerwony”
+moze_interp( (adja dywiz)+ adj )
+# poniższe załatwione przez + powyżej:
+# # np. „niebiesko·-·biało·-·czerwona”
+# adja dywiz adja dywiz adj interp?
+# # itd. (zatrzymujemy się pragmatycznie na 5 członach)
+# adja dywiz adja dywiz adja dywiz adj interp?
+# adja dywiz adja dywiz adja dywiz adja dywiz adj interp?
+
+# Stopień najwyższy:
+# np. „naj·zieleńszy”, „naj·mądrzej”
+moze_interp( naj> adj_sup )
+
+# Formy „zanegowane” gerundiów i imiesłowów:
+# np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
+moze_interp( nie > negat )
+
+# Przyimki akceptujące krótką formę „-ń”
+moze_interp(z_on_agl)
+# np. „do·ń”
+moze_interp(z_on_agl on_agl)
+
+# Liczba zapisana jako ciąg cyfr:
+moze_interp( dig>* dig )
+
+# Formacje prefiksalne
+#### trzeba wydzielić odpowiednie samodze!
+# rzeczownikowe i przymiotnikowe
+# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy”
+moze_interp( prefs samodz )
+# czasownikowe np. „po·nakapywać”
+moze_interp( prefv samodz )
+
+# Apozycje z dywizem
+# np. „kobieta-prezydent”
+moze_interp( samodz dywiz samodz )
+# poniższe do sprawdzenia, najwyraźniej obecne w tekstach, skoro wprowadziliśmy:
+# ?
+adj dywiz adj
+# ?
+adj dywiz samodz
+# ?
+samodz dywiz adj
+
+
+[tags]
+naj	naj
+nie	nie
+prefs	prefs
+prefv	prefv
+dig	dig
+adja	adja
+adj	adj:%:pos
+adj_sup	adj:%:sup
+adj_sup	adv:sup
+negat	ger:%:neg
+negat	pact:%:neg
+negat	ppas:%:neg
+on_agl	ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
+z_on_agl	prep:%
+samotny	brev:pun
+samotny	brev:npun
+samotny	intrj
+interp	interp
+aglsg	aglt:sg:%
+aglpl	aglt:pl:%
+praetcond	cond:%
+praetcond	praet:%:pri:%
+praetcond	praet:%:sec:%
+praetcond	praet:%:ter:%
+praet_sg_agl	praet:sg:%:agl
+praet_sg_na	praet:sg:%:nagl
+praet_sg	praet:sg:%
+praet_pl	praet:pl:%
+praet_sg	winien:sg:%
+praet_pl	winien:pl:%
+samodz		%
+
+[lexemes]
+z_aglt	aby:comp
+z_aglt	bowiem:comp
+by	by:qub
+z_aglt	by:comp
+z_aglt	cóż:subst
+z_aglt	czemu:adv
+z_aglt	czyżby:qub
+z_aglt	choćby:comp
+z_aglt	chociażby:comp
+z_aglt	dlaczego:adv
+z_aglt	dopóki:comp
+z_aglt	dopóty:conj
+z_aglt	gdyby:comp
+z_aglt	gdzie:qub
+z_aglt	gdzie:adv
+z_aglt	jakby:comp
+z_aglt	jakoby:comp
+z_aglt	kiedy:adv
+z_aglt	kiedy:comp
+z_aglt	tylko:qub
+z_aglt	żeby:comp
+dywiz	-:interp
+kropka	.:interp
+'''
+Created on 17 lut 2014
+
+@author: mlenart
+'''
+import re
+
+class Segtypes(object):
+    
+    def __init__(self, tagset, segrulesFile):
+        
+        self.tagset = tagset
+        
+        self.segrulesConfigFile = segrulesFile
+        
+        self.segtype2Segnum = {}
+        self.patternsList = []
+    
+    def readTags(self, lines):
+        inTags = False
+        for lineNum, line in enumerate(lines, start=1):
+            header = self._getHeaderValue(line, lineNum)
+            if header == 'tags':
+                inTags = True
+            elif header:
+                inTags = False
+            elif inTags:
+                segtype, pattern = line.strip().split('\t')
+                self._validate(
+                               u'Segment type must be a lowercase alphanumeric with optional underscores',
+                               lineNum,
+                               re.match(r'[a-z_]+', segtype))
+                self._validate(
+                               u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
+                               lineNum,
+                               re.match(r'[a-z_\.\:\%]+', pattern))
+                
+                if segtype in self.segtype2Segnum:
+                    segnum = self.segtype2Segnum[segtype]
+                else:
+                    segnum = len(self.segtype2Segnum)
+                    self.segtype2Segnum[segtype] = segnum
+                
+                self.patternsList.append(SegtypePattern(None, pattern, segnum))
+    
+    def readLexemes(self, lines):
+        inLexemes = False
+        for lineNum, line in enumerate(lines, start=1):
+            header = self._getHeaderValue(line, lineNum)
+            if header == 'lexemes':
+                inLexemes = True
+            elif header:
+                inLexemes = False
+            elif inLexemes:
+                segtype, pattern = line.strip().split('\t')
+                self._validate(
+                               u'Segment type must be a lowercase alphanumeric with optional underscores',
+                               lineNum,
+                               re.match(r'[a-z_]+', segtype))
+                self._validate(
+                               u'Pattern must contain lemma and POS',
+                               lineNum,
+                               re.match(r'\w+\:[a-z_]+', pattern, re.U))
+                
+                if segtype in self.segtype2Segnum:
+                    segnum = self.segtype2Segnum[segtype]
+                else:
+                    segnum = len(self.segtype2Segnum)
+                    self.segtype2Segnum[segtype] = segnum
+                
+                lemma, pos = pattern.split(':')
+                
+                self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum))
+    
+    def lexeme2Segnum(self, lemma, tag):
+        for p in self.patternsList:
+            res = p.tryToMatch(lemma, tag)
+            if res >= 0:
+                return res
+        raise SegtypesException('Cannot find segment type for given tag: %s' % tag)
+    
+class SegtypePattern(object):
+    
+    def __init__(self, lemma, pattern, segnum):
+        self.lemma = lemma
+        self.pattern = pattern
+        self.segnum = segnum
+    
+    def tryToMatch(self, lemma, tag):
+        if (self.lemma is None or self.lemma == lemma) \
+        and re.match(self.pattern.replace('%', '.*'), tag):
+            return self.segnum
+        else:
+            return -1
+
+class SegtypesException(Exception):
+    
+    def __init__(self, msg):
+        self.msg = msg
+    
+    def __str__(self):
+        return u'Error in segment rules: %s' % self.msg
+'''
+Created on 17 lut 2014
+
+@author: mlenart
+'''
+
+import codecs
+
+class Tagset(object):
+    
+    TAGS = 1
+    NAMES = 2
+    SEP = '\t'
+    
+    def __init__(self, filename, encoding='utf8'):
+        self.tag2tagnum = {}
+        self.name2namenum = {}
+        self._doInit(filename, encoding)
+        self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems()))
+    
+    def _doInit(self, filename, encoding):
+        addingTo = None
+        with codecs.open(filename, 'r', encoding) as f:
+            for line in f:
+                line = line.strip('\n')
+                if line == u'[TAGS]':
+                    addingTo = Tagset.TAGS
+                elif line == u'[NAMES]':
+                    addingTo = Tagset.NAMES
+                elif line and not line.startswith(u'#'):
+                    assert addingTo in [Tagset.TAGS, Tagset.NAMES]
+                    res = {Tagset.TAGS: self.tag2tagnum,
+                           Tagset.NAMES: self.name2namenum}[addingTo]
+                    tagNum = line.split(Tagset.SEP)[0]
+                    tag = line.split(Tagset.SEP)[1]
+                    assert tag not in res
+                    res[tag] = int(tagNum)
+    
+    def getTag4Tagnum(self, tagnum):
+        return self.tagnum2tag[tagnum]
 \ No newline at end of file
+'''
+Created on 18 lut 2014
+
+@author: mlenart
+'''
+
+import re
+import codecs
+
+def getHeaderValue(line, lineNum):
+    m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
+    if m:
+        return m.group(1)
+    else:
+        return None
+
+class ConfigFile(object):
+    
+    def __init__(self, filename, sectionNames):
+        self.filename = filename
+        self.sectionNames = sectionNames
+        self.section2Lines = {}
+        self.currSection = None
+        self._parse()
+    
+    def _addSectionStart(self, sectionName, lineNum):
+        if not sectionName in self.sectionNames:
+            raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName)
+        if sectionName in self.section2Lines:
+            raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName)
+        self.section2Lines[sectionName] = []
+        self.currSection = sectionName
+    
+    def _addLine(self, line, lineNum):
+        line = line.strip()
+        if line:
+            if self.currSection is None and not line.startswith('#'):
+                raise ConfigFileException(self.filename, lineNum, 'Text outside of any section')
+            self.section2Lines[self.currSection].append((lineNum, line))
+    
+    def _getHeaderValue(self, line, lineNum):
+        m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
+        if m:
+            return m.group(1)
+        else:
+            return None
+    
+    def enumerateLinesInSection(self, sectionName):
+        return self.section2Lines[sectionName]
+    
+    def _parse(self):
+        with codecs.open(self.filename, 'r', 'utf8') as f:
+            for lineNum, line in enumerate(f, start=1):
+                header = self._getHeaderValue(line, lineNum)
+                if header:
+                    self._addSectionStart(header, lineNum)
+                else:
+                    self._addLine(line, lineNum)
+
+class ConfigFileException(Exception):
+    
+    def __init__(self, filename, lineNum, msg):
+        self.filename = filename
+        self.lineNum = lineNum
+        self.msg = msg
+    
+    def __str__(self):
+        return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)