Commit 1c1bf6777d2888a58f0faf084f903a5534c74a60
1 parent
28f11d57
- różne poprawki w parsowaniu tagsetu
- praca nad parsowaniem reguł zlepiania segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@85 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 18 changed files with 1231 additions and 125 deletions
fsabuilder/.settings/org.eclipse.core.resources.prefs
fsabuilder/.settings/org.eclipse.ltk.core.refactoring.prefs
0 → 100644
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -77,34 +77,3 @@ class Interpretation4Generator(object): | @@ -77,34 +77,3 @@ class Interpretation4Generator(object): | ||
77 | 77 | ||
78 | def __repr__(self): | 78 | def __repr__(self): |
79 | return unicode(self) | 79 | return unicode(self) |
80 | - | ||
81 | -class Tagset(object): | ||
82 | - | ||
83 | - TAGS = 1 | ||
84 | - NAMES = 2 | ||
85 | - SEP = '\t' | ||
86 | - | ||
87 | - def __init__(self, filename, encoding='utf8'): | ||
88 | - self.tag2tagnum = {} | ||
89 | - self.name2namenum = {} | ||
90 | - self._doInit(filename, encoding) | ||
91 | -# print self.tag2tagnum | ||
92 | -# print self.name2namenum | ||
93 | - | ||
94 | - def _doInit(self, filename, encoding): | ||
95 | - addingTo = None | ||
96 | - with codecs.open(filename, 'r', encoding) as f: | ||
97 | - for line in f: | ||
98 | - line = line.strip('\n') | ||
99 | - if line == u'[TAGS]': | ||
100 | - addingTo = Tagset.TAGS | ||
101 | - elif line == u'[NAMES]': | ||
102 | - addingTo = Tagset.NAMES | ||
103 | - elif line and not line.startswith(u'#'): | ||
104 | - assert addingTo in [Tagset.TAGS, Tagset.NAMES] | ||
105 | - res = {Tagset.TAGS: self.tag2tagnum, | ||
106 | - Tagset.NAMES: self.name2namenum}[addingTo] | ||
107 | - tagNum = line.split(Tagset.SEP)[0] | ||
108 | - tag = line.split(Tagset.SEP)[1] | ||
109 | - assert tag not in res | ||
110 | - res[tag] = int(tagNum) |
fsabuilder/morfeuszbuilder/fsa/test/testConstruction.py
@@ -6,62 +6,62 @@ Created on Oct 8, 2013 | @@ -6,62 +6,62 @@ Created on Oct 8, 2013 | ||
6 | ''' | 6 | ''' |
7 | import unittest | 7 | import unittest |
8 | import os | 8 | import os |
9 | -from fsa import fsa, visualizer, encode, buildfsa | ||
10 | -from fsa.serializer import SimpleSerializer | 9 | +from morfeuszbuilder.fsa import fsa, visualizer, encode |
10 | +from morfeuszbuilder.fsa.serializer import SimpleSerializer | ||
11 | 11 | ||
12 | class Test(unittest.TestCase): | 12 | class Test(unittest.TestCase): |
13 | - | ||
14 | - def testSimpleConstruction(self): | ||
15 | - a = fsa.FSA(encode.SimpleEncoder()) | ||
16 | - input = sorted([ | ||
17 | - (u'bić', ''), | ||
18 | - (u'bij', ''), | ||
19 | - (u'biją', ''), | ||
20 | - (u'bijcie', ''), | ||
21 | - (u'bije', ''), | ||
22 | - (u'bijecie', ''), | ||
23 | - (u'bijemy', ''), | ||
24 | - (u'bijesz', ''), | ||
25 | - (u'biję', ''), | ||
26 | - (u'bijmy', ''), | ||
27 | - (u'bili', 'asd'), | ||
28 | - (u'biliby', ''), | ||
29 | - (u'bilibyście', ''), | ||
30 | - (u'bilibyśmy', ''), | ||
31 | - (u'biliście', 'asdfas'), | ||
32 | - (u'biliśmy', ''), | ||
33 | - (u'bił', 'wersadfas'), | ||
34 | - (u'biła', 'asdfasd'), | ||
35 | - (u'biłaby', 'asdfa'), | ||
36 | - (u'biłabym', ''), | ||
37 | - (u'biłabyś', 'asdfa'), | ||
38 | - (u'biłam', 'dfas'), | ||
39 | - (u'biłaś', 'asdfas'), | ||
40 | - (u'biłby', ''), | ||
41 | - (u'biłbym', 'asdfa'), | ||
42 | - (u'biłbyś', ''), | ||
43 | - (u'biłem', ''), | ||
44 | - (u'biłeś', 'sadfa'), | ||
45 | - (u'biły', ''), | ||
46 | - (u'biłyby', ''), | ||
47 | - (u'biłybyście', ''), | ||
48 | - (u'biłybyśmy', ''), | ||
49 | - (u'biłyście', ''), | ||
50 | - (u'biłyśmy', ''), | ||
51 | - ], key=lambda w: bytearray(w[0], 'utf8')) | ||
52 | - a.feed(input) | ||
53 | - for w, res in input: | ||
54 | - recognized = a.tryToRecognize(w) | ||
55 | - assert recognized == res | ||
56 | - a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0)) | ||
57 | - visualizer.Visualizer().visualize(a) | ||
58 | - | ||
59 | - def testPolimorfConstruction(self): | ||
60 | - inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') | ||
61 | - tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') | ||
62 | - fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) | ||
63 | - serializer = SimpleSerializer(fsa) | ||
64 | - serializer.serialize2BinaryFile('/tmp/test0.fsa') | 13 | + pass |
14 | +# def testSimpleConstruction(self): | ||
15 | +# a = fsa.FSA(encode.SimpleEncoder()) | ||
16 | +# input = sorted([ | ||
17 | +# (u'bić', ''), | ||
18 | +# (u'bij', ''), | ||
19 | +# (u'biją', ''), | ||
20 | +# (u'bijcie', ''), | ||
21 | +# (u'bije', ''), | ||
22 | +# (u'bijecie', ''), | ||
23 | +# (u'bijemy', ''), | ||
24 | +# (u'bijesz', ''), | ||
25 | +# (u'biję', ''), | ||
26 | +# (u'bijmy', ''), | ||
27 | +# (u'bili', 'asd'), | ||
28 | +# (u'biliby', ''), | ||
29 | +# (u'bilibyście', ''), | ||
30 | +# (u'bilibyśmy', ''), | ||
31 | +# (u'biliście', 'asdfas'), | ||
32 | +# (u'biliśmy', ''), | ||
33 | +# (u'bił', 'wersadfas'), | ||
34 | +# (u'biła', 'asdfasd'), | ||
35 | +# (u'biłaby', 'asdfa'), | ||
36 | +# (u'biłabym', ''), | ||
37 | +# (u'biłabyś', 'asdfa'), | ||
38 | +# (u'biłam', 'dfas'), | ||
39 | +# (u'biłaś', 'asdfas'), | ||
40 | +# (u'biłby', ''), | ||
41 | +# (u'biłbym', 'asdfa'), | ||
42 | +# (u'biłbyś', ''), | ||
43 | +# (u'biłem', ''), | ||
44 | +# (u'biłeś', 'sadfa'), | ||
45 | +# (u'biły', ''), | ||
46 | +# (u'biłyby', ''), | ||
47 | +# (u'biłybyście', ''), | ||
48 | +# (u'biłybyśmy', ''), | ||
49 | +# (u'biłyście', ''), | ||
50 | +# (u'biłyśmy', ''), | ||
51 | +# ], key=lambda w: bytearray(w[0], 'utf8')) | ||
52 | +# a.feed(input) | ||
53 | +# for w, res in input: | ||
54 | +# recognized = a.tryToRecognize(w) | ||
55 | +# assert recognized == res | ||
56 | +# a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0)) | ||
57 | +# visualizer.Visualizer().visualize(a) | ||
58 | +# | ||
59 | +# def testPolimorfConstruction(self): | ||
60 | +# inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') | ||
61 | +# tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') | ||
62 | +# fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) | ||
63 | +# serializer = SimpleSerializer(fsa) | ||
64 | +# serializer.serialize2BinaryFile('/tmp/test0.fsa') | ||
65 | # visualizer.Visualizer().visualize(fsa) | 65 | # visualizer.Visualizer().visualize(fsa) |
66 | 66 | ||
67 | if __name__ == "__main__": | 67 | if __name__ == "__main__": |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,6 +7,7 @@ import re | @@ -7,6 +7,7 @@ import re | ||
7 | from pyparsing import * | 7 | from pyparsing import * |
8 | 8 | ||
9 | identifier = Word(alphas, bodyChars=alphanums+'_') | 9 | identifier = Word(alphas, bodyChars=alphanums+'_') |
10 | +token = Word(alphas, bodyChars=alphanums+'_+>') | ||
10 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | 11 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
11 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | 12 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
12 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | 13 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
@@ -64,7 +65,7 @@ def _processLine(line, defines): | @@ -64,7 +65,7 @@ def _processLine(line, defines): | ||
64 | defineInstance = Forward() | 65 | defineInstance = Forward() |
65 | localId = identifier.copy() | 66 | localId = identifier.copy() |
66 | 67 | ||
67 | - rule << OneOrMore(localId ^ defineInstance ^ Word('*|+?')) | 68 | + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')'))) |
68 | defineInstance << localId + Suppress('(') + rule + Suppress(')') | 69 | defineInstance << localId + Suppress('(') + rule + Suppress(')') |
69 | 70 | ||
70 | rule.setParseAction(lambda s, l, t: ' '.join(t)) | 71 | rule.setParseAction(lambda s, l, t: ' '.join(t)) |
@@ -77,25 +78,25 @@ def _processLine(line, defines): | @@ -77,25 +78,25 @@ def _processLine(line, defines): | ||
77 | def preprocess(inputLines, defs): | 78 | def preprocess(inputLines, defs): |
78 | defines = {} | 79 | defines = {} |
79 | ifdefsStack = [] | 80 | ifdefsStack = [] |
80 | - for lineNum, line in enumerate(inputLines, start=1): | 81 | + for lineNum, line in inputLines: |
81 | if line.startswith('#define'): | 82 | if line.startswith('#define'): |
82 | - try: | ||
83 | - parsedDefine = list(define.parseString(line)) | ||
84 | - if len(parsedDefine) == 2: | ||
85 | - name, val = parsedDefine | ||
86 | - defines[name] = NonArgDefine(name, val) | ||
87 | - else: | ||
88 | - name, arg, val = parsedDefine | ||
89 | - localDefines = defines.copy() | ||
90 | - localDefines[arg] = NonArgDefine(arg, arg) | ||
91 | - val = _processLine(val, localDefines) | ||
92 | - defines[name] = ArgDefine(name, arg, val) | ||
93 | - except: | ||
94 | - pass | 83 | + parsedDefine = list(define.parseString(line)) |
84 | + if len(parsedDefine) == 2: | ||
85 | + name, val = parsedDefine | ||
86 | + defines[name] = NonArgDefine(name, val) | ||
87 | + else: | ||
88 | + name, arg, val = parsedDefine | ||
89 | + localDefines = defines.copy() | ||
90 | + localDefines[arg] = NonArgDefine(arg, arg) | ||
91 | + val = _processLine(val, localDefines) | ||
92 | + defines[name] = ArgDefine(name, arg, val) | ||
95 | elif line.startswith('#ifdef'): | 93 | elif line.startswith('#ifdef'): |
96 | name = ifdef.parseString(line)[0] | 94 | name = ifdef.parseString(line)[0] |
97 | ifdefsStack.append(name) | 95 | ifdefsStack.append(name) |
98 | elif line.startswith('#endif'): | 96 | elif line.startswith('#endif'): |
99 | ifdefsStack.pop() | 97 | ifdefsStack.pop() |
98 | + elif line.startswith('#'): | ||
99 | + yield lineNum, line | ||
100 | elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): | 100 | elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): |
101 | - yield _processLine(line, defines) | 101 | + yield lineNum, _processLine(line, defines) |
102 | + | ||
102 | \ No newline at end of file | 103 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/segrules.py renamed to fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -15,38 +15,45 @@ class SegmentRule(object): | @@ -15,38 +15,45 @@ class SegmentRule(object): | ||
15 | Constructor | 15 | Constructor |
16 | ''' | 16 | ''' |
17 | 17 | ||
18 | -class SimpleRule(SegmentRule): | 18 | +class TagRule(SegmentRule): |
19 | 19 | ||
20 | - def __init__(self, name, typeId): | ||
21 | - self.name = name | ||
22 | - self.identifier = typeId | 20 | + def __init__(self, tagType, line): |
21 | + self.tagType = tagType | ||
22 | + self.line = line | ||
23 | + | ||
24 | +class UnaryRule(SegmentRule): | ||
25 | + | ||
26 | + def __init__(self, child, line): | ||
27 | + self.child = child | ||
28 | + self.line = line | ||
23 | 29 | ||
24 | class ComplexRule(SegmentRule): | 30 | class ComplexRule(SegmentRule): |
25 | 31 | ||
26 | - def __init__(self, children): | 32 | + def __init__(self, children, line): |
27 | self.children = children | 33 | self.children = children |
34 | + self.line = line | ||
28 | 35 | ||
29 | class ConcatRule(ComplexRule): | 36 | class ConcatRule(ComplexRule): |
30 | 37 | ||
31 | - def __init__(self, children): | ||
32 | - super(ConcatRule, self).__init__(children) | 38 | + def __init__(self, children, line): |
39 | + super(ConcatRule, self).__init__(children, line) | ||
33 | 40 | ||
34 | class OrRule(ComplexRule): | 41 | class OrRule(ComplexRule): |
35 | 42 | ||
36 | - def __init__(self, children): | ||
37 | - super(OrRule, self).__init__(children) | ||
38 | - | ||
39 | -class UnaryRule(SegmentRule): | ||
40 | - | ||
41 | - def __init__(self, child): | ||
42 | - self.child = child | 43 | + def __init__(self, children, line): |
44 | + super(OrRule, self).__init__(children, line) | ||
43 | 45 | ||
44 | class ZeroOrMoreRule(UnaryRule): | 46 | class ZeroOrMoreRule(UnaryRule): |
45 | 47 | ||
46 | - def __init__(self, child): | ||
47 | - super(ZeroOrMoreRule, self).__init__(child) | 48 | + def __init__(self, child, line): |
49 | + super(ZeroOrMoreRule, self).__init__(child, line) | ||
50 | + | ||
51 | +class OneOrMoreRule(UnaryRule): | ||
52 | + | ||
53 | + def __init__(self, child, line): | ||
54 | + super(OneOrMoreRule, self).__init__(child, line) | ||
48 | 55 | ||
49 | class IgnoreOrthRule(UnaryRule): | 56 | class IgnoreOrthRule(UnaryRule): |
50 | 57 | ||
51 | - def __init__(self, child): | ||
52 | - super(IgnoreOrthRule, self).__init__(child) | 58 | + def __init__(self, child, line): |
59 | + super(IgnoreOrthRule, self).__init__(child, line) |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
0 → 100644
1 | + | ||
2 | +from pyparsing import * | ||
3 | +from morfeuszbuilder.tagset import segtypes | ||
4 | +from morfeuszbuilder.utils import configFile | ||
5 | +from morfeuszbuilder.segrules import preprocessor | ||
6 | +import codecs | ||
7 | +import re | ||
8 | + | ||
9 | +import itertools | ||
10 | +import logging | ||
11 | +import segsfsa | ||
12 | + | ||
13 | +# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']') | ||
14 | +# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | ||
15 | +# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | ||
16 | +# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | ||
17 | + | ||
18 | +def doprint(toks): | ||
19 | + print toks | ||
20 | + | ||
21 | +class RulesParser(object): | ||
22 | + | ||
23 | + def __init__(self, tagset): | ||
24 | + self.tagset = tagset | ||
25 | + | ||
26 | + def _getKey2Defs(self, segtypesConfigFile): | ||
27 | + res = {} | ||
28 | + for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'): | ||
29 | + lineToParse = Word(alphanums+'_') + Suppress('=') + Group(OneOrMore(Word(alphanums+'_'))) + LineEnd().suppress() | ||
30 | + try: | ||
31 | + key, defs = lineToParse.parseString(line) | ||
32 | + res[key] = tuple(defs) | ||
33 | + except Exception as ex: | ||
34 | + raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | ||
35 | + return res | ||
36 | + | ||
37 | + def parse(self, filename): | ||
38 | + res = [] | ||
39 | + | ||
40 | + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) | ||
41 | + key2Defs = self._getKey2Defs(segtypesConfigFile) | ||
42 | + segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) | ||
43 | + | ||
44 | + def2Key = {} | ||
45 | + for key, defs in key2Defs.iteritems(): | ||
46 | + for define in defs: | ||
47 | + def2Key[define] = key | ||
48 | + | ||
49 | + for defs in itertools.product(*key2Defs.values()): | ||
50 | + key2Def = dict([(def2Key[define], define) for define in defs]) | ||
51 | + fsa = segsfsa.SegmentsFSA(key2Def) | ||
52 | + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') | ||
53 | + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) | ||
54 | + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | ||
55 | + fsa.addSegmentRule(rule) | ||
56 | + res.append(fsa) | ||
57 | + return res | ||
58 | + | ||
59 | + def _doParse(self, combinationEnumeratedLines, segtypesHelper): | ||
60 | + for lineNum, line in combinationEnumeratedLines: | ||
61 | + if not line.startswith('#'): | ||
62 | + yield self._doParseOneLine(lineNum, line, segtypesHelper) | ||
63 | + | ||
64 | + def _doParseOneLine(self, lineNum, line, segtypesHelper): | ||
65 | + rule = Forward() | ||
66 | + tagRule = Word(alphanums+'_') | ||
67 | + ignoreOrthRule = tagRule + Suppress('>') | ||
68 | + parenRule = Suppress('(') + rule + Suppress(')') | ||
69 | + atomicRule = tagRule ^ ignoreOrthRule ^ parenRule | ||
70 | + zeroOrMoreRule = atomicRule + Suppress('*') | ||
71 | + oneOrMoreRule = atomicRule + Suppress('+') | ||
72 | + unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule | ||
73 | + oneOfRule = delimitedList(unaryRule, delim='|') | ||
74 | + complexRule = unaryRule ^ oneOfRule | ||
75 | + concatRule = OneOrMore(complexRule) | ||
76 | + rule << concatRule | ||
77 | +# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule | ||
78 | + | ||
79 | +# tagRule.setParseAction(lambda s,l,toks: doprint(toks)) | ||
80 | +# print lineNum, line | ||
81 | + parsedLine = rule.parseString(line, parseAll=True) | ||
82 | +# print parsedLine |
fsabuilder/morfeuszbuilder/segrules/segsfsa.py
@@ -14,7 +14,7 @@ class SegmentsFSAState(object): | @@ -14,7 +14,7 @@ class SegmentsFSAState(object): | ||
14 | 14 | ||
15 | class SegmentsFSA(object): | 15 | class SegmentsFSA(object): |
16 | 16 | ||
17 | - def __init__(self): | 17 | + def __init__(self, key2Def={}): |
18 | self.initialState = SegmentsFSAState() | 18 | self.initialState = SegmentsFSAState() |
19 | 19 | ||
20 | def addSegmentRule(self, segmentRule): | 20 | def addSegmentRule(self, segmentRule): |
@@ -23,3 +23,5 @@ class SegmentsFSA(object): | @@ -23,3 +23,5 @@ class SegmentsFSA(object): | ||
23 | def serialize(self): | 23 | def serialize(self): |
24 | res = bytearray() | 24 | res = bytearray() |
25 | return res | 25 | return res |
26 | + | ||
27 | + | ||
26 | \ No newline at end of file | 28 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test.py
@@ -4,7 +4,7 @@ Created on 24 sty 2014 | @@ -4,7 +4,7 @@ Created on 24 sty 2014 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | -import preprocessor | 7 | +from morfeuszbuilder.segrules import preprocessor |
8 | 8 | ||
9 | if __name__ == '__main__': | 9 | if __name__ == '__main__': |
10 | text = ''' | 10 | text = ''' |
@@ -13,8 +13,8 @@ dupa | @@ -13,8 +13,8 @@ dupa | ||
13 | #define X(x) a x b | 13 | #define X(x) a x b |
14 | #define Y(x) X(x) c | 14 | #define Y(x) X(x) c |
15 | #define B(x) X(x) | 15 | #define B(x) X(x) |
16 | -#define Z(x) Y(X(x)) d | ||
17 | -#define AB(asd) dupa asd dupa | 16 | +#define Z(x) Y( X(x) jhg) d |
17 | +#define A_B(asd) dupa asd dupa asfda_asdfa | ||
18 | Y(Z(a) b X(c) Y(d)) | 18 | Y(Z(a) b X(c) Y(d)) |
19 | #ifdef extra | 19 | #ifdef extra |
20 | asdfasa | 20 | asdfasa |
@@ -30,7 +30,7 @@ aaaa asd | @@ -30,7 +30,7 @@ aaaa asd | ||
30 | asdfasdfada | 30 | asdfasdfada |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | -AB(x) | 33 | +A_B( (x)+ x) |
34 | ''' | 34 | ''' |
35 | - for line in preprocessor.preprocess(text.split('\n'), ['extra', 'superextra']): | 35 | + for line in preprocessor.preprocess(enumerate(text.split('\n')), ['extra', 'superextra']): |
36 | print line | 36 | print line |
37 | \ No newline at end of file | 37 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test/__init__.py
0 → 100644
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
0 → 100644
1 | +''' | ||
2 | +Created on 18 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | +import unittest | ||
7 | +import os | ||
8 | +from morfeuszbuilder.segrules import rulesParser | ||
9 | +from morfeuszbuilder.tagset import tagset | ||
10 | + | ||
11 | +class Test(unittest.TestCase): | ||
12 | + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | ||
13 | + parser = rulesParser.RulesParser(t) | ||
14 | + parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | ||
15 | + | ||
16 | +if __name__ == "__main__": | ||
17 | + unittest.main() | ||
18 | +# testParser() | ||
0 | \ No newline at end of file | 19 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test/polimorf.tagset
0 → 100644
1 | +#!MORFEUSZ-TAGSET 0.1 | ||
2 | + | ||
3 | +[TAGS] | ||
4 | + | ||
5 | +0 adj:pl:acc:m1.p1:com | ||
6 | +1 adj:pl:acc:m1.p1:pos | ||
7 | +2 adj:pl:acc:m1.p1:sup | ||
8 | +3 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com | ||
9 | +4 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos | ||
10 | +5 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup | ||
11 | +6 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com | ||
12 | +7 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos | ||
13 | +8 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup | ||
14 | +9 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com | ||
15 | +10 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos | ||
16 | +11 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup | ||
17 | +12 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com | ||
18 | +13 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos | ||
19 | +14 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup | ||
20 | +15 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com | ||
21 | +16 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos | ||
22 | +17 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup | ||
23 | +18 adj:pl:nom.voc:m1.p1:com | ||
24 | +19 adj:pl:nom.voc:m1.p1:pos | ||
25 | +20 adj:pl:nom.voc:m1.p1:sup | ||
26 | +21 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com | ||
27 | +22 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos | ||
28 | +23 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup | ||
29 | +24 adj:pl:nom:m1.p1:pos | ||
30 | +25 adj:pl:nom:m2.m3.f.n1.n2.p2.p3:pos | ||
31 | +26 adj:sg:acc:f:com | ||
32 | +27 adj:sg:acc:f:pos | ||
33 | +28 adj:sg:acc:f:sup | ||
34 | +29 adj:sg:acc:m1.m2:com | ||
35 | +30 adj:sg:acc:m1.m2:pos | ||
36 | +31 adj:sg:acc:m1.m2:sup | ||
37 | +32 adj:sg:acc:m3:com | ||
38 | +33 adj:sg:acc:m3:pos | ||
39 | +34 adj:sg:acc:m3:sup | ||
40 | +35 adj:sg:acc:n1.n2:com | ||
41 | +36 adj:sg:acc:n1.n2:pos | ||
42 | +37 adj:sg:acc:n1.n2:sup | ||
43 | +38 adj:sg:dat:f:com | ||
44 | +39 adj:sg:dat:f:pos | ||
45 | +40 adj:sg:dat:f:sup | ||
46 | +41 adj:sg:dat:m1.m2.m3.n1.n2:com | ||
47 | +42 adj:sg:dat:m1.m2.m3.n1.n2:pos | ||
48 | +43 adj:sg:dat:m1.m2.m3.n1.n2:sup | ||
49 | +44 adj:sg:gen:f:com | ||
50 | +45 adj:sg:gen:f:pos | ||
51 | +46 adj:sg:gen:f:sup | ||
52 | +47 adj:sg:gen:m1.m2.m3.n1.n2:com | ||
53 | +48 adj:sg:gen:m1.m2.m3.n1.n2:pos | ||
54 | +49 adj:sg:gen:m1.m2.m3.n1.n2:sup | ||
55 | +50 adj:sg:inst:f:com | ||
56 | +51 adj:sg:inst:f:pos | ||
57 | +52 adj:sg:inst:f:sup | ||
58 | +53 adj:sg:inst:m1.m2.m3.n1.n2:com | ||
59 | +54 adj:sg:inst:m1.m2.m3.n1.n2:pos | ||
60 | +55 adj:sg:inst:m1.m2.m3.n1.n2:sup | ||
61 | +56 adj:sg:loc:f:com | ||
62 | +57 adj:sg:loc:f:pos | ||
63 | +58 adj:sg:loc:f:sup | ||
64 | +59 adj:sg:loc:m1.m2.m3.n1.n2:com | ||
65 | +60 adj:sg:loc:m1.m2.m3.n1.n2:pos | ||
66 | +61 adj:sg:loc:m1.m2.m3.n1.n2:sup | ||
67 | +62 adj:sg:nom.voc:f:com | ||
68 | +63 adj:sg:nom.voc:f:pos | ||
69 | +64 adj:sg:nom.voc:f:sup | ||
70 | +65 adj:sg:nom.voc:m1.m2.m3:com | ||
71 | +66 adj:sg:nom.voc:m1.m2.m3:pos | ||
72 | +67 adj:sg:nom.voc:m1.m2.m3:sup | ||
73 | +68 adj:sg:nom.voc:n1.n2:com | ||
74 | +69 adj:sg:nom.voc:n1.n2:pos | ||
75 | +70 adj:sg:nom.voc:n1.n2:sup | ||
76 | +71 adj:sg:nom:f:pos | ||
77 | +72 adj:sg:nom:m1.m2.m3:pos | ||
78 | +73 adj:sg:nom:n1.n2:pos | ||
79 | +74 adja | ||
80 | +75 adjc | ||
81 | +76 adjp | ||
82 | +77 adv | ||
83 | +78 adv:com | ||
84 | +79 adv:pos | ||
85 | +80 adv:sup | ||
86 | +81 aglt:pl:pri:imperf:nwok | ||
87 | +82 aglt:pl:pri:imperf:wok | ||
88 | +83 aglt:pl:sec:imperf:nwok | ||
89 | +84 aglt:pl:sec:imperf:wok | ||
90 | +85 aglt:sg:pri:imperf:nwok | ||
91 | +86 aglt:sg:pri:imperf:wok | ||
92 | +87 aglt:sg:sec:imperf:nwok | ||
93 | +88 aglt:sg:sec:imperf:wok | ||
94 | +89 bedzie:pl:pri:imperf | ||
95 | +90 bedzie:pl:sec:imperf | ||
96 | +91 bedzie:pl:ter:imperf | ||
97 | +92 bedzie:sg:pri:imperf | ||
98 | +93 bedzie:sg:sec:imperf | ||
99 | +94 bedzie:sg:ter:imperf | ||
100 | +95 burk | ||
101 | +96 comp | ||
102 | +97 conj | ||
103 | +98 depr:pl:nom:m2 | ||
104 | +99 depr:pl:voc:m2 | ||
105 | +100 fin:pl:pri:imperf | ||
106 | +101 fin:pl:pri:imperf.perf | ||
107 | +102 fin:pl:pri:perf | ||
108 | +103 fin:pl:sec:imperf | ||
109 | +104 fin:pl:sec:imperf.perf | ||
110 | +105 fin:pl:sec:perf | ||
111 | +106 fin:pl:ter:imperf | ||
112 | +107 fin:pl:ter:imperf.perf | ||
113 | +108 fin:pl:ter:perf | ||
114 | +109 fin:sg:pri:imperf | ||
115 | +110 fin:sg:pri:imperf.perf | ||
116 | +111 fin:sg:pri:perf | ||
117 | +112 fin:sg:sec:imperf | ||
118 | +113 fin:sg:sec:imperf.perf | ||
119 | +114 fin:sg:sec:perf | ||
120 | +115 fin:sg:ter:imperf | ||
121 | +116 fin:sg:ter:imperf.perf | ||
122 | +117 fin:sg:ter:perf | ||
123 | +118 ger:sg:dat.loc:n2:imperf.perf:aff | ||
124 | +119 ger:sg:dat.loc:n2:imperf.perf:neg | ||
125 | +120 ger:sg:dat.loc:n2:imperf:aff | ||
126 | +121 ger:sg:dat.loc:n2:imperf:neg | ||
127 | +122 ger:sg:dat.loc:n2:perf:aff | ||
128 | +123 ger:sg:dat.loc:n2:perf:neg | ||
129 | +124 ger:sg:gen:n2:imperf.perf:aff | ||
130 | +125 ger:sg:gen:n2:imperf.perf:neg | ||
131 | +126 ger:sg:gen:n2:imperf:aff | ||
132 | +127 ger:sg:gen:n2:imperf:neg | ||
133 | +128 ger:sg:gen:n2:perf:aff | ||
134 | +129 ger:sg:gen:n2:perf:neg | ||
135 | +130 ger:sg:inst:n2:imperf.perf:aff | ||
136 | +131 ger:sg:inst:n2:imperf.perf:neg | ||
137 | +132 ger:sg:inst:n2:imperf:aff | ||
138 | +133 ger:sg:inst:n2:imperf:neg | ||
139 | +134 ger:sg:inst:n2:perf:aff | ||
140 | +135 ger:sg:inst:n2:perf:neg | ||
141 | +136 ger:sg:nom.acc:n2:imperf.perf:aff | ||
142 | +137 ger:sg:nom.acc:n2:imperf.perf:neg | ||
143 | +138 ger:sg:nom.acc:n2:imperf:aff | ||
144 | +139 ger:sg:nom.acc:n2:imperf:neg | ||
145 | +140 ger:sg:nom.acc:n2:perf:aff | ||
146 | +141 ger:sg:nom.acc:n2:perf:neg | ||
147 | +142 imps:imperf | ||
148 | +143 imps:imperf.perf | ||
149 | +144 imps:perf | ||
150 | +145 impt:pl:pri:imperf | ||
151 | +146 impt:pl:pri:imperf.perf | ||
152 | +147 impt:pl:pri:perf | ||
153 | +148 impt:pl:sec:imperf | ||
154 | +149 impt:pl:sec:imperf.perf | ||
155 | +150 impt:pl:sec:perf | ||
156 | +151 impt:sg:sec:imperf | ||
157 | +152 impt:sg:sec:imperf.perf | ||
158 | +153 impt:sg:sec:perf | ||
159 | +154 inf:imperf | ||
160 | +155 inf:imperf.perf | ||
161 | +156 inf:perf | ||
162 | +157 interj | ||
163 | +158 num:comp | ||
164 | +159 num:pl:acc:m1:rec | ||
165 | +160 num:pl:dat.loc:n1.p1.p2:congr.rec | ||
166 | +161 num:pl:dat:m1.m2.m3.n2.f:congr | ||
167 | +162 num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr | ||
168 | +163 num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr | ||
169 | +164 num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr | ||
170 | +165 num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr | ||
171 | +166 num:pl:gen.loc:m1.m2.m3.n2.f:congr | ||
172 | +167 num:pl:gen:n1.p1.p2:rec | ||
173 | +168 num:pl:inst:f:congr | ||
174 | +169 num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr | ||
175 | +170 num:pl:inst:m1.m2.m3.f.n2:congr | ||
176 | +171 num:pl:inst:m1.m2.m3.n2.f:congr | ||
177 | +172 num:pl:inst:m1.m2.m3.n2:congr | ||
178 | +173 num:pl:inst:n1.p1.p2:rec | ||
179 | +174 num:pl:nom.acc.voc:f:congr | ||
180 | +175 num:pl:nom.acc.voc:m1:rec | ||
181 | +176 num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec | ||
182 | +177 num:pl:nom.acc.voc:m2.m3.f.n2:rec | ||
183 | +178 num:pl:nom.acc.voc:m2.m3.n2.f:congr | ||
184 | +179 num:pl:nom.acc.voc:m2.m3.n2:congr | ||
185 | +180 num:pl:nom.acc.voc:n1.p1.p2:rec | ||
186 | +181 num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec | ||
187 | +182 num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec | ||
188 | +183 num:pl:nom.voc:m1:congr | ||
189 | +184 num:pl:nom.voc:m1:rec | ||
190 | +185 num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec | ||
191 | +186 num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec | ||
192 | +187 pact:pl:acc:m1.p1:imperf.perf:aff | ||
193 | +188 pact:pl:acc:m1.p1:imperf.perf:neg | ||
194 | +189 pact:pl:acc:m1.p1:imperf:aff | ||
195 | +190 pact:pl:acc:m1.p1:imperf:neg | ||
196 | +191 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
197 | +192 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
198 | +193 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
199 | +194 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
200 | +195 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
201 | +196 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
202 | +197 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
203 | +198 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
204 | +199 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
205 | +200 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
206 | +201 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
207 | +202 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
208 | +203 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff | ||
209 | +204 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg | ||
210 | +205 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff | ||
211 | +206 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg | ||
212 | +207 pact:pl:nom.voc:m1.p1:imperf.perf:aff | ||
213 | +208 pact:pl:nom.voc:m1.p1:imperf.perf:neg | ||
214 | +209 pact:pl:nom.voc:m1.p1:imperf:aff | ||
215 | +210 pact:pl:nom.voc:m1.p1:imperf:neg | ||
216 | +211 pact:sg:acc.inst:f:imperf.perf:aff | ||
217 | +212 pact:sg:acc.inst:f:imperf.perf:neg | ||
218 | +213 pact:sg:acc.inst:f:imperf:aff | ||
219 | +214 pact:sg:acc.inst:f:imperf:neg | ||
220 | +215 pact:sg:acc:m1.m2:imperf.perf:aff | ||
221 | +216 pact:sg:acc:m1.m2:imperf.perf:neg | ||
222 | +217 pact:sg:acc:m1.m2:imperf:aff | ||
223 | +218 pact:sg:acc:m1.m2:imperf:neg | ||
224 | +219 pact:sg:acc:m3:imperf.perf:aff | ||
225 | +220 pact:sg:acc:m3:imperf.perf:neg | ||
226 | +221 pact:sg:acc:m3:imperf:aff | ||
227 | +222 pact:sg:acc:m3:imperf:neg | ||
228 | +223 pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff | ||
229 | +224 pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg | ||
230 | +225 pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff | ||
231 | +226 pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg | ||
232 | +227 pact:sg:gen.dat.loc:f:imperf.perf:aff | ||
233 | +228 pact:sg:gen.dat.loc:f:imperf.perf:neg | ||
234 | +229 pact:sg:gen.dat.loc:f:imperf:aff | ||
235 | +230 pact:sg:gen.dat.loc:f:imperf:neg | ||
236 | +231 pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff | ||
237 | +232 pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg | ||
238 | +233 pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff | ||
239 | +234 pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg | ||
240 | +235 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff | ||
241 | +236 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg | ||
242 | +237 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff | ||
243 | +238 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg | ||
244 | +239 pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff | ||
245 | +240 pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg | ||
246 | +241 pact:sg:nom.acc.voc:n1.n2:imperf:aff | ||
247 | +242 pact:sg:nom.acc.voc:n1.n2:imperf:neg | ||
248 | +243 pact:sg:nom.voc:f:imperf.perf:aff | ||
249 | +244 pact:sg:nom.voc:f:imperf.perf:neg | ||
250 | +245 pact:sg:nom.voc:f:imperf:aff | ||
251 | +246 pact:sg:nom.voc:f:imperf:neg | ||
252 | +247 pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff | ||
253 | +248 pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg | ||
254 | +249 pact:sg:nom.voc:m1.m2.m3:imperf:aff | ||
255 | +250 pact:sg:nom.voc:m1.m2.m3:imperf:neg | ||
256 | +251 pant:perf | ||
257 | +252 pcon:imperf | ||
258 | +253 ppas:pl:acc:m1.p1:imperf.perf:aff | ||
259 | +254 ppas:pl:acc:m1.p1:imperf.perf:neg | ||
260 | +255 ppas:pl:acc:m1.p1:imperf:aff | ||
261 | +256 ppas:pl:acc:m1.p1:imperf:neg | ||
262 | +257 ppas:pl:acc:m1.p1:perf:aff | ||
263 | +258 ppas:pl:acc:m1.p1:perf:neg | ||
264 | +259 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
265 | +260 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
266 | +261 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
267 | +262 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
268 | +263 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff | ||
269 | +264 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg | ||
270 | +265 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
271 | +266 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
272 | +267 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
273 | +268 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
274 | +269 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff | ||
275 | +270 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg | ||
276 | +271 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff | ||
277 | +272 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg | ||
278 | +273 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff | ||
279 | +274 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg | ||
280 | +275 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff | ||
281 | +276 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg | ||
282 | +277 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff | ||
283 | +278 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg | ||
284 | +279 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff | ||
285 | +280 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg | ||
286 | +281 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff | ||
287 | +282 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg | ||
288 | +283 ppas:pl:nom.voc:m1.p1:imperf.perf:aff | ||
289 | +284 ppas:pl:nom.voc:m1.p1:imperf.perf:neg | ||
290 | +285 ppas:pl:nom.voc:m1.p1:imperf:aff | ||
291 | +286 ppas:pl:nom.voc:m1.p1:imperf:neg | ||
292 | +287 ppas:pl:nom.voc:m1.p1:perf:aff | ||
293 | +288 ppas:pl:nom.voc:m1.p1:perf:neg | ||
294 | +289 ppas:sg:acc.inst:f:imperf.perf:aff | ||
295 | +290 ppas:sg:acc.inst:f:imperf.perf:neg | ||
296 | +291 ppas:sg:acc.inst:f:imperf:aff | ||
297 | +292 ppas:sg:acc.inst:f:imperf:neg | ||
298 | +293 ppas:sg:acc.inst:f:perf:aff | ||
299 | +294 ppas:sg:acc.inst:f:perf:neg | ||
300 | +295 ppas:sg:acc:m1.m2:imperf.perf:aff | ||
301 | +296 ppas:sg:acc:m1.m2:imperf.perf:neg | ||
302 | +297 ppas:sg:acc:m1.m2:imperf:aff | ||
303 | +298 ppas:sg:acc:m1.m2:imperf:neg | ||
304 | +299 ppas:sg:acc:m1.m2:perf:aff | ||
305 | +300 ppas:sg:acc:m1.m2:perf:neg | ||
306 | +301 ppas:sg:acc:m3:imperf.perf:aff | ||
307 | +302 ppas:sg:acc:m3:imperf.perf:neg | ||
308 | +303 ppas:sg:acc:m3:imperf:aff | ||
309 | +304 ppas:sg:acc:m3:imperf:neg | ||
310 | +305 ppas:sg:acc:m3:perf:aff | ||
311 | +306 ppas:sg:acc:m3:perf:neg | ||
312 | +307 ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff | ||
313 | +308 ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg | ||
314 | +309 ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff | ||
315 | +310 ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg | ||
316 | +311 ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff | ||
317 | +312 ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg | ||
318 | +313 ppas:sg:gen.dat.loc:f:imperf.perf:aff | ||
319 | +314 ppas:sg:gen.dat.loc:f:imperf.perf:neg | ||
320 | +315 ppas:sg:gen.dat.loc:f:imperf:aff | ||
321 | +316 ppas:sg:gen.dat.loc:f:imperf:neg | ||
322 | +317 ppas:sg:gen.dat.loc:f:perf:aff | ||
323 | +318 ppas:sg:gen.dat.loc:f:perf:neg | ||
324 | +319 ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff | ||
325 | +320 ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg | ||
326 | +321 ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff | ||
327 | +322 ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg | ||
328 | +323 ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff | ||
329 | +324 ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg | ||
330 | +325 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff | ||
331 | +326 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg | ||
332 | +327 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff | ||
333 | +328 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg | ||
334 | +329 ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff | ||
335 | +330 ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg | ||
336 | +331 ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff | ||
337 | +332 ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg | ||
338 | +333 ppas:sg:nom.acc.voc:n1.n2:imperf:aff | ||
339 | +334 ppas:sg:nom.acc.voc:n1.n2:imperf:neg | ||
340 | +335 ppas:sg:nom.acc.voc:n1.n2:perf:aff | ||
341 | +336 ppas:sg:nom.acc.voc:n1.n2:perf:neg | ||
342 | +337 ppas:sg:nom.voc:f:imperf.perf:aff | ||
343 | +338 ppas:sg:nom.voc:f:imperf.perf:neg | ||
344 | +339 ppas:sg:nom.voc:f:imperf:aff | ||
345 | +340 ppas:sg:nom.voc:f:imperf:neg | ||
346 | +341 ppas:sg:nom.voc:f:perf:aff | ||
347 | +342 ppas:sg:nom.voc:f:perf:neg | ||
348 | +343 ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff | ||
349 | +344 ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg | ||
350 | +345 ppas:sg:nom.voc:m1.m2.m3:imperf:aff | ||
351 | +346 ppas:sg:nom.voc:m1.m2.m3:imperf:neg | ||
352 | +347 ppas:sg:nom.voc:m1.m2.m3:perf:aff | ||
353 | +348 ppas:sg:nom.voc:m1.m2.m3:perf:neg | ||
354 | +349 ppron12:pl:acc:_:pri | ||
355 | +350 ppron12:pl:acc:_:sec | ||
356 | +351 ppron12:pl:dat:_:pri | ||
357 | +352 ppron12:pl:dat:_:sec | ||
358 | +353 ppron12:pl:gen:_:pri | ||
359 | +354 ppron12:pl:gen:_:sec | ||
360 | +355 ppron12:pl:inst:_:pri | ||
361 | +356 ppron12:pl:inst:_:sec | ||
362 | +357 ppron12:pl:loc:_:pri | ||
363 | +358 ppron12:pl:loc:_:sec | ||
364 | +359 ppron12:pl:nom:_:pri | ||
365 | +360 ppron12:pl:nom:_:sec | ||
366 | +361 ppron12:pl:voc:_:pri | ||
367 | +362 ppron12:pl:voc:_:sec | ||
368 | +363 ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc | ||
369 | +364 ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc | ||
370 | +365 ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc | ||
371 | +366 ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc | ||
372 | +367 ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc | ||
373 | +368 ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc | ||
374 | +369 ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc | ||
375 | +370 ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc | ||
376 | +371 ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc | ||
377 | +372 ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc | ||
378 | +373 ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc | ||
379 | +374 ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc | ||
380 | +375 ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri | ||
381 | +376 ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec | ||
382 | +377 ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri | ||
383 | +378 ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec | ||
384 | +379 ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri | ||
385 | +380 ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec | ||
386 | +381 ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri | ||
387 | +382 ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec | ||
388 | +383 ppron3:pl:acc:m1.p1:ter:_:npraep | ||
389 | +384 ppron3:pl:acc:m1.p1:ter:_:praep | ||
390 | +385 ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:npraep | ||
391 | +386 ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:praep | ||
392 | +387 ppron3:pl:dat:_:ter:_:npraep | ||
393 | +388 ppron3:pl:dat:_:ter:_:praep | ||
394 | +389 ppron3:pl:gen:_:ter:_:npraep | ||
395 | +390 ppron3:pl:gen:_:ter:_:praep | ||
396 | +391 ppron3:pl:inst:_:ter:_:_ | ||
397 | +392 ppron3:pl:loc:_:ter:_:_ | ||
398 | +393 ppron3:pl:nom:m1.p1:ter:_:_ | ||
399 | +394 ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:_:_ | ||
400 | +395 ppron3:sg:acc:f:ter:_:npraep | ||
401 | +396 ppron3:sg:acc:f:ter:_:praep | ||
402 | +397 ppron3:sg:acc:m1.m2.m3:ter:akc:npraep | ||
403 | +398 ppron3:sg:acc:m1.m2.m3:ter:akc:praep | ||
404 | +399 ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep | ||
405 | +400 ppron3:sg:acc:m1.m2.m3:ter:nakc:praep | ||
406 | +401 ppron3:sg:acc:n1.n2:ter:_:npraep | ||
407 | +402 ppron3:sg:acc:n1.n2:ter:_:praep | ||
408 | +403 ppron3:sg:dat:f:ter:_:npraep | ||
409 | +404 ppron3:sg:dat:f:ter:_:praep | ||
410 | +405 ppron3:sg:dat:m1.m2.m3:ter:_:praep | ||
411 | +406 ppron3:sg:dat:m1.m2.m3:ter:akc:npraep | ||
412 | +407 ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep | ||
413 | +408 ppron3:sg:dat:n1.n2:ter:_:praep | ||
414 | +409 ppron3:sg:dat:n1.n2:ter:akc:npraep | ||
415 | +410 ppron3:sg:dat:n1.n2:ter:nakc:npraep | ||
416 | +411 ppron3:sg:gen:f:ter:_:npraep | ||
417 | +412 ppron3:sg:gen:f:ter:_:praep | ||
418 | +413 ppron3:sg:gen:m1.m2.m3:ter:akc:npraep | ||
419 | +414 ppron3:sg:gen:m1.m2.m3:ter:akc:praep | ||
420 | +415 ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep | ||
421 | +416 ppron3:sg:gen:m1.m2.m3:ter:nakc:praep | ||
422 | +417 ppron3:sg:gen:n1.n2:ter:_:praep | ||
423 | +418 ppron3:sg:gen:n1.n2:ter:akc:npraep | ||
424 | +419 ppron3:sg:gen:n1.n2:ter:nakc:npraep | ||
425 | +420 ppron3:sg:inst:f:ter:_:praep | ||
426 | +421 ppron3:sg:inst:m1.m2.m3:ter:_:_ | ||
427 | +422 ppron3:sg:inst:n1.n2:ter:_:_ | ||
428 | +423 ppron3:sg:loc:f:ter:_:_ | ||
429 | +424 ppron3:sg:loc:m1.m2.m3:ter:_:_ | ||
430 | +425 ppron3:sg:loc:n1.n2:ter:_:_ | ||
431 | +426 ppron3:sg:nom:f:ter:_:_ | ||
432 | +427 ppron3:sg:nom:m1.m2.m3:ter:_:_ | ||
433 | +428 ppron3:sg:nom:n1.n2:ter:_:_ | ||
434 | +429 praet:pl:m1.p1:imperf | ||
435 | +430 praet:pl:m1.p1:imperf.perf | ||
436 | +431 praet:pl:m1.p1:perf | ||
437 | +432 praet:pl:m2.m3.f.n1.n2.p2.p3:imperf | ||
438 | +433 praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf | ||
439 | +434 praet:pl:m2.m3.f.n1.n2.p2.p3:perf | ||
440 | +435 praet:sg:f:imperf | ||
441 | +436 praet:sg:f:imperf.perf | ||
442 | +437 praet:sg:f:perf | ||
443 | +438 praet:sg:m1.m2.m3:imperf | ||
444 | +439 praet:sg:m1.m2.m3:imperf.perf | ||
445 | +440 praet:sg:m1.m2.m3:imperf:agl | ||
446 | +441 praet:sg:m1.m2.m3:imperf:nagl | ||
447 | +442 praet:sg:m1.m2.m3:perf | ||
448 | +443 praet:sg:m1.m2.m3:perf:agl | ||
449 | +444 praet:sg:m1.m2.m3:perf:nagl | ||
450 | +445 praet:sg:n1.n2:imperf | ||
451 | +446 praet:sg:n1.n2:imperf.perf | ||
452 | +447 praet:sg:n1.n2:perf | ||
453 | +448 pred | ||
454 | +449 prep:acc | ||
455 | +450 prep:acc:nwok | ||
456 | +451 prep:acc:wok | ||
457 | +452 prep:dat | ||
458 | +453 prep:gen | ||
459 | +454 prep:gen:nwok | ||
460 | +455 prep:gen:wok | ||
461 | +456 prep:inst | ||
462 | +457 prep:inst:nwok | ||
463 | +458 prep:inst:wok | ||
464 | +459 prep:loc | ||
465 | +460 prep:loc:nwok | ||
466 | +461 prep:loc:wok | ||
467 | +462 prep:nom | ||
468 | +463 qub | ||
469 | +464 subst:pl:acc:f | ||
470 | +465 subst:pl:acc:m1 | ||
471 | +466 subst:pl:acc:m2 | ||
472 | +467 subst:pl:acc:m3 | ||
473 | +468 subst:pl:acc:n1 | ||
474 | +469 subst:pl:acc:n2 | ||
475 | +470 subst:pl:acc:p1 | ||
476 | +471 subst:pl:acc:p2 | ||
477 | +472 subst:pl:acc:p3 | ||
478 | +473 subst:pl:dat:f | ||
479 | +474 subst:pl:dat:m1 | ||
480 | +475 subst:pl:dat:m2 | ||
481 | +476 subst:pl:dat:m3 | ||
482 | +477 subst:pl:dat:n1 | ||
483 | +478 subst:pl:dat:n2 | ||
484 | +479 subst:pl:dat:p1 | ||
485 | +480 subst:pl:dat:p2 | ||
486 | +481 subst:pl:dat:p3 | ||
487 | +482 subst:pl:gen:f | ||
488 | +483 subst:pl:gen:m1 | ||
489 | +484 subst:pl:gen:m2 | ||
490 | +485 subst:pl:gen:m3 | ||
491 | +486 subst:pl:gen:n1 | ||
492 | +487 subst:pl:gen:n2 | ||
493 | +488 subst:pl:gen:p1 | ||
494 | +489 subst:pl:gen:p2 | ||
495 | +490 subst:pl:gen:p3 | ||
496 | +491 subst:pl:inst:f | ||
497 | +492 subst:pl:inst:m1 | ||
498 | +493 subst:pl:inst:m2 | ||
499 | +494 subst:pl:inst:m3 | ||
500 | +495 subst:pl:inst:n1 | ||
501 | +496 subst:pl:inst:n2 | ||
502 | +497 subst:pl:inst:p1 | ||
503 | +498 subst:pl:inst:p2 | ||
504 | +499 subst:pl:inst:p3 | ||
505 | +500 subst:pl:loc:f | ||
506 | +501 subst:pl:loc:m1 | ||
507 | +502 subst:pl:loc:m2 | ||
508 | +503 subst:pl:loc:m3 | ||
509 | +504 subst:pl:loc:n1 | ||
510 | +505 subst:pl:loc:n2 | ||
511 | +506 subst:pl:loc:p1 | ||
512 | +507 subst:pl:loc:p2 | ||
513 | +508 subst:pl:loc:p3 | ||
514 | +509 subst:pl:nom:f | ||
515 | +510 subst:pl:nom:m1 | ||
516 | +511 subst:pl:nom:m2 | ||
517 | +512 subst:pl:nom:m3 | ||
518 | +513 subst:pl:nom:n1 | ||
519 | +514 subst:pl:nom:n2 | ||
520 | +515 subst:pl:nom:p1 | ||
521 | +516 subst:pl:nom:p2 | ||
522 | +517 subst:pl:nom:p3 | ||
523 | +518 subst:pl:voc:f | ||
524 | +519 subst:pl:voc:m1 | ||
525 | +520 subst:pl:voc:m2 | ||
526 | +521 subst:pl:voc:m3 | ||
527 | +522 subst:pl:voc:n1 | ||
528 | +523 subst:pl:voc:n2 | ||
529 | +524 subst:pl:voc:p1 | ||
530 | +525 subst:pl:voc:p2 | ||
531 | +526 subst:pl:voc:p3 | ||
532 | +527 subst:sg:acc:f | ||
533 | +528 subst:sg:acc:m1 | ||
534 | +529 subst:sg:acc:m2 | ||
535 | +530 subst:sg:acc:m3 | ||
536 | +531 subst:sg:acc:n1 | ||
537 | +532 subst:sg:acc:n2 | ||
538 | +533 subst:sg:dat:f | ||
539 | +534 subst:sg:dat:m1 | ||
540 | +535 subst:sg:dat:m2 | ||
541 | +536 subst:sg:dat:m3 | ||
542 | +537 subst:sg:dat:n1 | ||
543 | +538 subst:sg:dat:n2 | ||
544 | +539 subst:sg:gen:f | ||
545 | +540 subst:sg:gen:m1 | ||
546 | +541 subst:sg:gen:m2 | ||
547 | +542 subst:sg:gen:m3 | ||
548 | +543 subst:sg:gen:n1 | ||
549 | +544 subst:sg:gen:n2 | ||
550 | +545 subst:sg:inst:f | ||
551 | +546 subst:sg:inst:m1 | ||
552 | +547 subst:sg:inst:m2 | ||
553 | +548 subst:sg:inst:m3 | ||
554 | +549 subst:sg:inst:n1 | ||
555 | +550 subst:sg:inst:n2 | ||
556 | +551 subst:sg:loc:f | ||
557 | +552 subst:sg:loc:m1 | ||
558 | +553 subst:sg:loc:m2 | ||
559 | +554 subst:sg:loc:m3 | ||
560 | +555 subst:sg:loc:n1 | ||
561 | +556 subst:sg:loc:n2 | ||
562 | +557 subst:sg:nom:f | ||
563 | +558 subst:sg:nom:m1 | ||
564 | +559 subst:sg:nom:m2 | ||
565 | +560 subst:sg:nom:m3 | ||
566 | +561 subst:sg:nom:n1 | ||
567 | +562 subst:sg:nom:n2 | ||
568 | +563 subst:sg:voc:f | ||
569 | +564 subst:sg:voc:m1 | ||
570 | +565 subst:sg:voc:m2 | ||
571 | +566 subst:sg:voc:m3 | ||
572 | +567 subst:sg:voc:n1 | ||
573 | +568 subst:sg:voc:n2 | ||
574 | +569 winien:pl:m1.p1:imperf | ||
575 | +570 winien:pl:m2.m3.f.n1.n2.p2.p3:imperf | ||
576 | +571 winien:sg:f:imperf | ||
577 | +572 winien:sg:m1.m2.m3:imperf | ||
578 | +573 winien:sg:n1.n2:imperf | ||
579 | + | ||
580 | +[NAMES] | ||
581 | + | ||
582 | +0 | ||
583 | +1 etnonim | ||
584 | +2 geograficzna | ||
585 | +3 imię | ||
586 | +4 nazwisko | ||
587 | +5 określenie dodatkowe | ||
588 | +6 organizacja | ||
589 | +7 osoba | ||
590 | +8 pospolita | ||
591 | +9 własna | ||
592 | +10 wydarzenie | ||
593 | +11 wytwór | ||
594 | + |
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
0 → 100644
1 | +''' | ||
2 | +Created on 18 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | +import unittest | ||
7 | +import codecs | ||
8 | +import os | ||
9 | + | ||
10 | +from morfeuszbuilder.segrules import preprocessor | ||
11 | +from morfeuszbuilder.utils import configFile | ||
12 | + | ||
13 | + | ||
class Test(unittest.TestCase):
    '''Smoke test for the segmentation-rules preprocessor.'''

    def testPreprocess(self):
        '''Preprocess the "combinations" section of segmenty.dat and print the result.

        There are no assertions; the test only fails if parsing the
        config file or preprocessing raises.
        '''
        testDir = os.path.dirname(__file__)
        segrulesPath = os.path.join(testDir, 'segmenty.dat')
        sectionNames = ['options', 'combinations', 'tags', 'lexemes']
        config = configFile.ConfigFile(segrulesPath, sectionNames)
        combinationLines = config.enumerateLinesInSection('combinations')
        # NOTE(review): the second argument is presumably the list of
        # symbols treated as defined by #ifdef - confirm in preprocessor.
        definedSymbols = ['extra', 'superextra']
        for lineNum, line in preprocessor.preprocess(combinationLines, definedSymbols):
            print (lineNum, line)
23 | + | ||
24 | + | ||
25 | +if __name__ == "__main__": | ||
26 | + #import sys;sys.argv = ['', 'Test.testPreprocess'] | ||
27 | + unittest.main() | ||
0 | \ No newline at end of file | 28 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
0 → 100644
1 | +[options] | ||
2 | +aggl=permissive strict isolated | ||
3 | +praet=split composite | ||
4 | + | ||
5 | +[combinations] | ||
6 | +(dupa|dupa) | ||
7 | +#define wsz_interp (interp|kropka|dywiz)* | ||
8 | + | ||
9 | +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp | ||
10 | + | ||
11 | +# Segmenty występujące samodzielnie: | ||
12 | +# | ||
13 | +# domyślny typ segmentu samodzielnego: | ||
14 | +moze_interp(samodz) | ||
15 | + | ||
16 | +# segment samotny, który nie dopuszcza nawet znaku interpunkcyjnego po | ||
17 | +# sobie | ||
18 | +samotny | ||
19 | + | ||
20 | +# przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: | ||
21 | +moze_interp(praet_sg_na) | ||
22 | + | ||
23 | +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: | ||
24 | +moze_interp(praet_sg) | ||
25 | + | ||
26 | +# przeszlik mnogi, np. „czytali”: | ||
27 | +moze_interp(praet_pl) | ||
28 | + | ||
29 | +# partykuła „by”: | ||
30 | +moze_interp(by) | ||
31 | + | ||
32 | +# inne segmenty, które dopuszczają po sobie aglutynant, | ||
33 | +# np. „powinna”, „czyżby”: | ||
34 | +moze_interp(z_aglt) | ||
35 | + | ||
36 | +# forma przymiotnikowa (dopuszcza adja): | ||
37 | +moze_interp(adj) | ||
38 | + | ||
39 | +# dywiz (jako samodzielny segment jest tylko błędnym użyciem w funkcji | ||
40 | +# myślnika, ale trzeba to dopuścić): | ||
41 | +dywiz | ||
42 | + | ||
43 | +#ifdef isolated | ||
44 | +adja | ||
45 | +#endif | ||
46 | + | ||
47 | + | ||
48 | +# Połączenia z aglutynantami: | ||
49 | +# | ||
50 | +#ifdef split | ||
51 | +# Czas przeszły: | ||
52 | +# np. „gniotł·am” | ||
53 | +moze_interp( praet_sg_agl aglsg ) | ||
54 | +# np. „czytał·em” | ||
55 | +moze_interp(praet_sg aglsg) | ||
56 | +# np. „czytali·ście” | ||
57 | +moze_interp(praet_pl aglpl) | ||
58 | + | ||
59 | +# Tryb warunkowy: | ||
60 | +# np. „gniótł·by” | ||
61 | +moze_interp(praet_sg_na by) | ||
62 | +# np. „czytało·by” | ||
63 | +moze_interp(praet_sg by) | ||
64 | +# np. „gnietli·by” | ||
65 | +moze_interp(praet_pl by) | ||
66 | +# np. „gniótł·by·ś” | ||
67 | +moze_interp(praet_sg_na by aglsg) | ||
68 | +# np. „czytał·by·m” | ||
69 | +moze_interp(praet_sg by aglsg) | ||
70 | +# np. „gnietli·by·śmy” | ||
71 | +moze_interp(praet_pl by aglpl) | ||
72 | +#else | ||
73 | +moze_interp(praetcond) | ||
74 | +#endif | ||
75 | +# np. „by·ś” | ||
76 | +moze_interp(by aglsg) | ||
77 | +# np. „by·ście” | ||
78 | +moze_interp(by aglpl) | ||
79 | + | ||
80 | +# np. „gdyby·m” | ||
81 | +moze_interp(z_aglt aglsg) | ||
82 | +# np. „gdyby·ście” | ||
83 | +moze_interp(z_aglt aglpl) | ||
84 | + | ||
85 | +# To jest dużo za dużo, ale tytułem eksperymentu: | ||
86 | +#ifdef permissive | ||
87 | +moze_interp(samodz aglsg) | ||
88 | +moze_interp(samodz aglpl) | ||
89 | +#endif | ||
90 | + | ||
91 | +# Złożone formy przymiotnikowe | ||
92 | +# np. „biało·-·czerwony” | ||
93 | +moze_interp( (adja dywiz)+ adj ) | ||
94 | +# poniższe załatwione przez + powyżej: | ||
95 | +# # np. „niebiesko·-·biało·-·czerwona” | ||
96 | +# adja dywiz adja dywiz adj interp? | ||
97 | +# # itd. (zatrzymujemy się pragmatycznie na 5 członach) | ||
98 | +# adja dywiz adja dywiz adja dywiz adj interp? | ||
99 | +# adja dywiz adja dywiz adja dywiz adja dywiz adj interp? | ||
100 | + | ||
101 | +# Stopień najwyższy: | ||
102 | +# np. „naj·zieleńszy”, „naj·mądrzej” | ||
103 | +moze_interp( naj> adj_sup ) | ||
104 | + | ||
105 | +# Formy „zanegowane” gerundiów i imiesłowów: | ||
106 | +# np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: | ||
107 | +moze_interp( nie > negat ) | ||
108 | + | ||
109 | +# Przyimki akceptujące krótką formę „-ń” | ||
110 | +moze_interp(z_on_agl) | ||
111 | +# np. „do·ń” | ||
112 | +moze_interp(z_on_agl on_agl) | ||
113 | + | ||
114 | +# Liczba zapisana jako ciąg cyfr: | ||
115 | +moze_interp( dig>* dig ) | ||
116 | + | ||
117 | +# Formacje prefiksalne | ||
118 | +#### trzeba wydzielić odpowiednie samodze! | ||
119 | +# rzeczownikowe i przymiotnikowe | ||
120 | +# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy” | ||
121 | +moze_interp( prefs samodz ) | ||
122 | +# czasownikowe np. „po·nakapywać” | ||
123 | +moze_interp( prefv samodz ) | ||
124 | + | ||
125 | +# Apozycje z dywizem | ||
126 | +# np. „kobieta-prezydent” | ||
127 | +moze_interp( samodz dywiz samodz ) | ||
128 | +# poniższe do sprawdzenia, najwyraźniej obecne w tekstach, skoro wprowadziliśmy: | ||
129 | +# ? | ||
130 | +adj dywiz adj | ||
131 | +# ? | ||
132 | +adj dywiz samodz | ||
133 | +# ? | ||
134 | +samodz dywiz adj | ||
135 | + | ||
136 | + | ||
137 | +[tags] | ||
138 | +naj naj | ||
139 | +nie nie | ||
140 | +prefs prefs | ||
141 | +prefv prefv | ||
142 | +dig dig | ||
143 | +adja adja | ||
144 | +adj adj:%:pos | ||
145 | +adj_sup adj:%:sup | ||
146 | +adj_sup adv:sup | ||
147 | +negat ger:%:neg | ||
148 | +negat pact:%:neg | ||
149 | +negat ppas:%:neg | ||
150 | +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | ||
151 | +z_on_agl prep:% | ||
152 | +samotny brev:pun | ||
153 | +samotny brev:npun | ||
154 | +samotny intrj | ||
155 | +interp interp | ||
156 | +aglsg aglt:sg:% | ||
157 | +aglpl aglt:pl:% | ||
158 | +praetcond cond:% | ||
159 | +praetcond praet:%:pri:% | ||
160 | +praetcond praet:%:sec:% | ||
161 | +praetcond praet:%:ter:% | ||
162 | +praet_sg_agl praet:sg:%:agl | ||
163 | +praet_sg_na praet:sg:%:nagl | ||
164 | +praet_sg praet:sg:% | ||
165 | +praet_pl praet:pl:% | ||
166 | +praet_sg winien:sg:% | ||
167 | +praet_pl winien:pl:% | ||
168 | +samodz % | ||
169 | + | ||
170 | +[lexemes] | ||
171 | +z_aglt aby:comp | ||
172 | +z_aglt bowiem:comp | ||
173 | +by by:qub | ||
174 | +z_aglt by:comp | ||
175 | +z_aglt cóż:subst | ||
176 | +z_aglt czemu:adv | ||
177 | +z_aglt czyżby:qub | ||
178 | +z_aglt choćby:comp | ||
179 | +z_aglt chociażby:comp | ||
180 | +z_aglt dlaczego:adv | ||
181 | +z_aglt dopóki:comp | ||
182 | +z_aglt dopóty:conj | ||
183 | +z_aglt gdyby:comp | ||
184 | +z_aglt gdzie:qub | ||
185 | +z_aglt gdzie:adv | ||
186 | +z_aglt jakby:comp | ||
187 | +z_aglt jakoby:comp | ||
188 | +z_aglt kiedy:adv | ||
189 | +z_aglt kiedy:comp | ||
190 | +z_aglt tylko:qub | ||
191 | +z_aglt żeby:comp | ||
192 | +dywiz -:interp | ||
193 | +kropka .:interp |
fsabuilder/morfeuszbuilder/tagset/__init__.py
0 → 100644
fsabuilder/morfeuszbuilder/tagset/segtypes.py
0 → 100644
1 | +''' | ||
2 | +Created on 17 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | +import re | ||
7 | + | ||
class Segtypes(object):
    '''Assigns numbered segment types to lexemes (lemma + tag pairs).

    Segment types are declared in two sections of a segmentation rules
    file: "tags" (segment type + tag pattern) and "lexemes" (segment
    type + "lemma:pos"). Every distinct segment type name gets the next
    free integer, in order of first appearance.
    '''

    # Whole-string validators. Digits are allowed because tag names such
    # as "ppron3" or "m1" contain them.
    _SEGTYPE_RE = re.compile(r'[a-z0-9_]+$')
    _TAG_PATTERN_RE = re.compile(r'[a-z0-9_\.\:\%]+$')
    # The lemma may be any run of non-blank, non-colon characters, so
    # punctuation lemmas such as "-" or "." are accepted.
    _LEXEME_PATTERN_RE = re.compile(r'([^\s:]+)\:([a-z0-9_]+)$', re.U)
    _HEADER_RE = re.compile(r'\s*\[(.*?)\]\s*(\#.*)?')

    def __init__(self, tagset, segrulesFile):
        '''Store the collaborators and start with no segment types.

        tagset and segrulesFile are only stored; nothing in this class
        reads them yet.
        '''
        self.tagset = tagset
        self.segrulesConfigFile = segrulesFile
        # segment type name -> segment number
        self.segtype2Segnum = {}
        # SegtypePattern objects, in the order they were read
        self.patternsList = []

    def _getHeaderValue(self, line, lineNum):
        '''Return the name inside a "[name]" section header, or None.'''
        m = self._HEADER_RE.match(line)
        return m.group(1) if m else None

    def _validate(self, msg, lineNum, cond):
        '''Raise SegtypesException mentioning lineNum unless cond is truthy.'''
        if not cond:
            raise SegtypesException(u'line %d: %s' % (lineNum, msg))

    def _getSegnum(self, segtype):
        '''Return the number of segtype, allocating the next free one if new.'''
        if segtype not in self.segtype2Segnum:
            self.segtype2Segnum[segtype] = len(self.segtype2Segnum)
        return self.segtype2Segnum[segtype]

    def _iterSectionEntries(self, lines, sectionName):
        '''Yield (lineNum, segtype, pattern) for each entry of a section.

        lines is the whole file as an iterable of strings; line numbers
        are 1-based. Blank lines and "#" comment lines inside the
        section are skipped. The segment type name is validated here.
        '''
        inSection = False
        for lineNum, line in enumerate(lines, start=1):
            header = self._getHeaderValue(line, lineNum)
            if header == sectionName:
                inSection = True
            elif header:
                inSection = False
            elif inSection:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                fields = re.split(r'\t+', line)
                self._validate(
                        u'Expected segment type and pattern separated by a tab',
                        lineNum,
                        len(fields) == 2)
                segtype, pattern = fields
                self._validate(
                        u'Segment type must be a lowercase alphanumeric with optional underscores',
                        lineNum,
                        self._SEGTYPE_RE.match(segtype))
                yield lineNum, segtype, pattern

    def readTags(self, lines):
        '''Read the "tags" section and register one pattern per entry.

        Raises SegtypesException for a malformed entry.
        '''
        for lineNum, segtype, pattern in self._iterSectionEntries(lines, 'tags'):
            self._validate(
                    u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
                    lineNum,
                    self._TAG_PATTERN_RE.match(pattern))
            segnum = self._getSegnum(segtype)
            self.patternsList.append(SegtypePattern(None, pattern, segnum))

    def readLexemes(self, lines):
        '''Read the "lexemes" section and register one pattern per entry.

        Each entry has the form "lemma:pos"; it is registered as a
        pattern bound to that lemma with tag pattern "pos:%".
        Raises SegtypesException for a malformed entry.
        '''
        for lineNum, segtype, pattern in self._iterSectionEntries(lines, 'lexemes'):
            m = self._LEXEME_PATTERN_RE.match(pattern)
            self._validate(
                    u'Pattern must contain lemma and POS',
                    lineNum,
                    m)
            lemma, pos = m.groups()
            segnum = self._getSegnum(segtype)
            self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum))

    def lexeme2Segnum(self, lemma, tag):
        '''Return the segment number of the first registered pattern that matches.

        Patterns are tried in registration order.
        Raises SegtypesException when no pattern matches.
        '''
        for p in self.patternsList:
            res = p.tryToMatch(lemma, tag)
            if res >= 0:
                return res
        raise SegtypesException('Cannot find segment type for given tag: %s' % tag)


class SegtypePattern(object):
    '''A tag pattern, optionally bound to one lemma, that yields a segment number.

    In the pattern "%" stands for any (possibly empty) text; every other
    character is literal. Matching starts at the beginning of the tag.
    '''

    def __init__(self, lemma, pattern, segnum):
        self.lemma = lemma
        self.pattern = pattern
        self.segnum = segnum
        # Compiled once here; tryToMatch is called for every lexeme.
        self._regexes = [re.compile(self._toRegex(pattern))]
        if pattern.endswith(':%'):
            # "pos:%" must also accept the bare tag "pos" (e.g. "qub"),
            # which has no colon at all. Only the exact bare tag is
            # accepted by this variant.
            self._regexes.append(re.compile(self._toRegex(pattern[:-2]) + r'\Z'))

    @staticmethod
    def _toRegex(pattern):
        '''Turn a "%"-wildcard pattern into regex source, escaping literals.'''
        return '.*'.join(re.escape(part) for part in pattern.split('%'))

    def tryToMatch(self, lemma, tag):
        '''Return segnum if lemma and tag fit this pattern, otherwise -1.

        A pattern whose lemma is None accepts any lemma.
        '''
        if self.lemma is not None and self.lemma != lemma:
            return -1
        for regex in self._regexes:
            if regex.match(tag):
                return self.segnum
        return -1


class SegtypesException(Exception):
    '''Raised for errors in segment type definitions or lookups.'''

    def __init__(self, msg):
        # Pass msg to the base class so that args is populated and the
        # exception can be copied and pickled.
        super(SegtypesException, self).__init__(msg)
        self.msg = msg

    def __str__(self):
        return u'Error in segment rules: %s' % self.msg
fsabuilder/morfeuszbuilder/tagset/tagset.py
0 → 100644
1 | +''' | ||
2 | +Created on 17 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | + | ||
7 | +import codecs | ||
8 | + | ||
class Tagset(object):
    '''Tag and name numbering loaded from a tagset file.

    The file has a [TAGS] and a [NAMES] section; every data line is
    "<number><TAB><text>". Lines starting with "#" and empty lines are
    ignored.
    '''

    TAGS = 1
    NAMES = 2
    SEP = '\t'

    def __init__(self, filename, encoding='utf8'):
        '''Load the tagset from filename, decoded with encoding.'''
        # tag text -> tag number
        self.tag2tagnum = {}
        # name text -> name number
        self.name2namenum = {}
        self._doInit(filename, encoding)
        # Reverse mapping, tag number -> tag text. Written without
        # tuple-parameter lambdas or iteritems so that it runs on both
        # Python 2 and Python 3.
        self.tagnum2tag = dict((num, tag) for tag, num in self.tag2tagnum.items())

    def _doInit(self, filename, encoding):
        '''Fill tag2tagnum and name2namenum from the file.

        Raises AssertionError when a data line appears before any
        section header or when a tag/name is repeated within a section.
        '''
        section2Dict = {Tagset.TAGS: self.tag2tagnum,
                        Tagset.NAMES: self.name2namenum}
        addingTo = None
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                # Strip only the line terminator (including the "\r" of
                # CRLF files). Other whitespace must stay: a trailing
                # tab delimits an empty name.
                line = line.rstrip('\r\n')
                if line == u'[TAGS]':
                    addingTo = Tagset.TAGS
                elif line == u'[NAMES]':
                    addingTo = Tagset.NAMES
                elif line and not line.startswith(u'#'):
                    # NOTE(review): asserts are kept so the exception type
                    # stays the same, but they are skipped under "python -O".
                    assert addingTo in section2Dict
                    res = section2Dict[addingTo]
                    fields = line.split(Tagset.SEP)
                    tagNum = fields[0]
                    tag = fields[1]
                    assert tag not in res
                    res[tag] = int(tagNum)

    def getTag4Tagnum(self, tagnum):
        '''Return the tag text for tagnum; raises KeyError if unknown.'''
        return self.tagnum2tag[tagnum]
0 | \ No newline at end of file | 41 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/utils/configFile.py
0 → 100644
1 | +''' | ||
2 | +Created on 18 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | + | ||
7 | +import re | ||
8 | +import codecs | ||
9 | + | ||
def getHeaderValue(line, lineNum):
    '''Return the section name if line is a "[name]" header, else None.

    Leading whitespace and a trailing "#" comment are allowed around the
    bracketed name. lineNum is accepted for symmetry with other line
    handlers and is not used.
    '''
    # A plain raw literal instead of ur'...': the pattern is pure ASCII,
    # so it behaves the same on Python 2 and also parses on Python 3.
    m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
    if m:
        return m.group(1)
    else:
        return None
16 | + | ||
class ConfigFile(object):
    '''A UTF-8 text file split into "[section]" blocks.

    The file is parsed on construction. For every section the non-empty
    lines are kept, stripped, together with their 1-based line numbers.
    Lines starting with "#" inside a section are kept as well.
    '''

    def __init__(self, filename, sectionNames):
        '''Parse filename; sectionNames lists the section names that are allowed.

        Raises ConfigFileException for an unknown or repeated section
        and for text that appears before the first section.
        '''
        self.filename = filename
        self.sectionNames = sectionNames
        # section name -> list of (lineNum, strippedLine)
        self.section2Lines = {}
        self.currSection = None
        self._parse()

    def _addSectionStart(self, sectionName, lineNum):
        '''Open a new section, rejecting unknown and duplicate names.'''
        if not sectionName in self.sectionNames:
            raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName)
        if sectionName in self.section2Lines:
            raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName)
        self.section2Lines[sectionName] = []
        self.currSection = sectionName

    def _addLine(self, line, lineNum):
        '''Attach a non-empty line to the current section.'''
        line = line.strip()
        if not line:
            return
        if self.currSection is None:
            if line.startswith('#'):
                # A comment before the first section has no section to
                # belong to; drop it instead of failing on the missing
                # dictionary entry.
                return
            raise ConfigFileException(self.filename, lineNum, 'Text outside of any section')
        self.section2Lines[self.currSection].append((lineNum, line))

    def _getHeaderValue(self, line, lineNum):
        '''Return the name inside a "[name]" header line, or None.'''
        # Plain raw literal (not ur'...') so the module also parses on
        # Python 3; the pattern is pure ASCII.
        m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
        if m:
            return m.group(1)
        else:
            return None

    def enumerateLinesInSection(self, sectionName):
        '''Return the list of (lineNum, line) pairs of a section.

        Raises KeyError if the section did not occur in the file.
        '''
        return self.section2Lines[sectionName]

    def _parse(self):
        '''Read the file and distribute its lines into sections.'''
        with codecs.open(self.filename, 'r', 'utf8') as f:
            for lineNum, line in enumerate(f, start=1):
                header = self._getHeaderValue(line, lineNum)
                if header:
                    self._addSectionStart(header, lineNum)
                else:
                    self._addLine(line, lineNum)
59 | + | ||
class ConfigFileException(Exception):
    '''Error in a config file, reported with file name and line number.'''

    def __init__(self, filename, lineNum, msg):
        # Hand the arguments to the base class so that args is filled
        # in; without it the exception cannot be copied or pickled.
        super(ConfigFileException, self).__init__(filename, lineNum, msg)
        self.filename = filename
        self.lineNum = lineNum
        self.msg = msg

    def __str__(self):
        return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)