Blame view

fsabuilder/morfeuszbuilder/segrules/rulesParser.py 11.1 KB
Michał Lenart authored
1
2

from pyparsing import *
Michał Lenart authored
3
ParserElement.enablePackrat()
Michał Lenart authored
4
from morfeuszbuilder.tagset import segtypes
Michał Lenart authored
5
from morfeuszbuilder.utils import configFile, exceptions
Michał Lenart authored
6
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
Michał Lenart authored
7
8

import itertools
Michał Lenart authored
9
from morfeuszbuilder.segrules import rulesNFA
Michał Lenart authored
10
11
12

class RulesParser(object):
Michał Lenart authored
13
14
15
    # Rule-set flavours accepted as ``rulesType`` by __init__.
    # With PARSE4GENERATOR, parse() also derives the generator version of every rule.
    PARSE4GENERATOR = 1
    # With PARSE4ANALYZER, parse() also reads the separator chars from the config file.
    PARSE4ANALYZER = 2
Michał Lenart authored
16
    def __init__(self, tagset, namesMap, labelsMap, rulesType):
Michał Lenart authored
17
        self.tagset = tagset
Michał Lenart authored
18
19
        self.namesMap = namesMap
        self.labelsMap = labelsMap
Michał Lenart authored
20
21
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType
Michał Lenart authored
22
23
24
25
26
27
28
29
30

    def _getKey2Defs(self, segtypesConfigFile):
        """Read the [options] section into a mapping: option name -> allowed values.

        Every line of the section must have the form ``key = value1 value2 ...``
        where key and values consist of alphanumerics and underscores.

        Returns a dict mapping each key to the tuple of its values, in the
        order they were written.

        Raises exceptions.ConfigFileException (carrying the file name and the
        line number) for any line that cannot be parsed this way.
        """
        # The line grammar does not depend on the line being parsed, so it is
        # built once instead of once per line.
        identifier = Word(alphanums + '_')
        lineToParse = identifier + Suppress('=') + Group(OneOrMore(identifier)) + LineEnd().suppress()

        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = lineToParse.parseString(line)
                res[key] = tuple(defs)
            except Exception as ex:
                # Any failure here (grammar mismatch, unexpected token count)
                # is reported as a config error pointing at the offending line.
                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex))
        return res
Michał Lenart authored
33
34

    def _key2DefAsKey(self, key2Def):
Marcin Woliński authored
35
        return frozenset(list(key2Def.items()))
Michał Lenart authored
36
Michał Lenart authored
37
38
    def parse(self, filename):
        """Parse a segmentation rules file and build a rules manager from it.

        For every combination of option values declared in the [options]
        section, the [combinations] section is preprocessed and parsed into
        rules.  After the shift-orth post-processing, the rules of each
        combination are added to an NFA, converted to a DFA and registered in
        the manager.  The first combination becomes the default options.

        Returns the populated rulesManager.RulesManager.

        Raises exceptions.ConfigFileException when a rule accepts an empty
        segment sequence, or when converting to a DFA reveals a chunk that
        would be both weak and non-weak.
        """
        segtypesConfigFile = configFile.ConfigFile(
            filename,
            [
                'options',
                'combinations',
                'tags',
                'lexemes',
                'segment types',
                'separator chars',
            ])
        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(self.tagset, self.namesMap, self.labelsMap, segtypesConfigFile)
        # Separator characters are only read when building analyzer rules.
        separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \
            if self.rulesType == RulesParser.PARSE4ANALYZER \
            else []

        res = rulesManager.RulesManager(segtypesHelper, separatorsList)

        # Reverse lookup: option value -> name of the option it belongs to.
        # NOTE(review): a value listed under two different options would be
        # attributed to the last one only - presumably values are unique.
        def2Key = {}
        for key, defs in key2Defs.items():
            for define in defs:
                def2Key[define] = key

        resultsMap = {}
        for defs in itertools.product(*key2Defs.values()):
            key2Def = {def2Key[define]: define for define in defs}
            currRes = []
            resultsMap[self._key2DefAsKey(key2Def)] = currRes
            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
            combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                if rule.allowsEmptySequence():
                    raise exceptions.ConfigFileException(
                        filename,
                        rule.linenum,
                        'This rule allows empty segments sequence to be accepted')
                rule.validate(filename)
                if self.rulesType == RulesParser.PARSE4GENERATOR:
                    additionalRules = rule.getAdditionalAtomicRules4Generator()
                    # A separate loop variable is essential here: reusing
                    # ``rule`` would rebind it to the last additional rule, and
                    # the transformation below would then be applied to that
                    # autogenerated rule instead of the rule being processed.
                    for additionalRule in additionalRules:
                        additionalRule.autogenerated = True
                    currRes.extend(additionalRules)
                    rule = rule.transformToGeneratorVersion()
                if not rule.isSinkRule():
                    currRes.append(rule)

        self.doShiftOrthMagic(resultsMap, res)

        # Second pass over the same combinations (same dict, same order):
        # compile the collected rules of each combination into a DFA.
        for idx, defs in enumerate(itertools.product(*key2Defs.values())):
            key2Def = {def2Key[define]: define for define in defs}

            nfa = rulesNFA.RulesNFA()
            for rule in resultsMap[self._key2DefAsKey(key2Def)]:
                rule.addToNFA(nfa)

            try:
                dfa = nfa.convertToDFA()
                res.addDFA(key2Def, dfa)
            except rulesNFA.InconsistentStateWeaknessException as ex:
                raise exceptions.ConfigFileException(
                    filename,
                    ex.weakState.rule.linenum,
                    'conflicts with rule at line %d. Segmentation for some chunks can be both weak and non-weak which is illegal.' % ex.nonWeakState.rule.linenum)
            if idx == 0:
                # The first combination of option values acts as the default.
                res.setDefaultOptions(key2Def)

        return res
Michał Lenart authored
110
    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
Michał Lenart authored
111
112
        for lineNum, line in combinationEnumeratedLines:
            if not line.startswith('#'):
Michał Lenart authored
113
114
                rule = self._doParseOneLine(lineNum, line, segtypesHelper, filename)
                yield rule
Michał Lenart authored
115
Michał Lenart authored
116
    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Build a rules.TagRule for the segment type called ``segtype``.

        Raises exceptions.ConfigFileException when ``segtypesHelper`` does not
        know that segment type.
        """
        if segtypesHelper.hasSegtype(segtype):
            segnum = segtypesHelper.getSegnum4Segtype(segtype)
            return rules.TagRule(segnum, shiftOrth, segtype, lineNum)
        raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype))
Michał Lenart authored
122
Michał Lenart authored
123
124
    def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity}``: ``child`` repeated exactly ``quantity`` times.

        Raises exceptions.ConfigFileException when ``quantity`` is not positive.
        """
        if quantity > 0:
            repeated = [child] * quantity
            return rules.ConcatRule(repeated, lineNum)
        raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))

    def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
        """Build the rule for ``child{leftN,rightN}``: between ``leftN`` and ``rightN`` repetitions.

        The result is an OrRule with one alternative per allowed count.  When
        ``leftN`` is 0, the counts 0 and 1 are covered together by a single
        OptionalRule.

        Raises exceptions.ConfigFileException when ``leftN > rightN`` or when
        both bounds are 0.
        """
        if leftN > rightN or (leftN == 0 and rightN == 0):
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN))

        if leftN == 0:
            # The optional rule stands for "zero or one"; explicit repetitions start at 2.
            alternatives = [rules.OptionalRule(child, lineNum)]
            firstCount = 2
        else:
            alternatives = []
            firstCount = leftN

        for count in range(firstCount, rightN + 1):
            alternatives.append(self._createQuantRule1(child, count, lineNum, line, segtypesHelper))
        return rules.OrRule(alternatives, lineNum)

    def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity,}``: at least ``quantity`` repetitions.

        The result concatenates exactly ``quantity`` copies of ``child`` with
        a zero-or-more repetition of ``child``.

        Raises exceptions.ConfigFileException when ``quantity`` is not positive.
        """
        if quantity <= 0:
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
        mandatoryPart = rules.ConcatRule(quantity * [child], lineNum)
        openEndedPart = rules.ZeroOrMoreRule(child, lineNum)
        return rules.ConcatRule([mandatoryPart, openEndedPart], lineNum)
Michał Lenart authored
152
153
154
155
    def _createNewParenWithShiftOrthRule(self, rule, lineNum, line, segtypesHelper):
        rule.makeShiftOrthRule()
        return rule
Michał Lenart authored
156
    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse a single rule line into a rule object.

        Grammar accepted on a line:
          * atoms: ``name``, ``name>`` (shift-orth), ``( rule )``, ``( rule )>``
          * postfix operators on an atom: ``*``, ``+``, ``?``, ``{n}``,
            ``{m,n}``, ``{n,}``
          * alternatives separated by ``|``
          * juxtaposition for concatenation
          * an optional trailing ``!weak`` marker (matched case-insensitively)

        Returns the first element of the result produced by
        pyparseString.pyparseString for the complete-line expression.
        """
        nameChars = alphanums + '_'

        # ---- grammar -------------------------------------------------------
        ruleExpr = Forward()
        plainTag = Word(nameChars)
        shiftTag = Word(nameChars) + Suppress('>')
        parens = Suppress('(') + ruleExpr + Suppress(')')
        shiftParens = parens + Suppress('>')
        atom = plainTag ^ shiftTag ^ shiftParens ^ parens

        star = atom + Suppress('*')
        plus = atom + Suppress('+')
        maybe = atom + Suppress('?')
        exactly = atom + Suppress('{') + Word(nums) + Suppress('}')
        between = atom + Suppress('{') + Word(nums) + Suppress(',') + Word(nums) + Suppress('}')
        atLeast = atom + Suppress('{') + Word(nums) + Suppress(',') + Suppress('}')
        unary = atom ^ star ^ plus ^ maybe ^ exactly ^ between ^ atLeast

        alternatives = delimitedList(unary, delim='|')
        complexExpr = unary ^ alternatives
        sequence = OneOrMore(complexExpr)

        ruleExpr << sequence
        wholeLine = ruleExpr + Optional(CaselessLiteral('!weak'))

        # ---- parse actions: turn matched tokens into rule objects ----------
        plainTag.setParseAction(
            lambda s, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftTag.setParseAction(
            lambda s, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
        shiftParens.setParseAction(
            lambda s, loc, toks: self._createNewParenWithShiftOrthRule(toks[0], lineNum, line, segtypesHelper))
        # Plain parentheses only group: pass the inner rule through unchanged.
        parens.setParseAction(lambda s, loc, toks: toks[0])
        star.setParseAction(lambda s, loc, toks: rules.ZeroOrMoreRule(toks[0], lineNum))
        exactly.setParseAction(
            lambda s, loc, toks: self._createQuantRule1(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        between.setParseAction(
            lambda s, loc, toks: self._createQuantRule2(toks[0], int(toks[1], 10), int(toks[2], 10), lineNum, line, segtypesHelper))
        atLeast.setParseAction(
            lambda s, loc, toks: self._createQuantRule3(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        maybe.setParseAction(lambda s, loc, toks: rules.OptionalRule(toks[0], lineNum))
        # "x+" is expressed as "x" followed by "x*".
        plus.setParseAction(
            lambda s, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0], lineNum)], lineNum))
        alternatives.setParseAction(lambda s, loc, toks: rules.OrRule(list(toks), lineNum))
        # A sequence of exactly one element is returned as-is, without a ConcatRule wrapper.
        sequence.setParseAction(
            lambda s, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(list(toks), lineNum))
        # Two tokens at the top level mean the '!weak' marker was present.
        wholeLine.setParseAction(lambda s, loc, toks: toks[0].setWeak(len(toks) == 2))

        parsedRule = pyparseString.pyparseString(wholeLine, lineNum, line, filename)[0]
        return parsedRule
Michał Lenart authored
197
198
199
200
201
202

    def doShiftOrthMagic(self, resultsMap, rulesManager):

        shiftOrthSegtypes = set()
        nonShiftOrthSegtypes = set()
Marcin Woliński authored
203
        for _, rules in list(resultsMap.items()):
Michał Lenart authored
204
205
206
207
208
209
210
211
            for rule in rules:
                for atomicRule in rule.getAtomicRules():
                    if atomicRule.shiftOrth:
                        shiftOrthSegtypes.add(atomicRule.segtype)
                    else:
                        nonShiftOrthSegtypes.add(atomicRule.segtype)

        rulesManager.shiftOrthMagic.doShiftOrthMagic(resultsMap, rulesManager.segtypes, shiftOrthSegtypes, nonShiftOrthSegtypes)