|
1
2
|
from pyparsing import *
|
|
3
|
ParserElement.enablePackrat()
|
|
4
|
from morfeuszbuilder.tagset import segtypes
|
|
5
|
from morfeuszbuilder.utils import configFile, exceptions
|
|
6
|
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
|
|
7
8
|
import itertools
|
|
9
|
from morfeuszbuilder.segrules import rulesNFA
|
|
10
11
12
|
class RulesParser(object):
    """Parse a segmentation-rules config file into a ``rulesManager.RulesManager``.

    The parser reads the ``[options]`` and ``[combinations]`` sections of the
    config file, builds one rule set per combination of option values and
    registers the DFA built from each rule set in the returned manager.
    """

    # Accepted values of the ``rulesType`` constructor argument.
    PARSE4GENERATOR = 1
    PARSE4ANALYZER = 2

    def __init__(self, tagset, namesMap, labelsMap, rulesType):
        """Remember the objects later handed to ``segtypes.Segtypes`` in ``parse``.

        ``rulesType`` must be ``PARSE4GENERATOR`` or ``PARSE4ANALYZER``.
        """
        self.tagset = tagset
        self.namesMap = namesMap
        self.labelsMap = labelsMap
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType

    def _getKey2Defs(self, segtypesConfigFile):
        """Read the ``[options]`` section into ``{key: (value, ...)}``.

        Every line has to match ``key = value1 value2 ...`` where keys and
        values consist of alphanumerics and underscores.

        Raises ``exceptions.ConfigFileException`` (carrying the file name and
        line number) when a line cannot be parsed.
        """
        # The grammar is identical for every line, so it is built only once.
        lineToParse = (
            Word(alphanums + '_')
            + Suppress('=')
            + Group(OneOrMore(Word(alphanums + '_')))
            + LineEnd().suppress()
        )
        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = lineToParse.parseString(line)
                res[key] = tuple(defs)
            except Exception as ex:
                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex))
        return res

    def _key2DefAsKey(self, key2Def):
        """Return a hashable (frozenset of items) form of an option->value dict."""
        return frozenset(key2Def.items())

    def parse(self, filename):
        """Parse ``filename`` and return a populated ``rulesManager.RulesManager``.

        For every combination of option values (one value per ``[options]``
        key) the ``[combinations]`` section is preprocessed and parsed into
        rules; the rules are then added to an NFA, converted to a DFA and
        registered in the manager under that combination.  The first
        combination is also registered as the default options.

        Raises ``exceptions.ConfigFileException`` when a rule accepts an empty
        segment sequence or when the NFA->DFA conversion reports an
        inconsistent weak/non-weak state.
        """
        segtypesConfigFile = configFile.ConfigFile(
            filename,
            [
                'options',
                'combinations',
                'tags',
                'lexemes',
                'segment types',
                'separator chars',
            ])

        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(self.tagset, self.namesMap, self.labelsMap, segtypesConfigFile)
        # Separator characters are only read when parsing for the analyzer.
        separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \
            if self.rulesType == RulesParser.PARSE4ANALYZER \
            else []

        res = rulesManager.RulesManager(segtypesHelper, separatorsList)

        # Reverse lookup: option value -> the option key it belongs to.
        def2Key = {}
        for key, defs in key2Defs.items():
            for define in defs:
                def2Key[define] = key

        # Every way of picking one value per option key; used by both passes
        # below so they see the combinations in the same order.
        optionCombinations = list(itertools.product(*key2Defs.values()))

        resultsMap = {}
        for defs in optionCombinations:
            key2Def = {def2Key[define]: define for define in defs}
            currRes = []
            resultsMap[self._key2DefAsKey(key2Def)] = currRes

            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(
                'combinations', ignoreComments=False)
            combinationEnumeratedLines = list(
                preprocessor.preprocess(combinationEnumeratedLines, defs, filename))

            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                if rule.allowsEmptySequence():
                    raise exceptions.ConfigFileException(
                        filename,
                        rule.linenum,
                        'This rule allows empty segments sequence to be accepted')
                rule.validate(filename)

                if self.rulesType == RulesParser.PARSE4GENERATOR:
                    additionalRules = rule.getAdditionalAtomicRules4Generator()
                    # NOTE(review): this loop rebinds ``rule``; when
                    # ``additionalRules`` is non-empty the transformation below
                    # is applied to its last element rather than to the rule
                    # that was parsed.  Kept as-is to preserve the existing
                    # generator output - confirm whether this is intended.
                    for rule in additionalRules:
                        rule.autogenerated = True
                    currRes.extend(additionalRules)
                    rule = rule.transformToGeneratorVersion()

                if not rule.isSinkRule():
                    currRes.append(rule)

        self.doShiftOrthMagic(resultsMap, res)

        for idx, defs in enumerate(optionCombinations):
            key2Def = {def2Key[define]: define for define in defs}
            nfa = rulesNFA.RulesNFA()
            for rule in resultsMap[self._key2DefAsKey(key2Def)]:
                rule.addToNFA(nfa)

            try:
                dfa = nfa.convertToDFA()
                res.addDFA(key2Def, dfa)
            except rulesNFA.InconsistentStateWeaknessException as ex:
                raise exceptions.ConfigFileException(
                    filename,
                    ex.weakState.rule.linenum,
                    'conflicts with rule at line %d. Segmentation for some chunks can be both weak and non-weak which is illegal.' % ex.nonWeakState.rule.linenum)

            # The first combination doubles as the default option set.
            if idx == 0:
                res.setDefaultOptions(key2Def)

        return res

    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
        """Yield one parsed rule per ``(lineNum, line)`` pair, skipping lines that start with ``#``."""
        for lineNum, line in combinationEnumeratedLines:
            if line.startswith('#'):
                continue
            yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)

    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Build a ``rules.TagRule`` for ``segtype``.

        Raises ``exceptions.ConfigFileException`` when ``segtypesHelper`` does
        not know the segment type.
        """
        if not segtypesHelper.hasSegtype(segtype):
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype))
        return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum)

    def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity}``: ``child`` concatenated ``quantity`` times.

        Raises ``exceptions.ConfigFileException`` when ``quantity`` is not positive.
        """
        if quantity <= 0:
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
        return rules.ConcatRule(quantity * [child], lineNum)

    def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
        """Build the rule for ``child{leftN,rightN}`` as an alternative of fixed repetitions.

        Raises ``exceptions.ConfigFileException`` when ``leftN > rightN`` or
        both bounds are zero.
        """
        if leftN > rightN or (leftN, rightN) == (0, 0):
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN))

        if leftN == 0:
            # The optional rule stands for zero or one occurrence; the
            # remaining alternatives cover 2..rightN repetitions.
            children = [rules.OptionalRule(child, lineNum)]
            for n in range(2, rightN + 1):
                children.append(self._createQuantRule1(child, n, lineNum, line, segtypesHelper))
            return rules.OrRule(children, lineNum)

        children = [self._createQuantRule1(child, n, lineNum, line, segtypesHelper) for n in range(leftN, rightN + 1)]
        return rules.OrRule(children, lineNum)

    def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity,}``: ``quantity`` repetitions followed by zero or more.

        Raises ``exceptions.ConfigFileException`` when ``quantity`` is not positive.
        """
        if quantity <= 0:
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
        return rules.ConcatRule(
            [
                rules.ConcatRule(quantity * [child], lineNum),
                rules.ZeroOrMoreRule(child, lineNum),
            ],
            lineNum)

    def _createNewParenWithShiftOrthRule(self, rule, lineNum, line, segtypesHelper):
        """Call ``makeShiftOrthRule()`` on an already parsed parenthesised rule and return it."""
        rule.makeShiftOrthRule()
        return rule

    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse a single ``[combinations]`` line into a rule object.

        The pyparsing grammar is assembled per call because its parse actions
        close over ``lineNum``, ``line`` and ``segtypesHelper``.
        """
        rule = Forward()

        # Atoms: a segment type name, optionally followed by '>', or a
        # parenthesised sub-rule, optionally followed by '>'.
        tagRule = Word(alphanums + '_')
        shiftOrthRule = Word(alphanums + '_') + Suppress('>')
        parenRule = Suppress('(') + rule + Suppress(')')
        parenWithShiftOrthRule = parenRule + Suppress('>')
        atomicRule = tagRule ^ shiftOrthRule ^ parenWithShiftOrthRule ^ parenRule

        # Postfix quantifiers: *, +, ?, {n}, {m,n}, {n,}
        zeroOrMoreRule = atomicRule + Suppress('*')
        oneOrMoreRule = atomicRule + Suppress('+')
        optionalRule = atomicRule + Suppress('?')
        quantRule1 = atomicRule + Suppress('{') + Word(nums) + Suppress('}')
        quantRule2 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Word(nums) + Suppress('}')
        quantRule3 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Suppress('}')
        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3

        # Alternatives separated by '|', then concatenation by juxtaposition.
        oneOfRule = delimitedList(unaryRule, delim='|')
        complexRule = unaryRule ^ oneOfRule
        concatRule = OneOrMore(complexRule)

        rule << concatRule
        # A trailing '!weak' (case-insensitive) marks the whole rule as weak.
        completeRule = rule + Optional(CaselessLiteral('!weak'))

        tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
        parenWithShiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewParenWithShiftOrthRule(toks[0], lineNum, line, segtypesHelper))
        parenRule.setParseAction(lambda string, loc, toks: toks[0])
        zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0], lineNum))
        quantRule1.setParseAction(lambda string, loc, toks: self._createQuantRule1(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        quantRule2.setParseAction(lambda string, loc, toks: self._createQuantRule2(toks[0], int(toks[1], 10), int(toks[2], 10), lineNum, line, segtypesHelper))
        quantRule3.setParseAction(lambda string, loc, toks: self._createQuantRule3(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        optionalRule.setParseAction(lambda string, loc, toks: rules.OptionalRule(toks[0], lineNum))
        # 'x+' is expressed as 'x' followed by 'x*'.
        oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0], lineNum)], lineNum))
        oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(list(toks), lineNum))
        # A single element needs no ConcatRule wrapper.
        concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(list(toks), lineNum))
        # Two tokens mean the '!weak' marker was present.
        completeRule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))

        parsedRule = pyparseString.pyparseString(completeRule, lineNum, line, filename)[0]
        return parsedRule

    def doShiftOrthMagic(self, resultsMap, rulesManager):
        """Collect segment types by ``shiftOrth`` flag and hand them to the manager's shiftOrthMagic.

        ``resultsMap`` maps option-combination keys to lists of rules.  Every
        atomic rule of every rule contributes its ``segtype`` to one of two
        sets depending on its ``shiftOrth`` attribute; both sets are passed to
        ``rulesManager.shiftOrthMagic.doShiftOrthMagic`` together with
        ``resultsMap`` and ``rulesManager.segtypes``.

        Note: the ``rulesManager`` parameter is a manager instance and hides
        the imported module of the same name inside this method.
        """
        shiftOrthSegtypes = set()
        nonShiftOrthSegtypes = set()

        for ruleList in resultsMap.values():
            for rule in ruleList:
                for atomicRule in rule.getAtomicRules():
                    if atomicRule.shiftOrth:
                        shiftOrthSegtypes.add(atomicRule.segtype)
                    else:
                        nonShiftOrthSegtypes.add(atomicRule.segtype)

        rulesManager.shiftOrthMagic.doShiftOrthMagic(resultsMap, rulesManager.segtypes, shiftOrthSegtypes, nonShiftOrthSegtypes)
|