from pyparsing import *

# Packrat memoization is switched on immediately after pyparsing is imported
# and before the project modules below are imported; keep this ordering.
ParserElement.enablePackrat()

from morfeuszbuilder.tagset import segtypes
from morfeuszbuilder.utils import configFile, exceptions
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString

import codecs
import re
import itertools
import logging

from morfeuszbuilder.segrules import rulesNFA


class RulesParser(object):
    """Parse a segmentation-rules config file into a ``rulesManager.RulesManager``.

    The config file is read through ``configFile.ConfigFile``.  For every
    combination of values declared in its ``[options]`` section the rule
    lines are preprocessed, parsed with pyparsing, added to a fresh
    ``rulesNFA.RulesNFA``, converted to a DFA and registered in the manager.
    """

    # Rule flavours: the generator reads the 'generator combinations'
    # section, the analyzer reads the 'combinations' section.
    PARSE4GENERATOR = 1
    PARSE4ANALYZER = 2

    def __init__(self, tagset, rulesType):
        """Remember the tagset and which flavour of rules to parse.

        :param tagset: tagset object handed on to ``segtypes.Segtypes``.
        :param rulesType: ``RulesParser.PARSE4GENERATOR`` or
            ``RulesParser.PARSE4ANALYZER``.
        :raises AssertionError: if ``rulesType`` is neither constant.
        """
        self.tagset = tagset
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType

    def _getKey2Defs(self, segtypesConfigFile):
        """Read the ``[options]`` section into a ``{key: (define, ...)}`` dict.

        Every line must look like ``key = define1 define2 ...`` where keys and
        defines consist of alphanumerics and underscores.

        :param segtypesConfigFile: config file object providing
            ``enumerateLinesInSection`` and ``filename``.
        :returns: dict mapping each option key to a tuple of its defines.
        :raises exceptions.ConfigFileException: if a line cannot be parsed.
        """
        # The grammar is identical for every line, so it is built only once.
        identifier = Word(alphanums + '_')
        lineToParse = (identifier
                       + Suppress('=')
                       + Group(OneOrMore(Word(alphanums + '_')))
                       + LineEnd().suppress())
        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = lineToParse.parseString(line)
                res[key] = tuple(defs)
            except Exception as ex:
                raise exceptions.ConfigFileException(
                    segtypesConfigFile.filename,
                    lineNum,
                    u'Error in [options] section: %s' % str(ex))
        return res

    def parse(self, filename):
        """Parse ``filename`` and build a rules manager holding one DFA per option set.

        :param filename: path of the segmentation-rules config file.
        :returns: a ``rulesManager.RulesManager``; the first option combination
            produced by ``itertools.product`` is registered as the default.
        """
        segtypesConfigFile = configFile.ConfigFile(
            filename,
            ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types'])

        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)

        res = rulesManager.RulesManager(segtypesHelper)

        # Reverse lookup: which option key does a given define belong to.
        def2Key = {}
        for key, defs in key2Defs.items():
            for define in defs:
                def2Key[define] = key

        # The section to read depends only on the parser flavour.
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            section = 'combinations'
        else:
            section = 'generator combinations'

        # One pass per combination that picks exactly one define for each key.
        for idx, defs in enumerate(itertools.product(*key2Defs.values())):
            key2Def = dict((def2Key[define], define) for define in defs)

            nfa = rulesNFA.RulesNFA()

            # Comments are kept here; _doParse skips the lines starting with
            # '#' after preprocessing.
            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(
                section, ignoreComments=False)
            combinationEnumeratedLines = list(
                preprocessor.preprocess(combinationEnumeratedLines, defs, filename))

            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                rule.addToNFA(nfa)

            dfa = nfa.convertToDFA()
            res.addDFA(key2Def, dfa)

            if idx == 0:
                res.setDefaultOptions(key2Def)

        return res

    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
        """Yield one parsed rule for every line that does not start with '#'.

        :param combinationEnumeratedLines: iterable of ``(lineNum, line)`` pairs.
        :param segtypesHelper: segment-types helper passed on to the line parser.
        :param filename: config file name passed on to the line parser.
        """
        for lineNum, line in combinationEnumeratedLines:
            if not line.startswith('#'):
                yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)

    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Build a ``rules.TagRule`` for ``segtype``.

        :param segtype: segment type name taken from the rule line.
        :param shiftOrth: True when the name was followed by '>' in the rule.
        :param lineNum: line number used in the error report.
        :param line: full rule line used in the error message.
        :param segtypesHelper: helper that validates the name and maps it to a
            segment number.
        :raises exceptions.ConfigFileException: if the segment type is unknown.
        """
        if not segtypesHelper.hasSegtype(segtype):
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                u'%s - invalid segment type: %s' % (line, segtype))
        return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype)

    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse a single rule line and return the resulting rule object.

        Grammar, as built below:

        * a tag is an identifier; ``identifier>`` is a tag with the
          shift-orth flag; parentheses group a nested rule;
        * an atom may be followed by ``*`` or ``+``;
        * unary rules may be joined with ``|``;
        * for the analyzer a rule is one or more such expressions in
          sequence, for the generator it is zero or more ``identifier>``
          items followed by one tag;
        * an optional, case-insensitive ``!weak`` suffix marks the rule weak.

        :param lineNum: number of the line, used for error reporting.
        :param line: text of the rule.
        :param segtypesHelper: helper used to validate segment type names.
        :param filename: config file name, passed to the parsing helper.
        :returns: first element of the result of ``pyparseString.pyparseString``.
        """
        rule = Forward()
        tagRule = Word(alphanums + '_')
        shiftOrthRule = Word(alphanums + '_') + Suppress('>')
        parenRule = Suppress('(') + rule + Suppress(')')
        atomicRule = tagRule ^ shiftOrthRule ^ parenRule
        zeroOrMoreRule = atomicRule + Suppress('*')
        oneOrMoreRule = atomicRule + Suppress('+')
        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
        oneOfRule = delimitedList(unaryRule, delim='|')
        complexRule = unaryRule ^ oneOfRule
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            concatRule = OneOrMore(complexRule)
        else:
            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
        rule << concatRule + Optional(CaselessLiteral('!weak'))

        # Parse actions turn matched tokens into rule objects.
        tagRule.setParseAction(
            lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftOrthRule.setParseAction(
            lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
        zeroOrMoreRule.setParseAction(
            lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
        # "x+" is expressed as "x" followed by "x*".
        oneOrMoreRule.setParseAction(
            lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
        oneOfRule.setParseAction(
            lambda string, loc, toks: rules.OrRule(toks))
        # A single element is passed through instead of being wrapped.
        concatRule.setParseAction(
            lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
        # A second token is present only when the '!weak' suffix matched.
        # NOTE(review): relies on setWeak() returning the rule object - confirm.
        rule.setParseAction(
            lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))

        parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
        return parsedRule