rulesParser.py
3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from pyparsing import *
from morfeuszbuilder.tagset import segtypes
from morfeuszbuilder.utils import configFile
from morfeuszbuilder.segrules import preprocessor
import codecs
import re
import itertools
import logging
import segsfsa
# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']')
# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
def doprint(toks):
print toks
class RulesParser(object):
def __init__(self, tagset):
self.tagset = tagset
def _getKey2Defs(self, segtypesConfigFile):
res = {}
for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
lineToParse = Word(alphanums+'_') + Suppress('=') + Group(OneOrMore(Word(alphanums+'_'))) + LineEnd().suppress()
try:
key, defs = lineToParse.parseString(line)
res[key] = tuple(defs)
except Exception as ex:
raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
return res
def parse(self, filename):
res = []
segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes'])
key2Defs = self._getKey2Defs(segtypesConfigFile)
segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
def2Key = {}
for key, defs in key2Defs.iteritems():
for define in defs:
def2Key[define] = key
for defs in itertools.product(*key2Defs.values()):
key2Def = dict([(def2Key[define], define) for define in defs])
fsa = segsfsa.SegmentsFSA(key2Def)
combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))
for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
fsa.addSegmentRule(rule)
res.append(fsa)
return res
def _doParse(self, combinationEnumeratedLines, segtypesHelper):
for lineNum, line in combinationEnumeratedLines:
if not line.startswith('#'):
yield self._doParseOneLine(lineNum, line, segtypesHelper)
def _doParseOneLine(self, lineNum, line, segtypesHelper):
rule = Forward()
tagRule = Word(alphanums+'_')
ignoreOrthRule = tagRule + Suppress('>')
parenRule = Suppress('(') + rule + Suppress(')')
atomicRule = tagRule ^ ignoreOrthRule ^ parenRule
zeroOrMoreRule = atomicRule + Suppress('*')
oneOrMoreRule = atomicRule + Suppress('+')
unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
oneOfRule = delimitedList(unaryRule, delim='|')
complexRule = unaryRule ^ oneOfRule
concatRule = OneOrMore(complexRule)
rule << concatRule
# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule
# tagRule.setParseAction(lambda s,l,toks: doprint(toks))
# print lineNum, line
parsedLine = rule.parseString(line, parseAll=True)
# print parsedLine