Blame view

fsabuilder/morfeuszbuilder/segrules/rulesParser.py 5.23 KB
Michał Lenart authored
1
2

from pyparsing import *
Michał Lenart authored
3
ParserElement.enablePackrat()
Michał Lenart authored
4
from morfeuszbuilder.tagset import segtypes
Michał Lenart authored
5
from morfeuszbuilder.utils import configFile, exceptions
Michał Lenart authored
6
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString
Michał Lenart authored
7
8
9
10
11
import codecs
import re

import itertools
import logging
Michał Lenart authored
12
from morfeuszbuilder.segrules import rulesNFA
Michał Lenart authored
13
14
15

class RulesParser(object):
    """Parse the segmentation-rules config file into a ``RulesManager``.

    The parser works in one of two modes, chosen at construction time:

    * ``PARSE4ANALYZER`` -- rules are read from the ``combinations`` section
      and may use the full grammar (concatenation, ``|``, ``*``, ``+``,
      parentheses).
    * ``PARSE4GENERATOR`` -- rules are read from the
      ``generator combinations`` section and are restricted to zero or more
      ``segtype>`` items followed by a single segment type.
    """

    PARSE4GENERATOR = 1
    PARSE4ANALYZER = 2

    def __init__(self, tagset, rulesType):
        """Store the tagset and the parsing mode.

        :param tagset: tagset object handed on to ``segtypes.Segtypes``.
        :param rulesType: ``RulesParser.PARSE4GENERATOR`` or
            ``RulesParser.PARSE4ANALYZER``.
        """
        self.tagset = tagset
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType

    def _getKey2Defs(self, segtypesConfigFile):
        """Read the ``options`` section into a ``{key: (def, ...)}`` dict.

        Every line must look like ``key = def1 def2 ...`` where keys and
        defs consist of alphanumeric characters and underscores.

        :param segtypesConfigFile: config file object providing
            ``enumerateLinesInSection`` and ``filename``.
        :returns: dict mapping each option key to a tuple of its values.
        :raises exceptions.ConfigFileException: when a line cannot be parsed.
        """
        # The grammar does not depend on the line, so build it only once.
        identifier = Word(alphanums+'_')
        lineToParse = identifier + Suppress('=') + Group(OneOrMore(Word(alphanums+'_'))) + LineEnd().suppress()
        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = lineToParse.parseString(line)
                res[key] = tuple(defs)
            except Exception as ex:
                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
        return res

    @staticmethod
    def _optionCombinations(key2Defs):
        """Yield ``(defs, key2Def)`` for every combination of option values.

        :param key2Defs: dict mapping option key to a tuple of its values.
        :returns: generator of pairs; ``defs`` is a tuple holding one value
            per option (in ``key2Defs.values()`` order) and ``key2Def`` maps
            each option key to the value chosen in that combination.
        """
        # Reverse index: option value -> the option key it belongs to.
        # ``items()`` (not ``iteritems()``) keeps this working on Python 3.
        def2Key = {}
        for key, defs in key2Defs.items():
            for define in defs:
                def2Key[define] = key
        for defs in itertools.product(*key2Defs.values()):
            yield defs, {def2Key[define]: define for define in defs}

    def parse(self, filename):
        """Parse ``filename`` and build one DFA per combination of options.

        For every combination of option values the relevant rules section is
        preprocessed with that combination, every resulting rule is added to
        a fresh NFA, and the NFA converted to a DFA is registered in the
        result under that combination. The first combination is also set as
        the default options.

        :param filename: path of the segmentation rules config file.
        :returns: the populated ``rulesManager.RulesManager``.
        """
        segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types'])
        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
        res = rulesManager.RulesManager(segtypesHelper)

        # The section to read depends only on the parsing mode.
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            section = 'combinations'
        else:
            section = 'generator combinations'

        for idx, (defs, key2Def) in enumerate(self._optionCombinations(key2Defs)):
            nfa = rulesNFA.RulesNFA()
            # Comments are kept here; _doParse skips lines starting with '#'.
            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False)
            combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                rule.addToNFA(nfa)
            dfa = nfa.convertToDFA()
            res.addDFA(key2Def, dfa)
            if idx == 0:
                res.setDefaultOptions(key2Def)
        return res

    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
        """Yield a parsed rule for every line that does not start with '#'.

        :param combinationEnumeratedLines: iterable of ``(lineNum, line)``.
        :param segtypesHelper: segment types helper used to validate names.
        :param filename: config file name, passed on for error reporting.
        """
        for lineNum, line in combinationEnumeratedLines:
            if not line.startswith('#'):
                yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)

    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Create a ``rules.TagRule`` for the segment type named ``segtype``.

        :param segtype: segment type name as written in the rule.
        :param shiftOrth: True when the name was written as ``segtype>``.
        :param lineNum: number of the line being parsed (for error messages).
        :param line: text of the line being parsed (for error messages).
        :param segtypesHelper: helper used to validate and number segtypes.
        :raises exceptions.ConfigFileException: when ``segtype`` is unknown.
        """
        if not segtypesHelper.hasSegtype(segtype):
            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
        return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype)

    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse a single rule line and return the resulting rule object.

        The grammar is rebuilt on every call because the parse actions close
        over ``lineNum`` and ``line`` for error reporting.

        :param lineNum: number of the line in the config file.
        :param line: text of the rule.
        :param segtypesHelper: helper used to validate segment type names.
        :param filename: config file name, passed on for error reporting.
        :returns: the first token produced by parsing ``line`` with the
            grammar below.
        """
        # NOTE: the order of the statements below is significant -- the
        # composite expressions refer to the very objects that receive their
        # parse actions further down.
        rule = Forward()
        tagRule = Word(alphanums+'_')
        shiftOrthRule = Word(alphanums+'_') + Suppress('>')
        # No parse action on parenRule: with the parentheses suppressed, the
        # tokens of the inner rule are passed through as they are.
        parenRule = Suppress('(') + rule + Suppress(')')
        atomicRule = tagRule ^ shiftOrthRule ^ parenRule
        zeroOrMoreRule = atomicRule + Suppress('*')
        oneOrMoreRule = atomicRule + Suppress('+')
        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
        oneOfRule = delimitedList(unaryRule, delim='|')
        complexRule = unaryRule ^ oneOfRule
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            concatRule = OneOrMore(complexRule)
        else:
            # Generator rules: only "segtype>"* followed by one segtype.
            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
        # Parenthesised for readability; '+' binds tighter than '<<' anyway.
        rule << (concatRule + Optional(CaselessLiteral('!weak')))

        tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
        zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
        # "X+" is expressed as X followed by X*.
        oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
        oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
        concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
        # A second token is present only when the optional '!weak' matched.
        rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
        parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
        return parsedRule