# rulesParser.py 9.54 KB

from pyparsing import *
ParserElement.enablePackrat()
from morfeuszbuilder.tagset import segtypes
from morfeuszbuilder.utils import configFile, exceptions
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
import codecs
import re

import itertools
import logging
from morfeuszbuilder.segrules import rulesNFA

class RulesParser(object):
    
    PARSE4GENERATOR = 1
    PARSE4ANALYZER = 2
    
    def __init__(self, tagset, rulesType):
        self.tagset = tagset
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType
    
    def _getKey2Defs(self, segtypesConfigFile):
        """Read the [options] section into a mapping of option key -> values.

        Every line of the section must look like ``key = value1 value2 ...``,
        where the key and each value are made of alphanumeric characters and
        underscores.

        segtypesConfigFile -- config file object providing
                              enumerateLinesInSection() and a filename attribute.

        Returns a dict mapping each key to the tuple of its values, in the
        order they appear on the line. A key that occurs on several lines keeps
        the values of the last one.

        Raises exceptions.ConfigFileException when a line cannot be parsed.
        """
        # The grammar does not depend on the line being parsed, so it is built
        # once instead of once per line.
        optionLine = (Word(alphanums+'_')
                      + Suppress('=')
                      + Group(OneOrMore(Word(alphanums+'_')))
                      + LineEnd().suppress())
        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = optionLine.parseString(line)
            except Exception as ex:
                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
            res[key] = tuple(defs)
        return res
    
    def parse(self, filename):
        """Parse a segmentation rules file and build the rules manager.

        For every combination of option values declared in the [options]
        section, the [combinations] section is run through the preprocessor
        with that combination, every resulting rule is parsed and added to a
        fresh NFA, and the NFA is converted to a DFA which is registered in the
        result under that combination. The first combination produced is also
        recorded as the default options.

        filename -- path of the segmentation rules file.

        Returns the populated rulesManager.RulesManager.

        Raises exceptions.ConfigFileException when a rule accepts an empty
        segment sequence, or when the NFA-to-DFA conversion reports an
        InconsistentStateWeaknessException (a weak rule conflicting with a
        non-weak one).
        """
        segtypesConfigFile = configFile.ConfigFile(
            filename,
            [
                'options',
                'combinations',
                'generator combinations',
                'tags',
                'lexemes',
                'segment types',
                'separator chars',
            ])
        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
        # Separator characters are only read when building analyzer rules.
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile)
        else:
            separatorsList = []

        res = rulesManager.RulesManager(segtypesHelper, separatorsList)

        # Keys and their value tuples are kept in matching order so that each
        # element of a product tuple can be paired with its key by position.
        # This avoids a reverse value -> key lookup, which silently dropped
        # keys when two options declared a value with the same name.
        optionKeys = list(key2Defs)
        optionValues = [key2Defs[key] for key in optionKeys]

        for idx, defs in enumerate(itertools.product(*optionValues)):
            key2Def = dict(zip(optionKeys, defs))
            logging.debug('building segmentation rules for options: %s', key2Def)
            nfa = rulesNFA.RulesNFA()
            # The 'combinations' section is used for both rule types; for the
            # generator each rule is converted with transformToGeneratorVersion().
            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
            combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                if rule.allowsEmptySequence():
                    raise exceptions.ConfigFileException(
                        filename,
                        rule.linenum,
                        'This rule allows empty segments sequence to be accepted')
                if self.rulesType == RulesParser.PARSE4GENERATOR:
                    rule = rule.transformToGeneratorVersion()
                if not rule.isSinkRule():
                    rule.addToNFA(nfa)
            try:
                dfa = nfa.convertToDFA()
                res.addDFA(key2Def, dfa)
            except rulesNFA.InconsistentStateWeaknessException as ex:
                raise exceptions.ConfigFileException(
                    filename,
                    ex.weakState.rule.linenum,
                    'conflicts with rule at line %d. Segmentation for some chunks can be both weak and non-weak which is illegal.' % ex.nonWeakState.rule.linenum)
            if idx == 0:
                res.setDefaultOptions(key2Def)
        return res
    
    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
        for lineNum, line in combinationEnumeratedLines:
            if not line.startswith('#'):
                rule = self._doParseOneLine(lineNum, line, segtypesHelper, filename)
                yield rule
    
    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Build a rules.TagRule for a single segment type name.

        segtype -- segment type name as written in the rule.
        shiftOrth -- flag forwarded unchanged to rules.TagRule.
        lineNum, line -- location and text of the rule, used for error reporting.
        segtypesHelper -- object used to validate the name and to look up its number.

        Raises exceptions.ConfigFileException when segtypesHelper does not know
        the segment type.
        """
        if segtypesHelper.hasSegtype(segtype):
            segnum = segtypesHelper.getSegnum4Segtype(segtype)
            return rules.TagRule(segnum, shiftOrth, segtype, lineNum)
        raise exceptions.ConfigFileException(
            segtypesHelper.filename,
            lineNum,
            u'%s - invalid segment type: %s' % (line, segtype))
    
    def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity}``: child repeated exactly quantity times.

        The result is a rules.ConcatRule whose children list holds the same
        child object quantity times.

        Raises exceptions.ConfigFileException when quantity is not positive.
        """
        if quantity > 0:
            repeated = [child] * quantity
            return rules.ConcatRule(repeated, lineNum)
        raise exceptions.ConfigFileException(
            segtypesHelper.filename,
            lineNum,
            u'%s - invalid quantity: %d' % (line, quantity))
    
    def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
        """Build the rule for ``child{leftN,rightN}``: between leftN and rightN repetitions.

        The result is a rules.OrRule over one alternative per allowed count.
        When leftN is 0, the counts 0 and 1 are covered together by a single
        rules.OptionalRule and the remaining alternatives start at 2.

        Raises exceptions.ConfigFileException when leftN > rightN or when both
        bounds are 0.
        """
        if leftN > rightN or (leftN, rightN) == (0, 0):
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                u'%s - invalid quantities: %d %d' % (line, leftN, rightN))

        if leftN == 0:
            alternatives = [rules.OptionalRule(child, lineNum)]
            firstCount = 2
        else:
            alternatives = []
            firstCount = leftN

        for count in range(firstCount, rightN + 1):
            alternatives.append(self._createQuantRule1(child, count, lineNum, line, segtypesHelper))
        return rules.OrRule(alternatives, lineNum)
    
    def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{quantity,}``: at least quantity repetitions.

        The result concatenates exactly quantity copies of child with a
        rules.ZeroOrMoreRule of child.

        Raises exceptions.ConfigFileException when quantity is not positive.
        """
        if quantity > 0:
            mandatoryPart = rules.ConcatRule([child] * quantity, lineNum)
            openEndedPart = rules.ZeroOrMoreRule(child, lineNum)
            return rules.ConcatRule([mandatoryPart, openEndedPart], lineNum)
        raise exceptions.ConfigFileException(
            segtypesHelper.filename,
            lineNum,
            u'%s - invalid quantity: %d' % (line, quantity))
    
    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse a single rule line into a rule object.

        A pyparsing grammar is assembled for this line and parse actions are
        attached that turn the matched tokens into objects from the ``rules``
        module. The grammar accepts:

        * atoms: a segment type name (alphanumerics and '_'), a name followed
          by '>' (built with shiftOrth=True), or a parenthesised rule;
        * suffixes on an atom: '*', '+', '?', '{n}', '{n,m}' and '{n,}';
        * alternatives separated by '|';
        * sequences written by juxtaposition;
        * an optional trailing '!weak' marker (matched case-insensitively).

        lineNum, line -- number and text of the line; both are captured by the
                         parse actions for rule construction and error messages.
        segtypesHelper -- forwarded to the rule-building helper methods.
        filename -- forwarded to pyparseString.pyparseString.

        Returns the first element of the result of pyparseString.pyparseString.
        """
        # Forward declaration: 'rule' is referenced by parenRule before it is defined.
        rule = Forward()
        tagRule = Word(alphanums+'_')
        shiftOrthRule = Word(alphanums+'_') + Suppress('>')
        parenRule = Suppress('(') + rule + Suppress(')')
        # '^' selects the longest matching alternative.
        atomicRule = tagRule ^ shiftOrthRule ^ parenRule
        zeroOrMoreRule = atomicRule + Suppress('*')
        oneOrMoreRule = atomicRule + Suppress('+')
        optionalRule = atomicRule + Suppress('?')
        quantRule1 = atomicRule + Suppress('{') + Word(nums) + Suppress('}')
        quantRule2 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Word(nums) + Suppress('}')
        quantRule3 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Suppress('}')
        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
        oneOfRule = delimitedList(unaryRule, delim='|')
        complexRule = unaryRule ^ oneOfRule
        concatRule = OneOrMore(complexRule)
#         if self.rulesType == RulesParser.PARSE4ANALYZER:
#             concatRule = OneOrMore(complexRule)
#         else:
#             concatRule = ZeroOrMore(shiftOrthRule) + tagRule
        # '+' binds tighter than '<<', so the whole sum is assigned to the Forward.
        rule << concatRule + Optional(CaselessLiteral('!weak'))
        
        # Parse actions: each converts the matched tokens into a rule object.
        # toks[0] is the already-converted atom; the numeric tokens are decimal strings.
        tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
#         parenRule.setParseAction(lambda string, loc, toks: toks[0])
        zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0], lineNum))
        quantRule1.setParseAction(lambda string, loc, toks: self._createQuantRule1(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        quantRule2.setParseAction(lambda string, loc, toks: self._createQuantRule2(toks[0], int(toks[1], 10), int(toks[2], 10), lineNum, line, segtypesHelper))
        quantRule3.setParseAction(lambda string, loc, toks: self._createQuantRule3(toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        optionalRule.setParseAction(lambda string, loc, toks: rules.OptionalRule(toks[0], lineNum))
        # 'x+' is expressed as 'x' followed by 'x*'.
        oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0], lineNum)], lineNum))
        oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks, lineNum))
        # A sequence of exactly one element is returned as that element, without a ConcatRule wrapper.
        concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks, lineNum))
        # concatRule contributes a single token, so a second token is present
        # exactly when the optional '!weak' marker matched.
        rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
        parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
#         print parsedRule, '-->', parsedRule.transformToGeneratorVersion()
        return parsedRule