import itertools

from pyparsing import *

# Packrat memoization is enabled right after pyparsing is imported and before
# the project modules below are loaded, as in the original import order.
ParserElement.enablePackrat()

from morfeuszbuilder.tagset import segtypes
from morfeuszbuilder.utils import configFile, exceptions
from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
from morfeuszbuilder.segrules import rulesNFA


class RulesParser(object):
    """Parse a segmentation-rules config file into a ``RulesManager``.

    The config file is read section by section.  For every combination of
    option values declared in ``[options]`` the ``[combinations]`` section is
    preprocessed, parsed into rule objects, turned into an NFA and then into a
    DFA that is registered in the resulting ``rulesManager.RulesManager``.
    """

    PARSE4GENERATOR = 1
    PARSE4ANALYZER = 2

    def __init__(self, tagset, namesMap, labelsMap, rulesType):
        """Store the collaborators used later by :meth:`parse`.

        ``rulesType`` must be ``PARSE4GENERATOR`` or ``PARSE4ANALYZER``.
        ``tagset``, ``namesMap`` and ``labelsMap`` are only forwarded to
        ``segtypes.Segtypes``.
        """
        self.tagset = tagset
        self.namesMap = namesMap
        self.labelsMap = labelsMap
        assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
        self.rulesType = rulesType

    def _getKey2Defs(self, segtypesConfigFile):
        """Read the ``[options]`` section as ``{key: (value, ...)}``.

        Every line must look like ``key = value1 value2 ...`` where keys and
        values consist of alphanumerics and underscores.

        Raises ``exceptions.ConfigFileException`` (with the offending line
        number) when a line cannot be parsed.
        """
        # The grammar does not depend on the line, so it is built once rather
        # than once per line.
        identifier = Word(alphanums + '_')
        lineToParse = (
            identifier
            + Suppress('=')
            + Group(OneOrMore(Word(alphanums + '_')))
            + LineEnd().suppress()
        )

        res = {}
        for lineNum, line in segtypesConfigFile.enumerateLinesInSection('options'):
            try:
                key, defs = lineToParse.parseString(line)
                res[key] = tuple(defs)
            except Exception as ex:
                raise exceptions.ConfigFileException(
                    segtypesConfigFile.filename,
                    lineNum,
                    'Error in [options] section: %s' % str(ex))
        return res

    def _key2DefAsKey(self, key2Def):
        """Return a hashable (frozenset of items) form of a ``{key: value}`` dict."""
        return frozenset(key2Def.items())

    def parse(self, filename):
        """Parse the rules file ``filename`` and return a ``RulesManager``.

        Steps:

        1. read the option definitions and build the segment-types helper;
        2. for every combination of option values, preprocess and parse the
           ``[combinations]`` section into a list of rules;
        3. run the shift-orth post-processing over all rule lists;
        4. convert every rule list to a DFA and register it; the first
           combination is also registered as the default options.

        Raises ``exceptions.ConfigFileException`` when a rule accepts the
        empty segment sequence or when the NFA-to-DFA conversion reports a
        weak/non-weak conflict.  Other validation errors come from the called
        helpers.
        """
        segtypesConfigFile = configFile.ConfigFile(
            filename,
            [
                'options',
                'combinations',
                'tags',
                'lexemes',
                'segment types',
                'separator chars',
            ])
        key2Defs = self._getKey2Defs(segtypesConfigFile)
        segtypesHelper = segtypes.Segtypes(
            self.tagset, self.namesMap, self.labelsMap, segtypesConfigFile)

        # Separator characters are parsed only when building analyzer rules.
        if self.rulesType == RulesParser.PARSE4ANALYZER:
            separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile)
        else:
            separatorsList = []

        res = rulesManager.RulesManager(segtypesHelper, separatorsList)

        # Reverse lookup: option value -> option key it belongs to.
        def2Key = {}
        for key, defs in key2Defs.items():
            for define in defs:
                def2Key[define] = key

        # Both passes below walk the same combinations in the same order, so
        # the cartesian product is computed once.
        optionCombinations = list(itertools.product(*key2Defs.values()))

        resultsMap = {}
        for defs in optionCombinations:
            key2Def = {def2Key[define]: define for define in defs}
            currRes = []
            resultsMap[self._key2DefAsKey(key2Def)] = currRes

            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(
                'combinations', ignoreComments=False)
            combinationEnumeratedLines = list(
                preprocessor.preprocess(combinationEnumeratedLines, defs, filename))

            for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                if rule.allowsEmptySequence():
                    raise exceptions.ConfigFileException(
                        filename,
                        rule.linenum,
                        'This rule allows empty segments sequence to be accepted')
                rule.validate(filename)
                if self.rulesType == RulesParser.PARSE4GENERATOR:
                    additionalRules = rule.getAdditionalAtomicRules4Generator()
                    # NOTE(review): the loop below rebinds ``rule``.  When
                    # ``additionalRules`` is non-empty, the following
                    # ``transformToGeneratorVersion()`` call (and the sink
                    # check) therefore operate on the LAST additional rule,
                    # not on the rule parsed from the file.  This looks
                    # unintended, but it is kept as-is because changing it
                    # would alter the generated automata - confirm intent.
                    for rule in additionalRules:
                        rule.autogenerated = True
                    currRes.extend(additionalRules)
                    rule = rule.transformToGeneratorVersion()
                if not rule.isSinkRule():
                    currRes.append(rule)

        self.doShiftOrthMagic(resultsMap, res)

        for idx, defs in enumerate(optionCombinations):
            key2Def = {def2Key[define]: define for define in defs}

            nfa = rulesNFA.RulesNFA()
            for rule in resultsMap[self._key2DefAsKey(key2Def)]:
                rule.addToNFA(nfa)

            try:
                dfa = nfa.convertToDFA()
                res.addDFA(key2Def, dfa)
            except rulesNFA.InconsistentStateWeaknessException as ex:
                raise exceptions.ConfigFileException(
                    filename,
                    ex.weakState.rule.linenum,
                    'conflicts with rule at line %d. Segmentation for some chunks can be both weak and non-weak which is illegal.'
                    % ex.nonWeakState.rule.linenum)

            # The first combination doubles as the default option set.
            if idx == 0:
                res.setDefaultOptions(key2Def)

        return res

    def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
        """Yield one parsed rule per line, skipping lines that start with ``#``."""
        for lineNum, line in combinationEnumeratedLines:
            if not line.startswith('#'):
                yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)

    def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
        """Build a ``rules.TagRule`` for the segment type named ``segtype``.

        Raises ``exceptions.ConfigFileException`` when ``segtypesHelper`` does
        not know the segment type.
        """
        if not segtypesHelper.hasSegtype(segtype):
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                '%s - invalid segment type: %s' % (line, segtype))
        return rules.TagRule(
            segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum)

    def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{n}``: ``child`` repeated ``quantity`` times.

        Raises ``exceptions.ConfigFileException`` when ``quantity`` is not
        positive.
        """
        if quantity <= 0:
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                '%s - invalid quantity: %d' % (line, quantity))
        return rules.ConcatRule(quantity * [child], lineNum)

    def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
        """Build the rule for ``child{m,n}``: between ``leftN`` and ``rightN`` repeats.

        The result is an ``OrRule`` over the individual repeat counts.  When
        ``leftN`` is 0, the counts 0 and 1 are covered by a single
        ``OptionalRule``.

        Raises ``exceptions.ConfigFileException`` when ``leftN > rightN`` or
        both bounds are 0.
        """
        if leftN > rightN or (leftN, rightN) == (0, 0):
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                '%s - invalid quantities: %d %d' % (line, leftN, rightN))

        if leftN == 0:
            # OptionalRule stands for "zero or one"; the remaining counts
            # start at 2.
            children = [rules.OptionalRule(child, lineNum)]
            for n in range(2, rightN + 1):
                children.append(
                    self._createQuantRule1(child, n, lineNum, line, segtypesHelper))
        else:
            children = [
                self._createQuantRule1(child, n, lineNum, line, segtypesHelper)
                for n in range(leftN, rightN + 1)
            ]
        return rules.OrRule(children, lineNum)

    def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
        """Build the rule for ``child{n,}``: at least ``quantity`` repeats.

        Raises ``exceptions.ConfigFileException`` when ``quantity`` is not
        positive.
        """
        if quantity <= 0:
            raise exceptions.ConfigFileException(
                segtypesHelper.filename,
                lineNum,
                '%s - invalid quantity: %d' % (line, quantity))
        return rules.ConcatRule(
            [
                rules.ConcatRule(quantity * [child], lineNum),
                rules.ZeroOrMoreRule(child, lineNum),
            ],
            lineNum)

    def _createNewParenWithShiftOrthRule(self, rule, lineNum, line, segtypesHelper):
        """Mark a parenthesised ``rule`` as shift-orth (``(...)>``) and return it."""
        rule.makeShiftOrthRule()
        return rule

    def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
        """Parse one ``[combinations]`` line into a rule object.

        Grammar accepted on a line (built with pyparsing):

        * atoms: ``segtype``, ``segtype>``, ``( rule )``, ``( rule )>``;
        * postfix operators on atoms: ``*``, ``+``, ``?``, ``{n}``,
          ``{m,n}``, ``{n,}``;
        * alternatives separated by ``|``;
        * juxtaposition of the above as concatenation;
        * an optional trailing ``!weak`` (case-insensitive) marker.
        """
        rule = Forward()
        tagRule = Word(alphanums + '_')
        shiftOrthRule = Word(alphanums + '_') + Suppress('>')
        parenRule = Suppress('(') + rule + Suppress(')')
        parenWithShiftOrthRule = parenRule + Suppress('>')
        atomicRule = tagRule ^ shiftOrthRule ^ parenWithShiftOrthRule ^ parenRule
        zeroOrMoreRule = atomicRule + Suppress('*')
        oneOrMoreRule = atomicRule + Suppress('+')
        optionalRule = atomicRule + Suppress('?')
        quantRule1 = atomicRule + Suppress('{') + Word(nums) + Suppress('}')
        quantRule2 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Word(nums) + Suppress('}')
        quantRule3 = atomicRule + Suppress('{') + Word(nums) + Suppress(',') + Suppress('}')
        unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
        oneOfRule = delimitedList(unaryRule, delim='|')
        complexRule = unaryRule ^ oneOfRule
        concatRule = OneOrMore(complexRule)
        # Close the recursion: a parenthesised group contains a full rule.
        rule << concatRule
        completeRule = rule + Optional(CaselessLiteral('!weak'))

        tagRule.setParseAction(
            lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
        shiftOrthRule.setParseAction(
            lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
        parenWithShiftOrthRule.setParseAction(
            lambda string, loc, toks: self._createNewParenWithShiftOrthRule(toks[0], lineNum, line, segtypesHelper))
        parenRule.setParseAction(lambda string, loc, toks: toks[0])
        zeroOrMoreRule.setParseAction(
            lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0], lineNum))
        quantRule1.setParseAction(
            lambda string, loc, toks: self._createQuantRule1(
                toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        quantRule2.setParseAction(
            lambda string, loc, toks: self._createQuantRule2(
                toks[0], int(toks[1], 10), int(toks[2], 10), lineNum, line, segtypesHelper))
        quantRule3.setParseAction(
            lambda string, loc, toks: self._createQuantRule3(
                toks[0], int(toks[1], 10), lineNum, line, segtypesHelper))
        optionalRule.setParseAction(
            lambda string, loc, toks: rules.OptionalRule(toks[0], lineNum))
        # "x+" is expressed as "x" followed by "x*".
        oneOrMoreRule.setParseAction(
            lambda string, loc, toks: rules.ConcatRule(
                [toks[0], rules.ZeroOrMoreRule(toks[0], lineNum)], lineNum))
        oneOfRule.setParseAction(
            lambda string, loc, toks: rules.OrRule(list(toks), lineNum))
        # A single element needs no ConcatRule wrapper.
        concatRule.setParseAction(
            lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(list(toks), lineNum))
        # Two tokens means the optional "!weak" marker was present.
        completeRule.setParseAction(
            lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))

        parsedRule = pyparseString.pyparseString(completeRule, lineNum, line, filename)[0]
        return parsedRule

    def doShiftOrthMagic(self, resultsMap, rulesManager):
        """Collect shift-orth / non-shift-orth segment types and post-process rules.

        Walks every atomic rule of every rule list in ``resultsMap`` and sorts
        its ``segtype`` into one of two sets according to its ``shiftOrth``
        flag.  It then delegates to
        ``rulesManager.shiftOrthMagic.doShiftOrthMagic``.

        Note: the ``rulesManager`` parameter is a manager instance and shadows
        the imported ``rulesManager`` module inside this method.
        """
        shiftOrthSegtypes = set()
        nonShiftOrthSegtypes = set()

        for parsedRules in resultsMap.values():
            for rule in parsedRules:
                for atomicRule in rule.getAtomicRules():
                    if atomicRule.shiftOrth:
                        shiftOrthSegtypes.add(atomicRule.segtype)
                    else:
                        nonShiftOrthSegtypes.add(atomicRule.segtype)

        rulesManager.shiftOrthMagic.doShiftOrthMagic(
            resultsMap,
            rulesManager.segtypes,
            shiftOrthSegtypes,
            nonShiftOrthSegtypes)