diff --git a/fsabuilder/buildfsa.spec b/fsabuilder/buildfsa.spec new file mode 100644 index 0000000..be8cd20 --- /dev/null +++ b/fsabuilder/buildfsa.spec @@ -0,0 +1,22 @@ +# -*- mode: python -*- +a = Analysis(['fsa/buildfsa.py'], + pathex=['/home/lennyn/xxx/morfeusz/fsabuilder'], + hiddenimports=[], + hookspath=None, + runtime_hooks=None) +pyz = PYZ(a.pure) +exe = EXE(pyz, + a.scripts, + exclude_binaries=True, + name='buildfsa', + debug=False, + strip=None, + upx=True, + console=True ) +coll = COLLECT(exe, + a.binaries, + a.zipfiles, + a.datas, + strip=None, + upx=True, + name='buildfsa') diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py index 83b0837..7f94fc0 100644 --- a/fsabuilder/morfeuszbuilder/fsa/fsa.py +++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py @@ -14,11 +14,12 @@ class FSA(object): ''' - def __init__(self, encoder, tagset=None, encodeData=True): - self.encodeWord = encoder.encodeWord + def __init__(self, encoder, tagset=None, encodeData=True, encodeWords=True): + self.encodeWord = encoder.encodeWord if encodeWords else lambda x: x self.encodeData = encoder.encodeData if encodeData else lambda x: x self.decodeData = encoder.decodeData if encodeData else lambda x: x self.encodedPrevWord = None + self.tagset = tagset self.initialState = state.State() self.register = register.Register() diff --git a/fsabuilder/morfeuszbuilder/fsa/state.py b/fsabuilder/morfeuszbuilder/fsa/state.py index 66873e9..1ae33ea 100644 --- a/fsabuilder/morfeuszbuilder/fsa/state.py +++ b/fsabuilder/morfeuszbuilder/fsa/state.py @@ -9,7 +9,7 @@ class State(object): A state in an automaton ''' - def __init__(self): + def __init__(self, additionalData=None): self.transitionsMap = {} self.freq = 0 self.encodedData = None @@ -17,6 +17,7 @@ class State(object): self.offset = None self.label2Freq = {} self.serializeAsArray = False + self.additionalData = additionalData @property def transitionsNum(self): diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py index b48005b..1e3250b 100644 --- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py +++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py @@ -34,11 +34,6 @@ class ArgDefine(object): def __str__(self): return '%s(%s) %s' % (self.name, self.arg, self.val) -class PreprocessorException(Exception): - - def __init__(self, msg, line): - pass - def _tryToSubstituteArgDefine(s, t, defines): defineName = t[0] substituteValue = t[1] diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py index a929c19..1376a9c 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rules.py +++ b/fsabuilder/morfeuszbuilder/segrules/rules.py @@ -4,6 +4,8 @@ Created on 24 sty 2014 @author: mlenart ''' +from morfeuszbuilder.segrules.rulesNFA import RulesNFAState + class SegmentRule(object): ''' classdocs @@ -14,46 +16,91 @@ class SegmentRule(object): ''' Constructor ''' + + def addToNFA(self, fsa): + raise NotImplementedError() + + def _doAddToNFA(self, startStates, endState): + raise NotImplementedError() class TagRule(SegmentRule): - def __init__(self, tagType, line): - self.tagType = tagType - self.line = line + def __init__(self, segnum): + self.segnum = segnum + + def addToNFA(self, fsa): + endState = RulesNFAState(final=True) + self._doAddToNFA(fsa.initialState, endState) + + def _doAddToNFA(self, startState, endState): + startState.addTransition(self.segnum, endState) class UnaryRule(SegmentRule): - def __init__(self, child, line): + def __init__(self, child): self.child = child - self.line = line class ComplexRule(SegmentRule): - def __init__(self, children, line): + def __init__(self, children): self.children = children - self.line = line + + def addToNFA(self, fsa): + endState = RulesNFAState(final=True) + self._doAddToNFA(fsa.initialState, endState) class ConcatRule(ComplexRule): - def __init__(self, children, line): - super(ConcatRule, self).__init__(children, line) + def __init__(self, children): + super(ConcatRule, self).__init__(children) + + def _doAddToNFA(self, startState, endState): + currStartState = startState + for child in self.children[:-1]: + currEndState = RulesNFAState() + child._doAddToNFA(currStartState, currEndState) + nextStartState = RulesNFAState() + currEndState.addTransition(None, nextStartState) + currStartState = nextStartState + lastChild = self.children[-1] + lastChild._doAddToNFA(currStartState, endState) class OrRule(ComplexRule): - def __init__(self, children, line): - super(OrRule, self).__init__(children, line) + def __init__(self, children): + super(OrRule, self).__init__(children) + + def _doAddToNFA(self, startState, endState): + for child in self.children: + intermStartState = RulesNFAState() + intermEndState = RulesNFAState() + startState.addTransition(None, intermStartState) + child._doAddToNFA(intermStartState, intermEndState) + intermEndState.addTransition(None, endState) class ZeroOrMoreRule(UnaryRule): - def __init__(self, child, line): - super(ZeroOrMoreRule, self).__init__(child, line) - -class OneOrMoreRule(UnaryRule): + def __init__(self, child): + super(ZeroOrMoreRule, self).__init__(child) + + def addToNFA(self, fsa): + raise ValueError() - def __init__(self, child, line): - super(OneOrMoreRule, self).__init__(child, line) + def _doAddToNFA(self, startState, endState): + intermStartState = RulesNFAState() + intermEndState = RulesNFAState() + + startState.addTransition(None, intermStartState) + startState.addTransition(None, endState) + self.child._doAddToNFA(intermStartState, intermEndState) + intermEndState.addTransition(None, endState) + endState.addTransition(None, intermStartState) class IgnoreOrthRule(UnaryRule): - def __init__(self, child, line): - super(IgnoreOrthRule, self).__init__(child, line) + def __init__(self, child): + super(IgnoreOrthRule, self).__init__(child) + + def _doAddToNFA(self, startState, endState): + startState.addTransition(self.child.segnum, endState, ignoreOrth=True) + diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py new file mode 100644 index 0000000..56c59ce --- /dev/null +++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py @@ -0,0 +1,40 @@ +''' +Created on 24 sty 2014 + +@author: mlenart +''' + +from morfeuszbuilder.fsa import fsa, state, encode + +class RulesNFAState(object): + + def __init__(self, initial=False, final=False): + self.transitionsMap = {} + self.initial = initial + self.final = final + + def addTransition(self, label, targetState, ignoreOrth=False): + assert not ignoreOrth or label is not None + self.transitionsMap.setdefault((label, ignoreOrth), set()) + self.transitionsMap[(label, ignoreOrth)].add(targetState) + +class RulesNFA(object): + + def __init__(self, key2Def={}): + self.initialState = RulesNFAState(initial=True) + + def _doConvertState(self, dfaState, nfaStates): + for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems(): + nextDFAState = state.State(additionalData=nextIgnoreOrth) + dfaState.setTransition(label, nextDFAState) + dfaState.encodedData = bytearray() + self._doConvertState(nextDFAState, nextNFAStates) + + def convertToDFA(self): + dfa = fsa.FSA(encoder=None, encodeWords=False) + startStates = self.initialState.getClosure() + assert not any(filter(lambda s: s.final, startStates)) + dfa.initialState = state.State(additionalData=False) + self._doConvertState(dfa.initialState, startStates) + + \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 22d97b4..398e6a6 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -1,22 +1,15 @@ from pyparsing import * +ParserElement.enablePackrat() from morfeuszbuilder.tagset import segtypes -from morfeuszbuilder.utils import configFile -from morfeuszbuilder.segrules import preprocessor +from morfeuszbuilder.utils import configFile, exceptions +from morfeuszbuilder.segrules import preprocessor, rules import codecs import re import itertools import logging -import segsfsa - -# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']') -# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() -# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() -# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() - -def doprint(toks): - print toks +from morfeuszbuilder.segrules import rulesNFA class RulesParser(object): @@ -31,7 +24,7 @@ class RulesParser(object): key, defs = lineToParse.parseString(line) res[key] = tuple(defs) except Exception as ex: - raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) return res def parse(self, filename): @@ -48,12 +41,12 @@ class RulesParser(object): for defs in itertools.product(*key2Defs.values()): key2Def = dict([(def2Key[define], define) for define in defs]) - fsa = segsfsa.SegmentsFSA(key2Def) + nfa = rulesNFA.RulesNFA(key2Def) combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): - fsa.addSegmentRule(rule) - res.append(fsa) + rule.addToNFA(nfa) + res.append(nfa) return res def _doParse(self, combinationEnumeratedLines, segtypesHelper): @@ -61,6 +54,12 @@ class RulesParser(object): if not line.startswith('#'): yield self._doParseOneLine(lineNum, line, segtypesHelper) + def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): + if not segtypesHelper.hasSegtype(segtype): + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) + else: + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) + def _doParseOneLine(self, lineNum, line, segtypesHelper): rule = Forward() tagRule = Word(alphanums+'_') @@ -74,9 +73,21 @@ class RulesParser(object): complexRule = unaryRule ^ oneOfRule concatRule = OneOrMore(complexRule) rule << concatRule + + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) + ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) +# parenRule.setParseAction(lambda string, loc, toks: toks[0]) + zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) + oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) + oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) + concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) + + # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule # tagRule.setParseAction(lambda s,l,toks: doprint(toks)) # print lineNum, line - parsedLine = rule.parseString(line, parseAll=True) + parsedRule = rule.parseString(line, parseAll=True)[0] + print parsedRule + return parsedRule # print parsedLine diff --git a/fsabuilder/morfeuszbuilder/segrules/segsfsa.py b/fsabuilder/morfeuszbuilder/segrules/segsfsa.py deleted file mode 100644 index f060472..0000000 --- a/fsabuilder/morfeuszbuilder/segrules/segsfsa.py +++ /dev/null @@ -1,27 +0,0 @@ -''' -Created on 24 sty 2014 - -@author: mlenart -''' - -class SegmentsFSAState(object): - - def __init__(self): - self.transitionsMap = {} - - def addSegmentRule(self, segmentRule): - pass - -class SegmentsFSA(object): - - def __init__(self, key2Def={}): - self.initialState = SegmentsFSAState() - - def addSegmentRule(self, segmentRule): - self.initialState.addSegmentRule(segmentRule) - - def serialize(self): - res = bytearray() - return res - - \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py index 5b92392..f74556d 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py +++ b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py @@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser from morfeuszbuilder.tagset import tagset class Test(unittest.TestCase): + print 'do test' t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) parser = rulesParser.RulesParser(t) parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) + print 'done' if __name__ == "__main__": unittest.main() diff --git a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat index 7f1e14e..b55cbef 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat +++ b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat @@ -3,7 +3,6 @@ aggl=permissive strict isolated praet=split composite [combinations] -(dupa|dupa) #define wsz_interp (interp|kropka|dywiz)* #define moze_interp(segmenty) wsz_interp segmenty wsz_interp diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py index 254491e..24652a5 100644 --- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py +++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py @@ -4,80 +4,85 @@ Created on 17 lut 2014 @author: mlenart ''' import re +from morfeuszbuilder.utils import exceptions class Segtypes(object): - def __init__(self, tagset, segrulesFile): + def __init__(self, tagset, segrulesConfigFile): self.tagset = tagset - self.segrulesConfigFile = segrulesFile + self.filename = segrulesConfigFile.filename self.segtype2Segnum = {} self.patternsList = [] + self._readLexemes(segrulesConfigFile) + self._readTags(segrulesConfigFile) + + def _validate(self, msg, lineNum, cond): + if not cond: + raise exceptions.ConfigFileException(self.filename, lineNum, msg) + + def _readTags(self, segrulesConfigFile): + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): + print lineNum, line + splitLine = re.split(r'\s+', line.strip()) + self._validate( + u'Line in [tags] section must contain exactly two fields - segment type and tag pattern', + lineNum, + len(splitLine) == 2) + segtype, pattern = splitLine + self._validate( + u'Segment type must be a lowercase alphanumeric with optional underscores', + lineNum, + re.match(r'[a-z_]+', segtype)) + self._validate( + u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', + lineNum, + re.match(r'[a-z_\.\:\%]+', pattern)) + + if segtype in self.segtype2Segnum: + segnum = self.segtype2Segnum[segtype] + else: + segnum = len(self.segtype2Segnum) + self.segtype2Segnum[segtype] = segnum + + self.patternsList.append(SegtypePattern(None, pattern, segnum)) + + def _readLexemes(self, segrulesConfigFile): + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'): + segtype, pattern = line.strip().split('\t') + self._validate( + u'Segment type must be a lowercase alphanumeric with optional underscores', + lineNum, + re.match(r'[a-z_]+', segtype)) + self._validate( + u'Pattern must contain lemma and POS', + lineNum, + re.match(r'.+\:[a-z_]+', pattern, re.U)) + + if segtype in self.segtype2Segnum: + segnum = self.segtype2Segnum[segtype] + else: + segnum = len(self.segtype2Segnum) + self.segtype2Segnum[segtype] = segnum + + lemma, pos = pattern.split(':') + + self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) - def readTags(self, lines): - inTags = False - for lineNum, line in enumerate(lines, start=1): - header = self._getHeaderValue(line, lineNum) - if header == 'tags': - inTags = True - elif header: - inTags = False - elif inTags: - segtype, pattern = line.strip().split('\t') - self._validate( - u'Segment type must be a lowercase alphanumeric with optional underscores', - lineNum, - re.match(r'[a-z_]+', segtype)) - self._validate( - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', - lineNum, - re.match(r'[a-z_\.\:\%]+', pattern)) - - if segtype in self.segtype2Segnum: - segnum = self.segtype2Segnum[segtype] - else: - segnum = len(self.segtype2Segnum) - self.segtype2Segnum[segtype] = segnum - - self.patternsList.append(SegtypePattern(None, pattern, segnum)) + def hasSegtype(self, segTypeString): + return segTypeString in self.segtype2Segnum - def readLexemes(self, lines): - inLexemes = False - for lineNum, line in enumerate(lines, start=1): - header = self._getHeaderValue(line, lineNum) - if header == 'lexemes': - inLexemes = True - elif header: - inLexemes = False - elif inLexemes: - segtype, pattern = line.strip().split('\t') - self._validate( - u'Segment type must be a lowercase alphanumeric with optional underscores', - lineNum, - re.match(r'[a-z_]+', segtype)) - self._validate( - u'Pattern must contain lemma and POS', - lineNum, - re.match(r'\w+\:[a-z_]+', pattern, re.U)) - - if segtype in self.segtype2Segnum: - segnum = self.segtype2Segnum[segtype] - else: - segnum = len(self.segtype2Segnum) - self.segtype2Segnum[segtype] = segnum - - lemma, pos = pattern.split(':') - - self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) + def getSegnum4Segtype(self, segTypeString): + return self.segtype2Segnum[segTypeString] def lexeme2Segnum(self, lemma, tag): for p in self.patternsList: res = p.tryToMatch(lemma, tag) if res >= 0: return res - raise SegtypesException('Cannot find segment type for given tag: %s' % tag) + return None class SegtypePattern(object): @@ -92,11 +97,3 @@ class SegtypePattern(object): return self.segnum else: return -1 - -class SegtypesException(Exception): - - def __init__(self, msg): - self.msg = msg - - def __str__(self): - return u'Error in segment rules: %s' % self.msg diff --git a/fsabuilder/morfeuszbuilder/tagset/tagset.py b/fsabuilder/morfeuszbuilder/tagset/tagset.py index 2599918..cde6fb2 100644 --- a/fsabuilder/morfeuszbuilder/tagset/tagset.py +++ b/fsabuilder/morfeuszbuilder/tagset/tagset.py @@ -12,10 +12,11 @@ class Tagset(object): NAMES = 2 SEP = '\t' - def __init__(self, filename, encoding='utf8'): + def __init__(self, filename=None, encoding='utf8'): self.tag2tagnum = {} self.name2namenum = {} - self._doInit(filename, encoding) + if filename: + self._doInit(filename, encoding) self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) def _doInit(self, filename, encoding): @@ -37,4 +38,4 @@ class Tagset(object): res[tag] = int(tagNum) def getTag4Tagnum(self, tagnum): - return self.tagnum2tag[tagnum] \ No newline at end of file + return self.tagnum2tag[tagnum] diff --git a/fsabuilder/morfeuszbuilder/utils/configFile.py b/fsabuilder/morfeuszbuilder/utils/configFile.py index 53e29fb..2e4c4af 100644 --- a/fsabuilder/morfeuszbuilder/utils/configFile.py +++ b/fsabuilder/morfeuszbuilder/utils/configFile.py @@ -6,6 +6,7 @@ Created on 18 lut 2014 import re import codecs +import exceptions def getHeaderValue(line, lineNum): m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) @@ -25,9 +26,9 @@ class ConfigFile(object): def _addSectionStart(self, sectionName, lineNum): if not sectionName in self.sectionNames: - raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) + raise exceptions.ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) if sectionName in self.section2Lines: - raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) + raise exceptions.ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) self.section2Lines[sectionName] = [] self.currSection = sectionName @@ -35,7 +36,7 @@ class ConfigFile(object): line = line.strip() if line: if self.currSection is None and not line.startswith('#'): - raise ConfigFileException(self.filename, lineNum, 'Text outside of any section') + raise exceptions.ConfigFileException(self.filename, lineNum, 'Text outside of any section') self.section2Lines[self.currSection].append((lineNum, line)) def _getHeaderValue(self, line, lineNum): @@ -56,13 +57,3 @@ class ConfigFile(object): self._addSectionStart(header, lineNum) else: self._addLine(line, lineNum) - -class ConfigFileException(Exception): - - def __init__(self, filename, lineNum, msg): - self.filename = filename - self.lineNum = lineNum - self.msg = msg - - def __str__(self): - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) diff --git a/fsabuilder/morfeuszbuilder/utils/exceptions.py b/fsabuilder/morfeuszbuilder/utils/exceptions.py new file mode 100644 index 0000000..494eef2 --- /dev/null +++ b/fsabuilder/morfeuszbuilder/utils/exceptions.py @@ -0,0 +1,34 @@ +''' +Created on Feb 19, 2014 + +@author: lennyn +''' + +class FSABuilderException(Exception): + ''' + Exception in configFile module + ''' + + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return 'Failed to create FSA files: ' + self.msg + +class SegtypesException(FSABuilderException): + + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return u'Error in segment rules: %s' % self.msg + +class ConfigFileException(FSABuilderException): + + def __init__(self, filename, lineNum, msg): + self.filename = filename + self.lineNum = lineNum + self.msg = msg + + def __str__(self): + return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)