diff --git a/CMakeLists.txt b/CMakeLists.txt index 06b61a4..41edbee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) else () - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorf-0.6.7.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") endif () endif () diff --git a/buildAll.sh b/buildAll.sh index 796619d..80bdc70 100755 --- a/buildAll.sh +++ b/buildAll.sh @@ -10,7 +10,7 @@ function build { targets=$@ srcDir=`pwd` - buildDir=build/build-$os-$arch + buildDir=buildall/build-$os-$arch targetDir=$srcDir/target/$os-$arch toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake diff --git a/fsabuilder/buildfsa.py b/fsabuilder/buildfsa.py index 8a2f68e..f8da3d3 100644 --- a/fsabuilder/buildfsa.py +++ b/fsabuilder/buildfsa.py @@ -261,8 +261,9 @@ def main(opts): if __name__ == '__main__': import os opts = _parseOptions() - try: - main(opts) - except Exception as ex: - print >> sys.stderr, unicode(ex).encode('utf8') +# try: + main(opts) +# except Exception as ex: +# raise ex +# print >> sys.stderr, unicode(ex).encode('utf8') diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py index 8674a87..2a68af2 100644 --- a/fsabuilder/morfeuszbuilder/fsa/fsa.py +++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py @@ -113,12 +113,13 @@ class FSA(object): return q def calculateOffsets(self, sizeCounter): - currReverseOffset = 0 - for state in self.initialState.dfs(set()): - currReverseOffset += sizeCounter(state) - state.reverseOffset = currReverseOffset - for state in self.initialState.dfs(set()): - state.offset = currReverseOffset - state.reverseOffset + self.initialState.calculateOffsets(sizeCounter) +# currReverseOffset = 0 +# for state in self.initialState.dfs(set()): +# currReverseOffset += sizeCounter(state) +# state.reverseOffset = currReverseOffset +# for state in self.initialState.dfs(set()): +# state.offset = currReverseOffset - state.reverseOffset def debug(self): for state in self.initialState.dfs(set()): diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.py b/fsabuilder/morfeuszbuilder/fsa/serializer.py index 46482b3..5394a38 100644 --- a/fsabuilder/morfeuszbuilder/fsa/serializer.py +++ b/fsabuilder/morfeuszbuilder/fsa/serializer.py @@ -6,6 +6,7 @@ Created on Oct 20, 2013 import logging from state import State +from morfeuszbuilder.utils.serializationUtils import * class Serializer(object): @@ -63,7 +64,7 @@ class Serializer(object): self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): fsaData.extend(self.state2bytearray(state)) - res.extend(self.htonl(len(fsaData))) + res.extend(htonl(len(fsaData))) res.extend(fsaData) res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) return res @@ -71,9 +72,9 @@ class Serializer(object): def _serializeTags(self, tagsMap): res = bytearray() numOfTags = len(tagsMap) - res.extend(self.htons(numOfTags)) + res.extend(htons(numOfTags)) for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): - res.extend(self.htons(tagnum)) + res.extend(htons(tagnum)) res.extend(self.fsa.encodeWord(tag)) res.append(0) return res @@ -86,25 +87,6 @@ class Serializer(object): res.extend(self._serializeTags(tagset._name2namenum)) return res - # serialize uint16 as big endian - def htons(self, n): - assert n < 65536 - assert n >= 0 - res = bytearray() - res.append((n & 0x00FF00) >> 8) - res.append(n & 0x0000FF) - return res - - # serialize uint32 as big endian - def htonl(self, n): - assert n >= 0 - res = bytearray() - res.append((n & 0xFF000000) >> 24) - res.append((n & 0x00FF0000) >> 16) - res.append((n & 0x0000FF00) >> 8) - res.append(n & 0x000000FF) - return res - def serializePrologue(self): res = bytearray() @@ -126,7 +108,7 @@ class Serializer(object): res = bytearray() additionalDataSize = len(additionalData) if additionalData else 0 moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 - res.extend(self.htonl(additionalDataSize)) + res.extend(htonl(additionalDataSize)) # add additional data itself if additionalDataSize: diff --git a/fsabuilder/morfeuszbuilder/fsa/state.py b/fsabuilder/morfeuszbuilder/fsa/state.py index 07dbc63..7a306dc 100644 --- a/fsabuilder/morfeuszbuilder/fsa/state.py +++ b/fsabuilder/morfeuszbuilder/fsa/state.py @@ -13,7 +13,7 @@ class State(object): def __init__(self, additionalData=None): self.transitionsMap = {} - self.transitionsDataMap = {} +# self.transitionsDataMap = {} self.freq = 0 self.encodedData = None self.reverseOffset = None @@ -29,11 +29,11 @@ class State(object): def transitionsNum(self): return len(self.transitionsMap) - def setTransition(self, byte, nextState): - self.transitionsMap[byte] = nextState - - def setTransitionData(self, byte, data): - self.transitionsDataMap[byte] = data + def setTransition(self, label, nextState): + self.transitionsMap[label] = nextState +# +# def setTransitionData(self, byte, data): +# self.transitionsDataMap[byte] = data def hasNext(self, byte): return byte in self.transitionsMap @@ -68,6 +68,14 @@ class State(object): yield state1 yield self + def calculateOffsets(self, sizeCounter): + currReverseOffset = 0 + for state in self.dfs(set()): + currReverseOffset += sizeCounter(state) + state.reverseOffset = currReverseOffset + for state in self.dfs(set()): + state.offset = currReverseOffset - state.reverseOffset + def debug(self): print '----------------' print 'STATE:', self.idx, 'accepting', self.isAccepting() diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py index 3b69ab4..8d5b1ed 100644 --- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py +++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py @@ -7,6 +7,7 @@ Created on 23 sty 2014 import re from pyparsing import * from morfeuszbuilder.utils import exceptions +from pyparseString import pyparseString identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() @@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines): else: return defineName -def _processLine(lineNum, line, defines): +def _processLine(lineNum, line, defines, filename): if line.strip(): rule = Forward() @@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines): rule.setParseAction(lambda s, l, t: ' '.join(t)) defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines)) localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines)) - try: - return rule.parseString(line, parseAll=True)[0] - except ParseException as ex: - msg = u'Preprocessing of segmentation rules failed.\n' - msg += line + '\n' - msg += (ex.col - 1) * ' ' + '^\n' - msg += ex.msg -# print unicode(exceptions.SegtypesException(msg)).encode('utf8') - raise exceptions.SegtypesException(msg) + return pyparseString(rule, lineNum, line, filename)[0] else: return line -def preprocess(inputLines, defs): +def preprocess(inputLines, defs, filename): defines = {} ifdefsStack = [] for lineNum, line in inputLines: if line.startswith('#define'): - parsedDefine = list(define.parseString(line)) + parsedDefine = list(pyparseString(define, lineNum, line, filename)) if len(parsedDefine) == 2: name, val = parsedDefine defines[name] = NonArgDefine(name, val) @@ -92,15 +85,16 @@ def preprocess(inputLines, defs): name, arg, val = parsedDefine localDefines = defines.copy() localDefines[arg] = NonArgDefine(arg, arg) - val = _processLine(lineNum, val, localDefines) + val = _processLine(lineNum, val, localDefines, filename) defines[name] = ArgDefine(name, arg, val) elif line.startswith('#ifdef'): - name = ifdef.parseString(line)[0] + name = pyparseString(ifdef, lineNum, line, filename)[0] +# name = ifdef.parseString(line)[0] ifdefsStack.append(name) elif line.startswith('#endif'): ifdefsStack.pop() elif line.startswith('#'): yield lineNum, line elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): - yield lineNum, _processLine(lineNum, line, defines) + yield lineNum, _processLine(lineNum, line, defines, filename) \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/pyparseString.py b/fsabuilder/morfeuszbuilder/segrules/pyparseString.py new file mode 100644 index 0000000..e999f6c --- /dev/null +++ b/fsabuilder/morfeuszbuilder/segrules/pyparseString.py @@ -0,0 +1,19 @@ +''' +Created on 12 mar 2014 + +@author: mlenart +''' + +from pyparsing import ParseException +from morfeuszbuilder.utils import exceptions + +def pyparseString(rule, lineNum, line, filename): + try: + return rule.parseString(line, parseAll=True) + except ParseException as ex: + msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) + msg += line + '\n' + msg += (ex.col - 1) * ' ' + '^\n' + msg += ex.msg +# print unicode(exceptions.SegtypesException(msg)).encode('utf8') + raise exceptions.SegtypesException(msg) \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py index 5ff2f53..08ff8ad 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rules.py +++ b/fsabuilder/morfeuszbuilder/segrules/rules.py @@ -25,16 +25,17 @@ class SegmentRule(object): class TagRule(SegmentRule): - def __init__(self, segnum, segtype): + def __init__(self, segnum, shiftOrth, segtype): self.segnum = segnum self.segtype = segtype + self.shiftOrth = shiftOrth def addToNFA(self, fsa): endState = RulesNFAState(final=True) self._doAddToNFA(fsa.initialState, endState) def _doAddToNFA(self, startState, endState): - startState.addTransition(self.segnum, endState) + startState.addTransition((self.segnum, self.shiftOrth), endState) def __str__(self): return u'%s(%d)' % (self.segtype, self.segnum) @@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule): def __init__(self, child): super(ZeroOrMoreRule, self).__init__(child) + assert isinstance(child, SegmentRule) def addToNFA(self, fsa): raise ValueError() @@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule): def __str__(self): return u'(' + str(self.child) + ')*' - -class ShiftOrthRule(UnaryRule): - - def __init__(self, child): - super(ShiftOrthRule, self).__init__(child) - - def addToNFA(self, fsa): - raise ValueError() - - def _doAddToNFA(self, startState, endState): - self.child._doAddToNFA(startState, endState) - startState.setTransitionData(self.child.segnum, 1) - - def __str__(self): - return u'(' + str(self.child) + ')>' - -class ShiftOrthSameTypeRule(UnaryRule): - - def __init__(self, child): - super(ShiftOrthSameTypeRule, self).__init__(child) - - def addToNFA(self, fsa): - raise ValueError() - - def _doAddToNFA(self, startState, endState): - self.child._doAddToNFA(startState, endState) - startState.setTransitionData(self.child.segnum, 2) - - def __str__(self): - return u'(' + str(self.child) + ')!>' diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py new file mode 100644 index 0000000..65d63e4 --- /dev/null +++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py @@ -0,0 +1,73 @@ +''' +Created on 12 mar 2014 + +@author: mlenart +''' +import logging +from morfeuszbuilder.fsa import state +from morfeuszbuilder.utils.serializationUtils import htons + +class RulesState(state.State): + + def __init__(self): + super(RulesState, self).__init__() + self.weak = None + + def setAsAccepting(self, weak): + self.weak = weak + self.encodedData = bytearray([1 if weak else 0]) + + def getEncodedSize(self): + stateInfoSize = 2 # accepting info + transitionsNum + transitionsSize = 4 * len(self.transitionsMap) + return stateInfoSize + transitionsSize + +class RulesFSA(object): + + def __init__(self): + self.initialState = state.State() + self.ACCEPTING_FLAG = 1 + self.WEAK_FLAG = 2 + + def stateData2bytearray(self, state): + res = bytearray() + firstByte = 0 + if state.isAccepting(): + firstByte |= self.ACCEPTING_FLAG + if state.weak: + firstByte |= self.WEAK_FLAG + assert firstByte < 256 and firstByte >= 0 + res.append(firstByte) + + secondByte = len(state.transitionsMap) + assert secondByte < 256 and secondByte >= 0 + res.append(secondByte) + + return res + + def transitionsData2bytearray(self, state): + res = bytearray() +# logging.debug('next') + for (segnum, shiftOrth), nextState in state.transitionsMap.iteritems(): + res.append(segnum) + if shiftOrth: + res.append(1) + else: + res.append(0) + offset = nextState.offset + assert offset < 65536 +# res.append((offset & 0xFF0000) >> 16) + res.extend(htons(offset)) + return res + + def serialize(self): + self.initialState.calculateOffsets(sizeCounter=lambda s: s.getEncodedSize()) + res = bytearray() + + for state in sorted(self.initialState.dfs(set()), key=lambda s: s.offset): + res.extend(self.stateData2bytearray(state)) + res.extend(self.transitionsData2bytearray(state)) + + logging.info('Segmentation automaton size: %d bytes', len(res)) + print list(res) + return res diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py index 967993d..829f17f 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py @@ -4,7 +4,7 @@ Created on 20 lut 2014 @author: mlenart ''' import logging -from morfeuszbuilder.fsa.serializer import SimpleSerializer +from morfeuszbuilder.utils.serializationUtils import htons, htonl class RulesManager(object): @@ -52,9 +52,9 @@ class RulesManager(object): def _serializeDFA(self, dfa): res = bytearray() - serializer = SimpleSerializer(dfa, serializeTransitionsData=True) - dfaBytearray = serializer.fsa2bytearray() - res.extend(serializer.htonl(len(dfaBytearray))) +# serializer = SimpleSerializer(dfa, serializeTransitionsData=True) + dfaBytearray = dfa.serialize() + res.extend(htonl(len(dfaBytearray))) res.extend(dfaBytearray) return res diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py index 9e632f0..2fe36c1 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py @@ -4,7 +4,7 @@ Created on 24 sty 2014 @author: mlenart ''' -from morfeuszbuilder.fsa import fsa, state, encode +from morfeuszbuilder.segrules.rulesFSA import RulesFSA, RulesState class RulesNFAState(object): @@ -12,7 +12,7 @@ class RulesNFAState(object): def __init__(self, initial=False, final=False, weak=False): self.transitionsMap = {} - self.transitionsDataMap = {} +# self.transitionsDataMap = {} self.initial = initial self.final = final self.weak = weak @@ -20,13 +20,9 @@ class RulesNFAState(object): RulesNFAState.statesCounter += 1 def addTransition(self, label, targetState): + assert label is None or len(label) == 2 self.transitionsMap.setdefault(label, set()) self.transitionsMap[label].add(targetState) - self.transitionsDataMap[label] = 0 - - def setTransitionData(self, label, byte): - assert len(self.transitionsMap[label]) == 1 - self.transitionsDataMap[label] = byte def getClosure(self, visited): if self in visited: @@ -64,10 +60,11 @@ class RulesNFA(object): for nfaState in nfaStates: for label, nextStates in nfaState.transitionsMap.iteritems(): if label is not None: - transitionData = nfaState.transitionsDataMap[label] - res.setdefault((label, transitionData), set()) +# transitionData = nfaState.transitionsDataMap[label] + segnum, shiftOrth = label + res.setdefault((segnum, shiftOrth), set()) for nextNFAState in nextStates: - res[(label, transitionData)] |= nextNFAState.getClosure(set()) + res[(segnum, shiftOrth)] |= nextNFAState.getClosure(set()) return res def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): @@ -79,23 +76,24 @@ class RulesNFA(object): if final: # dfaState should be final # and contain info about weakness - dfaState.encodedData = bytearray([1 if weak else 0]) - for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): + dfaState.setAsAccepting(weak=weak) +# dfaState.encodedData = bytearray([1 if weak else 0]) + for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): key = frozenset(nextNFAStates) if key in nfaSubset2DFAState: nextDFAState = nfaSubset2DFAState[key] else: - nextDFAState = state.State() + nextDFAState = RulesState() nfaSubset2DFAState[key] = nextDFAState self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) - dfaState.setTransition(label, nextDFAState) - dfaState.setTransitionData(label, transitionData) + dfaState.setTransition((segnum, shiftOrth), nextDFAState) +# dfaState.setTransitionData(label, transitionData) def convertToDFA(self): - dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) + dfa = RulesFSA() startStates = self.initialState.getClosure(set()) assert not any(filter(lambda s: s.final, startStates)) - dfa.initialState = state.State(additionalData=False) + dfa.initialState = RulesState() self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) return dfa diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 22bda6c..5cd0e14 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -3,7 +3,7 @@ from pyparsing import * ParserElement.enablePackrat() from morfeuszbuilder.tagset import segtypes from morfeuszbuilder.utils import configFile, exceptions -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString import codecs import re @@ -48,8 +48,8 @@ class RulesParser(object): if not firstNFA: firstNFA = nfa combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') - combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) - for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): # print rule rule.addToNFA(nfa) # nfa.debug() @@ -60,25 +60,24 @@ class RulesParser(object): res.addDFA(key2Def, dfa) return res - def _doParse(self, combinationEnumeratedLines, segtypesHelper): + def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename): for lineNum, line in combinationEnumeratedLines: if not line.startswith('#'): - yield self._doParseOneLine(lineNum, line, segtypesHelper) + yield self._doParseOneLine(lineNum, line, segtypesHelper, filename) - def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): + def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): if not segtypesHelper.hasSegtype(segtype): raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) else: # return rules.TagRule(segtype) - return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), segtype) + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype) - def _doParseOneLine(self, lineNum, line, segtypesHelper): + def _doParseOneLine(self, lineNum, line, segtypesHelper, filename): rule = Forward() tagRule = Word(alphanums+'_') - shiftOrthRule = tagRule + '>' - shiftOrthSameTypeRule = tagRule + '!' + '>' + shiftOrthRule = Word(alphanums+'_') + Suppress('>') parenRule = Suppress('(') + rule + Suppress(')') - atomicRule = tagRule ^ shiftOrthRule ^ shiftOrthSameTypeRule ^ parenRule + atomicRule = tagRule ^ shiftOrthRule ^ parenRule zeroOrMoreRule = atomicRule + Suppress('*') oneOrMoreRule = atomicRule + Suppress('+') unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule @@ -87,13 +86,12 @@ class RulesParser(object): concatRule = OneOrMore(complexRule) rule << concatRule - tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) - shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) - shiftOrthSameTypeRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthSameTypeRule(toks[0])) + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) + shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) # parenRule.setParseAction(lambda string, loc, toks: toks[0]) zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) - parsedRule = rule.parseString(line, parseAll=True)[0] + parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] return parsedRule diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py index f7f4bdc..98704dd 100644 --- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py +++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py @@ -33,6 +33,7 @@ class Segtypes(object): raise exceptions.ConfigFileException(self.filename, lineNum, msg) def _readTags(self, segrulesConfigFile): + gotWildcardPattern = False for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): splitLine = re.split(r'\s+', line.strip()) self._validate( @@ -49,13 +50,27 @@ class Segtypes(object): lineNum, re.match(r'[a-z_\.\:\%]+', pattern)) + self._validate( + u'Pattern that matches everything must be the last one', + lineNum - 1, + not gotWildcardPattern) + if segtype in self.segtype2Segnum: segnum = self.segtype2Segnum[segtype] else: segnum = len(self.segtype2Segnum) self.segtype2Segnum[segtype] = segnum - self.patternsList.append(SegtypePattern(None, pattern, segnum)) + segtypePattern = SegtypePattern(None, pattern, segnum) + + self._validate( + u'There is no tag that matches pattern "%s".' % pattern, + lineNum, + any([segtypePattern.tryToMatch(None, tag) != -1 for tag in self.tagset.getAllTags()])) + + self.patternsList.append(segtypePattern) + + gotWildcardPattern = gotWildcardPattern or pattern == '%' self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) @@ -67,7 +82,7 @@ class Segtypes(object): lineNum, re.match(r'[a-z_]+', segtype)) self._validate( - u'Pattern must contain lemma and POS', + u'Pattern must contain lemma and part-of-speech fields', lineNum, re.match(r'.+\:[a-z_]+', pattern, re.U)) @@ -79,7 +94,14 @@ class Segtypes(object): lemma, pos = pattern.split(':') - self.patternsList.append(SegtypePattern(lemma, '%s|%s:%%' % (pos, pos), segnum)) + segtypePattern = SegtypePattern(lemma, pos + ':%', segnum) + + self._validate( + u'There is no tag that matches pattern "%s".' % (pos + ':%'), + lineNum, + any([segtypePattern.tryToMatch(lemma, tag) != -1 for tag in self.tagset.getAllTags()])) + + self.patternsList.append(segtypePattern) def _debugSegnums(self): for tagnum, segnum in self._tagnum2Segnum.items(): @@ -121,11 +143,6 @@ class Segtypes(object): if not res: res = self._tagnum2Segnum.get(tagnum, None) return res -# for p in self.patternsList: -# res = p.tryToMatch(lemma, tag) -# if res >= 0: -# return res -# return None class SegtypePattern(object): @@ -135,8 +152,13 @@ class SegtypePattern(object): self.segnum = segnum def tryToMatch(self, lemma, tag): +# tag2Match = tag + ':' if not tag.endswith(':') else tag +# print tag2Match + patterns2Match = [] + patterns2Match.append(self.pattern.replace('%', '.*')) + patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) if (self.lemma is None or self.lemma == lemma) \ - and re.match(self.pattern.replace('%', '.*'), tag): + and any([re.match(p, tag) for p in patterns2Match]): return self.segnum else: return -1 diff --git a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py new file mode 100644 index 0000000..f8ffe0e --- /dev/null +++ b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py @@ -0,0 +1,24 @@ +''' +Created on 12 mar 2014 + +@author: mlenart +''' + +# serialize uint16 as big endian +def htons(n): + assert n < 65536 + assert n >= 0 + res = bytearray() + res.append((n & 0x00FF00) >> 8) + res.append(n & 0x0000FF) + return res + +# serialize uint32 as big endian +def htonl(n): + assert n >= 0 + res = bytearray() + res.append((n & 0xFF000000) >> 24) + res.append((n & 0x00FF0000) >> 16) + res.append((n & 0x0000FF00) >> 8) + res.append(n & 0x000000FF) + return res diff --git a/input/dodatki.tab b/input/dodatki.tab index 27c9f05..2905309 100644 --- a/input/dodatki.tab +++ b/input/dodatki.tab @@ -41,13 +41,171 @@ z Z brev:pun ż Ż brev:pun ch Ch brev:pun st St brev:pun -0 0 dig -1 1 dig -2 2 dig -3 3 dig -4 4 dig -5 5 dig -6 6 dig -7 7 dig -8 8 dig -9 9 dig +poli poli prefa +poli poli prefs +niby niby prefa +niby niby prefs +eks eks prefs +ex ex prefs +euro euro prefa +euro euro prefs +mikro mikro prefs +mikro mikro prefa +makro makro prefa +makro makro prefs +bez bez prefa +do do prefv +do do prefa +dez dez prefv +dez dez prefa +dez dez prefs +ko ko prefa +ko ko prefs +między między prefa +między między prefs +na na prefa +na na prefs +na na prefv +nad nad prefa +nad nad prefs +nad nad prefv +o o prefv +ob ob prefv +od od prefa +od od prefs +od od prefv +pra pra prefs +post post prefa +post post prefs +pod pod prefa +pod pod prefs +pod pod prefv +poza poza prefa +ponad ponad prefa +pre pre prefa +pre pre prefs +pro pro prefa +pro pro prefs +prze prze prefa +prze prze prefv +przeciw przeciw prefa +przeciw przeciw prefs +re re prefa +re re prefs +re re prefv +przy przy prefa +przy przy prefv +roz roz prefv +u u prefv +samo samo prefa +samo samo prefs +video video prefs +video video prefa +w w prefv +wy wy prefv +współ współ prefv +współ współ prefa +współ współ prefs +wice wice prefs +neo neo prefa +neo neo prefs +tele tele prefs +tele tele prefa +z z prefv +za za prefv +za za prefa +za za prefs +wideo wideo prefa +wideo wideo prefs +meta meta prefs +meta meta prefa +multi multi prefa +multi multi prefs +mega mega prefa +mega mega prefs +kontra kontra prefs +kontra kontra prefa +inter inter prefa +inter inter prefs +homo homo prefs +homo homo prefa +ekstra ekstra prefa +ekstra ekstra prefs +giga giga prefa +giga giga prefs +bi bi prefs +bi bi prefa +auto auto prefs +auto auto prefa +de de prefv +de de prefa +de de prefs +ultra ultra prefs +ultra ultra prefa +e- e- prefa +e- e- prefs +mini mini prefs +mini mini prefa +maxi maxi prefs +maxi maxi prefa +midi midi prefs +midi midi prefa +arcy arcy prefs +arcy arcy prefa +anty anty prefa +anty anty prefs +a a prefa +a a prefs +pan pan prefs +pan pan prefa +in in prefa +in in prefs +dys dys prefs +dys dys prefa +mono mono prefa +mono mono prefs +porno porno prefs +porno porno prefa +anglo anglo prefa +aero aero prefs +aero aero prefa +bio bio prefs +bio bio prefa +wszystko wszystko prefs +wszystko wszystko prefa +wszech wszech prefs +wszech wszech prefa +śród śród prefs +śród śród prefa +audio audio prefs +audio audio prefa +eko eko prefs +eko eko prefa +s s prefv +elektro elektro prefs +elektro elektro prefa +trans trans prefa +trans trans prefs +kontr kontr prefs +kontr kontr prefa +pseudo pseudo prefs +pseudo pseudo prefa +quasi quasi prefs +quasi quasi prefa +super super prefs +super super prefa +po po prefv +po po prefa +po po prefs +sub sub prefs +sub sub prefa +hiper hiper prefa +hiper hiper prefs +non non prefs +non non prefa +stereo stereo prefa +stereo stereo prefs +energo energo prefa +para para prefa +para para prefs +ś ś prefv diff --git a/input/polimorf.tagset b/input/polimorf.tagset index e944ec2..aaf7a67 100644 --- a/input/polimorf.tagset +++ b/input/polimorf.tagset @@ -584,6 +584,9 @@ 579 interp 580 brev:pun 581 brev:npun +582 prefa +583 prefs +584 prefv [NAMES] diff --git a/input/segmenty.dat b/input/segmenty.dat index 6df8563..d110bbe 100644 --- a/input/segmenty.dat +++ b/input/segmenty.dat @@ -19,7 +19,7 @@ samotny # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: moze_interp(praet_sg_na) -# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „moze”: moze_interp(praet_sg) # przeszlik mnogi, np. „czytali”: @@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg) # np. „gnietli·by·śmy” moze_interp(praet_pl by aglpl) #else -moze_interp(praetcond) +# moze_interp(praetcond) #endif - # np. „by·ś” moze_interp(by aglsg) # np. „by·ście” @@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj ) # adja dywiz adja dywiz adja dywiz adj interp? # adja dywiz adja dywiz adja dywiz adja dywiz adj interp? -# Stopień najwyższy: -# np. „naj·zieleńszy”, „naj·mądrzej” -moze_interp( naj> adj_sup ) +# Formy zanegowane stopnia wyższego przymiotników i przysłówków (WK) +# np. „nie·grzeczniejszy”, „nie·grzeczniej” +moze_interp( nie> adj_com ) # Formy „zanegowane” gerundiów i imiesłowów: # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: @@ -112,15 +111,21 @@ moze_interp(z_on_agl) moze_interp(z_on_agl on_agl) # Liczba zapisana jako ciąg cyfr: -moze_interp( dig!>+ ) +moze_interp( dig ) # Formacje prefiksalne #### trzeba wydzielić odpowiednie samodze! -# rzeczownikowe i przymiotnikowe -# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy” -moze_interp( prefs samodz ) +# rzeczownikowe +# np. „euro·sodoma”, „e-·papieros” +moze_interp(nomina) +moze_interp( prefs> nomina ) # czasownikowe np. „po·nakapywać” -moze_interp( prefv samodz ) +moze_interp(verba_imperf) +moze_interp( prefv> verba_imperf ) +# przymiotnikowe np. „do·żylny”, „euro·sodomski”, „bez·argumentowy” +moze_interp(adjectiva) +moze_interp(prefa> adj) +moze_interp( prefa> adjectiva ) # Apozycje z dywizem # np. „kobieta-prezydent” @@ -133,11 +138,28 @@ adj dywiz samodz # ? samodz dywiz adj +#### PONIŻEJ REGUŁY WK +# Stopień najwyższy: +# np. „naj·zieleńszy”, „naj·mądrzej” +moze_interp( naj> adj_sup ) +# Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj +moze_interp( praet_sg dywiz li) +moze_interp( praet_pl dywiz li) +moze_interp( praet_sg_na dywiz li) +moze_interp( fin dywiz li) + +# i bez dywizu --- czy bez dywizu jest sens to łapać? +#moze_interp( praet_sg li) +#moze_interp( praet_pl li) +#moze_interp( praet_sg_na li) +#moze_interp( fin li) + [segment types] naj nie prefs prefv +prefa dig adja adj @@ -161,11 +183,14 @@ naj naj nie nie prefs prefs prefv prefv +prefa prefa dig dig adja adja adj adj:%:pos adj_sup adj:%:sup adj_sup adv:sup +adj_com adj:%:com +adj_com adj:%:com negat ger:%:neg negat pact:%:neg negat ppas:%:neg @@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep z_on_agl prep:% samotny brev:pun samotny brev:npun -samotny intrj +samotny interj interp interp aglsg aglt:sg:% aglpl aglt:pl:% -praetcond cond:% -praetcond praet:%:pri:% -praetcond praet:%:sec:% -praetcond praet:%:ter:% praet_sg_agl praet:sg:%:agl praet_sg_na praet:sg:%:nagl praet_sg praet:sg:% praet_pl praet:pl:% praet_sg winien:sg:% praet_pl winien:pl:% +fin fin:% +nomina subst:% +nomina ger:% +nomina depr:% +adjectiva adv:% +adjectiva ppas:% +adjectiva pact:% +verba_imperf praet:%:imperf +verba_imperf fin:%:imperf +verba_imperf inf:imperf +verba_imperf imps:imperf +verba_imperf impt:%:imperf samodz % [lexemes] z_aglt aby:comp z_aglt bowiem:comp by by:qub +li li:qub z_aglt by:comp z_aglt cóż:subst z_aglt czemu:adv diff --git a/input/segmenty1.dat b/input/segmenty1.dat index 228de14..031707c 100644 --- a/input/segmenty1.dat +++ b/input/segmenty1.dat @@ -7,9 +7,10 @@ praet=split composite #define moze_interp(segmenty) wsz_interp segmenty wsz_interp +dig>* dig (adja dywiz)+ adj -dig!>+ -dig!> dig!> dig!> +#dig!>+ +#dig!> dig!> dig!> naj> adj_sup [segment types] @@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep z_on_agl prep:% samotny brev:pun samotny brev:npun -samotny intrj +samotny interj interp interp aglsg aglt:sg:% aglpl aglt:pl:% -praetcond cond:% -praetcond praet:%:pri:% -praetcond praet:%:sec:% -praetcond praet:%:ter:% -praet_sg_agl praet:sg:%:agl -praet_sg_na praet:sg:%:nagl -praet_sg praet:sg:% -praet_pl praet:pl:% -praet_sg winien:sg:% -praet_pl winien:pl:% samodz % [lexemes] diff --git a/morfeusz/InterpretedChunk.hpp b/morfeusz/InterpretedChunk.hpp index 2a51a10..4e49d2c 100644 --- a/morfeusz/InterpretedChunk.hpp +++ b/morfeusz/InterpretedChunk.hpp @@ -17,7 +17,6 @@ struct InterpretedChunk { std::vector<uint32_t> lowercaseCodepoints; InterpsGroup interpsGroup; bool shiftOrth; - bool shiftOrthSameType; bool orthWasShifted; std::vector<InterpretedChunk> prefixChunks; }; diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index 30c1a97..aa400cd 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() { return res; } +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { + SegrulesOptions opts; + opts["aggl"] = "isolated"; + opts["praet"] = "split"; + return (*(map.find(opts))).second; +} + Morfeusz::Morfeusz() : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), analyzerPtr(DEFAULT_FSA), analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), +currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), isAnalyzerFSAFromFile(false), generatorPtr(DEFAULT_SYNTH_FSA), isGeneratorFSAFromFile(false), @@ -50,9 +58,9 @@ options(createDefaultOptions()) { } -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSAType*>& fsasMap) { +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) { for ( - std::map<SegrulesOptions, SegrulesFSAType*>::iterator it = fsasMap.begin(); + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin(); it != fsasMap.end(); ++it) { delete it->second; @@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord( vector<InterpretedChunk> accum; FlexionGraph graph; const char* currInput = inputStart; - SegrulesOptions opts; - opts["aggl"] = "isolated"; - opts["praet"] = "split"; - SegrulesFSAType* segrulesFSA = (*(this->segrulesFSAsMap.find(opts))).second; - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->getInitialState()); + SegrulesFSA* segrulesFSA = this->currSegrulesFSA; + doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState); if (!graph.empty()) { InterpretedChunksDecoder interpretedChunksDecoder(env); int srcNode = startNodeNum; @@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord( srcNode++; } // graph.getResults(*this->tagset, results); - } else if (inputStart != inputEnd) { + } + else if (inputStart != inputEnd) { this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); } inputStart = currInput; @@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord( static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { to.prefixChunks.insert( - to.prefixChunks.begin(), - from.prefixChunks.begin(), - from.prefixChunks.end()); + to.prefixChunks.begin(), + from.prefixChunks.begin(), + from.prefixChunks.end()); to.prefixChunks.push_back(from); from.orthWasShifted = true; } @@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord( const char* inputEnd, vector<InterpretedChunk>& accum, FlexionGraph& graph, - SegrulesStateType segrulesState) const { + SegrulesState segrulesState) const { + // cerr << "doAnalyzeOneWord " << inputData << endl; bool endOfWord = inputData == inputEnd; const char* currInput = inputData; uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); @@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord( vector<InterpsGroup> val(state.getValue()); for (unsigned int i = 0; i < val.size(); i++) { InterpsGroup& ig = val[i]; - cerr << (int) ig.type << endl; - SegrulesStateType newSegrulesState = segrulesState; - newSegrulesState.proceedToNext(ig.type); - if (!newSegrulesState.isSink()) { - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; - if (!accum.empty() - && (accum.back().shiftOrth - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { + // newSegrulesState.proceedToNext(ig.type); + // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates); + set<SegrulesState> newSegrulesStates; + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); + for ( + set<SegrulesState>::iterator it = newSegrulesStates.begin(); + it != newSegrulesStates.end(); + it++) { + SegrulesState newSegrulesState = *it; + // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; + // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; + InterpretedChunk ic = { + inputData, + originalCodepoints, + lowercaseCodepoints, + ig, + newSegrulesState.shiftOrthFromPrevious, + false, + vector<InterpretedChunk>() + }; + if (!accum.empty() && accum.back().shiftOrth) { doShiftOrth(accum.back(), ic); } accum.push_back(ic); @@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord( this->env.getCharsetConverter().next(currInput, inputEnd); } } + // cerr << "end of word" << endl; // we are at the end of word if (state.isAccepting()) { vector<InterpsGroup > val(state.getValue()); for (unsigned int i = 0; i < val.size(); i++) { InterpsGroup& ig = val[i]; - SegrulesStateType newSegrulesState = segrulesState; - newSegrulesState.proceedToNext(ig.type); - if (newSegrulesState.isAccepting()) { - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; - if (!accum.empty() - && (accum.back().shiftOrth - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { - doShiftOrth(accum.back(), ic); + // cerr << "currInput=" << currInput << endl; + // cerr << "type=" << (int) ig.type << endl; + set<SegrulesState> newSegrulesStates; + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); + for ( + set<SegrulesState>::iterator it = newSegrulesStates.begin(); + it != newSegrulesStates.end(); + it++) { + SegrulesState newSegrulesState = *it; + if (newSegrulesState.accepting) { + InterpretedChunk ic = { + inputData, + originalCodepoints, + lowercaseCodepoints, + ig, + newSegrulesState.shiftOrthFromPrevious, + false, + vector<InterpretedChunk>()}; + if (!accum.empty() && accum.back().shiftOrth) { + doShiftOrth(accum.back(), ic); + } + accum.push_back(ic); + graph.addPath(accum); + accum.pop_back(); } - accum.push_back(ic); - graph.addPath(accum); - accum.pop_back(); - } else if (!newSegrulesState.isSink()) { - } else { } } } diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 6e20424..7000657 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -12,6 +12,7 @@ #include <list> #include <vector> #include <map> +#include <set> #include "EncodedInterpretation.hpp" #include "fsa/fsa.hpp" #include "MorphInterpretation.hpp" @@ -27,6 +28,7 @@ #include "Environment.hpp" #include "segrules/segrules.hpp" +#include "segrules/SegrulesFSA.hpp" class Morfeusz; class ResultsIterator; @@ -111,7 +113,7 @@ private: const char* inputEnd, std::vector<InterpretedChunk>& accum, FlexionGraph& graph, - SegrulesStateType segrulesState) const; + SegrulesState segrulesState) const; void appendIgnotiumToResults( const std::string& word, @@ -120,17 +122,13 @@ private: Environment env; const unsigned char* analyzerPtr; FSAType* analyzerFSA; - std::map<SegrulesOptions, SegrulesFSAType*> segrulesFSAsMap; + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; + SegrulesFSA* currSegrulesFSA; bool isAnalyzerFSAFromFile; const unsigned char* generatorPtr; bool isGeneratorFSAFromFile; Generator generator; -// const CharsetConverter* charsetConverter; -// const Tagset* tagset; -// const CaseConverter* caseConverter; -// -// UTF8CharsetConverter utf8CharsetConverter; MorfeuszOptions options; }; diff --git a/morfeusz/segrules/SegrulesFSA.hpp b/morfeusz/segrules/SegrulesFSA.hpp new file mode 100644 index 0000000..70684b1 --- /dev/null +++ b/morfeusz/segrules/SegrulesFSA.hpp @@ -0,0 +1,71 @@ +/* + * File: SegrulesFSA.hpp + * Author: mlenart + * + * Created on 12 marzec 2014, 17:52 + */ + +#ifndef SEGRULESFSA_HPP +#define SEGRULESFSA_HPP + +#include <set> +#include "../endianness.hpp" + +struct SegrulesState { + uint16_t offset; + bool accepting; + bool weak; + bool shiftOrthFromPrevious; +}; + +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) +{ + return s1.offset < s2.offset; +} + +class SegrulesFSA { +public: + SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) { + SegrulesState state = {0, false, false, false}; + initialState = state; + } + + void proceedToNext( + const unsigned char segnum, + const SegrulesState state, + std::set<SegrulesState>& newStates) const { + + const unsigned char* currPtr = ptr + state.offset; + currPtr++; + const unsigned char transitionsNum = *currPtr; + currPtr++; + for (unsigned int i = 0; i < transitionsNum; i++) { + if (*currPtr == segnum) { + newStates.insert(newStates.begin(), this->transition2State(currPtr)); + } + currPtr += 4; + } + } + + virtual ~SegrulesFSA() {} + + SegrulesState initialState; +private: + const unsigned char* ptr; + + SegrulesState transition2State(const unsigned char* transitionPtr) const { + unsigned char ACCEPTING_FLAG = 1; + unsigned char WEAK_FLAG = 2; + SegrulesState res; + transitionPtr++; + res.shiftOrthFromPrevious = *transitionPtr; + transitionPtr++; + res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); + res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; + res.weak = *(ptr + res.offset) & WEAK_FLAG; + return res; + } +}; + +#endif /* SEGRULESFSA_HPP */ + diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp index 6e48f62..c08d626 100644 --- a/morfeusz/segrules/segrules.cpp +++ b/morfeusz/segrules/segrules.cpp @@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { return res; } -static inline SegrulesFSAType* deserializeFSA(const unsigned char*& ptr) { +static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { uint32_t fsaSize = deserializeUint32(ptr); - static SegrulesDeserializer deserializer; - SegrulesFSAType* res = SegrulesFSAType::getFSA(ptr, deserializer); +// static SegrulesDeserializer deserializer; + SegrulesFSA* res = new SegrulesFSA(ptr); ptr += fsaSize; return res; } -map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { - map<SegrulesOptions, SegrulesFSAType*> res; +map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { + map<SegrulesOptions, SegrulesFSA*> res; const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr); const unsigned char* currPtr = fsasMapPtr; unsigned char fsasNum = *currPtr; currPtr++; for (unsigned char i = 0; i < fsasNum; i++) { SegrulesOptions options = deserializeOptions(currPtr); - SegrulesFSAType* fsa = deserializeFSA(currPtr); + SegrulesFSA* fsa = deserializeFSA(currPtr); res[options] = fsa; } return res; diff --git a/morfeusz/segrules/segrules.hpp b/morfeusz/segrules/segrules.hpp index f7b47a2..5bf3b7a 100644 --- a/morfeusz/segrules/segrules.hpp +++ b/morfeusz/segrules/segrules.hpp @@ -11,13 +11,13 @@ #include <utility> #include <map> #include <string> -#include "../fsa/fsa.hpp" +#include "SegrulesFSA.hpp" typedef std::map<std::string, std::string> SegrulesOptions; -typedef State<unsigned char> SegrulesStateType; -typedef FSA<unsigned char> SegrulesFSAType; +//typedef State<unsigned char> SegrulesStateType; +//typedef FSA<unsigned char> SegrulesFSAType; -std::map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); +std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); #endif /* SEGRULES_HPP */ diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index a1925b9..98bf4ca 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -106,14 +106,20 @@ </makeTool> </makefileType> <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" ex="false" @@ -121,7 +127,6 @@ flavor2="8"> <ccTool> <incDir> - <pElem>build</pElem> <pElem>/usr/lib/jvm/default-java/include</pElem> <pElem>morfeusz</pElem> <pElem>build/morfeusz/java</pElem> @@ -145,7 +150,6 @@ flavor2="8"> <ccTool> <incDir> - <pElem>build</pElem> <pElem>/usr/include/python2.7</pElem> <pElem>morfeusz</pElem> <pElem>build/morfeusz/python</pElem> @@ -173,9 +177,8 @@ <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> <incDir> - <pElem>build1</pElem> <pElem>morfeusz</pElem> - <pElem>build1/morfeusz</pElem> + <pElem>morfeusz/build/morfeusz</pElem> </incDir> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> @@ -185,9 +188,8 @@ <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> <incDir> - <pElem>build1</pElem> <pElem>morfeusz</pElem> - <pElem>build1/morfeusz</pElem> + <pElem>morfeusz/build/morfeusz</pElem> </incDir> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> @@ -266,12 +268,18 @@ </preprocessorList> </ccTool> </folder> - <folder path="morfeusz/java"> + <folder path="morfeusz"> <ccTool> <incDir> <pElem>build</pElem> + </incDir> + </ccTool> + </folder> + <folder path="morfeusz/java"> + <ccTool> + <incDir> <pElem>morfeusz</pElem> - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> + <pElem>/usr/lib/jvm/default-java/include</pElem> </incDir> <preprocessorList> <Elem>libjmorfeusz_EXPORTS</Elem> @@ -281,7 +289,6 @@ <folder path="morfeusz/python"> <ccTool> <incDir> - <pElem>build</pElem> <pElem>/usr/include/python2.7</pElem> <pElem>morfeusz</pElem> </incDir> @@ -407,18 +414,26 @@ </ccTool> </item> <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/CharsetConverter.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/conversion_tables.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> @@ -507,8 +522,12 @@ ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="0">