Commit 00e66248a61ae340a23b5635cfc761be6dbf38cd

Authored by Michał Lenart
1 parent a6f0d912

poprawiona obsługa segmentacji (działają już cyfry tak, jak na początku ustalono)

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@112 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
... ... @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "")
36 36 if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE")
37 37 set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt)
38 38 else ()
39   - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab")
  39 + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorf-0.6.7.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab")
40 40 endif ()
41 41 endif ()
42 42  
... ...
buildAll.sh
... ... @@ -10,7 +10,7 @@ function build {
10 10 targets=$@
11 11  
12 12 srcDir=`pwd`
13   - buildDir=build/build-$os-$arch
  13 + buildDir=buildall/build-$os-$arch
14 14 targetDir=$srcDir/target/$os-$arch
15 15 toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake
16 16  
... ...
fsabuilder/buildfsa.py
... ... @@ -261,8 +261,9 @@ def main(opts):
261 261 if __name__ == '__main__':
262 262 import os
263 263 opts = _parseOptions()
264   - try:
265   - main(opts)
266   - except Exception as ex:
267   - print >> sys.stderr, unicode(ex).encode('utf8')
  264 +# try:
  265 + main(opts)
  266 +# except Exception as ex:
  267 +# raise ex
  268 +# print >> sys.stderr, unicode(ex).encode('utf8')
268 269  
... ...
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -113,12 +113,13 @@ class FSA(object):
113 113 return q
114 114  
115 115 def calculateOffsets(self, sizeCounter):
116   - currReverseOffset = 0
117   - for state in self.initialState.dfs(set()):
118   - currReverseOffset += sizeCounter(state)
119   - state.reverseOffset = currReverseOffset
120   - for state in self.initialState.dfs(set()):
121   - state.offset = currReverseOffset - state.reverseOffset
  116 + self.initialState.calculateOffsets(sizeCounter)
  117 +# currReverseOffset = 0
  118 +# for state in self.initialState.dfs(set()):
  119 +# currReverseOffset += sizeCounter(state)
  120 +# state.reverseOffset = currReverseOffset
  121 +# for state in self.initialState.dfs(set()):
  122 +# state.offset = currReverseOffset - state.reverseOffset
122 123  
123 124 def debug(self):
124 125 for state in self.initialState.dfs(set()):
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -6,6 +6,7 @@ Created on Oct 20, 2013
6 6  
7 7 import logging
8 8 from state import State
  9 +from morfeuszbuilder.utils.serializationUtils import *
9 10  
10 11 class Serializer(object):
11 12  
... ... @@ -63,7 +64,7 @@ class Serializer(object):
63 64 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
64 65 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
65 66 fsaData.extend(self.state2bytearray(state))
66   - res.extend(self.htonl(len(fsaData)))
  67 + res.extend(htonl(len(fsaData)))
67 68 res.extend(fsaData)
68 69 res.extend(self.serializeEpilogue(additionalData, moreAdditionalData))
69 70 return res
... ... @@ -71,9 +72,9 @@ class Serializer(object):
71 72 def _serializeTags(self, tagsMap):
72 73 res = bytearray()
73 74 numOfTags = len(tagsMap)
74   - res.extend(self.htons(numOfTags))
  75 + res.extend(htons(numOfTags))
75 76 for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
76   - res.extend(self.htons(tagnum))
  77 + res.extend(htons(tagnum))
77 78 res.extend(self.fsa.encodeWord(tag))
78 79 res.append(0)
79 80 return res
... ... @@ -86,25 +87,6 @@ class Serializer(object):
86 87 res.extend(self._serializeTags(tagset._name2namenum))
87 88 return res
88 89  
89   - # serialize uint16 as big endian
90   - def htons(self, n):
91   - assert n < 65536
92   - assert n >= 0
93   - res = bytearray()
94   - res.append((n & 0x00FF00) >> 8)
95   - res.append(n & 0x0000FF)
96   - return res
97   -
98   - # serialize uint32 as big endian
99   - def htonl(self, n):
100   - assert n >= 0
101   - res = bytearray()
102   - res.append((n & 0xFF000000) >> 24)
103   - res.append((n & 0x00FF0000) >> 16)
104   - res.append((n & 0x0000FF00) >> 8)
105   - res.append(n & 0x000000FF)
106   - return res
107   -
108 90 def serializePrologue(self):
109 91 res = bytearray()
110 92  
... ... @@ -126,7 +108,7 @@ class Serializer(object):
126 108 res = bytearray()
127 109 additionalDataSize = len(additionalData) if additionalData else 0
128 110 moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0
129   - res.extend(self.htonl(additionalDataSize))
  111 + res.extend(htonl(additionalDataSize))
130 112  
131 113 # add additional data itself
132 114 if additionalDataSize:
... ...
fsabuilder/morfeuszbuilder/fsa/state.py
... ... @@ -13,7 +13,7 @@ class State(object):
13 13  
14 14 def __init__(self, additionalData=None):
15 15 self.transitionsMap = {}
16   - self.transitionsDataMap = {}
  16 +# self.transitionsDataMap = {}
17 17 self.freq = 0
18 18 self.encodedData = None
19 19 self.reverseOffset = None
... ... @@ -29,11 +29,11 @@ class State(object):
29 29 def transitionsNum(self):
30 30 return len(self.transitionsMap)
31 31  
32   - def setTransition(self, byte, nextState):
33   - self.transitionsMap[byte] = nextState
34   -
35   - def setTransitionData(self, byte, data):
36   - self.transitionsDataMap[byte] = data
  32 + def setTransition(self, label, nextState):
  33 + self.transitionsMap[label] = nextState
  34 +#
  35 +# def setTransitionData(self, byte, data):
  36 +# self.transitionsDataMap[byte] = data
37 37  
38 38 def hasNext(self, byte):
39 39 return byte in self.transitionsMap
... ... @@ -68,6 +68,14 @@ class State(object):
68 68 yield state1
69 69 yield self
70 70  
    def calculateOffsets(self, sizeCounter):
        # Assign serialization offsets to every state reachable from this one.
        #
        # sizeCounter -- callable mapping a state to its encoded size in bytes.
        #
        # First pass: walk the automaton once and accumulate sizes, storing in
        # each state its "reverse offset" (running total at the point the state
        # is visited; traversal order is whatever self.dfs yields).
        currReverseOffset = 0
        for state in self.dfs(set()):
            currReverseOffset += sizeCounter(state)
            state.reverseOffset = currReverseOffset
        # Second pass: convert reverse offsets into forward byte offsets from
        # the start of the serialized image (total size minus reverse offset).
        for state in self.dfs(set()):
            state.offset = currReverseOffset - state.reverseOffset
71 79 def debug(self):
72 80 print '----------------'
73 81 print 'STATE:', self.idx, 'accepting', self.isAccepting()
... ...
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... ... @@ -7,6 +7,7 @@ Created on 23 sty 2014
7 7 import re
8 8 from pyparsing import *
9 9 from morfeuszbuilder.utils import exceptions
  10 +from pyparseString import pyparseString
10 11  
11 12 identifier = Word(alphas, bodyChars=alphanums+u'_>*+!')
12 13 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
... ... @@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines):
54 55 else:
55 56 return defineName
56 57  
57   -def _processLine(lineNum, line, defines):
  58 +def _processLine(lineNum, line, defines, filename):
58 59 if line.strip():
59 60  
60 61 rule = Forward()
... ... @@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines):
67 68 rule.setParseAction(lambda s, l, t: ' '.join(t))
68 69 defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines))
69 70 localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines))
70   - try:
71   - return rule.parseString(line, parseAll=True)[0]
72   - except ParseException as ex:
73   - msg = u'Preprocessing of segmentation rules failed.\n'
74   - msg += line + '\n'
75   - msg += (ex.col - 1) * ' ' + '^\n'
76   - msg += ex.msg
77   -# print unicode(exceptions.SegtypesException(msg)).encode('utf8')
78   - raise exceptions.SegtypesException(msg)
  71 + return pyparseString(rule, lineNum, line, filename)[0]
79 72 else:
80 73 return line
81 74  
82   -def preprocess(inputLines, defs):
  75 +def preprocess(inputLines, defs, filename):
83 76 defines = {}
84 77 ifdefsStack = []
85 78 for lineNum, line in inputLines:
86 79 if line.startswith('#define'):
87   - parsedDefine = list(define.parseString(line))
  80 + parsedDefine = list(pyparseString(define, lineNum, line, filename))
88 81 if len(parsedDefine) == 2:
89 82 name, val = parsedDefine
90 83 defines[name] = NonArgDefine(name, val)
... ... @@ -92,15 +85,16 @@ def preprocess(inputLines, defs):
92 85 name, arg, val = parsedDefine
93 86 localDefines = defines.copy()
94 87 localDefines[arg] = NonArgDefine(arg, arg)
95   - val = _processLine(lineNum, val, localDefines)
  88 + val = _processLine(lineNum, val, localDefines, filename)
96 89 defines[name] = ArgDefine(name, arg, val)
97 90 elif line.startswith('#ifdef'):
98   - name = ifdef.parseString(line)[0]
  91 + name = pyparseString(ifdef, lineNum, line, filename)[0]
  92 +# name = ifdef.parseString(line)[0]
99 93 ifdefsStack.append(name)
100 94 elif line.startswith('#endif'):
101 95 ifdefsStack.pop()
102 96 elif line.startswith('#'):
103 97 yield lineNum, line
104 98 elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)):
105   - yield lineNum, _processLine(lineNum, line, defines)
  99 + yield lineNum, _processLine(lineNum, line, defines, filename)
106 100  
107 101 \ No newline at end of file
... ...
fsabuilder/morfeuszbuilder/segrules/pyparseString.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +from pyparsing import ParseException
  8 +from morfeuszbuilder.utils import exceptions
  9 +
def pyparseString(rule, lineNum, line, filename):
    '''
    Parse a single configuration line with the given pyparsing rule.

    rule -- a pyparsing parser element (parsed with parseAll=True)
    lineNum -- 1-based line number, used only for the error message
    line -- the text to parse
    filename -- source file name, used only for the error message

    Returns the pyparsing ParseResults on success.
    Raises SegtypesException with a caret-annotated, location-prefixed
    message when parsing fails.
    '''
    try:
        return rule.parseString(line, parseAll=True)
    except ParseException as ex:
        # Build a readable error: "file:line" header, the offending line,
        # and a '^' marker under the failing column.
        msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
        msg += line + '\n'
        msg += (ex.col - 1) * ' ' + '^\n'
        msg += ex.msg
        raise exceptions.SegtypesException(msg)
0 20 \ No newline at end of file
... ...
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -25,16 +25,17 @@ class SegmentRule(object):
25 25  
class TagRule(SegmentRule):
    """Leaf segmentation rule matching a single segment of a given type."""

    def __init__(self, segnum, shiftOrth, segtype):
        # segnum: numeric id assigned to the segment type
        # shiftOrth: True for segments written with the '>' suffix in the
        #            rule syntax (orth-shifting); serialized with the transition
        # segtype: human-readable segment type name (used by __str__)
        self.segnum = segnum
        self.segtype = segtype
        self.shiftOrth = shiftOrth

    def addToNFA(self, fsa):
        # Attach this rule to the NFA's initial state, ending in a fresh
        # final state.
        endState = RulesNFAState(final=True)
        self._doAddToNFA(fsa.initialState, endState)

    def _doAddToNFA(self, startState, endState):
        # Transition labels are (segnum, shiftOrth) pairs.
        startState.addTransition((self.segnum, self.shiftOrth), endState)

    def __str__(self):
        return u'%s(%d)' % (self.segtype, self.segnum)
... ... @@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule):
92 93  
93 94 def __init__(self, child):
94 95 super(ZeroOrMoreRule, self).__init__(child)
  96 + assert isinstance(child, SegmentRule)
95 97  
96 98 def addToNFA(self, fsa):
97 99 raise ValueError()
... ... @@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule):
108 110  
109 111 def __str__(self):
110 112 return u'(' + str(self.child) + ')*'
111   -
112   -class ShiftOrthRule(UnaryRule):
113   -
114   - def __init__(self, child):
115   - super(ShiftOrthRule, self).__init__(child)
116   -
117   - def addToNFA(self, fsa):
118   - raise ValueError()
119   -
120   - def _doAddToNFA(self, startState, endState):
121   - self.child._doAddToNFA(startState, endState)
122   - startState.setTransitionData(self.child.segnum, 1)
123   -
124   - def __str__(self):
125   - return u'(' + str(self.child) + ')>'
126   -
127   -class ShiftOrthSameTypeRule(UnaryRule):
128   -
129   - def __init__(self, child):
130   - super(ShiftOrthSameTypeRule, self).__init__(child)
131   -
132   - def addToNFA(self, fsa):
133   - raise ValueError()
134   -
135   - def _doAddToNFA(self, startState, endState):
136   - self.child._doAddToNFA(startState, endState)
137   - startState.setTransitionData(self.child.segnum, 2)
138   -
139   - def __str__(self):
140   - return u'(' + str(self.child) + ')!>'
... ...
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +import logging
  7 +from morfeuszbuilder.fsa import state
  8 +from morfeuszbuilder.utils.serializationUtils import htons
  9 +
class RulesState(state.State):
    """DFA state of the segmentation-rules automaton.

    Extends the generic FSA state with a 'weak' acceptance flag that is
    encoded into the serialized automaton image.
    """

    def __init__(self):
        super(RulesState, self).__init__()
        # None until setAsAccepting() is called, then True/False.
        self.weak = None

    def setAsAccepting(self, weak):
        # Mark this state as accepting; encodedData carries a single byte:
        # 1 for a weak acceptance, 0 for a strong one.
        self.weak = weak
        self.encodedData = bytearray([1]) if weak else bytearray([0])

    def getEncodedSize(self):
        # Two header bytes (flags byte + transition count) followed by
        # four bytes per outgoing transition.
        return 2 + 4 * len(self.transitionsMap)
  24 +
class RulesFSA(object):
    """Deterministic automaton for segmentation rules, with its own compact
    binary serialization (2-byte state header + 4 bytes per transition)."""

    def __init__(self):
        self.initialState = state.State()
        # bit flags stored in the first byte of each serialized state
        self.ACCEPTING_FLAG = 1
        self.WEAK_FLAG = 2

    def stateData2bytearray(self, state):
        """Serialize the 2-byte state header: flags byte + transitions count."""
        res = bytearray()
        firstByte = 0
        if state.isAccepting():
            firstByte |= self.ACCEPTING_FLAG
        if state.weak:
            firstByte |= self.WEAK_FLAG
        assert 0 <= firstByte < 256
        res.append(firstByte)

        # second byte: number of outgoing transitions (must fit in one byte)
        secondByte = len(state.transitionsMap)
        assert 0 <= secondByte < 256
        res.append(secondByte)

        return res

    def transitionsData2bytearray(self, state):
        """Serialize all transitions of *state*: for each one, the segment
        number, a shift-orth flag byte and the 16-bit target offset."""
        res = bytearray()
        for (segnum, shiftOrth), nextState in state.transitionsMap.iteritems():
            res.append(segnum)
            res.append(1 if shiftOrth else 0)
            offset = nextState.offset
            # offsets are serialized as uint16, so the automaton image
            # must stay below 64 KiB
            assert offset < 65536
            res.extend(htons(offset))
        return res

    def serialize(self):
        """Return the whole automaton as a bytearray, states ordered by
        their precomputed offsets."""
        self.initialState.calculateOffsets(sizeCounter=lambda s: s.getEncodedSize())
        res = bytearray()

        for state in sorted(self.initialState.dfs(set()), key=lambda s: s.offset):
            res.extend(self.stateData2bytearray(state))
            res.extend(self.transitionsData2bytearray(state))

        logging.info('Segmentation automaton size: %d bytes', len(res))
        return res
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -4,7 +4,7 @@ Created on 20 lut 2014
4 4 @author: mlenart
5 5 '''
6 6 import logging
7   -from morfeuszbuilder.fsa.serializer import SimpleSerializer
  7 +from morfeuszbuilder.utils.serializationUtils import htons, htonl
8 8  
9 9 class RulesManager(object):
10 10  
... ... @@ -52,9 +52,9 @@ class RulesManager(object):
52 52  
53 53 def _serializeDFA(self, dfa):
54 54 res = bytearray()
55   - serializer = SimpleSerializer(dfa, serializeTransitionsData=True)
56   - dfaBytearray = serializer.fsa2bytearray()
57   - res.extend(serializer.htonl(len(dfaBytearray)))
  55 +# serializer = SimpleSerializer(dfa, serializeTransitionsData=True)
  56 + dfaBytearray = dfa.serialize()
  57 + res.extend(htonl(len(dfaBytearray)))
58 58 res.extend(dfaBytearray)
59 59 return res
60 60  
... ...
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... ... @@ -4,7 +4,7 @@ Created on 24 sty 2014
4 4 @author: mlenart
5 5 '''
6 6  
7   -from morfeuszbuilder.fsa import fsa, state, encode
  7 +from morfeuszbuilder.segrules.rulesFSA import RulesFSA, RulesState
8 8  
9 9 class RulesNFAState(object):
10 10  
... ... @@ -12,7 +12,7 @@ class RulesNFAState(object):
12 12  
13 13 def __init__(self, initial=False, final=False, weak=False):
14 14 self.transitionsMap = {}
15   - self.transitionsDataMap = {}
  15 +# self.transitionsDataMap = {}
16 16 self.initial = initial
17 17 self.final = final
18 18 self.weak = weak
... ... @@ -20,13 +20,9 @@ class RulesNFAState(object):
20 20 RulesNFAState.statesCounter += 1
21 21  
    def addTransition(self, label, targetState):
        # label: None or a (segnum, shiftOrth) pair; None presumably marks an
        # epsilon transition (it is skipped by _groupOutputByLabels) — confirm
        # against getClosure. Several target states per label are allowed (NFA).
        assert label is None or len(label) == 2
        self.transitionsMap.setdefault(label, set())
        self.transitionsMap[label].add(targetState)
25   - self.transitionsDataMap[label] = 0
26   -
27   - def setTransitionData(self, label, byte):
28   - assert len(self.transitionsMap[label]) == 1
29   - self.transitionsDataMap[label] = byte
30 26  
31 27 def getClosure(self, visited):
32 28 if self in visited:
... ... @@ -64,10 +60,11 @@ class RulesNFA(object):
64 60 for nfaState in nfaStates:
65 61 for label, nextStates in nfaState.transitionsMap.iteritems():
66 62 if label is not None:
67   - transitionData = nfaState.transitionsDataMap[label]
68   - res.setdefault((label, transitionData), set())
  63 +# transitionData = nfaState.transitionsDataMap[label]
  64 + segnum, shiftOrth = label
  65 + res.setdefault((segnum, shiftOrth), set())
69 66 for nextNFAState in nextStates:
70   - res[(label, transitionData)] |= nextNFAState.getClosure(set())
  67 + res[(segnum, shiftOrth)] |= nextNFAState.getClosure(set())
71 68 return res
72 69  
73 70 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
... ... @@ -79,23 +76,24 @@ class RulesNFA(object):
79 76 if final:
80 77 # dfaState should be final
81 78 # and contain info about weakness
82   - dfaState.encodedData = bytearray([1 if weak else 0])
83   - for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
  79 + dfaState.setAsAccepting(weak=weak)
  80 +# dfaState.encodedData = bytearray([1 if weak else 0])
  81 + for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
84 82 key = frozenset(nextNFAStates)
85 83 if key in nfaSubset2DFAState:
86 84 nextDFAState = nfaSubset2DFAState[key]
87 85 else:
88   - nextDFAState = state.State()
  86 + nextDFAState = RulesState()
89 87 nfaSubset2DFAState[key] = nextDFAState
90 88 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState)
91   - dfaState.setTransition(label, nextDFAState)
92   - dfaState.setTransitionData(label, transitionData)
  89 + dfaState.setTransition((segnum, shiftOrth), nextDFAState)
  90 +# dfaState.setTransitionData(label, transitionData)
93 91  
    def convertToDFA(self):
        """Convert this NFA into a deterministic RulesFSA via subset
        construction, starting from the closure of the initial state."""
        dfa = RulesFSA()
        startStates = self.initialState.getClosure(set())
        # the initial closure must not contain final states
        # (an empty segmentation must not be accepted)
        assert not any(filter(lambda s: s.final, startStates))
        dfa.initialState = RulesState()
        self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
        return dfa
101 99  
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -3,7 +3,7 @@ from pyparsing import *
3 3 ParserElement.enablePackrat()
4 4 from morfeuszbuilder.tagset import segtypes
5 5 from morfeuszbuilder.utils import configFile, exceptions
6   -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager
  6 +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString
7 7 import codecs
8 8 import re
9 9  
... ... @@ -48,8 +48,8 @@ class RulesParser(object):
48 48 if not firstNFA:
49 49 firstNFA = nfa
50 50 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
51   - combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))
52   - for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
  51 + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
  52 + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
53 53 # print rule
54 54 rule.addToNFA(nfa)
55 55 # nfa.debug()
... ... @@ -60,25 +60,24 @@ class RulesParser(object):
60 60 res.addDFA(key2Def, dfa)
61 61 return res
62 62  
63   - def _doParse(self, combinationEnumeratedLines, segtypesHelper):
  63 + def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
64 64 for lineNum, line in combinationEnumeratedLines:
65 65 if not line.startswith('#'):
66   - yield self._doParseOneLine(lineNum, line, segtypesHelper)
  66 + yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)
67 67  
68   - def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper):
  68 + def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
69 69 if not segtypesHelper.hasSegtype(segtype):
70 70 raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
71 71 else:
72 72 # return rules.TagRule(segtype)
73   - return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), segtype)
  73 + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype)
74 74  
75   - def _doParseOneLine(self, lineNum, line, segtypesHelper):
  75 + def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
76 76 rule = Forward()
77 77 tagRule = Word(alphanums+'_')
78   - shiftOrthRule = tagRule + '>'
79   - shiftOrthSameTypeRule = tagRule + '!' + '>'
  78 + shiftOrthRule = Word(alphanums+'_') + Suppress('>')
80 79 parenRule = Suppress('(') + rule + Suppress(')')
81   - atomicRule = tagRule ^ shiftOrthRule ^ shiftOrthSameTypeRule ^ parenRule
  80 + atomicRule = tagRule ^ shiftOrthRule ^ parenRule
82 81 zeroOrMoreRule = atomicRule + Suppress('*')
83 82 oneOrMoreRule = atomicRule + Suppress('+')
84 83 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
... ... @@ -87,13 +86,12 @@ class RulesParser(object):
87 86 concatRule = OneOrMore(complexRule)
88 87 rule << concatRule
89 88  
90   - tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))
91   - shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0]))
92   - shiftOrthSameTypeRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthSameTypeRule(toks[0]))
  89 + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
  90 + shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
93 91 # parenRule.setParseAction(lambda string, loc, toks: toks[0])
94 92 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
95 93 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
96 94 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
97 95 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
98   - parsedRule = rule.parseString(line, parseAll=True)[0]
  96 + parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
99 97 return parsedRule
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -33,6 +33,7 @@ class Segtypes(object):
33 33 raise exceptions.ConfigFileException(self.filename, lineNum, msg)
34 34  
35 35 def _readTags(self, segrulesConfigFile):
  36 + gotWildcardPattern = False
36 37 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
37 38 splitLine = re.split(r'\s+', line.strip())
38 39 self._validate(
... ... @@ -49,13 +50,27 @@ class Segtypes(object):
49 50 lineNum,
50 51 re.match(r'[a-z_\.\:\%]+', pattern))
51 52  
  53 + self._validate(
  54 + u'Pattern that matches everything must be the last one',
  55 + lineNum - 1,
  56 + not gotWildcardPattern)
  57 +
52 58 if segtype in self.segtype2Segnum:
53 59 segnum = self.segtype2Segnum[segtype]
54 60 else:
55 61 segnum = len(self.segtype2Segnum)
56 62 self.segtype2Segnum[segtype] = segnum
57 63  
58   - self.patternsList.append(SegtypePattern(None, pattern, segnum))
  64 + segtypePattern = SegtypePattern(None, pattern, segnum)
  65 +
  66 + self._validate(
  67 + u'There is no tag that matches pattern "%s".' % pattern,
  68 + lineNum,
  69 + any([segtypePattern.tryToMatch(None, tag) != -1 for tag in self.tagset.getAllTags()]))
  70 +
  71 + self.patternsList.append(segtypePattern)
  72 +
  73 + gotWildcardPattern = gotWildcardPattern or pattern == '%'
59 74  
60 75 self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()])
61 76  
... ... @@ -67,7 +82,7 @@ class Segtypes(object):
67 82 lineNum,
68 83 re.match(r'[a-z_]+', segtype))
69 84 self._validate(
70   - u'Pattern must contain lemma and POS',
  85 + u'Pattern must contain lemma and part-of-speech fields',
71 86 lineNum,
72 87 re.match(r'.+\:[a-z_]+', pattern, re.U))
73 88  
... ... @@ -79,7 +94,14 @@ class Segtypes(object):
79 94  
80 95 lemma, pos = pattern.split(':')
81 96  
82   - self.patternsList.append(SegtypePattern(lemma, '%s|%s:%%' % (pos, pos), segnum))
  97 + segtypePattern = SegtypePattern(lemma, pos + ':%', segnum)
  98 +
  99 + self._validate(
  100 + u'There is no tag that matches pattern "%s".' % (pos + ':%'),
  101 + lineNum,
  102 + any([segtypePattern.tryToMatch(lemma, tag) != -1 for tag in self.tagset.getAllTags()]))
  103 +
  104 + self.patternsList.append(segtypePattern)
83 105  
84 106 def _debugSegnums(self):
85 107 for tagnum, segnum in self._tagnum2Segnum.items():
... ... @@ -121,11 +143,6 @@ class Segtypes(object):
121 143 if not res:
122 144 res = self._tagnum2Segnum.get(tagnum, None)
123 145 return res
124   -# for p in self.patternsList:
125   -# res = p.tryToMatch(lemma, tag)
126   -# if res >= 0:
127   -# return res
128   -# return None
129 146  
130 147 class SegtypePattern(object):
131 148  
... ... @@ -135,8 +152,13 @@ class SegtypePattern(object):
135 152 self.segnum = segnum
136 153  
137 154 def tryToMatch(self, lemma, tag):
  155 +# tag2Match = tag + ':' if not tag.endswith(':') else tag
  156 +# print tag2Match
  157 + patterns2Match = []
  158 + patterns2Match.append(self.pattern.replace('%', '.*'))
  159 + patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*'))
138 160 if (self.lemma is None or self.lemma == lemma) \
139   - and re.match(self.pattern.replace('%', '.*'), tag):
  161 + and any([re.match(p, tag) for p in patterns2Match]):
140 162 return self.segnum
141 163 else:
142 164 return -1
... ...
fsabuilder/morfeuszbuilder/utils/serializationUtils.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +# serialize uint16 as big endian
# serialize uint16 as big endian
def htons(n):
    """Encode *n* (0 <= n < 2**16) as a 2-byte big-endian bytearray."""
    assert n < 65536
    assert n >= 0
    return bytearray([(n >> 8) & 0xFF, n & 0xFF])
  15 +
  16 +# serialize uint32 as big endian
  17 +def htonl(n):
  18 + assert n >= 0
  19 + res = bytearray()
  20 + res.append((n & 0xFF000000) >> 24)
  21 + res.append((n & 0x00FF0000) >> 16)
  22 + res.append((n & 0x0000FF00) >> 8)
  23 + res.append(n & 0x000000FF)
  24 + return res
... ...
input/dodatki.tab
... ... @@ -41,13 +41,171 @@ z Z brev:pun
41 41 ż Ż brev:pun
42 42 ch Ch brev:pun
43 43 st St brev:pun
44   -0 0 dig
45   -1 1 dig
46   -2 2 dig
47   -3 3 dig
48   -4 4 dig
49   -5 5 dig
50   -6 6 dig
51   -7 7 dig
52   -8 8 dig
53   -9 9 dig
  44 +poli poli prefa
  45 +poli poli prefs
  46 +niby niby prefa
  47 +niby niby prefs
  48 +eks eks prefs
  49 +ex ex prefs
  50 +euro euro prefa
  51 +euro euro prefs
  52 +mikro mikro prefs
  53 +mikro mikro prefa
  54 +makro makro prefa
  55 +makro makro prefs
  56 +bez bez prefa
  57 +do do prefv
  58 +do do prefa
  59 +dez dez prefv
  60 +dez dez prefa
  61 +dez dez prefs
  62 +ko ko prefa
  63 +ko ko prefs
  64 +między między prefa
  65 +między między prefs
  66 +na na prefa
  67 +na na prefs
  68 +na na prefv
  69 +nad nad prefa
  70 +nad nad prefs
  71 +nad nad prefv
  72 +o o prefv
  73 +ob ob prefv
  74 +od od prefa
  75 +od od prefs
  76 +od od prefv
  77 +pra pra prefs
  78 +post post prefa
  79 +post post prefs
  80 +pod pod prefa
  81 +pod pod prefs
  82 +pod pod prefv
  83 +poza poza prefa
  84 +ponad ponad prefa
  85 +pre pre prefa
  86 +pre pre prefs
  87 +pro pro prefa
  88 +pro pro prefs
  89 +prze prze prefa
  90 +prze prze prefv
  91 +przeciw przeciw prefa
  92 +przeciw przeciw prefs
  93 +re re prefa
  94 +re re prefs
  95 +re re prefv
  96 +przy przy prefa
  97 +przy przy prefv
  98 +roz roz prefv
  99 +u u prefv
  100 +samo samo prefa
  101 +samo samo prefs
  102 +video video prefs
  103 +video video prefa
  104 +w w prefv
  105 +wy wy prefv
  106 +współ współ prefv
  107 +współ współ prefa
  108 +współ współ prefs
  109 +wice wice prefs
  110 +neo neo prefa
  111 +neo neo prefs
  112 +tele tele prefs
  113 +tele tele prefa
  114 +z z prefv
  115 +za za prefv
  116 +za za prefa
  117 +za za prefs
  118 +wideo wideo prefa
  119 +wideo wideo prefs
  120 +meta meta prefs
  121 +meta meta prefa
  122 +multi multi prefa
  123 +multi multi prefs
  124 +mega mega prefa
  125 +mega mega prefs
  126 +kontra kontra prefs
  127 +kontra kontra prefa
  128 +inter inter prefa
  129 +inter inter prefs
  130 +homo homo prefs
  131 +homo homo prefa
  132 +ekstra ekstra prefa
  133 +ekstra ekstra prefs
  134 +giga giga prefa
  135 +giga giga prefs
  136 +bi bi prefs
  137 +bi bi prefa
  138 +auto auto prefs
  139 +auto auto prefa
  140 +de de prefv
  141 +de de prefa
  142 +de de prefs
  143 +ultra ultra prefs
  144 +ultra ultra prefa
  145 +e- e- prefa
  146 +e- e- prefs
  147 +mini mini prefs
  148 +mini mini prefa
  149 +maxi maxi prefs
  150 +maxi maxi prefa
  151 +midi midi prefs
  152 +midi midi prefa
  153 +arcy arcy prefs
  154 +arcy arcy prefa
  155 +anty anty prefa
  156 +anty anty prefs
  157 +a a prefa
  158 +a a prefs
  159 +pan pan prefs
  160 +pan pan prefa
  161 +in in prefa
  162 +in in prefs
  163 +dys dys prefs
  164 +dys dys prefa
  165 +mono mono prefa
  166 +mono mono prefs
  167 +porno porno prefs
  168 +porno porno prefa
  169 +anglo anglo prefa
  170 +aero aero prefs
  171 +aero aero prefa
  172 +bio bio prefs
  173 +bio bio prefa
  174 +wszystko wszystko prefs
  175 +wszystko wszystko prefa
  176 +wszech wszech prefs
  177 +wszech wszech prefa
  178 +śród śród prefs
  179 +śród śród prefa
  180 +audio audio prefs
  181 +audio audio prefa
  182 +eko eko prefs
  183 +eko eko prefa
  184 +s s prefv
  185 +elektro elektro prefs
  186 +elektro elektro prefa
  187 +trans trans prefa
  188 +trans trans prefs
  189 +kontr kontr prefs
  190 +kontr kontr prefa
  191 +pseudo pseudo prefs
  192 +pseudo pseudo prefa
  193 +quasi quasi prefs
  194 +quasi quasi prefa
  195 +super super prefs
  196 +super super prefa
  197 +po po prefv
  198 +po po prefa
  199 +po po prefs
  200 +sub sub prefs
  201 +sub sub prefa
  202 +hiper hiper prefa
  203 +hiper hiper prefs
  204 +non non prefs
  205 +non non prefa
  206 +stereo stereo prefa
  207 +stereo stereo prefs
  208 +energo energo prefa
  209 +para para prefa
  210 +para para prefs
  211 +ś ś prefv
... ...
input/polimorf.tagset
... ... @@ -584,6 +584,9 @@
584 584 579 interp
585 585 580 brev:pun
586 586 581 brev:npun
  587 +582 prefa
  588 +583 prefs
  589 +584 prefv
587 590  
588 591 [NAMES]
589 592  
... ...
input/segmenty.dat
... ... @@ -19,7 +19,7 @@ samotny
19 19 # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”:
20 20 moze_interp(praet_sg_na)
21 21  
22   -# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”:
  22 +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „moze”:
23 23 moze_interp(praet_sg)
24 24  
25 25 # przeszlik mnogi, np. „czytali”:
... ... @@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg)
69 69 # np. „gnietli·by·śmy”
70 70 moze_interp(praet_pl by aglpl)
71 71 #else
72   -moze_interp(praetcond)
  72 +# moze_interp(praetcond)
73 73 #endif
74   -
75 74 # np. „by·ś”
76 75 moze_interp(by aglsg)
77 76 # np. „by·ście”
... ... @@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj )
98 97 # adja dywiz adja dywiz adja dywiz adj interp?
99 98 # adja dywiz adja dywiz adja dywiz adja dywiz adj interp?
100 99  
101   -# Stopień najwyższy:
102   -# np. „naj·zieleńszy”, „naj·mądrzej”
103   -moze_interp( naj> adj_sup )
  100 +# Formy zanegowane stopnia wyższego przymiotników i przysłówków (WK)
  101 +# np. „nie·grzeczniejszy”, „nie·grzeczniej”
  102 +moze_interp( nie> adj_com )
104 103  
105 104 # Formy „zanegowane” gerundiów i imiesłowów:
106 105 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
... ... @@ -112,15 +111,21 @@ moze_interp(z_on_agl)
112 111 moze_interp(z_on_agl on_agl)
113 112  
114 113 # Liczba zapisana jako ciąg cyfr:
115   -moze_interp( dig!>+ )
  114 +moze_interp( dig )
116 115  
117 116 # Formacje prefiksalne
118 117 #### trzeba wydzielić odpowiednie samodze!
119   -# rzeczownikowe i przymiotnikowe
120   -# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy”
121   -moze_interp( prefs samodz )
  118 +# rzeczownikowe
  119 +# np. „euro·sodoma”, „e-·papieros”
  120 +moze_interp(nomina)
  121 +moze_interp( prefs> nomina )
122 122 # czasownikowe np. „po·nakapywać”
123   -moze_interp( prefv samodz )
  123 +moze_interp(verba_imperf)
  124 +moze_interp( prefv> verba_imperf )
  125 +# przymiotnikowe np. „do·żylny”, „euro·sodomski”, „bez·argumentowy”
  126 +moze_interp(adjectiva)
  127 +moze_interp(prefa> adj)
  128 +moze_interp( prefa> adjectiva )
124 129  
125 130 # Apozycje z dywizem
126 131 # np. „kobieta-prezydent”
... ... @@ -133,11 +138,28 @@ adj dywiz samodz
133 138 # ?
134 139 samodz dywiz adj
135 140  
  141 +#### PONIŻEJ REGUŁY WK
  142 +# Stopień najwyższy:
  143 +# np. „naj·zieleńszy”, „naj·mądrzej”
  144 +moze_interp( naj> adj_sup )
  145 +# Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj
  146 +moze_interp( praet_sg dywiz li)
  147 +moze_interp( praet_pl dywiz li)
  148 +moze_interp( praet_sg_na dywiz li)
  149 +moze_interp( fin dywiz li)
  150 +
  151 +# i bez dywizu --- czy bez dywizu jest sens to łapać?
  152 +#moze_interp( praet_sg li)
  153 +#moze_interp( praet_pl li)
  154 +#moze_interp( praet_sg_na li)
  155 +#moze_interp( fin li)
  156 +
136 157 [segment types]
137 158 naj
138 159 nie
139 160 prefs
140 161 prefv
  162 +prefa
141 163 dig
142 164 adja
143 165 adj
... ... @@ -161,11 +183,14 @@ naj naj
161 183 nie nie
162 184 prefs prefs
163 185 prefv prefv
  186 +prefa prefa
164 187 dig dig
165 188 adja adja
166 189 adj adj:%:pos
167 190 adj_sup adj:%:sup
168 191 adj_sup adv:sup
  192 +adj_com adj:%:com
  193 +adj_com adv:com
169 194 negat ger:%:neg
170 195 negat pact:%:neg
171 196 negat ppas:%:neg
... ... @@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
173 198 z_on_agl prep:%
174 199 samotny brev:pun
175 200 samotny brev:npun
176   -samotny intrj
  201 +samotny interj
177 202 interp interp
178 203 aglsg aglt:sg:%
179 204 aglpl aglt:pl:%
180   -praetcond cond:%
181   -praetcond praet:%:pri:%
182   -praetcond praet:%:sec:%
183   -praetcond praet:%:ter:%
184 205 praet_sg_agl praet:sg:%:agl
185 206 praet_sg_na praet:sg:%:nagl
186 207 praet_sg praet:sg:%
187 208 praet_pl praet:pl:%
188 209 praet_sg winien:sg:%
189 210 praet_pl winien:pl:%
  211 +fin fin:%
  212 +nomina subst:%
  213 +nomina ger:%
  214 +nomina depr:%
  215 +adjectiva adv:%
  216 +adjectiva ppas:%
  217 +adjectiva pact:%
  218 +verba_imperf praet:%:imperf
  219 +verba_imperf fin:%:imperf
  220 +verba_imperf inf:imperf
  221 +verba_imperf imps:imperf
  222 +verba_imperf impt:%:imperf
190 223 samodz %
191 224  
192 225 [lexemes]
193 226 z_aglt aby:comp
194 227 z_aglt bowiem:comp
195 228 by by:qub
  229 +li li:qub
196 230 z_aglt by:comp
197 231 z_aglt cóż:subst
198 232 z_aglt czemu:adv
... ...
input/segmenty1.dat
... ... @@ -7,9 +7,10 @@ praet=split composite
7 7  
8 8 #define moze_interp(segmenty) wsz_interp segmenty wsz_interp
9 9  
  10 +dig>* dig
10 11 (adja dywiz)+ adj
11   -dig!>+
12   -dig!> dig!> dig!>
  12 +#dig!>+
  13 +#dig!> dig!> dig!>
13 14 naj> adj_sup
14 15  
15 16 [segment types]
... ... @@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
52 53 z_on_agl prep:%
53 54 samotny brev:pun
54 55 samotny brev:npun
55   -samotny intrj
  56 +samotny interj
56 57 interp interp
57 58 aglsg aglt:sg:%
58 59 aglpl aglt:pl:%
59   -praetcond cond:%
60   -praetcond praet:%:pri:%
61   -praetcond praet:%:sec:%
62   -praetcond praet:%:ter:%
63   -praet_sg_agl praet:sg:%:agl
64   -praet_sg_na praet:sg:%:nagl
65   -praet_sg praet:sg:%
66   -praet_pl praet:pl:%
67   -praet_sg winien:sg:%
68   -praet_pl winien:pl:%
69 60 samodz %
70 61  
71 62 [lexemes]
... ...
morfeusz/InterpretedChunk.hpp
... ... @@ -17,7 +17,6 @@ struct InterpretedChunk {
17 17 std::vector<uint32_t> lowercaseCodepoints;
18 18 InterpsGroup interpsGroup;
19 19 bool shiftOrth;
20   - bool shiftOrthSameType;
21 20 bool orthWasShifted;
22 21 std::vector<InterpretedChunk> prefixChunks;
23 22 };
... ...
morfeusz/Morfeusz.cpp
... ... @@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() {
37 37 return res;
38 38 }
39 39  
  40 +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
  41 + SegrulesOptions opts;
  42 + opts["aggl"] = "isolated";
  43 + opts["praet"] = "split";
  44 + return (*(map.find(opts))).second;
  45 +}
  46 +
40 47 Morfeusz::Morfeusz()
41 48 : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET),
42 49 analyzerPtr(DEFAULT_FSA),
43 50 analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())),
44 51 segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)),
  52 +currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
45 53 isAnalyzerFSAFromFile(false),
46 54 generatorPtr(DEFAULT_SYNTH_FSA),
47 55 isGeneratorFSAFromFile(false),
... ... @@ -50,9 +58,9 @@ options(createDefaultOptions()) {
50 58  
51 59 }
52 60  
53   -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSAType*>& fsasMap) {
  61 +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
54 62 for (
55   - std::map<SegrulesOptions, SegrulesFSAType*>::iterator it = fsasMap.begin();
  63 + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin();
56 64 it != fsasMap.end();
57 65 ++it) {
58 66 delete it->second;
... ... @@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord(
100 108 vector<InterpretedChunk> accum;
101 109 FlexionGraph graph;
102 110 const char* currInput = inputStart;
103   - SegrulesOptions opts;
104   - opts["aggl"] = "isolated";
105   - opts["praet"] = "split";
106   - SegrulesFSAType* segrulesFSA = (*(this->segrulesFSAsMap.find(opts))).second;
107   - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->getInitialState());
  111 + SegrulesFSA* segrulesFSA = this->currSegrulesFSA;
  112 + doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState);
108 113 if (!graph.empty()) {
109 114 InterpretedChunksDecoder interpretedChunksDecoder(env);
110 115 int srcNode = startNodeNum;
... ... @@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord(
118 123 srcNode++;
119 124 }
120 125 // graph.getResults(*this->tagset, results);
121   - } else if (inputStart != inputEnd) {
  126 + }
  127 + else if (inputStart != inputEnd) {
122 128 this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results);
123 129 }
124 130 inputStart = currInput;
... ... @@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord(
126 132  
127 133 static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
128 134 to.prefixChunks.insert(
129   - to.prefixChunks.begin(),
130   - from.prefixChunks.begin(),
131   - from.prefixChunks.end());
  135 + to.prefixChunks.begin(),
  136 + from.prefixChunks.begin(),
  137 + from.prefixChunks.end());
132 138 to.prefixChunks.push_back(from);
133 139 from.orthWasShifted = true;
134 140 }
... ... @@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord(
138 144 const char* inputEnd,
139 145 vector<InterpretedChunk>& accum,
140 146 FlexionGraph& graph,
141   - SegrulesStateType segrulesState) const {
  147 + SegrulesState segrulesState) const {
  148 + // cerr << "doAnalyzeOneWord " << inputData << endl;
142 149 bool endOfWord = inputData == inputEnd;
143 150 const char* currInput = inputData;
144 151 uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd);
... ... @@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord(
159 166 vector<InterpsGroup> val(state.getValue());
160 167 for (unsigned int i = 0; i < val.size(); i++) {
161 168 InterpsGroup& ig = val[i];
162   - cerr << (int) ig.type << endl;
163   - SegrulesStateType newSegrulesState = segrulesState;
164   - newSegrulesState.proceedToNext(ig.type);
165   - if (!newSegrulesState.isSink()) {
166   - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;
167   - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;
168   - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false};
169   - if (!accum.empty()
170   - && (accum.back().shiftOrth
171   - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) {
  169 + // newSegrulesState.proceedToNext(ig.type);
  170 + // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates);
  171 + set<SegrulesState> newSegrulesStates;
  172 + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
  173 + for (
  174 + set<SegrulesState>::iterator it = newSegrulesStates.begin();
  175 + it != newSegrulesStates.end();
  176 + it++) {
  177 + SegrulesState newSegrulesState = *it;
  178 + // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;
  179 + // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;
  180 + InterpretedChunk ic = {
  181 + inputData,
  182 + originalCodepoints,
  183 + lowercaseCodepoints,
  184 + ig,
  185 + newSegrulesState.shiftOrthFromPrevious,
  186 + false,
  187 + vector<InterpretedChunk>()
  188 + };
  189 + if (!accum.empty() && accum.back().shiftOrth) {
172 190 doShiftOrth(accum.back(), ic);
173 191 }
174 192 accum.push_back(ic);
... ... @@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord(
182 200 this->env.getCharsetConverter().next(currInput, inputEnd);
183 201 }
184 202 }
  203 + // cerr << "end of word" << endl;
185 204 // we are at the end of word
186 205 if (state.isAccepting()) {
187 206 vector<InterpsGroup > val(state.getValue());
188 207 for (unsigned int i = 0; i < val.size(); i++) {
189 208 InterpsGroup& ig = val[i];
190   - SegrulesStateType newSegrulesState = segrulesState;
191   - newSegrulesState.proceedToNext(ig.type);
192   - if (newSegrulesState.isAccepting()) {
193   - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;
194   - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;
195   - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false};
196   - if (!accum.empty()
197   - && (accum.back().shiftOrth
198   - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) {
199   - doShiftOrth(accum.back(), ic);
  209 + // cerr << "currInput=" << currInput << endl;
  210 + // cerr << "type=" << (int) ig.type << endl;
  211 + set<SegrulesState> newSegrulesStates;
  212 + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
  213 + for (
  214 + set<SegrulesState>::iterator it = newSegrulesStates.begin();
  215 + it != newSegrulesStates.end();
  216 + it++) {
  217 + SegrulesState newSegrulesState = *it;
  218 + if (newSegrulesState.accepting) {
  219 + InterpretedChunk ic = {
  220 + inputData,
  221 + originalCodepoints,
  222 + lowercaseCodepoints,
  223 + ig,
  224 + newSegrulesState.shiftOrthFromPrevious,
  225 + false,
  226 + vector<InterpretedChunk>()};
  227 + if (!accum.empty() && accum.back().shiftOrth) {
  228 + doShiftOrth(accum.back(), ic);
  229 + }
  230 + accum.push_back(ic);
  231 + graph.addPath(accum);
  232 + accum.pop_back();
200 233 }
201   - accum.push_back(ic);
202   - graph.addPath(accum);
203   - accum.pop_back();
204   - } else if (!newSegrulesState.isSink()) {
205   - } else {
206 234 }
207 235 }
208 236 }
... ...
morfeusz/Morfeusz.hpp
... ... @@ -12,6 +12,7 @@
12 12 #include <list>
13 13 #include <vector>
14 14 #include <map>
  15 +#include <set>
15 16 #include "EncodedInterpretation.hpp"
16 17 #include "fsa/fsa.hpp"
17 18 #include "MorphInterpretation.hpp"
... ... @@ -27,6 +28,7 @@
27 28 #include "Environment.hpp"
28 29  
29 30 #include "segrules/segrules.hpp"
  31 +#include "segrules/SegrulesFSA.hpp"
30 32  
31 33 class Morfeusz;
32 34 class ResultsIterator;
... ... @@ -111,7 +113,7 @@ private:
111 113 const char* inputEnd,
112 114 std::vector<InterpretedChunk>& accum,
113 115 FlexionGraph& graph,
114   - SegrulesStateType segrulesState) const;
  116 + SegrulesState segrulesState) const;
115 117  
116 118 void appendIgnotiumToResults(
117 119 const std::string& word,
... ... @@ -120,17 +122,13 @@ private:
120 122 Environment env;
121 123 const unsigned char* analyzerPtr;
122 124 FSAType* analyzerFSA;
123   - std::map<SegrulesOptions, SegrulesFSAType*> segrulesFSAsMap;
  125 + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
  126 + SegrulesFSA* currSegrulesFSA;
124 127 bool isAnalyzerFSAFromFile;
125 128  
126 129 const unsigned char* generatorPtr;
127 130 bool isGeneratorFSAFromFile;
128 131 Generator generator;
129   -// const CharsetConverter* charsetConverter;
130   -// const Tagset* tagset;
131   -// const CaseConverter* caseConverter;
132   -//
133   -// UTF8CharsetConverter utf8CharsetConverter;
134 132  
135 133 MorfeuszOptions options;
136 134 };
... ...
morfeusz/segrules/SegrulesFSA.hpp 0 → 100644
  1 +/*
  2 + * File: SegrulesFSA.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 marzec 2014, 17:52
  6 + */
  7 +
  8 +#ifndef SEGRULESFSA_HPP
  9 +#define SEGRULESFSA_HPP
  10 +
  11 +#include <set>
  12 +#include "../endianness.hpp"
  13 +
/*
 * One state of the (deserialized) segmentation-rules finite-state automaton,
 * plus the orth-shift flag carried by the transition that led into it.
 */
struct SegrulesState {
    uint16_t offset;            // byte offset of this state's record inside the serialized FSA image
    bool accepting;             // state's flags byte has the ACCEPTING bit set (see SegrulesFSA::transition2State)
    bool weak;                  // state's flags byte has the WEAK bit set
    bool shiftOrthFromPrevious; // taken from the incoming transition, not from the state record itself:
                                // the segment's orth should be appended to the following chunk
};
  20 +
  21 +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2)
  22 +{
  23 + return s1.offset < s2.offset;
  24 +}
  25 +
  26 +class SegrulesFSA {
  27 +public:
  28 + SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) {
  29 + SegrulesState state = {0, false, false, false};
  30 + initialState = state;
  31 + }
  32 +
  33 + void proceedToNext(
  34 + const unsigned char segnum,
  35 + const SegrulesState state,
  36 + std::set<SegrulesState>& newStates) const {
  37 +
  38 + const unsigned char* currPtr = ptr + state.offset;
  39 + currPtr++;
  40 + const unsigned char transitionsNum = *currPtr;
  41 + currPtr++;
  42 + for (unsigned int i = 0; i < transitionsNum; i++) {
  43 + if (*currPtr == segnum) {
  44 + newStates.insert(newStates.begin(), this->transition2State(currPtr));
  45 + }
  46 + currPtr += 4;
  47 + }
  48 + }
  49 +
  50 + virtual ~SegrulesFSA() {}
  51 +
  52 + SegrulesState initialState;
  53 +private:
  54 + const unsigned char* ptr;
  55 +
  56 + SegrulesState transition2State(const unsigned char* transitionPtr) const {
  57 + unsigned char ACCEPTING_FLAG = 1;
  58 + unsigned char WEAK_FLAG = 2;
  59 + SegrulesState res;
  60 + transitionPtr++;
  61 + res.shiftOrthFromPrevious = *transitionPtr;
  62 + transitionPtr++;
  63 + res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
  64 + res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
  65 + res.weak = *(ptr + res.offset) & WEAK_FLAG;
  66 + return res;
  67 + }
  68 +};
  69 +
  70 +#endif /* SEGRULESFSA_HPP */
  71 +
... ...
morfeusz/segrules/segrules.cpp
... ... @@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*&amp; ptr) {
33 33 return res;
34 34 }
35 35  
36   -static inline SegrulesFSAType* deserializeFSA(const unsigned char*& ptr) {
  36 +static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
37 37 uint32_t fsaSize = deserializeUint32(ptr);
38   - static SegrulesDeserializer deserializer;
39   - SegrulesFSAType* res = SegrulesFSAType::getFSA(ptr, deserializer);
  38 +// static SegrulesDeserializer deserializer;
  39 + SegrulesFSA* res = new SegrulesFSA(ptr);
40 40 ptr += fsaSize;
41 41 return res;
42 42 }
43 43  
44   -map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) {
45   - map<SegrulesOptions, SegrulesFSAType*> res;
  44 +map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) {
  45 + map<SegrulesOptions, SegrulesFSA*> res;
46 46 const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr);
47 47 const unsigned char* currPtr = fsasMapPtr;
48 48 unsigned char fsasNum = *currPtr;
49 49 currPtr++;
50 50 for (unsigned char i = 0; i < fsasNum; i++) {
51 51 SegrulesOptions options = deserializeOptions(currPtr);
52   - SegrulesFSAType* fsa = deserializeFSA(currPtr);
  52 + SegrulesFSA* fsa = deserializeFSA(currPtr);
53 53 res[options] = fsa;
54 54 }
55 55 return res;
... ...
morfeusz/segrules/segrules.hpp
... ... @@ -11,13 +11,13 @@
11 11 #include <utility>
12 12 #include <map>
13 13 #include <string>
14   -#include "../fsa/fsa.hpp"
  14 +#include "SegrulesFSA.hpp"
15 15  
16 16 typedef std::map<std::string, std::string> SegrulesOptions;
17   -typedef State<unsigned char> SegrulesStateType;
18   -typedef FSA<unsigned char> SegrulesFSAType;
  17 +//typedef State<unsigned char> SegrulesStateType;
  18 +//typedef FSA<unsigned char> SegrulesFSAType;
19 19  
20   -std::map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
  20 +std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
21 21  
22 22 #endif /* SEGRULES_HPP */
23 23  
... ...
nbproject/configurations.xml
... ... @@ -106,14 +106,20 @@
106 106 </makeTool>
107 107 </makefileType>
108 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
  109 + <ccTool flags="1">
  110 + </ccTool>
109 111 </item>
110 112 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
  113 + <ccTool flags="1">
  114 + </ccTool>
111 115 </item>
112 116 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
113 117 </item>
114 118 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
115 119 </item>
116 120 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  121 + <ccTool flags="1">
  122 + </ccTool>
117 123 </item>
118 124 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
119 125 ex="false"
... ... @@ -121,7 +127,6 @@
121 127 flavor2="8">
122 128 <ccTool>
123 129 <incDir>
124   - <pElem>build</pElem>
125 130 <pElem>/usr/lib/jvm/default-java/include</pElem>
126 131 <pElem>morfeusz</pElem>
127 132 <pElem>build/morfeusz/java</pElem>
... ... @@ -145,7 +150,6 @@
145 150 flavor2="8">
146 151 <ccTool>
147 152 <incDir>
148   - <pElem>build</pElem>
149 153 <pElem>/usr/include/python2.7</pElem>
150 154 <pElem>morfeusz</pElem>
151 155 <pElem>build/morfeusz/python</pElem>
... ... @@ -173,9 +177,8 @@
173 177 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4">
174 178 <ccTool flags="1">
175 179 <incDir>
176   - <pElem>build1</pElem>
177 180 <pElem>morfeusz</pElem>
178   - <pElem>build1/morfeusz</pElem>
  181 + <pElem>morfeusz/build/morfeusz</pElem>
179 182 </incDir>
180 183 <preprocessorList>
181 184 <Elem>libmorfeusz_EXPORTS</Elem>
... ... @@ -185,9 +188,8 @@
185 188 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
186 189 <ccTool flags="1">
187 190 <incDir>
188   - <pElem>build1</pElem>
189 191 <pElem>morfeusz</pElem>
190   - <pElem>build1/morfeusz</pElem>
  192 + <pElem>morfeusz/build/morfeusz</pElem>
191 193 </incDir>
192 194 <preprocessorList>
193 195 <Elem>libmorfeusz_EXPORTS</Elem>
... ... @@ -266,12 +268,18 @@
266 268 </preprocessorList>
267 269 </ccTool>
268 270 </folder>
269   - <folder path="morfeusz/java">
  271 + <folder path="morfeusz">
270 272 <ccTool>
271 273 <incDir>
272 274 <pElem>build</pElem>
  275 + </incDir>
  276 + </ccTool>
  277 + </folder>
  278 + <folder path="morfeusz/java">
  279 + <ccTool>
  280 + <incDir>
273 281 <pElem>morfeusz</pElem>
274   - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
  282 + <pElem>/usr/lib/jvm/default-java/include</pElem>
275 283 </incDir>
276 284 <preprocessorList>
277 285 <Elem>libjmorfeusz_EXPORTS</Elem>
... ... @@ -281,7 +289,6 @@
281 289 <folder path="morfeusz/python">
282 290 <ccTool>
283 291 <incDir>
284   - <pElem>build</pElem>
285 292 <pElem>/usr/include/python2.7</pElem>
286 293 <pElem>morfeusz</pElem>
287 294 </incDir>
... ... @@ -407,18 +414,26 @@
407 414 </ccTool>
408 415 </item>
409 416 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  417 + <ccTool flags="1">
  418 + </ccTool>
410 419 </item>
411 420 <item path="morfeusz/charset/CharsetConverter.cpp"
412 421 ex="false"
413 422 tool="1"
414 423 flavor2="4">
  424 + <ccTool flags="1">
  425 + </ccTool>
415 426 </item>
416 427 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  428 + <ccTool flags="1">
  429 + </ccTool>
417 430 </item>
418 431 <item path="morfeusz/charset/conversion_tables.cpp"
419 432 ex="false"
420 433 tool="1"
421 434 flavor2="4">
  435 + <ccTool flags="1">
  436 + </ccTool>
422 437 </item>
423 438 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
424 439 <ccTool flags="1">
... ... @@ -507,8 +522,12 @@
507 522 ex="false"
508 523 tool="1"
509 524 flavor2="4">
  525 + <ccTool flags="1">
  526 + </ccTool>
510 527 </item>
511 528 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  529 + <ccTool flags="1">
  530 + </ccTool>
512 531 </item>
513 532 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
514 533 <ccTool flags="0">
... ...