Commit 00e66248a61ae340a23b5635cfc761be6dbf38cd

Authored by Michał Lenart
1 parent a6f0d912

poprawiona obsługa segmentacji (działają już cyfry tak, jak na początku ustalono) [English: fixed segmentation-rule handling — digits now work as originally agreed]

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@112 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
@@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "")
36 if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") 36 if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE")
37 set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) 37 set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt)
38 else () 38 else ()
39 - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") 39 + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorf-0.6.7.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab")
40 endif () 40 endif ()
41 endif () 41 endif ()
42 42
buildAll.sh
@@ -10,7 +10,7 @@ function build { @@ -10,7 +10,7 @@ function build {
10 targets=$@ 10 targets=$@
11 11
12 srcDir=`pwd` 12 srcDir=`pwd`
13 - buildDir=build/build-$os-$arch 13 + buildDir=buildall/build-$os-$arch
14 targetDir=$srcDir/target/$os-$arch 14 targetDir=$srcDir/target/$os-$arch
15 toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake 15 toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake
16 16
fsabuilder/buildfsa.py
@@ -261,8 +261,9 @@ def main(opts): @@ -261,8 +261,9 @@ def main(opts):
261 if __name__ == '__main__': 261 if __name__ == '__main__':
262 import os 262 import os
263 opts = _parseOptions() 263 opts = _parseOptions()
264 - try:  
265 - main(opts)  
266 - except Exception as ex:  
267 - print >> sys.stderr, unicode(ex).encode('utf8') 264 +# try:
  265 + main(opts)
  266 +# except Exception as ex:
  267 +# raise ex
  268 +# print >> sys.stderr, unicode(ex).encode('utf8')
268 269
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -113,12 +113,13 @@ class FSA(object): @@ -113,12 +113,13 @@ class FSA(object):
113 return q 113 return q
114 114
115 def calculateOffsets(self, sizeCounter): 115 def calculateOffsets(self, sizeCounter):
116 - currReverseOffset = 0  
117 - for state in self.initialState.dfs(set()):  
118 - currReverseOffset += sizeCounter(state)  
119 - state.reverseOffset = currReverseOffset  
120 - for state in self.initialState.dfs(set()):  
121 - state.offset = currReverseOffset - state.reverseOffset 116 + self.initialState.calculateOffsets(sizeCounter)
  117 +# currReverseOffset = 0
  118 +# for state in self.initialState.dfs(set()):
  119 +# currReverseOffset += sizeCounter(state)
  120 +# state.reverseOffset = currReverseOffset
  121 +# for state in self.initialState.dfs(set()):
  122 +# state.offset = currReverseOffset - state.reverseOffset
122 123
123 def debug(self): 124 def debug(self):
124 for state in self.initialState.dfs(set()): 125 for state in self.initialState.dfs(set()):
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -6,6 +6,7 @@ Created on Oct 20, 2013 @@ -6,6 +6,7 @@ Created on Oct 20, 2013
6 6
7 import logging 7 import logging
8 from state import State 8 from state import State
  9 +from morfeuszbuilder.utils.serializationUtils import *
9 10
10 class Serializer(object): 11 class Serializer(object):
11 12
@@ -63,7 +64,7 @@ class Serializer(object): @@ -63,7 +64,7 @@ class Serializer(object):
63 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) 64 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
64 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): 65 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
65 fsaData.extend(self.state2bytearray(state)) 66 fsaData.extend(self.state2bytearray(state))
66 - res.extend(self.htonl(len(fsaData))) 67 + res.extend(htonl(len(fsaData)))
67 res.extend(fsaData) 68 res.extend(fsaData)
68 res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) 69 res.extend(self.serializeEpilogue(additionalData, moreAdditionalData))
69 return res 70 return res
@@ -71,9 +72,9 @@ class Serializer(object): @@ -71,9 +72,9 @@ class Serializer(object):
71 def _serializeTags(self, tagsMap): 72 def _serializeTags(self, tagsMap):
72 res = bytearray() 73 res = bytearray()
73 numOfTags = len(tagsMap) 74 numOfTags = len(tagsMap)
74 - res.extend(self.htons(numOfTags)) 75 + res.extend(htons(numOfTags))
75 for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): 76 for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
76 - res.extend(self.htons(tagnum)) 77 + res.extend(htons(tagnum))
77 res.extend(self.fsa.encodeWord(tag)) 78 res.extend(self.fsa.encodeWord(tag))
78 res.append(0) 79 res.append(0)
79 return res 80 return res
@@ -86,25 +87,6 @@ class Serializer(object): @@ -86,25 +87,6 @@ class Serializer(object):
86 res.extend(self._serializeTags(tagset._name2namenum)) 87 res.extend(self._serializeTags(tagset._name2namenum))
87 return res 88 return res
88 89
89 - # serialize uint16 as big endian  
90 - def htons(self, n):  
91 - assert n < 65536  
92 - assert n >= 0  
93 - res = bytearray()  
94 - res.append((n & 0x00FF00) >> 8)  
95 - res.append(n & 0x0000FF)  
96 - return res  
97 -  
98 - # serialize uint32 as big endian  
99 - def htonl(self, n):  
100 - assert n >= 0  
101 - res = bytearray()  
102 - res.append((n & 0xFF000000) >> 24)  
103 - res.append((n & 0x00FF0000) >> 16)  
104 - res.append((n & 0x0000FF00) >> 8)  
105 - res.append(n & 0x000000FF)  
106 - return res  
107 -  
108 def serializePrologue(self): 90 def serializePrologue(self):
109 res = bytearray() 91 res = bytearray()
110 92
@@ -126,7 +108,7 @@ class Serializer(object): @@ -126,7 +108,7 @@ class Serializer(object):
126 res = bytearray() 108 res = bytearray()
127 additionalDataSize = len(additionalData) if additionalData else 0 109 additionalDataSize = len(additionalData) if additionalData else 0
128 moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 110 moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0
129 - res.extend(self.htonl(additionalDataSize)) 111 + res.extend(htonl(additionalDataSize))
130 112
131 # add additional data itself 113 # add additional data itself
132 if additionalDataSize: 114 if additionalDataSize:
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -13,7 +13,7 @@ class State(object): @@ -13,7 +13,7 @@ class State(object):
13 13
14 def __init__(self, additionalData=None): 14 def __init__(self, additionalData=None):
15 self.transitionsMap = {} 15 self.transitionsMap = {}
16 - self.transitionsDataMap = {} 16 +# self.transitionsDataMap = {}
17 self.freq = 0 17 self.freq = 0
18 self.encodedData = None 18 self.encodedData = None
19 self.reverseOffset = None 19 self.reverseOffset = None
@@ -29,11 +29,11 @@ class State(object): @@ -29,11 +29,11 @@ class State(object):
29 def transitionsNum(self): 29 def transitionsNum(self):
30 return len(self.transitionsMap) 30 return len(self.transitionsMap)
31 31
32 - def setTransition(self, byte, nextState):  
33 - self.transitionsMap[byte] = nextState  
34 -  
35 - def setTransitionData(self, byte, data):  
36 - self.transitionsDataMap[byte] = data 32 + def setTransition(self, label, nextState):
  33 + self.transitionsMap[label] = nextState
  34 +#
  35 +# def setTransitionData(self, byte, data):
  36 +# self.transitionsDataMap[byte] = data
37 37
38 def hasNext(self, byte): 38 def hasNext(self, byte):
39 return byte in self.transitionsMap 39 return byte in self.transitionsMap
@@ -68,6 +68,14 @@ class State(object): @@ -68,6 +68,14 @@ class State(object):
68 yield state1 68 yield state1
69 yield self 69 yield self
70 70
  71 + def calculateOffsets(self, sizeCounter):
  72 + currReverseOffset = 0
  73 + for state in self.dfs(set()):
  74 + currReverseOffset += sizeCounter(state)
  75 + state.reverseOffset = currReverseOffset
  76 + for state in self.dfs(set()):
  77 + state.offset = currReverseOffset - state.reverseOffset
  78 +
71 def debug(self): 79 def debug(self):
72 print '----------------' 80 print '----------------'
73 print 'STATE:', self.idx, 'accepting', self.isAccepting() 81 print 'STATE:', self.idx, 'accepting', self.isAccepting()
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,6 +7,7 @@ Created on 23 sty 2014 @@ -7,6 +7,7 @@ Created on 23 sty 2014
7 import re 7 import re
8 from pyparsing import * 8 from pyparsing import *
9 from morfeuszbuilder.utils import exceptions 9 from morfeuszbuilder.utils import exceptions
  10 +from pyparseString import pyparseString
10 11
11 identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') 12 identifier = Word(alphas, bodyChars=alphanums+u'_>*+!')
12 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() 13 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
@@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines): @@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines):
54 else: 55 else:
55 return defineName 56 return defineName
56 57
57 -def _processLine(lineNum, line, defines): 58 +def _processLine(lineNum, line, defines, filename):
58 if line.strip(): 59 if line.strip():
59 60
60 rule = Forward() 61 rule = Forward()
@@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines): @@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines):
67 rule.setParseAction(lambda s, l, t: ' '.join(t)) 68 rule.setParseAction(lambda s, l, t: ' '.join(t))
68 defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines)) 69 defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines))
69 localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines)) 70 localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines))
70 - try:  
71 - return rule.parseString(line, parseAll=True)[0]  
72 - except ParseException as ex:  
73 - msg = u'Preprocessing of segmentation rules failed.\n'  
74 - msg += line + '\n'  
75 - msg += (ex.col - 1) * ' ' + '^\n'  
76 - msg += ex.msg  
77 -# print unicode(exceptions.SegtypesException(msg)).encode('utf8')  
78 - raise exceptions.SegtypesException(msg) 71 + return pyparseString(rule, lineNum, line, filename)[0]
79 else: 72 else:
80 return line 73 return line
81 74
82 -def preprocess(inputLines, defs): 75 +def preprocess(inputLines, defs, filename):
83 defines = {} 76 defines = {}
84 ifdefsStack = [] 77 ifdefsStack = []
85 for lineNum, line in inputLines: 78 for lineNum, line in inputLines:
86 if line.startswith('#define'): 79 if line.startswith('#define'):
87 - parsedDefine = list(define.parseString(line)) 80 + parsedDefine = list(pyparseString(define, lineNum, line, filename))
88 if len(parsedDefine) == 2: 81 if len(parsedDefine) == 2:
89 name, val = parsedDefine 82 name, val = parsedDefine
90 defines[name] = NonArgDefine(name, val) 83 defines[name] = NonArgDefine(name, val)
@@ -92,15 +85,16 @@ def preprocess(inputLines, defs): @@ -92,15 +85,16 @@ def preprocess(inputLines, defs):
92 name, arg, val = parsedDefine 85 name, arg, val = parsedDefine
93 localDefines = defines.copy() 86 localDefines = defines.copy()
94 localDefines[arg] = NonArgDefine(arg, arg) 87 localDefines[arg] = NonArgDefine(arg, arg)
95 - val = _processLine(lineNum, val, localDefines) 88 + val = _processLine(lineNum, val, localDefines, filename)
96 defines[name] = ArgDefine(name, arg, val) 89 defines[name] = ArgDefine(name, arg, val)
97 elif line.startswith('#ifdef'): 90 elif line.startswith('#ifdef'):
98 - name = ifdef.parseString(line)[0] 91 + name = pyparseString(ifdef, lineNum, line, filename)[0]
  92 +# name = ifdef.parseString(line)[0]
99 ifdefsStack.append(name) 93 ifdefsStack.append(name)
100 elif line.startswith('#endif'): 94 elif line.startswith('#endif'):
101 ifdefsStack.pop() 95 ifdefsStack.pop()
102 elif line.startswith('#'): 96 elif line.startswith('#'):
103 yield lineNum, line 97 yield lineNum, line
104 elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): 98 elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)):
105 - yield lineNum, _processLine(lineNum, line, defines) 99 + yield lineNum, _processLine(lineNum, line, defines, filename)
106 100
107 \ No newline at end of file 101 \ No newline at end of file
fsabuilder/morfeuszbuilder/segrules/pyparseString.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +from pyparsing import ParseException
  8 +from morfeuszbuilder.utils import exceptions
  9 +
def pyparseString(rule, lineNum, line, filename):
    """Parse *line* with the pyparsing *rule*, consuming the whole line.

    Thin wrapper around ``rule.parseString(line, parseAll=True)`` that turns a
    low-level ``ParseException`` into a ``SegtypesException`` carrying a
    compiler-style message: ``filename:lineNum``, the offending line, and a
    caret pointing at the failing column.

    :param rule: a pyparsing parser element
    :param lineNum: 1-based line number of *line* in *filename* (for messages)
    :param line: the text to parse
    :param filename: source file name (for messages)
    :return: the pyparsing ParseResults for the whole line
    :raises exceptions.SegtypesException: if *line* does not match *rule*
    """
    try:
        return rule.parseString(line, parseAll=True)
    except ParseException as ex:
        # ex.col is 1-based, so (col - 1) spaces put the caret under the
        # character where parsing failed.
        msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
        msg += line + '\n'
        msg += (ex.col - 1) * ' ' + '^\n'
        msg += ex.msg
        raise exceptions.SegtypesException(msg)
0 \ No newline at end of file 20 \ No newline at end of file
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -25,16 +25,17 @@ class SegmentRule(object): @@ -25,16 +25,17 @@ class SegmentRule(object):
25 25
26 class TagRule(SegmentRule): 26 class TagRule(SegmentRule):
27 27
28 - def __init__(self, segnum, segtype): 28 + def __init__(self, segnum, shiftOrth, segtype):
29 self.segnum = segnum 29 self.segnum = segnum
30 self.segtype = segtype 30 self.segtype = segtype
  31 + self.shiftOrth = shiftOrth
31 32
32 def addToNFA(self, fsa): 33 def addToNFA(self, fsa):
33 endState = RulesNFAState(final=True) 34 endState = RulesNFAState(final=True)
34 self._doAddToNFA(fsa.initialState, endState) 35 self._doAddToNFA(fsa.initialState, endState)
35 36
36 def _doAddToNFA(self, startState, endState): 37 def _doAddToNFA(self, startState, endState):
37 - startState.addTransition(self.segnum, endState) 38 + startState.addTransition((self.segnum, self.shiftOrth), endState)
38 39
39 def __str__(self): 40 def __str__(self):
40 return u'%s(%d)' % (self.segtype, self.segnum) 41 return u'%s(%d)' % (self.segtype, self.segnum)
@@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule): @@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule):
92 93
93 def __init__(self, child): 94 def __init__(self, child):
94 super(ZeroOrMoreRule, self).__init__(child) 95 super(ZeroOrMoreRule, self).__init__(child)
  96 + assert isinstance(child, SegmentRule)
95 97
96 def addToNFA(self, fsa): 98 def addToNFA(self, fsa):
97 raise ValueError() 99 raise ValueError()
@@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule): @@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule):
108 110
109 def __str__(self): 111 def __str__(self):
110 return u'(' + str(self.child) + ')*' 112 return u'(' + str(self.child) + ')*'
111 -  
112 -class ShiftOrthRule(UnaryRule):  
113 -  
114 - def __init__(self, child):  
115 - super(ShiftOrthRule, self).__init__(child)  
116 -  
117 - def addToNFA(self, fsa):  
118 - raise ValueError()  
119 -  
120 - def _doAddToNFA(self, startState, endState):  
121 - self.child._doAddToNFA(startState, endState)  
122 - startState.setTransitionData(self.child.segnum, 1)  
123 -  
124 - def __str__(self):  
125 - return u'(' + str(self.child) + ')>'  
126 -  
127 -class ShiftOrthSameTypeRule(UnaryRule):  
128 -  
129 - def __init__(self, child):  
130 - super(ShiftOrthSameTypeRule, self).__init__(child)  
131 -  
132 - def addToNFA(self, fsa):  
133 - raise ValueError()  
134 -  
135 - def _doAddToNFA(self, startState, endState):  
136 - self.child._doAddToNFA(startState, endState)  
137 - startState.setTransitionData(self.child.segnum, 2)  
138 -  
139 - def __str__(self):  
140 - return u'(' + str(self.child) + ')!>'  
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +import logging
  7 +from morfeuszbuilder.fsa import state
  8 +from morfeuszbuilder.utils.serializationUtils import htons
  9 +
class RulesState(state.State):
    """DFA state specialized for segmentation rules.

    Adds a ``weak`` flag on top of the generic FSA state: accepting states
    record whether the match they represent is weak, and encode that flag as
    their one-byte payload.
    """

    def __init__(self):
        super(RulesState, self).__init__()
        # Stays None until setAsAccepting() marks this state as accepting.
        self.weak = None

    def setAsAccepting(self, weak):
        """Mark this state as accepting and record its weakness flag."""
        self.weak = weak
        self.encodedData = bytearray([0x01 if weak else 0x00])

    def getEncodedSize(self):
        """Return the serialized size of this state in bytes.

        Layout: a 2-byte header (flags byte + transition-count byte) followed
        by 4 bytes per outgoing transition.
        """
        headerBytes = 2  # accepting/weak flags + number of transitions
        return headerBytes + 4 * len(self.transitionsMap)
  24 +
class RulesFSA(object):
    """Deterministic finite automaton over segmentation rules.

    Transition labels are ``(segnum, shiftOrth)`` pairs; ``serialize()``
    produces the binary form consumed by the Morfeusz runtime.
    """

    # Bit flags of the first byte of a serialized state.
    # Class-level constants: shared, and available even on instances that
    # were not built through __init__.
    ACCEPTING_FLAG = 1
    WEAK_FLAG = 2

    def __init__(self):
        # convertToDFA() replaces this with a RulesState-rooted automaton.
        self.initialState = state.State()

    def stateData2bytearray(self, state):
        """Serialize a state header: flags byte + transition-count byte.

        :param state: a RulesState (must expose isAccepting(), weak,
            transitionsMap)
        :return: 2-byte bytearray
        """
        res = bytearray()
        firstByte = 0
        if state.isAccepting():
            firstByte |= self.ACCEPTING_FLAG
        if state.weak:
            firstByte |= self.WEAK_FLAG
        assert 0 <= firstByte < 256
        res.append(firstByte)

        # Transition count must fit in a single byte.
        numTransitions = len(state.transitionsMap)
        assert 0 <= numTransitions < 256
        res.append(numTransitions)

        return res

    def transitionsData2bytearray(self, state):
        """Serialize all outgoing transitions of *state*.

        Each transition is 4 bytes: segment number, shift-orth flag (0/1),
        and the target state's offset as a big-endian uint16.
        """
        res = bytearray()
        for (segnum, shiftOrth), nextState in state.transitionsMap.iteritems():
            res.append(segnum)
            res.append(1 if shiftOrth else 0)
            offset = nextState.offset
            # Offsets are serialized as uint16 (htons), hence the limit.
            assert offset < 65536
            res.extend(htons(offset))
        return res

    def serialize(self):
        """Return the whole automaton as a bytearray.

        States are laid out in offset order; offsets are computed first from
        each state's encoded size so transitions can point forward.
        """
        self.initialState.calculateOffsets(sizeCounter=lambda s: s.getEncodedSize())
        res = bytearray()

        # 'st', not 'state', to avoid shadowing the imported module.
        for st in sorted(self.initialState.dfs(set()), key=lambda s: s.offset):
            res.extend(self.stateData2bytearray(st))
            res.extend(self.transitionsData2bytearray(st))

        logging.info('Segmentation automaton size: %d bytes', len(res))
        return res
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -4,7 +4,7 @@ Created on 20 lut 2014 @@ -4,7 +4,7 @@ Created on 20 lut 2014
4 @author: mlenart 4 @author: mlenart
5 ''' 5 '''
6 import logging 6 import logging
7 -from morfeuszbuilder.fsa.serializer import SimpleSerializer 7 +from morfeuszbuilder.utils.serializationUtils import htons, htonl
8 8
9 class RulesManager(object): 9 class RulesManager(object):
10 10
@@ -52,9 +52,9 @@ class RulesManager(object): @@ -52,9 +52,9 @@ class RulesManager(object):
52 52
53 def _serializeDFA(self, dfa): 53 def _serializeDFA(self, dfa):
54 res = bytearray() 54 res = bytearray()
55 - serializer = SimpleSerializer(dfa, serializeTransitionsData=True)  
56 - dfaBytearray = serializer.fsa2bytearray()  
57 - res.extend(serializer.htonl(len(dfaBytearray))) 55 +# serializer = SimpleSerializer(dfa, serializeTransitionsData=True)
  56 + dfaBytearray = dfa.serialize()
  57 + res.extend(htonl(len(dfaBytearray)))
58 res.extend(dfaBytearray) 58 res.extend(dfaBytearray)
59 return res 59 return res
60 60
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -4,7 +4,7 @@ Created on 24 sty 2014 @@ -4,7 +4,7 @@ Created on 24 sty 2014
4 @author: mlenart 4 @author: mlenart
5 ''' 5 '''
6 6
7 -from morfeuszbuilder.fsa import fsa, state, encode 7 +from morfeuszbuilder.segrules.rulesFSA import RulesFSA, RulesState
8 8
9 class RulesNFAState(object): 9 class RulesNFAState(object):
10 10
@@ -12,7 +12,7 @@ class RulesNFAState(object): @@ -12,7 +12,7 @@ class RulesNFAState(object):
12 12
13 def __init__(self, initial=False, final=False, weak=False): 13 def __init__(self, initial=False, final=False, weak=False):
14 self.transitionsMap = {} 14 self.transitionsMap = {}
15 - self.transitionsDataMap = {} 15 +# self.transitionsDataMap = {}
16 self.initial = initial 16 self.initial = initial
17 self.final = final 17 self.final = final
18 self.weak = weak 18 self.weak = weak
@@ -20,13 +20,9 @@ class RulesNFAState(object): @@ -20,13 +20,9 @@ class RulesNFAState(object):
20 RulesNFAState.statesCounter += 1 20 RulesNFAState.statesCounter += 1
21 21
22 def addTransition(self, label, targetState): 22 def addTransition(self, label, targetState):
  23 + assert label is None or len(label) == 2
23 self.transitionsMap.setdefault(label, set()) 24 self.transitionsMap.setdefault(label, set())
24 self.transitionsMap[label].add(targetState) 25 self.transitionsMap[label].add(targetState)
25 - self.transitionsDataMap[label] = 0  
26 -  
27 - def setTransitionData(self, label, byte):  
28 - assert len(self.transitionsMap[label]) == 1  
29 - self.transitionsDataMap[label] = byte  
30 26
31 def getClosure(self, visited): 27 def getClosure(self, visited):
32 if self in visited: 28 if self in visited:
@@ -64,10 +60,11 @@ class RulesNFA(object): @@ -64,10 +60,11 @@ class RulesNFA(object):
64 for nfaState in nfaStates: 60 for nfaState in nfaStates:
65 for label, nextStates in nfaState.transitionsMap.iteritems(): 61 for label, nextStates in nfaState.transitionsMap.iteritems():
66 if label is not None: 62 if label is not None:
67 - transitionData = nfaState.transitionsDataMap[label]  
68 - res.setdefault((label, transitionData), set()) 63 +# transitionData = nfaState.transitionsDataMap[label]
  64 + segnum, shiftOrth = label
  65 + res.setdefault((segnum, shiftOrth), set())
69 for nextNFAState in nextStates: 66 for nextNFAState in nextStates:
70 - res[(label, transitionData)] |= nextNFAState.getClosure(set()) 67 + res[(segnum, shiftOrth)] |= nextNFAState.getClosure(set())
71 return res 68 return res
72 69
73 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): 70 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
@@ -79,23 +76,24 @@ class RulesNFA(object): @@ -79,23 +76,24 @@ class RulesNFA(object):
79 if final: 76 if final:
80 # dfaState should be final 77 # dfaState should be final
81 # and contain info about weakness 78 # and contain info about weakness
82 - dfaState.encodedData = bytearray([1 if weak else 0])  
83 - for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): 79 + dfaState.setAsAccepting(weak=weak)
  80 +# dfaState.encodedData = bytearray([1 if weak else 0])
  81 + for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
84 key = frozenset(nextNFAStates) 82 key = frozenset(nextNFAStates)
85 if key in nfaSubset2DFAState: 83 if key in nfaSubset2DFAState:
86 nextDFAState = nfaSubset2DFAState[key] 84 nextDFAState = nfaSubset2DFAState[key]
87 else: 85 else:
88 - nextDFAState = state.State() 86 + nextDFAState = RulesState()
89 nfaSubset2DFAState[key] = nextDFAState 87 nfaSubset2DFAState[key] = nextDFAState
90 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) 88 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState)
91 - dfaState.setTransition(label, nextDFAState)  
92 - dfaState.setTransitionData(label, transitionData) 89 + dfaState.setTransition((segnum, shiftOrth), nextDFAState)
  90 +# dfaState.setTransitionData(label, transitionData)
93 91
94 def convertToDFA(self): 92 def convertToDFA(self):
95 - dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) 93 + dfa = RulesFSA()
96 startStates = self.initialState.getClosure(set()) 94 startStates = self.initialState.getClosure(set())
97 assert not any(filter(lambda s: s.final, startStates)) 95 assert not any(filter(lambda s: s.final, startStates))
98 - dfa.initialState = state.State(additionalData=False) 96 + dfa.initialState = RulesState()
99 self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) 97 self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
100 return dfa 98 return dfa
101 99
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -3,7 +3,7 @@ from pyparsing import * @@ -3,7 +3,7 @@ from pyparsing import *
3 ParserElement.enablePackrat() 3 ParserElement.enablePackrat()
4 from morfeuszbuilder.tagset import segtypes 4 from morfeuszbuilder.tagset import segtypes
5 from morfeuszbuilder.utils import configFile, exceptions 5 from morfeuszbuilder.utils import configFile, exceptions
6 -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager 6 +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString
7 import codecs 7 import codecs
8 import re 8 import re
9 9
@@ -48,8 +48,8 @@ class RulesParser(object): @@ -48,8 +48,8 @@ class RulesParser(object):
48 if not firstNFA: 48 if not firstNFA:
49 firstNFA = nfa 49 firstNFA = nfa
50 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') 50 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
51 - combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))  
52 - for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): 51 + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
  52 + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
53 # print rule 53 # print rule
54 rule.addToNFA(nfa) 54 rule.addToNFA(nfa)
55 # nfa.debug() 55 # nfa.debug()
@@ -60,25 +60,24 @@ class RulesParser(object): @@ -60,25 +60,24 @@ class RulesParser(object):
60 res.addDFA(key2Def, dfa) 60 res.addDFA(key2Def, dfa)
61 return res 61 return res
62 62
63 - def _doParse(self, combinationEnumeratedLines, segtypesHelper): 63 + def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename):
64 for lineNum, line in combinationEnumeratedLines: 64 for lineNum, line in combinationEnumeratedLines:
65 if not line.startswith('#'): 65 if not line.startswith('#'):
66 - yield self._doParseOneLine(lineNum, line, segtypesHelper) 66 + yield self._doParseOneLine(lineNum, line, segtypesHelper, filename)
67 67
68 - def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): 68 + def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
69 if not segtypesHelper.hasSegtype(segtype): 69 if not segtypesHelper.hasSegtype(segtype):
70 raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) 70 raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
71 else: 71 else:
72 # return rules.TagRule(segtype) 72 # return rules.TagRule(segtype)
73 - return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), segtype) 73 + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype)
74 74
75 - def _doParseOneLine(self, lineNum, line, segtypesHelper): 75 + def _doParseOneLine(self, lineNum, line, segtypesHelper, filename):
76 rule = Forward() 76 rule = Forward()
77 tagRule = Word(alphanums+'_') 77 tagRule = Word(alphanums+'_')
78 - shiftOrthRule = tagRule + '>'  
79 - shiftOrthSameTypeRule = tagRule + '!' + '>' 78 + shiftOrthRule = Word(alphanums+'_') + Suppress('>')
80 parenRule = Suppress('(') + rule + Suppress(')') 79 parenRule = Suppress('(') + rule + Suppress(')')
81 - atomicRule = tagRule ^ shiftOrthRule ^ shiftOrthSameTypeRule ^ parenRule 80 + atomicRule = tagRule ^ shiftOrthRule ^ parenRule
82 zeroOrMoreRule = atomicRule + Suppress('*') 81 zeroOrMoreRule = atomicRule + Suppress('*')
83 oneOrMoreRule = atomicRule + Suppress('+') 82 oneOrMoreRule = atomicRule + Suppress('+')
84 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule 83 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
@@ -87,13 +86,12 @@ class RulesParser(object): @@ -87,13 +86,12 @@ class RulesParser(object):
87 concatRule = OneOrMore(complexRule) 86 concatRule = OneOrMore(complexRule)
88 rule << concatRule 87 rule << concatRule
89 88
90 - tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))  
91 - shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0]))  
92 - shiftOrthSameTypeRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthSameTypeRule(toks[0])) 89 + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
  90 + shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
93 # parenRule.setParseAction(lambda string, loc, toks: toks[0]) 91 # parenRule.setParseAction(lambda string, loc, toks: toks[0])
94 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) 92 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
95 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) 93 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
96 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) 94 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
97 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) 95 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
98 - parsedRule = rule.parseString(line, parseAll=True)[0] 96 + parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
99 return parsedRule 97 return parsedRule
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -33,6 +33,7 @@ class Segtypes(object): @@ -33,6 +33,7 @@ class Segtypes(object):
33 raise exceptions.ConfigFileException(self.filename, lineNum, msg) 33 raise exceptions.ConfigFileException(self.filename, lineNum, msg)
34 34
35 def _readTags(self, segrulesConfigFile): 35 def _readTags(self, segrulesConfigFile):
  36 + gotWildcardPattern = False
36 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): 37 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
37 splitLine = re.split(r'\s+', line.strip()) 38 splitLine = re.split(r'\s+', line.strip())
38 self._validate( 39 self._validate(
@@ -49,13 +50,27 @@ class Segtypes(object): @@ -49,13 +50,27 @@ class Segtypes(object):
49 lineNum, 50 lineNum,
50 re.match(r'[a-z_\.\:\%]+', pattern)) 51 re.match(r'[a-z_\.\:\%]+', pattern))
51 52
  53 + self._validate(
  54 + u'Pattern that matches everything must be the last one',
  55 + lineNum - 1,
  56 + not gotWildcardPattern)
  57 +
52 if segtype in self.segtype2Segnum: 58 if segtype in self.segtype2Segnum:
53 segnum = self.segtype2Segnum[segtype] 59 segnum = self.segtype2Segnum[segtype]
54 else: 60 else:
55 segnum = len(self.segtype2Segnum) 61 segnum = len(self.segtype2Segnum)
56 self.segtype2Segnum[segtype] = segnum 62 self.segtype2Segnum[segtype] = segnum
57 63
58 - self.patternsList.append(SegtypePattern(None, pattern, segnum)) 64 + segtypePattern = SegtypePattern(None, pattern, segnum)
  65 +
  66 + self._validate(
  67 + u'There is no tag that matches pattern "%s".' % pattern,
  68 + lineNum,
  69 + any([segtypePattern.tryToMatch(None, tag) != -1 for tag in self.tagset.getAllTags()]))
  70 +
  71 + self.patternsList.append(segtypePattern)
  72 +
  73 + gotWildcardPattern = gotWildcardPattern or pattern == '%'
59 74
60 self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) 75 self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()])
61 76
@@ -67,7 +82,7 @@ class Segtypes(object): @@ -67,7 +82,7 @@ class Segtypes(object):
67 lineNum, 82 lineNum,
68 re.match(r'[a-z_]+', segtype)) 83 re.match(r'[a-z_]+', segtype))
69 self._validate( 84 self._validate(
70 - u'Pattern must contain lemma and POS', 85 + u'Pattern must contain lemma and part-of-speech fields',
71 lineNum, 86 lineNum,
72 re.match(r'.+\:[a-z_]+', pattern, re.U)) 87 re.match(r'.+\:[a-z_]+', pattern, re.U))
73 88
@@ -79,7 +94,14 @@ class Segtypes(object): @@ -79,7 +94,14 @@ class Segtypes(object):
79 94
80 lemma, pos = pattern.split(':') 95 lemma, pos = pattern.split(':')
81 96
82 - self.patternsList.append(SegtypePattern(lemma, '%s|%s:%%' % (pos, pos), segnum)) 97 + segtypePattern = SegtypePattern(lemma, pos + ':%', segnum)
  98 +
  99 + self._validate(
  100 + u'There is no tag that matches pattern "%s".' % (pos + ':%'),
  101 + lineNum,
  102 + any([segtypePattern.tryToMatch(lemma, tag) != -1 for tag in self.tagset.getAllTags()]))
  103 +
  104 + self.patternsList.append(segtypePattern)
83 105
84 def _debugSegnums(self): 106 def _debugSegnums(self):
85 for tagnum, segnum in self._tagnum2Segnum.items(): 107 for tagnum, segnum in self._tagnum2Segnum.items():
@@ -121,11 +143,6 @@ class Segtypes(object): @@ -121,11 +143,6 @@ class Segtypes(object):
121 if not res: 143 if not res:
122 res = self._tagnum2Segnum.get(tagnum, None) 144 res = self._tagnum2Segnum.get(tagnum, None)
123 return res 145 return res
124 -# for p in self.patternsList:  
125 -# res = p.tryToMatch(lemma, tag)  
126 -# if res >= 0:  
127 -# return res  
128 -# return None  
129 146
130 class SegtypePattern(object): 147 class SegtypePattern(object):
131 148
@@ -135,8 +152,13 @@ class SegtypePattern(object): @@ -135,8 +152,13 @@ class SegtypePattern(object):
135 self.segnum = segnum 152 self.segnum = segnum
136 153
137 def tryToMatch(self, lemma, tag): 154 def tryToMatch(self, lemma, tag):
  155 +# tag2Match = tag + ':' if not tag.endswith(':') else tag
  156 +# print tag2Match
  157 + patterns2Match = []
  158 + patterns2Match.append(self.pattern.replace('%', '.*'))
  159 + patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*'))
138 if (self.lemma is None or self.lemma == lemma) \ 160 if (self.lemma is None or self.lemma == lemma) \
139 - and re.match(self.pattern.replace('%', '.*'), tag): 161 + and any([re.match(p, tag) for p in patterns2Match]):
140 return self.segnum 162 return self.segnum
141 else: 163 else:
142 return -1 164 return -1
fsabuilder/morfeuszbuilder/utils/serializationUtils.py 0 → 100644
  1 +'''
  2 +Created on 12 mar 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
def htons(n):
    """Serialize an unsigned 16-bit integer as 2 big-endian bytes.

    Named after the C ``htons`` (host-to-network short): network byte
    order is big endian, which is the on-disk order of the FSA image.

    :param n: integer in the range [0, 65535]
    :return: bytearray of length 2, most significant byte first
    :raise ValueError: if n is outside the uint16 range
    """
    # A real exception instead of ``assert`` — asserts are stripped
    # under ``python -O`` and would let truncated values through.
    if not 0 <= n < (1 << 16):
        raise ValueError('value out of uint16 range: %r' % (n,))
    return bytearray([(n >> 8) & 0xFF, n & 0xFF])
  15 +
def htonl(n):
    """Serialize an unsigned 32-bit integer as 4 big-endian bytes.

    Named after the C ``htonl`` (host-to-network long): network byte
    order is big endian, which is the on-disk order of the FSA image.

    :param n: integer in the range [0, 2**32 - 1]
    :return: bytearray of length 4, most significant byte first
    :raise ValueError: if n is outside the uint32 range
    """
    # Bug fix: the original only checked ``n >= 0`` (and via assert),
    # so values >= 2**32 were silently serialized truncated by the
    # masks below. Validate the full range and raise a real exception.
    if not 0 <= n < (1 << 32):
        raise ValueError('value out of uint32 range: %r' % (n,))
    res = bytearray()
    res.append((n & 0xFF000000) >> 24)
    res.append((n & 0x00FF0000) >> 16)
    res.append((n & 0x0000FF00) >> 8)
    res.append(n & 0x000000FF)
    return res
input/dodatki.tab
@@ -41,13 +41,171 @@ z Z brev:pun @@ -41,13 +41,171 @@ z Z brev:pun
41 ż Ż brev:pun 41 ż Ż brev:pun
42 ch Ch brev:pun 42 ch Ch brev:pun
43 st St brev:pun 43 st St brev:pun
44 -0 0 dig  
45 -1 1 dig  
46 -2 2 dig  
47 -3 3 dig  
48 -4 4 dig  
49 -5 5 dig  
50 -6 6 dig  
51 -7 7 dig  
52 -8 8 dig  
53 -9 9 dig 44 +poli poli prefa
  45 +poli poli prefs
  46 +niby niby prefa
  47 +niby niby prefs
  48 +eks eks prefs
  49 +ex ex prefs
  50 +euro euro prefa
  51 +euro euro prefs
  52 +mikro mikro prefs
  53 +mikro mikro prefa
  54 +makro makro prefa
  55 +makro makro prefs
  56 +bez bez prefa
  57 +do do prefv
  58 +do do prefa
  59 +dez dez prefv
  60 +dez dez prefa
  61 +dez dez prefs
  62 +ko ko prefa
  63 +ko ko prefs
  64 +między między prefa
  65 +między między prefs
  66 +na na prefa
  67 +na na prefs
  68 +na na prefv
  69 +nad nad prefa
  70 +nad nad prefs
  71 +nad nad prefv
  72 +o o prefv
  73 +ob ob prefv
  74 +od od prefa
  75 +od od prefs
  76 +od od prefv
  77 +pra pra prefs
  78 +post post prefa
  79 +post post prefs
  80 +pod pod prefa
  81 +pod pod prefs
  82 +pod pod prefv
  83 +poza poza prefa
  84 +ponad ponad prefa
  85 +pre pre prefa
  86 +pre pre prefs
  87 +pro pro prefa
  88 +pro pro prefs
  89 +prze prze prefa
  90 +prze prze prefv
  91 +przeciw przeciw prefa
  92 +przeciw przeciw prefs
  93 +re re prefa
  94 +re re prefs
  95 +re re prefv
  96 +przy przy prefa
  97 +przy przy prefv
  98 +roz roz prefv
  99 +u u prefv
  100 +samo samo prefa
  101 +samo samo prefs
  102 +video video prefs
  103 +video video prefa
  104 +w w prefv
  105 +wy wy prefv
  106 +współ współ prefv
  107 +współ współ prefa
  108 +współ współ prefs
  109 +wice wice prefs
  110 +neo neo prefa
  111 +neo neo prefs
  112 +tele tele prefs
  113 +tele tele prefa
  114 +z z prefv
  115 +za za prefv
  116 +za za prefa
  117 +za za prefs
  118 +wideo wideo prefa
  119 +wideo wideo prefs
  120 +meta meta prefs
  121 +meta meta prefa
  122 +multi multi prefa
  123 +multi multi prefs
  124 +mega mega prefa
  125 +mega mega prefs
  126 +kontra kontra prefs
  127 +kontra kontra prefa
  128 +inter inter prefa
  129 +inter inter prefs
  130 +homo homo prefs
  131 +homo homo prefa
  132 +ekstra ekstra prefa
  133 +ekstra ekstra prefs
  134 +giga giga prefa
  135 +giga giga prefs
  136 +bi bi prefs
  137 +bi bi prefa
  138 +auto auto prefs
  139 +auto auto prefa
  140 +de de prefv
  141 +de de prefa
  142 +de de prefs
  143 +ultra ultra prefs
  144 +ultra ultra prefa
  145 +e- e- prefa
  146 +e- e- prefs
  147 +mini mini prefs
  148 +mini mini prefa
  149 +maxi maxi prefs
  150 +maxi maxi prefa
  151 +midi midi prefs
  152 +midi midi prefa
  153 +arcy arcy prefs
  154 +arcy arcy prefa
  155 +anty anty prefa
  156 +anty anty prefs
  157 +a a prefa
  158 +a a prefs
  159 +pan pan prefs
  160 +pan pan prefa
  161 +in in prefa
  162 +in in prefs
  163 +dys dys prefs
  164 +dys dys prefa
  165 +mono mono prefa
  166 +mono mono prefs
  167 +porno porno prefs
  168 +porno porno prefa
  169 +anglo anglo prefa
  170 +aero aero prefs
  171 +aero aero prefa
  172 +bio bio prefs
  173 +bio bio prefa
  174 +wszystko wszystko prefs
  175 +wszystko wszystko prefa
  176 +wszech wszech prefs
  177 +wszech wszech prefa
  178 +śród śród prefs
  179 +śród śród prefa
  180 +audio audio prefs
  181 +audio audio prefa
  182 +eko eko prefs
  183 +eko eko prefa
  184 +s s prefv
  185 +elektro elektro prefs
  186 +elektro elektro prefa
  187 +trans trans prefa
  188 +trans trans prefs
  189 +kontr kontr prefs
  190 +kontr kontr prefa
  191 +pseudo pseudo prefs
  192 +pseudo pseudo prefa
  193 +quasi quasi prefs
  194 +quasi quasi prefa
  195 +super super prefs
  196 +super super prefa
  197 +po po prefv
  198 +po po prefa
  199 +po po prefs
  200 +sub sub prefs
  201 +sub sub prefa
  202 +hiper hiper prefa
  203 +hiper hiper prefs
  204 +non non prefs
  205 +non non prefa
  206 +stereo stereo prefa
  207 +stereo stereo prefs
  208 +energo energo prefa
  209 +para para prefa
  210 +para para prefs
  211 +ś ś prefv
input/polimorf.tagset
@@ -584,6 +584,9 @@ @@ -584,6 +584,9 @@
584 579 interp 584 579 interp
585 580 brev:pun 585 580 brev:pun
586 581 brev:npun 586 581 brev:npun
  587 +582 prefa
  588 +583 prefs
  589 +584 prefv
587 590
588 [NAMES] 591 [NAMES]
589 592
input/segmenty.dat
@@ -19,7 +19,7 @@ samotny @@ -19,7 +19,7 @@ samotny
19 # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: 19 # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”:
20 moze_interp(praet_sg_na) 20 moze_interp(praet_sg_na)
21 21
22 -# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: 22 +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „moze”:
23 moze_interp(praet_sg) 23 moze_interp(praet_sg)
24 24
25 # przeszlik mnogi, np. „czytali”: 25 # przeszlik mnogi, np. „czytali”:
@@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg) @@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg)
69 # np. „gnietli·by·śmy” 69 # np. „gnietli·by·śmy”
70 moze_interp(praet_pl by aglpl) 70 moze_interp(praet_pl by aglpl)
71 #else 71 #else
72 -moze_interp(praetcond) 72 +# moze_interp(praetcond)
73 #endif 73 #endif
74 -  
75 # np. „by·ś” 74 # np. „by·ś”
76 moze_interp(by aglsg) 75 moze_interp(by aglsg)
77 # np. „by·ście” 76 # np. „by·ście”
@@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj ) @@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj )
98 # adja dywiz adja dywiz adja dywiz adj interp? 97 # adja dywiz adja dywiz adja dywiz adj interp?
99 # adja dywiz adja dywiz adja dywiz adja dywiz adj interp? 98 # adja dywiz adja dywiz adja dywiz adja dywiz adj interp?
100 99
101 -# Stopień najwyższy:  
102 -# np. „naj·zieleńszy”, „naj·mądrzej”  
103 -moze_interp( naj> adj_sup ) 100 +# Formy zanegowane stopnia wyższego przymiotników i przysłówków (WK)
  101 +# np. „nie·grzeczniejszy”, „nie·grzeczniej”
  102 +moze_interp( nie> adj_com )
104 103
105 # Formy „zanegowane” gerundiów i imiesłowów: 104 # Formy „zanegowane” gerundiów i imiesłowów:
106 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: 105 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
@@ -112,15 +111,21 @@ moze_interp(z_on_agl) @@ -112,15 +111,21 @@ moze_interp(z_on_agl)
112 moze_interp(z_on_agl on_agl) 111 moze_interp(z_on_agl on_agl)
113 112
114 # Liczba zapisana jako ciąg cyfr: 113 # Liczba zapisana jako ciąg cyfr:
115 -moze_interp( dig!>+ ) 114 +moze_interp( dig )
116 115
117 # Formacje prefiksalne 116 # Formacje prefiksalne
118 #### trzeba wydzielić odpowiednie samodze! 117 #### trzeba wydzielić odpowiednie samodze!
119 -# rzeczownikowe i przymiotnikowe  
120 -# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy”  
121 -moze_interp( prefs samodz ) 118 +# rzeczownikowe
  119 +# np. „euro·sodoma”, „e-·papieros”
  120 +moze_interp(nomina)
  121 +moze_interp( prefs> nomina )
122 # czasownikowe np. „po·nakapywać” 122 # czasownikowe np. „po·nakapywać”
123 -moze_interp( prefv samodz ) 123 +moze_interp(verba_imperf)
  124 +moze_interp( prefv> verba_imperf )
  125 +# przymiotnikowe np. „do·żylny”, „euro·sodomski”, „bez·argumentowy”
  126 +moze_interp(adjectiva)
  127 +moze_interp(prefa> adj)
  128 +moze_interp( prefa> adjectiva )
124 129
125 # Apozycje z dywizem 130 # Apozycje z dywizem
126 # np. „kobieta-prezydent” 131 # np. „kobieta-prezydent”
@@ -133,11 +138,28 @@ adj dywiz samodz @@ -133,11 +138,28 @@ adj dywiz samodz
133 # ? 138 # ?
134 samodz dywiz adj 139 samodz dywiz adj
135 140
  141 +#### PONIŻEJ REGUŁY WK
  142 +# Stopień najwyższy:
  143 +# np. „naj·zieleńszy”, „naj·mądrzej”
  144 +moze_interp( naj> adj_sup )
  145 +# Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj
  146 +moze_interp( praet_sg dywiz li)
  147 +moze_interp( praet_pl dywiz li)
  148 +moze_interp( praet_sg_na dywiz li)
  149 +moze_interp( fin dywiz li)
  150 +
  151 +# i bez dywizu --- czy bez dywizu jest sens to łapać?
  152 +#moze_interp( praet_sg li)
  153 +#moze_interp( praet_pl li)
  154 +#moze_interp( praet_sg_na li)
  155 +#moze_interp( fin li)
  156 +
136 [segment types] 157 [segment types]
137 naj 158 naj
138 nie 159 nie
139 prefs 160 prefs
140 prefv 161 prefv
  162 +prefa
141 dig 163 dig
142 adja 164 adja
143 adj 165 adj
@@ -161,11 +183,14 @@ naj naj @@ -161,11 +183,14 @@ naj naj
161 nie nie 183 nie nie
162 prefs prefs 184 prefs prefs
163 prefv prefv 185 prefv prefv
  186 +prefa prefa
164 dig dig 187 dig dig
165 adja adja 188 adja adja
166 adj adj:%:pos 189 adj adj:%:pos
167 adj_sup adj:%:sup 190 adj_sup adj:%:sup
168 adj_sup adv:sup 191 adj_sup adv:sup
  192 +adj_com adj:%:com
  193 +adj_com adv:com
169 negat ger:%:neg 194 negat ger:%:neg
170 negat pact:%:neg 195 negat pact:%:neg
171 negat ppas:%:neg 196 negat ppas:%:neg
@@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep @@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
173 z_on_agl prep:% 198 z_on_agl prep:%
174 samotny brev:pun 199 samotny brev:pun
175 samotny brev:npun 200 samotny brev:npun
176 -samotny intrj 201 +samotny interj
177 interp interp 202 interp interp
178 aglsg aglt:sg:% 203 aglsg aglt:sg:%
179 aglpl aglt:pl:% 204 aglpl aglt:pl:%
180 -praetcond cond:%  
181 -praetcond praet:%:pri:%  
182 -praetcond praet:%:sec:%  
183 -praetcond praet:%:ter:%  
184 praet_sg_agl praet:sg:%:agl 205 praet_sg_agl praet:sg:%:agl
185 praet_sg_na praet:sg:%:nagl 206 praet_sg_na praet:sg:%:nagl
186 praet_sg praet:sg:% 207 praet_sg praet:sg:%
187 praet_pl praet:pl:% 208 praet_pl praet:pl:%
188 praet_sg winien:sg:% 209 praet_sg winien:sg:%
189 praet_pl winien:pl:% 210 praet_pl winien:pl:%
  211 +fin fin:%
  212 +nomina subst:%
  213 +nomina ger:%
  214 +nomina depr:%
  215 +adjectiva adv:%
  216 +adjectiva ppas:%
  217 +adjectiva pact:%
  218 +verba_imperf praet:%:imperf
  219 +verba_imperf fin:%:imperf
  220 +verba_imperf inf:imperf
  221 +verba_imperf imps:imperf
  222 +verba_imperf impt:%:imperf
190 samodz % 223 samodz %
191 224
192 [lexemes] 225 [lexemes]
193 z_aglt aby:comp 226 z_aglt aby:comp
194 z_aglt bowiem:comp 227 z_aglt bowiem:comp
195 by by:qub 228 by by:qub
  229 +li li:qub
196 z_aglt by:comp 230 z_aglt by:comp
197 z_aglt cóż:subst 231 z_aglt cóż:subst
198 z_aglt czemu:adv 232 z_aglt czemu:adv
input/segmenty1.dat
@@ -7,9 +7,10 @@ praet=split composite @@ -7,9 +7,10 @@ praet=split composite
7 7
8 #define moze_interp(segmenty) wsz_interp segmenty wsz_interp 8 #define moze_interp(segmenty) wsz_interp segmenty wsz_interp
9 9
  10 +dig>* dig
10 (adja dywiz)+ adj 11 (adja dywiz)+ adj
11 -dig!>+  
12 -dig!> dig!> dig!> 12 +#dig!>+
  13 +#dig!> dig!> dig!>
13 naj> adj_sup 14 naj> adj_sup
14 15
15 [segment types] 16 [segment types]
@@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep @@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
52 z_on_agl prep:% 53 z_on_agl prep:%
53 samotny brev:pun 54 samotny brev:pun
54 samotny brev:npun 55 samotny brev:npun
55 -samotny intrj 56 +samotny interj
56 interp interp 57 interp interp
57 aglsg aglt:sg:% 58 aglsg aglt:sg:%
58 aglpl aglt:pl:% 59 aglpl aglt:pl:%
59 -praetcond cond:%  
60 -praetcond praet:%:pri:%  
61 -praetcond praet:%:sec:%  
62 -praetcond praet:%:ter:%  
63 -praet_sg_agl praet:sg:%:agl  
64 -praet_sg_na praet:sg:%:nagl  
65 -praet_sg praet:sg:%  
66 -praet_pl praet:pl:%  
67 -praet_sg winien:sg:%  
68 -praet_pl winien:pl:%  
69 samodz % 60 samodz %
70 61
71 [lexemes] 62 [lexemes]
morfeusz/InterpretedChunk.hpp
@@ -17,7 +17,6 @@ struct InterpretedChunk { @@ -17,7 +17,6 @@ struct InterpretedChunk {
17 std::vector<uint32_t> lowercaseCodepoints; 17 std::vector<uint32_t> lowercaseCodepoints;
18 InterpsGroup interpsGroup; 18 InterpsGroup interpsGroup;
19 bool shiftOrth; 19 bool shiftOrth;
20 - bool shiftOrthSameType;  
21 bool orthWasShifted; 20 bool orthWasShifted;
22 std::vector<InterpretedChunk> prefixChunks; 21 std::vector<InterpretedChunk> prefixChunks;
23 }; 22 };
morfeusz/Morfeusz.cpp
@@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() { @@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() {
37 return res; 37 return res;
38 } 38 }
39 39
  40 +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
  41 + SegrulesOptions opts;
  42 + opts["aggl"] = "isolated";
  43 + opts["praet"] = "split";
  44 + return (*(map.find(opts))).second;
  45 +}
  46 +
40 Morfeusz::Morfeusz() 47 Morfeusz::Morfeusz()
41 : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), 48 : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET),
42 analyzerPtr(DEFAULT_FSA), 49 analyzerPtr(DEFAULT_FSA),
43 analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), 50 analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())),
44 segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), 51 segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)),
  52 +currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
45 isAnalyzerFSAFromFile(false), 53 isAnalyzerFSAFromFile(false),
46 generatorPtr(DEFAULT_SYNTH_FSA), 54 generatorPtr(DEFAULT_SYNTH_FSA),
47 isGeneratorFSAFromFile(false), 55 isGeneratorFSAFromFile(false),
@@ -50,9 +58,9 @@ options(createDefaultOptions()) { @@ -50,9 +58,9 @@ options(createDefaultOptions()) {
50 58
51 } 59 }
52 60
53 -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSAType*>& fsasMap) { 61 +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
54 for ( 62 for (
55 - std::map<SegrulesOptions, SegrulesFSAType*>::iterator it = fsasMap.begin(); 63 + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin();
56 it != fsasMap.end(); 64 it != fsasMap.end();
57 ++it) { 65 ++it) {
58 delete it->second; 66 delete it->second;
@@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord( @@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord(
100 vector<InterpretedChunk> accum; 108 vector<InterpretedChunk> accum;
101 FlexionGraph graph; 109 FlexionGraph graph;
102 const char* currInput = inputStart; 110 const char* currInput = inputStart;
103 - SegrulesOptions opts;  
104 - opts["aggl"] = "isolated";  
105 - opts["praet"] = "split";  
106 - SegrulesFSAType* segrulesFSA = (*(this->segrulesFSAsMap.find(opts))).second;  
107 - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->getInitialState()); 111 + SegrulesFSA* segrulesFSA = this->currSegrulesFSA;
  112 + doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState);
108 if (!graph.empty()) { 113 if (!graph.empty()) {
109 InterpretedChunksDecoder interpretedChunksDecoder(env); 114 InterpretedChunksDecoder interpretedChunksDecoder(env);
110 int srcNode = startNodeNum; 115 int srcNode = startNodeNum;
@@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord( @@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord(
118 srcNode++; 123 srcNode++;
119 } 124 }
120 // graph.getResults(*this->tagset, results); 125 // graph.getResults(*this->tagset, results);
121 - } else if (inputStart != inputEnd) { 126 + }
  127 + else if (inputStart != inputEnd) {
122 this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); 128 this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results);
123 } 129 }
124 inputStart = currInput; 130 inputStart = currInput;
@@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord( @@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord(
126 132
127 static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { 133 static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
128 to.prefixChunks.insert( 134 to.prefixChunks.insert(
129 - to.prefixChunks.begin(),  
130 - from.prefixChunks.begin(),  
131 - from.prefixChunks.end()); 135 + to.prefixChunks.begin(),
  136 + from.prefixChunks.begin(),
  137 + from.prefixChunks.end());
132 to.prefixChunks.push_back(from); 138 to.prefixChunks.push_back(from);
133 from.orthWasShifted = true; 139 from.orthWasShifted = true;
134 } 140 }
@@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord( @@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord(
138 const char* inputEnd, 144 const char* inputEnd,
139 vector<InterpretedChunk>& accum, 145 vector<InterpretedChunk>& accum,
140 FlexionGraph& graph, 146 FlexionGraph& graph,
141 - SegrulesStateType segrulesState) const { 147 + SegrulesState segrulesState) const {
  148 + // cerr << "doAnalyzeOneWord " << inputData << endl;
142 bool endOfWord = inputData == inputEnd; 149 bool endOfWord = inputData == inputEnd;
143 const char* currInput = inputData; 150 const char* currInput = inputData;
144 uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); 151 uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd);
@@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord( @@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord(
159 vector<InterpsGroup> val(state.getValue()); 166 vector<InterpsGroup> val(state.getValue());
160 for (unsigned int i = 0; i < val.size(); i++) { 167 for (unsigned int i = 0; i < val.size(); i++) {
161 InterpsGroup& ig = val[i]; 168 InterpsGroup& ig = val[i];
162 - cerr << (int) ig.type << endl;  
163 - SegrulesStateType newSegrulesState = segrulesState;  
164 - newSegrulesState.proceedToNext(ig.type);  
165 - if (!newSegrulesState.isSink()) {  
166 - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;  
167 - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;  
168 - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false};  
169 - if (!accum.empty()  
170 - && (accum.back().shiftOrth  
171 - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { 169 + // newSegrulesState.proceedToNext(ig.type);
  170 + // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates);
  171 + set<SegrulesState> newSegrulesStates;
  172 + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
  173 + for (
  174 + set<SegrulesState>::iterator it = newSegrulesStates.begin();
  175 + it != newSegrulesStates.end();
  176 + it++) {
  177 + SegrulesState newSegrulesState = *it;
  178 + // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;
  179 + // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;
  180 + InterpretedChunk ic = {
  181 + inputData,
  182 + originalCodepoints,
  183 + lowercaseCodepoints,
  184 + ig,
  185 + newSegrulesState.shiftOrthFromPrevious,
  186 + false,
  187 + vector<InterpretedChunk>()
  188 + };
  189 + if (!accum.empty() && accum.back().shiftOrth) {
172 doShiftOrth(accum.back(), ic); 190 doShiftOrth(accum.back(), ic);
173 } 191 }
174 accum.push_back(ic); 192 accum.push_back(ic);
@@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord( @@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord(
182 this->env.getCharsetConverter().next(currInput, inputEnd); 200 this->env.getCharsetConverter().next(currInput, inputEnd);
183 } 201 }
184 } 202 }
  203 + // cerr << "end of word" << endl;
185 // we are at the end of word 204 // we are at the end of word
186 if (state.isAccepting()) { 205 if (state.isAccepting()) {
187 vector<InterpsGroup > val(state.getValue()); 206 vector<InterpsGroup > val(state.getValue());
188 for (unsigned int i = 0; i < val.size(); i++) { 207 for (unsigned int i = 0; i < val.size(); i++) {
189 InterpsGroup& ig = val[i]; 208 InterpsGroup& ig = val[i];
190 - SegrulesStateType newSegrulesState = segrulesState;  
191 - newSegrulesState.proceedToNext(ig.type);  
192 - if (newSegrulesState.isAccepting()) {  
193 - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;  
194 - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;  
195 - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false};  
196 - if (!accum.empty()  
197 - && (accum.back().shiftOrth  
198 - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) {  
199 - doShiftOrth(accum.back(), ic); 209 + // cerr << "currInput=" << currInput << endl;
  210 + // cerr << "type=" << (int) ig.type << endl;
  211 + set<SegrulesState> newSegrulesStates;
  212 + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
  213 + for (
  214 + set<SegrulesState>::iterator it = newSegrulesStates.begin();
  215 + it != newSegrulesStates.end();
  216 + it++) {
  217 + SegrulesState newSegrulesState = *it;
  218 + if (newSegrulesState.accepting) {
  219 + InterpretedChunk ic = {
  220 + inputData,
  221 + originalCodepoints,
  222 + lowercaseCodepoints,
  223 + ig,
  224 + newSegrulesState.shiftOrthFromPrevious,
  225 + false,
  226 + vector<InterpretedChunk>()};
  227 + if (!accum.empty() && accum.back().shiftOrth) {
  228 + doShiftOrth(accum.back(), ic);
  229 + }
  230 + accum.push_back(ic);
  231 + graph.addPath(accum);
  232 + accum.pop_back();
200 } 233 }
201 - accum.push_back(ic);  
202 - graph.addPath(accum);  
203 - accum.pop_back();  
204 - } else if (!newSegrulesState.isSink()) {  
205 - } else {  
206 } 234 }
207 } 235 }
208 } 236 }
morfeusz/Morfeusz.hpp
@@ -12,6 +12,7 @@ @@ -12,6 +12,7 @@
12 #include <list> 12 #include <list>
13 #include <vector> 13 #include <vector>
14 #include <map> 14 #include <map>
  15 +#include <set>
15 #include "EncodedInterpretation.hpp" 16 #include "EncodedInterpretation.hpp"
16 #include "fsa/fsa.hpp" 17 #include "fsa/fsa.hpp"
17 #include "MorphInterpretation.hpp" 18 #include "MorphInterpretation.hpp"
@@ -27,6 +28,7 @@ @@ -27,6 +28,7 @@
27 #include "Environment.hpp" 28 #include "Environment.hpp"
28 29
29 #include "segrules/segrules.hpp" 30 #include "segrules/segrules.hpp"
  31 +#include "segrules/SegrulesFSA.hpp"
30 32
31 class Morfeusz; 33 class Morfeusz;
32 class ResultsIterator; 34 class ResultsIterator;
@@ -111,7 +113,7 @@ private: @@ -111,7 +113,7 @@ private:
111 const char* inputEnd, 113 const char* inputEnd,
112 std::vector<InterpretedChunk>& accum, 114 std::vector<InterpretedChunk>& accum,
113 FlexionGraph& graph, 115 FlexionGraph& graph,
114 - SegrulesStateType segrulesState) const; 116 + SegrulesState segrulesState) const;
115 117
116 void appendIgnotiumToResults( 118 void appendIgnotiumToResults(
117 const std::string& word, 119 const std::string& word,
@@ -120,17 +122,13 @@ private: @@ -120,17 +122,13 @@ private:
120 Environment env; 122 Environment env;
121 const unsigned char* analyzerPtr; 123 const unsigned char* analyzerPtr;
122 FSAType* analyzerFSA; 124 FSAType* analyzerFSA;
123 - std::map<SegrulesOptions, SegrulesFSAType*> segrulesFSAsMap; 125 + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
  126 + SegrulesFSA* currSegrulesFSA;
124 bool isAnalyzerFSAFromFile; 127 bool isAnalyzerFSAFromFile;
125 128
126 const unsigned char* generatorPtr; 129 const unsigned char* generatorPtr;
127 bool isGeneratorFSAFromFile; 130 bool isGeneratorFSAFromFile;
128 Generator generator; 131 Generator generator;
129 -// const CharsetConverter* charsetConverter;  
130 -// const Tagset* tagset;  
131 -// const CaseConverter* caseConverter;  
132 -//  
133 -// UTF8CharsetConverter utf8CharsetConverter;  
134 132
135 MorfeuszOptions options; 133 MorfeuszOptions options;
136 }; 134 };
morfeusz/segrules/SegrulesFSA.hpp 0 → 100644
  1 +/*
  2 + * File: SegrulesFSA.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 marzec 2014, 17:52
  6 + */
  7 +
  8 +#ifndef SEGRULESFSA_HPP
  9 +#define SEGRULESFSA_HPP
  10 +
  11 +#include <set>
  12 +#include "../endianness.hpp"
  13 +
// A single state of the segmentation-rules automaton, identified by its
// byte offset within the serialized FSA image.
struct SegrulesState {
    uint16_t offset;             // byte offset of this state's record in the FSA image
    bool accepting;              // the state record's ACCEPTING flag (bit 0) is set
    bool weak;                   // the state record's WEAK flag (bit 1) is set
    bool shiftOrthFromPrevious;  // flag carried on the transition that led here
};

// Ordering (and therefore deduplication inside std::set<SegrulesState>)
// uses the offset only.
// NOTE(review): shiftOrthFromPrevious comes from the transition, not the
// state record — two transitions to the same offset with different shift
// flags would collapse to one set entry; confirm this is intended.
inline bool operator<(const SegrulesState& s1, const SegrulesState& s2)
{
    return s1.offset < s2.offset;
}
  25 +
// Reader for one serialized segmentation-rules automaton.
// Operates directly on the in-memory image pointed to by `ptr`;
// it does not own or copy that memory.
class SegrulesFSA {
public:
    // `ptr` points at the start of the serialized automaton;
    // the initial state is the record at offset 0.
    SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) {
        SegrulesState state = {0, false, false, false};
        initialState = state;
    }

    // Collect into `newStates` every state reachable from `state` by a
    // transition labelled `segnum`.
    // Record layout implied by the reads below: 1 flags byte, then a
    // 1-byte transition count, then `transitionsNum` 4-byte transitions
    // of the form [segnum label, shiftOrth flag, 2-byte target offset]
    // — TODO confirm against the serializer in fsabuilder.
    void proceedToNext(
            const unsigned char segnum,
            const SegrulesState state,
            std::set<SegrulesState>& newStates) const {

        const unsigned char* currPtr = ptr + state.offset;
        currPtr++;  // skip the state's flags byte
        const unsigned char transitionsNum = *currPtr;
        currPtr++;
        for (unsigned int i = 0; i < transitionsNum; i++) {
            if (*currPtr == segnum) {
                newStates.insert(newStates.begin(), this->transition2State(currPtr));
            }
            currPtr += 4;  // each serialized transition occupies 4 bytes
        }
    }

    virtual ~SegrulesFSA() {}

    SegrulesState initialState;
private:
    const unsigned char* ptr;  // start of the FSA image; not owned

    // Decode the 4-byte transition record at `transitionPtr` into the
    // target state, reading the target's flags from its state record.
    SegrulesState transition2State(const unsigned char* transitionPtr) const {
        unsigned char ACCEPTING_FLAG = 1;
        unsigned char WEAK_FLAG = 2;
        SegrulesState res;
        transitionPtr++;  // skip the transition's segnum label byte
        res.shiftOrthFromPrevious = *transitionPtr;
        transitionPtr++;
        // NOTE(review): unaligned uint16 read plus htons to convert the
        // big-endian on-disk offset to host order — presumably htons here
        // (from ../endianness.hpp) byte-swaps only on little-endian hosts;
        // ntohs would be the conventional name. Confirm on big-endian hosts.
        res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
        res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
        res.weak = *(ptr + res.offset) & WEAK_FLAG;
        return res;
    }
};
  69 +
  70 +#endif /* SEGRULESFSA_HPP */
  71 +
morfeusz/segrules/segrules.cpp
@@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { @@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
33 return res; 33 return res;
34 } 34 }
35 35
36 -static inline SegrulesFSAType* deserializeFSA(const unsigned char*& ptr) { 36 +static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
37 uint32_t fsaSize = deserializeUint32(ptr); 37 uint32_t fsaSize = deserializeUint32(ptr);
38 - static SegrulesDeserializer deserializer;  
39 - SegrulesFSAType* res = SegrulesFSAType::getFSA(ptr, deserializer); 38 +// static SegrulesDeserializer deserializer;
  39 + SegrulesFSA* res = new SegrulesFSA(ptr);
40 ptr += fsaSize; 40 ptr += fsaSize;
41 return res; 41 return res;
42 } 42 }
43 43
44 -map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) {  
45 - map<SegrulesOptions, SegrulesFSAType*> res; 44 +map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) {
  45 + map<SegrulesOptions, SegrulesFSA*> res;
46 const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr); 46 const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr);
47 const unsigned char* currPtr = fsasMapPtr; 47 const unsigned char* currPtr = fsasMapPtr;
48 unsigned char fsasNum = *currPtr; 48 unsigned char fsasNum = *currPtr;
49 currPtr++; 49 currPtr++;
50 for (unsigned char i = 0; i < fsasNum; i++) { 50 for (unsigned char i = 0; i < fsasNum; i++) {
51 SegrulesOptions options = deserializeOptions(currPtr); 51 SegrulesOptions options = deserializeOptions(currPtr);
52 - SegrulesFSAType* fsa = deserializeFSA(currPtr); 52 + SegrulesFSA* fsa = deserializeFSA(currPtr);
53 res[options] = fsa; 53 res[options] = fsa;
54 } 54 }
55 return res; 55 return res;
morfeusz/segrules/segrules.hpp
@@ -11,13 +11,13 @@ @@ -11,13 +11,13 @@
11 #include <utility> 11 #include <utility>
12 #include <map> 12 #include <map>
13 #include <string> 13 #include <string>
14 -#include "../fsa/fsa.hpp" 14 +#include "SegrulesFSA.hpp"
15 15
16 typedef std::map<std::string, std::string> SegrulesOptions; 16 typedef std::map<std::string, std::string> SegrulesOptions;
17 -typedef State<unsigned char> SegrulesStateType;  
18 -typedef FSA<unsigned char> SegrulesFSAType; 17 +//typedef State<unsigned char> SegrulesStateType;
  18 +//typedef FSA<unsigned char> SegrulesFSAType;
19 19
20 -std::map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); 20 +std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
21 21
22 #endif /* SEGRULES_HPP */ 22 #endif /* SEGRULES_HPP */
23 23
nbproject/configurations.xml
@@ -106,14 +106,20 @@ @@ -106,14 +106,20 @@
106 </makeTool> 106 </makeTool>
107 </makefileType> 107 </makefileType>
108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
  109 + <ccTool flags="1">
  110 + </ccTool>
109 </item> 111 </item>
110 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 112 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
  113 + <ccTool flags="1">
  114 + </ccTool>
111 </item> 115 </item>
112 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> 116 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
113 </item> 117 </item>
114 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 118 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
115 </item> 119 </item>
116 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> 120 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  121 + <ccTool flags="1">
  122 + </ccTool>
117 </item> 123 </item>
118 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" 124 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
119 ex="false" 125 ex="false"
@@ -121,7 +127,6 @@ @@ -121,7 +127,6 @@
121 flavor2="8"> 127 flavor2="8">
122 <ccTool> 128 <ccTool>
123 <incDir> 129 <incDir>
124 - <pElem>build</pElem>  
125 <pElem>/usr/lib/jvm/default-java/include</pElem> 130 <pElem>/usr/lib/jvm/default-java/include</pElem>
126 <pElem>morfeusz</pElem> 131 <pElem>morfeusz</pElem>
127 <pElem>build/morfeusz/java</pElem> 132 <pElem>build/morfeusz/java</pElem>
@@ -145,7 +150,6 @@ @@ -145,7 +150,6 @@
145 flavor2="8"> 150 flavor2="8">
146 <ccTool> 151 <ccTool>
147 <incDir> 152 <incDir>
148 - <pElem>build</pElem>  
149 <pElem>/usr/include/python2.7</pElem> 153 <pElem>/usr/include/python2.7</pElem>
150 <pElem>morfeusz</pElem> 154 <pElem>morfeusz</pElem>
151 <pElem>build/morfeusz/python</pElem> 155 <pElem>build/morfeusz/python</pElem>
@@ -173,9 +177,8 @@ @@ -173,9 +177,8 @@
173 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> 177 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4">
174 <ccTool flags="1"> 178 <ccTool flags="1">
175 <incDir> 179 <incDir>
176 - <pElem>build1</pElem>  
177 <pElem>morfeusz</pElem> 180 <pElem>morfeusz</pElem>
178 - <pElem>build1/morfeusz</pElem> 181 + <pElem>morfeusz/build/morfeusz</pElem>
179 </incDir> 182 </incDir>
180 <preprocessorList> 183 <preprocessorList>
181 <Elem>libmorfeusz_EXPORTS</Elem> 184 <Elem>libmorfeusz_EXPORTS</Elem>
@@ -185,9 +188,8 @@ @@ -185,9 +188,8 @@
185 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 188 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
186 <ccTool flags="1"> 189 <ccTool flags="1">
187 <incDir> 190 <incDir>
188 - <pElem>build1</pElem>  
189 <pElem>morfeusz</pElem> 191 <pElem>morfeusz</pElem>
190 - <pElem>build1/morfeusz</pElem> 192 + <pElem>morfeusz/build/morfeusz</pElem>
191 </incDir> 193 </incDir>
192 <preprocessorList> 194 <preprocessorList>
193 <Elem>libmorfeusz_EXPORTS</Elem> 195 <Elem>libmorfeusz_EXPORTS</Elem>
@@ -266,12 +268,18 @@ @@ -266,12 +268,18 @@
266 </preprocessorList> 268 </preprocessorList>
267 </ccTool> 269 </ccTool>
268 </folder> 270 </folder>
269 - <folder path="morfeusz/java"> 271 + <folder path="morfeusz">
270 <ccTool> 272 <ccTool>
271 <incDir> 273 <incDir>
272 <pElem>build</pElem> 274 <pElem>build</pElem>
  275 + </incDir>
  276 + </ccTool>
  277 + </folder>
  278 + <folder path="morfeusz/java">
  279 + <ccTool>
  280 + <incDir>
273 <pElem>morfeusz</pElem> 281 <pElem>morfeusz</pElem>
274 - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> 282 + <pElem>/usr/lib/jvm/default-java/include</pElem>
275 </incDir> 283 </incDir>
276 <preprocessorList> 284 <preprocessorList>
277 <Elem>libjmorfeusz_EXPORTS</Elem> 285 <Elem>libjmorfeusz_EXPORTS</Elem>
@@ -281,7 +289,6 @@ @@ -281,7 +289,6 @@
281 <folder path="morfeusz/python"> 289 <folder path="morfeusz/python">
282 <ccTool> 290 <ccTool>
283 <incDir> 291 <incDir>
284 - <pElem>build</pElem>  
285 <pElem>/usr/include/python2.7</pElem> 292 <pElem>/usr/include/python2.7</pElem>
286 <pElem>morfeusz</pElem> 293 <pElem>morfeusz</pElem>
287 </incDir> 294 </incDir>
@@ -407,18 +414,26 @@ @@ -407,18 +414,26 @@
407 </ccTool> 414 </ccTool>
408 </item> 415 </item>
409 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> 416 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  417 + <ccTool flags="1">
  418 + </ccTool>
410 </item> 419 </item>
411 <item path="morfeusz/charset/CharsetConverter.cpp" 420 <item path="morfeusz/charset/CharsetConverter.cpp"
412 ex="false" 421 ex="false"
413 tool="1" 422 tool="1"
414 flavor2="4"> 423 flavor2="4">
  424 + <ccTool flags="1">
  425 + </ccTool>
415 </item> 426 </item>
416 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> 427 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  428 + <ccTool flags="1">
  429 + </ccTool>
417 </item> 430 </item>
418 <item path="morfeusz/charset/conversion_tables.cpp" 431 <item path="morfeusz/charset/conversion_tables.cpp"
419 ex="false" 432 ex="false"
420 tool="1" 433 tool="1"
421 flavor2="4"> 434 flavor2="4">
  435 + <ccTool flags="1">
  436 + </ccTool>
422 </item> 437 </item>
423 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> 438 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
424 <ccTool flags="1"> 439 <ccTool flags="1">
@@ -507,8 +522,12 @@ @@ -507,8 +522,12 @@
507 ex="false" 522 ex="false"
508 tool="1" 523 tool="1"
509 flavor2="4"> 524 flavor2="4">
  525 + <ccTool flags="1">
  526 + </ccTool>
510 </item> 527 </item>
511 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> 528 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  529 + <ccTool flags="1">
  530 + </ccTool>
512 </item> 531 </item>
513 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> 532 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
514 <ccTool flags="0"> 533 <ccTool flags="0">