Commit 00e66248a61ae340a23b5635cfc761be6dbf38cd
1 parent
a6f0d912
poprawiona obsługa segmentacji (działają już cyfry tak, jak na początku ustalono)
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@112 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
26 changed files
with
629 additions
and
236 deletions
CMakeLists.txt
@@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") | @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") | ||
36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | else () | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | 39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorf-0.6.7.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") |
40 | endif () | 40 | endif () |
41 | endif () | 41 | endif () |
42 | 42 |
buildAll.sh
@@ -10,7 +10,7 @@ function build { | @@ -10,7 +10,7 @@ function build { | ||
10 | targets=$@ | 10 | targets=$@ |
11 | 11 | ||
12 | srcDir=`pwd` | 12 | srcDir=`pwd` |
13 | - buildDir=build/build-$os-$arch | 13 | + buildDir=buildall/build-$os-$arch |
14 | targetDir=$srcDir/target/$os-$arch | 14 | targetDir=$srcDir/target/$os-$arch |
15 | toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake | 15 | toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake |
16 | 16 |
fsabuilder/buildfsa.py
@@ -261,8 +261,9 @@ def main(opts): | @@ -261,8 +261,9 @@ def main(opts): | ||
261 | if __name__ == '__main__': | 261 | if __name__ == '__main__': |
262 | import os | 262 | import os |
263 | opts = _parseOptions() | 263 | opts = _parseOptions() |
264 | - try: | ||
265 | - main(opts) | ||
266 | - except Exception as ex: | ||
267 | - print >> sys.stderr, unicode(ex).encode('utf8') | 264 | +# try: |
265 | + main(opts) | ||
266 | +# except Exception as ex: | ||
267 | +# raise ex | ||
268 | +# print >> sys.stderr, unicode(ex).encode('utf8') | ||
268 | 269 |
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -113,12 +113,13 @@ class FSA(object): | @@ -113,12 +113,13 @@ class FSA(object): | ||
113 | return q | 113 | return q |
114 | 114 | ||
115 | def calculateOffsets(self, sizeCounter): | 115 | def calculateOffsets(self, sizeCounter): |
116 | - currReverseOffset = 0 | ||
117 | - for state in self.initialState.dfs(set()): | ||
118 | - currReverseOffset += sizeCounter(state) | ||
119 | - state.reverseOffset = currReverseOffset | ||
120 | - for state in self.initialState.dfs(set()): | ||
121 | - state.offset = currReverseOffset - state.reverseOffset | 116 | + self.initialState.calculateOffsets(sizeCounter) |
117 | +# currReverseOffset = 0 | ||
118 | +# for state in self.initialState.dfs(set()): | ||
119 | +# currReverseOffset += sizeCounter(state) | ||
120 | +# state.reverseOffset = currReverseOffset | ||
121 | +# for state in self.initialState.dfs(set()): | ||
122 | +# state.offset = currReverseOffset - state.reverseOffset | ||
122 | 123 | ||
123 | def debug(self): | 124 | def debug(self): |
124 | for state in self.initialState.dfs(set()): | 125 | for state in self.initialState.dfs(set()): |
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -6,6 +6,7 @@ Created on Oct 20, 2013 | @@ -6,6 +6,7 @@ Created on Oct 20, 2013 | ||
6 | 6 | ||
7 | import logging | 7 | import logging |
8 | from state import State | 8 | from state import State |
9 | +from morfeuszbuilder.utils.serializationUtils import * | ||
9 | 10 | ||
10 | class Serializer(object): | 11 | class Serializer(object): |
11 | 12 | ||
@@ -63,7 +64,7 @@ class Serializer(object): | @@ -63,7 +64,7 @@ class Serializer(object): | ||
63 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | 64 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
64 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): | 65 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
65 | fsaData.extend(self.state2bytearray(state)) | 66 | fsaData.extend(self.state2bytearray(state)) |
66 | - res.extend(self.htonl(len(fsaData))) | 67 | + res.extend(htonl(len(fsaData))) |
67 | res.extend(fsaData) | 68 | res.extend(fsaData) |
68 | res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) | 69 | res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) |
69 | return res | 70 | return res |
@@ -71,9 +72,9 @@ class Serializer(object): | @@ -71,9 +72,9 @@ class Serializer(object): | ||
71 | def _serializeTags(self, tagsMap): | 72 | def _serializeTags(self, tagsMap): |
72 | res = bytearray() | 73 | res = bytearray() |
73 | numOfTags = len(tagsMap) | 74 | numOfTags = len(tagsMap) |
74 | - res.extend(self.htons(numOfTags)) | 75 | + res.extend(htons(numOfTags)) |
75 | for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): | 76 | for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): |
76 | - res.extend(self.htons(tagnum)) | 77 | + res.extend(htons(tagnum)) |
77 | res.extend(self.fsa.encodeWord(tag)) | 78 | res.extend(self.fsa.encodeWord(tag)) |
78 | res.append(0) | 79 | res.append(0) |
79 | return res | 80 | return res |
@@ -86,25 +87,6 @@ class Serializer(object): | @@ -86,25 +87,6 @@ class Serializer(object): | ||
86 | res.extend(self._serializeTags(tagset._name2namenum)) | 87 | res.extend(self._serializeTags(tagset._name2namenum)) |
87 | return res | 88 | return res |
88 | 89 | ||
89 | - # serialize uint16 as big endian | ||
90 | - def htons(self, n): | ||
91 | - assert n < 65536 | ||
92 | - assert n >= 0 | ||
93 | - res = bytearray() | ||
94 | - res.append((n & 0x00FF00) >> 8) | ||
95 | - res.append(n & 0x0000FF) | ||
96 | - return res | ||
97 | - | ||
98 | - # serialize uint32 as big endian | ||
99 | - def htonl(self, n): | ||
100 | - assert n >= 0 | ||
101 | - res = bytearray() | ||
102 | - res.append((n & 0xFF000000) >> 24) | ||
103 | - res.append((n & 0x00FF0000) >> 16) | ||
104 | - res.append((n & 0x0000FF00) >> 8) | ||
105 | - res.append(n & 0x000000FF) | ||
106 | - return res | ||
107 | - | ||
108 | def serializePrologue(self): | 90 | def serializePrologue(self): |
109 | res = bytearray() | 91 | res = bytearray() |
110 | 92 | ||
@@ -126,7 +108,7 @@ class Serializer(object): | @@ -126,7 +108,7 @@ class Serializer(object): | ||
126 | res = bytearray() | 108 | res = bytearray() |
127 | additionalDataSize = len(additionalData) if additionalData else 0 | 109 | additionalDataSize = len(additionalData) if additionalData else 0 |
128 | moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 | 110 | moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 |
129 | - res.extend(self.htonl(additionalDataSize)) | 111 | + res.extend(htonl(additionalDataSize)) |
130 | 112 | ||
131 | # add additional data itself | 113 | # add additional data itself |
132 | if additionalDataSize: | 114 | if additionalDataSize: |
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -13,7 +13,7 @@ class State(object): | @@ -13,7 +13,7 @@ class State(object): | ||
13 | 13 | ||
14 | def __init__(self, additionalData=None): | 14 | def __init__(self, additionalData=None): |
15 | self.transitionsMap = {} | 15 | self.transitionsMap = {} |
16 | - self.transitionsDataMap = {} | 16 | +# self.transitionsDataMap = {} |
17 | self.freq = 0 | 17 | self.freq = 0 |
18 | self.encodedData = None | 18 | self.encodedData = None |
19 | self.reverseOffset = None | 19 | self.reverseOffset = None |
@@ -29,11 +29,11 @@ class State(object): | @@ -29,11 +29,11 @@ class State(object): | ||
29 | def transitionsNum(self): | 29 | def transitionsNum(self): |
30 | return len(self.transitionsMap) | 30 | return len(self.transitionsMap) |
31 | 31 | ||
32 | - def setTransition(self, byte, nextState): | ||
33 | - self.transitionsMap[byte] = nextState | ||
34 | - | ||
35 | - def setTransitionData(self, byte, data): | ||
36 | - self.transitionsDataMap[byte] = data | 32 | + def setTransition(self, label, nextState): |
33 | + self.transitionsMap[label] = nextState | ||
34 | +# | ||
35 | +# def setTransitionData(self, byte, data): | ||
36 | +# self.transitionsDataMap[byte] = data | ||
37 | 37 | ||
38 | def hasNext(self, byte): | 38 | def hasNext(self, byte): |
39 | return byte in self.transitionsMap | 39 | return byte in self.transitionsMap |
@@ -68,6 +68,14 @@ class State(object): | @@ -68,6 +68,14 @@ class State(object): | ||
68 | yield state1 | 68 | yield state1 |
69 | yield self | 69 | yield self |
70 | 70 | ||
71 | + def calculateOffsets(self, sizeCounter): | ||
72 | + currReverseOffset = 0 | ||
73 | + for state in self.dfs(set()): | ||
74 | + currReverseOffset += sizeCounter(state) | ||
75 | + state.reverseOffset = currReverseOffset | ||
76 | + for state in self.dfs(set()): | ||
77 | + state.offset = currReverseOffset - state.reverseOffset | ||
78 | + | ||
71 | def debug(self): | 79 | def debug(self): |
72 | print '----------------' | 80 | print '----------------' |
73 | print 'STATE:', self.idx, 'accepting', self.isAccepting() | 81 | print 'STATE:', self.idx, 'accepting', self.isAccepting() |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,6 +7,7 @@ Created on 23 sty 2014 | @@ -7,6 +7,7 @@ Created on 23 sty 2014 | ||
7 | import re | 7 | import re |
8 | from pyparsing import * | 8 | from pyparsing import * |
9 | from morfeuszbuilder.utils import exceptions | 9 | from morfeuszbuilder.utils import exceptions |
10 | +from pyparseString import pyparseString | ||
10 | 11 | ||
11 | identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') | 12 | identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') |
12 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | 13 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
@@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines): | @@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines): | ||
54 | else: | 55 | else: |
55 | return defineName | 56 | return defineName |
56 | 57 | ||
57 | -def _processLine(lineNum, line, defines): | 58 | +def _processLine(lineNum, line, defines, filename): |
58 | if line.strip(): | 59 | if line.strip(): |
59 | 60 | ||
60 | rule = Forward() | 61 | rule = Forward() |
@@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines): | @@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines): | ||
67 | rule.setParseAction(lambda s, l, t: ' '.join(t)) | 68 | rule.setParseAction(lambda s, l, t: ' '.join(t)) |
68 | defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines)) | 69 | defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines)) |
69 | localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines)) | 70 | localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines)) |
70 | - try: | ||
71 | - return rule.parseString(line, parseAll=True)[0] | ||
72 | - except ParseException as ex: | ||
73 | - msg = u'Preprocessing of segmentation rules failed.\n' | ||
74 | - msg += line + '\n' | ||
75 | - msg += (ex.col - 1) * ' ' + '^\n' | ||
76 | - msg += ex.msg | ||
77 | -# print unicode(exceptions.SegtypesException(msg)).encode('utf8') | ||
78 | - raise exceptions.SegtypesException(msg) | 71 | + return pyparseString(rule, lineNum, line, filename)[0] |
79 | else: | 72 | else: |
80 | return line | 73 | return line |
81 | 74 | ||
82 | -def preprocess(inputLines, defs): | 75 | +def preprocess(inputLines, defs, filename): |
83 | defines = {} | 76 | defines = {} |
84 | ifdefsStack = [] | 77 | ifdefsStack = [] |
85 | for lineNum, line in inputLines: | 78 | for lineNum, line in inputLines: |
86 | if line.startswith('#define'): | 79 | if line.startswith('#define'): |
87 | - parsedDefine = list(define.parseString(line)) | 80 | + parsedDefine = list(pyparseString(define, lineNum, line, filename)) |
88 | if len(parsedDefine) == 2: | 81 | if len(parsedDefine) == 2: |
89 | name, val = parsedDefine | 82 | name, val = parsedDefine |
90 | defines[name] = NonArgDefine(name, val) | 83 | defines[name] = NonArgDefine(name, val) |
@@ -92,15 +85,16 @@ def preprocess(inputLines, defs): | @@ -92,15 +85,16 @@ def preprocess(inputLines, defs): | ||
92 | name, arg, val = parsedDefine | 85 | name, arg, val = parsedDefine |
93 | localDefines = defines.copy() | 86 | localDefines = defines.copy() |
94 | localDefines[arg] = NonArgDefine(arg, arg) | 87 | localDefines[arg] = NonArgDefine(arg, arg) |
95 | - val = _processLine(lineNum, val, localDefines) | 88 | + val = _processLine(lineNum, val, localDefines, filename) |
96 | defines[name] = ArgDefine(name, arg, val) | 89 | defines[name] = ArgDefine(name, arg, val) |
97 | elif line.startswith('#ifdef'): | 90 | elif line.startswith('#ifdef'): |
98 | - name = ifdef.parseString(line)[0] | 91 | + name = pyparseString(ifdef, lineNum, line, filename)[0] |
92 | +# name = ifdef.parseString(line)[0] | ||
99 | ifdefsStack.append(name) | 93 | ifdefsStack.append(name) |
100 | elif line.startswith('#endif'): | 94 | elif line.startswith('#endif'): |
101 | ifdefsStack.pop() | 95 | ifdefsStack.pop() |
102 | elif line.startswith('#'): | 96 | elif line.startswith('#'): |
103 | yield lineNum, line | 97 | yield lineNum, line |
104 | elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): | 98 | elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): |
105 | - yield lineNum, _processLine(lineNum, line, defines) | 99 | + yield lineNum, _processLine(lineNum, line, defines, filename) |
106 | 100 | ||
107 | \ No newline at end of file | 101 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
0 → 100644
1 | +''' | ||
2 | +Created on 12 mar 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | + | ||
7 | +from pyparsing import ParseException | ||
8 | +from morfeuszbuilder.utils import exceptions | ||
9 | + | ||
10 | +def pyparseString(rule, lineNum, line, filename): | ||
11 | + try: | ||
12 | + return rule.parseString(line, parseAll=True) | ||
13 | + except ParseException as ex: | ||
14 | + msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) | ||
15 | + msg += line + '\n' | ||
16 | + msg += (ex.col - 1) * ' ' + '^\n' | ||
17 | + msg += ex.msg | ||
18 | +# print unicode(exceptions.SegtypesException(msg)).encode('utf8') | ||
19 | + raise exceptions.SegtypesException(msg) | ||
0 | \ No newline at end of file | 20 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -25,16 +25,17 @@ class SegmentRule(object): | @@ -25,16 +25,17 @@ class SegmentRule(object): | ||
25 | 25 | ||
26 | class TagRule(SegmentRule): | 26 | class TagRule(SegmentRule): |
27 | 27 | ||
28 | - def __init__(self, segnum, segtype): | 28 | + def __init__(self, segnum, shiftOrth, segtype): |
29 | self.segnum = segnum | 29 | self.segnum = segnum |
30 | self.segtype = segtype | 30 | self.segtype = segtype |
31 | + self.shiftOrth = shiftOrth | ||
31 | 32 | ||
32 | def addToNFA(self, fsa): | 33 | def addToNFA(self, fsa): |
33 | endState = RulesNFAState(final=True) | 34 | endState = RulesNFAState(final=True) |
34 | self._doAddToNFA(fsa.initialState, endState) | 35 | self._doAddToNFA(fsa.initialState, endState) |
35 | 36 | ||
36 | def _doAddToNFA(self, startState, endState): | 37 | def _doAddToNFA(self, startState, endState): |
37 | - startState.addTransition(self.segnum, endState) | 38 | + startState.addTransition((self.segnum, self.shiftOrth), endState) |
38 | 39 | ||
39 | def __str__(self): | 40 | def __str__(self): |
40 | return u'%s(%d)' % (self.segtype, self.segnum) | 41 | return u'%s(%d)' % (self.segtype, self.segnum) |
@@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule): | @@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule): | ||
92 | 93 | ||
93 | def __init__(self, child): | 94 | def __init__(self, child): |
94 | super(ZeroOrMoreRule, self).__init__(child) | 95 | super(ZeroOrMoreRule, self).__init__(child) |
96 | + assert isinstance(child, SegmentRule) | ||
95 | 97 | ||
96 | def addToNFA(self, fsa): | 98 | def addToNFA(self, fsa): |
97 | raise ValueError() | 99 | raise ValueError() |
@@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule): | @@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule): | ||
108 | 110 | ||
109 | def __str__(self): | 111 | def __str__(self): |
110 | return u'(' + str(self.child) + ')*' | 112 | return u'(' + str(self.child) + ')*' |
111 | - | ||
112 | -class ShiftOrthRule(UnaryRule): | ||
113 | - | ||
114 | - def __init__(self, child): | ||
115 | - super(ShiftOrthRule, self).__init__(child) | ||
116 | - | ||
117 | - def addToNFA(self, fsa): | ||
118 | - raise ValueError() | ||
119 | - | ||
120 | - def _doAddToNFA(self, startState, endState): | ||
121 | - self.child._doAddToNFA(startState, endState) | ||
122 | - startState.setTransitionData(self.child.segnum, 1) | ||
123 | - | ||
124 | - def __str__(self): | ||
125 | - return u'(' + str(self.child) + ')>' | ||
126 | - | ||
127 | -class ShiftOrthSameTypeRule(UnaryRule): | ||
128 | - | ||
129 | - def __init__(self, child): | ||
130 | - super(ShiftOrthSameTypeRule, self).__init__(child) | ||
131 | - | ||
132 | - def addToNFA(self, fsa): | ||
133 | - raise ValueError() | ||
134 | - | ||
135 | - def _doAddToNFA(self, startState, endState): | ||
136 | - self.child._doAddToNFA(startState, endState) | ||
137 | - startState.setTransitionData(self.child.segnum, 2) | ||
138 | - | ||
139 | - def __str__(self): | ||
140 | - return u'(' + str(self.child) + ')!>' |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
0 → 100644
1 | +''' | ||
2 | +Created on 12 mar 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | +import logging | ||
7 | +from morfeuszbuilder.fsa import state | ||
8 | +from morfeuszbuilder.utils.serializationUtils import htons | ||
9 | + | ||
10 | +class RulesState(state.State): | ||
11 | + | ||
12 | + def __init__(self): | ||
13 | + super(RulesState, self).__init__() | ||
14 | + self.weak = None | ||
15 | + | ||
16 | + def setAsAccepting(self, weak): | ||
17 | + self.weak = weak | ||
18 | + self.encodedData = bytearray([1 if weak else 0]) | ||
19 | + | ||
20 | + def getEncodedSize(self): | ||
21 | + stateInfoSize = 2 # accepting info + transitionsNum | ||
22 | + transitionsSize = 4 * len(self.transitionsMap) | ||
23 | + return stateInfoSize + transitionsSize | ||
24 | + | ||
25 | +class RulesFSA(object): | ||
26 | + | ||
27 | + def __init__(self): | ||
28 | + self.initialState = state.State() | ||
29 | + self.ACCEPTING_FLAG = 1 | ||
30 | + self.WEAK_FLAG = 2 | ||
31 | + | ||
32 | + def stateData2bytearray(self, state): | ||
33 | + res = bytearray() | ||
34 | + firstByte = 0 | ||
35 | + if state.isAccepting(): | ||
36 | + firstByte |= self.ACCEPTING_FLAG | ||
37 | + if state.weak: | ||
38 | + firstByte |= self.WEAK_FLAG | ||
39 | + assert firstByte < 256 and firstByte >= 0 | ||
40 | + res.append(firstByte) | ||
41 | + | ||
42 | + secondByte = len(state.transitionsMap) | ||
43 | + assert secondByte < 256 and secondByte >= 0 | ||
44 | + res.append(secondByte) | ||
45 | + | ||
46 | + return res | ||
47 | + | ||
48 | + def transitionsData2bytearray(self, state): | ||
49 | + res = bytearray() | ||
50 | +# logging.debug('next') | ||
51 | + for (segnum, shiftOrth), nextState in state.transitionsMap.iteritems(): | ||
52 | + res.append(segnum) | ||
53 | + if shiftOrth: | ||
54 | + res.append(1) | ||
55 | + else: | ||
56 | + res.append(0) | ||
57 | + offset = nextState.offset | ||
58 | + assert offset < 65536 | ||
59 | +# res.append((offset & 0xFF0000) >> 16) | ||
60 | + res.extend(htons(offset)) | ||
61 | + return res | ||
62 | + | ||
63 | + def serialize(self): | ||
64 | + self.initialState.calculateOffsets(sizeCounter=lambda s: s.getEncodedSize()) | ||
65 | + res = bytearray() | ||
66 | + | ||
67 | + for state in sorted(self.initialState.dfs(set()), key=lambda s: s.offset): | ||
68 | + res.extend(self.stateData2bytearray(state)) | ||
69 | + res.extend(self.transitionsData2bytearray(state)) | ||
70 | + | ||
71 | + logging.info('Segmentation automaton size: %d bytes', len(res)) | ||
72 | + print list(res) | ||
73 | + return res |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -4,7 +4,7 @@ Created on 20 lut 2014 | @@ -4,7 +4,7 @@ Created on 20 lut 2014 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | import logging | 6 | import logging |
7 | -from morfeuszbuilder.fsa.serializer import SimpleSerializer | 7 | +from morfeuszbuilder.utils.serializationUtils import htons, htonl |
8 | 8 | ||
9 | class RulesManager(object): | 9 | class RulesManager(object): |
10 | 10 | ||
@@ -52,9 +52,9 @@ class RulesManager(object): | @@ -52,9 +52,9 @@ class RulesManager(object): | ||
52 | 52 | ||
53 | def _serializeDFA(self, dfa): | 53 | def _serializeDFA(self, dfa): |
54 | res = bytearray() | 54 | res = bytearray() |
55 | - serializer = SimpleSerializer(dfa, serializeTransitionsData=True) | ||
56 | - dfaBytearray = serializer.fsa2bytearray() | ||
57 | - res.extend(serializer.htonl(len(dfaBytearray))) | 55 | +# serializer = SimpleSerializer(dfa, serializeTransitionsData=True) |
56 | + dfaBytearray = dfa.serialize() | ||
57 | + res.extend(htonl(len(dfaBytearray))) | ||
58 | res.extend(dfaBytearray) | 58 | res.extend(dfaBytearray) |
59 | return res | 59 | return res |
60 | 60 |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -4,7 +4,7 @@ Created on 24 sty 2014 | @@ -4,7 +4,7 @@ Created on 24 sty 2014 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | -from morfeuszbuilder.fsa import fsa, state, encode | 7 | +from morfeuszbuilder.segrules.rulesFSA import RulesFSA, RulesState |
8 | 8 | ||
9 | class RulesNFAState(object): | 9 | class RulesNFAState(object): |
10 | 10 | ||
@@ -12,7 +12,7 @@ class RulesNFAState(object): | @@ -12,7 +12,7 @@ class RulesNFAState(object): | ||
12 | 12 | ||
13 | def __init__(self, initial=False, final=False, weak=False): | 13 | def __init__(self, initial=False, final=False, weak=False): |
14 | self.transitionsMap = {} | 14 | self.transitionsMap = {} |
15 | - self.transitionsDataMap = {} | 15 | +# self.transitionsDataMap = {} |
16 | self.initial = initial | 16 | self.initial = initial |
17 | self.final = final | 17 | self.final = final |
18 | self.weak = weak | 18 | self.weak = weak |
@@ -20,13 +20,9 @@ class RulesNFAState(object): | @@ -20,13 +20,9 @@ class RulesNFAState(object): | ||
20 | RulesNFAState.statesCounter += 1 | 20 | RulesNFAState.statesCounter += 1 |
21 | 21 | ||
22 | def addTransition(self, label, targetState): | 22 | def addTransition(self, label, targetState): |
23 | + assert label is None or len(label) == 2 | ||
23 | self.transitionsMap.setdefault(label, set()) | 24 | self.transitionsMap.setdefault(label, set()) |
24 | self.transitionsMap[label].add(targetState) | 25 | self.transitionsMap[label].add(targetState) |
25 | - self.transitionsDataMap[label] = 0 | ||
26 | - | ||
27 | - def setTransitionData(self, label, byte): | ||
28 | - assert len(self.transitionsMap[label]) == 1 | ||
29 | - self.transitionsDataMap[label] = byte | ||
30 | 26 | ||
31 | def getClosure(self, visited): | 27 | def getClosure(self, visited): |
32 | if self in visited: | 28 | if self in visited: |
@@ -64,10 +60,11 @@ class RulesNFA(object): | @@ -64,10 +60,11 @@ class RulesNFA(object): | ||
64 | for nfaState in nfaStates: | 60 | for nfaState in nfaStates: |
65 | for label, nextStates in nfaState.transitionsMap.iteritems(): | 61 | for label, nextStates in nfaState.transitionsMap.iteritems(): |
66 | if label is not None: | 62 | if label is not None: |
67 | - transitionData = nfaState.transitionsDataMap[label] | ||
68 | - res.setdefault((label, transitionData), set()) | 63 | +# transitionData = nfaState.transitionsDataMap[label] |
64 | + segnum, shiftOrth = label | ||
65 | + res.setdefault((segnum, shiftOrth), set()) | ||
69 | for nextNFAState in nextStates: | 66 | for nextNFAState in nextStates: |
70 | - res[(label, transitionData)] |= nextNFAState.getClosure(set()) | 67 | + res[(segnum, shiftOrth)] |= nextNFAState.getClosure(set()) |
71 | return res | 68 | return res |
72 | 69 | ||
73 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): | 70 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
@@ -79,23 +76,24 @@ class RulesNFA(object): | @@ -79,23 +76,24 @@ class RulesNFA(object): | ||
79 | if final: | 76 | if final: |
80 | # dfaState should be final | 77 | # dfaState should be final |
81 | # and contain info about weakness | 78 | # and contain info about weakness |
82 | - dfaState.encodedData = bytearray([1 if weak else 0]) | ||
83 | - for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | 79 | + dfaState.setAsAccepting(weak=weak) |
80 | +# dfaState.encodedData = bytearray([1 if weak else 0]) | ||
81 | + for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | ||
84 | key = frozenset(nextNFAStates) | 82 | key = frozenset(nextNFAStates) |
85 | if key in nfaSubset2DFAState: | 83 | if key in nfaSubset2DFAState: |
86 | nextDFAState = nfaSubset2DFAState[key] | 84 | nextDFAState = nfaSubset2DFAState[key] |
87 | else: | 85 | else: |
88 | - nextDFAState = state.State() | 86 | + nextDFAState = RulesState() |
89 | nfaSubset2DFAState[key] = nextDFAState | 87 | nfaSubset2DFAState[key] = nextDFAState |
90 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) | 88 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) |
91 | - dfaState.setTransition(label, nextDFAState) | ||
92 | - dfaState.setTransitionData(label, transitionData) | 89 | + dfaState.setTransition((segnum, shiftOrth), nextDFAState) |
90 | +# dfaState.setTransitionData(label, transitionData) | ||
93 | 91 | ||
94 | def convertToDFA(self): | 92 | def convertToDFA(self): |
95 | - dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) | 93 | + dfa = RulesFSA() |
96 | startStates = self.initialState.getClosure(set()) | 94 | startStates = self.initialState.getClosure(set()) |
97 | assert not any(filter(lambda s: s.final, startStates)) | 95 | assert not any(filter(lambda s: s.final, startStates)) |
98 | - dfa.initialState = state.State(additionalData=False) | 96 | + dfa.initialState = RulesState() |
99 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) | 97 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) |
100 | return dfa | 98 | return dfa |
101 | 99 |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -3,7 +3,7 @@ from pyparsing import * | @@ -3,7 +3,7 @@ from pyparsing import * | ||
3 | ParserElement.enablePackrat() | 3 | ParserElement.enablePackrat() |
4 | from morfeuszbuilder.tagset import segtypes | 4 | from morfeuszbuilder.tagset import segtypes |
5 | from morfeuszbuilder.utils import configFile, exceptions | 5 | from morfeuszbuilder.utils import configFile, exceptions |
6 | -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager | 6 | +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString |
7 | import codecs | 7 | import codecs |
8 | import re | 8 | import re |
9 | 9 | ||
@@ -48,8 +48,8 @@ class RulesParser(object): | @@ -48,8 +48,8 @@ class RulesParser(object): | ||
48 | if not firstNFA: | 48 | if not firstNFA: |
49 | firstNFA = nfa | 49 | firstNFA = nfa |
50 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') | 50 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
51 | - combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) | ||
52 | - for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | 51 | + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) |
52 | + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): | ||
53 | # print rule | 53 | # print rule |
54 | rule.addToNFA(nfa) | 54 | rule.addToNFA(nfa) |
55 | # nfa.debug() | 55 | # nfa.debug() |
@@ -60,25 +60,24 @@ class RulesParser(object): | @@ -60,25 +60,24 @@ class RulesParser(object): | ||
60 | res.addDFA(key2Def, dfa) | 60 | res.addDFA(key2Def, dfa) |
61 | return res | 61 | return res |
62 | 62 | ||
63 | - def _doParse(self, combinationEnumeratedLines, segtypesHelper): | 63 | + def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename): |
64 | for lineNum, line in combinationEnumeratedLines: | 64 | for lineNum, line in combinationEnumeratedLines: |
65 | if not line.startswith('#'): | 65 | if not line.startswith('#'): |
66 | - yield self._doParseOneLine(lineNum, line, segtypesHelper) | 66 | + yield self._doParseOneLine(lineNum, line, segtypesHelper, filename) |
67 | 67 | ||
68 | - def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): | 68 | + def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): |
69 | if not segtypesHelper.hasSegtype(segtype): | 69 | if not segtypesHelper.hasSegtype(segtype): |
70 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | 70 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) |
71 | else: | 71 | else: |
72 | # return rules.TagRule(segtype) | 72 | # return rules.TagRule(segtype) |
73 | - return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), segtype) | 73 | + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype) |
74 | 74 | ||
75 | - def _doParseOneLine(self, lineNum, line, segtypesHelper): | 75 | + def _doParseOneLine(self, lineNum, line, segtypesHelper, filename): |
76 | rule = Forward() | 76 | rule = Forward() |
77 | tagRule = Word(alphanums+'_') | 77 | tagRule = Word(alphanums+'_') |
78 | - shiftOrthRule = tagRule + '>' | ||
79 | - shiftOrthSameTypeRule = tagRule + '!' + '>' | 78 | + shiftOrthRule = Word(alphanums+'_') + Suppress('>') |
80 | parenRule = Suppress('(') + rule + Suppress(')') | 79 | parenRule = Suppress('(') + rule + Suppress(')') |
81 | - atomicRule = tagRule ^ shiftOrthRule ^ shiftOrthSameTypeRule ^ parenRule | 80 | + atomicRule = tagRule ^ shiftOrthRule ^ parenRule |
82 | zeroOrMoreRule = atomicRule + Suppress('*') | 81 | zeroOrMoreRule = atomicRule + Suppress('*') |
83 | oneOrMoreRule = atomicRule + Suppress('+') | 82 | oneOrMoreRule = atomicRule + Suppress('+') |
84 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule | 83 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
@@ -87,13 +86,12 @@ class RulesParser(object): | @@ -87,13 +86,12 @@ class RulesParser(object): | ||
87 | concatRule = OneOrMore(complexRule) | 86 | concatRule = OneOrMore(complexRule) |
88 | rule << concatRule | 87 | rule << concatRule |
89 | 88 | ||
90 | - tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | ||
91 | - shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) | ||
92 | - shiftOrthSameTypeRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthSameTypeRule(toks[0])) | 89 | + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
90 | + shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) | ||
93 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) | 91 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
94 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) | 92 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
95 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | 93 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
96 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) | 94 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
97 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) | 95 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
98 | - parsedRule = rule.parseString(line, parseAll=True)[0] | 96 | + parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] |
99 | return parsedRule | 97 | return parsedRule |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -33,6 +33,7 @@ class Segtypes(object): | @@ -33,6 +33,7 @@ class Segtypes(object): | ||
33 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) | 33 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) |
34 | 34 | ||
35 | def _readTags(self, segrulesConfigFile): | 35 | def _readTags(self, segrulesConfigFile): |
36 | + gotWildcardPattern = False | ||
36 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): | 37 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): |
37 | splitLine = re.split(r'\s+', line.strip()) | 38 | splitLine = re.split(r'\s+', line.strip()) |
38 | self._validate( | 39 | self._validate( |
@@ -49,13 +50,27 @@ class Segtypes(object): | @@ -49,13 +50,27 @@ class Segtypes(object): | ||
49 | lineNum, | 50 | lineNum, |
50 | re.match(r'[a-z_\.\:\%]+', pattern)) | 51 | re.match(r'[a-z_\.\:\%]+', pattern)) |
51 | 52 | ||
53 | + self._validate( | ||
54 | + u'Pattern that matches everything must be the last one', | ||
55 | + lineNum - 1, | ||
56 | + not gotWildcardPattern) | ||
57 | + | ||
52 | if segtype in self.segtype2Segnum: | 58 | if segtype in self.segtype2Segnum: |
53 | segnum = self.segtype2Segnum[segtype] | 59 | segnum = self.segtype2Segnum[segtype] |
54 | else: | 60 | else: |
55 | segnum = len(self.segtype2Segnum) | 61 | segnum = len(self.segtype2Segnum) |
56 | self.segtype2Segnum[segtype] = segnum | 62 | self.segtype2Segnum[segtype] = segnum |
57 | 63 | ||
58 | - self.patternsList.append(SegtypePattern(None, pattern, segnum)) | 64 | + segtypePattern = SegtypePattern(None, pattern, segnum) |
65 | + | ||
66 | + self._validate( | ||
67 | + u'There is no tag that matches pattern "%s".' % pattern, | ||
68 | + lineNum, | ||
69 | + any([segtypePattern.tryToMatch(None, tag) != -1 for tag in self.tagset.getAllTags()])) | ||
70 | + | ||
71 | + self.patternsList.append(segtypePattern) | ||
72 | + | ||
73 | + gotWildcardPattern = gotWildcardPattern or pattern == '%' | ||
59 | 74 | ||
60 | self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) | 75 | self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) |
61 | 76 | ||
@@ -67,7 +82,7 @@ class Segtypes(object): | @@ -67,7 +82,7 @@ class Segtypes(object): | ||
67 | lineNum, | 82 | lineNum, |
68 | re.match(r'[a-z_]+', segtype)) | 83 | re.match(r'[a-z_]+', segtype)) |
69 | self._validate( | 84 | self._validate( |
70 | - u'Pattern must contain lemma and POS', | 85 | + u'Pattern must contain lemma and part-of-speech fields', |
71 | lineNum, | 86 | lineNum, |
72 | re.match(r'.+\:[a-z_]+', pattern, re.U)) | 87 | re.match(r'.+\:[a-z_]+', pattern, re.U)) |
73 | 88 | ||
@@ -79,7 +94,14 @@ class Segtypes(object): | @@ -79,7 +94,14 @@ class Segtypes(object): | ||
79 | 94 | ||
80 | lemma, pos = pattern.split(':') | 95 | lemma, pos = pattern.split(':') |
81 | 96 | ||
82 | - self.patternsList.append(SegtypePattern(lemma, '%s|%s:%%' % (pos, pos), segnum)) | 97 | + segtypePattern = SegtypePattern(lemma, pos + ':%', segnum) |
98 | + | ||
99 | + self._validate( | ||
100 | + u'There is no tag that matches pattern "%s".' % (pos + ':%'), | ||
101 | + lineNum, | ||
102 | + any([segtypePattern.tryToMatch(lemma, tag) != -1 for tag in self.tagset.getAllTags()])) | ||
103 | + | ||
104 | + self.patternsList.append(segtypePattern) | ||
83 | 105 | ||
84 | def _debugSegnums(self): | 106 | def _debugSegnums(self): |
85 | for tagnum, segnum in self._tagnum2Segnum.items(): | 107 | for tagnum, segnum in self._tagnum2Segnum.items(): |
@@ -121,11 +143,6 @@ class Segtypes(object): | @@ -121,11 +143,6 @@ class Segtypes(object): | ||
121 | if not res: | 143 | if not res: |
122 | res = self._tagnum2Segnum.get(tagnum, None) | 144 | res = self._tagnum2Segnum.get(tagnum, None) |
123 | return res | 145 | return res |
124 | -# for p in self.patternsList: | ||
125 | -# res = p.tryToMatch(lemma, tag) | ||
126 | -# if res >= 0: | ||
127 | -# return res | ||
128 | -# return None | ||
129 | 146 | ||
130 | class SegtypePattern(object): | 147 | class SegtypePattern(object): |
131 | 148 | ||
@@ -135,8 +152,13 @@ class SegtypePattern(object): | @@ -135,8 +152,13 @@ class SegtypePattern(object): | ||
135 | self.segnum = segnum | 152 | self.segnum = segnum |
136 | 153 | ||
137 | def tryToMatch(self, lemma, tag): | 154 | def tryToMatch(self, lemma, tag): |
155 | +# tag2Match = tag + ':' if not tag.endswith(':') else tag | ||
156 | +# print tag2Match | ||
157 | + patterns2Match = [] | ||
158 | + patterns2Match.append(self.pattern.replace('%', '.*')) | ||
159 | + patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) | ||
138 | if (self.lemma is None or self.lemma == lemma) \ | 160 | if (self.lemma is None or self.lemma == lemma) \ |
139 | - and re.match(self.pattern.replace('%', '.*'), tag): | 161 | + and any([re.match(p, tag) for p in patterns2Match]): |
140 | return self.segnum | 162 | return self.segnum |
141 | else: | 163 | else: |
142 | return -1 | 164 | return -1 |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
0 → 100644
'''
Created on 12 mar 2014

@author: mlenart
'''


def htons(n):
    """Serialize an unsigned 16-bit integer as 2 bytes, big endian.

    Raises AssertionError when n is outside [0, 65535].
    """
    assert 0 <= n < 0x10000
    res = bytearray()
    # Bug fix: original masked with 0x00FF00 (a 24-bit mask) even though
    # the contract is uint16; use the proper 16-bit high-byte mask.
    res.append((n & 0xFF00) >> 8)  # high byte first (network byte order)
    res.append(n & 0x00FF)
    return res


def htonl(n):
    """Serialize an unsigned 32-bit integer as 4 bytes, big endian.

    Raises AssertionError when n is outside [0, 2**32 - 1].
    """
    # Bug fix: original only asserted n >= 0, so values >= 2**32 were
    # silently truncated to their low 32 bits; enforce the upper bound.
    assert 0 <= n < 0x100000000
    res = bytearray()
    res.append((n & 0xFF000000) >> 24)  # most significant byte first
    res.append((n & 0x00FF0000) >> 16)
    res.append((n & 0x0000FF00) >> 8)
    res.append(n & 0x000000FF)
    return res
input/dodatki.tab
@@ -41,13 +41,171 @@ z Z brev:pun | @@ -41,13 +41,171 @@ z Z brev:pun | ||
41 | ż Ż brev:pun | 41 | ż Ż brev:pun |
42 | ch Ch brev:pun | 42 | ch Ch brev:pun |
43 | st St brev:pun | 43 | st St brev:pun |
44 | -0 0 dig | ||
45 | -1 1 dig | ||
46 | -2 2 dig | ||
47 | -3 3 dig | ||
48 | -4 4 dig | ||
49 | -5 5 dig | ||
50 | -6 6 dig | ||
51 | -7 7 dig | ||
52 | -8 8 dig | ||
53 | -9 9 dig | 44 | +poli poli prefa |
45 | +poli poli prefs | ||
46 | +niby niby prefa | ||
47 | +niby niby prefs | ||
48 | +eks eks prefs | ||
49 | +ex ex prefs | ||
50 | +euro euro prefa | ||
51 | +euro euro prefs | ||
52 | +mikro mikro prefs | ||
53 | +mikro mikro prefa | ||
54 | +makro makro prefa | ||
55 | +makro makro prefs | ||
56 | +bez bez prefa | ||
57 | +do do prefv | ||
58 | +do do prefa | ||
59 | +dez dez prefv | ||
60 | +dez dez prefa | ||
61 | +dez dez prefs | ||
62 | +ko ko prefa | ||
63 | +ko ko prefs | ||
64 | +między między prefa | ||
65 | +między między prefs | ||
66 | +na na prefa | ||
67 | +na na prefs | ||
68 | +na na prefv | ||
69 | +nad nad prefa | ||
70 | +nad nad prefs | ||
71 | +nad nad prefv | ||
72 | +o o prefv | ||
73 | +ob ob prefv | ||
74 | +od od prefa | ||
75 | +od od prefs | ||
76 | +od od prefv | ||
77 | +pra pra prefs | ||
78 | +post post prefa | ||
79 | +post post prefs | ||
80 | +pod pod prefa | ||
81 | +pod pod prefs | ||
82 | +pod pod prefv | ||
83 | +poza poza prefa | ||
84 | +ponad ponad prefa | ||
85 | +pre pre prefa | ||
86 | +pre pre prefs | ||
87 | +pro pro prefa | ||
88 | +pro pro prefs | ||
89 | +prze prze prefa | ||
90 | +prze prze prefv | ||
91 | +przeciw przeciw prefa | ||
92 | +przeciw przeciw prefs | ||
93 | +re re prefa | ||
94 | +re re prefs | ||
95 | +re re prefv | ||
96 | +przy przy prefa | ||
97 | +przy przy prefv | ||
98 | +roz roz prefv | ||
99 | +u u prefv | ||
100 | +samo samo prefa | ||
101 | +samo samo prefs | ||
102 | +video video prefs | ||
103 | +video video prefa | ||
104 | +w w prefv | ||
105 | +wy wy prefv | ||
106 | +współ współ prefv | ||
107 | +współ współ prefa | ||
108 | +współ współ prefs | ||
109 | +wice wice prefs | ||
110 | +neo neo prefa | ||
111 | +neo neo prefs | ||
112 | +tele tele prefs | ||
113 | +tele tele prefa | ||
114 | +z z prefv | ||
115 | +za za prefv | ||
116 | +za za prefa | ||
117 | +za za prefs | ||
118 | +wideo wideo prefa | ||
119 | +wideo wideo prefs | ||
120 | +meta meta prefs | ||
121 | +meta meta prefa | ||
122 | +multi multi prefa | ||
123 | +multi multi prefs | ||
124 | +mega mega prefa | ||
125 | +mega mega prefs | ||
126 | +kontra kontra prefs | ||
127 | +kontra kontra prefa | ||
128 | +inter inter prefa | ||
129 | +inter inter prefs | ||
130 | +homo homo prefs | ||
131 | +homo homo prefa | ||
132 | +ekstra ekstra prefa | ||
133 | +ekstra ekstra prefs | ||
134 | +giga giga prefa | ||
135 | +giga giga prefs | ||
136 | +bi bi prefs | ||
137 | +bi bi prefa | ||
138 | +auto auto prefs | ||
139 | +auto auto prefa | ||
140 | +de de prefv | ||
141 | +de de prefa | ||
142 | +de de prefs | ||
143 | +ultra ultra prefs | ||
144 | +ultra ultra prefa | ||
145 | +e- e- prefa | ||
146 | +e- e- prefs | ||
147 | +mini mini prefs | ||
148 | +mini mini prefa | ||
149 | +maxi maxi prefs | ||
150 | +maxi maxi prefa | ||
151 | +midi midi prefs | ||
152 | +midi midi prefa | ||
153 | +arcy arcy prefs | ||
154 | +arcy arcy prefa | ||
155 | +anty anty prefa | ||
156 | +anty anty prefs | ||
157 | +a a prefa | ||
158 | +a a prefs | ||
159 | +pan pan prefs | ||
160 | +pan pan prefa | ||
161 | +in in prefa | ||
162 | +in in prefs | ||
163 | +dys dys prefs | ||
164 | +dys dys prefa | ||
165 | +mono mono prefa | ||
166 | +mono mono prefs | ||
167 | +porno porno prefs | ||
168 | +porno porno prefa | ||
169 | +anglo anglo prefa | ||
170 | +aero aero prefs | ||
171 | +aero aero prefa | ||
172 | +bio bio prefs | ||
173 | +bio bio prefa | ||
174 | +wszystko wszystko prefs | ||
175 | +wszystko wszystko prefa | ||
176 | +wszech wszech prefs | ||
177 | +wszech wszech prefa | ||
178 | +śród śród prefs | ||
179 | +śród śród prefa | ||
180 | +audio audio prefs | ||
181 | +audio audio prefa | ||
182 | +eko eko prefs | ||
183 | +eko eko prefa | ||
184 | +s s prefv | ||
185 | +elektro elektro prefs | ||
186 | +elektro elektro prefa | ||
187 | +trans trans prefa | ||
188 | +trans trans prefs | ||
189 | +kontr kontr prefs | ||
190 | +kontr kontr prefa | ||
191 | +pseudo pseudo prefs | ||
192 | +pseudo pseudo prefa | ||
193 | +quasi quasi prefs | ||
194 | +quasi quasi prefa | ||
195 | +super super prefs | ||
196 | +super super prefa | ||
197 | +po po prefv | ||
198 | +po po prefa | ||
199 | +po po prefs | ||
200 | +sub sub prefs | ||
201 | +sub sub prefa | ||
202 | +hiper hiper prefa | ||
203 | +hiper hiper prefs | ||
204 | +non non prefs | ||
205 | +non non prefa | ||
206 | +stereo stereo prefa | ||
207 | +stereo stereo prefs | ||
208 | +energo energo prefa | ||
209 | +para para prefa | ||
210 | +para para prefs | ||
211 | +ś ś prefv |
input/polimorf.tagset
input/segmenty.dat
@@ -19,7 +19,7 @@ samotny | @@ -19,7 +19,7 @@ samotny | ||
19 | # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: | 19 | # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: |
20 | moze_interp(praet_sg_na) | 20 | moze_interp(praet_sg_na) |
21 | 21 | ||
22 | -# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: | 22 | +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „moze”: |
23 | moze_interp(praet_sg) | 23 | moze_interp(praet_sg) |
24 | 24 | ||
25 | # przeszlik mnogi, np. „czytali”: | 25 | # przeszlik mnogi, np. „czytali”: |
@@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg) | @@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg) | ||
69 | # np. „gnietli·by·śmy” | 69 | # np. „gnietli·by·śmy” |
70 | moze_interp(praet_pl by aglpl) | 70 | moze_interp(praet_pl by aglpl) |
71 | #else | 71 | #else |
72 | -moze_interp(praetcond) | 72 | +# moze_interp(praetcond) |
73 | #endif | 73 | #endif |
74 | - | ||
75 | # np. „by·ś” | 74 | # np. „by·ś” |
76 | moze_interp(by aglsg) | 75 | moze_interp(by aglsg) |
77 | # np. „by·ście” | 76 | # np. „by·ście” |
@@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj ) | @@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj ) | ||
98 | # adja dywiz adja dywiz adja dywiz adj interp? | 97 | # adja dywiz adja dywiz adja dywiz adj interp? |
99 | # adja dywiz adja dywiz adja dywiz adja dywiz adj interp? | 98 | # adja dywiz adja dywiz adja dywiz adja dywiz adj interp? |
100 | 99 | ||
101 | -# Stopień najwyższy: | ||
102 | -# np. „naj·zieleńszy”, „naj·mądrzej” | ||
103 | -moze_interp( naj> adj_sup ) | 100 | +# Formy zanegowane stopnia wyższego przymiotników i przysłówków (WK) |
101 | +# np. „nie·grzeczniejszy”, „nie·grzeczniej” | ||
102 | +moze_interp( nie> adj_com ) | ||
104 | 103 | ||
105 | # Formy „zanegowane” gerundiów i imiesłowów: | 104 | # Formy „zanegowane” gerundiów i imiesłowów: |
106 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: | 105 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
@@ -112,15 +111,21 @@ moze_interp(z_on_agl) | @@ -112,15 +111,21 @@ moze_interp(z_on_agl) | ||
112 | moze_interp(z_on_agl on_agl) | 111 | moze_interp(z_on_agl on_agl) |
113 | 112 | ||
114 | # Liczba zapisana jako ciąg cyfr: | 113 | # Liczba zapisana jako ciąg cyfr: |
115 | -moze_interp( dig!>+ ) | 114 | +moze_interp( dig ) |
116 | 115 | ||
117 | # Formacje prefiksalne | 116 | # Formacje prefiksalne |
118 | #### trzeba wydzielić odpowiednie samodze! | 117 | #### trzeba wydzielić odpowiednie samodze! |
119 | -# rzeczownikowe i przymiotnikowe | ||
120 | -# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy” | ||
121 | -moze_interp( prefs samodz ) | 118 | +# rzeczownikowe |
119 | +# np. „euro·sodoma”, „e-·papieros” | ||
120 | +moze_interp(nomina) | ||
121 | +moze_interp( prefs> nomina ) | ||
122 | # czasownikowe np. „po·nakapywać” | 122 | # czasownikowe np. „po·nakapywać” |
123 | -moze_interp( prefv samodz ) | 123 | +moze_interp(verba_imperf) |
124 | +moze_interp( prefv> verba_imperf ) | ||
125 | +# przymiotnikowe np. „do·żylny”, „euro·sodomski”, „bez·argumentowy” | ||
126 | +moze_interp(adjectiva) | ||
127 | +moze_interp(prefa> adj) | ||
128 | +moze_interp( prefa> adjectiva ) | ||
124 | 129 | ||
125 | # Apozycje z dywizem | 130 | # Apozycje z dywizem |
126 | # np. „kobieta-prezydent” | 131 | # np. „kobieta-prezydent” |
@@ -133,11 +138,28 @@ adj dywiz samodz | @@ -133,11 +138,28 @@ adj dywiz samodz | ||
133 | # ? | 138 | # ? |
134 | samodz dywiz adj | 139 | samodz dywiz adj |
135 | 140 | ||
141 | +#### PONIŻEJ REGUŁY WK | ||
142 | +# Stopień najwyższy: | ||
143 | +# np. „naj·zieleńszy”, „naj·mądrzej” | ||
144 | +moze_interp( naj> adj_sup ) | ||
145 | +# Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj | ||
146 | +moze_interp( praet_sg dywiz li) | ||
147 | +moze_interp( praet_pl dywiz li) | ||
148 | +moze_interp( praet_sg_na dywiz li) | ||
149 | +moze_interp( fin dywiz li) | ||
150 | + | ||
151 | +# i bez dywizu --- czy bez dywizu jest sens to łapać? | ||
152 | +#moze_interp( praet_sg li) | ||
153 | +#moze_interp( praet_pl li) | ||
154 | +#moze_interp( praet_sg_na li) | ||
155 | +#moze_interp( fin li) | ||
156 | + | ||
136 | [segment types] | 157 | [segment types] |
137 | naj | 158 | naj |
138 | nie | 159 | nie |
139 | prefs | 160 | prefs |
140 | prefv | 161 | prefv |
162 | +prefa | ||
141 | dig | 163 | dig |
142 | adja | 164 | adja |
143 | adj | 165 | adj |
@@ -161,11 +183,14 @@ naj naj | @@ -161,11 +183,14 @@ naj naj | ||
161 | nie nie | 183 | nie nie |
162 | prefs prefs | 184 | prefs prefs |
163 | prefv prefv | 185 | prefv prefv |
186 | +prefa prefa | ||
164 | dig dig | 187 | dig dig |
165 | adja adja | 188 | adja adja |
166 | adj adj:%:pos | 189 | adj adj:%:pos |
167 | adj_sup adj:%:sup | 190 | adj_sup adj:%:sup |
168 | adj_sup adv:sup | 191 | adj_sup adv:sup |
192 | +adj_com adj:%:com | ||
193 | +adj_com adj:%:com | ||
169 | negat ger:%:neg | 194 | negat ger:%:neg |
170 | negat pact:%:neg | 195 | negat pact:%:neg |
171 | negat ppas:%:neg | 196 | negat ppas:%:neg |
@@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | @@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | ||
173 | z_on_agl prep:% | 198 | z_on_agl prep:% |
174 | samotny brev:pun | 199 | samotny brev:pun |
175 | samotny brev:npun | 200 | samotny brev:npun |
176 | -samotny intrj | 201 | +samotny interj |
177 | interp interp | 202 | interp interp |
178 | aglsg aglt:sg:% | 203 | aglsg aglt:sg:% |
179 | aglpl aglt:pl:% | 204 | aglpl aglt:pl:% |
180 | -praetcond cond:% | ||
181 | -praetcond praet:%:pri:% | ||
182 | -praetcond praet:%:sec:% | ||
183 | -praetcond praet:%:ter:% | ||
184 | praet_sg_agl praet:sg:%:agl | 205 | praet_sg_agl praet:sg:%:agl |
185 | praet_sg_na praet:sg:%:nagl | 206 | praet_sg_na praet:sg:%:nagl |
186 | praet_sg praet:sg:% | 207 | praet_sg praet:sg:% |
187 | praet_pl praet:pl:% | 208 | praet_pl praet:pl:% |
188 | praet_sg winien:sg:% | 209 | praet_sg winien:sg:% |
189 | praet_pl winien:pl:% | 210 | praet_pl winien:pl:% |
211 | +fin fin:% | ||
212 | +nomina subst:% | ||
213 | +nomina ger:% | ||
214 | +nomina depr:% | ||
215 | +adjectiva adv:% | ||
216 | +adjectiva ppas:% | ||
217 | +adjectiva pact:% | ||
218 | +verba_imperf praet:%:imperf | ||
219 | +verba_imperf fin:%:imperf | ||
220 | +verba_imperf inf:imperf | ||
221 | +verba_imperf imps:imperf | ||
222 | +verba_imperf impt:%:imperf | ||
190 | samodz % | 223 | samodz % |
191 | 224 | ||
192 | [lexemes] | 225 | [lexemes] |
193 | z_aglt aby:comp | 226 | z_aglt aby:comp |
194 | z_aglt bowiem:comp | 227 | z_aglt bowiem:comp |
195 | by by:qub | 228 | by by:qub |
229 | +li li:qub | ||
196 | z_aglt by:comp | 230 | z_aglt by:comp |
197 | z_aglt cóż:subst | 231 | z_aglt cóż:subst |
198 | z_aglt czemu:adv | 232 | z_aglt czemu:adv |
input/segmenty1.dat
@@ -7,9 +7,10 @@ praet=split composite | @@ -7,9 +7,10 @@ praet=split composite | ||
7 | 7 | ||
8 | #define moze_interp(segmenty) wsz_interp segmenty wsz_interp | 8 | #define moze_interp(segmenty) wsz_interp segmenty wsz_interp |
9 | 9 | ||
10 | +dig>* dig | ||
10 | (adja dywiz)+ adj | 11 | (adja dywiz)+ adj |
11 | -dig!>+ | ||
12 | -dig!> dig!> dig!> | 12 | +#dig!>+ |
13 | +#dig!> dig!> dig!> | ||
13 | naj> adj_sup | 14 | naj> adj_sup |
14 | 15 | ||
15 | [segment types] | 16 | [segment types] |
@@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | @@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | ||
52 | z_on_agl prep:% | 53 | z_on_agl prep:% |
53 | samotny brev:pun | 54 | samotny brev:pun |
54 | samotny brev:npun | 55 | samotny brev:npun |
55 | -samotny intrj | 56 | +samotny interj |
56 | interp interp | 57 | interp interp |
57 | aglsg aglt:sg:% | 58 | aglsg aglt:sg:% |
58 | aglpl aglt:pl:% | 59 | aglpl aglt:pl:% |
59 | -praetcond cond:% | ||
60 | -praetcond praet:%:pri:% | ||
61 | -praetcond praet:%:sec:% | ||
62 | -praetcond praet:%:ter:% | ||
63 | -praet_sg_agl praet:sg:%:agl | ||
64 | -praet_sg_na praet:sg:%:nagl | ||
65 | -praet_sg praet:sg:% | ||
66 | -praet_pl praet:pl:% | ||
67 | -praet_sg winien:sg:% | ||
68 | -praet_pl winien:pl:% | ||
69 | samodz % | 60 | samodz % |
70 | 61 | ||
71 | [lexemes] | 62 | [lexemes] |
morfeusz/InterpretedChunk.hpp
@@ -17,7 +17,6 @@ struct InterpretedChunk { | @@ -17,7 +17,6 @@ struct InterpretedChunk { | ||
17 | std::vector<uint32_t> lowercaseCodepoints; | 17 | std::vector<uint32_t> lowercaseCodepoints; |
18 | InterpsGroup interpsGroup; | 18 | InterpsGroup interpsGroup; |
19 | bool shiftOrth; | 19 | bool shiftOrth; |
20 | - bool shiftOrthSameType; | ||
21 | bool orthWasShifted; | 20 | bool orthWasShifted; |
22 | std::vector<InterpretedChunk> prefixChunks; | 21 | std::vector<InterpretedChunk> prefixChunks; |
23 | }; | 22 | }; |
morfeusz/Morfeusz.cpp
@@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() { | @@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() { | ||
37 | return res; | 37 | return res; |
38 | } | 38 | } |
39 | 39 | ||
40 | +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { | ||
41 | + SegrulesOptions opts; | ||
42 | + opts["aggl"] = "isolated"; | ||
43 | + opts["praet"] = "split"; | ||
44 | + return (*(map.find(opts))).second; | ||
45 | +} | ||
46 | + | ||
40 | Morfeusz::Morfeusz() | 47 | Morfeusz::Morfeusz() |
41 | : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), | 48 | : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), |
42 | analyzerPtr(DEFAULT_FSA), | 49 | analyzerPtr(DEFAULT_FSA), |
43 | analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), | 50 | analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), |
44 | segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), | 51 | segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), |
52 | +currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), | ||
45 | isAnalyzerFSAFromFile(false), | 53 | isAnalyzerFSAFromFile(false), |
46 | generatorPtr(DEFAULT_SYNTH_FSA), | 54 | generatorPtr(DEFAULT_SYNTH_FSA), |
47 | isGeneratorFSAFromFile(false), | 55 | isGeneratorFSAFromFile(false), |
@@ -50,9 +58,9 @@ options(createDefaultOptions()) { | @@ -50,9 +58,9 @@ options(createDefaultOptions()) { | ||
50 | 58 | ||
51 | } | 59 | } |
52 | 60 | ||
53 | -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSAType*>& fsasMap) { | 61 | +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) { |
54 | for ( | 62 | for ( |
55 | - std::map<SegrulesOptions, SegrulesFSAType*>::iterator it = fsasMap.begin(); | 63 | + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin(); |
56 | it != fsasMap.end(); | 64 | it != fsasMap.end(); |
57 | ++it) { | 65 | ++it) { |
58 | delete it->second; | 66 | delete it->second; |
@@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord( | @@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord( | ||
100 | vector<InterpretedChunk> accum; | 108 | vector<InterpretedChunk> accum; |
101 | FlexionGraph graph; | 109 | FlexionGraph graph; |
102 | const char* currInput = inputStart; | 110 | const char* currInput = inputStart; |
103 | - SegrulesOptions opts; | ||
104 | - opts["aggl"] = "isolated"; | ||
105 | - opts["praet"] = "split"; | ||
106 | - SegrulesFSAType* segrulesFSA = (*(this->segrulesFSAsMap.find(opts))).second; | ||
107 | - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->getInitialState()); | 111 | + SegrulesFSA* segrulesFSA = this->currSegrulesFSA; |
112 | + doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState); | ||
108 | if (!graph.empty()) { | 113 | if (!graph.empty()) { |
109 | InterpretedChunksDecoder interpretedChunksDecoder(env); | 114 | InterpretedChunksDecoder interpretedChunksDecoder(env); |
110 | int srcNode = startNodeNum; | 115 | int srcNode = startNodeNum; |
@@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord( | @@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord( | ||
118 | srcNode++; | 123 | srcNode++; |
119 | } | 124 | } |
120 | // graph.getResults(*this->tagset, results); | 125 | // graph.getResults(*this->tagset, results); |
121 | - } else if (inputStart != inputEnd) { | 126 | + } |
127 | + else if (inputStart != inputEnd) { | ||
122 | this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); | 128 | this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); |
123 | } | 129 | } |
124 | inputStart = currInput; | 130 | inputStart = currInput; |
@@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord( | @@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord( | ||
126 | 132 | ||
127 | static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | 133 | static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
128 | to.prefixChunks.insert( | 134 | to.prefixChunks.insert( |
129 | - to.prefixChunks.begin(), | ||
130 | - from.prefixChunks.begin(), | ||
131 | - from.prefixChunks.end()); | 135 | + to.prefixChunks.begin(), |
136 | + from.prefixChunks.begin(), | ||
137 | + from.prefixChunks.end()); | ||
132 | to.prefixChunks.push_back(from); | 138 | to.prefixChunks.push_back(from); |
133 | from.orthWasShifted = true; | 139 | from.orthWasShifted = true; |
134 | } | 140 | } |
@@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord( | @@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord( | ||
138 | const char* inputEnd, | 144 | const char* inputEnd, |
139 | vector<InterpretedChunk>& accum, | 145 | vector<InterpretedChunk>& accum, |
140 | FlexionGraph& graph, | 146 | FlexionGraph& graph, |
141 | - SegrulesStateType segrulesState) const { | 147 | + SegrulesState segrulesState) const { |
148 | + // cerr << "doAnalyzeOneWord " << inputData << endl; | ||
142 | bool endOfWord = inputData == inputEnd; | 149 | bool endOfWord = inputData == inputEnd; |
143 | const char* currInput = inputData; | 150 | const char* currInput = inputData; |
144 | uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); | 151 | uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); |
@@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord( | @@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord( | ||
159 | vector<InterpsGroup> val(state.getValue()); | 166 | vector<InterpsGroup> val(state.getValue()); |
160 | for (unsigned int i = 0; i < val.size(); i++) { | 167 | for (unsigned int i = 0; i < val.size(); i++) { |
161 | InterpsGroup& ig = val[i]; | 168 | InterpsGroup& ig = val[i]; |
162 | - cerr << (int) ig.type << endl; | ||
163 | - SegrulesStateType newSegrulesState = segrulesState; | ||
164 | - newSegrulesState.proceedToNext(ig.type); | ||
165 | - if (!newSegrulesState.isSink()) { | ||
166 | - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | ||
167 | - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | ||
168 | - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; | ||
169 | - if (!accum.empty() | ||
170 | - && (accum.back().shiftOrth | ||
171 | - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { | 169 | + // newSegrulesState.proceedToNext(ig.type); |
170 | + // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates); | ||
171 | + set<SegrulesState> newSegrulesStates; | ||
172 | + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | ||
173 | + for ( | ||
174 | + set<SegrulesState>::iterator it = newSegrulesStates.begin(); | ||
175 | + it != newSegrulesStates.end(); | ||
176 | + it++) { | ||
177 | + SegrulesState newSegrulesState = *it; | ||
178 | + // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | ||
179 | + // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | ||
180 | + InterpretedChunk ic = { | ||
181 | + inputData, | ||
182 | + originalCodepoints, | ||
183 | + lowercaseCodepoints, | ||
184 | + ig, | ||
185 | + newSegrulesState.shiftOrthFromPrevious, | ||
186 | + false, | ||
187 | + vector<InterpretedChunk>() | ||
188 | + }; | ||
189 | + if (!accum.empty() && accum.back().shiftOrth) { | ||
172 | doShiftOrth(accum.back(), ic); | 190 | doShiftOrth(accum.back(), ic); |
173 | } | 191 | } |
174 | accum.push_back(ic); | 192 | accum.push_back(ic); |
@@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord( | @@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord( | ||
182 | this->env.getCharsetConverter().next(currInput, inputEnd); | 200 | this->env.getCharsetConverter().next(currInput, inputEnd); |
183 | } | 201 | } |
184 | } | 202 | } |
203 | + // cerr << "end of word" << endl; | ||
185 | // we are at the end of word | 204 | // we are at the end of word |
186 | if (state.isAccepting()) { | 205 | if (state.isAccepting()) { |
187 | vector<InterpsGroup > val(state.getValue()); | 206 | vector<InterpsGroup > val(state.getValue()); |
188 | for (unsigned int i = 0; i < val.size(); i++) { | 207 | for (unsigned int i = 0; i < val.size(); i++) { |
189 | InterpsGroup& ig = val[i]; | 208 | InterpsGroup& ig = val[i]; |
190 | - SegrulesStateType newSegrulesState = segrulesState; | ||
191 | - newSegrulesState.proceedToNext(ig.type); | ||
192 | - if (newSegrulesState.isAccepting()) { | ||
193 | - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | ||
194 | - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | ||
195 | - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; | ||
196 | - if (!accum.empty() | ||
197 | - && (accum.back().shiftOrth | ||
198 | - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { | ||
199 | - doShiftOrth(accum.back(), ic); | 209 | + // cerr << "currInput=" << currInput << endl; |
210 | + // cerr << "type=" << (int) ig.type << endl; | ||
211 | + set<SegrulesState> newSegrulesStates; | ||
212 | + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | ||
213 | + for ( | ||
214 | + set<SegrulesState>::iterator it = newSegrulesStates.begin(); | ||
215 | + it != newSegrulesStates.end(); | ||
216 | + it++) { | ||
217 | + SegrulesState newSegrulesState = *it; | ||
218 | + if (newSegrulesState.accepting) { | ||
219 | + InterpretedChunk ic = { | ||
220 | + inputData, | ||
221 | + originalCodepoints, | ||
222 | + lowercaseCodepoints, | ||
223 | + ig, | ||
224 | + newSegrulesState.shiftOrthFromPrevious, | ||
225 | + false, | ||
226 | + vector<InterpretedChunk>()}; | ||
227 | + if (!accum.empty() && accum.back().shiftOrth) { | ||
228 | + doShiftOrth(accum.back(), ic); | ||
229 | + } | ||
230 | + accum.push_back(ic); | ||
231 | + graph.addPath(accum); | ||
232 | + accum.pop_back(); | ||
200 | } | 233 | } |
201 | - accum.push_back(ic); | ||
202 | - graph.addPath(accum); | ||
203 | - accum.pop_back(); | ||
204 | - } else if (!newSegrulesState.isSink()) { | ||
205 | - } else { | ||
206 | } | 234 | } |
207 | } | 235 | } |
208 | } | 236 | } |
morfeusz/Morfeusz.hpp
@@ -12,6 +12,7 @@ | @@ -12,6 +12,7 @@ | ||
12 | #include <list> | 12 | #include <list> |
13 | #include <vector> | 13 | #include <vector> |
14 | #include <map> | 14 | #include <map> |
15 | +#include <set> | ||
15 | #include "EncodedInterpretation.hpp" | 16 | #include "EncodedInterpretation.hpp" |
16 | #include "fsa/fsa.hpp" | 17 | #include "fsa/fsa.hpp" |
17 | #include "MorphInterpretation.hpp" | 18 | #include "MorphInterpretation.hpp" |
@@ -27,6 +28,7 @@ | @@ -27,6 +28,7 @@ | ||
27 | #include "Environment.hpp" | 28 | #include "Environment.hpp" |
28 | 29 | ||
29 | #include "segrules/segrules.hpp" | 30 | #include "segrules/segrules.hpp" |
31 | +#include "segrules/SegrulesFSA.hpp" | ||
30 | 32 | ||
31 | class Morfeusz; | 33 | class Morfeusz; |
32 | class ResultsIterator; | 34 | class ResultsIterator; |
@@ -111,7 +113,7 @@ private: | @@ -111,7 +113,7 @@ private: | ||
111 | const char* inputEnd, | 113 | const char* inputEnd, |
112 | std::vector<InterpretedChunk>& accum, | 114 | std::vector<InterpretedChunk>& accum, |
113 | FlexionGraph& graph, | 115 | FlexionGraph& graph, |
114 | - SegrulesStateType segrulesState) const; | 116 | + SegrulesState segrulesState) const; |
115 | 117 | ||
116 | void appendIgnotiumToResults( | 118 | void appendIgnotiumToResults( |
117 | const std::string& word, | 119 | const std::string& word, |
@@ -120,17 +122,13 @@ private: | @@ -120,17 +122,13 @@ private: | ||
120 | Environment env; | 122 | Environment env; |
121 | const unsigned char* analyzerPtr; | 123 | const unsigned char* analyzerPtr; |
122 | FSAType* analyzerFSA; | 124 | FSAType* analyzerFSA; |
123 | - std::map<SegrulesOptions, SegrulesFSAType*> segrulesFSAsMap; | 125 | + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; |
126 | + SegrulesFSA* currSegrulesFSA; | ||
124 | bool isAnalyzerFSAFromFile; | 127 | bool isAnalyzerFSAFromFile; |
125 | 128 | ||
126 | const unsigned char* generatorPtr; | 129 | const unsigned char* generatorPtr; |
127 | bool isGeneratorFSAFromFile; | 130 | bool isGeneratorFSAFromFile; |
128 | Generator generator; | 131 | Generator generator; |
129 | -// const CharsetConverter* charsetConverter; | ||
130 | -// const Tagset* tagset; | ||
131 | -// const CaseConverter* caseConverter; | ||
132 | -// | ||
133 | -// UTF8CharsetConverter utf8CharsetConverter; | ||
134 | 132 | ||
135 | MorfeuszOptions options; | 133 | MorfeuszOptions options; |
136 | }; | 134 | }; |
morfeusz/segrules/SegrulesFSA.hpp
0 → 100644
1 | +/* | ||
2 | + * File: SegrulesFSA.hpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 12 marzec 2014, 17:52 | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef SEGRULESFSA_HPP | ||
9 | +#define SEGRULESFSA_HPP | ||
10 | + | ||
11 | +#include <set> | ||
12 | +#include "../endianness.hpp" | ||
13 | + | ||
14 | +struct SegrulesState { | ||
15 | + uint16_t offset; | ||
16 | + bool accepting; | ||
17 | + bool weak; | ||
18 | + bool shiftOrthFromPrevious; | ||
19 | +}; | ||
20 | + | ||
21 | +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) | ||
22 | +{ | ||
23 | + return s1.offset < s2.offset; | ||
24 | +} | ||
25 | + | ||
26 | +class SegrulesFSA { | ||
27 | +public: | ||
28 | + SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) { | ||
29 | + SegrulesState state = {0, false, false, false}; | ||
30 | + initialState = state; | ||
31 | + } | ||
32 | + | ||
33 | + void proceedToNext( | ||
34 | + const unsigned char segnum, | ||
35 | + const SegrulesState state, | ||
36 | + std::set<SegrulesState>& newStates) const { | ||
37 | + | ||
38 | + const unsigned char* currPtr = ptr + state.offset; | ||
39 | + currPtr++; | ||
40 | + const unsigned char transitionsNum = *currPtr; | ||
41 | + currPtr++; | ||
42 | + for (unsigned int i = 0; i < transitionsNum; i++) { | ||
43 | + if (*currPtr == segnum) { | ||
44 | + newStates.insert(newStates.begin(), this->transition2State(currPtr)); | ||
45 | + } | ||
46 | + currPtr += 4; | ||
47 | + } | ||
48 | + } | ||
49 | + | ||
50 | + virtual ~SegrulesFSA() {} | ||
51 | + | ||
52 | + SegrulesState initialState; | ||
53 | +private: | ||
54 | + const unsigned char* ptr; | ||
55 | + | ||
56 | + SegrulesState transition2State(const unsigned char* transitionPtr) const { | ||
57 | + unsigned char ACCEPTING_FLAG = 1; | ||
58 | + unsigned char WEAK_FLAG = 2; | ||
59 | + SegrulesState res; | ||
60 | + transitionPtr++; | ||
61 | + res.shiftOrthFromPrevious = *transitionPtr; | ||
62 | + transitionPtr++; | ||
63 | + res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); | ||
64 | + res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; | ||
65 | + res.weak = *(ptr + res.offset) & WEAK_FLAG; | ||
66 | + return res; | ||
67 | + } | ||
68 | +}; | ||
69 | + | ||
70 | +#endif /* SEGRULESFSA_HPP */ | ||
71 | + |
morfeusz/segrules/segrules.cpp
@@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { | @@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { | ||
33 | return res; | 33 | return res; |
34 | } | 34 | } |
35 | 35 | ||
36 | -static inline SegrulesFSAType* deserializeFSA(const unsigned char*& ptr) { | 36 | +static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { |
37 | uint32_t fsaSize = deserializeUint32(ptr); | 37 | uint32_t fsaSize = deserializeUint32(ptr); |
38 | - static SegrulesDeserializer deserializer; | ||
39 | - SegrulesFSAType* res = SegrulesFSAType::getFSA(ptr, deserializer); | 38 | +// static SegrulesDeserializer deserializer; |
39 | + SegrulesFSA* res = new SegrulesFSA(ptr); | ||
40 | ptr += fsaSize; | 40 | ptr += fsaSize; |
41 | return res; | 41 | return res; |
42 | } | 42 | } |
43 | 43 | ||
44 | -map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { | ||
45 | - map<SegrulesOptions, SegrulesFSAType*> res; | 44 | +map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { |
45 | + map<SegrulesOptions, SegrulesFSA*> res; | ||
46 | const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr); | 46 | const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr); |
47 | const unsigned char* currPtr = fsasMapPtr; | 47 | const unsigned char* currPtr = fsasMapPtr; |
48 | unsigned char fsasNum = *currPtr; | 48 | unsigned char fsasNum = *currPtr; |
49 | currPtr++; | 49 | currPtr++; |
50 | for (unsigned char i = 0; i < fsasNum; i++) { | 50 | for (unsigned char i = 0; i < fsasNum; i++) { |
51 | SegrulesOptions options = deserializeOptions(currPtr); | 51 | SegrulesOptions options = deserializeOptions(currPtr); |
52 | - SegrulesFSAType* fsa = deserializeFSA(currPtr); | 52 | + SegrulesFSA* fsa = deserializeFSA(currPtr); |
53 | res[options] = fsa; | 53 | res[options] = fsa; |
54 | } | 54 | } |
55 | return res; | 55 | return res; |
morfeusz/segrules/segrules.hpp
@@ -11,13 +11,13 @@ | @@ -11,13 +11,13 @@ | ||
11 | #include <utility> | 11 | #include <utility> |
12 | #include <map> | 12 | #include <map> |
13 | #include <string> | 13 | #include <string> |
14 | -#include "../fsa/fsa.hpp" | 14 | +#include "SegrulesFSA.hpp" |
15 | 15 | ||
16 | typedef std::map<std::string, std::string> SegrulesOptions; | 16 | typedef std::map<std::string, std::string> SegrulesOptions; |
17 | -typedef State<unsigned char> SegrulesStateType; | ||
18 | -typedef FSA<unsigned char> SegrulesFSAType; | 17 | +//typedef State<unsigned char> SegrulesStateType; |
18 | +//typedef FSA<unsigned char> SegrulesFSAType; | ||
19 | 19 | ||
20 | -std::map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); | 20 | +std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); |
21 | 21 | ||
22 | #endif /* SEGRULES_HPP */ | 22 | #endif /* SEGRULES_HPP */ |
23 | 23 |
nbproject/configurations.xml
@@ -106,14 +106,20 @@ | @@ -106,14 +106,20 @@ | ||
106 | </makeTool> | 106 | </makeTool> |
107 | </makefileType> | 107 | </makefileType> |
108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | + <ccTool flags="1"> | ||
110 | + </ccTool> | ||
109 | </item> | 111 | </item> |
110 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 112 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | + <ccTool flags="1"> | ||
114 | + </ccTool> | ||
111 | </item> | 115 | </item> |
112 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 116 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | </item> | 117 | </item> |
114 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 118 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
115 | </item> | 119 | </item> |
116 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> | 120 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
121 | + <ccTool flags="1"> | ||
122 | + </ccTool> | ||
117 | </item> | 123 | </item> |
118 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" | 124 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
119 | ex="false" | 125 | ex="false" |
@@ -121,7 +127,6 @@ | @@ -121,7 +127,6 @@ | ||
121 | flavor2="8"> | 127 | flavor2="8"> |
122 | <ccTool> | 128 | <ccTool> |
123 | <incDir> | 129 | <incDir> |
124 | - <pElem>build</pElem> | ||
125 | <pElem>/usr/lib/jvm/default-java/include</pElem> | 130 | <pElem>/usr/lib/jvm/default-java/include</pElem> |
126 | <pElem>morfeusz</pElem> | 131 | <pElem>morfeusz</pElem> |
127 | <pElem>build/morfeusz/java</pElem> | 132 | <pElem>build/morfeusz/java</pElem> |
@@ -145,7 +150,6 @@ | @@ -145,7 +150,6 @@ | ||
145 | flavor2="8"> | 150 | flavor2="8"> |
146 | <ccTool> | 151 | <ccTool> |
147 | <incDir> | 152 | <incDir> |
148 | - <pElem>build</pElem> | ||
149 | <pElem>/usr/include/python2.7</pElem> | 153 | <pElem>/usr/include/python2.7</pElem> |
150 | <pElem>morfeusz</pElem> | 154 | <pElem>morfeusz</pElem> |
151 | <pElem>build/morfeusz/python</pElem> | 155 | <pElem>build/morfeusz/python</pElem> |
@@ -173,9 +177,8 @@ | @@ -173,9 +177,8 @@ | ||
173 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 177 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
174 | <ccTool flags="1"> | 178 | <ccTool flags="1"> |
175 | <incDir> | 179 | <incDir> |
176 | - <pElem>build1</pElem> | ||
177 | <pElem>morfeusz</pElem> | 180 | <pElem>morfeusz</pElem> |
178 | - <pElem>build1/morfeusz</pElem> | 181 | + <pElem>morfeusz/build/morfeusz</pElem> |
179 | </incDir> | 182 | </incDir> |
180 | <preprocessorList> | 183 | <preprocessorList> |
181 | <Elem>libmorfeusz_EXPORTS</Elem> | 184 | <Elem>libmorfeusz_EXPORTS</Elem> |
@@ -185,9 +188,8 @@ | @@ -185,9 +188,8 @@ | ||
185 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 188 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
186 | <ccTool flags="1"> | 189 | <ccTool flags="1"> |
187 | <incDir> | 190 | <incDir> |
188 | - <pElem>build1</pElem> | ||
189 | <pElem>morfeusz</pElem> | 191 | <pElem>morfeusz</pElem> |
190 | - <pElem>build1/morfeusz</pElem> | 192 | + <pElem>morfeusz/build/morfeusz</pElem> |
191 | </incDir> | 193 | </incDir> |
192 | <preprocessorList> | 194 | <preprocessorList> |
193 | <Elem>libmorfeusz_EXPORTS</Elem> | 195 | <Elem>libmorfeusz_EXPORTS</Elem> |
@@ -266,12 +268,18 @@ | @@ -266,12 +268,18 @@ | ||
266 | </preprocessorList> | 268 | </preprocessorList> |
267 | </ccTool> | 269 | </ccTool> |
268 | </folder> | 270 | </folder> |
269 | - <folder path="morfeusz/java"> | 271 | + <folder path="morfeusz"> |
270 | <ccTool> | 272 | <ccTool> |
271 | <incDir> | 273 | <incDir> |
272 | <pElem>build</pElem> | 274 | <pElem>build</pElem> |
275 | + </incDir> | ||
276 | + </ccTool> | ||
277 | + </folder> | ||
278 | + <folder path="morfeusz/java"> | ||
279 | + <ccTool> | ||
280 | + <incDir> | ||
273 | <pElem>morfeusz</pElem> | 281 | <pElem>morfeusz</pElem> |
274 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | 282 | + <pElem>/usr/lib/jvm/default-java/include</pElem> |
275 | </incDir> | 283 | </incDir> |
276 | <preprocessorList> | 284 | <preprocessorList> |
277 | <Elem>libjmorfeusz_EXPORTS</Elem> | 285 | <Elem>libjmorfeusz_EXPORTS</Elem> |
@@ -281,7 +289,6 @@ | @@ -281,7 +289,6 @@ | ||
281 | <folder path="morfeusz/python"> | 289 | <folder path="morfeusz/python"> |
282 | <ccTool> | 290 | <ccTool> |
283 | <incDir> | 291 | <incDir> |
284 | - <pElem>build</pElem> | ||
285 | <pElem>/usr/include/python2.7</pElem> | 292 | <pElem>/usr/include/python2.7</pElem> |
286 | <pElem>morfeusz</pElem> | 293 | <pElem>morfeusz</pElem> |
287 | </incDir> | 294 | </incDir> |
@@ -407,18 +414,26 @@ | @@ -407,18 +414,26 @@ | ||
407 | </ccTool> | 414 | </ccTool> |
408 | </item> | 415 | </item> |
409 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> | 416 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
417 | + <ccTool flags="1"> | ||
418 | + </ccTool> | ||
410 | </item> | 419 | </item> |
411 | <item path="morfeusz/charset/CharsetConverter.cpp" | 420 | <item path="morfeusz/charset/CharsetConverter.cpp" |
412 | ex="false" | 421 | ex="false" |
413 | tool="1" | 422 | tool="1" |
414 | flavor2="4"> | 423 | flavor2="4"> |
424 | + <ccTool flags="1"> | ||
425 | + </ccTool> | ||
415 | </item> | 426 | </item> |
416 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> | 427 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
428 | + <ccTool flags="1"> | ||
429 | + </ccTool> | ||
417 | </item> | 430 | </item> |
418 | <item path="morfeusz/charset/conversion_tables.cpp" | 431 | <item path="morfeusz/charset/conversion_tables.cpp" |
419 | ex="false" | 432 | ex="false" |
420 | tool="1" | 433 | tool="1" |
421 | flavor2="4"> | 434 | flavor2="4"> |
435 | + <ccTool flags="1"> | ||
436 | + </ccTool> | ||
422 | </item> | 437 | </item> |
423 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> | 438 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
424 | <ccTool flags="1"> | 439 | <ccTool flags="1"> |
@@ -507,8 +522,12 @@ | @@ -507,8 +522,12 @@ | ||
507 | ex="false" | 522 | ex="false" |
508 | tool="1" | 523 | tool="1" |
509 | flavor2="4"> | 524 | flavor2="4"> |
525 | + <ccTool flags="1"> | ||
526 | + </ccTool> | ||
510 | </item> | 527 | </item> |
511 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> | 528 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
529 | + <ccTool flags="1"> | ||
530 | + </ccTool> | ||
512 | </item> | 531 | </item> |
513 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> | 532 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
514 | <ccTool flags="0"> | 533 | <ccTool flags="0"> |