Commit 00e66248a61ae340a23b5635cfc761be6dbf38cd (1 parent: a6f0d912)
poprawiona obsługa segmentacji (działają już cyfry tak, jak na początku ustalono)
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@112 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 26 changed files with 629 additions and 236 deletions
CMakeLists.txt
... | ... | @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") |
36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorf-0.6.7.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
40 | 40 | endif () |
41 | 41 | endif () |
42 | 42 | |
... | ... |
buildAll.sh
fsabuilder/buildfsa.py
... | ... | @@ -261,8 +261,9 @@ def main(opts): |
261 | 261 | if __name__ == '__main__': |
262 | 262 | import os |
263 | 263 | opts = _parseOptions() |
264 | - try: | |
265 | - main(opts) | |
266 | - except Exception as ex: | |
267 | - print >> sys.stderr, unicode(ex).encode('utf8') | |
264 | +# try: | |
265 | + main(opts) | |
266 | +# except Exception as ex: | |
267 | +# raise ex | |
268 | +# print >> sys.stderr, unicode(ex).encode('utf8') | |
268 | 269 | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -113,12 +113,13 @@ class FSA(object): |
113 | 113 | return q |
114 | 114 | |
115 | 115 | def calculateOffsets(self, sizeCounter): |
116 | - currReverseOffset = 0 | |
117 | - for state in self.initialState.dfs(set()): | |
118 | - currReverseOffset += sizeCounter(state) | |
119 | - state.reverseOffset = currReverseOffset | |
120 | - for state in self.initialState.dfs(set()): | |
121 | - state.offset = currReverseOffset - state.reverseOffset | |
116 | + self.initialState.calculateOffsets(sizeCounter) | |
117 | +# currReverseOffset = 0 | |
118 | +# for state in self.initialState.dfs(set()): | |
119 | +# currReverseOffset += sizeCounter(state) | |
120 | +# state.reverseOffset = currReverseOffset | |
121 | +# for state in self.initialState.dfs(set()): | |
122 | +# state.offset = currReverseOffset - state.reverseOffset | |
122 | 123 | |
123 | 124 | def debug(self): |
124 | 125 | for state in self.initialState.dfs(set()): |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -6,6 +6,7 @@ Created on Oct 20, 2013 |
6 | 6 | |
7 | 7 | import logging |
8 | 8 | from state import State |
9 | +from morfeuszbuilder.utils.serializationUtils import * | |
9 | 10 | |
10 | 11 | class Serializer(object): |
11 | 12 | |
... | ... | @@ -63,7 +64,7 @@ class Serializer(object): |
63 | 64 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
64 | 65 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
65 | 66 | fsaData.extend(self.state2bytearray(state)) |
66 | - res.extend(self.htonl(len(fsaData))) | |
67 | + res.extend(htonl(len(fsaData))) | |
67 | 68 | res.extend(fsaData) |
68 | 69 | res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) |
69 | 70 | return res |
... | ... | @@ -71,9 +72,9 @@ class Serializer(object): |
71 | 72 | def _serializeTags(self, tagsMap): |
72 | 73 | res = bytearray() |
73 | 74 | numOfTags = len(tagsMap) |
74 | - res.extend(self.htons(numOfTags)) | |
75 | + res.extend(htons(numOfTags)) | |
75 | 76 | for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): |
76 | - res.extend(self.htons(tagnum)) | |
77 | + res.extend(htons(tagnum)) | |
77 | 78 | res.extend(self.fsa.encodeWord(tag)) |
78 | 79 | res.append(0) |
79 | 80 | return res |
... | ... | @@ -86,25 +87,6 @@ class Serializer(object): |
86 | 87 | res.extend(self._serializeTags(tagset._name2namenum)) |
87 | 88 | return res |
88 | 89 | |
89 | - # serialize uint16 as big endian | |
90 | - def htons(self, n): | |
91 | - assert n < 65536 | |
92 | - assert n >= 0 | |
93 | - res = bytearray() | |
94 | - res.append((n & 0x00FF00) >> 8) | |
95 | - res.append(n & 0x0000FF) | |
96 | - return res | |
97 | - | |
98 | - # serialize uint32 as big endian | |
99 | - def htonl(self, n): | |
100 | - assert n >= 0 | |
101 | - res = bytearray() | |
102 | - res.append((n & 0xFF000000) >> 24) | |
103 | - res.append((n & 0x00FF0000) >> 16) | |
104 | - res.append((n & 0x0000FF00) >> 8) | |
105 | - res.append(n & 0x000000FF) | |
106 | - return res | |
107 | - | |
108 | 90 | def serializePrologue(self): |
109 | 91 | res = bytearray() |
110 | 92 | |
... | ... | @@ -126,7 +108,7 @@ class Serializer(object): |
126 | 108 | res = bytearray() |
127 | 109 | additionalDataSize = len(additionalData) if additionalData else 0 |
128 | 110 | moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 |
129 | - res.extend(self.htonl(additionalDataSize)) | |
111 | + res.extend(htonl(additionalDataSize)) | |
130 | 112 | |
131 | 113 | # add additional data itself |
132 | 114 | if additionalDataSize: |
... | ... |
fsabuilder/morfeuszbuilder/fsa/state.py
... | ... | @@ -13,7 +13,7 @@ class State(object): |
13 | 13 | |
14 | 14 | def __init__(self, additionalData=None): |
15 | 15 | self.transitionsMap = {} |
16 | - self.transitionsDataMap = {} | |
16 | +# self.transitionsDataMap = {} | |
17 | 17 | self.freq = 0 |
18 | 18 | self.encodedData = None |
19 | 19 | self.reverseOffset = None |
... | ... | @@ -29,11 +29,11 @@ class State(object): |
29 | 29 | def transitionsNum(self): |
30 | 30 | return len(self.transitionsMap) |
31 | 31 | |
32 | - def setTransition(self, byte, nextState): | |
33 | - self.transitionsMap[byte] = nextState | |
34 | - | |
35 | - def setTransitionData(self, byte, data): | |
36 | - self.transitionsDataMap[byte] = data | |
32 | + def setTransition(self, label, nextState): | |
33 | + self.transitionsMap[label] = nextState | |
34 | +# | |
35 | +# def setTransitionData(self, byte, data): | |
36 | +# self.transitionsDataMap[byte] = data | |
37 | 37 | |
38 | 38 | def hasNext(self, byte): |
39 | 39 | return byte in self.transitionsMap |
... | ... | @@ -68,6 +68,14 @@ class State(object): |
68 | 68 | yield state1 |
69 | 69 | yield self |
70 | 70 | |
71 | + def calculateOffsets(self, sizeCounter): | |
72 | + currReverseOffset = 0 | |
73 | + for state in self.dfs(set()): | |
74 | + currReverseOffset += sizeCounter(state) | |
75 | + state.reverseOffset = currReverseOffset | |
76 | + for state in self.dfs(set()): | |
77 | + state.offset = currReverseOffset - state.reverseOffset | |
78 | + | |
71 | 79 | def debug(self): |
72 | 80 | print '----------------' |
73 | 81 | print 'STATE:', self.idx, 'accepting', self.isAccepting() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... | ... | @@ -7,6 +7,7 @@ Created on 23 sty 2014 |
7 | 7 | import re |
8 | 8 | from pyparsing import * |
9 | 9 | from morfeuszbuilder.utils import exceptions |
10 | +from pyparseString import pyparseString | |
10 | 11 | |
11 | 12 | identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') |
12 | 13 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
... | ... | @@ -54,7 +55,7 @@ def _tryToSubstituteNonArgDefine(s, t, defines): |
54 | 55 | else: |
55 | 56 | return defineName |
56 | 57 | |
57 | -def _processLine(lineNum, line, defines): | |
58 | +def _processLine(lineNum, line, defines, filename): | |
58 | 59 | if line.strip(): |
59 | 60 | |
60 | 61 | rule = Forward() |
... | ... | @@ -67,24 +68,16 @@ def _processLine(lineNum, line, defines): |
67 | 68 | rule.setParseAction(lambda s, l, t: ' '.join(t)) |
68 | 69 | defineInstance.setParseAction(lambda s, l, t: _tryToSubstituteArgDefine(s, t, defines)) |
69 | 70 | localId.setParseAction(lambda s, l, t: _tryToSubstituteNonArgDefine(s, t, defines)) |
70 | - try: | |
71 | - return rule.parseString(line, parseAll=True)[0] | |
72 | - except ParseException as ex: | |
73 | - msg = u'Preprocessing of segmentation rules failed.\n' | |
74 | - msg += line + '\n' | |
75 | - msg += (ex.col - 1) * ' ' + '^\n' | |
76 | - msg += ex.msg | |
77 | -# print unicode(exceptions.SegtypesException(msg)).encode('utf8') | |
78 | - raise exceptions.SegtypesException(msg) | |
71 | + return pyparseString(rule, lineNum, line, filename)[0] | |
79 | 72 | else: |
80 | 73 | return line |
81 | 74 | |
82 | -def preprocess(inputLines, defs): | |
75 | +def preprocess(inputLines, defs, filename): | |
83 | 76 | defines = {} |
84 | 77 | ifdefsStack = [] |
85 | 78 | for lineNum, line in inputLines: |
86 | 79 | if line.startswith('#define'): |
87 | - parsedDefine = list(define.parseString(line)) | |
80 | + parsedDefine = list(pyparseString(define, lineNum, line, filename)) | |
88 | 81 | if len(parsedDefine) == 2: |
89 | 82 | name, val = parsedDefine |
90 | 83 | defines[name] = NonArgDefine(name, val) |
... | ... | @@ -92,15 +85,16 @@ def preprocess(inputLines, defs): |
92 | 85 | name, arg, val = parsedDefine |
93 | 86 | localDefines = defines.copy() |
94 | 87 | localDefines[arg] = NonArgDefine(arg, arg) |
95 | - val = _processLine(lineNum, val, localDefines) | |
88 | + val = _processLine(lineNum, val, localDefines, filename) | |
96 | 89 | defines[name] = ArgDefine(name, arg, val) |
97 | 90 | elif line.startswith('#ifdef'): |
98 | - name = ifdef.parseString(line)[0] | |
91 | + name = pyparseString(ifdef, lineNum, line, filename)[0] | |
92 | +# name = ifdef.parseString(line)[0] | |
99 | 93 | ifdefsStack.append(name) |
100 | 94 | elif line.startswith('#endif'): |
101 | 95 | ifdefsStack.pop() |
102 | 96 | elif line.startswith('#'): |
103 | 97 | yield lineNum, line |
104 | 98 | elif len(ifdefsStack) == 0 or all(map(lambda name: name in defs, ifdefsStack)): |
105 | - yield lineNum, _processLine(lineNum, line, defines) | |
99 | + yield lineNum, _processLine(lineNum, line, defines, filename) | |
106 | 100 | |
107 | 101 | \ No newline at end of file |
... | ... |
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
0 → 100644
'''
Created on 12 mar 2014

@author: mlenart
'''

from pyparsing import ParseException

from morfeuszbuilder.utils import exceptions


def pyparseString(rule, lineNum, line, filename):
    """Parse *line* with the pyparsing element *rule*, requiring a full match.

    Returns the pyparsing ParseResults on success.

    On a ParseException, re-raises it as a SegtypesException whose message
    points at the failing file/line and marks the error column with a caret,
    so segmentation-rule errors are reported in terms of the config file.
    """
    try:
        return rule.parseString(line, parseAll=True)
    except ParseException as ex:
        # ex.col is 1-based, hence the (col - 1) spaces before the caret
        msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
        msg += line + '\n'
        msg += (ex.col - 1) * ' ' + '^\n'
        msg += ex.msg
        raise exceptions.SegtypesException(msg)
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -25,16 +25,17 @@ class SegmentRule(object): |
25 | 25 | |
26 | 26 | class TagRule(SegmentRule): |
27 | 27 | |
28 | - def __init__(self, segnum, segtype): | |
28 | + def __init__(self, segnum, shiftOrth, segtype): | |
29 | 29 | self.segnum = segnum |
30 | 30 | self.segtype = segtype |
31 | + self.shiftOrth = shiftOrth | |
31 | 32 | |
32 | 33 | def addToNFA(self, fsa): |
33 | 34 | endState = RulesNFAState(final=True) |
34 | 35 | self._doAddToNFA(fsa.initialState, endState) |
35 | 36 | |
36 | 37 | def _doAddToNFA(self, startState, endState): |
37 | - startState.addTransition(self.segnum, endState) | |
38 | + startState.addTransition((self.segnum, self.shiftOrth), endState) | |
38 | 39 | |
39 | 40 | def __str__(self): |
40 | 41 | return u'%s(%d)' % (self.segtype, self.segnum) |
... | ... | @@ -92,6 +93,7 @@ class ZeroOrMoreRule(UnaryRule): |
92 | 93 | |
93 | 94 | def __init__(self, child): |
94 | 95 | super(ZeroOrMoreRule, self).__init__(child) |
96 | + assert isinstance(child, SegmentRule) | |
95 | 97 | |
96 | 98 | def addToNFA(self, fsa): |
97 | 99 | raise ValueError() |
... | ... | @@ -108,33 +110,3 @@ class ZeroOrMoreRule(UnaryRule): |
108 | 110 | |
109 | 111 | def __str__(self): |
110 | 112 | return u'(' + str(self.child) + ')*' |
111 | - | |
112 | -class ShiftOrthRule(UnaryRule): | |
113 | - | |
114 | - def __init__(self, child): | |
115 | - super(ShiftOrthRule, self).__init__(child) | |
116 | - | |
117 | - def addToNFA(self, fsa): | |
118 | - raise ValueError() | |
119 | - | |
120 | - def _doAddToNFA(self, startState, endState): | |
121 | - self.child._doAddToNFA(startState, endState) | |
122 | - startState.setTransitionData(self.child.segnum, 1) | |
123 | - | |
124 | - def __str__(self): | |
125 | - return u'(' + str(self.child) + ')>' | |
126 | - | |
127 | -class ShiftOrthSameTypeRule(UnaryRule): | |
128 | - | |
129 | - def __init__(self, child): | |
130 | - super(ShiftOrthSameTypeRule, self).__init__(child) | |
131 | - | |
132 | - def addToNFA(self, fsa): | |
133 | - raise ValueError() | |
134 | - | |
135 | - def _doAddToNFA(self, startState, endState): | |
136 | - self.child._doAddToNFA(startState, endState) | |
137 | - startState.setTransitionData(self.child.segnum, 2) | |
138 | - | |
139 | - def __str__(self): | |
140 | - return u'(' + str(self.child) + ')!>' | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
0 → 100644
1 | +''' | |
2 | +Created on 12 mar 2014 | |
3 | + | |
4 | +@author: mlenart | |
5 | +''' | |
6 | +import logging | |
7 | +from morfeuszbuilder.fsa import state | |
8 | +from morfeuszbuilder.utils.serializationUtils import htons | |
9 | + | |
class RulesState(state.State):
    """FSA state specialized for segmentation-rule automata.

    Accepting states additionally remember whether the accepted rule is
    "weak"; that flag is encoded as a single byte of state data.
    """

    def __init__(self):
        super(RulesState, self).__init__()
        # None until setAsAccepting() is called, then True/False.
        self.weak = None

    def setAsAccepting(self, weak):
        """Mark this state as accepting and record its weakness flag."""
        self.weak = weak
        self.encodedData = bytearray([1]) if weak else bytearray([0])

    def getEncodedSize(self):
        """Return the size in bytes of this state's serialized form."""
        # 1 flags byte + 1 transition-count byte, then 4 bytes per transition.
        transitionsSize = len(self.transitionsMap) * 4
        return transitionsSize + 2
24 | + | |
class RulesFSA(object):
    """Deterministic finite-state automaton over segmentation rules.

    Serialized layout, one state after another in offset order:
      * 1 flags byte (ACCEPTING_FLAG, WEAK_FLAG),
      * 1 byte holding the number of outgoing transitions,
      * 4 bytes per transition: segment number, shift-orth flag (0/1),
        and the target state's offset as a big-endian uint16.
    """

    def __init__(self):
        self.initialState = state.State()
        # Bit flags stored in the first byte of each serialized state.
        self.ACCEPTING_FLAG = 1
        self.WEAK_FLAG = 2

    def stateData2bytearray(self, state):
        """Serialize the state header: flags byte + transition-count byte."""
        res = bytearray()
        firstByte = 0
        if state.isAccepting():
            firstByte |= self.ACCEPTING_FLAG
        if state.weak:
            firstByte |= self.WEAK_FLAG
        assert firstByte < 256 and firstByte >= 0
        res.append(firstByte)

        secondByte = len(state.transitionsMap)
        assert secondByte < 256 and secondByte >= 0
        res.append(secondByte)

        return res

    def transitionsData2bytearray(self, state):
        """Serialize all outgoing transitions of *state*, 4 bytes each."""
        res = bytearray()
        for (segnum, shiftOrth), nextState in state.transitionsMap.iteritems():
            res.append(segnum)
            res.append(1 if shiftOrth else 0)
            offset = nextState.offset
            # Target offsets are stored as uint16, so the whole automaton
            # must fit within 64 KB.
            assert offset < 65536
            res.extend(htons(offset))
        return res

    def serialize(self):
        """Serialize the whole automaton into a bytearray."""
        self.initialState.calculateOffsets(sizeCounter=lambda s: s.getEncodedSize())
        res = bytearray()

        for state in sorted(self.initialState.dfs(set()), key=lambda s: s.offset):
            res.extend(self.stateData2bytearray(state))
            res.extend(self.transitionsData2bytearray(state))

        logging.info('Segmentation automaton size: %d bytes', len(res))
        return res
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -4,7 +4,7 @@ Created on 20 lut 2014 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | import logging |
7 | -from morfeuszbuilder.fsa.serializer import SimpleSerializer | |
7 | +from morfeuszbuilder.utils.serializationUtils import htons, htonl | |
8 | 8 | |
9 | 9 | class RulesManager(object): |
10 | 10 | |
... | ... | @@ -52,9 +52,9 @@ class RulesManager(object): |
52 | 52 | |
53 | 53 | def _serializeDFA(self, dfa): |
54 | 54 | res = bytearray() |
55 | - serializer = SimpleSerializer(dfa, serializeTransitionsData=True) | |
56 | - dfaBytearray = serializer.fsa2bytearray() | |
57 | - res.extend(serializer.htonl(len(dfaBytearray))) | |
55 | +# serializer = SimpleSerializer(dfa, serializeTransitionsData=True) | |
56 | + dfaBytearray = dfa.serialize() | |
57 | + res.extend(htonl(len(dfaBytearray))) | |
58 | 58 | res.extend(dfaBytearray) |
59 | 59 | return res |
60 | 60 | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... | ... | @@ -4,7 +4,7 @@ Created on 24 sty 2014 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | |
7 | -from morfeuszbuilder.fsa import fsa, state, encode | |
7 | +from morfeuszbuilder.segrules.rulesFSA import RulesFSA, RulesState | |
8 | 8 | |
9 | 9 | class RulesNFAState(object): |
10 | 10 | |
... | ... | @@ -12,7 +12,7 @@ class RulesNFAState(object): |
12 | 12 | |
13 | 13 | def __init__(self, initial=False, final=False, weak=False): |
14 | 14 | self.transitionsMap = {} |
15 | - self.transitionsDataMap = {} | |
15 | +# self.transitionsDataMap = {} | |
16 | 16 | self.initial = initial |
17 | 17 | self.final = final |
18 | 18 | self.weak = weak |
... | ... | @@ -20,13 +20,9 @@ class RulesNFAState(object): |
20 | 20 | RulesNFAState.statesCounter += 1 |
21 | 21 | |
22 | 22 | def addTransition(self, label, targetState): |
23 | + assert label is None or len(label) == 2 | |
23 | 24 | self.transitionsMap.setdefault(label, set()) |
24 | 25 | self.transitionsMap[label].add(targetState) |
25 | - self.transitionsDataMap[label] = 0 | |
26 | - | |
27 | - def setTransitionData(self, label, byte): | |
28 | - assert len(self.transitionsMap[label]) == 1 | |
29 | - self.transitionsDataMap[label] = byte | |
30 | 26 | |
31 | 27 | def getClosure(self, visited): |
32 | 28 | if self in visited: |
... | ... | @@ -64,10 +60,11 @@ class RulesNFA(object): |
64 | 60 | for nfaState in nfaStates: |
65 | 61 | for label, nextStates in nfaState.transitionsMap.iteritems(): |
66 | 62 | if label is not None: |
67 | - transitionData = nfaState.transitionsDataMap[label] | |
68 | - res.setdefault((label, transitionData), set()) | |
63 | +# transitionData = nfaState.transitionsDataMap[label] | |
64 | + segnum, shiftOrth = label | |
65 | + res.setdefault((segnum, shiftOrth), set()) | |
69 | 66 | for nextNFAState in nextStates: |
70 | - res[(label, transitionData)] |= nextNFAState.getClosure(set()) | |
67 | + res[(segnum, shiftOrth)] |= nextNFAState.getClosure(set()) | |
71 | 68 | return res |
72 | 69 | |
73 | 70 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
... | ... | @@ -79,23 +76,24 @@ class RulesNFA(object): |
79 | 76 | if final: |
80 | 77 | # dfaState should be final |
81 | 78 | # and contain info about weakness |
82 | - dfaState.encodedData = bytearray([1 if weak else 0]) | |
83 | - for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | |
79 | + dfaState.setAsAccepting(weak=weak) | |
80 | +# dfaState.encodedData = bytearray([1 if weak else 0]) | |
81 | + for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | |
84 | 82 | key = frozenset(nextNFAStates) |
85 | 83 | if key in nfaSubset2DFAState: |
86 | 84 | nextDFAState = nfaSubset2DFAState[key] |
87 | 85 | else: |
88 | - nextDFAState = state.State() | |
86 | + nextDFAState = RulesState() | |
89 | 87 | nfaSubset2DFAState[key] = nextDFAState |
90 | 88 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) |
91 | - dfaState.setTransition(label, nextDFAState) | |
92 | - dfaState.setTransitionData(label, transitionData) | |
89 | + dfaState.setTransition((segnum, shiftOrth), nextDFAState) | |
90 | +# dfaState.setTransitionData(label, transitionData) | |
93 | 91 | |
94 | 92 | def convertToDFA(self): |
95 | - dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) | |
93 | + dfa = RulesFSA() | |
96 | 94 | startStates = self.initialState.getClosure(set()) |
97 | 95 | assert not any(filter(lambda s: s.final, startStates)) |
98 | - dfa.initialState = state.State(additionalData=False) | |
96 | + dfa.initialState = RulesState() | |
99 | 97 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) |
100 | 98 | return dfa |
101 | 99 | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -3,7 +3,7 @@ from pyparsing import * |
3 | 3 | ParserElement.enablePackrat() |
4 | 4 | from morfeuszbuilder.tagset import segtypes |
5 | 5 | from morfeuszbuilder.utils import configFile, exceptions |
6 | -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager | |
6 | +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString | |
7 | 7 | import codecs |
8 | 8 | import re |
9 | 9 | |
... | ... | @@ -48,8 +48,8 @@ class RulesParser(object): |
48 | 48 | if not firstNFA: |
49 | 49 | firstNFA = nfa |
50 | 50 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
51 | - combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) | |
52 | - for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | |
51 | + combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) | |
52 | + for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): | |
53 | 53 | # print rule |
54 | 54 | rule.addToNFA(nfa) |
55 | 55 | # nfa.debug() |
... | ... | @@ -60,25 +60,24 @@ class RulesParser(object): |
60 | 60 | res.addDFA(key2Def, dfa) |
61 | 61 | return res |
62 | 62 | |
63 | - def _doParse(self, combinationEnumeratedLines, segtypesHelper): | |
63 | + def _doParse(self, combinationEnumeratedLines, segtypesHelper, filename): | |
64 | 64 | for lineNum, line in combinationEnumeratedLines: |
65 | 65 | if not line.startswith('#'): |
66 | - yield self._doParseOneLine(lineNum, line, segtypesHelper) | |
66 | + yield self._doParseOneLine(lineNum, line, segtypesHelper, filename) | |
67 | 67 | |
68 | - def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): | |
68 | + def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): | |
69 | 69 | if not segtypesHelper.hasSegtype(segtype): |
70 | 70 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) |
71 | 71 | else: |
72 | 72 | # return rules.TagRule(segtype) |
73 | - return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), segtype) | |
73 | + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype) | |
74 | 74 | |
75 | - def _doParseOneLine(self, lineNum, line, segtypesHelper): | |
75 | + def _doParseOneLine(self, lineNum, line, segtypesHelper, filename): | |
76 | 76 | rule = Forward() |
77 | 77 | tagRule = Word(alphanums+'_') |
78 | - shiftOrthRule = tagRule + '>' | |
79 | - shiftOrthSameTypeRule = tagRule + '!' + '>' | |
78 | + shiftOrthRule = Word(alphanums+'_') + Suppress('>') | |
80 | 79 | parenRule = Suppress('(') + rule + Suppress(')') |
81 | - atomicRule = tagRule ^ shiftOrthRule ^ shiftOrthSameTypeRule ^ parenRule | |
80 | + atomicRule = tagRule ^ shiftOrthRule ^ parenRule | |
82 | 81 | zeroOrMoreRule = atomicRule + Suppress('*') |
83 | 82 | oneOrMoreRule = atomicRule + Suppress('+') |
84 | 83 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
... | ... | @@ -87,13 +86,12 @@ class RulesParser(object): |
87 | 86 | concatRule = OneOrMore(complexRule) |
88 | 87 | rule << concatRule |
89 | 88 | |
90 | - tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | |
91 | - shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) | |
92 | - shiftOrthSameTypeRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthSameTypeRule(toks[0])) | |
89 | + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) | |
90 | + shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) | |
93 | 91 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
94 | 92 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
95 | 93 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
96 | 94 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
97 | 95 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
98 | - parsedRule = rule.parseString(line, parseAll=True)[0] | |
96 | + parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] | |
99 | 97 | return parsedRule |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -33,6 +33,7 @@ class Segtypes(object): |
33 | 33 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) |
34 | 34 | |
35 | 35 | def _readTags(self, segrulesConfigFile): |
36 | + gotWildcardPattern = False | |
36 | 37 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): |
37 | 38 | splitLine = re.split(r'\s+', line.strip()) |
38 | 39 | self._validate( |
... | ... | @@ -49,13 +50,27 @@ class Segtypes(object): |
49 | 50 | lineNum, |
50 | 51 | re.match(r'[a-z_\.\:\%]+', pattern)) |
51 | 52 | |
53 | + self._validate( | |
54 | + u'Pattern that matches everything must be the last one', | |
55 | + lineNum - 1, | |
56 | + not gotWildcardPattern) | |
57 | + | |
52 | 58 | if segtype in self.segtype2Segnum: |
53 | 59 | segnum = self.segtype2Segnum[segtype] |
54 | 60 | else: |
55 | 61 | segnum = len(self.segtype2Segnum) |
56 | 62 | self.segtype2Segnum[segtype] = segnum |
57 | 63 | |
58 | - self.patternsList.append(SegtypePattern(None, pattern, segnum)) | |
64 | + segtypePattern = SegtypePattern(None, pattern, segnum) | |
65 | + | |
66 | + self._validate( | |
67 | + u'There is no tag that matches pattern "%s".' % pattern, | |
68 | + lineNum, | |
69 | + any([segtypePattern.tryToMatch(None, tag) != -1 for tag in self.tagset.getAllTags()])) | |
70 | + | |
71 | + self.patternsList.append(segtypePattern) | |
72 | + | |
73 | + gotWildcardPattern = gotWildcardPattern or pattern == '%' | |
59 | 74 | |
60 | 75 | self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) |
61 | 76 | |
... | ... | @@ -67,7 +82,7 @@ class Segtypes(object): |
67 | 82 | lineNum, |
68 | 83 | re.match(r'[a-z_]+', segtype)) |
69 | 84 | self._validate( |
70 | - u'Pattern must contain lemma and POS', | |
85 | + u'Pattern must contain lemma and part-of-speech fields', | |
71 | 86 | lineNum, |
72 | 87 | re.match(r'.+\:[a-z_]+', pattern, re.U)) |
73 | 88 | |
... | ... | @@ -79,7 +94,14 @@ class Segtypes(object): |
79 | 94 | |
80 | 95 | lemma, pos = pattern.split(':') |
81 | 96 | |
82 | - self.patternsList.append(SegtypePattern(lemma, '%s|%s:%%' % (pos, pos), segnum)) | |
97 | + segtypePattern = SegtypePattern(lemma, pos + ':%', segnum) | |
98 | + | |
99 | + self._validate( | |
100 | + u'There is no tag that matches pattern "%s".' % (pos + ':%'), | |
101 | + lineNum, | |
102 | + any([segtypePattern.tryToMatch(lemma, tag) != -1 for tag in self.tagset.getAllTags()])) | |
103 | + | |
104 | + self.patternsList.append(segtypePattern) | |
83 | 105 | |
84 | 106 | def _debugSegnums(self): |
85 | 107 | for tagnum, segnum in self._tagnum2Segnum.items(): |
... | ... | @@ -121,11 +143,6 @@ class Segtypes(object): |
121 | 143 | if not res: |
122 | 144 | res = self._tagnum2Segnum.get(tagnum, None) |
123 | 145 | return res |
124 | -# for p in self.patternsList: | |
125 | -# res = p.tryToMatch(lemma, tag) | |
126 | -# if res >= 0: | |
127 | -# return res | |
128 | -# return None | |
129 | 146 | |
130 | 147 | class SegtypePattern(object): |
131 | 148 | |
... | ... | @@ -135,8 +152,13 @@ class SegtypePattern(object): |
135 | 152 | self.segnum = segnum |
136 | 153 | |
137 | 154 | def tryToMatch(self, lemma, tag): |
155 | +# tag2Match = tag + ':' if not tag.endswith(':') else tag | |
156 | +# print tag2Match | |
157 | + patterns2Match = [] | |
158 | + patterns2Match.append(self.pattern.replace('%', '.*')) | |
159 | + patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) | |
138 | 160 | if (self.lemma is None or self.lemma == lemma) \ |
139 | - and re.match(self.pattern.replace('%', '.*'), tag): | |
161 | + and any([re.match(p, tag) for p in patterns2Match]): | |
140 | 162 | return self.segnum |
141 | 163 | else: |
142 | 164 | return -1 |
... | ... |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
0 → 100644
1 | +''' | |
2 | +Created on 12 mar 2014 | |
3 | + | |
4 | +@author: mlenart | |
5 | +''' | |
6 | + | |
# serialize uint16 as big endian
def htons(n):
    """Return *n* as a 2-byte big-endian (network byte order) bytearray.

    Mirrors the C htons() helper. *n* must satisfy 0 <= n < 2**16;
    out-of-range values trip the assertions.
    """
    assert n < 65536
    assert n >= 0
    # Plain shift-and-mask; the original 24-bit mask (0x00FF00) was
    # misleading for a 16-bit value.
    return bytearray([(n >> 8) & 0xFF, n & 0xFF])
15 | + | |
# serialize uint32 as big endian
def htonl(n):
    """Return *n* as a 4-byte big-endian (network byte order) bytearray.

    Mirrors the C htonl() helper. *n* must be non-negative; values above
    2**32 - 1 are silently truncated to their low 32 bits (as before).
    """
    assert n >= 0
    return bytearray([
        (n >> 24) & 0xFF,
        (n >> 16) & 0xFF,
        (n >> 8) & 0xFF,
        n & 0xFF,
    ])
... | ... |
input/dodatki.tab
... | ... | @@ -41,13 +41,171 @@ z Z brev:pun |
41 | 41 | ż Ż brev:pun |
42 | 42 | ch Ch brev:pun |
43 | 43 | st St brev:pun |
44 | -0 0 dig | |
45 | -1 1 dig | |
46 | -2 2 dig | |
47 | -3 3 dig | |
48 | -4 4 dig | |
49 | -5 5 dig | |
50 | -6 6 dig | |
51 | -7 7 dig | |
52 | -8 8 dig | |
53 | -9 9 dig | |
44 | +poli poli prefa | |
45 | +poli poli prefs | |
46 | +niby niby prefa | |
47 | +niby niby prefs | |
48 | +eks eks prefs | |
49 | +ex ex prefs | |
50 | +euro euro prefa | |
51 | +euro euro prefs | |
52 | +mikro mikro prefs | |
53 | +mikro mikro prefa | |
54 | +makro makro prefa | |
55 | +makro makro prefs | |
56 | +bez bez prefa | |
57 | +do do prefv | |
58 | +do do prefa | |
59 | +dez dez prefv | |
60 | +dez dez prefa | |
61 | +dez dez prefs | |
62 | +ko ko prefa | |
63 | +ko ko prefs | |
64 | +między między prefa | |
65 | +między między prefs | |
66 | +na na prefa | |
67 | +na na prefs | |
68 | +na na prefv | |
69 | +nad nad prefa | |
70 | +nad nad prefs | |
71 | +nad nad prefv | |
72 | +o o prefv | |
73 | +ob ob prefv | |
74 | +od od prefa | |
75 | +od od prefs | |
76 | +od od prefv | |
77 | +pra pra prefs | |
78 | +post post prefa | |
79 | +post post prefs | |
80 | +pod pod prefa | |
81 | +pod pod prefs | |
82 | +pod pod prefv | |
83 | +poza poza prefa | |
84 | +ponad ponad prefa | |
85 | +pre pre prefa | |
86 | +pre pre prefs | |
87 | +pro pro prefa | |
88 | +pro pro prefs | |
89 | +prze prze prefa | |
90 | +prze prze prefv | |
91 | +przeciw przeciw prefa | |
92 | +przeciw przeciw prefs | |
93 | +re re prefa | |
94 | +re re prefs | |
95 | +re re prefv | |
96 | +przy przy prefa | |
97 | +przy przy prefv | |
98 | +roz roz prefv | |
99 | +u u prefv | |
100 | +samo samo prefa | |
101 | +samo samo prefs | |
102 | +video video prefs | |
103 | +video video prefa | |
104 | +w w prefv | |
105 | +wy wy prefv | |
106 | +współ współ prefv | |
107 | +współ współ prefa | |
108 | +współ współ prefs | |
109 | +wice wice prefs | |
110 | +neo neo prefa | |
111 | +neo neo prefs | |
112 | +tele tele prefs | |
113 | +tele tele prefa | |
114 | +z z prefv | |
115 | +za za prefv | |
116 | +za za prefa | |
117 | +za za prefs | |
118 | +wideo wideo prefa | |
119 | +wideo wideo prefs | |
120 | +meta meta prefs | |
121 | +meta meta prefa | |
122 | +multi multi prefa | |
123 | +multi multi prefs | |
124 | +mega mega prefa | |
125 | +mega mega prefs | |
126 | +kontra kontra prefs | |
127 | +kontra kontra prefa | |
128 | +inter inter prefa | |
129 | +inter inter prefs | |
130 | +homo homo prefs | |
131 | +homo homo prefa | |
132 | +ekstra ekstra prefa | |
133 | +ekstra ekstra prefs | |
134 | +giga giga prefa | |
135 | +giga giga prefs | |
136 | +bi bi prefs | |
137 | +bi bi prefa | |
138 | +auto auto prefs | |
139 | +auto auto prefa | |
140 | +de de prefv | |
141 | +de de prefa | |
142 | +de de prefs | |
143 | +ultra ultra prefs | |
144 | +ultra ultra prefa | |
145 | +e- e- prefa | |
146 | +e- e- prefs | |
147 | +mini mini prefs | |
148 | +mini mini prefa | |
149 | +maxi maxi prefs | |
150 | +maxi maxi prefa | |
151 | +midi midi prefs | |
152 | +midi midi prefa | |
153 | +arcy arcy prefs | |
154 | +arcy arcy prefa | |
155 | +anty anty prefa | |
156 | +anty anty prefs | |
157 | +a a prefa | |
158 | +a a prefs | |
159 | +pan pan prefs | |
160 | +pan pan prefa | |
161 | +in in prefa | |
162 | +in in prefs | |
163 | +dys dys prefs | |
164 | +dys dys prefa | |
165 | +mono mono prefa | |
166 | +mono mono prefs | |
167 | +porno porno prefs | |
168 | +porno porno prefa | |
169 | +anglo anglo prefa | |
170 | +aero aero prefs | |
171 | +aero aero prefa | |
172 | +bio bio prefs | |
173 | +bio bio prefa | |
174 | +wszystko wszystko prefs | |
175 | +wszystko wszystko prefa | |
176 | +wszech wszech prefs | |
177 | +wszech wszech prefa | |
178 | +śród śród prefs | |
179 | +śród śród prefa | |
180 | +audio audio prefs | |
181 | +audio audio prefa | |
182 | +eko eko prefs | |
183 | +eko eko prefa | |
184 | +s s prefv | |
185 | +elektro elektro prefs | |
186 | +elektro elektro prefa | |
187 | +trans trans prefa | |
188 | +trans trans prefs | |
189 | +kontr kontr prefs | |
190 | +kontr kontr prefa | |
191 | +pseudo pseudo prefs | |
192 | +pseudo pseudo prefa | |
193 | +quasi quasi prefs | |
194 | +quasi quasi prefa | |
195 | +super super prefs | |
196 | +super super prefa | |
197 | +po po prefv | |
198 | +po po prefa | |
199 | +po po prefs | |
200 | +sub sub prefs | |
201 | +sub sub prefa | |
202 | +hiper hiper prefa | |
203 | +hiper hiper prefs | |
204 | +non non prefs | |
205 | +non non prefa | |
206 | +stereo stereo prefa | |
207 | +stereo stereo prefs | |
208 | +energo energo prefa | |
209 | +para para prefa | |
210 | +para para prefs | |
211 | +ś ś prefv | |
... | ... |
input/polimorf.tagset
input/segmenty.dat
... | ... | @@ -19,7 +19,7 @@ samotny |
19 | 19 | # przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: |
20 | 20 | moze_interp(praet_sg_na) |
21 | 21 | |
22 | -# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: | |
22 | +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „moze”: | |
23 | 23 | moze_interp(praet_sg) |
24 | 24 | |
25 | 25 | # przeszlik mnogi, np. „czytali”: |
... | ... | @@ -69,9 +69,8 @@ moze_interp(praet_sg by aglsg) |
69 | 69 | # np. „gnietli·by·śmy” |
70 | 70 | moze_interp(praet_pl by aglpl) |
71 | 71 | #else |
72 | -moze_interp(praetcond) | |
72 | +# moze_interp(praetcond) | |
73 | 73 | #endif |
74 | - | |
75 | 74 | # np. „by·ś” |
76 | 75 | moze_interp(by aglsg) |
77 | 76 | # np. „by·ście” |
... | ... | @@ -98,9 +97,9 @@ moze_interp( (adja dywiz)+ adj ) |
98 | 97 | # adja dywiz adja dywiz adja dywiz adj interp? |
99 | 98 | # adja dywiz adja dywiz adja dywiz adja dywiz adj interp? |
100 | 99 | |
101 | -# Stopień najwyższy: | |
102 | -# np. „naj·zieleńszy”, „naj·mądrzej” | |
103 | -moze_interp( naj> adj_sup ) | |
100 | +# Formy zanegowane stopnia wyższego przymiotników i przysłówków (WK) | |
101 | +# np. „nie·grzeczniejszy”, „nie·grzeczniej” | |
102 | +moze_interp( nie> adj_com ) | |
104 | 103 | |
105 | 104 | # Formy „zanegowane” gerundiów i imiesłowów: |
106 | 105 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
... | ... | @@ -112,15 +111,21 @@ moze_interp(z_on_agl) |
112 | 111 | moze_interp(z_on_agl on_agl) |
113 | 112 | |
114 | 113 | # Liczba zapisana jako ciąg cyfr: |
115 | -moze_interp( dig!>+ ) | |
114 | +moze_interp( dig ) | |
116 | 115 | |
117 | 116 | # Formacje prefiksalne |
118 | 117 | #### trzeba wydzielić odpowiednie samodze! |
119 | -# rzeczownikowe i przymiotnikowe | |
120 | -# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy” | |
121 | -moze_interp( prefs samodz ) | |
118 | +# rzeczownikowe | |
119 | +# np. „euro·sodoma”, „e-·papieros” | |
120 | +moze_interp(nomina) | |
121 | +moze_interp( prefs> nomina ) | |
122 | 122 | # czasownikowe np. „po·nakapywać” |
123 | -moze_interp( prefv samodz ) | |
123 | +moze_interp(verba_imperf) | |
124 | +moze_interp( prefv> verba_imperf ) | |
125 | +# przymiotnikowe np. „do·żylny”, „euro·sodomski”, „bez·argumentowy” | |
126 | +moze_interp(adjectiva) | |
127 | +moze_interp(prefa> adj) | |
128 | +moze_interp( prefa> adjectiva ) | |
124 | 129 | |
125 | 130 | # Apozycje z dywizem |
126 | 131 | # np. „kobieta-prezydent” |
... | ... | @@ -133,11 +138,28 @@ adj dywiz samodz |
133 | 138 | # ? |
134 | 139 | samodz dywiz adj |
135 | 140 | |
141 | +#### PONIŻEJ REGUŁY WK | |
142 | +# Stopień najwyższy: | |
143 | +# np. „naj·zieleńszy”, „naj·mądrzej” | |
144 | +moze_interp( naj> adj_sup ) | |
145 | +# Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj | |
146 | +moze_interp( praet_sg dywiz li) | |
147 | +moze_interp( praet_pl dywiz li) | |
148 | +moze_interp( praet_sg_na dywiz li) | |
149 | +moze_interp( fin dywiz li) | |
150 | + | |
151 | +# i bez dywizu --- czy bez dywizu jest sens to łapać? | |
152 | +#moze_interp( praet_sg li) | |
153 | +#moze_interp( praet_pl li) | |
154 | +#moze_interp( praet_sg_na li) | |
155 | +#moze_interp( fin li) | |
156 | + | |
136 | 157 | [segment types] |
137 | 158 | naj |
138 | 159 | nie |
139 | 160 | prefs |
140 | 161 | prefv |
162 | +prefa | |
141 | 163 | dig |
142 | 164 | adja |
143 | 165 | adj |
... | ... | @@ -161,11 +183,14 @@ naj naj |
161 | 183 | nie nie |
162 | 184 | prefs prefs |
163 | 185 | prefv prefv |
186 | +prefa prefa | |
164 | 187 | dig dig |
165 | 188 | adja adja |
166 | 189 | adj adj:%:pos |
167 | 190 | adj_sup adj:%:sup |
168 | 191 | adj_sup adv:sup |
192 | +adj_com adj:%:com | |
193 | +adj_com adj:%:com | |
169 | 194 | negat ger:%:neg |
170 | 195 | negat pact:%:neg |
171 | 196 | negat ppas:%:neg |
... | ... | @@ -173,26 +198,35 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep |
173 | 198 | z_on_agl prep:% |
174 | 199 | samotny brev:pun |
175 | 200 | samotny brev:npun |
176 | -samotny intrj | |
201 | +samotny interj | |
177 | 202 | interp interp |
178 | 203 | aglsg aglt:sg:% |
179 | 204 | aglpl aglt:pl:% |
180 | -praetcond cond:% | |
181 | -praetcond praet:%:pri:% | |
182 | -praetcond praet:%:sec:% | |
183 | -praetcond praet:%:ter:% | |
184 | 205 | praet_sg_agl praet:sg:%:agl |
185 | 206 | praet_sg_na praet:sg:%:nagl |
186 | 207 | praet_sg praet:sg:% |
187 | 208 | praet_pl praet:pl:% |
188 | 209 | praet_sg winien:sg:% |
189 | 210 | praet_pl winien:pl:% |
211 | +fin fin:% | |
212 | +nomina subst:% | |
213 | +nomina ger:% | |
214 | +nomina depr:% | |
215 | +adjectiva adv:% | |
216 | +adjectiva ppas:% | |
217 | +adjectiva pact:% | |
218 | +verba_imperf praet:%:imperf | |
219 | +verba_imperf fin:%:imperf | |
220 | +verba_imperf inf:imperf | |
221 | +verba_imperf imps:imperf | |
222 | +verba_imperf impt:%:imperf | |
190 | 223 | samodz % |
191 | 224 | |
192 | 225 | [lexemes] |
193 | 226 | z_aglt aby:comp |
194 | 227 | z_aglt bowiem:comp |
195 | 228 | by by:qub |
229 | +li li:qub | |
196 | 230 | z_aglt by:comp |
197 | 231 | z_aglt cóż:subst |
198 | 232 | z_aglt czemu:adv |
... | ... |
input/segmenty1.dat
... | ... | @@ -7,9 +7,10 @@ praet=split composite |
7 | 7 | |
8 | 8 | #define moze_interp(segmenty) wsz_interp segmenty wsz_interp |
9 | 9 | |
10 | +dig>* dig | |
10 | 11 | (adja dywiz)+ adj |
11 | -dig!>+ | |
12 | -dig!> dig!> dig!> | |
12 | +#dig!>+ | |
13 | +#dig!> dig!> dig!> | |
13 | 14 | naj> adj_sup |
14 | 15 | |
15 | 16 | [segment types] |
... | ... | @@ -52,20 +53,10 @@ on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep |
52 | 53 | z_on_agl prep:% |
53 | 54 | samotny brev:pun |
54 | 55 | samotny brev:npun |
55 | -samotny intrj | |
56 | +samotny interj | |
56 | 57 | interp interp |
57 | 58 | aglsg aglt:sg:% |
58 | 59 | aglpl aglt:pl:% |
59 | -praetcond cond:% | |
60 | -praetcond praet:%:pri:% | |
61 | -praetcond praet:%:sec:% | |
62 | -praetcond praet:%:ter:% | |
63 | -praet_sg_agl praet:sg:%:agl | |
64 | -praet_sg_na praet:sg:%:nagl | |
65 | -praet_sg praet:sg:% | |
66 | -praet_pl praet:pl:% | |
67 | -praet_sg winien:sg:% | |
68 | -praet_pl winien:pl:% | |
69 | 60 | samodz % |
70 | 61 | |
71 | 62 | [lexemes] |
... | ... |
morfeusz/InterpretedChunk.hpp
morfeusz/Morfeusz.cpp
... | ... | @@ -37,11 +37,19 @@ static MorfeuszOptions createDefaultOptions() { |
37 | 37 | return res; |
38 | 38 | } |
39 | 39 | |
40 | +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { | |
41 | + SegrulesOptions opts; | |
42 | + opts["aggl"] = "isolated"; | |
43 | + opts["praet"] = "split"; | |
44 | + return (*(map.find(opts))).second; | |
45 | +} | |
46 | + | |
40 | 47 | Morfeusz::Morfeusz() |
41 | 48 | : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), |
42 | 49 | analyzerPtr(DEFAULT_FSA), |
43 | 50 | analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), |
44 | 51 | segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), |
52 | +currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), | |
45 | 53 | isAnalyzerFSAFromFile(false), |
46 | 54 | generatorPtr(DEFAULT_SYNTH_FSA), |
47 | 55 | isGeneratorFSAFromFile(false), |
... | ... | @@ -50,9 +58,9 @@ options(createDefaultOptions()) { |
50 | 58 | |
51 | 59 | } |
52 | 60 | |
53 | -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSAType*>& fsasMap) { | |
61 | +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) { | |
54 | 62 | for ( |
55 | - std::map<SegrulesOptions, SegrulesFSAType*>::iterator it = fsasMap.begin(); | |
63 | + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin(); | |
56 | 64 | it != fsasMap.end(); |
57 | 65 | ++it) { |
58 | 66 | delete it->second; |
... | ... | @@ -100,11 +108,8 @@ void Morfeusz::analyzeOneWord( |
100 | 108 | vector<InterpretedChunk> accum; |
101 | 109 | FlexionGraph graph; |
102 | 110 | const char* currInput = inputStart; |
103 | - SegrulesOptions opts; | |
104 | - opts["aggl"] = "isolated"; | |
105 | - opts["praet"] = "split"; | |
106 | - SegrulesFSAType* segrulesFSA = (*(this->segrulesFSAsMap.find(opts))).second; | |
107 | - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->getInitialState()); | |
111 | + SegrulesFSA* segrulesFSA = this->currSegrulesFSA; | |
112 | + doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState); | |
108 | 113 | if (!graph.empty()) { |
109 | 114 | InterpretedChunksDecoder interpretedChunksDecoder(env); |
110 | 115 | int srcNode = startNodeNum; |
... | ... | @@ -118,7 +123,8 @@ void Morfeusz::analyzeOneWord( |
118 | 123 | srcNode++; |
119 | 124 | } |
120 | 125 | // graph.getResults(*this->tagset, results); |
121 | - } else if (inputStart != inputEnd) { | |
126 | + } | |
127 | + else if (inputStart != inputEnd) { | |
122 | 128 | this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); |
123 | 129 | } |
124 | 130 | inputStart = currInput; |
... | ... | @@ -126,9 +132,9 @@ void Morfeusz::analyzeOneWord( |
126 | 132 | |
127 | 133 | static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
128 | 134 | to.prefixChunks.insert( |
129 | - to.prefixChunks.begin(), | |
130 | - from.prefixChunks.begin(), | |
131 | - from.prefixChunks.end()); | |
135 | + to.prefixChunks.begin(), | |
136 | + from.prefixChunks.begin(), | |
137 | + from.prefixChunks.end()); | |
132 | 138 | to.prefixChunks.push_back(from); |
133 | 139 | from.orthWasShifted = true; |
134 | 140 | } |
... | ... | @@ -138,7 +144,8 @@ void Morfeusz::doAnalyzeOneWord( |
138 | 144 | const char* inputEnd, |
139 | 145 | vector<InterpretedChunk>& accum, |
140 | 146 | FlexionGraph& graph, |
141 | - SegrulesStateType segrulesState) const { | |
147 | + SegrulesState segrulesState) const { | |
148 | + // cerr << "doAnalyzeOneWord " << inputData << endl; | |
142 | 149 | bool endOfWord = inputData == inputEnd; |
143 | 150 | const char* currInput = inputData; |
144 | 151 | uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); |
... | ... | @@ -159,16 +166,27 @@ void Morfeusz::doAnalyzeOneWord( |
159 | 166 | vector<InterpsGroup> val(state.getValue()); |
160 | 167 | for (unsigned int i = 0; i < val.size(); i++) { |
161 | 168 | InterpsGroup& ig = val[i]; |
162 | - cerr << (int) ig.type << endl; | |
163 | - SegrulesStateType newSegrulesState = segrulesState; | |
164 | - newSegrulesState.proceedToNext(ig.type); | |
165 | - if (!newSegrulesState.isSink()) { | |
166 | - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | |
167 | - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | |
168 | - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; | |
169 | - if (!accum.empty() | |
170 | - && (accum.back().shiftOrth | |
171 | - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { | |
169 | + // newSegrulesState.proceedToNext(ig.type); | |
170 | + // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates); | |
171 | + set<SegrulesState> newSegrulesStates; | |
172 | + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
173 | + for ( | |
174 | + set<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
175 | + it != newSegrulesStates.end(); | |
176 | + it++) { | |
177 | + SegrulesState newSegrulesState = *it; | |
178 | + // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | |
179 | + // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | |
180 | + InterpretedChunk ic = { | |
181 | + inputData, | |
182 | + originalCodepoints, | |
183 | + lowercaseCodepoints, | |
184 | + ig, | |
185 | + newSegrulesState.shiftOrthFromPrevious, | |
186 | + false, | |
187 | + vector<InterpretedChunk>() | |
188 | + }; | |
189 | + if (!accum.empty() && accum.back().shiftOrth) { | |
172 | 190 | doShiftOrth(accum.back(), ic); |
173 | 191 | } |
174 | 192 | accum.push_back(ic); |
... | ... | @@ -182,27 +200,37 @@ void Morfeusz::doAnalyzeOneWord( |
182 | 200 | this->env.getCharsetConverter().next(currInput, inputEnd); |
183 | 201 | } |
184 | 202 | } |
203 | + // cerr << "end of word" << endl; | |
185 | 204 | // we are at the end of word |
186 | 205 | if (state.isAccepting()) { |
187 | 206 | vector<InterpsGroup > val(state.getValue()); |
188 | 207 | for (unsigned int i = 0; i < val.size(); i++) { |
189 | 208 | InterpsGroup& ig = val[i]; |
190 | - SegrulesStateType newSegrulesState = segrulesState; | |
191 | - newSegrulesState.proceedToNext(ig.type); | |
192 | - if (newSegrulesState.isAccepting()) { | |
193 | - bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | |
194 | - bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | |
195 | - InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig, shiftOrth, shiftOrthSameType, false}; | |
196 | - if (!accum.empty() | |
197 | - && (accum.back().shiftOrth | |
198 | - || (accum.back().shiftOrthSameType && accum.back().interpsGroup.type == ig.type))) { | |
199 | - doShiftOrth(accum.back(), ic); | |
209 | + // cerr << "currInput=" << currInput << endl; | |
210 | + // cerr << "type=" << (int) ig.type << endl; | |
211 | + set<SegrulesState> newSegrulesStates; | |
212 | + currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
213 | + for ( | |
214 | + set<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
215 | + it != newSegrulesStates.end(); | |
216 | + it++) { | |
217 | + SegrulesState newSegrulesState = *it; | |
218 | + if (newSegrulesState.accepting) { | |
219 | + InterpretedChunk ic = { | |
220 | + inputData, | |
221 | + originalCodepoints, | |
222 | + lowercaseCodepoints, | |
223 | + ig, | |
224 | + newSegrulesState.shiftOrthFromPrevious, | |
225 | + false, | |
226 | + vector<InterpretedChunk>()}; | |
227 | + if (!accum.empty() && accum.back().shiftOrth) { | |
228 | + doShiftOrth(accum.back(), ic); | |
229 | + } | |
230 | + accum.push_back(ic); | |
231 | + graph.addPath(accum); | |
232 | + accum.pop_back(); | |
200 | 233 | } |
201 | - accum.push_back(ic); | |
202 | - graph.addPath(accum); | |
203 | - accum.pop_back(); | |
204 | - } else if (!newSegrulesState.isSink()) { | |
205 | - } else { | |
206 | 234 | } |
207 | 235 | } |
208 | 236 | } |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -12,6 +12,7 @@ |
12 | 12 | #include <list> |
13 | 13 | #include <vector> |
14 | 14 | #include <map> |
15 | +#include <set> | |
15 | 16 | #include "EncodedInterpretation.hpp" |
16 | 17 | #include "fsa/fsa.hpp" |
17 | 18 | #include "MorphInterpretation.hpp" |
... | ... | @@ -27,6 +28,7 @@ |
27 | 28 | #include "Environment.hpp" |
28 | 29 | |
29 | 30 | #include "segrules/segrules.hpp" |
31 | +#include "segrules/SegrulesFSA.hpp" | |
30 | 32 | |
31 | 33 | class Morfeusz; |
32 | 34 | class ResultsIterator; |
... | ... | @@ -111,7 +113,7 @@ private: |
111 | 113 | const char* inputEnd, |
112 | 114 | std::vector<InterpretedChunk>& accum, |
113 | 115 | FlexionGraph& graph, |
114 | - SegrulesStateType segrulesState) const; | |
116 | + SegrulesState segrulesState) const; | |
115 | 117 | |
116 | 118 | void appendIgnotiumToResults( |
117 | 119 | const std::string& word, |
... | ... | @@ -120,17 +122,13 @@ private: |
120 | 122 | Environment env; |
121 | 123 | const unsigned char* analyzerPtr; |
122 | 124 | FSAType* analyzerFSA; |
123 | - std::map<SegrulesOptions, SegrulesFSAType*> segrulesFSAsMap; | |
125 | + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; | |
126 | + SegrulesFSA* currSegrulesFSA; | |
124 | 127 | bool isAnalyzerFSAFromFile; |
125 | 128 | |
126 | 129 | const unsigned char* generatorPtr; |
127 | 130 | bool isGeneratorFSAFromFile; |
128 | 131 | Generator generator; |
129 | -// const CharsetConverter* charsetConverter; | |
130 | -// const Tagset* tagset; | |
131 | -// const CaseConverter* caseConverter; | |
132 | -// | |
133 | -// UTF8CharsetConverter utf8CharsetConverter; | |
134 | 132 | |
135 | 133 | MorfeuszOptions options; |
136 | 134 | }; |
... | ... |
morfeusz/segrules/SegrulesFSA.hpp
0 → 100644
1 | +/* | |
2 | + * File: SegrulesFSA.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 12 marzec 2014, 17:52 | |
6 | + */ | |
7 | + | |
8 | +#ifndef SEGRULESFSA_HPP | |
9 | +#define SEGRULESFSA_HPP | |
10 | + | |
11 | +#include <set> | |
12 | +#include "../endianness.hpp" | |
13 | + | |
14 | +struct SegrulesState { | |
15 | + uint16_t offset; | |
16 | + bool accepting; | |
17 | + bool weak; | |
18 | + bool shiftOrthFromPrevious; | |
19 | +}; | |
20 | + | |
21 | +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) | |
22 | +{ | |
23 | + return s1.offset < s2.offset; | |
24 | +} | |
25 | + | |
26 | +class SegrulesFSA { | |
27 | +public: | |
28 | + SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) { | |
29 | + SegrulesState state = {0, false, false, false}; | |
30 | + initialState = state; | |
31 | + } | |
32 | + | |
33 | + void proceedToNext( | |
34 | + const unsigned char segnum, | |
35 | + const SegrulesState state, | |
36 | + std::set<SegrulesState>& newStates) const { | |
37 | + | |
38 | + const unsigned char* currPtr = ptr + state.offset; | |
39 | + currPtr++; | |
40 | + const unsigned char transitionsNum = *currPtr; | |
41 | + currPtr++; | |
42 | + for (unsigned int i = 0; i < transitionsNum; i++) { | |
43 | + if (*currPtr == segnum) { | |
44 | + newStates.insert(newStates.begin(), this->transition2State(currPtr)); | |
45 | + } | |
46 | + currPtr += 4; | |
47 | + } | |
48 | + } | |
49 | + | |
50 | + virtual ~SegrulesFSA() {} | |
51 | + | |
52 | + SegrulesState initialState; | |
53 | +private: | |
54 | + const unsigned char* ptr; | |
55 | + | |
56 | + SegrulesState transition2State(const unsigned char* transitionPtr) const { | |
57 | + unsigned char ACCEPTING_FLAG = 1; | |
58 | + unsigned char WEAK_FLAG = 2; | |
59 | + SegrulesState res; | |
60 | + transitionPtr++; | |
61 | + res.shiftOrthFromPrevious = *transitionPtr; | |
62 | + transitionPtr++; | |
63 | + res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); | |
64 | + res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; | |
65 | + res.weak = *(ptr + res.offset) & WEAK_FLAG; | |
66 | + return res; | |
67 | + } | |
68 | +}; | |
69 | + | |
70 | +#endif /* SEGRULESFSA_HPP */ | |
71 | + | |
... | ... |
morfeusz/segrules/segrules.cpp
... | ... | @@ -33,23 +33,23 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { |
33 | 33 | return res; |
34 | 34 | } |
35 | 35 | |
36 | -static inline SegrulesFSAType* deserializeFSA(const unsigned char*& ptr) { | |
36 | +static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { | |
37 | 37 | uint32_t fsaSize = deserializeUint32(ptr); |
38 | - static SegrulesDeserializer deserializer; | |
39 | - SegrulesFSAType* res = SegrulesFSAType::getFSA(ptr, deserializer); | |
38 | +// static SegrulesDeserializer deserializer; | |
39 | + SegrulesFSA* res = new SegrulesFSA(ptr); | |
40 | 40 | ptr += fsaSize; |
41 | 41 | return res; |
42 | 42 | } |
43 | 43 | |
44 | -map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { | |
45 | - map<SegrulesOptions, SegrulesFSAType*> res; | |
44 | +map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr) { | |
45 | + map<SegrulesOptions, SegrulesFSA*> res; | |
46 | 46 | const unsigned char* fsasMapPtr = getFSAsMapPtr(analyzerPtr); |
47 | 47 | const unsigned char* currPtr = fsasMapPtr; |
48 | 48 | unsigned char fsasNum = *currPtr; |
49 | 49 | currPtr++; |
50 | 50 | for (unsigned char i = 0; i < fsasNum; i++) { |
51 | 51 | SegrulesOptions options = deserializeOptions(currPtr); |
52 | - SegrulesFSAType* fsa = deserializeFSA(currPtr); | |
52 | + SegrulesFSA* fsa = deserializeFSA(currPtr); | |
53 | 53 | res[options] = fsa; |
54 | 54 | } |
55 | 55 | return res; |
... | ... |
morfeusz/segrules/segrules.hpp
... | ... | @@ -11,13 +11,13 @@ |
11 | 11 | #include <utility> |
12 | 12 | #include <map> |
13 | 13 | #include <string> |
14 | -#include "../fsa/fsa.hpp" | |
14 | +#include "SegrulesFSA.hpp" | |
15 | 15 | |
16 | 16 | typedef std::map<std::string, std::string> SegrulesOptions; |
17 | -typedef State<unsigned char> SegrulesStateType; | |
18 | -typedef FSA<unsigned char> SegrulesFSAType; | |
17 | +//typedef State<unsigned char> SegrulesStateType; | |
18 | +//typedef FSA<unsigned char> SegrulesFSAType; | |
19 | 19 | |
20 | -std::map<SegrulesOptions, SegrulesFSAType*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); | |
20 | +std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); | |
21 | 21 | |
22 | 22 | #endif /* SEGRULES_HPP */ |
23 | 23 | |
... | ... |
nbproject/configurations.xml
... | ... | @@ -106,14 +106,20 @@ |
106 | 106 | </makeTool> |
107 | 107 | </makefileType> |
108 | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | + <ccTool flags="1"> | |
110 | + </ccTool> | |
109 | 111 | </item> |
110 | 112 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | + <ccTool flags="1"> | |
114 | + </ccTool> | |
111 | 115 | </item> |
112 | 116 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | 117 | </item> |
114 | 118 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
115 | 119 | </item> |
116 | 120 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
121 | + <ccTool flags="1"> | |
122 | + </ccTool> | |
117 | 123 | </item> |
118 | 124 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
119 | 125 | ex="false" |
... | ... | @@ -121,7 +127,6 @@ |
121 | 127 | flavor2="8"> |
122 | 128 | <ccTool> |
123 | 129 | <incDir> |
124 | - <pElem>build</pElem> | |
125 | 130 | <pElem>/usr/lib/jvm/default-java/include</pElem> |
126 | 131 | <pElem>morfeusz</pElem> |
127 | 132 | <pElem>build/morfeusz/java</pElem> |
... | ... | @@ -145,7 +150,6 @@ |
145 | 150 | flavor2="8"> |
146 | 151 | <ccTool> |
147 | 152 | <incDir> |
148 | - <pElem>build</pElem> | |
149 | 153 | <pElem>/usr/include/python2.7</pElem> |
150 | 154 | <pElem>morfeusz</pElem> |
151 | 155 | <pElem>build/morfeusz/python</pElem> |
... | ... | @@ -173,9 +177,8 @@ |
173 | 177 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
174 | 178 | <ccTool flags="1"> |
175 | 179 | <incDir> |
176 | - <pElem>build1</pElem> | |
177 | 180 | <pElem>morfeusz</pElem> |
178 | - <pElem>build1/morfeusz</pElem> | |
181 | + <pElem>morfeusz/build/morfeusz</pElem> | |
179 | 182 | </incDir> |
180 | 183 | <preprocessorList> |
181 | 184 | <Elem>libmorfeusz_EXPORTS</Elem> |
... | ... | @@ -185,9 +188,8 @@ |
185 | 188 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
186 | 189 | <ccTool flags="1"> |
187 | 190 | <incDir> |
188 | - <pElem>build1</pElem> | |
189 | 191 | <pElem>morfeusz</pElem> |
190 | - <pElem>build1/morfeusz</pElem> | |
192 | + <pElem>morfeusz/build/morfeusz</pElem> | |
191 | 193 | </incDir> |
192 | 194 | <preprocessorList> |
193 | 195 | <Elem>libmorfeusz_EXPORTS</Elem> |
... | ... | @@ -266,12 +268,18 @@ |
266 | 268 | </preprocessorList> |
267 | 269 | </ccTool> |
268 | 270 | </folder> |
269 | - <folder path="morfeusz/java"> | |
271 | + <folder path="morfeusz"> | |
270 | 272 | <ccTool> |
271 | 273 | <incDir> |
272 | 274 | <pElem>build</pElem> |
275 | + </incDir> | |
276 | + </ccTool> | |
277 | + </folder> | |
278 | + <folder path="morfeusz/java"> | |
279 | + <ccTool> | |
280 | + <incDir> | |
273 | 281 | <pElem>morfeusz</pElem> |
274 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
282 | + <pElem>/usr/lib/jvm/default-java/include</pElem> | |
275 | 283 | </incDir> |
276 | 284 | <preprocessorList> |
277 | 285 | <Elem>libjmorfeusz_EXPORTS</Elem> |
... | ... | @@ -281,7 +289,6 @@ |
281 | 289 | <folder path="morfeusz/python"> |
282 | 290 | <ccTool> |
283 | 291 | <incDir> |
284 | - <pElem>build</pElem> | |
285 | 292 | <pElem>/usr/include/python2.7</pElem> |
286 | 293 | <pElem>morfeusz</pElem> |
287 | 294 | </incDir> |
... | ... | @@ -407,18 +414,26 @@ |
407 | 414 | </ccTool> |
408 | 415 | </item> |
409 | 416 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
417 | + <ccTool flags="1"> | |
418 | + </ccTool> | |
410 | 419 | </item> |
411 | 420 | <item path="morfeusz/charset/CharsetConverter.cpp" |
412 | 421 | ex="false" |
413 | 422 | tool="1" |
414 | 423 | flavor2="4"> |
424 | + <ccTool flags="1"> | |
425 | + </ccTool> | |
415 | 426 | </item> |
416 | 427 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
428 | + <ccTool flags="1"> | |
429 | + </ccTool> | |
417 | 430 | </item> |
418 | 431 | <item path="morfeusz/charset/conversion_tables.cpp" |
419 | 432 | ex="false" |
420 | 433 | tool="1" |
421 | 434 | flavor2="4"> |
435 | + <ccTool flags="1"> | |
436 | + </ccTool> | |
422 | 437 | </item> |
423 | 438 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
424 | 439 | <ccTool flags="1"> |
... | ... | @@ -507,8 +522,12 @@ |
507 | 522 | ex="false" |
508 | 523 | tool="1" |
509 | 524 | flavor2="4"> |
525 | + <ccTool flags="1"> | |
526 | + </ccTool> | |
510 | 527 | </item> |
511 | 528 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
529 | + <ccTool flags="1"> | |
530 | + </ccTool> | |
512 | 531 | </item> |
513 | 532 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
514 | 533 | <ccTool flags="0"> |
... | ... |