Commit f1e52ff44027610390237bd862449c85c4a044cc
1 parent: de0e960d
poprawienie czasu działania, przebudowanie analizatora tak, by nie powielać kodu w generatorze, poprawienie rozpoznawania pierwszego segmentu w grafie fleksyjnym
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 17 changed files with 411 additions and 205 deletions
CMakeLists.txt
... | ... | @@ -5,7 +5,7 @@ project (Morfeusz) |
5 | 5 | set (Morfeusz_VERSION_MAJOR 2) |
6 | 6 | set (Morfeusz_VERSION_MINOR 0) |
7 | 7 | set (Morfeusz_VERSION_PATCH 0) |
8 | -set (CMAKE_BUILD_TYPE "Debug") | |
8 | +set (CMAKE_BUILD_TYPE "Release") | |
9 | 9 | |
10 | 10 | enable_testing() |
11 | 11 | |
... | ... | @@ -47,7 +47,7 @@ endif () |
47 | 47 | |
48 | 48 | # SEGMENT_RULES_FILE |
49 | 49 | if ("${SEGMENT_RULES_FILE}" STREQUAL "") |
50 | - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") | |
50 | + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat") | |
51 | 51 | endif () |
52 | 52 | |
53 | 53 | message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): |
40 | 40 | self.suffixToAdd = bestEncodedForm.suffixToAdd |
41 | 41 | self.prefixToAdd = targetWord[:bestPrefixLength] |
42 | 42 | |
43 | -class Interpretation(object): | |
43 | +class Interpretation4Analyzer(object): | |
44 | 44 | |
45 | 45 | def __init__(self, orth, base, tagnum, namenum, typenum): |
46 | - self.lemma = EncodedForm(orth, base) | |
46 | + self.encodedForm = EncodedForm(orth, base) | |
47 | 47 | self.tagnum = tagnum |
48 | 48 | self.namenum = namenum |
49 | 49 | self.typenum = typenum |
50 | 50 | |
51 | 51 | def getSortKey(self): |
52 | 52 | return ( |
53 | - self.lemma.cutLength, | |
54 | - tuple(self.lemma.suffixToAdd), | |
55 | - tuple(self.lemma.casePattern), | |
53 | + self.encodedForm.cutLength, | |
54 | + tuple(self.encodedForm.suffixToAdd), | |
55 | + tuple(self.encodedForm.casePattern), | |
56 | 56 | self.tagnum, |
57 | 57 | self.namenum) |
58 | 58 | |
59 | 59 | def __eq__(self, other): |
60 | - if isinstance(other, Interpretation): | |
60 | + if isinstance(other, Interpretation4Analyzer): | |
61 | 61 | return self.getSortKey() == other.getSortKey() |
62 | 62 | else: |
63 | 63 | return False |
... | ... | @@ -68,8 +68,8 @@ class Interpretation(object): |
68 | 68 | class Interpretation4Generator(object): |
69 | 69 | |
70 | 70 | def __init__(self, orth, base, tagnum, namenum, typenum): |
71 | - self.lemma = base | |
72 | - self.orth = EncodedFormWithPrefix(base, orth) | |
71 | + self.encodedForm = base | |
72 | + self.encodedForm = EncodedFormWithPrefix(base, orth) | |
73 | 73 | self.tagnum = tagnum |
74 | 74 | self.namenum = namenum |
75 | 75 | self.typenum = typenum |
... | ... | @@ -77,9 +77,9 @@ class Interpretation4Generator(object): |
77 | 77 | def getSortKey(self): |
78 | 78 | return ( |
79 | 79 | self.tagnum, |
80 | - self.orth.cutLength, | |
81 | - tuple(self.orth.suffixToAdd), | |
82 | -# tuple(self.lemma.casePattern), | |
80 | + self.encodedForm.cutLength, | |
81 | + tuple(self.encodedForm.suffixToAdd), | |
82 | +# tuple(self.encodedForm.casePattern), | |
83 | 83 | self.namenum) |
84 | 84 | |
85 | 85 | def __eq__(self, other): |
... | ... | @@ -92,7 +92,7 @@ class Interpretation4Generator(object): |
92 | 92 | return hash(self.getSortKey()) |
93 | 93 | |
94 | 94 | def __unicode__(self): |
95 | - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
95 | + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
96 | 96 | |
97 | 97 | def __repr__(self): |
98 | 98 | return unicode(self) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -4,7 +4,7 @@ Created on Oct 23, 2013 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | import logging |
7 | -from common import Interpretation | |
7 | +from common import Interpretation4Analyzer | |
8 | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
9 | 9 | |
10 | 10 | def _mergeEntries(inputLines): |
... | ... | @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): |
74 | 74 | tagnum = int(tagnum) |
75 | 75 | namenum = int(namenum) |
76 | 76 | typenum = int(typenum) |
77 | - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum)) | |
77 | + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum)) | |
78 | 78 | |
79 | 79 | def convert(self, inputLines): |
80 | 80 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -5,6 +5,7 @@ Created on Oct 23, 2013 |
5 | 5 | ''' |
6 | 6 | |
7 | 7 | import logging |
8 | +from morfeuszbuilder.utils import serializationUtils | |
8 | 9 | |
9 | 10 | class Encoder(object): |
10 | 11 | ''' |
... | ... | @@ -96,6 +97,54 @@ class Encoder(object): |
96 | 97 | def _encodeNameNum(self, namenum): |
97 | 98 | assert namenum < 256 and namenum >= 0 |
98 | 99 | return bytearray([namenum]) |
100 | + | |
101 | + def _groupInterpsByType(self, interpsList): | |
102 | + res = {} | |
103 | + for interp in interpsList: | |
104 | + res.setdefault(interp.typenum, []) | |
105 | + res[interp.typenum].append(interp) | |
106 | + return res | |
107 | + | |
108 | + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix): | |
109 | + res = bytearray() | |
110 | + res.extend(self._encodeTypeNum(typenum)) | |
111 | + | |
112 | + encodedInterpsList = bytearray() | |
113 | + for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
114 | + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | |
115 | + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | |
116 | + encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | |
117 | + | |
118 | + res.extend(serializationUtils.htons(len(encodedInterpsList))) | |
119 | + res.extend(encodedInterpsList) | |
120 | + return res | |
121 | + | |
122 | + def _doEncodeData(self, interpsList, withCasePattern, withPrefix): | |
123 | + | |
124 | + assert type(interpsList) == frozenset | |
125 | + | |
126 | + segnum2Interps = self._groupInterpsByType(interpsList) | |
127 | + | |
128 | + | |
129 | + res = bytearray() | |
130 | + firstByte = len(segnum2Interps) | |
131 | + assert firstByte < 256 | |
132 | + assert firstByte > 0 | |
133 | + res.append(firstByte) | |
134 | + | |
135 | + for typenum, interpsList in segnum2Interps.iteritems(): | |
136 | + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix)) | |
137 | + | |
138 | + | |
139 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
140 | +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum)) | |
141 | +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | |
142 | +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | |
143 | +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | |
144 | + del interpsList | |
145 | +# res.extend(serializationUtils.htons(len(encodedInterpsList))) | |
146 | +# res.extend(encodedInterpsList) | |
147 | + return res | |
99 | 148 | |
100 | 149 | class MorphEncoder(Encoder): |
101 | 150 | |
... | ... | @@ -106,19 +155,20 @@ class MorphEncoder(Encoder): |
106 | 155 | self.LEMMA_MIXED_CASE = 2 |
107 | 156 | |
108 | 157 | def encodeData(self, interpsList): |
109 | - res = bytearray() | |
110 | - firstByte = len(interpsList) | |
111 | - assert firstByte < 256 | |
112 | - assert firstByte > 0 | |
113 | - res.append(firstByte) | |
114 | - assert type(interpsList) == frozenset | |
115 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
116 | - res.extend(self._encodeTypeNum(interp.typenum)) | |
117 | - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False)) | |
118 | - res.extend(self._encodeTagNum(interp.tagnum)) | |
119 | - res.extend(self._encodeNameNum(interp.namenum)) | |
120 | - del interpsList | |
121 | - return res | |
158 | + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False) | |
159 | +# res = bytearray() | |
160 | +# firstByte = len(interpsList) | |
161 | +# assert firstByte < 256 | |
162 | +# assert firstByte > 0 | |
163 | +# res.append(firstByte) | |
164 | +# assert type(interpsList) == frozenset | |
165 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
166 | +# res.extend(self._encodeTypeNum(interp.typenum)) | |
167 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False)) | |
168 | +# res.extend(self._encodeTagNum(interp.tagnum)) | |
169 | +# res.extend(self._encodeNameNum(interp.namenum)) | |
170 | +# del interpsList | |
171 | +# return res | |
122 | 172 | |
123 | 173 | class Encoder4Generator(Encoder): |
124 | 174 | |
... | ... | @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): |
126 | 176 | super(Encoder4Generator, self).__init__(encoding) |
127 | 177 | |
128 | 178 | def encodeData(self, interpsList): |
129 | - res = bytearray() | |
130 | - firstByte = len(interpsList) | |
131 | - assert firstByte < 256 | |
132 | - assert firstByte > 0 | |
133 | - res.append(firstByte) | |
134 | - assert type(interpsList) == frozenset | |
135 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
136 | - res.extend(self._encodeTypeNum(interp.typenum)) | |
137 | - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True)) | |
138 | - res.extend(self._encodeTagNum(interp.tagnum)) | |
139 | - res.extend(self._encodeNameNum(interp.namenum)) | |
140 | - return res | |
179 | + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True) | |
180 | +# res = bytearray() | |
181 | +# firstByte = len(interpsList) | |
182 | +# assert firstByte < 256 | |
183 | +# assert firstByte > 0 | |
184 | +# res.append(firstByte) | |
185 | +# assert type(interpsList) == frozenset | |
186 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
187 | +# res.extend(self._encodeTypeNum(interp.typenum)) | |
188 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True)) | |
189 | +# res.extend(self._encodeTagNum(interp.tagnum)) | |
190 | +# res.extend(self._encodeNameNum(interp.namenum)) | |
191 | +# return res | |
141 | 192 | # |
142 | 193 | # def decodeData(self, data): |
143 | 194 | # |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -106,7 +106,7 @@ class Segtypes(object): |
106 | 106 | lineNum, |
107 | 107 | re.match(r'[a-z_]+', segtype)) |
108 | 108 | self._validate( |
109 | - u'Pattern must contain lemma and part-of-speech fields', | |
109 | + u'Pattern must contain encodedForm and part-of-speech fields', | |
110 | 110 | lineNum, |
111 | 111 | re.match(r'.+\:[a-z_]+', pattern, re.U)) |
112 | 112 | |
... | ... | @@ -146,13 +146,13 @@ class Segtypes(object): |
146 | 146 | |
147 | 147 | # index lexemes |
148 | 148 | for p in self.patternsList: |
149 | - if p.lemma: | |
149 | + if p.encodedForm: | |
150 | 150 | for tag in self.tagset.getAllTags(): |
151 | 151 | tagnum = self.tagset.getTagnum4Tag(tag) |
152 | - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum: | |
153 | - segnum = p.tryToMatch(p.lemma, tag) | |
152 | + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum: | |
153 | + segnum = p.tryToMatch(p.encodedForm, tag) | |
154 | 154 | if segnum != -1: |
155 | - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum | |
155 | + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum | |
156 | 156 | # logging.info('indexing segment type numbers - done') |
157 | 157 | # self._debugSegnums() |
158 | 158 | |
... | ... | @@ -171,7 +171,7 @@ class Segtypes(object): |
171 | 171 | class SegtypePattern(object): |
172 | 172 | |
173 | 173 | def __init__(self, lemma, pattern, segnum): |
174 | - self.lemma = lemma | |
174 | + self.encodedForm = lemma | |
175 | 175 | self.pattern = pattern |
176 | 176 | self.segnum = segnum |
177 | 177 | |
... | ... | @@ -181,7 +181,7 @@ class SegtypePattern(object): |
181 | 181 | patterns2Match = [] |
182 | 182 | patterns2Match.append(self.pattern.replace('%', '.*')) |
183 | 183 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
184 | - if (self.lemma is None or self.lemma == lemma) \ | |
184 | + if (self.encodedForm is None or self.encodedForm == lemma) \ | |
185 | 185 | and any([re.match(p, tag) for p in patterns2Match]): |
186 | 186 | return self.segnum |
187 | 187 | else: |
... | ... |
input/dodatki.tab
input/segmenty.dat
... | ... | @@ -142,6 +142,7 @@ samodz dywiz adj |
142 | 142 | # Stopień najwyższy: |
143 | 143 | # np. „naj·zieleńszy”, „naj·mądrzej” |
144 | 144 | moze_interp( naj> adj_sup ) |
145 | +moze_interp( nie> naj> adj_sup ) | |
145 | 146 | # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj |
146 | 147 | moze_interp( praet_sg dywiz li) |
147 | 148 | moze_interp( praet_pl dywiz li) |
... | ... |
input/segmenty1.dat
... | ... | @@ -52,11 +52,14 @@ naj naj |
52 | 52 | nie nie |
53 | 53 | prefs prefs |
54 | 54 | prefv prefv |
55 | +prefa prefa | |
55 | 56 | dig dig |
56 | 57 | adja adja |
57 | 58 | adj adj:%:pos |
58 | 59 | adj_sup adj:%:sup |
59 | 60 | adj_sup adv:sup |
61 | +adj_com adj:%:com | |
62 | +adj_com adj:%:com | |
60 | 63 | negat ger:%:neg |
61 | 64 | negat pact:%:neg |
62 | 65 | negat ppas:%:neg |
... | ... | @@ -69,6 +72,22 @@ interp interp |
69 | 72 | aglsg aglt:sg:% |
70 | 73 | aglpl aglt:pl:% |
71 | 74 | samodz % |
75 | +praet_fin praet:% | |
76 | +praet_fin fin:% | |
77 | +li li:qub:% | |
78 | +nomina subst:% | |
79 | +nomina ger:% | |
80 | +nomina depr:% | |
81 | +adjectiva adj:% | |
82 | +adjectiva adv:% | |
83 | +adjectiva ppas:% | |
84 | +adjectiva pact:% | |
85 | +verba_imperf praet:%:imperf | |
86 | +verba_imperf fin:%:imperf | |
87 | +verba_imperf inf:imperf | |
88 | +verba_imperf imps:imperf | |
89 | +verba_imperf impt:imperf | |
90 | + | |
72 | 91 | |
73 | 92 | [lexemes] |
74 | 93 | z_aglt aby:comp |
... | ... |
morfeusz/EncodedInterpretation.hpp
morfeusz/Environment.cpp
... | ... | @@ -13,10 +13,12 @@ |
13 | 13 | //class InterpretedChunksDecoder4Analyzer; |
14 | 14 | //class InterpretedChunksDecoder4Generator; |
15 | 15 | |
16 | -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { | |
17 | - static Deserializer < vector < InterpsGroup > > *deserializer | |
16 | +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) { | |
17 | + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer | |
18 | 18 | = new MorphDeserializer(); |
19 | - return deserializer; | |
19 | + static Deserializer < vector < InterpsGroup > > *generatorDeserializer | |
20 | + = new MorphDeserializer(); | |
21 | + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer); | |
20 | 22 | } |
21 | 23 | |
22 | 24 | static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { |
... | ... | @@ -48,14 +50,15 @@ Environment::Environment( |
48 | 50 | caseConverter(), |
49 | 51 | tagset(fsaFileStartPtr), |
50 | 52 | fsaFileStartPtr(fsaFileStartPtr), |
51 | - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), | |
53 | + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), | |
52 | 54 | segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), |
53 | 55 | currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), |
54 | 56 | isFromFile(false), |
55 | 57 | chunksDecoder( |
56 | 58 | processorType == ANALYZER |
57 | 59 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) |
58 | - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) | |
60 | + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), | |
61 | + processorType(processorType) | |
59 | 62 | { |
60 | 63 | } |
61 | 64 | |
... | ... | @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { |
110 | 113 | delete this->fsaFileStartPtr; |
111 | 114 | } |
112 | 115 | this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); |
113 | - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); | |
116 | + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType)); | |
114 | 117 | this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); |
115 | 118 | this->isFromFile = true; |
116 | 119 | } |
... | ... |
morfeusz/Environment.hpp
morfeusz/FlexionGraph.cpp
... | ... | @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { |
15 | 15 | this->graph.push_back(vector<Edge>()); |
16 | 16 | this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); |
17 | 17 | } |
18 | -// cerr << string(e.chunk.chunkStartPtr) << endl; | |
19 | 18 | assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); |
20 | 19 | this->graph[0].push_back(e); |
21 | 20 | } |
... | ... | @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { |
30 | 29 | this->graph[startNode].push_back(e); |
31 | 30 | } |
32 | 31 | |
32 | +static inline bool chunkIsAtFront( | |
33 | + const InterpretedChunk& chunk, | |
34 | + const std::vector<InterpretedChunk>& path) { | |
35 | + unsigned int i; | |
36 | + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) { | |
37 | + } | |
38 | + assert(!path[i].orthWasShifted); | |
39 | + return &chunk == &(path[i]); | |
40 | +} | |
41 | + | |
42 | +static inline bool chunkIsAtBack( | |
43 | + const InterpretedChunk& chunk, | |
44 | + const std::vector<InterpretedChunk>& path) { | |
45 | + return &chunk == &(path.back()); | |
46 | +} | |
47 | + | |
48 | +static inline bool chunkIsTheOnlyOne( | |
49 | + const InterpretedChunk& chunk, | |
50 | + const std::vector<InterpretedChunk>& path) { | |
51 | + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); | |
52 | +} | |
53 | + | |
33 | 54 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { |
34 | 55 | // debugPath(path); |
35 | 56 | // debugGraph(this->graph); |
36 | 57 | for (unsigned int i = 0; i < path.size(); i++) { |
37 | 58 | const InterpretedChunk& chunk = path[i]; |
38 | 59 | if (!chunk.orthWasShifted) { |
39 | - if (&chunk == &(path.front()) | |
40 | - && &chunk == &(path.back())) { | |
60 | + if (chunkIsTheOnlyOne(chunk, path)) { | |
41 | 61 | Edge e = {chunk, UINT_MAX}; |
42 | 62 | this->addStartEdge(e); |
43 | 63 | } |
44 | - else if (&chunk == &(path.front())) { | |
64 | + else if (chunkIsAtFront(chunk, path)) { | |
45 | 65 | Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; |
46 | 66 | this->addStartEdge(e); |
47 | 67 | } |
48 | - else if (&chunk == &(path.back())) { | |
68 | + else if (chunkIsAtBack(chunk, path)) { | |
49 | 69 | Edge e = {chunk, UINT_MAX}; |
50 | 70 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
51 | 71 | } |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -18,6 +18,10 @@ |
18 | 18 | #include "charset/CaseConverter.hpp" |
19 | 19 | #include "Environment.hpp" |
20 | 20 | |
21 | +const uint8_t LEMMA_ONLY_LOWER = 0; | |
22 | +const uint8_t LEMMA_UPPER_PREFIX = 1; | |
23 | +const uint8_t LEMMA_MIXED_CASE = 2; | |
24 | + | |
21 | 25 | class InterpretedChunksDecoder { |
22 | 26 | public: |
23 | 27 | |
... | ... | @@ -30,22 +34,12 @@ public: |
30 | 34 | unsigned int endNode, |
31 | 35 | const InterpretedChunk& interpretedChunk, |
32 | 36 | std::vector<MorphInterpretation>& out) const = 0; |
33 | - | |
34 | - virtual ~InterpretedChunksDecoder() {} | |
35 | 37 | |
36 | -protected: | |
37 | - | |
38 | - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | |
39 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
40 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
41 | - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
42 | - decodeForm( | |
43 | - prefixChunk.lowercaseCodepoints, | |
44 | - prefixChunk.interpsGroup.interps[0].value, | |
45 | - decodedForm); | |
46 | - } | |
38 | + virtual ~InterpretedChunksDecoder() { | |
47 | 39 | } |
48 | - | |
40 | + | |
41 | +protected: | |
42 | + | |
49 | 43 | virtual void decodeForm( |
50 | 44 | const std::vector<uint32_t>& orth, |
51 | 45 | const EncodedForm& form, |
... | ... | @@ -55,9 +49,10 @@ protected: |
55 | 49 | }; |
56 | 50 | |
57 | 51 | class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { |
58 | - | |
59 | 52 | public: |
60 | - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} | |
53 | + | |
54 | + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | |
55 | + } | |
61 | 56 | |
62 | 57 | void decode( |
63 | 58 | unsigned int startNode, |
... | ... | @@ -65,22 +60,12 @@ public: |
65 | 60 | const InterpretedChunk& interpretedChunk, |
66 | 61 | std::vector<MorphInterpretation>& out) const { |
67 | 62 | string orth; |
68 | - string lemma; | |
69 | - convertPrefixes(interpretedChunk, orth, lemma); | |
63 | + string lemmaPrefix; | |
64 | + convertPrefixes(interpretedChunk, orth, lemmaPrefix); | |
70 | 65 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
71 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
72 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
73 | - decodeForm( | |
74 | - interpretedChunk.lowercaseCodepoints, | |
75 | - ei.value, | |
76 | - lemma); | |
77 | - out.push_back(MorphInterpretation( | |
78 | - startNode, endNode, | |
79 | - orth, lemma, | |
80 | - ei.tag, | |
81 | - ei.nameClassifier, | |
82 | - env.getTagset(), | |
83 | - env.getCharsetConverter())); | |
66 | + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
67 | + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
68 | + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr)); | |
84 | 69 | } |
85 | 70 | } |
86 | 71 | |
... | ... | @@ -104,36 +89,116 @@ protected: |
104 | 89 | env.getCharsetConverter().append(cp, res); |
105 | 90 | } |
106 | 91 | } |
92 | + | |
93 | +private: | |
94 | + | |
95 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | |
96 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
97 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
98 | + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
99 | + const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
100 | + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr); | |
101 | + decodedForm += mi.getLemma(); | |
102 | + } | |
103 | + } | |
104 | + | |
105 | + MorphInterpretation decodeMorphInterpretation( | |
106 | + unsigned int startNode, unsigned int endNode, | |
107 | + const string& orth, | |
108 | + const string& lemmaPrefix, | |
109 | + const InterpretedChunk& chunk, | |
110 | + const unsigned char*& ptr) const { | |
111 | + string lemma = lemmaPrefix; | |
112 | + EncodedInterpretation ei = this->decodeInterp(ptr); | |
113 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
114 | + return MorphInterpretation( | |
115 | + startNode, endNode, | |
116 | + orth, lemma, | |
117 | + ei.tag, | |
118 | + ei.nameClassifier, | |
119 | + env.getTagset(), | |
120 | + env.getCharsetConverter()); | |
121 | + } | |
122 | + | |
123 | + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const { | |
124 | + lemma.suffixToCut = *ptr; | |
125 | + ptr++; | |
126 | + lemma.suffixToAdd = (const char*) ptr; | |
127 | + ptr += strlen((const char*) ptr) + 1; | |
128 | + assert(lemma.casePattern.size() == 0); | |
129 | + // lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
130 | + uint8_t casePatternType = *ptr; | |
131 | + ptr++; | |
132 | + uint8_t prefixLength; | |
133 | + uint8_t patternLength; | |
134 | + switch (casePatternType) { | |
135 | + case LEMMA_ONLY_LOWER: | |
136 | + break; | |
137 | + case LEMMA_UPPER_PREFIX: | |
138 | + prefixLength = *ptr; | |
139 | + ptr++; | |
140 | + for (unsigned int i = 0; i < prefixLength; i++) { | |
141 | + // lemma.casePattern[i] = true; | |
142 | + lemma.casePattern.push_back(true); | |
143 | + } | |
144 | + // lemma.casePattern.resize(prefixLength, true); | |
145 | + break; | |
146 | + case LEMMA_MIXED_CASE: | |
147 | + patternLength = *ptr; | |
148 | + ptr++; | |
149 | + for (unsigned int i = 0; i < patternLength; i++) { | |
150 | + uint8_t idx = *ptr; | |
151 | + ptr++; | |
152 | + // lemma.casePattern[idx] = true; | |
153 | + lemma.casePattern.resize(idx + 1, false); | |
154 | + lemma.casePattern[idx] = true; | |
155 | + } | |
156 | + break; | |
157 | + } | |
158 | + } | |
159 | + | |
160 | + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const { | |
161 | + EncodedInterpretation interp; | |
162 | + decodeLemma(ptr, interp.value); | |
163 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); | |
164 | + ptr += 2; | |
165 | + interp.nameClassifier = *ptr; | |
166 | + ptr++; | |
167 | + return interp; | |
168 | + } | |
107 | 169 | }; |
108 | 170 | |
109 | 171 | class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { |
110 | - | |
111 | 172 | public: |
112 | - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} | |
173 | + | |
174 | + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | |
175 | + } | |
113 | 176 | |
114 | 177 | void decode( |
115 | 178 | unsigned int startNode, |
116 | 179 | unsigned int endNode, |
117 | 180 | const InterpretedChunk& interpretedChunk, |
118 | 181 | std::vector<MorphInterpretation>& out) const { |
119 | - string orth; | |
120 | - string lemma; | |
121 | - convertPrefixes(interpretedChunk, lemma, orth); | |
122 | - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
123 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
124 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
125 | - decodeForm( | |
126 | - interpretedChunk.originalCodepoints, | |
127 | - ei.value, | |
128 | - orth); | |
129 | - out.push_back(MorphInterpretation( | |
130 | - startNode, endNode, | |
131 | - orth, lemma, | |
132 | - ei.tag, | |
133 | - ei.nameClassifier, | |
134 | - env.getTagset(), | |
135 | - env.getCharsetConverter())); | |
136 | - } | |
182 | + // string orth; | |
183 | + // string lemma; | |
184 | + // convertPrefixes(interpretedChunk, lemma, orth); | |
185 | + // size_t orthLength = orth.length(); | |
186 | + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
187 | + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
188 | + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
189 | + // decodeForm( | |
190 | + // interpretedChunk.originalCodepoints, | |
191 | + // ei.value, | |
192 | + // orth); | |
193 | + // out.push_back(MorphInterpretation( | |
194 | + // startNode, endNode, | |
195 | + // orth, lemma, | |
196 | + // ei.tag, | |
197 | + // ei.nameClassifier, | |
198 | + // env.getTagset(), | |
199 | + // env.getCharsetConverter())); | |
200 | + // orth.erase(orthLength); | |
201 | + // } | |
137 | 202 | } |
138 | 203 | |
139 | 204 | private: |
... | ... |
morfeusz/InterpsGroup.hpp
... | ... | @@ -14,24 +14,26 @@ |
14 | 14 | #include "MorphInterpretation.hpp" |
15 | 15 | #include "Tagset.hpp" |
16 | 16 | |
17 | -class InterpsGroup { | |
18 | -public: | |
19 | - | |
20 | - InterpsGroup() { | |
21 | - | |
22 | - } | |
23 | - | |
24 | - explicit InterpsGroup(const unsigned char type) | |
25 | - : type(type) { | |
26 | - | |
27 | - } | |
28 | - | |
29 | - void addInterpretation(const EncodedInterpretation& interp) { | |
30 | - interps.push_back(interp); | |
31 | - } | |
17 | +struct InterpsGroup { | |
18 | +//public: | |
19 | +// | |
20 | +// InterpsGroup() { | |
21 | +// | |
22 | +// } | |
23 | +// | |
24 | +// explicit InterpsGroup(const unsigned char type) | |
25 | +// : type(type) { | |
26 | +// | |
27 | +// } | |
28 | +// | |
29 | +// void addInterpretation(const EncodedInterpretation& interp) { | |
30 | +// interps.push_back(interp); | |
31 | +// } | |
32 | 32 | |
33 | 33 | unsigned char type; |
34 | - std::vector<EncodedInterpretation> interps; | |
34 | + uint16_t size; | |
35 | + const unsigned char* ptr; | |
36 | +// std::vector<EncodedInterpretation> interps; | |
35 | 37 | }; |
36 | 38 | |
37 | 39 | #endif /* GROUPEDINTERPRETATIONS_HPP */ |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -82,7 +82,9 @@ void Morfeusz::processOneWord( |
82 | 82 | FlexionGraph graph; |
83 | 83 | const char* currInput = inputStart; |
84 | 84 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
85 | + | |
85 | 86 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
87 | + | |
86 | 88 | if (!graph.empty()) { |
87 | 89 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
88 | 90 | int srcNode = startNodeNum; |
... | ... | @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
110 | 112 | from.prefixChunks.end()); |
111 | 113 | to.prefixChunks.push_back(from); |
112 | 114 | from.orthWasShifted = true; |
115 | + to.chunkStartPtr = from.chunkStartPtr; | |
113 | 116 | } |
114 | 117 | |
115 | 118 | void Morfeusz::doProcessOneWord( |
... | ... | @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( |
119 | 122 | SegrulesState segrulesState, |
120 | 123 | vector<InterpretedChunk>& accum, |
121 | 124 | FlexionGraph& graph) const { |
122 | - cerr << "doAnalyzeOneWord " << inputData << endl; | |
123 | - bool endOfProcessing = inputData == inputEnd; | |
125 | +// cerr << "doAnalyzeOneWord " << inputData << endl; | |
124 | 126 | const char* currInput = inputData; |
125 | - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
126 | - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); | |
127 | + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
127 | 128 | vector<uint32_t> originalCodepoints; |
128 | 129 | vector<uint32_t> lowercaseCodepoints; |
129 | 130 | |
130 | 131 | StateType state = env.getFSA().getInitialState(); |
131 | 132 | |
132 | - while (!endOfProcessing) { | |
133 | - if (isEndOfWord(codepoint)) { | |
134 | - endOfProcessing = true; | |
135 | - } | |
136 | - cerr << "not end of word '" << string(currInput) << "'" << endl; | |
133 | + while (!isEndOfWord(codepoint)) { | |
137 | 134 | uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); |
138 | 135 | originalCodepoints.push_back(codepoint); |
139 | 136 | lowercaseCodepoints.push_back(lowerCP); |
140 | 137 | feedState(state, lowerCP, UTF8CharsetConverter()); |
141 | 138 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); |
142 | 139 | if (state.isAccepting()) { |
143 | - cerr << "accepting" << endl; | |
144 | 140 | vector<InterpsGroup> val(state.getValue()); |
145 | 141 | for (unsigned int i = 0; i < val.size(); i++) { |
146 | 142 | InterpsGroup& ig = val[i]; |
... | ... | @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( |
151 | 147 | it != newSegrulesStates.end(); |
152 | 148 | ++it) { |
153 | 149 | SegrulesState newSegrulesState = *it; |
150 | +// if (newSegrulesState.shiftOrthFromPrevious) { | |
151 | +// | |
152 | +// } | |
154 | 153 | InterpretedChunk ic = { |
155 | 154 | inputData, |
156 | 155 | originalCodepoints, |
... | ... | @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( |
165 | 164 | } |
166 | 165 | accum.push_back(ic); |
167 | 166 | if (isEndOfWord(codepoint)) { |
168 | - cerr << "end of word inside " << currInput <<endl; | |
169 | 167 | if (newSegrulesState.accepting) |
170 | 168 | graph.addPath(accum); |
171 | 169 | } |
... | ... | @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( |
177 | 175 | } |
178 | 176 | } |
179 | 177 | } |
178 | + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
180 | 179 | } |
181 | - cerr << "end of word " << currInput << endl; | |
182 | 180 | inputData = currInput; |
183 | 181 | } |
184 | 182 | |
... | ... |
morfeusz/MorphDeserializer.cpp
... | ... | @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { |
23 | 23 | MorphDeserializer::~MorphDeserializer() { |
24 | 24 | } |
25 | 25 | |
26 | -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | |
27 | - // XXX uważać na poprawność danych | |
28 | - lemma.suffixToCut = *ptr; | |
29 | - ptr++; | |
30 | - lemma.suffixToAdd = (const char*) ptr; | |
31 | - ptr += strlen((const char*) ptr) + 1; | |
32 | - assert(lemma.casePattern.size() == 0); | |
33 | -// lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
34 | - uint8_t casePatternType = *ptr; | |
35 | - ptr++; | |
36 | - uint8_t prefixLength; | |
37 | - uint8_t patternLength; | |
38 | - switch (casePatternType) { | |
39 | - case LEMMA_ONLY_LOWER: | |
40 | - break; | |
41 | - case LEMMA_UPPER_PREFIX: | |
42 | - prefixLength = *ptr; | |
43 | - ptr++; | |
44 | - for (unsigned int i = 0; i < prefixLength; i++) { | |
45 | -// lemma.casePattern[i] = true; | |
46 | - lemma.casePattern.push_back(true); | |
47 | - } | |
48 | -// lemma.casePattern.resize(prefixLength, true); | |
49 | - break; | |
50 | - case LEMMA_MIXED_CASE: | |
51 | - patternLength = *ptr; | |
52 | - ptr++; | |
53 | - for (unsigned int i = 0; i < patternLength; i++) { | |
54 | - uint8_t idx = *ptr; | |
55 | - ptr++; | |
26 | +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | |
27 | +// // XXX uważać na poprawność danych | |
28 | +// lemma.suffixToCut = *ptr; | |
29 | +// ptr++; | |
30 | +// lemma.suffixToAdd = (const char*) ptr; | |
31 | +// ptr += strlen((const char*) ptr) + 1; | |
32 | +// assert(lemma.casePattern.size() == 0); | |
33 | +//// lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
34 | +// uint8_t casePatternType = *ptr; | |
35 | +// ptr++; | |
36 | +// uint8_t prefixLength; | |
37 | +// uint8_t patternLength; | |
38 | +// switch (casePatternType) { | |
39 | +// case LEMMA_ONLY_LOWER: | |
40 | +// break; | |
41 | +// case LEMMA_UPPER_PREFIX: | |
42 | +// prefixLength = *ptr; | |
43 | +// ptr++; | |
44 | +// for (unsigned int i = 0; i < prefixLength; i++) { | |
45 | +//// lemma.casePattern[i] = true; | |
46 | +// lemma.casePattern.push_back(true); | |
47 | +// } | |
48 | +//// lemma.casePattern.resize(prefixLength, true); | |
49 | +// break; | |
50 | +// case LEMMA_MIXED_CASE: | |
51 | +// patternLength = *ptr; | |
52 | +// ptr++; | |
53 | +// for (unsigned int i = 0; i < patternLength; i++) { | |
54 | +// uint8_t idx = *ptr; | |
55 | +// ptr++; | |
56 | +//// lemma.casePattern[idx] = true; | |
57 | +// lemma.casePattern.resize(idx + 1, false); | |
56 | 58 | // lemma.casePattern[idx] = true; |
57 | - lemma.casePattern.resize(idx + 1, false); | |
58 | - lemma.casePattern[idx] = true; | |
59 | - } | |
60 | - break; | |
61 | - } | |
62 | -} | |
63 | - | |
64 | -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | |
65 | - interp.type = *ptr; | |
66 | - ptr++; | |
67 | - deserializeLemma(ptr, interp.value); | |
68 | - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
69 | - ptr += 2; | |
70 | - interp.nameClassifier = *ptr; | |
71 | - ptr++; | |
72 | -} | |
59 | +// } | |
60 | +// break; | |
61 | +// } | |
62 | +//} | |
63 | +// | |
64 | +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | |
65 | +// interp.type = *ptr; | |
66 | +// ptr++; | |
67 | +// deserializeLemma(ptr, interp.value); | |
68 | +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
69 | +// ptr += 2; | |
70 | +// interp.nameClassifier = *ptr; | |
71 | +// ptr++; | |
72 | +//} | |
73 | 73 | |
74 | 74 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { |
75 | 75 | const unsigned char* currPtr = ptr; |
76 | - uint8_t interpsNum = *ptr; | |
77 | - interps.clear(); | |
78 | - interps.reserve(interpsNum); | |
76 | + uint8_t interpTypesNum = *currPtr; | |
79 | 77 | currPtr++; |
80 | - // FIXME - to jest do poprawy | |
81 | - map<int, InterpsGroup> results; | |
82 | - for (unsigned int i = 0; i < interpsNum; ++i) { | |
83 | - EncodedInterpretation interp; | |
84 | - deserializeInterp(currPtr, interp); | |
85 | - if (results.count(interp.type) == 0) { | |
86 | - results[interp.type] = InterpsGroup(interp.type); | |
87 | - } | |
88 | - results[interp.type].addInterpretation(interp); | |
89 | -// interps.push_back(interp); | |
90 | - } | |
91 | - map<int, InterpsGroup>::iterator it; | |
92 | - for (it = results.begin(); it != results.end(); ++it) { | |
93 | - interps.push_back((*it).second); | |
78 | + interps.clear(); | |
79 | + interps.reserve(interpTypesNum); | |
80 | + for (unsigned int i = 0; i < interpTypesNum; i++) { | |
81 | + InterpsGroup ig; | |
82 | + ig.type = *currPtr; | |
83 | + currPtr++; | |
84 | + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); | |
85 | + currPtr += 2; | |
86 | + ig.ptr = currPtr; | |
87 | + currPtr += ig.size; | |
88 | + interps.push_back(ig); | |
94 | 89 | } |
95 | 90 | return currPtr - ptr; |
96 | 91 | } |
92 | + | |
93 | +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | |
94 | +// const unsigned char* currPtr = ptr; | |
95 | +// uint8_t interpsNum = *ptr; | |
96 | +// interps.clear(); | |
97 | +// interps.reserve(interpsNum); | |
98 | +// currPtr++; | |
99 | +// // FIXME - to jest do poprawy | |
100 | +// map<int, InterpsGroup> results; | |
101 | +// for (unsigned int i = 0; i < interpsNum; ++i) { | |
102 | +// EncodedInterpretation interp; | |
103 | +// deserializeInterp(currPtr, interp); | |
104 | +// if (results.count(interp.type) == 0) { | |
105 | +// results[interp.type] = InterpsGroup(interp.type); | |
106 | +// } | |
107 | +// results[interp.type].addInterpretation(interp); | |
108 | +//// interps.push_back(interp); | |
109 | +// } | |
110 | +// map<int, InterpsGroup>::iterator it; | |
111 | +// for (it = results.begin(); it != results.end(); ++it) { | |
112 | +// interps.push_back((*it).second); | |
113 | +// } | |
114 | +// return currPtr - ptr; | |
115 | +//} | |
... | ... |
nbproject/configurations.xml
... | ... | @@ -106,14 +106,20 @@ |
106 | 106 | </makeTool> |
107 | 107 | </makefileType> |
108 | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | + <ccTool flags="1"> | |
110 | + </ccTool> | |
109 | 111 | </item> |
110 | 112 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | + <ccTool flags="1"> | |
114 | + </ccTool> | |
111 | 115 | </item> |
112 | 116 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | 117 | </item> |
114 | 118 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
115 | 119 | </item> |
116 | 120 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
121 | + <ccTool flags="1"> | |
122 | + </ccTool> | |
117 | 123 | </item> |
118 | 124 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
119 | 125 | ex="false" |
... | ... | @@ -169,7 +175,7 @@ |
169 | 175 | <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
170 | 176 | </item> |
171 | 177 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
172 | - <ccTool flags="1"> | |
178 | + <ccTool> | |
173 | 179 | <incDir> |
174 | 180 | <pElem>morfeusz</pElem> |
175 | 181 | <pElem>morfeusz/build/morfeusz</pElem> |
... | ... | @@ -180,7 +186,7 @@ |
180 | 186 | </ccTool> |
181 | 187 | </item> |
182 | 188 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
183 | - <ccTool flags="1"> | |
189 | + <ccTool> | |
184 | 190 | <incDir> |
185 | 191 | <pElem>morfeusz</pElem> |
186 | 192 | <pElem>morfeusz/build/morfeusz</pElem> |
... | ... | @@ -273,7 +279,7 @@ |
273 | 279 | <ccTool> |
274 | 280 | <incDir> |
275 | 281 | <pElem>morfeusz</pElem> |
276 | - <pElem>/usr/lib/jvm/default-java/include</pElem> | |
282 | + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
277 | 283 | </incDir> |
278 | 284 | <preprocessorList> |
279 | 285 | <Elem>libjmorfeusz_EXPORTS</Elem> |
... | ... | @@ -408,18 +414,26 @@ |
408 | 414 | </ccTool> |
409 | 415 | </item> |
410 | 416 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
417 | + <ccTool flags="1"> | |
418 | + </ccTool> | |
411 | 419 | </item> |
412 | 420 | <item path="morfeusz/charset/CharsetConverter.cpp" |
413 | 421 | ex="false" |
414 | 422 | tool="1" |
415 | 423 | flavor2="4"> |
424 | + <ccTool flags="1"> | |
425 | + </ccTool> | |
416 | 426 | </item> |
417 | 427 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
428 | + <ccTool flags="1"> | |
429 | + </ccTool> | |
418 | 430 | </item> |
419 | 431 | <item path="morfeusz/charset/conversion_tables.cpp" |
420 | 432 | ex="false" |
421 | 433 | tool="1" |
422 | 434 | flavor2="4"> |
435 | + <ccTool flags="1"> | |
436 | + </ccTool> | |
423 | 437 | </item> |
424 | 438 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
425 | 439 | <ccTool flags="1"> |
... | ... | @@ -508,8 +522,12 @@ |
508 | 522 | ex="false" |
509 | 523 | tool="1" |
510 | 524 | flavor2="4"> |
525 | + <ccTool flags="1"> | |
526 | + </ccTool> | |
511 | 527 | </item> |
512 | 528 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
529 | + <ccTool flags="1"> | |
530 | + </ccTool> | |
513 | 531 | </item> |
514 | 532 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
515 | 533 | <ccTool flags="0"> |
... | ... |