Commit f1e52ff44027610390237bd862449c85c4a044cc
1 parent
de0e960d
poprawienie czasu działania, przebudowanie analizatora tak, by nie powielać kodu…
… w generatorze, poprawienie rozpoznawania pierwszego segmentu w grafie fleksyjnym git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
17 changed files
with
411 additions
and
205 deletions
CMakeLists.txt
| ... | ... | @@ -5,7 +5,7 @@ project (Morfeusz) |
| 5 | 5 | set (Morfeusz_VERSION_MAJOR 2) |
| 6 | 6 | set (Morfeusz_VERSION_MINOR 0) |
| 7 | 7 | set (Morfeusz_VERSION_PATCH 0) |
| 8 | -set (CMAKE_BUILD_TYPE "Debug") | |
| 8 | +set (CMAKE_BUILD_TYPE "Release") | |
| 9 | 9 | |
| 10 | 10 | enable_testing() |
| 11 | 11 | |
| ... | ... | @@ -47,7 +47,7 @@ endif () |
| 47 | 47 | |
| 48 | 48 | # SEGMENT_RULES_FILE |
| 49 | 49 | if ("${SEGMENT_RULES_FILE}" STREQUAL "") |
| 50 | - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") | |
| 50 | + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat") | |
| 51 | 51 | endif () |
| 52 | 52 | |
| 53 | 53 | message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") |
| ... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
| ... | ... | @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): |
| 40 | 40 | self.suffixToAdd = bestEncodedForm.suffixToAdd |
| 41 | 41 | self.prefixToAdd = targetWord[:bestPrefixLength] |
| 42 | 42 | |
| 43 | -class Interpretation(object): | |
| 43 | +class Interpretation4Analyzer(object): | |
| 44 | 44 | |
| 45 | 45 | def __init__(self, orth, base, tagnum, namenum, typenum): |
| 46 | - self.lemma = EncodedForm(orth, base) | |
| 46 | + self.encodedForm = EncodedForm(orth, base) | |
| 47 | 47 | self.tagnum = tagnum |
| 48 | 48 | self.namenum = namenum |
| 49 | 49 | self.typenum = typenum |
| 50 | 50 | |
| 51 | 51 | def getSortKey(self): |
| 52 | 52 | return ( |
| 53 | - self.lemma.cutLength, | |
| 54 | - tuple(self.lemma.suffixToAdd), | |
| 55 | - tuple(self.lemma.casePattern), | |
| 53 | + self.encodedForm.cutLength, | |
| 54 | + tuple(self.encodedForm.suffixToAdd), | |
| 55 | + tuple(self.encodedForm.casePattern), | |
| 56 | 56 | self.tagnum, |
| 57 | 57 | self.namenum) |
| 58 | 58 | |
| 59 | 59 | def __eq__(self, other): |
| 60 | - if isinstance(other, Interpretation): | |
| 60 | + if isinstance(other, Interpretation4Analyzer): | |
| 61 | 61 | return self.getSortKey() == other.getSortKey() |
| 62 | 62 | else: |
| 63 | 63 | return False |
| ... | ... | @@ -68,8 +68,8 @@ class Interpretation(object): |
| 68 | 68 | class Interpretation4Generator(object): |
| 69 | 69 | |
| 70 | 70 | def __init__(self, orth, base, tagnum, namenum, typenum): |
| 71 | - self.lemma = base | |
| 72 | - self.orth = EncodedFormWithPrefix(base, orth) | |
| 71 | +        self.lemma = base | |
| 72 | + self.encodedForm = EncodedFormWithPrefix(base, orth) | |
| 73 | 73 | self.tagnum = tagnum |
| 74 | 74 | self.namenum = namenum |
| 75 | 75 | self.typenum = typenum |
| ... | ... | @@ -77,9 +77,9 @@ class Interpretation4Generator(object): |
| 77 | 77 | def getSortKey(self): |
| 78 | 78 | return ( |
| 79 | 79 | self.tagnum, |
| 80 | - self.orth.cutLength, | |
| 81 | - tuple(self.orth.suffixToAdd), | |
| 82 | -# tuple(self.lemma.casePattern), | |
| 80 | + self.encodedForm.cutLength, | |
| 81 | + tuple(self.encodedForm.suffixToAdd), | |
| 82 | +# tuple(self.encodedForm.casePattern), | |
| 83 | 83 | self.namenum) |
| 84 | 84 | |
| 85 | 85 | def __eq__(self, other): |
| ... | ... | @@ -92,7 +92,7 @@ class Interpretation4Generator(object): |
| 92 | 92 | return hash(self.getSortKey()) |
| 93 | 93 | |
| 94 | 94 | def __unicode__(self): |
| 95 | - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
| 95 | +        return u'&lt;%s,(%d %s),%d,%d&gt;' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
| 96 | 96 | |
| 97 | 97 | def __repr__(self): |
| 98 | 98 | return unicode(self) |
| ... | ... |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
| ... | ... | @@ -4,7 +4,7 @@ Created on Oct 23, 2013 |
| 4 | 4 | @author: mlenart |
| 5 | 5 | ''' |
| 6 | 6 | import logging |
| 7 | -from common import Interpretation | |
| 7 | +from common import Interpretation4Analyzer | |
| 8 | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
| 9 | 9 | |
| 10 | 10 | def _mergeEntries(inputLines): |
| ... | ... | @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): |
| 74 | 74 | tagnum = int(tagnum) |
| 75 | 75 | namenum = int(namenum) |
| 76 | 76 | typenum = int(typenum) |
| 77 | - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum)) | |
| 77 | + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum)) | |
| 78 | 78 | |
| 79 | 79 | def convert(self, inputLines): |
| 80 | 80 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) |
| ... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
| ... | ... | @@ -5,6 +5,7 @@ Created on Oct 23, 2013 |
| 5 | 5 | ''' |
| 6 | 6 | |
| 7 | 7 | import logging |
| 8 | +from morfeuszbuilder.utils import serializationUtils | |
| 8 | 9 | |
| 9 | 10 | class Encoder(object): |
| 10 | 11 | ''' |
| ... | ... | @@ -96,6 +97,54 @@ class Encoder(object): |
| 96 | 97 | def _encodeNameNum(self, namenum): |
| 97 | 98 | assert namenum < 256 and namenum >= 0 |
| 98 | 99 | return bytearray([namenum]) |
| 100 | + | |
| 101 | + def _groupInterpsByType(self, interpsList): | |
| 102 | + res = {} | |
| 103 | + for interp in interpsList: | |
| 104 | + res.setdefault(interp.typenum, []) | |
| 105 | + res[interp.typenum].append(interp) | |
| 106 | + return res | |
| 107 | + | |
| 108 | + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix): | |
| 109 | + res = bytearray() | |
| 110 | + res.extend(self._encodeTypeNum(typenum)) | |
| 111 | + | |
| 112 | + encodedInterpsList = bytearray() | |
| 113 | + for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 114 | + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | |
| 115 | + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | |
| 116 | + encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | |
| 117 | + | |
| 118 | + res.extend(serializationUtils.htons(len(encodedInterpsList))) | |
| 119 | + res.extend(encodedInterpsList) | |
| 120 | + return res | |
| 121 | + | |
| 122 | + def _doEncodeData(self, interpsList, withCasePattern, withPrefix): | |
| 123 | + | |
| 124 | + assert type(interpsList) == frozenset | |
| 125 | + | |
| 126 | + segnum2Interps = self._groupInterpsByType(interpsList) | |
| 127 | + | |
| 128 | + | |
| 129 | + res = bytearray() | |
| 130 | + firstByte = len(segnum2Interps) | |
| 131 | + assert firstByte < 256 | |
| 132 | + assert firstByte > 0 | |
| 133 | + res.append(firstByte) | |
| 134 | + | |
| 135 | + for typenum, interpsList in segnum2Interps.iteritems(): | |
| 136 | + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix)) | |
| 137 | + | |
| 138 | + | |
| 139 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 140 | +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum)) | |
| 141 | +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | |
| 142 | +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | |
| 143 | +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | |
| 144 | + del interpsList | |
| 145 | +# res.extend(serializationUtils.htons(len(encodedInterpsList))) | |
| 146 | +# res.extend(encodedInterpsList) | |
| 147 | + return res | |
| 99 | 148 | |
| 100 | 149 | class MorphEncoder(Encoder): |
| 101 | 150 | |
| ... | ... | @@ -106,19 +155,20 @@ class MorphEncoder(Encoder): |
| 106 | 155 | self.LEMMA_MIXED_CASE = 2 |
| 107 | 156 | |
| 108 | 157 | def encodeData(self, interpsList): |
| 109 | - res = bytearray() | |
| 110 | - firstByte = len(interpsList) | |
| 111 | - assert firstByte < 256 | |
| 112 | - assert firstByte > 0 | |
| 113 | - res.append(firstByte) | |
| 114 | - assert type(interpsList) == frozenset | |
| 115 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 116 | - res.extend(self._encodeTypeNum(interp.typenum)) | |
| 117 | - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False)) | |
| 118 | - res.extend(self._encodeTagNum(interp.tagnum)) | |
| 119 | - res.extend(self._encodeNameNum(interp.namenum)) | |
| 120 | - del interpsList | |
| 121 | - return res | |
| 158 | + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False) | |
| 159 | +# res = bytearray() | |
| 160 | +# firstByte = len(interpsList) | |
| 161 | +# assert firstByte < 256 | |
| 162 | +# assert firstByte > 0 | |
| 163 | +# res.append(firstByte) | |
| 164 | +# assert type(interpsList) == frozenset | |
| 165 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 166 | +# res.extend(self._encodeTypeNum(interp.typenum)) | |
| 167 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False)) | |
| 168 | +# res.extend(self._encodeTagNum(interp.tagnum)) | |
| 169 | +# res.extend(self._encodeNameNum(interp.namenum)) | |
| 170 | +# del interpsList | |
| 171 | +# return res | |
| 122 | 172 | |
| 123 | 173 | class Encoder4Generator(Encoder): |
| 124 | 174 | |
| ... | ... | @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): |
| 126 | 176 | super(Encoder4Generator, self).__init__(encoding) |
| 127 | 177 | |
| 128 | 178 | def encodeData(self, interpsList): |
| 129 | - res = bytearray() | |
| 130 | - firstByte = len(interpsList) | |
| 131 | - assert firstByte < 256 | |
| 132 | - assert firstByte > 0 | |
| 133 | - res.append(firstByte) | |
| 134 | - assert type(interpsList) == frozenset | |
| 135 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 136 | - res.extend(self._encodeTypeNum(interp.typenum)) | |
| 137 | - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True)) | |
| 138 | - res.extend(self._encodeTagNum(interp.tagnum)) | |
| 139 | - res.extend(self._encodeNameNum(interp.namenum)) | |
| 140 | - return res | |
| 179 | + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True) | |
| 180 | +# res = bytearray() | |
| 181 | +# firstByte = len(interpsList) | |
| 182 | +# assert firstByte < 256 | |
| 183 | +# assert firstByte > 0 | |
| 184 | +# res.append(firstByte) | |
| 185 | +# assert type(interpsList) == frozenset | |
| 186 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | |
| 187 | +# res.extend(self._encodeTypeNum(interp.typenum)) | |
| 188 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True)) | |
| 189 | +# res.extend(self._encodeTagNum(interp.tagnum)) | |
| 190 | +# res.extend(self._encodeNameNum(interp.namenum)) | |
| 191 | +# return res | |
| 141 | 192 | # |
| 142 | 193 | # def decodeData(self, data): |
| 143 | 194 | # |
| ... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
| ... | ... | @@ -106,7 +106,7 @@ class Segtypes(object): |
| 106 | 106 | lineNum, |
| 107 | 107 | re.match(r'[a-z_]+', segtype)) |
| 108 | 108 | self._validate( |
| 109 | - u'Pattern must contain lemma and part-of-speech fields', | |
| 109 | +            u'Pattern must contain lemma and part-of-speech fields', | |
| 110 | 110 | lineNum, |
| 111 | 111 | re.match(r'.+\:[a-z_]+', pattern, re.U)) |
| 112 | 112 | |
| ... | ... | @@ -146,13 +146,13 @@ class Segtypes(object): |
| 146 | 146 | |
| 147 | 147 | # index lexemes |
| 148 | 148 | for p in self.patternsList: |
| 149 | - if p.lemma: | |
| 149 | + if p.encodedForm: | |
| 150 | 150 | for tag in self.tagset.getAllTags(): |
| 151 | 151 | tagnum = self.tagset.getTagnum4Tag(tag) |
| 152 | - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum: | |
| 153 | - segnum = p.tryToMatch(p.lemma, tag) | |
| 152 | + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum: | |
| 153 | + segnum = p.tryToMatch(p.encodedForm, tag) | |
| 154 | 154 | if segnum != -1: |
| 155 | - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum | |
| 155 | + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum | |
| 156 | 156 | # logging.info('indexing segment type numbers - done') |
| 157 | 157 | # self._debugSegnums() |
| 158 | 158 | |
| ... | ... | @@ -171,7 +171,7 @@ class Segtypes(object): |
| 171 | 171 | class SegtypePattern(object): |
| 172 | 172 | |
| 173 | 173 | def __init__(self, lemma, pattern, segnum): |
| 174 | - self.lemma = lemma | |
| 174 | + self.encodedForm = lemma | |
| 175 | 175 | self.pattern = pattern |
| 176 | 176 | self.segnum = segnum |
| 177 | 177 | |
| ... | ... | @@ -181,7 +181,7 @@ class SegtypePattern(object): |
| 181 | 181 | patterns2Match = [] |
| 182 | 182 | patterns2Match.append(self.pattern.replace('%', '.*')) |
| 183 | 183 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
| 184 | - if (self.lemma is None or self.lemma == lemma) \ | |
| 184 | + if (self.encodedForm is None or self.encodedForm == lemma) \ | |
| 185 | 185 | and any([re.match(p, tag) for p in patterns2Match]): |
| 186 | 186 | return self.segnum |
| 187 | 187 | else: |
| ... | ... |
input/dodatki.tab
input/segmenty.dat
| ... | ... | @@ -142,6 +142,7 @@ samodz dywiz adj |
| 142 | 142 | # Stopień najwyższy: |
| 143 | 143 | # np. „naj·zieleńszy”, „naj·mądrzej” |
| 144 | 144 | moze_interp( naj> adj_sup ) |
| 145 | +moze_interp( nie> naj> adj_sup ) | |
| 145 | 146 | # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj |
| 146 | 147 | moze_interp( praet_sg dywiz li) |
| 147 | 148 | moze_interp( praet_pl dywiz li) |
| ... | ... |
input/segmenty1.dat
| ... | ... | @@ -52,11 +52,14 @@ naj naj |
| 52 | 52 | nie nie |
| 53 | 53 | prefs prefs |
| 54 | 54 | prefv prefv |
| 55 | +prefa prefa | |
| 55 | 56 | dig dig |
| 56 | 57 | adja adja |
| 57 | 58 | adj adj:%:pos |
| 58 | 59 | adj_sup adj:%:sup |
| 59 | 60 | adj_sup adv:sup |
| 61 | +adj_com adj:%:com | |
| 62 | +adj_com adv:com | |
| 60 | 63 | negat ger:%:neg |
| 61 | 64 | negat pact:%:neg |
| 62 | 65 | negat ppas:%:neg |
| ... | ... | @@ -69,6 +72,22 @@ interp interp |
| 69 | 72 | aglsg aglt:sg:% |
| 70 | 73 | aglpl aglt:pl:% |
| 71 | 74 | samodz % |
| 75 | +praet_fin praet:% | |
| 76 | +praet_fin fin:% | |
| 77 | +li li:qub:% | |
| 78 | +nomina subst:% | |
| 79 | +nomina ger:% | |
| 80 | +nomina depr:% | |
| 81 | +adjectiva adj:% | |
| 82 | +adjectiva adv:% | |
| 83 | +adjectiva ppas:% | |
| 84 | +adjectiva pact:% | |
| 85 | +verba_imperf praet:%:imperf | |
| 86 | +verba_imperf fin:%:imperf | |
| 87 | +verba_imperf inf:imperf | |
| 88 | +verba_imperf imps:imperf | |
| 89 | +verba_imperf impt:imperf | |
| 90 | + | |
| 72 | 91 | |
| 73 | 92 | [lexemes] |
| 74 | 93 | z_aglt aby:comp |
| ... | ... |
morfeusz/EncodedInterpretation.hpp
morfeusz/Environment.cpp
| ... | ... | @@ -13,10 +13,12 @@ |
| 13 | 13 | //class InterpretedChunksDecoder4Analyzer; |
| 14 | 14 | //class InterpretedChunksDecoder4Generator; |
| 15 | 15 | |
| 16 | -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { | |
| 17 | - static Deserializer < vector < InterpsGroup > > *deserializer | |
| 16 | +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) { | |
| 17 | + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer | |
| 18 | 18 | = new MorphDeserializer(); |
| 19 | - return deserializer; | |
| 19 | + static Deserializer < vector < InterpsGroup > > *generatorDeserializer | |
| 20 | + = new MorphDeserializer(); | |
| 21 | + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer); | |
| 20 | 22 | } |
| 21 | 23 | |
| 22 | 24 | static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { |
| ... | ... | @@ -48,14 +50,15 @@ Environment::Environment( |
| 48 | 50 | caseConverter(), |
| 49 | 51 | tagset(fsaFileStartPtr), |
| 50 | 52 | fsaFileStartPtr(fsaFileStartPtr), |
| 51 | - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), | |
| 53 | + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), | |
| 52 | 54 | segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), |
| 53 | 55 | currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), |
| 54 | 56 | isFromFile(false), |
| 55 | 57 | chunksDecoder( |
| 56 | 58 | processorType == ANALYZER |
| 57 | 59 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) |
| 58 | - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) | |
| 60 | + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), | |
| 61 | + processorType(processorType) | |
| 59 | 62 | { |
| 60 | 63 | } |
| 61 | 64 | |
| ... | ... | @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { |
| 110 | 113 | delete this->fsaFileStartPtr; |
| 111 | 114 | } |
| 112 | 115 | this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); |
| 113 | - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); | |
| 116 | + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType)); | |
| 114 | 117 | this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); |
| 115 | 118 | this->isFromFile = true; |
| 116 | 119 | } |
| ... | ... |
morfeusz/Environment.hpp
morfeusz/FlexionGraph.cpp
| ... | ... | @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { |
| 15 | 15 | this->graph.push_back(vector<Edge>()); |
| 16 | 16 | this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); |
| 17 | 17 | } |
| 18 | -// cerr << string(e.chunk.chunkStartPtr) << endl; | |
| 19 | 18 | assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); |
| 20 | 19 | this->graph[0].push_back(e); |
| 21 | 20 | } |
| ... | ... | @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { |
| 30 | 29 | this->graph[startNode].push_back(e); |
| 31 | 30 | } |
| 32 | 31 | |
| 32 | +static inline bool chunkIsAtFront( | |
| 33 | + const InterpretedChunk& chunk, | |
| 34 | + const std::vector<InterpretedChunk>& path) { | |
| 35 | + unsigned int i; | |
| 36 | + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) { | |
| 37 | + } | |
| 38 | + assert(!path[i].orthWasShifted); | |
| 39 | + return &chunk == &(path[i]); | |
| 40 | +} | |
| 41 | + | |
| 42 | +static inline bool chunkIsAtBack( | |
| 43 | + const InterpretedChunk& chunk, | |
| 44 | + const std::vector<InterpretedChunk>& path) { | |
| 45 | + return &chunk == &(path.back()); | |
| 46 | +} | |
| 47 | + | |
| 48 | +static inline bool chunkIsTheOnlyOne( | |
| 49 | + const InterpretedChunk& chunk, | |
| 50 | + const std::vector<InterpretedChunk>& path) { | |
| 51 | + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); | |
| 52 | +} | |
| 53 | + | |
| 33 | 54 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { |
| 34 | 55 | // debugPath(path); |
| 35 | 56 | // debugGraph(this->graph); |
| 36 | 57 | for (unsigned int i = 0; i < path.size(); i++) { |
| 37 | 58 | const InterpretedChunk& chunk = path[i]; |
| 38 | 59 | if (!chunk.orthWasShifted) { |
| 39 | - if (&chunk == &(path.front()) | |
| 40 | - && &chunk == &(path.back())) { | |
| 60 | + if (chunkIsTheOnlyOne(chunk, path)) { | |
| 41 | 61 | Edge e = {chunk, UINT_MAX}; |
| 42 | 62 | this->addStartEdge(e); |
| 43 | 63 | } |
| 44 | - else if (&chunk == &(path.front())) { | |
| 64 | + else if (chunkIsAtFront(chunk, path)) { | |
| 45 | 65 | Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; |
| 46 | 66 | this->addStartEdge(e); |
| 47 | 67 | } |
| 48 | - else if (&chunk == &(path.back())) { | |
| 68 | + else if (chunkIsAtBack(chunk, path)) { | |
| 49 | 69 | Edge e = {chunk, UINT_MAX}; |
| 50 | 70 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
| 51 | 71 | } |
| ... | ... |
morfeusz/InterpretedChunksDecoder.hpp
| ... | ... | @@ -18,6 +18,10 @@ |
| 18 | 18 | #include "charset/CaseConverter.hpp" |
| 19 | 19 | #include "Environment.hpp" |
| 20 | 20 | |
| 21 | +const uint8_t LEMMA_ONLY_LOWER = 0; | |
| 22 | +const uint8_t LEMMA_UPPER_PREFIX = 1; | |
| 23 | +const uint8_t LEMMA_MIXED_CASE = 2; | |
| 24 | + | |
| 21 | 25 | class InterpretedChunksDecoder { |
| 22 | 26 | public: |
| 23 | 27 | |
| ... | ... | @@ -30,22 +34,12 @@ public: |
| 30 | 34 | unsigned int endNode, |
| 31 | 35 | const InterpretedChunk& interpretedChunk, |
| 32 | 36 | std::vector<MorphInterpretation>& out) const = 0; |
| 33 | - | |
| 34 | - virtual ~InterpretedChunksDecoder() {} | |
| 35 | 37 | |
| 36 | -protected: | |
| 37 | - | |
| 38 | - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | |
| 39 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
| 40 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
| 41 | - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
| 42 | - decodeForm( | |
| 43 | - prefixChunk.lowercaseCodepoints, | |
| 44 | - prefixChunk.interpsGroup.interps[0].value, | |
| 45 | - decodedForm); | |
| 46 | - } | |
| 38 | + virtual ~InterpretedChunksDecoder() { | |
| 47 | 39 | } |
| 48 | - | |
| 40 | + | |
| 41 | +protected: | |
| 42 | + | |
| 49 | 43 | virtual void decodeForm( |
| 50 | 44 | const std::vector<uint32_t>& orth, |
| 51 | 45 | const EncodedForm& form, |
| ... | ... | @@ -55,9 +49,10 @@ protected: |
| 55 | 49 | }; |
| 56 | 50 | |
| 57 | 51 | class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { |
| 58 | - | |
| 59 | 52 | public: |
| 60 | - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} | |
| 53 | + | |
| 54 | + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | |
| 55 | + } | |
| 61 | 56 | |
| 62 | 57 | void decode( |
| 63 | 58 | unsigned int startNode, |
| ... | ... | @@ -65,22 +60,12 @@ public: |
| 65 | 60 | const InterpretedChunk& interpretedChunk, |
| 66 | 61 | std::vector<MorphInterpretation>& out) const { |
| 67 | 62 | string orth; |
| 68 | - string lemma; | |
| 69 | - convertPrefixes(interpretedChunk, orth, lemma); | |
| 63 | + string lemmaPrefix; | |
| 64 | + convertPrefixes(interpretedChunk, orth, lemmaPrefix); | |
| 70 | 65 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
| 71 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
| 72 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
| 73 | - decodeForm( | |
| 74 | - interpretedChunk.lowercaseCodepoints, | |
| 75 | - ei.value, | |
| 76 | - lemma); | |
| 77 | - out.push_back(MorphInterpretation( | |
| 78 | - startNode, endNode, | |
| 79 | - orth, lemma, | |
| 80 | - ei.tag, | |
| 81 | - ei.nameClassifier, | |
| 82 | - env.getTagset(), | |
| 83 | - env.getCharsetConverter())); | |
| 66 | + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
| 67 | + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
| 68 | + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr)); | |
| 84 | 69 | } |
| 85 | 70 | } |
| 86 | 71 | |
| ... | ... | @@ -104,36 +89,116 @@ protected: |
| 104 | 89 | env.getCharsetConverter().append(cp, res); |
| 105 | 90 | } |
| 106 | 91 | } |
| 92 | + | |
| 93 | +private: | |
| 94 | + | |
| 95 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | |
| 96 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
| 97 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
| 98 | + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
| 99 | + const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
| 100 | + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr); | |
| 101 | + decodedForm += mi.getLemma(); | |
| 102 | + } | |
| 103 | + } | |
| 104 | + | |
| 105 | + MorphInterpretation decodeMorphInterpretation( | |
| 106 | + unsigned int startNode, unsigned int endNode, | |
| 107 | + const string& orth, | |
| 108 | + const string& lemmaPrefix, | |
| 109 | + const InterpretedChunk& chunk, | |
| 110 | + const unsigned char*& ptr) const { | |
| 111 | + string lemma = lemmaPrefix; | |
| 112 | + EncodedInterpretation ei = this->decodeInterp(ptr); | |
| 113 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
| 114 | + return MorphInterpretation( | |
| 115 | + startNode, endNode, | |
| 116 | + orth, lemma, | |
| 117 | + ei.tag, | |
| 118 | + ei.nameClassifier, | |
| 119 | + env.getTagset(), | |
| 120 | + env.getCharsetConverter()); | |
| 121 | + } | |
| 122 | + | |
| 123 | + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const { | |
| 124 | + lemma.suffixToCut = *ptr; | |
| 125 | + ptr++; | |
| 126 | + lemma.suffixToAdd = (const char*) ptr; | |
| 127 | + ptr += strlen((const char*) ptr) + 1; | |
| 128 | + assert(lemma.casePattern.size() == 0); | |
| 129 | + // lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
| 130 | + uint8_t casePatternType = *ptr; | |
| 131 | + ptr++; | |
| 132 | + uint8_t prefixLength; | |
| 133 | + uint8_t patternLength; | |
| 134 | + switch (casePatternType) { | |
| 135 | + case LEMMA_ONLY_LOWER: | |
| 136 | + break; | |
| 137 | + case LEMMA_UPPER_PREFIX: | |
| 138 | + prefixLength = *ptr; | |
| 139 | + ptr++; | |
| 140 | + for (unsigned int i = 0; i < prefixLength; i++) { | |
| 141 | + // lemma.casePattern[i] = true; | |
| 142 | + lemma.casePattern.push_back(true); | |
| 143 | + } | |
| 144 | + // lemma.casePattern.resize(prefixLength, true); | |
| 145 | + break; | |
| 146 | + case LEMMA_MIXED_CASE: | |
| 147 | + patternLength = *ptr; | |
| 148 | + ptr++; | |
| 149 | + for (unsigned int i = 0; i < patternLength; i++) { | |
| 150 | + uint8_t idx = *ptr; | |
| 151 | + ptr++; | |
| 152 | + // lemma.casePattern[idx] = true; | |
| 153 | + lemma.casePattern.resize(idx + 1, false); | |
| 154 | + lemma.casePattern[idx] = true; | |
| 155 | + } | |
| 156 | + break; | |
| 157 | + } | |
| 158 | + } | |
| 159 | + | |
| 160 | + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const { | |
| 161 | + EncodedInterpretation interp; | |
| 162 | + decodeLemma(ptr, interp.value); | |
| 163 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); | |
| 164 | + ptr += 2; | |
| 165 | + interp.nameClassifier = *ptr; | |
| 166 | + ptr++; | |
| 167 | + return interp; | |
| 168 | + } | |
| 107 | 169 | }; |
| 108 | 170 | |
| 109 | 171 | class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { |
| 110 | - | |
| 111 | 172 | public: |
| 112 | - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} | |
| 173 | + | |
| 174 | + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | |
| 175 | + } | |
| 113 | 176 | |
| 114 | 177 | void decode( |
| 115 | 178 | unsigned int startNode, |
| 116 | 179 | unsigned int endNode, |
| 117 | 180 | const InterpretedChunk& interpretedChunk, |
| 118 | 181 | std::vector<MorphInterpretation>& out) const { |
| 119 | - string orth; | |
| 120 | - string lemma; | |
| 121 | - convertPrefixes(interpretedChunk, lemma, orth); | |
| 122 | - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
| 123 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
| 124 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
| 125 | - decodeForm( | |
| 126 | - interpretedChunk.originalCodepoints, | |
| 127 | - ei.value, | |
| 128 | - orth); | |
| 129 | - out.push_back(MorphInterpretation( | |
| 130 | - startNode, endNode, | |
| 131 | - orth, lemma, | |
| 132 | - ei.tag, | |
| 133 | - ei.nameClassifier, | |
| 134 | - env.getTagset(), | |
| 135 | - env.getCharsetConverter())); | |
| 136 | - } | |
| 182 | + // string orth; | |
| 183 | + // string lemma; | |
| 184 | + // convertPrefixes(interpretedChunk, lemma, orth); | |
| 185 | + // size_t orthLength = orth.length(); | |
| 186 | + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
| 187 | + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
| 188 | + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
| 189 | + // decodeForm( | |
| 190 | + // interpretedChunk.originalCodepoints, | |
| 191 | + // ei.value, | |
| 192 | + // orth); | |
| 193 | + // out.push_back(MorphInterpretation( | |
| 194 | + // startNode, endNode, | |
| 195 | + // orth, lemma, | |
| 196 | + // ei.tag, | |
| 197 | + // ei.nameClassifier, | |
| 198 | + // env.getTagset(), | |
| 199 | + // env.getCharsetConverter())); | |
| 200 | + // orth.erase(orthLength); | |
| 201 | + // } | |
| 137 | 202 | } |
| 138 | 203 | |
| 139 | 204 | private: |
| ... | ... |
morfeusz/InterpsGroup.hpp
| ... | ... | @@ -14,24 +14,26 @@ |
| 14 | 14 | #include "MorphInterpretation.hpp" |
| 15 | 15 | #include "Tagset.hpp" |
| 16 | 16 | |
| 17 | -class InterpsGroup { | |
| 18 | -public: | |
| 19 | - | |
| 20 | - InterpsGroup() { | |
| 21 | - | |
| 22 | - } | |
| 23 | - | |
| 24 | - explicit InterpsGroup(const unsigned char type) | |
| 25 | - : type(type) { | |
| 26 | - | |
| 27 | - } | |
| 28 | - | |
| 29 | - void addInterpretation(const EncodedInterpretation& interp) { | |
| 30 | - interps.push_back(interp); | |
| 31 | - } | |
| 17 | +struct InterpsGroup { | |
| 18 | +//public: | |
| 19 | +// | |
| 20 | +// InterpsGroup() { | |
| 21 | +// | |
| 22 | +// } | |
| 23 | +// | |
| 24 | +// explicit InterpsGroup(const unsigned char type) | |
| 25 | +// : type(type) { | |
| 26 | +// | |
| 27 | +// } | |
| 28 | +// | |
| 29 | +// void addInterpretation(const EncodedInterpretation& interp) { | |
| 30 | +// interps.push_back(interp); | |
| 31 | +// } | |
| 32 | 32 | |
| 33 | 33 | unsigned char type; |
| 34 | - std::vector<EncodedInterpretation> interps; | |
| 34 | + uint16_t size; | |
| 35 | + const unsigned char* ptr; | |
| 36 | +// std::vector<EncodedInterpretation> interps; | |
| 35 | 37 | }; |
| 36 | 38 | |
| 37 | 39 | #endif /* GROUPEDINTERPRETATIONS_HPP */ |
| ... | ... |
morfeusz/Morfeusz.cpp
| ... | ... | @@ -82,7 +82,9 @@ void Morfeusz::processOneWord( |
| 82 | 82 | FlexionGraph graph; |
| 83 | 83 | const char* currInput = inputStart; |
| 84 | 84 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
| 85 | + | |
| 85 | 86 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
| 87 | + | |
| 86 | 88 | if (!graph.empty()) { |
| 87 | 89 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
| 88 | 90 | int srcNode = startNodeNum; |
| ... | ... | @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
| 110 | 112 | from.prefixChunks.end()); |
| 111 | 113 | to.prefixChunks.push_back(from); |
| 112 | 114 | from.orthWasShifted = true; |
| 115 | + to.chunkStartPtr = from.chunkStartPtr; | |
| 113 | 116 | } |
| 114 | 117 | |
| 115 | 118 | void Morfeusz::doProcessOneWord( |
| ... | ... | @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( |
| 119 | 122 | SegrulesState segrulesState, |
| 120 | 123 | vector<InterpretedChunk>& accum, |
| 121 | 124 | FlexionGraph& graph) const { |
| 122 | - cerr << "doAnalyzeOneWord " << inputData << endl; | |
| 123 | - bool endOfProcessing = inputData == inputEnd; | |
| 125 | +// cerr << "doAnalyzeOneWord " << inputData << endl; | |
| 124 | 126 | const char* currInput = inputData; |
| 125 | - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
| 126 | - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); | |
| 127 | + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
| 127 | 128 | vector<uint32_t> originalCodepoints; |
| 128 | 129 | vector<uint32_t> lowercaseCodepoints; |
| 129 | 130 | |
| 130 | 131 | StateType state = env.getFSA().getInitialState(); |
| 131 | 132 | |
| 132 | - while (!endOfProcessing) { | |
| 133 | - if (isEndOfWord(codepoint)) { | |
| 134 | - endOfProcessing = true; | |
| 135 | - } | |
| 136 | - cerr << "not end of word '" << string(currInput) << "'" << endl; | |
| 133 | + while (!isEndOfWord(codepoint)) { | |
| 137 | 134 | uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); |
| 138 | 135 | originalCodepoints.push_back(codepoint); |
| 139 | 136 | lowercaseCodepoints.push_back(lowerCP); |
| 140 | 137 | feedState(state, lowerCP, UTF8CharsetConverter()); |
| 141 | 138 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); |
| 142 | 139 | if (state.isAccepting()) { |
| 143 | - cerr << "accepting" << endl; | |
| 144 | 140 | vector<InterpsGroup> val(state.getValue()); |
| 145 | 141 | for (unsigned int i = 0; i < val.size(); i++) { |
| 146 | 142 | InterpsGroup& ig = val[i]; |
| ... | ... | @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( |
| 151 | 147 | it != newSegrulesStates.end(); |
| 152 | 148 | ++it) { |
| 153 | 149 | SegrulesState newSegrulesState = *it; |
| 150 | +// if (newSegrulesState.shiftOrthFromPrevious) { | |
| 151 | +// | |
| 152 | +// } | |
| 154 | 153 | InterpretedChunk ic = { |
| 155 | 154 | inputData, |
| 156 | 155 | originalCodepoints, |
| ... | ... | @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( |
| 165 | 164 | } |
| 166 | 165 | accum.push_back(ic); |
| 167 | 166 | if (isEndOfWord(codepoint)) { |
| 168 | - cerr << "end of word inside " << currInput <<endl; | |
| 169 | 167 | if (newSegrulesState.accepting) |
| 170 | 168 | graph.addPath(accum); |
| 171 | 169 | } |
| ... | ... | @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( |
| 177 | 175 | } |
| 178 | 176 | } |
| 179 | 177 | } |
| 178 | + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
| 180 | 179 | } |
| 181 | - cerr << "end of word " << currInput << endl; | |
| 182 | 180 | inputData = currInput; |
| 183 | 181 | } |
| 184 | 182 | |
| ... | ... |
morfeusz/MorphDeserializer.cpp
| ... | ... | @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { |
| 23 | 23 | MorphDeserializer::~MorphDeserializer() { |
| 24 | 24 | } |
| 25 | 25 | |
| 26 | -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | |
| 27 | - // XXX uważać na poprawność danych | |
| 28 | - lemma.suffixToCut = *ptr; | |
| 29 | - ptr++; | |
| 30 | - lemma.suffixToAdd = (const char*) ptr; | |
| 31 | - ptr += strlen((const char*) ptr) + 1; | |
| 32 | - assert(lemma.casePattern.size() == 0); | |
| 33 | -// lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
| 34 | - uint8_t casePatternType = *ptr; | |
| 35 | - ptr++; | |
| 36 | - uint8_t prefixLength; | |
| 37 | - uint8_t patternLength; | |
| 38 | - switch (casePatternType) { | |
| 39 | - case LEMMA_ONLY_LOWER: | |
| 40 | - break; | |
| 41 | - case LEMMA_UPPER_PREFIX: | |
| 42 | - prefixLength = *ptr; | |
| 43 | - ptr++; | |
| 44 | - for (unsigned int i = 0; i < prefixLength; i++) { | |
| 45 | -// lemma.casePattern[i] = true; | |
| 46 | - lemma.casePattern.push_back(true); | |
| 47 | - } | |
| 48 | -// lemma.casePattern.resize(prefixLength, true); | |
| 49 | - break; | |
| 50 | - case LEMMA_MIXED_CASE: | |
| 51 | - patternLength = *ptr; | |
| 52 | - ptr++; | |
| 53 | - for (unsigned int i = 0; i < patternLength; i++) { | |
| 54 | - uint8_t idx = *ptr; | |
| 55 | - ptr++; | |
| 26 | +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | |
| 27 | +// // XXX uważać na poprawność danych | |
| 28 | +// lemma.suffixToCut = *ptr; | |
| 29 | +// ptr++; | |
| 30 | +// lemma.suffixToAdd = (const char*) ptr; | |
| 31 | +// ptr += strlen((const char*) ptr) + 1; | |
| 32 | +// assert(lemma.casePattern.size() == 0); | |
| 33 | +//// lemma.casePattern.resize(MAX_WORD_SIZE, false); | |
| 34 | +// uint8_t casePatternType = *ptr; | |
| 35 | +// ptr++; | |
| 36 | +// uint8_t prefixLength; | |
| 37 | +// uint8_t patternLength; | |
| 38 | +// switch (casePatternType) { | |
| 39 | +// case LEMMA_ONLY_LOWER: | |
| 40 | +// break; | |
| 41 | +// case LEMMA_UPPER_PREFIX: | |
| 42 | +// prefixLength = *ptr; | |
| 43 | +// ptr++; | |
| 44 | +// for (unsigned int i = 0; i < prefixLength; i++) { | |
| 45 | +//// lemma.casePattern[i] = true; | |
| 46 | +// lemma.casePattern.push_back(true); | |
| 47 | +// } | |
| 48 | +//// lemma.casePattern.resize(prefixLength, true); | |
| 49 | +// break; | |
| 50 | +// case LEMMA_MIXED_CASE: | |
| 51 | +// patternLength = *ptr; | |
| 52 | +// ptr++; | |
| 53 | +// for (unsigned int i = 0; i < patternLength; i++) { | |
| 54 | +// uint8_t idx = *ptr; | |
| 55 | +// ptr++; | |
| 56 | +//// lemma.casePattern[idx] = true; | |
| 57 | +// lemma.casePattern.resize(idx + 1, false); | |
| 56 | 58 | // lemma.casePattern[idx] = true; |
| 57 | - lemma.casePattern.resize(idx + 1, false); | |
| 58 | - lemma.casePattern[idx] = true; | |
| 59 | - } | |
| 60 | - break; | |
| 61 | - } | |
| 62 | -} | |
| 63 | - | |
| 64 | -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | |
| 65 | - interp.type = *ptr; | |
| 66 | - ptr++; | |
| 67 | - deserializeLemma(ptr, interp.value); | |
| 68 | - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
| 69 | - ptr += 2; | |
| 70 | - interp.nameClassifier = *ptr; | |
| 71 | - ptr++; | |
| 72 | -} | |
| 59 | +// } | |
| 60 | +// break; | |
| 61 | +// } | |
| 62 | +//} | |
| 63 | +// | |
| 64 | +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | |
| 65 | +// interp.type = *ptr; | |
| 66 | +// ptr++; | |
| 67 | +// deserializeLemma(ptr, interp.value); | |
| 68 | +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
| 69 | +// ptr += 2; | |
| 70 | +// interp.nameClassifier = *ptr; | |
| 71 | +// ptr++; | |
| 72 | +//} | |
| 73 | 73 | |
| 74 | 74 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { |
| 75 | 75 | const unsigned char* currPtr = ptr; |
| 76 | - uint8_t interpsNum = *ptr; | |
| 77 | - interps.clear(); | |
| 78 | - interps.reserve(interpsNum); | |
| 76 | + uint8_t interpTypesNum = *currPtr; | |
| 79 | 77 | currPtr++; |
| 80 | - // FIXME - to jest do poprawy | |
| 81 | - map<int, InterpsGroup> results; | |
| 82 | - for (unsigned int i = 0; i < interpsNum; ++i) { | |
| 83 | - EncodedInterpretation interp; | |
| 84 | - deserializeInterp(currPtr, interp); | |
| 85 | - if (results.count(interp.type) == 0) { | |
| 86 | - results[interp.type] = InterpsGroup(interp.type); | |
| 87 | - } | |
| 88 | - results[interp.type].addInterpretation(interp); | |
| 89 | -// interps.push_back(interp); | |
| 90 | - } | |
| 91 | - map<int, InterpsGroup>::iterator it; | |
| 92 | - for (it = results.begin(); it != results.end(); ++it) { | |
| 93 | - interps.push_back((*it).second); | |
| 78 | + interps.clear(); | |
| 79 | + interps.reserve(interpTypesNum); | |
| 80 | + for (unsigned int i = 0; i < interpTypesNum; i++) { | |
| 81 | + InterpsGroup ig; | |
| 82 | + ig.type = *currPtr; | |
| 83 | + currPtr++; | |
| 84 | + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); | |
| 85 | + currPtr += 2; | |
| 86 | + ig.ptr = currPtr; | |
| 87 | + currPtr += ig.size; | |
| 88 | + interps.push_back(ig); | |
| 94 | 89 | } |
| 95 | 90 | return currPtr - ptr; |
| 96 | 91 | } |
| 92 | + | |
| 93 | +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | |
| 94 | +// const unsigned char* currPtr = ptr; | |
| 95 | +// uint8_t interpsNum = *ptr; | |
| 96 | +// interps.clear(); | |
| 97 | +// interps.reserve(interpsNum); | |
| 98 | +// currPtr++; | |
| 99 | +// // FIXME - to jest do poprawy | |
| 100 | +// map<int, InterpsGroup> results; | |
| 101 | +// for (unsigned int i = 0; i < interpsNum; ++i) { | |
| 102 | +// EncodedInterpretation interp; | |
| 103 | +// deserializeInterp(currPtr, interp); | |
| 104 | +// if (results.count(interp.type) == 0) { | |
| 105 | +// results[interp.type] = InterpsGroup(interp.type); | |
| 106 | +// } | |
| 107 | +// results[interp.type].addInterpretation(interp); | |
| 108 | +//// interps.push_back(interp); | |
| 109 | +// } | |
| 110 | +// map<int, InterpsGroup>::iterator it; | |
| 111 | +// for (it = results.begin(); it != results.end(); ++it) { | |
| 112 | +// interps.push_back((*it).second); | |
| 113 | +// } | |
| 114 | +// return currPtr - ptr; | |
| 115 | +//} | |
| ... | ... |
nbproject/configurations.xml
| ... | ... | @@ -106,14 +106,20 @@ |
| 106 | 106 | </makeTool> |
| 107 | 107 | </makefileType> |
| 108 | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 109 | + <ccTool flags="1"> | |
| 110 | + </ccTool> | |
| 109 | 111 | </item> |
| 110 | 112 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 113 | + <ccTool flags="1"> | |
| 114 | + </ccTool> | |
| 111 | 115 | </item> |
| 112 | 116 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 113 | 117 | </item> |
| 114 | 118 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 115 | 119 | </item> |
| 116 | 120 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
| 121 | + <ccTool flags="1"> | |
| 122 | + </ccTool> | |
| 117 | 123 | </item> |
| 118 | 124 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
| 119 | 125 | ex="false" |
| ... | ... | @@ -169,7 +175,7 @@ |
| 169 | 175 | <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
| 170 | 176 | </item> |
| 171 | 177 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 172 | - <ccTool flags="1"> | |
| 178 | + <ccTool> | |
| 173 | 179 | <incDir> |
| 174 | 180 | <pElem>morfeusz</pElem> |
| 175 | 181 | <pElem>morfeusz/build/morfeusz</pElem> |
| ... | ... | @@ -180,7 +186,7 @@ |
| 180 | 186 | </ccTool> |
| 181 | 187 | </item> |
| 182 | 188 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
| 183 | - <ccTool flags="1"> | |
| 189 | + <ccTool> | |
| 184 | 190 | <incDir> |
| 185 | 191 | <pElem>morfeusz</pElem> |
| 186 | 192 | <pElem>morfeusz/build/morfeusz</pElem> |
| ... | ... | @@ -273,7 +279,7 @@ |
| 273 | 279 | <ccTool> |
| 274 | 280 | <incDir> |
| 275 | 281 | <pElem>morfeusz</pElem> |
| 276 | - <pElem>/usr/lib/jvm/default-java/include</pElem> | |
| 282 | + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
| 277 | 283 | </incDir> |
| 278 | 284 | <preprocessorList> |
| 279 | 285 | <Elem>libjmorfeusz_EXPORTS</Elem> |
| ... | ... | @@ -408,18 +414,26 @@ |
| 408 | 414 | </ccTool> |
| 409 | 415 | </item> |
| 410 | 416 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
| 417 | + <ccTool flags="1"> | |
| 418 | + </ccTool> | |
| 411 | 419 | </item> |
| 412 | 420 | <item path="morfeusz/charset/CharsetConverter.cpp" |
| 413 | 421 | ex="false" |
| 414 | 422 | tool="1" |
| 415 | 423 | flavor2="4"> |
| 424 | + <ccTool flags="1"> | |
| 425 | + </ccTool> | |
| 416 | 426 | </item> |
| 417 | 427 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
| 428 | + <ccTool flags="1"> | |
| 429 | + </ccTool> | |
| 418 | 430 | </item> |
| 419 | 431 | <item path="morfeusz/charset/conversion_tables.cpp" |
| 420 | 432 | ex="false" |
| 421 | 433 | tool="1" |
| 422 | 434 | flavor2="4"> |
| 435 | + <ccTool flags="1"> | |
| 436 | + </ccTool> | |
| 423 | 437 | </item> |
| 424 | 438 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
| 425 | 439 | <ccTool flags="1"> |
| ... | ... | @@ -508,8 +522,12 @@ |
| 508 | 522 | ex="false" |
| 509 | 523 | tool="1" |
| 510 | 524 | flavor2="4"> |
| 525 | + <ccTool flags="1"> | |
| 526 | + </ccTool> | |
| 511 | 527 | </item> |
| 512 | 528 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
| 529 | + <ccTool flags="1"> | |
| 530 | + </ccTool> | |
| 513 | 531 | </item> |
| 514 | 532 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
| 515 | 533 | <ccTool flags="0"> |
| ... | ... |