Commit f1e52ff44027610390237bd862449c85c4a044cc

Authored by Michał Lenart
1 parent de0e960d

poprawienie czasu działania, przebudowanie analizatora tak, by nie powielać kodu…

… w generatorze, poprawienie rozpoznawania pierwszego segmentu w grafie fleksyjnym

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
@@ -5,7 +5,7 @@ project (Morfeusz) @@ -5,7 +5,7 @@ project (Morfeusz)
5 set (Morfeusz_VERSION_MAJOR 2) 5 set (Morfeusz_VERSION_MAJOR 2)
6 set (Morfeusz_VERSION_MINOR 0) 6 set (Morfeusz_VERSION_MINOR 0)
7 set (Morfeusz_VERSION_PATCH 0) 7 set (Morfeusz_VERSION_PATCH 0)
8 -set (CMAKE_BUILD_TYPE "Debug") 8 +set (CMAKE_BUILD_TYPE "Release")
9 9
10 enable_testing() 10 enable_testing()
11 11
@@ -47,7 +47,7 @@ endif () @@ -47,7 +47,7 @@ endif ()
47 47
48 # SEGMENT_RULES_FILE 48 # SEGMENT_RULES_FILE
49 if ("${SEGMENT_RULES_FILE}" STREQUAL "") 49 if ("${SEGMENT_RULES_FILE}" STREQUAL "")
50 - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") 50 + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat")
51 endif () 51 endif ()
52 52
53 message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") 53 message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules")
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object):
40 self.suffixToAdd = bestEncodedForm.suffixToAdd 40 self.suffixToAdd = bestEncodedForm.suffixToAdd
41 self.prefixToAdd = targetWord[:bestPrefixLength] 41 self.prefixToAdd = targetWord[:bestPrefixLength]
42 42
43 -class Interpretation(object): 43 +class Interpretation4Analyzer(object):
44 44
45 def __init__(self, orth, base, tagnum, namenum, typenum): 45 def __init__(self, orth, base, tagnum, namenum, typenum):
46 - self.lemma = EncodedForm(orth, base) 46 + self.encodedForm = EncodedForm(orth, base)
47 self.tagnum = tagnum 47 self.tagnum = tagnum
48 self.namenum = namenum 48 self.namenum = namenum
49 self.typenum = typenum 49 self.typenum = typenum
50 50
51 def getSortKey(self): 51 def getSortKey(self):
52 return ( 52 return (
53 - self.lemma.cutLength,  
54 - tuple(self.lemma.suffixToAdd),  
55 - tuple(self.lemma.casePattern), 53 + self.encodedForm.cutLength,
  54 + tuple(self.encodedForm.suffixToAdd),
  55 + tuple(self.encodedForm.casePattern),
56 self.tagnum, 56 self.tagnum,
57 self.namenum) 57 self.namenum)
58 58
59 def __eq__(self, other): 59 def __eq__(self, other):
60 - if isinstance(other, Interpretation): 60 + if isinstance(other, Interpretation4Analyzer):
61 return self.getSortKey() == other.getSortKey() 61 return self.getSortKey() == other.getSortKey()
62 else: 62 else:
63 return False 63 return False
@@ -68,8 +68,8 @@ class Interpretation(object): @@ -68,8 +68,8 @@ class Interpretation(object):
68 class Interpretation4Generator(object): 68 class Interpretation4Generator(object):
69 69
70 def __init__(self, orth, base, tagnum, namenum, typenum): 70 def __init__(self, orth, base, tagnum, namenum, typenum):
71 - self.lemma = base  
72 - self.orth = EncodedFormWithPrefix(base, orth) 71 + self.encodedForm = base
  72 + self.encodedForm = EncodedFormWithPrefix(base, orth)
73 self.tagnum = tagnum 73 self.tagnum = tagnum
74 self.namenum = namenum 74 self.namenum = namenum
75 self.typenum = typenum 75 self.typenum = typenum
@@ -77,9 +77,9 @@ class Interpretation4Generator(object): @@ -77,9 +77,9 @@ class Interpretation4Generator(object):
77 def getSortKey(self): 77 def getSortKey(self):
78 return ( 78 return (
79 self.tagnum, 79 self.tagnum,
80 - self.orth.cutLength,  
81 - tuple(self.orth.suffixToAdd),  
82 -# tuple(self.lemma.casePattern), 80 + self.encodedForm.cutLength,
  81 + tuple(self.encodedForm.suffixToAdd),
  82 +# tuple(self.encodedForm.casePattern),
83 self.namenum) 83 self.namenum)
84 84
85 def __eq__(self, other): 85 def __eq__(self, other):
@@ -92,7 +92,7 @@ class Interpretation4Generator(object): @@ -92,7 +92,7 @@ class Interpretation4Generator(object):
92 return hash(self.getSortKey()) 92 return hash(self.getSortKey())
93 93
94 def __unicode__(self): 94 def __unicode__(self):
95 - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) 95 + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
96 96
97 def __repr__(self): 97 def __repr__(self):
98 return unicode(self) 98 return unicode(self)
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -4,7 +4,7 @@ Created on Oct 23, 2013 @@ -4,7 +4,7 @@ Created on Oct 23, 2013
4 @author: mlenart 4 @author: mlenart
5 ''' 5 '''
6 import logging 6 import logging
7 -from common import Interpretation 7 +from common import Interpretation4Analyzer
8 from morfeuszbuilder.fsa.common import Interpretation4Generator 8 from morfeuszbuilder.fsa.common import Interpretation4Generator
9 9
10 def _mergeEntries(inputLines): 10 def _mergeEntries(inputLines):
@@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object):
74 tagnum = int(tagnum) 74 tagnum = int(tagnum)
75 namenum = int(namenum) 75 namenum = int(namenum)
76 typenum = int(typenum) 76 typenum = int(typenum)
77 - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum)) 77 + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum))
78 78
79 def convert(self, inputLines): 79 def convert(self, inputLines):
80 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) 80 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))))
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -5,6 +5,7 @@ Created on Oct 23, 2013 @@ -5,6 +5,7 @@ Created on Oct 23, 2013
5 ''' 5 '''
6 6
7 import logging 7 import logging
  8 +from morfeuszbuilder.utils import serializationUtils
8 9
9 class Encoder(object): 10 class Encoder(object):
10 ''' 11 '''
@@ -96,6 +97,54 @@ class Encoder(object): @@ -96,6 +97,54 @@ class Encoder(object):
96 def _encodeNameNum(self, namenum): 97 def _encodeNameNum(self, namenum):
97 assert namenum < 256 and namenum >= 0 98 assert namenum < 256 and namenum >= 0
98 return bytearray([namenum]) 99 return bytearray([namenum])
  100 +
  101 + def _groupInterpsByType(self, interpsList):
  102 + res = {}
  103 + for interp in interpsList:
  104 + res.setdefault(interp.typenum, [])
  105 + res[interp.typenum].append(interp)
  106 + return res
  107 +
  108 + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix):
  109 + res = bytearray()
  110 + res.extend(self._encodeTypeNum(typenum))
  111 +
  112 + encodedInterpsList = bytearray()
  113 + for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  114 + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  115 + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  116 + encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  117 +
  118 + res.extend(serializationUtils.htons(len(encodedInterpsList)))
  119 + res.extend(encodedInterpsList)
  120 + return res
  121 +
  122 + def _doEncodeData(self, interpsList, withCasePattern, withPrefix):
  123 +
  124 + assert type(interpsList) == frozenset
  125 +
  126 + segnum2Interps = self._groupInterpsByType(interpsList)
  127 +
  128 +
  129 + res = bytearray()
  130 + firstByte = len(segnum2Interps)
  131 + assert firstByte < 256
  132 + assert firstByte > 0
  133 + res.append(firstByte)
  134 +
  135 + for typenum, interpsList in segnum2Interps.iteritems():
  136 + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix))
  137 +
  138 +
  139 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  140 +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum))
  141 +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  142 +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  143 +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  144 + del interpsList
  145 +# res.extend(serializationUtils.htons(len(encodedInterpsList)))
  146 +# res.extend(encodedInterpsList)
  147 + return res
99 148
100 class MorphEncoder(Encoder): 149 class MorphEncoder(Encoder):
101 150
@@ -106,19 +155,20 @@ class MorphEncoder(Encoder): @@ -106,19 +155,20 @@ class MorphEncoder(Encoder):
106 self.LEMMA_MIXED_CASE = 2 155 self.LEMMA_MIXED_CASE = 2
107 156
108 def encodeData(self, interpsList): 157 def encodeData(self, interpsList):
109 - res = bytearray()  
110 - firstByte = len(interpsList)  
111 - assert firstByte < 256  
112 - assert firstByte > 0  
113 - res.append(firstByte)  
114 - assert type(interpsList) == frozenset  
115 - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):  
116 - res.extend(self._encodeTypeNum(interp.typenum))  
117 - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False))  
118 - res.extend(self._encodeTagNum(interp.tagnum))  
119 - res.extend(self._encodeNameNum(interp.namenum))  
120 - del interpsList  
121 - return res 158 + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False)
  159 +# res = bytearray()
  160 +# firstByte = len(interpsList)
  161 +# assert firstByte < 256
  162 +# assert firstByte > 0
  163 +# res.append(firstByte)
  164 +# assert type(interpsList) == frozenset
  165 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  166 +# res.extend(self._encodeTypeNum(interp.typenum))
  167 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False))
  168 +# res.extend(self._encodeTagNum(interp.tagnum))
  169 +# res.extend(self._encodeNameNum(interp.namenum))
  170 +# del interpsList
  171 +# return res
122 172
123 class Encoder4Generator(Encoder): 173 class Encoder4Generator(Encoder):
124 174
@@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder):
126 super(Encoder4Generator, self).__init__(encoding) 176 super(Encoder4Generator, self).__init__(encoding)
127 177
128 def encodeData(self, interpsList): 178 def encodeData(self, interpsList):
129 - res = bytearray()  
130 - firstByte = len(interpsList)  
131 - assert firstByte < 256  
132 - assert firstByte > 0  
133 - res.append(firstByte)  
134 - assert type(interpsList) == frozenset  
135 - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):  
136 - res.extend(self._encodeTypeNum(interp.typenum))  
137 - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True))  
138 - res.extend(self._encodeTagNum(interp.tagnum))  
139 - res.extend(self._encodeNameNum(interp.namenum))  
140 - return res 179 + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True)
  180 +# res = bytearray()
  181 +# firstByte = len(interpsList)
  182 +# assert firstByte < 256
  183 +# assert firstByte > 0
  184 +# res.append(firstByte)
  185 +# assert type(interpsList) == frozenset
  186 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  187 +# res.extend(self._encodeTypeNum(interp.typenum))
  188 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True))
  189 +# res.extend(self._encodeTagNum(interp.tagnum))
  190 +# res.extend(self._encodeNameNum(interp.namenum))
  191 +# return res
141 # 192 #
142 # def decodeData(self, data): 193 # def decodeData(self, data):
143 # 194 #
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -106,7 +106,7 @@ class Segtypes(object): @@ -106,7 +106,7 @@ class Segtypes(object):
106 lineNum, 106 lineNum,
107 re.match(r'[a-z_]+', segtype)) 107 re.match(r'[a-z_]+', segtype))
108 self._validate( 108 self._validate(
109 - u'Pattern must contain lemma and part-of-speech fields', 109 + u'Pattern must contain encodedForm and part-of-speech fields',
110 lineNum, 110 lineNum,
111 re.match(r'.+\:[a-z_]+', pattern, re.U)) 111 re.match(r'.+\:[a-z_]+', pattern, re.U))
112 112
@@ -146,13 +146,13 @@ class Segtypes(object): @@ -146,13 +146,13 @@ class Segtypes(object):
146 146
147 # index lexemes 147 # index lexemes
148 for p in self.patternsList: 148 for p in self.patternsList:
149 - if p.lemma: 149 + if p.encodedForm:
150 for tag in self.tagset.getAllTags(): 150 for tag in self.tagset.getAllTags():
151 tagnum = self.tagset.getTagnum4Tag(tag) 151 tagnum = self.tagset.getTagnum4Tag(tag)
152 - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum:  
153 - segnum = p.tryToMatch(p.lemma, tag) 152 + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum:
  153 + segnum = p.tryToMatch(p.encodedForm, tag)
154 if segnum != -1: 154 if segnum != -1:
155 - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum 155 + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum
156 # logging.info('indexing segment type numbers - done') 156 # logging.info('indexing segment type numbers - done')
157 # self._debugSegnums() 157 # self._debugSegnums()
158 158
@@ -171,7 +171,7 @@ class Segtypes(object): @@ -171,7 +171,7 @@ class Segtypes(object):
171 class SegtypePattern(object): 171 class SegtypePattern(object):
172 172
173 def __init__(self, lemma, pattern, segnum): 173 def __init__(self, lemma, pattern, segnum):
174 - self.lemma = lemma 174 + self.encodedForm = lemma
175 self.pattern = pattern 175 self.pattern = pattern
176 self.segnum = segnum 176 self.segnum = segnum
177 177
@@ -181,7 +181,7 @@ class SegtypePattern(object): @@ -181,7 +181,7 @@ class SegtypePattern(object):
181 patterns2Match = [] 181 patterns2Match = []
182 patterns2Match.append(self.pattern.replace('%', '.*')) 182 patterns2Match.append(self.pattern.replace('%', '.*'))
183 patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) 183 patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*'))
184 - if (self.lemma is None or self.lemma == lemma) \ 184 + if (self.encodedForm is None or self.encodedForm == lemma) \
185 and any([re.match(p, tag) for p in patterns2Match]): 185 and any([re.match(p, tag) for p in patterns2Match]):
186 return self.segnum 186 return self.segnum
187 else: 187 else:
input/dodatki.tab
  1 +0 0 dig
  2 +1 1 dig
  3 +2 2 dig
  4 +3 3 dig
  5 +4 4 dig
  6 +5 5 dig
  7 +6 6 dig
  8 +7 7 dig
  9 +8 8 dig
  10 +9 9 dig
1 ń on ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep 11 ń on ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
2 by by qub 12 by by qub
3 naj naj 13 naj naj
input/segmenty.dat
@@ -142,6 +142,7 @@ samodz dywiz adj @@ -142,6 +142,7 @@ samodz dywiz adj
142 # Stopień najwyższy: 142 # Stopień najwyższy:
143 # np. „naj·zieleńszy”, „naj·mądrzej” 143 # np. „naj·zieleńszy”, „naj·mądrzej”
144 moze_interp( naj> adj_sup ) 144 moze_interp( naj> adj_sup )
  145 +moze_interp( nie> naj> adj_sup )
145 # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj 146 # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj
146 moze_interp( praet_sg dywiz li) 147 moze_interp( praet_sg dywiz li)
147 moze_interp( praet_pl dywiz li) 148 moze_interp( praet_pl dywiz li)
input/segmenty1.dat
@@ -52,11 +52,14 @@ naj naj @@ -52,11 +52,14 @@ naj naj
52 nie nie 52 nie nie
53 prefs prefs 53 prefs prefs
54 prefv prefv 54 prefv prefv
  55 +prefa prefa
55 dig dig 56 dig dig
56 adja adja 57 adja adja
57 adj adj:%:pos 58 adj adj:%:pos
58 adj_sup adj:%:sup 59 adj_sup adj:%:sup
59 adj_sup adv:sup 60 adj_sup adv:sup
  61 +adj_com adj:%:com
  62 +adj_com adj:%:com
60 negat ger:%:neg 63 negat ger:%:neg
61 negat pact:%:neg 64 negat pact:%:neg
62 negat ppas:%:neg 65 negat ppas:%:neg
@@ -69,6 +72,22 @@ interp interp @@ -69,6 +72,22 @@ interp interp
69 aglsg aglt:sg:% 72 aglsg aglt:sg:%
70 aglpl aglt:pl:% 73 aglpl aglt:pl:%
71 samodz % 74 samodz %
  75 +praet_fin praet:%
  76 +praet_fin fin:%
  77 +li li:qub:%
  78 +nomina subst:%
  79 +nomina ger:%
  80 +nomina depr:%
  81 +adjectiva adj:%
  82 +adjectiva adv:%
  83 +adjectiva ppas:%
  84 +adjectiva pact:%
  85 +verba_imperf praet:%:imperf
  86 +verba_imperf fin:%:imperf
  87 +verba_imperf inf:imperf
  88 +verba_imperf imps:imperf
  89 +verba_imperf impt:imperf
  90 +
72 91
73 [lexemes] 92 [lexemes]
74 z_aglt aby:comp 93 z_aglt aby:comp
morfeusz/EncodedInterpretation.hpp
@@ -28,7 +28,6 @@ struct EncodedForm { @@ -28,7 +28,6 @@ struct EncodedForm {
28 */ 28 */
29 struct EncodedInterpretation { 29 struct EncodedInterpretation {
30 EncodedForm value; 30 EncodedForm value;
31 - unsigned char type;  
32 int tag; 31 int tag;
33 int nameClassifier; 32 int nameClassifier;
34 }; 33 };
morfeusz/Environment.cpp
@@ -13,10 +13,12 @@ @@ -13,10 +13,12 @@
13 //class InterpretedChunksDecoder4Analyzer; 13 //class InterpretedChunksDecoder4Analyzer;
14 //class InterpretedChunksDecoder4Generator; 14 //class InterpretedChunksDecoder4Generator;
15 15
16 -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() {  
17 - static Deserializer < vector < InterpsGroup > > *deserializer 16 +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) {
  17 + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer
18 = new MorphDeserializer(); 18 = new MorphDeserializer();
19 - return deserializer; 19 + static Deserializer < vector < InterpsGroup > > *generatorDeserializer
  20 + = new MorphDeserializer();
  21 + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer);
20 } 22 }
21 23
22 static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { 24 static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
@@ -48,14 +50,15 @@ Environment::Environment( @@ -48,14 +50,15 @@ Environment::Environment(
48 caseConverter(), 50 caseConverter(),
49 tagset(fsaFileStartPtr), 51 tagset(fsaFileStartPtr),
50 fsaFileStartPtr(fsaFileStartPtr), 52 fsaFileStartPtr(fsaFileStartPtr),
51 - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), 53 + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
52 segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), 54 segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
53 currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), 55 currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
54 isFromFile(false), 56 isFromFile(false),
55 chunksDecoder( 57 chunksDecoder(
56 processorType == ANALYZER 58 processorType == ANALYZER
57 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) 59 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
58 - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) 60 + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
  61 + processorType(processorType)
59 { 62 {
60 } 63 }
61 64
@@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) {
110 delete this->fsaFileStartPtr; 113 delete this->fsaFileStartPtr;
111 } 114 }
112 this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); 115 this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
113 - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); 116 + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
114 this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); 117 this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
115 this->isFromFile = true; 118 this->isFromFile = true;
116 } 119 }
morfeusz/Environment.hpp
@@ -64,6 +64,7 @@ private: @@ -64,6 +64,7 @@ private:
64 bool isFromFile; 64 bool isFromFile;
65 65
66 const InterpretedChunksDecoder* chunksDecoder; 66 const InterpretedChunksDecoder* chunksDecoder;
  67 + MorfeuszProcessorType processorType;
67 68
68 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; 69 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
69 }; 70 };
morfeusz/FlexionGraph.cpp
@@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) {
15 this->graph.push_back(vector<Edge>()); 15 this->graph.push_back(vector<Edge>());
16 this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); 16 this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr);
17 } 17 }
18 -// cerr << string(e.chunk.chunkStartPtr) << endl;  
19 assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); 18 assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr);
20 this->graph[0].push_back(e); 19 this->graph[0].push_back(e);
21 } 20 }
@@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) {
30 this->graph[startNode].push_back(e); 29 this->graph[startNode].push_back(e);
31 } 30 }
32 31
  32 +static inline bool chunkIsAtFront(
  33 + const InterpretedChunk& chunk,
  34 + const std::vector<InterpretedChunk>& path) {
  35 + unsigned int i;
  36 + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) {
  37 + }
  38 + assert(!path[i].orthWasShifted);
  39 + return &chunk == &(path[i]);
  40 +}
  41 +
  42 +static inline bool chunkIsAtBack(
  43 + const InterpretedChunk& chunk,
  44 + const std::vector<InterpretedChunk>& path) {
  45 + return &chunk == &(path.back());
  46 +}
  47 +
  48 +static inline bool chunkIsTheOnlyOne(
  49 + const InterpretedChunk& chunk,
  50 + const std::vector<InterpretedChunk>& path) {
  51 + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
  52 +}
  53 +
33 void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { 54 void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
34 // debugPath(path); 55 // debugPath(path);
35 // debugGraph(this->graph); 56 // debugGraph(this->graph);
36 for (unsigned int i = 0; i < path.size(); i++) { 57 for (unsigned int i = 0; i < path.size(); i++) {
37 const InterpretedChunk& chunk = path[i]; 58 const InterpretedChunk& chunk = path[i];
38 if (!chunk.orthWasShifted) { 59 if (!chunk.orthWasShifted) {
39 - if (&chunk == &(path.front())  
40 - && &chunk == &(path.back())) { 60 + if (chunkIsTheOnlyOne(chunk, path)) {
41 Edge e = {chunk, UINT_MAX}; 61 Edge e = {chunk, UINT_MAX};
42 this->addStartEdge(e); 62 this->addStartEdge(e);
43 } 63 }
44 - else if (&chunk == &(path.front())) { 64 + else if (chunkIsAtFront(chunk, path)) {
45 Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; 65 Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()};
46 this->addStartEdge(e); 66 this->addStartEdge(e);
47 } 67 }
48 - else if (&chunk == &(path.back())) { 68 + else if (chunkIsAtBack(chunk, path)) {
49 Edge e = {chunk, UINT_MAX}; 69 Edge e = {chunk, UINT_MAX};
50 this->addMiddleEdge((unsigned int) this->graph.size(), e); 70 this->addMiddleEdge((unsigned int) this->graph.size(), e);
51 } 71 }
morfeusz/InterpretedChunksDecoder.hpp
@@ -18,6 +18,10 @@ @@ -18,6 +18,10 @@
18 #include "charset/CaseConverter.hpp" 18 #include "charset/CaseConverter.hpp"
19 #include "Environment.hpp" 19 #include "Environment.hpp"
20 20
  21 +const uint8_t LEMMA_ONLY_LOWER = 0;
  22 +const uint8_t LEMMA_UPPER_PREFIX = 1;
  23 +const uint8_t LEMMA_MIXED_CASE = 2;
  24 +
21 class InterpretedChunksDecoder { 25 class InterpretedChunksDecoder {
22 public: 26 public:
23 27
@@ -30,22 +34,12 @@ public: @@ -30,22 +34,12 @@ public:
30 unsigned int endNode, 34 unsigned int endNode,
31 const InterpretedChunk& interpretedChunk, 35 const InterpretedChunk& interpretedChunk,
32 std::vector<MorphInterpretation>& out) const = 0; 36 std::vector<MorphInterpretation>& out) const = 0;
33 -  
34 - virtual ~InterpretedChunksDecoder() {}  
35 37
36 -protected:  
37 -  
38 - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {  
39 - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {  
40 - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];  
41 - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);  
42 - decodeForm(  
43 - prefixChunk.lowercaseCodepoints,  
44 - prefixChunk.interpsGroup.interps[0].value,  
45 - decodedForm);  
46 - } 38 + virtual ~InterpretedChunksDecoder() {
47 } 39 }
48 - 40 +
  41 +protected:
  42 +
49 virtual void decodeForm( 43 virtual void decodeForm(
50 const std::vector<uint32_t>& orth, 44 const std::vector<uint32_t>& orth,
51 const EncodedForm& form, 45 const EncodedForm& form,
@@ -55,9 +49,10 @@ protected: @@ -55,9 +49,10 @@ protected:
55 }; 49 };
56 50
57 class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { 51 class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
58 -  
59 public: 52 public:
60 - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} 53 +
  54 + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
  55 + }
61 56
62 void decode( 57 void decode(
63 unsigned int startNode, 58 unsigned int startNode,
@@ -65,22 +60,12 @@ public: @@ -65,22 +60,12 @@ public:
65 const InterpretedChunk& interpretedChunk, 60 const InterpretedChunk& interpretedChunk,
66 std::vector<MorphInterpretation>& out) const { 61 std::vector<MorphInterpretation>& out) const {
67 string orth; 62 string orth;
68 - string lemma;  
69 - convertPrefixes(interpretedChunk, orth, lemma); 63 + string lemmaPrefix;
  64 + convertPrefixes(interpretedChunk, orth, lemmaPrefix);
70 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); 65 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
71 - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {  
72 - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];  
73 - decodeForm(  
74 - interpretedChunk.lowercaseCodepoints,  
75 - ei.value,  
76 - lemma);  
77 - out.push_back(MorphInterpretation(  
78 - startNode, endNode,  
79 - orth, lemma,  
80 - ei.tag,  
81 - ei.nameClassifier,  
82 - env.getTagset(),  
83 - env.getCharsetConverter())); 66 + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr;
  67 + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) {
  68 + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr));
84 } 69 }
85 } 70 }
86 71
@@ -104,36 +89,116 @@ protected: @@ -104,36 +89,116 @@ protected:
104 env.getCharsetConverter().append(cp, res); 89 env.getCharsetConverter().append(cp, res);
105 } 90 }
106 } 91 }
  92 +
  93 +private:
  94 +
  95 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
  96 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  97 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  98 + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
  99 + const unsigned char* ptr = prefixChunk.interpsGroup.ptr;
  100 + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr);
  101 + decodedForm += mi.getLemma();
  102 + }
  103 + }
  104 +
  105 + MorphInterpretation decodeMorphInterpretation(
  106 + unsigned int startNode, unsigned int endNode,
  107 + const string& orth,
  108 + const string& lemmaPrefix,
  109 + const InterpretedChunk& chunk,
  110 + const unsigned char*& ptr) const {
  111 + string lemma = lemmaPrefix;
  112 + EncodedInterpretation ei = this->decodeInterp(ptr);
  113 + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
  114 + return MorphInterpretation(
  115 + startNode, endNode,
  116 + orth, lemma,
  117 + ei.tag,
  118 + ei.nameClassifier,
  119 + env.getTagset(),
  120 + env.getCharsetConverter());
  121 + }
  122 +
  123 + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const {
  124 + lemma.suffixToCut = *ptr;
  125 + ptr++;
  126 + lemma.suffixToAdd = (const char*) ptr;
  127 + ptr += strlen((const char*) ptr) + 1;
  128 + assert(lemma.casePattern.size() == 0);
  129 + // lemma.casePattern.resize(MAX_WORD_SIZE, false);
  130 + uint8_t casePatternType = *ptr;
  131 + ptr++;
  132 + uint8_t prefixLength;
  133 + uint8_t patternLength;
  134 + switch (casePatternType) {
  135 + case LEMMA_ONLY_LOWER:
  136 + break;
  137 + case LEMMA_UPPER_PREFIX:
  138 + prefixLength = *ptr;
  139 + ptr++;
  140 + for (unsigned int i = 0; i < prefixLength; i++) {
  141 + // lemma.casePattern[i] = true;
  142 + lemma.casePattern.push_back(true);
  143 + }
  144 + // lemma.casePattern.resize(prefixLength, true);
  145 + break;
  146 + case LEMMA_MIXED_CASE:
  147 + patternLength = *ptr;
  148 + ptr++;
  149 + for (unsigned int i = 0; i < patternLength; i++) {
  150 + uint8_t idx = *ptr;
  151 + ptr++;
  152 + // lemma.casePattern[idx] = true;
  153 + lemma.casePattern.resize(idx + 1, false);
  154 + lemma.casePattern[idx] = true;
  155 + }
  156 + break;
  157 + }
  158 + }
  159 +
  160 + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const {
  161 + EncodedInterpretation interp;
  162 + decodeLemma(ptr, interp.value);
  163 + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr)));
  164 + ptr += 2;
  165 + interp.nameClassifier = *ptr;
  166 + ptr++;
  167 + return interp;
  168 + }
107 }; 169 };
108 170
109 class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { 171 class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
110 -  
111 public: 172 public:
112 - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} 173 +
  174 + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
  175 + }
113 176
114 void decode( 177 void decode(
115 unsigned int startNode, 178 unsigned int startNode,
116 unsigned int endNode, 179 unsigned int endNode,
117 const InterpretedChunk& interpretedChunk, 180 const InterpretedChunk& interpretedChunk,
118 std::vector<MorphInterpretation>& out) const { 181 std::vector<MorphInterpretation>& out) const {
119 - string orth;  
120 - string lemma;  
121 - convertPrefixes(interpretedChunk, lemma, orth);  
122 - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);  
123 - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {  
124 - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];  
125 - decodeForm(  
126 - interpretedChunk.originalCodepoints,  
127 - ei.value,  
128 - orth);  
129 - out.push_back(MorphInterpretation(  
130 - startNode, endNode,  
131 - orth, lemma,  
132 - ei.tag,  
133 - ei.nameClassifier,  
134 - env.getTagset(),  
135 - env.getCharsetConverter()));  
136 - } 182 + // string orth;
  183 + // string lemma;
  184 + // convertPrefixes(interpretedChunk, lemma, orth);
  185 + // size_t orthLength = orth.length();
  186 + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  187 + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
  188 + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
  189 + // decodeForm(
  190 + // interpretedChunk.originalCodepoints,
  191 + // ei.value,
  192 + // orth);
  193 + // out.push_back(MorphInterpretation(
  194 + // startNode, endNode,
  195 + // orth, lemma,
  196 + // ei.tag,
  197 + // ei.nameClassifier,
  198 + // env.getTagset(),
  199 + // env.getCharsetConverter()));
  200 + // orth.erase(orthLength);
  201 + // }
137 } 202 }
138 203
139 private: 204 private:
morfeusz/InterpsGroup.hpp
@@ -14,24 +14,26 @@ @@ -14,24 +14,26 @@
14 #include "MorphInterpretation.hpp" 14 #include "MorphInterpretation.hpp"
15 #include "Tagset.hpp" 15 #include "Tagset.hpp"
16 16
17 -class InterpsGroup {  
18 -public:  
19 -  
20 - InterpsGroup() {  
21 -  
22 - }  
23 -  
24 - explicit InterpsGroup(const unsigned char type)  
25 - : type(type) {  
26 -  
27 - }  
28 -  
29 - void addInterpretation(const EncodedInterpretation& interp) {  
30 - interps.push_back(interp);  
31 - } 17 +struct InterpsGroup {
  18 +//public:
  19 +//
  20 +// InterpsGroup() {
  21 +//
  22 +// }
  23 +//
  24 +// explicit InterpsGroup(const unsigned char type)
  25 +// : type(type) {
  26 +//
  27 +// }
  28 +//
  29 +// void addInterpretation(const EncodedInterpretation& interp) {
  30 +// interps.push_back(interp);
  31 +// }
32 32
33 unsigned char type; 33 unsigned char type;
34 - std::vector<EncodedInterpretation> interps; 34 + uint16_t size;
  35 + const unsigned char* ptr;
  36 +// std::vector<EncodedInterpretation> interps;
35 }; 37 };
36 38
37 #endif /* GROUPEDINTERPRETATIONS_HPP */ 39 #endif /* GROUPEDINTERPRETATIONS_HPP */
morfeusz/Morfeusz.cpp
@@ -82,7 +82,9 @@ void Morfeusz::processOneWord( @@ -82,7 +82,9 @@ void Morfeusz::processOneWord(
82 FlexionGraph graph; 82 FlexionGraph graph;
83 const char* currInput = inputStart; 83 const char* currInput = inputStart;
84 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); 84 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
  85 +
85 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); 86 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
  87 +
86 if (!graph.empty()) { 88 if (!graph.empty()) {
87 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); 89 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
88 int srcNode = startNodeNum; 90 int srcNode = startNodeNum;
@@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
110 from.prefixChunks.end()); 112 from.prefixChunks.end());
111 to.prefixChunks.push_back(from); 113 to.prefixChunks.push_back(from);
112 from.orthWasShifted = true; 114 from.orthWasShifted = true;
  115 + to.chunkStartPtr = from.chunkStartPtr;
113 } 116 }
114 117
115 void Morfeusz::doProcessOneWord( 118 void Morfeusz::doProcessOneWord(
@@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord(
119 SegrulesState segrulesState, 122 SegrulesState segrulesState,
120 vector<InterpretedChunk>& accum, 123 vector<InterpretedChunk>& accum,
121 FlexionGraph& graph) const { 124 FlexionGraph& graph) const {
122 - cerr << "doAnalyzeOneWord " << inputData << endl;  
123 - bool endOfProcessing = inputData == inputEnd; 125 +// cerr << "doAnalyzeOneWord " << inputData << endl;
124 const char* currInput = inputData; 126 const char* currInput = inputData;
125 - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd);  
126 - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); 127 + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
127 vector<uint32_t> originalCodepoints; 128 vector<uint32_t> originalCodepoints;
128 vector<uint32_t> lowercaseCodepoints; 129 vector<uint32_t> lowercaseCodepoints;
129 130
130 StateType state = env.getFSA().getInitialState(); 131 StateType state = env.getFSA().getInitialState();
131 132
132 - while (!endOfProcessing) {  
133 - if (isEndOfWord(codepoint)) {  
134 - endOfProcessing = true;  
135 - }  
136 - cerr << "not end of word '" << string(currInput) << "'" << endl; 133 + while (!isEndOfWord(codepoint)) {
137 uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); 134 uint32_t lowerCP = env.getCaseConverter().toLower(codepoint);
138 originalCodepoints.push_back(codepoint); 135 originalCodepoints.push_back(codepoint);
139 lowercaseCodepoints.push_back(lowerCP); 136 lowercaseCodepoints.push_back(lowerCP);
140 feedState(state, lowerCP, UTF8CharsetConverter()); 137 feedState(state, lowerCP, UTF8CharsetConverter());
141 codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); 138 codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
142 if (state.isAccepting()) { 139 if (state.isAccepting()) {
143 - cerr << "accepting" << endl;  
144 vector<InterpsGroup> val(state.getValue()); 140 vector<InterpsGroup> val(state.getValue());
145 for (unsigned int i = 0; i < val.size(); i++) { 141 for (unsigned int i = 0; i < val.size(); i++) {
146 InterpsGroup& ig = val[i]; 142 InterpsGroup& ig = val[i];
@@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord(
151 it != newSegrulesStates.end(); 147 it != newSegrulesStates.end();
152 ++it) { 148 ++it) {
153 SegrulesState newSegrulesState = *it; 149 SegrulesState newSegrulesState = *it;
  150 +// if (newSegrulesState.shiftOrthFromPrevious) {
  151 +//
  152 +// }
154 InterpretedChunk ic = { 153 InterpretedChunk ic = {
155 inputData, 154 inputData,
156 originalCodepoints, 155 originalCodepoints,
@@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord(
165 } 164 }
166 accum.push_back(ic); 165 accum.push_back(ic);
167 if (isEndOfWord(codepoint)) { 166 if (isEndOfWord(codepoint)) {
168 - cerr << "end of word inside " << currInput <<endl;  
169 if (newSegrulesState.accepting) 167 if (newSegrulesState.accepting)
170 graph.addPath(accum); 168 graph.addPath(accum);
171 } 169 }
@@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord(
177 } 175 }
178 } 176 }
179 } 177 }
  178 + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
180 } 179 }
181 - cerr << "end of word " << currInput << endl;  
182 inputData = currInput; 180 inputData = currInput;
183 } 181 }
184 182
morfeusz/MorphDeserializer.cpp
@@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() {
23 MorphDeserializer::~MorphDeserializer() { 23 MorphDeserializer::~MorphDeserializer() {
24 } 24 }
25 25
26 -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {  
27 - // XXX uważać na poprawność danych  
28 - lemma.suffixToCut = *ptr;  
29 - ptr++;  
30 - lemma.suffixToAdd = (const char*) ptr;  
31 - ptr += strlen((const char*) ptr) + 1;  
32 - assert(lemma.casePattern.size() == 0);  
33 -// lemma.casePattern.resize(MAX_WORD_SIZE, false);  
34 - uint8_t casePatternType = *ptr;  
35 - ptr++;  
36 - uint8_t prefixLength;  
37 - uint8_t patternLength;  
38 - switch (casePatternType) {  
39 - case LEMMA_ONLY_LOWER:  
40 - break;  
41 - case LEMMA_UPPER_PREFIX:  
42 - prefixLength = *ptr;  
43 - ptr++;  
44 - for (unsigned int i = 0; i < prefixLength; i++) {  
45 -// lemma.casePattern[i] = true;  
46 - lemma.casePattern.push_back(true);  
47 - }  
48 -// lemma.casePattern.resize(prefixLength, true);  
49 - break;  
50 - case LEMMA_MIXED_CASE:  
51 - patternLength = *ptr;  
52 - ptr++;  
53 - for (unsigned int i = 0; i < patternLength; i++) {  
54 - uint8_t idx = *ptr;  
55 - ptr++; 26 +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
  27 +// // XXX uważać na poprawność danych
  28 +// lemma.suffixToCut = *ptr;
  29 +// ptr++;
  30 +// lemma.suffixToAdd = (const char*) ptr;
  31 +// ptr += strlen((const char*) ptr) + 1;
  32 +// assert(lemma.casePattern.size() == 0);
  33 +//// lemma.casePattern.resize(MAX_WORD_SIZE, false);
  34 +// uint8_t casePatternType = *ptr;
  35 +// ptr++;
  36 +// uint8_t prefixLength;
  37 +// uint8_t patternLength;
  38 +// switch (casePatternType) {
  39 +// case LEMMA_ONLY_LOWER:
  40 +// break;
  41 +// case LEMMA_UPPER_PREFIX:
  42 +// prefixLength = *ptr;
  43 +// ptr++;
  44 +// for (unsigned int i = 0; i < prefixLength; i++) {
  45 +//// lemma.casePattern[i] = true;
  46 +// lemma.casePattern.push_back(true);
  47 +// }
  48 +//// lemma.casePattern.resize(prefixLength, true);
  49 +// break;
  50 +// case LEMMA_MIXED_CASE:
  51 +// patternLength = *ptr;
  52 +// ptr++;
  53 +// for (unsigned int i = 0; i < patternLength; i++) {
  54 +// uint8_t idx = *ptr;
  55 +// ptr++;
  56 +//// lemma.casePattern[idx] = true;
  57 +// lemma.casePattern.resize(idx + 1, false);
56 // lemma.casePattern[idx] = true; 58 // lemma.casePattern[idx] = true;
57 - lemma.casePattern.resize(idx + 1, false);  
58 - lemma.casePattern[idx] = true;  
59 - }  
60 - break;  
61 - }  
62 -}  
63 -  
64 -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {  
65 - interp.type = *ptr;  
66 - ptr++;  
67 - deserializeLemma(ptr, interp.value);  
68 - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));  
69 - ptr += 2;  
70 - interp.nameClassifier = *ptr;  
71 - ptr++;  
72 -} 59 +// }
  60 +// break;
  61 +// }
  62 +//}
  63 +//
  64 +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  65 +// interp.type = *ptr;
  66 +// ptr++;
  67 +// deserializeLemma(ptr, interp.value);
  68 +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  69 +// ptr += 2;
  70 +// interp.nameClassifier = *ptr;
  71 +// ptr++;
  72 +//}
73 73
74 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { 74 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
75 const unsigned char* currPtr = ptr; 75 const unsigned char* currPtr = ptr;
76 - uint8_t interpsNum = *ptr;  
77 - interps.clear();  
78 - interps.reserve(interpsNum); 76 + uint8_t interpTypesNum = *currPtr;
79 currPtr++; 77 currPtr++;
80 - // FIXME - to jest do poprawy  
81 - map<int, InterpsGroup> results;  
82 - for (unsigned int i = 0; i < interpsNum; ++i) {  
83 - EncodedInterpretation interp;  
84 - deserializeInterp(currPtr, interp);  
85 - if (results.count(interp.type) == 0) {  
86 - results[interp.type] = InterpsGroup(interp.type);  
87 - }  
88 - results[interp.type].addInterpretation(interp);  
89 -// interps.push_back(interp);  
90 - }  
91 - map<int, InterpsGroup>::iterator it;  
92 - for (it = results.begin(); it != results.end(); ++it) {  
93 - interps.push_back((*it).second); 78 + interps.clear();
  79 + interps.reserve(interpTypesNum);
  80 + for (unsigned int i = 0; i < interpTypesNum; i++) {
  81 + InterpsGroup ig;
  82 + ig.type = *currPtr;
  83 + currPtr++;
  84 + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr)));
  85 + currPtr += 2;
  86 + ig.ptr = currPtr;
  87 + currPtr += ig.size;
  88 + interps.push_back(ig);
94 } 89 }
95 return currPtr - ptr; 90 return currPtr - ptr;
96 } 91 }
  92 +
  93 +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
  94 +// const unsigned char* currPtr = ptr;
  95 +// uint8_t interpsNum = *ptr;
  96 +// interps.clear();
  97 +// interps.reserve(interpsNum);
  98 +// currPtr++;
  99 +// // FIXME - to jest do poprawy
  100 +// map<int, InterpsGroup> results;
  101 +// for (unsigned int i = 0; i < interpsNum; ++i) {
  102 +// EncodedInterpretation interp;
  103 +// deserializeInterp(currPtr, interp);
  104 +// if (results.count(interp.type) == 0) {
  105 +// results[interp.type] = InterpsGroup(interp.type);
  106 +// }
  107 +// results[interp.type].addInterpretation(interp);
  108 +//// interps.push_back(interp);
  109 +// }
  110 +// map<int, InterpsGroup>::iterator it;
  111 +// for (it = results.begin(); it != results.end(); ++it) {
  112 +// interps.push_back((*it).second);
  113 +// }
  114 +// return currPtr - ptr;
  115 +//}
nbproject/configurations.xml
@@ -106,14 +106,20 @@ @@ -106,14 +106,20 @@
106 </makeTool> 106 </makeTool>
107 </makefileType> 107 </makefileType>
108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
  109 + <ccTool flags="1">
  110 + </ccTool>
109 </item> 111 </item>
110 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 112 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
  113 + <ccTool flags="1">
  114 + </ccTool>
111 </item> 115 </item>
112 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> 116 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
113 </item> 117 </item>
114 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 118 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
115 </item> 119 </item>
116 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> 120 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  121 + <ccTool flags="1">
  122 + </ccTool>
117 </item> 123 </item>
118 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" 124 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
119 ex="false" 125 ex="false"
@@ -169,7 +175,7 @@ @@ -169,7 +175,7 @@
169 <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> 175 <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
170 </item> 176 </item>
171 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> 177 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4">
172 - <ccTool flags="1"> 178 + <ccTool>
173 <incDir> 179 <incDir>
174 <pElem>morfeusz</pElem> 180 <pElem>morfeusz</pElem>
175 <pElem>morfeusz/build/morfeusz</pElem> 181 <pElem>morfeusz/build/morfeusz</pElem>
@@ -180,7 +186,7 @@ @@ -180,7 +186,7 @@
180 </ccTool> 186 </ccTool>
181 </item> 187 </item>
182 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> 188 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
183 - <ccTool flags="1"> 189 + <ccTool>
184 <incDir> 190 <incDir>
185 <pElem>morfeusz</pElem> 191 <pElem>morfeusz</pElem>
186 <pElem>morfeusz/build/morfeusz</pElem> 192 <pElem>morfeusz/build/morfeusz</pElem>
@@ -273,7 +279,7 @@ @@ -273,7 +279,7 @@
273 <ccTool> 279 <ccTool>
274 <incDir> 280 <incDir>
275 <pElem>morfeusz</pElem> 281 <pElem>morfeusz</pElem>
276 - <pElem>/usr/lib/jvm/default-java/include</pElem> 282 + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
277 </incDir> 283 </incDir>
278 <preprocessorList> 284 <preprocessorList>
279 <Elem>libjmorfeusz_EXPORTS</Elem> 285 <Elem>libjmorfeusz_EXPORTS</Elem>
@@ -408,18 +414,26 @@ @@ -408,18 +414,26 @@
408 </ccTool> 414 </ccTool>
409 </item> 415 </item>
410 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> 416 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  417 + <ccTool flags="1">
  418 + </ccTool>
411 </item> 419 </item>
412 <item path="morfeusz/charset/CharsetConverter.cpp" 420 <item path="morfeusz/charset/CharsetConverter.cpp"
413 ex="false" 421 ex="false"
414 tool="1" 422 tool="1"
415 flavor2="4"> 423 flavor2="4">
  424 + <ccTool flags="1">
  425 + </ccTool>
416 </item> 426 </item>
417 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> 427 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  428 + <ccTool flags="1">
  429 + </ccTool>
418 </item> 430 </item>
419 <item path="morfeusz/charset/conversion_tables.cpp" 431 <item path="morfeusz/charset/conversion_tables.cpp"
420 ex="false" 432 ex="false"
421 tool="1" 433 tool="1"
422 flavor2="4"> 434 flavor2="4">
  435 + <ccTool flags="1">
  436 + </ccTool>
423 </item> 437 </item>
424 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> 438 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
425 <ccTool flags="1"> 439 <ccTool flags="1">
@@ -508,8 +522,12 @@ @@ -508,8 +522,12 @@
508 ex="false" 522 ex="false"
509 tool="1" 523 tool="1"
510 flavor2="4"> 524 flavor2="4">
  525 + <ccTool flags="1">
  526 + </ccTool>
511 </item> 527 </item>
512 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> 528 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  529 + <ccTool flags="1">
  530 + </ccTool>
513 </item> 531 </item>
514 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> 532 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
515 <ccTool flags="0"> 533 <ccTool flags="0">