Commit f1e52ff44027610390237bd862449c85c4a044cc
1 parent: de0e960d
Commit message: poprawienie czasu działania, przebudowanie analizatora tak, by nie powielać kodu w generatorze, poprawienie rozpoznawania pierwszego segmentu w grafie fleksyjnym
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 17 changed files with 411 additions and 205 deletions.
CMakeLists.txt
@@ -5,7 +5,7 @@ project (Morfeusz) | @@ -5,7 +5,7 @@ project (Morfeusz) | ||
5 | set (Morfeusz_VERSION_MAJOR 2) | 5 | set (Morfeusz_VERSION_MAJOR 2) |
6 | set (Morfeusz_VERSION_MINOR 0) | 6 | set (Morfeusz_VERSION_MINOR 0) |
7 | set (Morfeusz_VERSION_PATCH 0) | 7 | set (Morfeusz_VERSION_PATCH 0) |
8 | -set (CMAKE_BUILD_TYPE "Debug") | 8 | +set (CMAKE_BUILD_TYPE "Release") |
9 | 9 | ||
10 | enable_testing() | 10 | enable_testing() |
11 | 11 | ||
@@ -47,7 +47,7 @@ endif () | @@ -47,7 +47,7 @@ endif () | ||
47 | 47 | ||
48 | # SEGMENT_RULES_FILE | 48 | # SEGMENT_RULES_FILE |
49 | if ("${SEGMENT_RULES_FILE}" STREQUAL "") | 49 | if ("${SEGMENT_RULES_FILE}" STREQUAL "") |
50 | - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") | 50 | + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat") |
51 | endif () | 51 | endif () |
52 | 52 | ||
53 | message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") | 53 | message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") |
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): | @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): | ||
40 | self.suffixToAdd = bestEncodedForm.suffixToAdd | 40 | self.suffixToAdd = bestEncodedForm.suffixToAdd |
41 | self.prefixToAdd = targetWord[:bestPrefixLength] | 41 | self.prefixToAdd = targetWord[:bestPrefixLength] |
42 | 42 | ||
43 | -class Interpretation(object): | 43 | +class Interpretation4Analyzer(object): |
44 | 44 | ||
45 | def __init__(self, orth, base, tagnum, namenum, typenum): | 45 | def __init__(self, orth, base, tagnum, namenum, typenum): |
46 | - self.lemma = EncodedForm(orth, base) | 46 | + self.encodedForm = EncodedForm(orth, base) |
47 | self.tagnum = tagnum | 47 | self.tagnum = tagnum |
48 | self.namenum = namenum | 48 | self.namenum = namenum |
49 | self.typenum = typenum | 49 | self.typenum = typenum |
50 | 50 | ||
51 | def getSortKey(self): | 51 | def getSortKey(self): |
52 | return ( | 52 | return ( |
53 | - self.lemma.cutLength, | ||
54 | - tuple(self.lemma.suffixToAdd), | ||
55 | - tuple(self.lemma.casePattern), | 53 | + self.encodedForm.cutLength, |
54 | + tuple(self.encodedForm.suffixToAdd), | ||
55 | + tuple(self.encodedForm.casePattern), | ||
56 | self.tagnum, | 56 | self.tagnum, |
57 | self.namenum) | 57 | self.namenum) |
58 | 58 | ||
59 | def __eq__(self, other): | 59 | def __eq__(self, other): |
60 | - if isinstance(other, Interpretation): | 60 | + if isinstance(other, Interpretation4Analyzer): |
61 | return self.getSortKey() == other.getSortKey() | 61 | return self.getSortKey() == other.getSortKey() |
62 | else: | 62 | else: |
63 | return False | 63 | return False |
@@ -68,8 +68,8 @@ class Interpretation(object): | @@ -68,8 +68,8 @@ class Interpretation(object): | ||
68 | class Interpretation4Generator(object): | 68 | class Interpretation4Generator(object): |
69 | 69 | ||
70 | def __init__(self, orth, base, tagnum, namenum, typenum): | 70 | def __init__(self, orth, base, tagnum, namenum, typenum): |
71 | - self.lemma = base | ||
72 | - self.orth = EncodedFormWithPrefix(base, orth) | 71 | + self.encodedForm = base |
72 | + self.encodedForm = EncodedFormWithPrefix(base, orth) | ||
73 | self.tagnum = tagnum | 73 | self.tagnum = tagnum |
74 | self.namenum = namenum | 74 | self.namenum = namenum |
75 | self.typenum = typenum | 75 | self.typenum = typenum |
@@ -77,9 +77,9 @@ class Interpretation4Generator(object): | @@ -77,9 +77,9 @@ class Interpretation4Generator(object): | ||
77 | def getSortKey(self): | 77 | def getSortKey(self): |
78 | return ( | 78 | return ( |
79 | self.tagnum, | 79 | self.tagnum, |
80 | - self.orth.cutLength, | ||
81 | - tuple(self.orth.suffixToAdd), | ||
82 | -# tuple(self.lemma.casePattern), | 80 | + self.encodedForm.cutLength, |
81 | + tuple(self.encodedForm.suffixToAdd), | ||
82 | +# tuple(self.encodedForm.casePattern), | ||
83 | self.namenum) | 83 | self.namenum) |
84 | 84 | ||
85 | def __eq__(self, other): | 85 | def __eq__(self, other): |
@@ -92,7 +92,7 @@ class Interpretation4Generator(object): | @@ -92,7 +92,7 @@ class Interpretation4Generator(object): | ||
92 | return hash(self.getSortKey()) | 92 | return hash(self.getSortKey()) |
93 | 93 | ||
94 | def __unicode__(self): | 94 | def __unicode__(self): |
95 | - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | 95 | + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) |
96 | 96 | ||
97 | def __repr__(self): | 97 | def __repr__(self): |
98 | return unicode(self) | 98 | return unicode(self) |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -4,7 +4,7 @@ Created on Oct 23, 2013 | @@ -4,7 +4,7 @@ Created on Oct 23, 2013 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | import logging | 6 | import logging |
7 | -from common import Interpretation | 7 | +from common import Interpretation4Analyzer |
8 | from morfeuszbuilder.fsa.common import Interpretation4Generator | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
9 | 9 | ||
10 | def _mergeEntries(inputLines): | 10 | def _mergeEntries(inputLines): |
@@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): | @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): | ||
74 | tagnum = int(tagnum) | 74 | tagnum = int(tagnum) |
75 | namenum = int(namenum) | 75 | namenum = int(namenum) |
76 | typenum = int(typenum) | 76 | typenum = int(typenum) |
77 | - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum)) | 77 | + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum)) |
78 | 78 | ||
79 | def convert(self, inputLines): | 79 | def convert(self, inputLines): |
80 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) | 80 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) |
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -5,6 +5,7 @@ Created on Oct 23, 2013 | @@ -5,6 +5,7 @@ Created on Oct 23, 2013 | ||
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | import logging | 7 | import logging |
8 | +from morfeuszbuilder.utils import serializationUtils | ||
8 | 9 | ||
9 | class Encoder(object): | 10 | class Encoder(object): |
10 | ''' | 11 | ''' |
@@ -96,6 +97,54 @@ class Encoder(object): | @@ -96,6 +97,54 @@ class Encoder(object): | ||
96 | def _encodeNameNum(self, namenum): | 97 | def _encodeNameNum(self, namenum): |
97 | assert namenum < 256 and namenum >= 0 | 98 | assert namenum < 256 and namenum >= 0 |
98 | return bytearray([namenum]) | 99 | return bytearray([namenum]) |
100 | + | ||
101 | + def _groupInterpsByType(self, interpsList): | ||
102 | + res = {} | ||
103 | + for interp in interpsList: | ||
104 | + res.setdefault(interp.typenum, []) | ||
105 | + res[interp.typenum].append(interp) | ||
106 | + return res | ||
107 | + | ||
108 | + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix): | ||
109 | + res = bytearray() | ||
110 | + res.extend(self._encodeTypeNum(typenum)) | ||
111 | + | ||
112 | + encodedInterpsList = bytearray() | ||
113 | + for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
114 | + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | ||
115 | + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | ||
116 | + encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | ||
117 | + | ||
118 | + res.extend(serializationUtils.htons(len(encodedInterpsList))) | ||
119 | + res.extend(encodedInterpsList) | ||
120 | + return res | ||
121 | + | ||
122 | + def _doEncodeData(self, interpsList, withCasePattern, withPrefix): | ||
123 | + | ||
124 | + assert type(interpsList) == frozenset | ||
125 | + | ||
126 | + segnum2Interps = self._groupInterpsByType(interpsList) | ||
127 | + | ||
128 | + | ||
129 | + res = bytearray() | ||
130 | + firstByte = len(segnum2Interps) | ||
131 | + assert firstByte < 256 | ||
132 | + assert firstByte > 0 | ||
133 | + res.append(firstByte) | ||
134 | + | ||
135 | + for typenum, interpsList in segnum2Interps.iteritems(): | ||
136 | + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix)) | ||
137 | + | ||
138 | + | ||
139 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
140 | +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum)) | ||
141 | +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | ||
142 | +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | ||
143 | +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | ||
144 | + del interpsList | ||
145 | +# res.extend(serializationUtils.htons(len(encodedInterpsList))) | ||
146 | +# res.extend(encodedInterpsList) | ||
147 | + return res | ||
99 | 148 | ||
100 | class MorphEncoder(Encoder): | 149 | class MorphEncoder(Encoder): |
101 | 150 | ||
@@ -106,19 +155,20 @@ class MorphEncoder(Encoder): | @@ -106,19 +155,20 @@ class MorphEncoder(Encoder): | ||
106 | self.LEMMA_MIXED_CASE = 2 | 155 | self.LEMMA_MIXED_CASE = 2 |
107 | 156 | ||
108 | def encodeData(self, interpsList): | 157 | def encodeData(self, interpsList): |
109 | - res = bytearray() | ||
110 | - firstByte = len(interpsList) | ||
111 | - assert firstByte < 256 | ||
112 | - assert firstByte > 0 | ||
113 | - res.append(firstByte) | ||
114 | - assert type(interpsList) == frozenset | ||
115 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
116 | - res.extend(self._encodeTypeNum(interp.typenum)) | ||
117 | - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False)) | ||
118 | - res.extend(self._encodeTagNum(interp.tagnum)) | ||
119 | - res.extend(self._encodeNameNum(interp.namenum)) | ||
120 | - del interpsList | ||
121 | - return res | 158 | + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False) |
159 | +# res = bytearray() | ||
160 | +# firstByte = len(interpsList) | ||
161 | +# assert firstByte < 256 | ||
162 | +# assert firstByte > 0 | ||
163 | +# res.append(firstByte) | ||
164 | +# assert type(interpsList) == frozenset | ||
165 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
166 | +# res.extend(self._encodeTypeNum(interp.typenum)) | ||
167 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False)) | ||
168 | +# res.extend(self._encodeTagNum(interp.tagnum)) | ||
169 | +# res.extend(self._encodeNameNum(interp.namenum)) | ||
170 | +# del interpsList | ||
171 | +# return res | ||
122 | 172 | ||
123 | class Encoder4Generator(Encoder): | 173 | class Encoder4Generator(Encoder): |
124 | 174 | ||
@@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): | @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): | ||
126 | super(Encoder4Generator, self).__init__(encoding) | 176 | super(Encoder4Generator, self).__init__(encoding) |
127 | 177 | ||
128 | def encodeData(self, interpsList): | 178 | def encodeData(self, interpsList): |
129 | - res = bytearray() | ||
130 | - firstByte = len(interpsList) | ||
131 | - assert firstByte < 256 | ||
132 | - assert firstByte > 0 | ||
133 | - res.append(firstByte) | ||
134 | - assert type(interpsList) == frozenset | ||
135 | - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
136 | - res.extend(self._encodeTypeNum(interp.typenum)) | ||
137 | - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True)) | ||
138 | - res.extend(self._encodeTagNum(interp.tagnum)) | ||
139 | - res.extend(self._encodeNameNum(interp.namenum)) | ||
140 | - return res | 179 | + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True) |
180 | +# res = bytearray() | ||
181 | +# firstByte = len(interpsList) | ||
182 | +# assert firstByte < 256 | ||
183 | +# assert firstByte > 0 | ||
184 | +# res.append(firstByte) | ||
185 | +# assert type(interpsList) == frozenset | ||
186 | +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | ||
187 | +# res.extend(self._encodeTypeNum(interp.typenum)) | ||
188 | +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True)) | ||
189 | +# res.extend(self._encodeTagNum(interp.tagnum)) | ||
190 | +# res.extend(self._encodeNameNum(interp.namenum)) | ||
191 | +# return res | ||
141 | # | 192 | # |
142 | # def decodeData(self, data): | 193 | # def decodeData(self, data): |
143 | # | 194 | # |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -106,7 +106,7 @@ class Segtypes(object): | @@ -106,7 +106,7 @@ class Segtypes(object): | ||
106 | lineNum, | 106 | lineNum, |
107 | re.match(r'[a-z_]+', segtype)) | 107 | re.match(r'[a-z_]+', segtype)) |
108 | self._validate( | 108 | self._validate( |
109 | - u'Pattern must contain lemma and part-of-speech fields', | 109 | + u'Pattern must contain encodedForm and part-of-speech fields', |
110 | lineNum, | 110 | lineNum, |
111 | re.match(r'.+\:[a-z_]+', pattern, re.U)) | 111 | re.match(r'.+\:[a-z_]+', pattern, re.U)) |
112 | 112 | ||
@@ -146,13 +146,13 @@ class Segtypes(object): | @@ -146,13 +146,13 @@ class Segtypes(object): | ||
146 | 146 | ||
147 | # index lexemes | 147 | # index lexemes |
148 | for p in self.patternsList: | 148 | for p in self.patternsList: |
149 | - if p.lemma: | 149 | + if p.encodedForm: |
150 | for tag in self.tagset.getAllTags(): | 150 | for tag in self.tagset.getAllTags(): |
151 | tagnum = self.tagset.getTagnum4Tag(tag) | 151 | tagnum = self.tagset.getTagnum4Tag(tag) |
152 | - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum: | ||
153 | - segnum = p.tryToMatch(p.lemma, tag) | 152 | + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum: |
153 | + segnum = p.tryToMatch(p.encodedForm, tag) | ||
154 | if segnum != -1: | 154 | if segnum != -1: |
155 | - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum | 155 | + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum |
156 | # logging.info('indexing segment type numbers - done') | 156 | # logging.info('indexing segment type numbers - done') |
157 | # self._debugSegnums() | 157 | # self._debugSegnums() |
158 | 158 | ||
@@ -171,7 +171,7 @@ class Segtypes(object): | @@ -171,7 +171,7 @@ class Segtypes(object): | ||
171 | class SegtypePattern(object): | 171 | class SegtypePattern(object): |
172 | 172 | ||
173 | def __init__(self, lemma, pattern, segnum): | 173 | def __init__(self, lemma, pattern, segnum): |
174 | - self.lemma = lemma | 174 | + self.encodedForm = lemma |
175 | self.pattern = pattern | 175 | self.pattern = pattern |
176 | self.segnum = segnum | 176 | self.segnum = segnum |
177 | 177 | ||
@@ -181,7 +181,7 @@ class SegtypePattern(object): | @@ -181,7 +181,7 @@ class SegtypePattern(object): | ||
181 | patterns2Match = [] | 181 | patterns2Match = [] |
182 | patterns2Match.append(self.pattern.replace('%', '.*')) | 182 | patterns2Match.append(self.pattern.replace('%', '.*')) |
183 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) | 183 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
184 | - if (self.lemma is None or self.lemma == lemma) \ | 184 | + if (self.encodedForm is None or self.encodedForm == lemma) \ |
185 | and any([re.match(p, tag) for p in patterns2Match]): | 185 | and any([re.match(p, tag) for p in patterns2Match]): |
186 | return self.segnum | 186 | return self.segnum |
187 | else: | 187 | else: |
input/dodatki.tab
input/segmenty.dat
@@ -142,6 +142,7 @@ samodz dywiz adj | @@ -142,6 +142,7 @@ samodz dywiz adj | ||
142 | # Stopień najwyższy: | 142 | # Stopień najwyższy: |
143 | # np. „naj·zieleńszy”, „naj·mądrzej” | 143 | # np. „naj·zieleńszy”, „naj·mądrzej” |
144 | moze_interp( naj> adj_sup ) | 144 | moze_interp( naj> adj_sup ) |
145 | +moze_interp( nie> naj> adj_sup ) | ||
145 | # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj | 146 | # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj |
146 | moze_interp( praet_sg dywiz li) | 147 | moze_interp( praet_sg dywiz li) |
147 | moze_interp( praet_pl dywiz li) | 148 | moze_interp( praet_pl dywiz li) |
input/segmenty1.dat
@@ -52,11 +52,14 @@ naj naj | @@ -52,11 +52,14 @@ naj naj | ||
52 | nie nie | 52 | nie nie |
53 | prefs prefs | 53 | prefs prefs |
54 | prefv prefv | 54 | prefv prefv |
55 | +prefa prefa | ||
55 | dig dig | 56 | dig dig |
56 | adja adja | 57 | adja adja |
57 | adj adj:%:pos | 58 | adj adj:%:pos |
58 | adj_sup adj:%:sup | 59 | adj_sup adj:%:sup |
59 | adj_sup adv:sup | 60 | adj_sup adv:sup |
61 | +adj_com adj:%:com | ||
62 | +adj_com adj:%:com | ||
60 | negat ger:%:neg | 63 | negat ger:%:neg |
61 | negat pact:%:neg | 64 | negat pact:%:neg |
62 | negat ppas:%:neg | 65 | negat ppas:%:neg |
@@ -69,6 +72,22 @@ interp interp | @@ -69,6 +72,22 @@ interp interp | ||
69 | aglsg aglt:sg:% | 72 | aglsg aglt:sg:% |
70 | aglpl aglt:pl:% | 73 | aglpl aglt:pl:% |
71 | samodz % | 74 | samodz % |
75 | +praet_fin praet:% | ||
76 | +praet_fin fin:% | ||
77 | +li li:qub:% | ||
78 | +nomina subst:% | ||
79 | +nomina ger:% | ||
80 | +nomina depr:% | ||
81 | +adjectiva adj:% | ||
82 | +adjectiva adv:% | ||
83 | +adjectiva ppas:% | ||
84 | +adjectiva pact:% | ||
85 | +verba_imperf praet:%:imperf | ||
86 | +verba_imperf fin:%:imperf | ||
87 | +verba_imperf inf:imperf | ||
88 | +verba_imperf imps:imperf | ||
89 | +verba_imperf impt:imperf | ||
90 | + | ||
72 | 91 | ||
73 | [lexemes] | 92 | [lexemes] |
74 | z_aglt aby:comp | 93 | z_aglt aby:comp |
morfeusz/EncodedInterpretation.hpp
@@ -28,7 +28,6 @@ struct EncodedForm { | @@ -28,7 +28,6 @@ struct EncodedForm { | ||
28 | */ | 28 | */ |
29 | struct EncodedInterpretation { | 29 | struct EncodedInterpretation { |
30 | EncodedForm value; | 30 | EncodedForm value; |
31 | - unsigned char type; | ||
32 | int tag; | 31 | int tag; |
33 | int nameClassifier; | 32 | int nameClassifier; |
34 | }; | 33 | }; |
morfeusz/Environment.cpp
@@ -13,10 +13,12 @@ | @@ -13,10 +13,12 @@ | ||
13 | //class InterpretedChunksDecoder4Analyzer; | 13 | //class InterpretedChunksDecoder4Analyzer; |
14 | //class InterpretedChunksDecoder4Generator; | 14 | //class InterpretedChunksDecoder4Generator; |
15 | 15 | ||
16 | -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { | ||
17 | - static Deserializer < vector < InterpsGroup > > *deserializer | 16 | +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) { |
17 | + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer | ||
18 | = new MorphDeserializer(); | 18 | = new MorphDeserializer(); |
19 | - return deserializer; | 19 | + static Deserializer < vector < InterpsGroup > > *generatorDeserializer |
20 | + = new MorphDeserializer(); | ||
21 | + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer); | ||
20 | } | 22 | } |
21 | 23 | ||
22 | static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { | 24 | static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { |
@@ -48,14 +50,15 @@ Environment::Environment( | @@ -48,14 +50,15 @@ Environment::Environment( | ||
48 | caseConverter(), | 50 | caseConverter(), |
49 | tagset(fsaFileStartPtr), | 51 | tagset(fsaFileStartPtr), |
50 | fsaFileStartPtr(fsaFileStartPtr), | 52 | fsaFileStartPtr(fsaFileStartPtr), |
51 | - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), | 53 | + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), |
52 | segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), | 54 | segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), |
53 | currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), | 55 | currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), |
54 | isFromFile(false), | 56 | isFromFile(false), |
55 | chunksDecoder( | 57 | chunksDecoder( |
56 | processorType == ANALYZER | 58 | processorType == ANALYZER |
57 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) | 59 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) |
58 | - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) | 60 | + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), |
61 | + processorType(processorType) | ||
59 | { | 62 | { |
60 | } | 63 | } |
61 | 64 | ||
@@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { | @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { | ||
110 | delete this->fsaFileStartPtr; | 113 | delete this->fsaFileStartPtr; |
111 | } | 114 | } |
112 | this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); | 115 | this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); |
113 | - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); | 116 | + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType)); |
114 | this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); | 117 | this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); |
115 | this->isFromFile = true; | 118 | this->isFromFile = true; |
116 | } | 119 | } |
morfeusz/Environment.hpp
@@ -64,6 +64,7 @@ private: | @@ -64,6 +64,7 @@ private: | ||
64 | bool isFromFile; | 64 | bool isFromFile; |
65 | 65 | ||
66 | const InterpretedChunksDecoder* chunksDecoder; | 66 | const InterpretedChunksDecoder* chunksDecoder; |
67 | + MorfeuszProcessorType processorType; | ||
67 | 68 | ||
68 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; | 69 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; |
69 | }; | 70 | }; |
morfeusz/FlexionGraph.cpp
@@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { | @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { | ||
15 | this->graph.push_back(vector<Edge>()); | 15 | this->graph.push_back(vector<Edge>()); |
16 | this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); | 16 | this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); |
17 | } | 17 | } |
18 | -// cerr << string(e.chunk.chunkStartPtr) << endl; | ||
19 | assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); | 18 | assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); |
20 | this->graph[0].push_back(e); | 19 | this->graph[0].push_back(e); |
21 | } | 20 | } |
@@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { | @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { | ||
30 | this->graph[startNode].push_back(e); | 29 | this->graph[startNode].push_back(e); |
31 | } | 30 | } |
32 | 31 | ||
32 | +static inline bool chunkIsAtFront( | ||
33 | + const InterpretedChunk& chunk, | ||
34 | + const std::vector<InterpretedChunk>& path) { | ||
35 | + unsigned int i; | ||
36 | + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) { | ||
37 | + } | ||
38 | + assert(!path[i].orthWasShifted); | ||
39 | + return &chunk == &(path[i]); | ||
40 | +} | ||
41 | + | ||
42 | +static inline bool chunkIsAtBack( | ||
43 | + const InterpretedChunk& chunk, | ||
44 | + const std::vector<InterpretedChunk>& path) { | ||
45 | + return &chunk == &(path.back()); | ||
46 | +} | ||
47 | + | ||
48 | +static inline bool chunkIsTheOnlyOne( | ||
49 | + const InterpretedChunk& chunk, | ||
50 | + const std::vector<InterpretedChunk>& path) { | ||
51 | + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); | ||
52 | +} | ||
53 | + | ||
33 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | 54 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { |
34 | // debugPath(path); | 55 | // debugPath(path); |
35 | // debugGraph(this->graph); | 56 | // debugGraph(this->graph); |
36 | for (unsigned int i = 0; i < path.size(); i++) { | 57 | for (unsigned int i = 0; i < path.size(); i++) { |
37 | const InterpretedChunk& chunk = path[i]; | 58 | const InterpretedChunk& chunk = path[i]; |
38 | if (!chunk.orthWasShifted) { | 59 | if (!chunk.orthWasShifted) { |
39 | - if (&chunk == &(path.front()) | ||
40 | - && &chunk == &(path.back())) { | 60 | + if (chunkIsTheOnlyOne(chunk, path)) { |
41 | Edge e = {chunk, UINT_MAX}; | 61 | Edge e = {chunk, UINT_MAX}; |
42 | this->addStartEdge(e); | 62 | this->addStartEdge(e); |
43 | } | 63 | } |
44 | - else if (&chunk == &(path.front())) { | 64 | + else if (chunkIsAtFront(chunk, path)) { |
45 | Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; | 65 | Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; |
46 | this->addStartEdge(e); | 66 | this->addStartEdge(e); |
47 | } | 67 | } |
48 | - else if (&chunk == &(path.back())) { | 68 | + else if (chunkIsAtBack(chunk, path)) { |
49 | Edge e = {chunk, UINT_MAX}; | 69 | Edge e = {chunk, UINT_MAX}; |
50 | this->addMiddleEdge((unsigned int) this->graph.size(), e); | 70 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
51 | } | 71 | } |
morfeusz/InterpretedChunksDecoder.hpp
@@ -18,6 +18,10 @@ | @@ -18,6 +18,10 @@ | ||
18 | #include "charset/CaseConverter.hpp" | 18 | #include "charset/CaseConverter.hpp" |
19 | #include "Environment.hpp" | 19 | #include "Environment.hpp" |
20 | 20 | ||
21 | +const uint8_t LEMMA_ONLY_LOWER = 0; | ||
22 | +const uint8_t LEMMA_UPPER_PREFIX = 1; | ||
23 | +const uint8_t LEMMA_MIXED_CASE = 2; | ||
24 | + | ||
21 | class InterpretedChunksDecoder { | 25 | class InterpretedChunksDecoder { |
22 | public: | 26 | public: |
23 | 27 | ||
@@ -30,22 +34,12 @@ public: | @@ -30,22 +34,12 @@ public: | ||
30 | unsigned int endNode, | 34 | unsigned int endNode, |
31 | const InterpretedChunk& interpretedChunk, | 35 | const InterpretedChunk& interpretedChunk, |
32 | std::vector<MorphInterpretation>& out) const = 0; | 36 | std::vector<MorphInterpretation>& out) const = 0; |
33 | - | ||
34 | - virtual ~InterpretedChunksDecoder() {} | ||
35 | 37 | ||
36 | -protected: | ||
37 | - | ||
38 | - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | ||
39 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
40 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
41 | - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | ||
42 | - decodeForm( | ||
43 | - prefixChunk.lowercaseCodepoints, | ||
44 | - prefixChunk.interpsGroup.interps[0].value, | ||
45 | - decodedForm); | ||
46 | - } | 38 | + virtual ~InterpretedChunksDecoder() { |
47 | } | 39 | } |
48 | - | 40 | + |
41 | +protected: | ||
42 | + | ||
49 | virtual void decodeForm( | 43 | virtual void decodeForm( |
50 | const std::vector<uint32_t>& orth, | 44 | const std::vector<uint32_t>& orth, |
51 | const EncodedForm& form, | 45 | const EncodedForm& form, |
@@ -55,9 +49,10 @@ protected: | @@ -55,9 +49,10 @@ protected: | ||
55 | }; | 49 | }; |
56 | 50 | ||
57 | class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | 51 | class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { |
58 | - | ||
59 | public: | 52 | public: |
60 | - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} | 53 | + |
54 | + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | ||
55 | + } | ||
61 | 56 | ||
62 | void decode( | 57 | void decode( |
63 | unsigned int startNode, | 58 | unsigned int startNode, |
@@ -65,22 +60,12 @@ public: | @@ -65,22 +60,12 @@ public: | ||
65 | const InterpretedChunk& interpretedChunk, | 60 | const InterpretedChunk& interpretedChunk, |
66 | std::vector<MorphInterpretation>& out) const { | 61 | std::vector<MorphInterpretation>& out) const { |
67 | string orth; | 62 | string orth; |
68 | - string lemma; | ||
69 | - convertPrefixes(interpretedChunk, orth, lemma); | 63 | + string lemmaPrefix; |
64 | + convertPrefixes(interpretedChunk, orth, lemmaPrefix); | ||
70 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | 65 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
71 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | ||
72 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | ||
73 | - decodeForm( | ||
74 | - interpretedChunk.lowercaseCodepoints, | ||
75 | - ei.value, | ||
76 | - lemma); | ||
77 | - out.push_back(MorphInterpretation( | ||
78 | - startNode, endNode, | ||
79 | - orth, lemma, | ||
80 | - ei.tag, | ||
81 | - ei.nameClassifier, | ||
82 | - env.getTagset(), | ||
83 | - env.getCharsetConverter())); | 66 | + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; |
67 | + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | ||
68 | + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr)); | ||
84 | } | 69 | } |
85 | } | 70 | } |
86 | 71 | ||
@@ -104,36 +89,116 @@ protected: | @@ -104,36 +89,116 @@ protected: | ||
104 | env.getCharsetConverter().append(cp, res); | 89 | env.getCharsetConverter().append(cp, res); |
105 | } | 90 | } |
106 | } | 91 | } |
92 | + | ||
93 | +private: | ||
94 | + | ||
95 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | ||
96 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
97 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
98 | + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | ||
99 | + const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | ||
100 | + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr); | ||
101 | + decodedForm += mi.getLemma(); | ||
102 | + } | ||
103 | + } | ||
104 | + | ||
105 | + MorphInterpretation decodeMorphInterpretation( | ||
106 | + unsigned int startNode, unsigned int endNode, | ||
107 | + const string& orth, | ||
108 | + const string& lemmaPrefix, | ||
109 | + const InterpretedChunk& chunk, | ||
110 | + const unsigned char*& ptr) const { | ||
111 | + string lemma = lemmaPrefix; | ||
112 | + EncodedInterpretation ei = this->decodeInterp(ptr); | ||
113 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | ||
114 | + return MorphInterpretation( | ||
115 | + startNode, endNode, | ||
116 | + orth, lemma, | ||
117 | + ei.tag, | ||
118 | + ei.nameClassifier, | ||
119 | + env.getTagset(), | ||
120 | + env.getCharsetConverter()); | ||
121 | + } | ||
122 | + | ||
123 | + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const { | ||
124 | + lemma.suffixToCut = *ptr; | ||
125 | + ptr++; | ||
126 | + lemma.suffixToAdd = (const char*) ptr; | ||
127 | + ptr += strlen((const char*) ptr) + 1; | ||
128 | + assert(lemma.casePattern.size() == 0); | ||
129 | + // lemma.casePattern.resize(MAX_WORD_SIZE, false); | ||
130 | + uint8_t casePatternType = *ptr; | ||
131 | + ptr++; | ||
132 | + uint8_t prefixLength; | ||
133 | + uint8_t patternLength; | ||
134 | + switch (casePatternType) { | ||
135 | + case LEMMA_ONLY_LOWER: | ||
136 | + break; | ||
137 | + case LEMMA_UPPER_PREFIX: | ||
138 | + prefixLength = *ptr; | ||
139 | + ptr++; | ||
140 | + for (unsigned int i = 0; i < prefixLength; i++) { | ||
141 | + // lemma.casePattern[i] = true; | ||
142 | + lemma.casePattern.push_back(true); | ||
143 | + } | ||
144 | + // lemma.casePattern.resize(prefixLength, true); | ||
145 | + break; | ||
146 | + case LEMMA_MIXED_CASE: | ||
147 | + patternLength = *ptr; | ||
148 | + ptr++; | ||
149 | + for (unsigned int i = 0; i < patternLength; i++) { | ||
150 | + uint8_t idx = *ptr; | ||
151 | + ptr++; | ||
152 | + // lemma.casePattern[idx] = true; | ||
153 | + lemma.casePattern.resize(idx + 1, false); | ||
154 | + lemma.casePattern[idx] = true; | ||
155 | + } | ||
156 | + break; | ||
157 | + } | ||
158 | + } | ||
159 | + | ||
160 | + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const { | ||
161 | + EncodedInterpretation interp; | ||
162 | + decodeLemma(ptr, interp.value); | ||
163 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); | ||
164 | + ptr += 2; | ||
165 | + interp.nameClassifier = *ptr; | ||
166 | + ptr++; | ||
167 | + return interp; | ||
168 | + } | ||
107 | }; | 169 | }; |
108 | 170 | ||
109 | class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | 171 | class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { |
110 | - | ||
111 | public: | 172 | public: |
112 | - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} | 173 | + |
174 | + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | ||
175 | + } | ||
113 | 176 | ||
114 | void decode( | 177 | void decode( |
115 | unsigned int startNode, | 178 | unsigned int startNode, |
116 | unsigned int endNode, | 179 | unsigned int endNode, |
117 | const InterpretedChunk& interpretedChunk, | 180 | const InterpretedChunk& interpretedChunk, |
118 | std::vector<MorphInterpretation>& out) const { | 181 | std::vector<MorphInterpretation>& out) const { |
119 | - string orth; | ||
120 | - string lemma; | ||
121 | - convertPrefixes(interpretedChunk, lemma, orth); | ||
122 | - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
123 | - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | ||
124 | - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | ||
125 | - decodeForm( | ||
126 | - interpretedChunk.originalCodepoints, | ||
127 | - ei.value, | ||
128 | - orth); | ||
129 | - out.push_back(MorphInterpretation( | ||
130 | - startNode, endNode, | ||
131 | - orth, lemma, | ||
132 | - ei.tag, | ||
133 | - ei.nameClassifier, | ||
134 | - env.getTagset(), | ||
135 | - env.getCharsetConverter())); | ||
136 | - } | 182 | + // string orth; |
183 | + // string lemma; | ||
184 | + // convertPrefixes(interpretedChunk, lemma, orth); | ||
185 | + // size_t orthLength = orth.length(); | ||
186 | + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
187 | + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | ||
188 | + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | ||
189 | + // decodeForm( | ||
190 | + // interpretedChunk.originalCodepoints, | ||
191 | + // ei.value, | ||
192 | + // orth); | ||
193 | + // out.push_back(MorphInterpretation( | ||
194 | + // startNode, endNode, | ||
195 | + // orth, lemma, | ||
196 | + // ei.tag, | ||
197 | + // ei.nameClassifier, | ||
198 | + // env.getTagset(), | ||
199 | + // env.getCharsetConverter())); | ||
200 | + // orth.erase(orthLength); | ||
201 | + // } | ||
137 | } | 202 | } |
138 | 203 | ||
139 | private: | 204 | private: |
morfeusz/InterpsGroup.hpp
@@ -14,24 +14,26 @@ | @@ -14,24 +14,26 @@ | ||
14 | #include "MorphInterpretation.hpp" | 14 | #include "MorphInterpretation.hpp" |
15 | #include "Tagset.hpp" | 15 | #include "Tagset.hpp" |
16 | 16 | ||
17 | -class InterpsGroup { | ||
18 | -public: | ||
19 | - | ||
20 | - InterpsGroup() { | ||
21 | - | ||
22 | - } | ||
23 | - | ||
24 | - explicit InterpsGroup(const unsigned char type) | ||
25 | - : type(type) { | ||
26 | - | ||
27 | - } | ||
28 | - | ||
29 | - void addInterpretation(const EncodedInterpretation& interp) { | ||
30 | - interps.push_back(interp); | ||
31 | - } | 17 | +struct InterpsGroup { |
18 | +//public: | ||
19 | +// | ||
20 | +// InterpsGroup() { | ||
21 | +// | ||
22 | +// } | ||
23 | +// | ||
24 | +// explicit InterpsGroup(const unsigned char type) | ||
25 | +// : type(type) { | ||
26 | +// | ||
27 | +// } | ||
28 | +// | ||
29 | +// void addInterpretation(const EncodedInterpretation& interp) { | ||
30 | +// interps.push_back(interp); | ||
31 | +// } | ||
32 | 32 | ||
33 | unsigned char type; | 33 | unsigned char type; |
34 | - std::vector<EncodedInterpretation> interps; | 34 | + uint16_t size; |
35 | + const unsigned char* ptr; | ||
36 | +// std::vector<EncodedInterpretation> interps; | ||
35 | }; | 37 | }; |
36 | 38 | ||
37 | #endif /* GROUPEDINTERPRETATIONS_HPP */ | 39 | #endif /* GROUPEDINTERPRETATIONS_HPP */ |
morfeusz/Morfeusz.cpp
@@ -82,7 +82,9 @@ void Morfeusz::processOneWord( | @@ -82,7 +82,9 @@ void Morfeusz::processOneWord( | ||
82 | FlexionGraph graph; | 82 | FlexionGraph graph; |
83 | const char* currInput = inputStart; | 83 | const char* currInput = inputStart; |
84 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); | 84 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
85 | + | ||
85 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); | 86 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
87 | + | ||
86 | if (!graph.empty()) { | 88 | if (!graph.empty()) { |
87 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); | 89 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
88 | int srcNode = startNodeNum; | 90 | int srcNode = startNodeNum; |
@@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | ||
110 | from.prefixChunks.end()); | 112 | from.prefixChunks.end()); |
111 | to.prefixChunks.push_back(from); | 113 | to.prefixChunks.push_back(from); |
112 | from.orthWasShifted = true; | 114 | from.orthWasShifted = true; |
115 | + to.chunkStartPtr = from.chunkStartPtr; | ||
113 | } | 116 | } |
114 | 117 | ||
115 | void Morfeusz::doProcessOneWord( | 118 | void Morfeusz::doProcessOneWord( |
@@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( | @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( | ||
119 | SegrulesState segrulesState, | 122 | SegrulesState segrulesState, |
120 | vector<InterpretedChunk>& accum, | 123 | vector<InterpretedChunk>& accum, |
121 | FlexionGraph& graph) const { | 124 | FlexionGraph& graph) const { |
122 | - cerr << "doAnalyzeOneWord " << inputData << endl; | ||
123 | - bool endOfProcessing = inputData == inputEnd; | 125 | +// cerr << "doAnalyzeOneWord " << inputData << endl; |
124 | const char* currInput = inputData; | 126 | const char* currInput = inputData; |
125 | - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | ||
126 | - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); | 127 | + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
127 | vector<uint32_t> originalCodepoints; | 128 | vector<uint32_t> originalCodepoints; |
128 | vector<uint32_t> lowercaseCodepoints; | 129 | vector<uint32_t> lowercaseCodepoints; |
129 | 130 | ||
130 | StateType state = env.getFSA().getInitialState(); | 131 | StateType state = env.getFSA().getInitialState(); |
131 | 132 | ||
132 | - while (!endOfProcessing) { | ||
133 | - if (isEndOfWord(codepoint)) { | ||
134 | - endOfProcessing = true; | ||
135 | - } | ||
136 | - cerr << "not end of word '" << string(currInput) << "'" << endl; | 133 | + while (!isEndOfWord(codepoint)) { |
137 | uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); | 134 | uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); |
138 | originalCodepoints.push_back(codepoint); | 135 | originalCodepoints.push_back(codepoint); |
139 | lowercaseCodepoints.push_back(lowerCP); | 136 | lowercaseCodepoints.push_back(lowerCP); |
140 | feedState(state, lowerCP, UTF8CharsetConverter()); | 137 | feedState(state, lowerCP, UTF8CharsetConverter()); |
141 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); | 138 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); |
142 | if (state.isAccepting()) { | 139 | if (state.isAccepting()) { |
143 | - cerr << "accepting" << endl; | ||
144 | vector<InterpsGroup> val(state.getValue()); | 140 | vector<InterpsGroup> val(state.getValue()); |
145 | for (unsigned int i = 0; i < val.size(); i++) { | 141 | for (unsigned int i = 0; i < val.size(); i++) { |
146 | InterpsGroup& ig = val[i]; | 142 | InterpsGroup& ig = val[i]; |
@@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( | @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( | ||
151 | it != newSegrulesStates.end(); | 147 | it != newSegrulesStates.end(); |
152 | ++it) { | 148 | ++it) { |
153 | SegrulesState newSegrulesState = *it; | 149 | SegrulesState newSegrulesState = *it; |
150 | +// if (newSegrulesState.shiftOrthFromPrevious) { | ||
151 | +// | ||
152 | +// } | ||
154 | InterpretedChunk ic = { | 153 | InterpretedChunk ic = { |
155 | inputData, | 154 | inputData, |
156 | originalCodepoints, | 155 | originalCodepoints, |
@@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( | @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( | ||
165 | } | 164 | } |
166 | accum.push_back(ic); | 165 | accum.push_back(ic); |
167 | if (isEndOfWord(codepoint)) { | 166 | if (isEndOfWord(codepoint)) { |
168 | - cerr << "end of word inside " << currInput <<endl; | ||
169 | if (newSegrulesState.accepting) | 167 | if (newSegrulesState.accepting) |
170 | graph.addPath(accum); | 168 | graph.addPath(accum); |
171 | } | 169 | } |
@@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( | @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( | ||
177 | } | 175 | } |
178 | } | 176 | } |
179 | } | 177 | } |
178 | + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | ||
180 | } | 179 | } |
181 | - cerr << "end of word " << currInput << endl; | ||
182 | inputData = currInput; | 180 | inputData = currInput; |
183 | } | 181 | } |
184 | 182 |
morfeusz/MorphDeserializer.cpp
@@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { | @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { | ||
23 | MorphDeserializer::~MorphDeserializer() { | 23 | MorphDeserializer::~MorphDeserializer() { |
24 | } | 24 | } |
25 | 25 | ||
26 | -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | ||
27 | - // XXX uważać na poprawność danych | ||
28 | - lemma.suffixToCut = *ptr; | ||
29 | - ptr++; | ||
30 | - lemma.suffixToAdd = (const char*) ptr; | ||
31 | - ptr += strlen((const char*) ptr) + 1; | ||
32 | - assert(lemma.casePattern.size() == 0); | ||
33 | -// lemma.casePattern.resize(MAX_WORD_SIZE, false); | ||
34 | - uint8_t casePatternType = *ptr; | ||
35 | - ptr++; | ||
36 | - uint8_t prefixLength; | ||
37 | - uint8_t patternLength; | ||
38 | - switch (casePatternType) { | ||
39 | - case LEMMA_ONLY_LOWER: | ||
40 | - break; | ||
41 | - case LEMMA_UPPER_PREFIX: | ||
42 | - prefixLength = *ptr; | ||
43 | - ptr++; | ||
44 | - for (unsigned int i = 0; i < prefixLength; i++) { | ||
45 | -// lemma.casePattern[i] = true; | ||
46 | - lemma.casePattern.push_back(true); | ||
47 | - } | ||
48 | -// lemma.casePattern.resize(prefixLength, true); | ||
49 | - break; | ||
50 | - case LEMMA_MIXED_CASE: | ||
51 | - patternLength = *ptr; | ||
52 | - ptr++; | ||
53 | - for (unsigned int i = 0; i < patternLength; i++) { | ||
54 | - uint8_t idx = *ptr; | ||
55 | - ptr++; | 26 | +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { |
27 | +// // XXX uważać na poprawność danych | ||
28 | +// lemma.suffixToCut = *ptr; | ||
29 | +// ptr++; | ||
30 | +// lemma.suffixToAdd = (const char*) ptr; | ||
31 | +// ptr += strlen((const char*) ptr) + 1; | ||
32 | +// assert(lemma.casePattern.size() == 0); | ||
33 | +//// lemma.casePattern.resize(MAX_WORD_SIZE, false); | ||
34 | +// uint8_t casePatternType = *ptr; | ||
35 | +// ptr++; | ||
36 | +// uint8_t prefixLength; | ||
37 | +// uint8_t patternLength; | ||
38 | +// switch (casePatternType) { | ||
39 | +// case LEMMA_ONLY_LOWER: | ||
40 | +// break; | ||
41 | +// case LEMMA_UPPER_PREFIX: | ||
42 | +// prefixLength = *ptr; | ||
43 | +// ptr++; | ||
44 | +// for (unsigned int i = 0; i < prefixLength; i++) { | ||
45 | +//// lemma.casePattern[i] = true; | ||
46 | +// lemma.casePattern.push_back(true); | ||
47 | +// } | ||
48 | +//// lemma.casePattern.resize(prefixLength, true); | ||
49 | +// break; | ||
50 | +// case LEMMA_MIXED_CASE: | ||
51 | +// patternLength = *ptr; | ||
52 | +// ptr++; | ||
53 | +// for (unsigned int i = 0; i < patternLength; i++) { | ||
54 | +// uint8_t idx = *ptr; | ||
55 | +// ptr++; | ||
56 | +//// lemma.casePattern[idx] = true; | ||
57 | +// lemma.casePattern.resize(idx + 1, false); | ||
56 | // lemma.casePattern[idx] = true; | 58 | // lemma.casePattern[idx] = true; |
57 | - lemma.casePattern.resize(idx + 1, false); | ||
58 | - lemma.casePattern[idx] = true; | ||
59 | - } | ||
60 | - break; | ||
61 | - } | ||
62 | -} | ||
63 | - | ||
64 | -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | ||
65 | - interp.type = *ptr; | ||
66 | - ptr++; | ||
67 | - deserializeLemma(ptr, interp.value); | ||
68 | - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | ||
69 | - ptr += 2; | ||
70 | - interp.nameClassifier = *ptr; | ||
71 | - ptr++; | ||
72 | -} | 59 | +// } |
60 | +// break; | ||
61 | +// } | ||
62 | +//} | ||
63 | +// | ||
64 | +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | ||
65 | +// interp.type = *ptr; | ||
66 | +// ptr++; | ||
67 | +// deserializeLemma(ptr, interp.value); | ||
68 | +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | ||
69 | +// ptr += 2; | ||
70 | +// interp.nameClassifier = *ptr; | ||
71 | +// ptr++; | ||
72 | +//} | ||
73 | 73 | ||
74 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | 74 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { |
75 | const unsigned char* currPtr = ptr; | 75 | const unsigned char* currPtr = ptr; |
76 | - uint8_t interpsNum = *ptr; | ||
77 | - interps.clear(); | ||
78 | - interps.reserve(interpsNum); | 76 | + uint8_t interpTypesNum = *currPtr; |
79 | currPtr++; | 77 | currPtr++; |
80 | - // FIXME - to jest do poprawy | ||
81 | - map<int, InterpsGroup> results; | ||
82 | - for (unsigned int i = 0; i < interpsNum; ++i) { | ||
83 | - EncodedInterpretation interp; | ||
84 | - deserializeInterp(currPtr, interp); | ||
85 | - if (results.count(interp.type) == 0) { | ||
86 | - results[interp.type] = InterpsGroup(interp.type); | ||
87 | - } | ||
88 | - results[interp.type].addInterpretation(interp); | ||
89 | -// interps.push_back(interp); | ||
90 | - } | ||
91 | - map<int, InterpsGroup>::iterator it; | ||
92 | - for (it = results.begin(); it != results.end(); ++it) { | ||
93 | - interps.push_back((*it).second); | 78 | + interps.clear(); |
79 | + interps.reserve(interpTypesNum); | ||
80 | + for (unsigned int i = 0; i < interpTypesNum; i++) { | ||
81 | + InterpsGroup ig; | ||
82 | + ig.type = *currPtr; | ||
83 | + currPtr++; | ||
84 | + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); | ||
85 | + currPtr += 2; | ||
86 | + ig.ptr = currPtr; | ||
87 | + currPtr += ig.size; | ||
88 | + interps.push_back(ig); | ||
94 | } | 89 | } |
95 | return currPtr - ptr; | 90 | return currPtr - ptr; |
96 | } | 91 | } |
92 | + | ||
93 | +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | ||
94 | +// const unsigned char* currPtr = ptr; | ||
95 | +// uint8_t interpsNum = *ptr; | ||
96 | +// interps.clear(); | ||
97 | +// interps.reserve(interpsNum); | ||
98 | +// currPtr++; | ||
99 | +// // FIXME - to jest do poprawy | ||
100 | +// map<int, InterpsGroup> results; | ||
101 | +// for (unsigned int i = 0; i < interpsNum; ++i) { | ||
102 | +// EncodedInterpretation interp; | ||
103 | +// deserializeInterp(currPtr, interp); | ||
104 | +// if (results.count(interp.type) == 0) { | ||
105 | +// results[interp.type] = InterpsGroup(interp.type); | ||
106 | +// } | ||
107 | +// results[interp.type].addInterpretation(interp); | ||
108 | +//// interps.push_back(interp); | ||
109 | +// } | ||
110 | +// map<int, InterpsGroup>::iterator it; | ||
111 | +// for (it = results.begin(); it != results.end(); ++it) { | ||
112 | +// interps.push_back((*it).second); | ||
113 | +// } | ||
114 | +// return currPtr - ptr; | ||
115 | +//} |
nbproject/configurations.xml
@@ -106,14 +106,20 @@ | @@ -106,14 +106,20 @@ | ||
106 | </makeTool> | 106 | </makeTool> |
107 | </makefileType> | 107 | </makefileType> |
108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | + <ccTool flags="1"> | ||
110 | + </ccTool> | ||
109 | </item> | 111 | </item> |
110 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 112 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | + <ccTool flags="1"> | ||
114 | + </ccTool> | ||
111 | </item> | 115 | </item> |
112 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 116 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | </item> | 117 | </item> |
114 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 118 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
115 | </item> | 119 | </item> |
116 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> | 120 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
121 | + <ccTool flags="1"> | ||
122 | + </ccTool> | ||
117 | </item> | 123 | </item> |
118 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" | 124 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
119 | ex="false" | 125 | ex="false" |
@@ -169,7 +175,7 @@ | @@ -169,7 +175,7 @@ | ||
169 | <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> | 175 | <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
170 | </item> | 176 | </item> |
171 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 177 | <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
172 | - <ccTool flags="1"> | 178 | + <ccTool> |
173 | <incDir> | 179 | <incDir> |
174 | <pElem>morfeusz</pElem> | 180 | <pElem>morfeusz</pElem> |
175 | <pElem>morfeusz/build/morfeusz</pElem> | 181 | <pElem>morfeusz/build/morfeusz</pElem> |
@@ -180,7 +186,7 @@ | @@ -180,7 +186,7 @@ | ||
180 | </ccTool> | 186 | </ccTool> |
181 | </item> | 187 | </item> |
182 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | 188 | <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
183 | - <ccTool flags="1"> | 189 | + <ccTool> |
184 | <incDir> | 190 | <incDir> |
185 | <pElem>morfeusz</pElem> | 191 | <pElem>morfeusz</pElem> |
186 | <pElem>morfeusz/build/morfeusz</pElem> | 192 | <pElem>morfeusz/build/morfeusz</pElem> |
@@ -273,7 +279,7 @@ | @@ -273,7 +279,7 @@ | ||
273 | <ccTool> | 279 | <ccTool> |
274 | <incDir> | 280 | <incDir> |
275 | <pElem>morfeusz</pElem> | 281 | <pElem>morfeusz</pElem> |
276 | - <pElem>/usr/lib/jvm/default-java/include</pElem> | 282 | + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> |
277 | </incDir> | 283 | </incDir> |
278 | <preprocessorList> | 284 | <preprocessorList> |
279 | <Elem>libjmorfeusz_EXPORTS</Elem> | 285 | <Elem>libjmorfeusz_EXPORTS</Elem> |
@@ -408,18 +414,26 @@ | @@ -408,18 +414,26 @@ | ||
408 | </ccTool> | 414 | </ccTool> |
409 | </item> | 415 | </item> |
410 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> | 416 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
417 | + <ccTool flags="1"> | ||
418 | + </ccTool> | ||
411 | </item> | 419 | </item> |
412 | <item path="morfeusz/charset/CharsetConverter.cpp" | 420 | <item path="morfeusz/charset/CharsetConverter.cpp" |
413 | ex="false" | 421 | ex="false" |
414 | tool="1" | 422 | tool="1" |
415 | flavor2="4"> | 423 | flavor2="4"> |
424 | + <ccTool flags="1"> | ||
425 | + </ccTool> | ||
416 | </item> | 426 | </item> |
417 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> | 427 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
428 | + <ccTool flags="1"> | ||
429 | + </ccTool> | ||
418 | </item> | 430 | </item> |
419 | <item path="morfeusz/charset/conversion_tables.cpp" | 431 | <item path="morfeusz/charset/conversion_tables.cpp" |
420 | ex="false" | 432 | ex="false" |
421 | tool="1" | 433 | tool="1" |
422 | flavor2="4"> | 434 | flavor2="4"> |
435 | + <ccTool flags="1"> | ||
436 | + </ccTool> | ||
423 | </item> | 437 | </item> |
424 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> | 438 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
425 | <ccTool flags="1"> | 439 | <ccTool flags="1"> |
@@ -508,8 +522,12 @@ | @@ -508,8 +522,12 @@ | ||
508 | ex="false" | 522 | ex="false" |
509 | tool="1" | 523 | tool="1" |
510 | flavor2="4"> | 524 | flavor2="4"> |
525 | + <ccTool flags="1"> | ||
526 | + </ccTool> | ||
511 | </item> | 527 | </item> |
512 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> | 528 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
529 | + <ccTool flags="1"> | ||
530 | + </ccTool> | ||
513 | </item> | 531 | </item> |
514 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> | 532 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
515 | <ccTool flags="0"> | 533 | <ccTool flags="0"> |