Commit f1e52ff44027610390237bd862449c85c4a044cc

Authored by Michał Lenart
1 parent de0e960d

improved running time, rebuilt the analyzer so as not to duplicate code…

… in the generator, fixed recognition of the first segment in the flexion graph

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
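
The changes below introduce a new per-entry byte layout for the dictionary automaton values: one count byte giving the number of segment-type groups, then for each group a type byte, a 16-bit big-endian payload length (serializationUtils.htons), and the packed interpretations that the analyzer later walks pointer by pointer. The sketch below is a minimal reconstruction of that layout from _doEncodeData (encode.py) and MorphDeserializer::deserialize, shown for the analyzer variant (withCasePattern=True, withPrefix=False); the function names and the all-lowercase case-pattern shortcut are illustrative assumptions, not code from this commit.

import struct

LEMMA_ONLY_LOWER = 0  # case-pattern marker, mirrors MorphEncoder.LEMMA_ONLY_LOWER

def encode_interp(suffix_to_cut, suffix_to_add, tagnum, namenum):
    # <suffixToCut:1> <suffixToAdd bytes + NUL> <casePatternType:1> <tagnum:2 BE> <namenum:1>
    data = bytearray([suffix_to_cut])
    data += suffix_to_add.encode('utf-8') + b'\x00'
    data.append(LEMMA_ONLY_LOWER)               # all-lowercase lemma: no pattern bytes follow
    data += struct.pack('>H', tagnum)           # same byte order as serializationUtils.htons
    data.append(namenum)
    return bytes(data)

def encode_entry(groups):
    # groups: dict mapping segment-type number -> list of already packed interpretations
    assert 0 < len(groups) < 256                # same bounds as the firstByte asserts
    out = bytearray([len(groups)])              # first byte: number of segment-type groups
    for typenum, interps in sorted(groups.items()):
        payload = b''.join(interps)
        out.append(typenum)                     # one byte, read back as ig.type = *currPtr
        out += struct.pack('>H', len(payload))  # group size, read back with ntohs
        out += payload                          # skipped in one step on the C++ side: currPtr += ig.size
    return bytes(out)

# Example: one group of segment type 0 holding a single interpretation.
entry = encode_entry({0: [encode_interp(1, 'a', 42, 0)]})

On the C++ side, MorphDeserializer::deserialize now only records each group's type, size and a pointer into this buffer, and InterpretedChunksDecoder4Analyzer decodes the interpretations lazily while iterating over the group, which avoids eagerly building EncodedInterpretation vectors during deserialization.
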
CMakeLists.txt
... ... @@ -5,7 +5,7 @@ project (Morfeusz)
5 5 set (Morfeusz_VERSION_MAJOR 2)
6 6 set (Morfeusz_VERSION_MINOR 0)
7 7 set (Morfeusz_VERSION_PATCH 0)
8   -set (CMAKE_BUILD_TYPE "Debug")
  8 +set (CMAKE_BUILD_TYPE "Release")
9 9  
10 10 enable_testing()
11 11  
... ... @@ -47,7 +47,7 @@ endif ()
47 47  
48 48 # SEGMENT_RULES_FILE
49 49 if ("${SEGMENT_RULES_FILE}" STREQUAL "")
50   - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat")
  50 + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat")
51 51 endif ()
52 52  
53 53 message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules")
... ...
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object):
40 40 self.suffixToAdd = bestEncodedForm.suffixToAdd
41 41 self.prefixToAdd = targetWord[:bestPrefixLength]
42 42  
43   -class Interpretation(object):
  43 +class Interpretation4Analyzer(object):
44 44  
45 45 def __init__(self, orth, base, tagnum, namenum, typenum):
46   - self.lemma = EncodedForm(orth, base)
  46 + self.encodedForm = EncodedForm(orth, base)
47 47 self.tagnum = tagnum
48 48 self.namenum = namenum
49 49 self.typenum = typenum
50 50  
51 51 def getSortKey(self):
52 52 return (
53   - self.lemma.cutLength,
54   - tuple(self.lemma.suffixToAdd),
55   - tuple(self.lemma.casePattern),
  53 + self.encodedForm.cutLength,
  54 + tuple(self.encodedForm.suffixToAdd),
  55 + tuple(self.encodedForm.casePattern),
56 56 self.tagnum,
57 57 self.namenum)
58 58  
59 59 def __eq__(self, other):
60   - if isinstance(other, Interpretation):
  60 + if isinstance(other, Interpretation4Analyzer):
61 61 return self.getSortKey() == other.getSortKey()
62 62 else:
63 63 return False
... ... @@ -68,8 +68,8 @@ class Interpretation(object):
68 68 class Interpretation4Generator(object):
69 69  
70 70 def __init__(self, orth, base, tagnum, namenum, typenum):
71   - self.lemma = base
72   - self.orth = EncodedFormWithPrefix(base, orth)
  71 + self.encodedForm = base
  72 + self.encodedForm = EncodedFormWithPrefix(base, orth)
73 73 self.tagnum = tagnum
74 74 self.namenum = namenum
75 75 self.typenum = typenum
... ... @@ -77,9 +77,9 @@ class Interpretation4Generator(object):
77 77 def getSortKey(self):
78 78 return (
79 79 self.tagnum,
80   - self.orth.cutLength,
81   - tuple(self.orth.suffixToAdd),
82   -# tuple(self.lemma.casePattern),
  80 + self.encodedForm.cutLength,
  81 + tuple(self.encodedForm.suffixToAdd),
  82 +# tuple(self.encodedForm.casePattern),
83 83 self.namenum)
84 84  
85 85 def __eq__(self, other):
... ... @@ -92,7 +92,7 @@ class Interpretation4Generator(object):
92 92 return hash(self.getSortKey())
93 93  
94 94 def __unicode__(self):
95   - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
  95 + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
96 96  
97 97 def __repr__(self):
98 98 return unicode(self)
... ...
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... ... @@ -4,7 +4,7 @@ Created on Oct 23, 2013
4 4 @author: mlenart
5 5 '''
6 6 import logging
7   -from common import Interpretation
  7 +from common import Interpretation4Analyzer
8 8 from morfeuszbuilder.fsa.common import Interpretation4Generator
9 9  
10 10 def _mergeEntries(inputLines):
... ... @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object):
74 74 tagnum = int(tagnum)
75 75 namenum = int(namenum)
76 76 typenum = int(typenum)
77   - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum))
  77 + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum))
78 78  
79 79 def convert(self, inputLines):
80 80 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))))
... ...
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -5,6 +5,7 @@ Created on Oct 23, 2013
5 5 '''
6 6  
7 7 import logging
  8 +from morfeuszbuilder.utils import serializationUtils
8 9  
9 10 class Encoder(object):
10 11 '''
... ... @@ -96,6 +97,54 @@ class Encoder(object):
96 97 def _encodeNameNum(self, namenum):
97 98 assert namenum < 256 and namenum >= 0
98 99 return bytearray([namenum])
  100 +
  101 + def _groupInterpsByType(self, interpsList):
  102 + res = {}
  103 + for interp in interpsList:
  104 + res.setdefault(interp.typenum, [])
  105 + res[interp.typenum].append(interp)
  106 + return res
  107 +
  108 + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix):
  109 + res = bytearray()
  110 + res.extend(self._encodeTypeNum(typenum))
  111 +
  112 + encodedInterpsList = bytearray()
  113 + for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  114 + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  115 + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  116 + encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  117 +
  118 + res.extend(serializationUtils.htons(len(encodedInterpsList)))
  119 + res.extend(encodedInterpsList)
  120 + return res
  121 +
  122 + def _doEncodeData(self, interpsList, withCasePattern, withPrefix):
  123 +
  124 + assert type(interpsList) == frozenset
  125 +
  126 + segnum2Interps = self._groupInterpsByType(interpsList)
  127 +
  128 +
  129 + res = bytearray()
  130 + firstByte = len(segnum2Interps)
  131 + assert firstByte < 256
  132 + assert firstByte > 0
  133 + res.append(firstByte)
  134 +
  135 + for typenum, interpsList in segnum2Interps.iteritems():
  136 + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix))
  137 +
  138 +
  139 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  140 +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum))
  141 +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  142 +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  143 +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  144 + del interpsList
  145 +# res.extend(serializationUtils.htons(len(encodedInterpsList)))
  146 +# res.extend(encodedInterpsList)
  147 + return res
99 148  
100 149 class MorphEncoder(Encoder):
101 150  
... ... @@ -106,19 +155,20 @@ class MorphEncoder(Encoder):
106 155 self.LEMMA_MIXED_CASE = 2
107 156  
108 157 def encodeData(self, interpsList):
109   - res = bytearray()
110   - firstByte = len(interpsList)
111   - assert firstByte < 256
112   - assert firstByte > 0
113   - res.append(firstByte)
114   - assert type(interpsList) == frozenset
115   - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
116   - res.extend(self._encodeTypeNum(interp.typenum))
117   - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False))
118   - res.extend(self._encodeTagNum(interp.tagnum))
119   - res.extend(self._encodeNameNum(interp.namenum))
120   - del interpsList
121   - return res
  158 + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False)
  159 +# res = bytearray()
  160 +# firstByte = len(interpsList)
  161 +# assert firstByte < 256
  162 +# assert firstByte > 0
  163 +# res.append(firstByte)
  164 +# assert type(interpsList) == frozenset
  165 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  166 +# res.extend(self._encodeTypeNum(interp.typenum))
  167 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False))
  168 +# res.extend(self._encodeTagNum(interp.tagnum))
  169 +# res.extend(self._encodeNameNum(interp.namenum))
  170 +# del interpsList
  171 +# return res
122 172  
123 173 class Encoder4Generator(Encoder):
124 174  
... ... @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder):
126 176 super(Encoder4Generator, self).__init__(encoding)
127 177  
128 178 def encodeData(self, interpsList):
129   - res = bytearray()
130   - firstByte = len(interpsList)
131   - assert firstByte < 256
132   - assert firstByte > 0
133   - res.append(firstByte)
134   - assert type(interpsList) == frozenset
135   - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
136   - res.extend(self._encodeTypeNum(interp.typenum))
137   - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True))
138   - res.extend(self._encodeTagNum(interp.tagnum))
139   - res.extend(self._encodeNameNum(interp.namenum))
140   - return res
  179 + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True)
  180 +# res = bytearray()
  181 +# firstByte = len(interpsList)
  182 +# assert firstByte < 256
  183 +# assert firstByte > 0
  184 +# res.append(firstByte)
  185 +# assert type(interpsList) == frozenset
  186 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  187 +# res.extend(self._encodeTypeNum(interp.typenum))
  188 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True))
  189 +# res.extend(self._encodeTagNum(interp.tagnum))
  190 +# res.extend(self._encodeNameNum(interp.namenum))
  191 +# return res
141 192 #
142 193 # def decodeData(self, data):
143 194 #
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -106,7 +106,7 @@ class Segtypes(object):
106 106 lineNum,
107 107 re.match(r'[a-z_]+', segtype))
108 108 self._validate(
109   - u'Pattern must contain lemma and part-of-speech fields',
  109 + u'Pattern must contain encodedForm and part-of-speech fields',
110 110 lineNum,
111 111 re.match(r'.+\:[a-z_]+', pattern, re.U))
112 112  
... ... @@ -146,13 +146,13 @@ class Segtypes(object):
146 146  
147 147 # index lexemes
148 148 for p in self.patternsList:
149   - if p.lemma:
  149 + if p.encodedForm:
150 150 for tag in self.tagset.getAllTags():
151 151 tagnum = self.tagset.getTagnum4Tag(tag)
152   - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum:
153   - segnum = p.tryToMatch(p.lemma, tag)
  152 + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum:
  153 + segnum = p.tryToMatch(p.encodedForm, tag)
154 154 if segnum != -1:
155   - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum
  155 + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum
156 156 # logging.info('indexing segment type numbers - done')
157 157 # self._debugSegnums()
158 158  
... ... @@ -171,7 +171,7 @@ class Segtypes(object):
171 171 class SegtypePattern(object):
172 172  
173 173 def __init__(self, lemma, pattern, segnum):
174   - self.lemma = lemma
  174 + self.encodedForm = lemma
175 175 self.pattern = pattern
176 176 self.segnum = segnum
177 177  
... ... @@ -181,7 +181,7 @@ class SegtypePattern(object):
181 181 patterns2Match = []
182 182 patterns2Match.append(self.pattern.replace('%', '.*'))
183 183 patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*'))
184   - if (self.lemma is None or self.lemma == lemma) \
  184 + if (self.encodedForm is None or self.encodedForm == lemma) \
185 185 and any([re.match(p, tag) for p in patterns2Match]):
186 186 return self.segnum
187 187 else:
... ...
input/dodatki.tab
  1 +0 0 dig
  2 +1 1 dig
  3 +2 2 dig
  4 +3 3 dig
  5 +4 4 dig
  6 +5 5 dig
  7 +6 6 dig
  8 +7 7 dig
  9 +8 8 dig
  10 +9 9 dig
1 11 ń on ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
2 12 by by qub
3 13 naj naj
... ...
input/segmenty.dat
... ... @@ -142,6 +142,7 @@ samodz dywiz adj
142 142 # Stopień najwyższy:
143 143 # np. „naj·zieleńszy”, „naj·mądrzej”
144 144 moze_interp( naj> adj_sup )
  145 +moze_interp( nie> naj> adj_sup )
145 146 # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj
146 147 moze_interp( praet_sg dywiz li)
147 148 moze_interp( praet_pl dywiz li)
... ...
input/segmenty1.dat
... ... @@ -52,11 +52,14 @@ naj naj
52 52 nie nie
53 53 prefs prefs
54 54 prefv prefv
  55 +prefa prefa
55 56 dig dig
56 57 adja adja
57 58 adj adj:%:pos
58 59 adj_sup adj:%:sup
59 60 adj_sup adv:sup
  61 +adj_com adj:%:com
  62 +adj_com adj:%:com
60 63 negat ger:%:neg
61 64 negat pact:%:neg
62 65 negat ppas:%:neg
... ... @@ -69,6 +72,22 @@ interp interp
69 72 aglsg aglt:sg:%
70 73 aglpl aglt:pl:%
71 74 samodz %
  75 +praet_fin praet:%
  76 +praet_fin fin:%
  77 +li li:qub:%
  78 +nomina subst:%
  79 +nomina ger:%
  80 +nomina depr:%
  81 +adjectiva adj:%
  82 +adjectiva adv:%
  83 +adjectiva ppas:%
  84 +adjectiva pact:%
  85 +verba_imperf praet:%:imperf
  86 +verba_imperf fin:%:imperf
  87 +verba_imperf inf:imperf
  88 +verba_imperf imps:imperf
  89 +verba_imperf impt:imperf
  90 +
72 91  
73 92 [lexemes]
74 93 z_aglt aby:comp
... ...
morfeusz/EncodedInterpretation.hpp
... ... @@ -28,7 +28,6 @@ struct EncodedForm {
28 28 */
29 29 struct EncodedInterpretation {
30 30 EncodedForm value;
31   - unsigned char type;
32 31 int tag;
33 32 int nameClassifier;
34 33 };
... ...
morfeusz/Environment.cpp
... ... @@ -13,10 +13,12 @@
13 13 //class InterpretedChunksDecoder4Analyzer;
14 14 //class InterpretedChunksDecoder4Generator;
15 15  
16   -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() {
17   - static Deserializer < vector < InterpsGroup > > *deserializer
  16 +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) {
  17 + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer
18 18 = new MorphDeserializer();
19   - return deserializer;
  19 + static Deserializer < vector < InterpsGroup > > *generatorDeserializer
  20 + = new MorphDeserializer();
  21 + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer);
20 22 }
21 23  
22 24 static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
... ... @@ -48,14 +50,15 @@ Environment::Environment(
48 50 caseConverter(),
49 51 tagset(fsaFileStartPtr),
50 52 fsaFileStartPtr(fsaFileStartPtr),
51   - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())),
  53 + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
52 54 segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
53 55 currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
54 56 isFromFile(false),
55 57 chunksDecoder(
56 58 processorType == ANALYZER
57 59 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
58   - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this))
  60 + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
  61 + processorType(processorType)
59 62 {
60 63 }
61 64  
... ... @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) {
110 113 delete this->fsaFileStartPtr;
111 114 }
112 115 this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
113   - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer());
  116 + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
114 117 this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
115 118 this->isFromFile = true;
116 119 }
... ...
morfeusz/Environment.hpp
... ... @@ -64,6 +64,7 @@ private:
64 64 bool isFromFile;
65 65  
66 66 const InterpretedChunksDecoder* chunksDecoder;
  67 + MorfeuszProcessorType processorType;
67 68  
68 69 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
69 70 };
... ...
morfeusz/FlexionGraph.cpp
... ... @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) {
15 15 this->graph.push_back(vector<Edge>());
16 16 this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr);
17 17 }
18   -// cerr << string(e.chunk.chunkStartPtr) << endl;
19 18 assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr);
20 19 this->graph[0].push_back(e);
21 20 }
... ... @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) {
30 29 this->graph[startNode].push_back(e);
31 30 }
32 31  
  32 +static inline bool chunkIsAtFront(
  33 + const InterpretedChunk& chunk,
  34 + const std::vector<InterpretedChunk>& path) {
  35 + unsigned int i;
  36 + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) {
  37 + }
  38 + assert(!path[i].orthWasShifted);
  39 + return &chunk == &(path[i]);
  40 +}
  41 +
  42 +static inline bool chunkIsAtBack(
  43 + const InterpretedChunk& chunk,
  44 + const std::vector<InterpretedChunk>& path) {
  45 + return &chunk == &(path.back());
  46 +}
  47 +
  48 +static inline bool chunkIsTheOnlyOne(
  49 + const InterpretedChunk& chunk,
  50 + const std::vector<InterpretedChunk>& path) {
  51 + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
  52 +}
  53 +
33 54 void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
34 55 // debugPath(path);
35 56 // debugGraph(this->graph);
36 57 for (unsigned int i = 0; i < path.size(); i++) {
37 58 const InterpretedChunk& chunk = path[i];
38 59 if (!chunk.orthWasShifted) {
39   - if (&chunk == &(path.front())
40   - && &chunk == &(path.back())) {
  60 + if (chunkIsTheOnlyOne(chunk, path)) {
41 61 Edge e = {chunk, UINT_MAX};
42 62 this->addStartEdge(e);
43 63 }
44   - else if (&chunk == &(path.front())) {
  64 + else if (chunkIsAtFront(chunk, path)) {
45 65 Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()};
46 66 this->addStartEdge(e);
47 67 }
48   - else if (&chunk == &(path.back())) {
  68 + else if (chunkIsAtBack(chunk, path)) {
49 69 Edge e = {chunk, UINT_MAX};
50 70 this->addMiddleEdge((unsigned int) this->graph.size(), e);
51 71 }
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -18,6 +18,10 @@
18 18 #include "charset/CaseConverter.hpp"
19 19 #include "Environment.hpp"
20 20  
  21 +const uint8_t LEMMA_ONLY_LOWER = 0;
  22 +const uint8_t LEMMA_UPPER_PREFIX = 1;
  23 +const uint8_t LEMMA_MIXED_CASE = 2;
  24 +
21 25 class InterpretedChunksDecoder {
22 26 public:
23 27  
... ... @@ -30,22 +34,12 @@ public:
30 34 unsigned int endNode,
31 35 const InterpretedChunk& interpretedChunk,
32 36 std::vector<MorphInterpretation>& out) const = 0;
33   -
34   - virtual ~InterpretedChunksDecoder() {}
35 37  
36   -protected:
37   -
38   - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
39   - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
40   - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
41   - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
42   - decodeForm(
43   - prefixChunk.lowercaseCodepoints,
44   - prefixChunk.interpsGroup.interps[0].value,
45   - decodedForm);
46   - }
  38 + virtual ~InterpretedChunksDecoder() {
47 39 }
48   -
  40 +
  41 +protected:
  42 +
49 43 virtual void decodeForm(
50 44 const std::vector<uint32_t>& orth,
51 45 const EncodedForm& form,
... ... @@ -55,9 +49,10 @@ protected:
55 49 };
56 50  
57 51 class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
58   -
59 52 public:
60   - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {}
  53 +
  54 + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
  55 + }
61 56  
62 57 void decode(
63 58 unsigned int startNode,
... ... @@ -65,22 +60,12 @@ public:
65 60 const InterpretedChunk& interpretedChunk,
66 61 std::vector<MorphInterpretation>& out) const {
67 62 string orth;
68   - string lemma;
69   - convertPrefixes(interpretedChunk, orth, lemma);
  63 + string lemmaPrefix;
  64 + convertPrefixes(interpretedChunk, orth, lemmaPrefix);
70 65 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
71   - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
72   - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
73   - decodeForm(
74   - interpretedChunk.lowercaseCodepoints,
75   - ei.value,
76   - lemma);
77   - out.push_back(MorphInterpretation(
78   - startNode, endNode,
79   - orth, lemma,
80   - ei.tag,
81   - ei.nameClassifier,
82   - env.getTagset(),
83   - env.getCharsetConverter()));
  66 + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr;
  67 + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) {
  68 + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr));
84 69 }
85 70 }
86 71  
... ... @@ -104,36 +89,116 @@ protected:
104 89 env.getCharsetConverter().append(cp, res);
105 90 }
106 91 }
  92 +
  93 +private:
  94 +
  95 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
  96 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  97 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  98 + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
  99 + const unsigned char* ptr = prefixChunk.interpsGroup.ptr;
  100 + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr);
  101 + decodedForm += mi.getLemma();
  102 + }
  103 + }
  104 +
  105 + MorphInterpretation decodeMorphInterpretation(
  106 + unsigned int startNode, unsigned int endNode,
  107 + const string& orth,
  108 + const string& lemmaPrefix,
  109 + const InterpretedChunk& chunk,
  110 + const unsigned char*& ptr) const {
  111 + string lemma = lemmaPrefix;
  112 + EncodedInterpretation ei = this->decodeInterp(ptr);
  113 + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
  114 + return MorphInterpretation(
  115 + startNode, endNode,
  116 + orth, lemma,
  117 + ei.tag,
  118 + ei.nameClassifier,
  119 + env.getTagset(),
  120 + env.getCharsetConverter());
  121 + }
  122 +
  123 + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const {
  124 + lemma.suffixToCut = *ptr;
  125 + ptr++;
  126 + lemma.suffixToAdd = (const char*) ptr;
  127 + ptr += strlen((const char*) ptr) + 1;
  128 + assert(lemma.casePattern.size() == 0);
  129 + // lemma.casePattern.resize(MAX_WORD_SIZE, false);
  130 + uint8_t casePatternType = *ptr;
  131 + ptr++;
  132 + uint8_t prefixLength;
  133 + uint8_t patternLength;
  134 + switch (casePatternType) {
  135 + case LEMMA_ONLY_LOWER:
  136 + break;
  137 + case LEMMA_UPPER_PREFIX:
  138 + prefixLength = *ptr;
  139 + ptr++;
  140 + for (unsigned int i = 0; i < prefixLength; i++) {
  141 + // lemma.casePattern[i] = true;
  142 + lemma.casePattern.push_back(true);
  143 + }
  144 + // lemma.casePattern.resize(prefixLength, true);
  145 + break;
  146 + case LEMMA_MIXED_CASE:
  147 + patternLength = *ptr;
  148 + ptr++;
  149 + for (unsigned int i = 0; i < patternLength; i++) {
  150 + uint8_t idx = *ptr;
  151 + ptr++;
  152 + // lemma.casePattern[idx] = true;
  153 + lemma.casePattern.resize(idx + 1, false);
  154 + lemma.casePattern[idx] = true;
  155 + }
  156 + break;
  157 + }
  158 + }
  159 +
  160 + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const {
  161 + EncodedInterpretation interp;
  162 + decodeLemma(ptr, interp.value);
  163 + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr)));
  164 + ptr += 2;
  165 + interp.nameClassifier = *ptr;
  166 + ptr++;
  167 + return interp;
  168 + }
107 169 };
108 170  
109 171 class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
110   -
111 172 public:
112   - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {}
  173 +
  174 + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
  175 + }
113 176  
114 177 void decode(
115 178 unsigned int startNode,
116 179 unsigned int endNode,
117 180 const InterpretedChunk& interpretedChunk,
118 181 std::vector<MorphInterpretation>& out) const {
119   - string orth;
120   - string lemma;
121   - convertPrefixes(interpretedChunk, lemma, orth);
122   - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
123   - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
124   - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
125   - decodeForm(
126   - interpretedChunk.originalCodepoints,
127   - ei.value,
128   - orth);
129   - out.push_back(MorphInterpretation(
130   - startNode, endNode,
131   - orth, lemma,
132   - ei.tag,
133   - ei.nameClassifier,
134   - env.getTagset(),
135   - env.getCharsetConverter()));
136   - }
  182 + // string orth;
  183 + // string lemma;
  184 + // convertPrefixes(interpretedChunk, lemma, orth);
  185 + // size_t orthLength = orth.length();
  186 + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  187 + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
  188 + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
  189 + // decodeForm(
  190 + // interpretedChunk.originalCodepoints,
  191 + // ei.value,
  192 + // orth);
  193 + // out.push_back(MorphInterpretation(
  194 + // startNode, endNode,
  195 + // orth, lemma,
  196 + // ei.tag,
  197 + // ei.nameClassifier,
  198 + // env.getTagset(),
  199 + // env.getCharsetConverter()));
  200 + // orth.erase(orthLength);
  201 + // }
137 202 }
138 203  
139 204 private:
... ...
morfeusz/InterpsGroup.hpp
... ... @@ -14,24 +14,26 @@
14 14 #include "MorphInterpretation.hpp"
15 15 #include "Tagset.hpp"
16 16  
17   -class InterpsGroup {
18   -public:
19   -
20   - InterpsGroup() {
21   -
22   - }
23   -
24   - explicit InterpsGroup(const unsigned char type)
25   - : type(type) {
26   -
27   - }
28   -
29   - void addInterpretation(const EncodedInterpretation& interp) {
30   - interps.push_back(interp);
31   - }
  17 +struct InterpsGroup {
  18 +//public:
  19 +//
  20 +// InterpsGroup() {
  21 +//
  22 +// }
  23 +//
  24 +// explicit InterpsGroup(const unsigned char type)
  25 +// : type(type) {
  26 +//
  27 +// }
  28 +//
  29 +// void addInterpretation(const EncodedInterpretation& interp) {
  30 +// interps.push_back(interp);
  31 +// }
32 32  
33 33 unsigned char type;
34   - std::vector<EncodedInterpretation> interps;
  34 + uint16_t size;
  35 + const unsigned char* ptr;
  36 +// std::vector<EncodedInterpretation> interps;
35 37 };
36 38  
37 39 #endif /* GROUPEDINTERPRETATIONS_HPP */
... ...
morfeusz/Morfeusz.cpp
... ... @@ -82,7 +82,9 @@ void Morfeusz::processOneWord(
82 82 FlexionGraph graph;
83 83 const char* currInput = inputStart;
84 84 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
  85 +
85 86 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
  87 +
86 88 if (!graph.empty()) {
87 89 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
88 90 int srcNode = startNodeNum;
... ... @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
110 112 from.prefixChunks.end());
111 113 to.prefixChunks.push_back(from);
112 114 from.orthWasShifted = true;
  115 + to.chunkStartPtr = from.chunkStartPtr;
113 116 }
114 117  
115 118 void Morfeusz::doProcessOneWord(
... ... @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord(
119 122 SegrulesState segrulesState,
120 123 vector<InterpretedChunk>& accum,
121 124 FlexionGraph& graph) const {
122   - cerr << "doAnalyzeOneWord " << inputData << endl;
123   - bool endOfProcessing = inputData == inputEnd;
  125 +// cerr << "doAnalyzeOneWord " << inputData << endl;
124 126 const char* currInput = inputData;
125   - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
126   - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter));
  127 + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
127 128 vector<uint32_t> originalCodepoints;
128 129 vector<uint32_t> lowercaseCodepoints;
129 130  
130 131 StateType state = env.getFSA().getInitialState();
131 132  
132   - while (!endOfProcessing) {
133   - if (isEndOfWord(codepoint)) {
134   - endOfProcessing = true;
135   - }
136   - cerr << "not end of word '" << string(currInput) << "'" << endl;
  133 + while (!isEndOfWord(codepoint)) {
137 134 uint32_t lowerCP = env.getCaseConverter().toLower(codepoint);
138 135 originalCodepoints.push_back(codepoint);
139 136 lowercaseCodepoints.push_back(lowerCP);
140 137 feedState(state, lowerCP, UTF8CharsetConverter());
141 138 codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
142 139 if (state.isAccepting()) {
143   - cerr << "accepting" << endl;
144 140 vector<InterpsGroup> val(state.getValue());
145 141 for (unsigned int i = 0; i < val.size(); i++) {
146 142 InterpsGroup& ig = val[i];
... ... @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord(
151 147 it != newSegrulesStates.end();
152 148 ++it) {
153 149 SegrulesState newSegrulesState = *it;
  150 +// if (newSegrulesState.shiftOrthFromPrevious) {
  151 +//
  152 +// }
154 153 InterpretedChunk ic = {
155 154 inputData,
156 155 originalCodepoints,
... ... @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord(
165 164 }
166 165 accum.push_back(ic);
167 166 if (isEndOfWord(codepoint)) {
168   - cerr << "end of word inside " << currInput <<endl;
169 167 if (newSegrulesState.accepting)
170 168 graph.addPath(accum);
171 169 }
... ... @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord(
177 175 }
178 176 }
179 177 }
  178 + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
180 179 }
181   - cerr << "end of word " << currInput << endl;
182 180 inputData = currInput;
183 181 }
184 182  
... ...
morfeusz/MorphDeserializer.cpp
... ... @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() {
23 23 MorphDeserializer::~MorphDeserializer() {
24 24 }
25 25  
26   -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
27   - // XXX uważać na poprawność danych
28   - lemma.suffixToCut = *ptr;
29   - ptr++;
30   - lemma.suffixToAdd = (const char*) ptr;
31   - ptr += strlen((const char*) ptr) + 1;
32   - assert(lemma.casePattern.size() == 0);
33   -// lemma.casePattern.resize(MAX_WORD_SIZE, false);
34   - uint8_t casePatternType = *ptr;
35   - ptr++;
36   - uint8_t prefixLength;
37   - uint8_t patternLength;
38   - switch (casePatternType) {
39   - case LEMMA_ONLY_LOWER:
40   - break;
41   - case LEMMA_UPPER_PREFIX:
42   - prefixLength = *ptr;
43   - ptr++;
44   - for (unsigned int i = 0; i < prefixLength; i++) {
45   -// lemma.casePattern[i] = true;
46   - lemma.casePattern.push_back(true);
47   - }
48   -// lemma.casePattern.resize(prefixLength, true);
49   - break;
50   - case LEMMA_MIXED_CASE:
51   - patternLength = *ptr;
52   - ptr++;
53   - for (unsigned int i = 0; i < patternLength; i++) {
54   - uint8_t idx = *ptr;
55   - ptr++;
  26 +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
  27 +// // XXX uważać na poprawność danych
  28 +// lemma.suffixToCut = *ptr;
  29 +// ptr++;
  30 +// lemma.suffixToAdd = (const char*) ptr;
  31 +// ptr += strlen((const char*) ptr) + 1;
  32 +// assert(lemma.casePattern.size() == 0);
  33 +//// lemma.casePattern.resize(MAX_WORD_SIZE, false);
  34 +// uint8_t casePatternType = *ptr;
  35 +// ptr++;
  36 +// uint8_t prefixLength;
  37 +// uint8_t patternLength;
  38 +// switch (casePatternType) {
  39 +// case LEMMA_ONLY_LOWER:
  40 +// break;
  41 +// case LEMMA_UPPER_PREFIX:
  42 +// prefixLength = *ptr;
  43 +// ptr++;
  44 +// for (unsigned int i = 0; i < prefixLength; i++) {
  45 +//// lemma.casePattern[i] = true;
  46 +// lemma.casePattern.push_back(true);
  47 +// }
  48 +//// lemma.casePattern.resize(prefixLength, true);
  49 +// break;
  50 +// case LEMMA_MIXED_CASE:
  51 +// patternLength = *ptr;
  52 +// ptr++;
  53 +// for (unsigned int i = 0; i < patternLength; i++) {
  54 +// uint8_t idx = *ptr;
  55 +// ptr++;
  56 +//// lemma.casePattern[idx] = true;
  57 +// lemma.casePattern.resize(idx + 1, false);
56 58 // lemma.casePattern[idx] = true;
57   - lemma.casePattern.resize(idx + 1, false);
58   - lemma.casePattern[idx] = true;
59   - }
60   - break;
61   - }
62   -}
63   -
64   -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
65   - interp.type = *ptr;
66   - ptr++;
67   - deserializeLemma(ptr, interp.value);
68   - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
69   - ptr += 2;
70   - interp.nameClassifier = *ptr;
71   - ptr++;
72   -}
  59 +// }
  60 +// break;
  61 +// }
  62 +//}
  63 +//
  64 +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  65 +// interp.type = *ptr;
  66 +// ptr++;
  67 +// deserializeLemma(ptr, interp.value);
  68 +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  69 +// ptr += 2;
  70 +// interp.nameClassifier = *ptr;
  71 +// ptr++;
  72 +//}
73 73  
74 74 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
75 75 const unsigned char* currPtr = ptr;
76   - uint8_t interpsNum = *ptr;
77   - interps.clear();
78   - interps.reserve(interpsNum);
  76 + uint8_t interpTypesNum = *currPtr;
79 77 currPtr++;
80   - // FIXME - to jest do poprawy
81   - map<int, InterpsGroup> results;
82   - for (unsigned int i = 0; i < interpsNum; ++i) {
83   - EncodedInterpretation interp;
84   - deserializeInterp(currPtr, interp);
85   - if (results.count(interp.type) == 0) {
86   - results[interp.type] = InterpsGroup(interp.type);
87   - }
88   - results[interp.type].addInterpretation(interp);
89   -// interps.push_back(interp);
90   - }
91   - map<int, InterpsGroup>::iterator it;
92   - for (it = results.begin(); it != results.end(); ++it) {
93   - interps.push_back((*it).second);
  78 + interps.clear();
  79 + interps.reserve(interpTypesNum);
  80 + for (unsigned int i = 0; i < interpTypesNum; i++) {
  81 + InterpsGroup ig;
  82 + ig.type = *currPtr;
  83 + currPtr++;
  84 + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr)));
  85 + currPtr += 2;
  86 + ig.ptr = currPtr;
  87 + currPtr += ig.size;
  88 + interps.push_back(ig);
94 89 }
95 90 return currPtr - ptr;
96 91 }
  92 +
  93 +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
  94 +// const unsigned char* currPtr = ptr;
  95 +// uint8_t interpsNum = *ptr;
  96 +// interps.clear();
  97 +// interps.reserve(interpsNum);
  98 +// currPtr++;
  99 +// // FIXME - to jest do poprawy
  100 +// map<int, InterpsGroup> results;
  101 +// for (unsigned int i = 0; i < interpsNum; ++i) {
  102 +// EncodedInterpretation interp;
  103 +// deserializeInterp(currPtr, interp);
  104 +// if (results.count(interp.type) == 0) {
  105 +// results[interp.type] = InterpsGroup(interp.type);
  106 +// }
  107 +// results[interp.type].addInterpretation(interp);
  108 +//// interps.push_back(interp);
  109 +// }
  110 +// map<int, InterpsGroup>::iterator it;
  111 +// for (it = results.begin(); it != results.end(); ++it) {
  112 +// interps.push_back((*it).second);
  113 +// }
  114 +// return currPtr - ptr;
  115 +//}
... ...
nbproject/configurations.xml
... ... @@ -106,14 +106,20 @@
106 106 </makeTool>
107 107 </makefileType>
108 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
  109 + <ccTool flags="1">
  110 + </ccTool>
109 111 </item>
110 112 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
  113 + <ccTool flags="1">
  114 + </ccTool>
111 115 </item>
112 116 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
113 117 </item>
114 118 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
115 119 </item>
116 120 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  121 + <ccTool flags="1">
  122 + </ccTool>
117 123 </item>
118 124 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
119 125 ex="false"
... ... @@ -169,7 +175,7 @@
169 175 <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
170 176 </item>
171 177 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4">
172   - <ccTool flags="1">
  178 + <ccTool>
173 179 <incDir>
174 180 <pElem>morfeusz</pElem>
175 181 <pElem>morfeusz/build/morfeusz</pElem>
... ... @@ -180,7 +186,7 @@
180 186 </ccTool>
181 187 </item>
182 188 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
183   - <ccTool flags="1">
  189 + <ccTool>
184 190 <incDir>
185 191 <pElem>morfeusz</pElem>
186 192 <pElem>morfeusz/build/morfeusz</pElem>
... ... @@ -273,7 +279,7 @@
273 279 <ccTool>
274 280 <incDir>
275 281 <pElem>morfeusz</pElem>
276   - <pElem>/usr/lib/jvm/default-java/include</pElem>
  282 + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
277 283 </incDir>
278 284 <preprocessorList>
279 285 <Elem>libjmorfeusz_EXPORTS</Elem>
... ... @@ -408,18 +414,26 @@
408 414 </ccTool>
409 415 </item>
410 416 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  417 + <ccTool flags="1">
  418 + </ccTool>
411 419 </item>
412 420 <item path="morfeusz/charset/CharsetConverter.cpp"
413 421 ex="false"
414 422 tool="1"
415 423 flavor2="4">
  424 + <ccTool flags="1">
  425 + </ccTool>
416 426 </item>
417 427 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  428 + <ccTool flags="1">
  429 + </ccTool>
418 430 </item>
419 431 <item path="morfeusz/charset/conversion_tables.cpp"
420 432 ex="false"
421 433 tool="1"
422 434 flavor2="4">
  435 + <ccTool flags="1">
  436 + </ccTool>
423 437 </item>
424 438 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
425 439 <ccTool flags="1">
... ... @@ -508,8 +522,12 @@
508 522 ex="false"
509 523 tool="1"
510 524 flavor2="4">
  525 + <ccTool flags="1">
  526 + </ccTool>
511 527 </item>
512 528 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  529 + <ccTool flags="1">
  530 + </ccTool>
513 531 </item>
514 532 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
515 533 <ccTool flags="0">
... ...