Commit aece7355ccc6dec55f688a43c5c8bcd911c7b9ca
1 parent
d836a116
nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
21 changed files
with
117 additions
and
139 deletions
README
... | ... | @@ -5,7 +5,7 @@ Compilation - prerequisites |
5 | 5 | |
6 | 6 | This tutorial assumes that the build process is performed on a 64-bit Linux machine (preferably from the Debian/Ubuntu family). |
7 | 7 | |
8 | -sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip | |
8 | +sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing | |
9 | 9 | sudo pip install pyinstaller |
10 | 10 | |
11 | 11 | For cross compiling: |
... | ... |
fsabuilder/CMakeLists.txt
... | ... | @@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
55 | 55 | list (APPEND PACKAGE_DEPENDS package-python-win-installer) |
56 | 56 | |
57 | 57 | #~ add_custom_target (buildfsa-exec ALL |
58 | - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder | |
59 | -#~ ) | |
58 | + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder | |
59 | + #~ ) | |
60 | 60 | #~ |
61 | 61 | #~ add_executable (morfeusz_builder IMPORTED) |
62 | 62 | #~ add_dependencies (morfeusz_builder buildfsa-exec) |
... | ... |
fsabuilder/buildanalyzer.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1 | |
3 | +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1 | |
... | ... |
fsabuilder/buildgenerator.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | - --tagset-file=../input/polimorf.tagset \ | |
3 | +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | |
4 | + --tagset-file=../input/sgjp-morfeusz.tagset \ | |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --generator \ |
7 | 7 | --serialization-method=V2 \ |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -41,6 +41,11 @@ class EncodedForm4Generator(object): |
41 | 41 | self.cutLength = bestEncodedForm.cutLength |
42 | 42 | self.suffixToAdd = bestEncodedForm.suffixToAdd |
43 | 43 | self.prefixToAdd = targetWord[:bestPrefixLength] |
44 | + | |
45 | +# if fromWord == 'BC': | |
46 | +# print self.cutLength | |
47 | +# print self.suffixToAdd | |
48 | +# print self.prefixToAdd, len(self.prefixToAdd) | |
44 | 49 | |
45 | 50 | class Interpretation4Analyzer(object): |
46 | 51 | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -6,7 +6,7 @@ Created on Oct 23, 2013 |
6 | 6 | |
7 | 7 | import logging |
8 | 8 | import itertools |
9 | -from morfeuszbuilder.utils import serializationUtils | |
9 | +from morfeuszbuilder.utils.serializationUtils import * | |
10 | 10 | |
11 | 11 | class Encoder(object): |
12 | 12 | ''' |
... | ... | @@ -44,19 +44,6 @@ class Encoder(object): |
44 | 44 | assert typenum >= 0 and typenum < 256 |
45 | 45 | return bytearray([typenum]) |
46 | 46 | |
47 | - def _encodeEncodedForm(self, form, withCasePattern, withPrefix): | |
48 | - res = bytearray() | |
49 | - assert form.cutLength < 256 and form.cutLength >= 0 | |
50 | - if withPrefix: | |
51 | - res.extend(self.encodeWord(form.prefixToAdd, lowercase=False)) | |
52 | - res.append(0) | |
53 | - res.append(form.cutLength) | |
54 | - res.extend(self.encodeWord(form.suffixToAdd, lowercase=False)) | |
55 | - res.append(0) | |
56 | - if withCasePattern: | |
57 | - res.extend(self._encodeCasePattern(form.casePattern)) | |
58 | - return res | |
59 | - | |
60 | 47 | def _encodeCasePattern(self, casePattern): |
61 | 48 | res = bytearray() |
62 | 49 | if True not in casePattern: |
... | ... | @@ -84,7 +71,7 @@ class Encoder(object): |
84 | 71 | n = len(self.qualifiersMap) |
85 | 72 | self.qualifiersMap[key] = n |
86 | 73 | assert n < 500 |
87 | - res.extend(serializationUtils.htons(n)) | |
74 | + res.extend(htons(n)) | |
88 | 75 | return res |
89 | 76 | |
90 | 77 | def _hasUpperPrefix(self, casePattern): |
... | ... | @@ -102,11 +89,9 @@ class Encoder(object): |
102 | 89 | |
103 | 90 | def _encodeTagNum(self, tagnum): |
104 | 91 | res = bytearray() |
105 | -# logging.info((tagnum & 0xFF00) >> 8) | |
106 | 92 | assert tagnum < 65536 and tagnum >= 0 |
107 | 93 | res.append((tagnum & 0xFF00) >> 8) |
108 | 94 | res.append(tagnum & 0x00FF) |
109 | -# logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1]))) | |
110 | 95 | return res |
111 | 96 | |
112 | 97 | def _encodeNameNum(self, namenum): |
... | ... | @@ -129,31 +114,37 @@ class Encoder(object): |
129 | 114 | res.append(list(interp.orthCasePattern)) |
130 | 115 | return res |
131 | 116 | |
132 | - def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId): | |
117 | + def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer): | |
133 | 118 | res = bytearray() |
134 | 119 | res.extend(self._encodeTypeNum(typenum)) |
135 | 120 | encodedInterpsList = bytearray() |
136 | - if withCasePattern: | |
121 | + if isAnalyzer: | |
137 | 122 | casePatterns = self._getOrthCasePatterns(interpsList) |
138 | 123 | encodedInterpsList.append(len(casePatterns)) |
139 | 124 | for casePattern in casePatterns: |
140 | 125 | encodedInterpsList.extend(self._encodeCasePattern(casePattern)) |
141 | 126 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
142 | - if withHomonymId: | |
143 | - encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False)) | |
144 | - encodedInterpsList.append(0) | |
145 | - if withCasePattern: | |
127 | + if isAnalyzer: | |
146 | 128 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) |
147 | - encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | |
148 | - encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | |
149 | - encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | |
129 | + else: | |
130 | + serializeString(interp.homonymId, encodedInterpsList) | |
131 | + serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) | |
132 | + encodedInterpsList.append(interp.encodedForm.cutLength) | |
133 | + serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) | |
134 | + if isAnalyzer: | |
135 | + encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) | |
136 | + encodedInterpsList.extend(htons(interp.tagnum)) | |
137 | + encodedInterpsList.append(interp.namenum) | |
150 | 138 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) |
139 | + | |
140 | + if interp.encodedForm.suffixToAdd == 'bc': | |
141 | + print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList] | |
151 | 142 | |
152 | - res.extend(serializationUtils.htons(len(encodedInterpsList))) | |
143 | + res.extend(htons(len(encodedInterpsList))) | |
153 | 144 | res.extend(encodedInterpsList) |
154 | 145 | return res |
155 | 146 | |
156 | - def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId): | |
147 | + def _doEncodeData(self, interpsList, isAnalyzer): | |
157 | 148 | |
158 | 149 | assert type(interpsList) == frozenset |
159 | 150 | |
... | ... | @@ -167,7 +158,7 @@ class Encoder(object): |
167 | 158 | res.append(firstByte) |
168 | 159 | |
169 | 160 | for typenum, interpsList in segnum2Interps.iteritems(): |
170 | - res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId)) | |
161 | + res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer)) | |
171 | 162 | del interpsList |
172 | 163 | |
173 | 164 | return res |
... | ... | @@ -181,7 +172,7 @@ class MorphEncoder(Encoder): |
181 | 172 | self.LEMMA_MIXED_CASE = 2 |
182 | 173 | |
183 | 174 | def encodeData(self, interpsList): |
184 | - return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False) | |
175 | + return self._doEncodeData(interpsList, isAnalyzer=True) | |
185 | 176 | |
186 | 177 | class Encoder4Generator(Encoder): |
187 | 178 | |
... | ... | @@ -189,4 +180,4 @@ class Encoder4Generator(Encoder): |
189 | 180 | super(Encoder4Generator, self).__init__(False, encoding) |
190 | 181 | |
191 | 182 | def encodeData(self, interpsList): |
192 | - return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True) | |
183 | + return self._doEncodeData(interpsList, isAnalyzer=False) | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -43,9 +43,6 @@ class FSA(object): |
43 | 43 | # debug |
44 | 44 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: |
45 | 45 | logging.info(u'%d %s' % (self.n, word)) |
46 | -# logging.info(str(self.register.getStatesNum())) | |
47 | -# logging.info(str(self.register.getStatesNum())) | |
48 | - # allWords.append(word) | |
49 | 46 | for label in encodedWord: |
50 | 47 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
51 | 48 | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -56,6 +56,7 @@ class TagRule(SegmentRule): |
56 | 56 | |
57 | 57 | def __str__(self): |
58 | 58 | res = self.segtype |
59 | + res += '(' + str(self.segnum) + ')' | |
59 | 60 | if self.shiftOrth: |
60 | 61 | res += '>' |
61 | 62 | return res |
... | ... | @@ -70,8 +71,8 @@ class TagRule(SegmentRule): |
70 | 71 | class UnaryRule(SegmentRule): |
71 | 72 | |
72 | 73 | def __init__(self, child, linenum): |
74 | + super(UnaryRule, self).__init__(linenum) | |
73 | 75 | self.child = child |
74 | - self.linenum = linenum | |
75 | 76 | assert not child.isSinkRule() |
76 | 77 | |
77 | 78 | def isShiftOrthRule(self): |
... | ... | @@ -80,8 +81,8 @@ class UnaryRule(SegmentRule): |
80 | 81 | class ComplexRule(SegmentRule): |
81 | 82 | |
82 | 83 | def __init__(self, children, linenum): |
84 | + super(ComplexRule, self).__init__(linenum) | |
83 | 85 | self.children = children |
84 | - self.linenum = linenum | |
85 | 86 | assert not any(map(lambda c: c.isSinkRule(), children)) |
86 | 87 | |
87 | 88 | def addToNFA(self, fsa): |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -49,6 +49,7 @@ class RulesManager(object): |
49 | 49 | res.extend(self._serializeDFA(dfa)) |
50 | 50 | res.extend(self._serializeOptionsMap(self.defaultOptions)) |
51 | 51 | logging.info('segmentation rules size: %s bytes', len(res)) |
52 | +# logging.info([int(x) for x in res]) | |
52 | 53 | return res |
53 | 54 | |
54 | 55 | def _serializeSeparatorsList(self): |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -63,8 +63,8 @@ class RulesParser(object): |
63 | 63 | nfa = rulesNFA.RulesNFA() |
64 | 64 | if not firstNFA: |
65 | 65 | firstNFA = nfa |
66 | - section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations' | |
67 | - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False) | |
66 | +# section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations' | |
67 | + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False) | |
68 | 68 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) |
69 | 69 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): |
70 | 70 | if rule.allowsEmptySequence(): |
... | ... | @@ -72,8 +72,11 @@ class RulesParser(object): |
72 | 72 | filename, |
73 | 73 | rule.linenum, |
74 | 74 | 'This rule allows empty segments sequence to be accepted') |
75 | - rule.addToNFA(nfa) | |
76 | -# nfa.debug() | |
75 | + if self.rulesType == RulesParser.PARSE4GENERATOR: | |
76 | + rule = rule.transformToGeneratorVersion() | |
77 | + if not rule.isSinkRule(): | |
78 | + rule.addToNFA(nfa) | |
79 | +# nfa.debug() | |
77 | 80 | try: |
78 | 81 | dfa = nfa.convertToDFA() |
79 | 82 | res.addDFA(key2Def, dfa) |
... | ... | @@ -146,10 +149,11 @@ class RulesParser(object): |
146 | 149 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3 |
147 | 150 | oneOfRule = delimitedList(unaryRule, delim='|') |
148 | 151 | complexRule = unaryRule ^ oneOfRule |
149 | - if self.rulesType == RulesParser.PARSE4ANALYZER: | |
150 | - concatRule = OneOrMore(complexRule) | |
151 | - else: | |
152 | - concatRule = ZeroOrMore(shiftOrthRule) + tagRule | |
152 | + concatRule = OneOrMore(complexRule) | |
153 | +# if self.rulesType == RulesParser.PARSE4ANALYZER: | |
154 | +# concatRule = OneOrMore(complexRule) | |
155 | +# else: | |
156 | +# concatRule = ZeroOrMore(shiftOrthRule) + tagRule | |
153 | 157 | rule << concatRule + Optional(CaselessLiteral('!weak')) |
154 | 158 | |
155 | 159 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
... | ... |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
input/segmenty.dat
morfeusz/CasePatternHelper.hpp
... | ... | @@ -62,15 +62,6 @@ public: |
62 | 62 | } |
63 | 63 | } |
64 | 64 | |
65 | - const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const { | |
66 | - const unsigned char* currPtr = ig.ptr; | |
67 | - unsigned char casePatternsNum = *currPtr++; | |
68 | - for (unsigned int i = 0; i < casePatternsNum; i++) { | |
69 | - deserializeOneCasePattern(currPtr); | |
70 | - } | |
71 | - return currPtr; | |
72 | - } | |
73 | - | |
74 | 65 | std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { |
75 | 66 | std::vector<bool> res; |
76 | 67 | uint8_t casePatternType = *ptr; |
... | ... | @@ -103,26 +94,6 @@ public: |
103 | 94 | } |
104 | 95 | return res; |
105 | 96 | } |
106 | - | |
107 | -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | |
108 | -// if (this->caseSensitive) { | |
109 | -// for (unsigned int i = 0; i < chunks.size(); i++) { | |
110 | -// const InterpretedChunk& ic = chunks[i]; | |
111 | -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | |
112 | -// std::vector<bool> casePattern; | |
113 | -// deserializeCasePattern(casePatternPtr, casePattern); | |
114 | -// if (!checkCasePattern(ic, casePattern)) { | |
115 | -// return false; | |
116 | -// } | |
117 | -// } | |
118 | -// } | |
119 | -// return true; | |
120 | -// } | |
121 | - | |
122 | -// void skipCasePattern(const unsigned char*& ptr) const { | |
123 | -// vector<bool> _dupa; | |
124 | -// deserializeCasePattern(ptr, _dupa); | |
125 | -// } | |
126 | 97 | private: |
127 | 98 | bool caseSensitive; |
128 | 99 | |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -40,18 +40,6 @@ public: |
40 | 40 | |
41 | 41 | protected: |
42 | 42 | |
43 | - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | |
44 | - EncodedInterpretation interp; | |
45 | - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
46 | - deserializeEncodedForm(ptr, interp.value); | |
47 | - interp.tag = readInt16(ptr); | |
48 | - interp.nameClassifier = *ptr++; | |
49 | - interp.qualifiers = readInt16(ptr); | |
50 | - return interp; | |
51 | - } | |
52 | - | |
53 | - virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0; | |
54 | - | |
55 | 43 | const Environment& env; |
56 | 44 | }; |
57 | 45 | |
... | ... | @@ -106,6 +94,16 @@ protected: |
106 | 94 | assert(encodedForm.casePattern.size() == 0); |
107 | 95 | encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); |
108 | 96 | } |
97 | + | |
98 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | |
99 | + EncodedInterpretation interp; | |
100 | + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
101 | + deserializeEncodedForm(ptr, interp.value); | |
102 | + interp.tag = readInt16(ptr); | |
103 | + interp.nameClassifier = *ptr++; | |
104 | + interp.qualifiers = readInt16(ptr); | |
105 | + return interp; | |
106 | + } | |
109 | 107 | private: |
110 | 108 | |
111 | 109 | pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { |
... | ... | @@ -176,7 +174,7 @@ public: |
176 | 174 | const unsigned char* currPtr = interpretedChunk.interpsPtr; |
177 | 175 | while (currPtr < interpretedChunk.interpsEndPtr) { |
178 | 176 | MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); |
179 | - // cerr << mi.toString(false) << endl; | |
177 | +// cerr << mi.toString(false) << endl; | |
180 | 178 | // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; |
181 | 179 | if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) { |
182 | 180 | out.push_back(mi); |
... | ... | @@ -203,15 +201,12 @@ private: |
203 | 201 | const InterpretedChunk& chunk, |
204 | 202 | const unsigned char*& ptr) const { |
205 | 203 | string orth = orthPrefix; |
206 | - string homonymId = (const char*) ptr; | |
207 | - ptr += strlen((const char*) ptr) + 1; | |
208 | 204 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
209 | 205 | this->decodeForm(chunk.originalCodepoints, ei.value, orth); |
210 | - // string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId); | |
211 | 206 | return MorphInterpretation( |
212 | 207 | startNode, endNode, |
213 | 208 | orth, lemma, |
214 | - homonymId, | |
209 | + ei.homonymId, | |
215 | 210 | ei.tag, |
216 | 211 | ei.nameClassifier, |
217 | 212 | ei.qualifiers, |
... | ... | @@ -233,14 +228,17 @@ private: |
233 | 228 | env.getCharsetConverter().append(cp, res); |
234 | 229 | } |
235 | 230 | } |
236 | - | |
237 | - void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const { | |
238 | - encodedForm.prefixToAdd = (const char*) ptr; | |
239 | - ptr += strlen((const char*) ptr) + 1; | |
240 | - encodedForm.suffixToCut = *ptr; | |
241 | - ptr++; | |
242 | - encodedForm.suffixToAdd = (const char*) ptr; | |
243 | - ptr += strlen((const char*) ptr) + 1; | |
231 | + | |
232 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | |
233 | + EncodedInterpretation interp; | |
234 | + interp.homonymId = readString(ptr); | |
235 | + interp.value.prefixToAdd = readString(ptr); | |
236 | + interp.value.suffixToCut = readInt8(ptr); | |
237 | + interp.value.suffixToAdd = readString(ptr); | |
238 | + interp.tag = readInt16(ptr); | |
239 | + interp.nameClassifier = readInt8(ptr); | |
240 | + interp.qualifiers = readInt16(ptr); | |
241 | + return interp; | |
244 | 242 | } |
245 | 243 | }; |
246 | 244 | |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -18,6 +18,7 @@ |
18 | 18 | #include "charset/CaseConverter.hpp" |
19 | 19 | #include "segrules/segrules.hpp" |
20 | 20 | #include "const.hpp" |
21 | +#include "deserializationUtils.hpp" | |
21 | 22 | #include "charset/utf8.h" |
22 | 23 | |
23 | 24 | // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba |
... | ... | @@ -40,6 +41,20 @@ options(createDefaultOptions()) { |
40 | 41 | generatorEnv.setCaseSensitive(false); |
41 | 42 | } |
42 | 43 | |
44 | +inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) { | |
45 | + if (env.getProcessorType() == ANALYZER) { | |
46 | + const unsigned char* currPtr = ig.ptr; | |
47 | + unsigned char casePatternsNum = *currPtr++; | |
48 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | |
49 | + env.getCasePatternHelper().deserializeOneCasePattern(currPtr); | |
50 | + } | |
51 | + return currPtr; | |
52 | + } | |
53 | + else { | |
54 | + return ig.ptr; | |
55 | + } | |
56 | +} | |
57 | + | |
43 | 58 | void Morfeusz::setAnalyzerFile(const string& filename) { |
44 | 59 | this->analyzerEnv.setFSAFile(filename); |
45 | 60 | } |
... | ... | @@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord( |
183 | 198 | it != newSegrulesStates.end(); |
184 | 199 | ++it) { |
185 | 200 | SegrulesState newSegrulesState = *it; |
186 | - const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig); | |
201 | + const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); | |
187 | 202 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; |
188 | 203 | InterpretedChunk ic = { |
189 | 204 | ig.type, |
... | ... |
morfeusz/Qualifiers.cpp
... | ... | @@ -20,7 +20,6 @@ qualifiers() { |
20 | 20 | readTags(currPtr, _dupa); |
21 | 21 | _dupa.clear(); |
22 | 22 | readTags(currPtr, _dupa); |
23 | - | |
24 | 23 | uint16_t allCombinationsSize = readInt16(currPtr); |
25 | 24 | this->qualifiers.reserve(allCombinationsSize); |
26 | 25 | for (unsigned int i = 0; i < allCombinationsSize; i++) { |
... | ... |
morfeusz/deserializationUtils.hpp
... | ... | @@ -11,14 +11,24 @@ |
11 | 11 | #include "endianness.hpp" |
12 | 12 | #include <iostream> |
13 | 13 | |
14 | +inline unsigned char readInt8(const unsigned char*& currPtr) { | |
15 | + return *currPtr++; | |
16 | +} | |
17 | + | |
14 | 18 | inline uint16_t readInt16(const unsigned char*& currPtr) { |
15 | - uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr)); | |
19 | + uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr)); | |
16 | 20 | currPtr += 2; |
17 | 21 | return res; |
18 | 22 | } |
19 | 23 | |
24 | +inline uint32_t readInt32(const unsigned char*& currPtr) { | |
25 | + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr)); | |
26 | + currPtr += 4; | |
27 | + return res; | |
28 | +} | |
29 | + | |
20 | 30 | inline std::string readString(const unsigned char*& currPtr) { |
21 | - std::string res(reinterpret_cast<const char*>(currPtr)); | |
31 | + std::string res((const char*) currPtr); | |
22 | 32 | currPtr += res.length(); |
23 | 33 | currPtr++; |
24 | 34 | return res; |
... | ... |
morfeusz/segrules/SegrulesFSA.hpp
... | ... | @@ -9,7 +9,8 @@ |
9 | 9 | #define SEGRULESFSA_HPP |
10 | 10 | |
11 | 11 | #include <set> |
12 | -#include "../endianness.hpp" | |
12 | +#include <iostream> | |
13 | +#include "../deserializationUtils.hpp" | |
13 | 14 | |
14 | 15 | struct SegrulesState { |
15 | 16 | uint16_t offset; |
... | ... | @@ -37,8 +38,7 @@ public: |
37 | 38 | |
38 | 39 | const unsigned char* currPtr = ptr + state.offset; |
39 | 40 | currPtr++; |
40 | - const unsigned char transitionsNum = *currPtr; | |
41 | - currPtr++; | |
41 | + const unsigned char transitionsNum = *currPtr++; | |
42 | 42 | for (unsigned int i = 0; i < transitionsNum; i++) { |
43 | 43 | if (*currPtr == segnum) { |
44 | 44 | newStates.insert(newStates.begin(), this->transition2State(currPtr)); |
... | ... | @@ -58,9 +58,8 @@ private: |
58 | 58 | unsigned char WEAK_FLAG = 2; |
59 | 59 | SegrulesState res; |
60 | 60 | transitionPtr++; |
61 | - res.shiftOrthFromPrevious = *transitionPtr; | |
62 | - transitionPtr++; | |
63 | - res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); | |
61 | + res.shiftOrthFromPrevious = *transitionPtr++; | |
62 | + res.offset = readInt16(transitionPtr); | |
64 | 63 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; |
65 | 64 | res.weak = *(ptr + res.offset) & WEAK_FLAG; |
66 | 65 | return res; |
... | ... |
morfeusz/segrules/segrules.cpp
... | ... | @@ -2,25 +2,12 @@ |
2 | 2 | #include "segrules.hpp" |
3 | 3 | #include "../fsa/fsa.hpp" |
4 | 4 | #include "../fsa/const.hpp" |
5 | +#include "../deserializationUtils.hpp" | |
5 | 6 | |
6 | 7 | using namespace std; |
7 | 8 | |
8 | -static inline uint32_t deserializeUint32(const unsigned char*& ptr) { | |
9 | - uint32_t res = *reinterpret_cast<const uint32_t*>(ptr); | |
10 | - res = htonl(res); | |
11 | - ptr += 4; | |
12 | - return res; | |
13 | -} | |
14 | - | |
15 | -static inline string deserializeString(const unsigned char*& ptr) { | |
16 | - string res(reinterpret_cast<const char*>(ptr)); | |
17 | - ptr += res.length() + 1; | |
18 | - return res; | |
19 | -} | |
20 | - | |
21 | 9 | static inline void skipSeparatorsList(const unsigned char*& ptr) { |
22 | - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); | |
23 | - ptr += 2; | |
10 | + uint16_t listSize = readInt16(ptr); | |
24 | 11 | ptr += 4 * listSize; |
25 | 12 | } |
26 | 13 | |
... | ... | @@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr |
28 | 15 | const unsigned char* additionalDataPtr = ptr |
29 | 16 | + FSA_DATA_OFFSET |
30 | 17 | + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); |
31 | - const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | |
18 | + const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; | |
32 | 19 | return res; |
33 | 20 | } |
34 | 21 | |
... | ... | @@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { |
47 | 34 | unsigned char optsNum = *ptr; |
48 | 35 | ptr++; |
49 | 36 | for (unsigned char i = 0; i < optsNum; i++) { |
50 | - string key = deserializeString(ptr); | |
51 | - res[key] = deserializeString(ptr); | |
37 | + string key = readString(ptr); | |
38 | + res[key] = readString(ptr); | |
52 | 39 | } |
53 | 40 | return res; |
54 | 41 | } |
55 | 42 | |
56 | 43 | static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { |
57 | - uint32_t fsaSize = deserializeUint32(ptr); | |
44 | + uint32_t fsaSize = readInt32(ptr); | |
58 | 45 | // static SegrulesDeserializer deserializer; |
59 | 46 | SegrulesFSA* res = new SegrulesFSA(ptr); |
60 | 47 | ptr += fsaSize; |
... | ... |
nbproject/configurations.xml
... | ... | @@ -105,7 +105,7 @@ |
105 | 105 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
106 | 106 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
107 | 107 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
108 | - <executablePath>build/morfeusz/morfeusz_analyzer</executablePath> | |
108 | + <executablePath>build/morfeusz/morfeusz_generator</executablePath> | |
109 | 109 | </makeTool> |
110 | 110 | </makefileType> |
111 | 111 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
... | ... | @@ -311,7 +311,7 @@ |
311 | 311 | <ccTool> |
312 | 312 | <incDir> |
313 | 313 | <pElem>morfeusz</pElem> |
314 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
314 | + <pElem>/usr/lib/jvm/default-java/include</pElem> | |
315 | 315 | </incDir> |
316 | 316 | <preprocessorList> |
317 | 317 | <Elem>libjmorfeusz_EXPORTS</Elem> |
... | ... |