Commit aece7355ccc6dec55f688a43c5c8bcd911c7b9ca

Authored by Michał Lenart
1 parent d836a116

nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
... ... @@ -5,7 +5,7 @@ Compilation - prerequisites
5 5  
6 6 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family).
7 7  
8   -sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip
  8 +sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing
9 9 sudo pip install pyinstaller
10 10  
11 11 For cross compiling:
... ...
fsabuilder/CMakeLists.txt
... ... @@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
55 55 list (APPEND PACKAGE_DEPENDS package-python-win-installer)
56 56  
57 57 #~ add_custom_target (buildfsa-exec ALL
58   - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
59   -#~ )
  58 + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
  59 + #~ )
60 60 #~
61 61 #~ add_executable (morfeusz_builder IMPORTED)
62 62 #~ add_dependencies (morfeusz_builder buildfsa-exec)
... ...
fsabuilder/buildanalyzer.sh
1 1 #!/bin/bash
2 2  
3   -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
  3 +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
... ...
fsabuilder/buildgenerator.sh
1 1 #!/bin/bash
2 2  
3   -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
4   - --tagset-file=../input/polimorf.tagset \
  3 +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
  4 + --tagset-file=../input/sgjp-morfeusz.tagset \
5 5 --segments-file=../input/segmenty.dat \
6 6 --generator \
7 7 --serialization-method=V2 \
... ...
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -41,6 +41,11 @@ class EncodedForm4Generator(object):
41 41 self.cutLength = bestEncodedForm.cutLength
42 42 self.suffixToAdd = bestEncodedForm.suffixToAdd
43 43 self.prefixToAdd = targetWord[:bestPrefixLength]
  44 +
  45 +# if fromWord == 'BC':
  46 +# print self.cutLength
  47 +# print self.suffixToAdd
  48 +# print self.prefixToAdd, len(self.prefixToAdd)
44 49  
45 50 class Interpretation4Analyzer(object):
46 51  
... ...
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -6,7 +6,7 @@ Created on Oct 23, 2013
6 6  
7 7 import logging
8 8 import itertools
9   -from morfeuszbuilder.utils import serializationUtils
  9 +from morfeuszbuilder.utils.serializationUtils import *
10 10  
11 11 class Encoder(object):
12 12 '''
... ... @@ -44,19 +44,6 @@ class Encoder(object):
44 44 assert typenum >= 0 and typenum < 256
45 45 return bytearray([typenum])
46 46  
47   - def _encodeEncodedForm(self, form, withCasePattern, withPrefix):
48   - res = bytearray()
49   - assert form.cutLength < 256 and form.cutLength >= 0
50   - if withPrefix:
51   - res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))
52   - res.append(0)
53   - res.append(form.cutLength)
54   - res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))
55   - res.append(0)
56   - if withCasePattern:
57   - res.extend(self._encodeCasePattern(form.casePattern))
58   - return res
59   -
60 47 def _encodeCasePattern(self, casePattern):
61 48 res = bytearray()
62 49 if True not in casePattern:
... ... @@ -84,7 +71,7 @@ class Encoder(object):
84 71 n = len(self.qualifiersMap)
85 72 self.qualifiersMap[key] = n
86 73 assert n < 500
87   - res.extend(serializationUtils.htons(n))
  74 + res.extend(htons(n))
88 75 return res
89 76  
90 77 def _hasUpperPrefix(self, casePattern):
... ... @@ -102,11 +89,9 @@ class Encoder(object):
102 89  
103 90 def _encodeTagNum(self, tagnum):
104 91 res = bytearray()
105   -# logging.info((tagnum & 0xFF00) >> 8)
106 92 assert tagnum < 65536 and tagnum >= 0
107 93 res.append((tagnum & 0xFF00) >> 8)
108 94 res.append(tagnum & 0x00FF)
109   -# logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1])))
110 95 return res
111 96  
112 97 def _encodeNameNum(self, namenum):
... ... @@ -129,31 +114,37 @@ class Encoder(object):
129 114 res.append(list(interp.orthCasePattern))
130 115 return res
131 116  
132   - def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId):
  117 + def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
133 118 res = bytearray()
134 119 res.extend(self._encodeTypeNum(typenum))
135 120 encodedInterpsList = bytearray()
136   - if withCasePattern:
  121 + if isAnalyzer:
137 122 casePatterns = self._getOrthCasePatterns(interpsList)
138 123 encodedInterpsList.append(len(casePatterns))
139 124 for casePattern in casePatterns:
140 125 encodedInterpsList.extend(self._encodeCasePattern(casePattern))
141 126 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
142   - if withHomonymId:
143   - encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))
144   - encodedInterpsList.append(0)
145   - if withCasePattern:
  127 + if isAnalyzer:
146 128 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
147   - encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
148   - encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
149   - encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  129 + else:
  130 + serializeString(interp.homonymId, encodedInterpsList)
  131 + serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
  132 + encodedInterpsList.append(interp.encodedForm.cutLength)
  133 + serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
  134 + if isAnalyzer:
  135 + encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
  136 + encodedInterpsList.extend(htons(interp.tagnum))
  137 + encodedInterpsList.append(interp.namenum)
150 138 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
  139 +
  140 + if interp.encodedForm.suffixToAdd == 'bc':
  141 + print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
151 142  
152   - res.extend(serializationUtils.htons(len(encodedInterpsList)))
  143 + res.extend(htons(len(encodedInterpsList)))
153 144 res.extend(encodedInterpsList)
154 145 return res
155 146  
156   - def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId):
  147 + def _doEncodeData(self, interpsList, isAnalyzer):
157 148  
158 149 assert type(interpsList) == frozenset
159 150  
... ... @@ -167,7 +158,7 @@ class Encoder(object):
167 158 res.append(firstByte)
168 159  
169 160 for typenum, interpsList in segnum2Interps.iteritems():
170   - res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId))
  161 + res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer))
171 162 del interpsList
172 163  
173 164 return res
... ... @@ -181,7 +172,7 @@ class MorphEncoder(Encoder):
181 172 self.LEMMA_MIXED_CASE = 2
182 173  
183 174 def encodeData(self, interpsList):
184   - return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False)
  175 + return self._doEncodeData(interpsList, isAnalyzer=True)
185 176  
186 177 class Encoder4Generator(Encoder):
187 178  
... ... @@ -189,4 +180,4 @@ class Encoder4Generator(Encoder):
189 180 super(Encoder4Generator, self).__init__(False, encoding)
190 181  
191 182 def encodeData(self, interpsList):
192   - return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True)
  183 + return self._doEncodeData(interpsList, isAnalyzer=False)
... ...
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -43,9 +43,6 @@ class FSA(object):
43 43 # debug
44 44 if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
45 45 logging.info(u'%d %s' % (self.n, word))
46   -# logging.info(str(self.register.getStatesNum()))
47   -# logging.info(str(self.register.getStatesNum()))
48   - # allWords.append(word)
49 46 for label in encodedWord:
50 47 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
51 48  
... ...
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -56,6 +56,7 @@ class TagRule(SegmentRule):
56 56  
57 57 def __str__(self):
58 58 res = self.segtype
  59 + res += '(' + str(self.segnum) + ')'
59 60 if self.shiftOrth:
60 61 res += '>'
61 62 return res
... ... @@ -70,8 +71,8 @@ class TagRule(SegmentRule):
70 71 class UnaryRule(SegmentRule):
71 72  
72 73 def __init__(self, child, linenum):
  74 + super(UnaryRule, self).__init__(linenum)
73 75 self.child = child
74   - self.linenum = linenum
75 76 assert not child.isSinkRule()
76 77  
77 78 def isShiftOrthRule(self):
... ... @@ -80,8 +81,8 @@ class UnaryRule(SegmentRule):
80 81 class ComplexRule(SegmentRule):
81 82  
82 83 def __init__(self, children, linenum):
  84 + super(ComplexRule, self).__init__(linenum)
83 85 self.children = children
84   - self.linenum = linenum
85 86 assert not any(map(lambda c: c.isSinkRule(), children))
86 87  
87 88 def addToNFA(self, fsa):
... ...
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
... ... @@ -68,6 +68,4 @@ class RulesFSA(object):
68 68 res.extend(self.stateData2bytearray(state))
69 69 res.extend(self.transitionsData2bytearray(state))
70 70  
71   -# logging.info('Segmentation automaton size: %d bytes', len(res))
72   -# print list(res)
73 71 return res
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -49,6 +49,7 @@ class RulesManager(object):
49 49 res.extend(self._serializeDFA(dfa))
50 50 res.extend(self._serializeOptionsMap(self.defaultOptions))
51 51 logging.info('segmentation rules size: %s bytes', len(res))
  52 +# logging.info([int(x) for x in res])
52 53 return res
53 54  
54 55 def _serializeSeparatorsList(self):
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -63,8 +63,8 @@ class RulesParser(object):
63 63 nfa = rulesNFA.RulesNFA()
64 64 if not firstNFA:
65 65 firstNFA = nfa
66   - section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
67   - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False)
  66 +# section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
  67 + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
68 68 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
69 69 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
70 70 if rule.allowsEmptySequence():
... ... @@ -72,8 +72,11 @@ class RulesParser(object):
72 72 filename,
73 73 rule.linenum,
74 74 'This rule allows empty segments sequence to be accepted')
75   - rule.addToNFA(nfa)
76   -# nfa.debug()
  75 + if self.rulesType == RulesParser.PARSE4GENERATOR:
  76 + rule = rule.transformToGeneratorVersion()
  77 + if not rule.isSinkRule():
  78 + rule.addToNFA(nfa)
  79 +# nfa.debug()
77 80 try:
78 81 dfa = nfa.convertToDFA()
79 82 res.addDFA(key2Def, dfa)
... ... @@ -146,10 +149,11 @@ class RulesParser(object):
146 149 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
147 150 oneOfRule = delimitedList(unaryRule, delim='|')
148 151 complexRule = unaryRule ^ oneOfRule
149   - if self.rulesType == RulesParser.PARSE4ANALYZER:
150   - concatRule = OneOrMore(complexRule)
151   - else:
152   - concatRule = ZeroOrMore(shiftOrthRule) + tagRule
  152 + concatRule = OneOrMore(complexRule)
  153 +# if self.rulesType == RulesParser.PARSE4ANALYZER:
  154 +# concatRule = OneOrMore(complexRule)
  155 +# else:
  156 +# concatRule = ZeroOrMore(shiftOrthRule) + tagRule
153 157 rule << concatRule + Optional(CaselessLiteral('!weak'))
154 158  
155 159 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
... ...
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
... ... @@ -22,3 +22,7 @@ def htonl(n):
22 22 res.append((n & 0x0000FF00) >> 8)
23 23 res.append(n & 0x000000FF)
24 24 return res
  25 +
  26 +def serializeString(string, out):
  27 + out.extend(string.encode('utf8'))
  28 + out.append(0)
... ...
input/segmenty.dat
... ... @@ -682,5 +682,3 @@ pref_dyw e-+:prefs
682 682  
683 683 # ;
684 684 59
685   -
686   -[generator combinations]
... ...
morfeusz/CasePatternHelper.hpp
... ... @@ -62,15 +62,6 @@ public:
62 62 }
63 63 }
64 64  
65   - const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {
66   - const unsigned char* currPtr = ig.ptr;
67   - unsigned char casePatternsNum = *currPtr++;
68   - for (unsigned int i = 0; i < casePatternsNum; i++) {
69   - deserializeOneCasePattern(currPtr);
70   - }
71   - return currPtr;
72   - }
73   -
74 65 std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
75 66 std::vector<bool> res;
76 67 uint8_t casePatternType = *ptr;
... ... @@ -103,26 +94,6 @@ public:
103 94 }
104 95 return res;
105 96 }
106   -
107   -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
108   -// if (this->caseSensitive) {
109   -// for (unsigned int i = 0; i < chunks.size(); i++) {
110   -// const InterpretedChunk& ic = chunks[i];
111   -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
112   -// std::vector<bool> casePattern;
113   -// deserializeCasePattern(casePatternPtr, casePattern);
114   -// if (!checkCasePattern(ic, casePattern)) {
115   -// return false;
116   -// }
117   -// }
118   -// }
119   -// return true;
120   -// }
121   -
122   -// void skipCasePattern(const unsigned char*& ptr) const {
123   -// vector<bool> _dupa;
124   -// deserializeCasePattern(ptr, _dupa);
125   -// }
126 97 private:
127 98 bool caseSensitive;
128 99  
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -40,18 +40,6 @@ public:
40 40  
41 41 protected:
42 42  
43   - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
44   - EncodedInterpretation interp;
45   - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
46   - deserializeEncodedForm(ptr, interp.value);
47   - interp.tag = readInt16(ptr);
48   - interp.nameClassifier = *ptr++;
49   - interp.qualifiers = readInt16(ptr);
50   - return interp;
51   - }
52   -
53   - virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0;
54   -
55 43 const Environment& env;
56 44 };
57 45  
... ... @@ -106,6 +94,16 @@ protected:
106 94 assert(encodedForm.casePattern.size() == 0);
107 95 encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
108 96 }
  97 +
  98 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
  99 + EncodedInterpretation interp;
  100 + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  101 + deserializeEncodedForm(ptr, interp.value);
  102 + interp.tag = readInt16(ptr);
  103 + interp.nameClassifier = *ptr++;
  104 + interp.qualifiers = readInt16(ptr);
  105 + return interp;
  106 + }
109 107 private:
110 108  
111 109 pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
... ... @@ -176,7 +174,7 @@ public:
176 174 const unsigned char* currPtr = interpretedChunk.interpsPtr;
177 175 while (currPtr < interpretedChunk.interpsEndPtr) {
178 176 MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
179   - // cerr << mi.toString(false) << endl;
  177 +// cerr << mi.toString(false) << endl;
180 178 // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
181 179 if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) {
182 180 out.push_back(mi);
... ... @@ -203,15 +201,12 @@ private:
203 201 const InterpretedChunk& chunk,
204 202 const unsigned char*& ptr) const {
205 203 string orth = orthPrefix;
206   - string homonymId = (const char*) ptr;
207   - ptr += strlen((const char*) ptr) + 1;
208 204 EncodedInterpretation ei = this->deserializeInterp(ptr);
209 205 this->decodeForm(chunk.originalCodepoints, ei.value, orth);
210   - // string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId);
211 206 return MorphInterpretation(
212 207 startNode, endNode,
213 208 orth, lemma,
214   - homonymId,
  209 + ei.homonymId,
215 210 ei.tag,
216 211 ei.nameClassifier,
217 212 ei.qualifiers,
... ... @@ -233,14 +228,17 @@ private:
233 228 env.getCharsetConverter().append(cp, res);
234 229 }
235 230 }
236   -
237   - void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const {
238   - encodedForm.prefixToAdd = (const char*) ptr;
239   - ptr += strlen((const char*) ptr) + 1;
240   - encodedForm.suffixToCut = *ptr;
241   - ptr++;
242   - encodedForm.suffixToAdd = (const char*) ptr;
243   - ptr += strlen((const char*) ptr) + 1;
  231 +
  232 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
  233 + EncodedInterpretation interp;
  234 + interp.homonymId = readString(ptr);
  235 + interp.value.prefixToAdd = readString(ptr);
  236 + interp.value.suffixToCut = readInt8(ptr);
  237 + interp.value.suffixToAdd = readString(ptr);
  238 + interp.tag = readInt16(ptr);
  239 + interp.nameClassifier = readInt8(ptr);
  240 + interp.qualifiers = readInt16(ptr);
  241 + return interp;
244 242 }
245 243 };
246 244  
... ...
morfeusz/Morfeusz.cpp
... ... @@ -18,6 +18,7 @@
18 18 #include "charset/CaseConverter.hpp"
19 19 #include "segrules/segrules.hpp"
20 20 #include "const.hpp"
  21 +#include "deserializationUtils.hpp"
21 22 #include "charset/utf8.h"
22 23  
23 24 // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba
... ... @@ -40,6 +41,20 @@ options(createDefaultOptions()) {
40 41 generatorEnv.setCaseSensitive(false);
41 42 }
42 43  
  44 +inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
  45 + if (env.getProcessorType() == ANALYZER) {
  46 + const unsigned char* currPtr = ig.ptr;
  47 + unsigned char casePatternsNum = *currPtr++;
  48 + for (unsigned int i = 0; i < casePatternsNum; i++) {
  49 + env.getCasePatternHelper().deserializeOneCasePattern(currPtr);
  50 + }
  51 + return currPtr;
  52 + }
  53 + else {
  54 + return ig.ptr;
  55 + }
  56 +}
  57 +
43 58 void Morfeusz::setAnalyzerFile(const string& filename) {
44 59 this->analyzerEnv.setFSAFile(filename);
45 60 }
... ... @@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord(
183 198 it != newSegrulesStates.end();
184 199 ++it) {
185 200 SegrulesState newSegrulesState = *it;
186   - const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig);
  201 + const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
187 202 const unsigned char* interpsEndPtr = ig.ptr + ig.size;
188 203 InterpretedChunk ic = {
189 204 ig.type,
... ...
morfeusz/Qualifiers.cpp
... ... @@ -20,7 +20,6 @@ qualifiers() {
20 20 readTags(currPtr, _dupa);
21 21 _dupa.clear();
22 22 readTags(currPtr, _dupa);
23   -
24 23 uint16_t allCombinationsSize = readInt16(currPtr);
25 24 this->qualifiers.reserve(allCombinationsSize);
26 25 for (unsigned int i = 0; i < allCombinationsSize; i++) {
... ...
morfeusz/deserializationUtils.hpp
... ... @@ -11,14 +11,24 @@
11 11 #include "endianness.hpp"
12 12 #include <iostream>
13 13  
  14 +inline unsigned char readInt8(const unsigned char*& currPtr) {
  15 + return *currPtr++;
  16 +}
  17 +
14 18 inline uint16_t readInt16(const unsigned char*& currPtr) {
15   - uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr));
  19 + uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr));
16 20 currPtr += 2;
17 21 return res;
18 22 }
19 23  
  24 +inline uint32_t readInt32(const unsigned char*& currPtr) {
  25 + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
  26 + currPtr += 4;
  27 + return res;
  28 +}
  29 +
20 30 inline std::string readString(const unsigned char*& currPtr) {
21   - std::string res(reinterpret_cast<const char*>(currPtr));
  31 + std::string res((const char*) currPtr);
22 32 currPtr += res.length();
23 33 currPtr++;
24 34 return res;
... ...
morfeusz/segrules/SegrulesFSA.hpp
... ... @@ -9,7 +9,8 @@
9 9 #define SEGRULESFSA_HPP
10 10  
11 11 #include <set>
12   -#include "../endianness.hpp"
  12 +#include <iostream>
  13 +#include "../deserializationUtils.hpp"
13 14  
14 15 struct SegrulesState {
15 16 uint16_t offset;
... ... @@ -37,8 +38,7 @@ public:
37 38  
38 39 const unsigned char* currPtr = ptr + state.offset;
39 40 currPtr++;
40   - const unsigned char transitionsNum = *currPtr;
41   - currPtr++;
  41 + const unsigned char transitionsNum = *currPtr++;
42 42 for (unsigned int i = 0; i < transitionsNum; i++) {
43 43 if (*currPtr == segnum) {
44 44 newStates.insert(newStates.begin(), this->transition2State(currPtr));
... ... @@ -58,9 +58,8 @@ private:
58 58 unsigned char WEAK_FLAG = 2;
59 59 SegrulesState res;
60 60 transitionPtr++;
61   - res.shiftOrthFromPrevious = *transitionPtr;
62   - transitionPtr++;
63   - res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
  61 + res.shiftOrthFromPrevious = *transitionPtr++;
  62 + res.offset = readInt16(transitionPtr);
64 63 res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
65 64 res.weak = *(ptr + res.offset) & WEAK_FLAG;
66 65 return res;
... ...
morfeusz/segrules/segrules.cpp
... ... @@ -2,25 +2,12 @@
2 2 #include "segrules.hpp"
3 3 #include "../fsa/fsa.hpp"
4 4 #include "../fsa/const.hpp"
  5 +#include "../deserializationUtils.hpp"
5 6  
6 7 using namespace std;
7 8  
8   -static inline uint32_t deserializeUint32(const unsigned char*& ptr) {
9   - uint32_t res = *reinterpret_cast<const uint32_t*>(ptr);
10   - res = htonl(res);
11   - ptr += 4;
12   - return res;
13   -}
14   -
15   -static inline string deserializeString(const unsigned char*& ptr) {
16   - string res(reinterpret_cast<const char*>(ptr));
17   - ptr += res.length() + 1;
18   - return res;
19   -}
20   -
21 9 static inline void skipSeparatorsList(const unsigned char*& ptr) {
22   - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
23   - ptr += 2;
  10 + uint16_t listSize = readInt16(ptr);
24 11 ptr += 4 * listSize;
25 12 }
26 13  
... ... @@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr
28 15 const unsigned char* additionalDataPtr = ptr
29 16 + FSA_DATA_OFFSET
30 17 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
31   - const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
  18 + const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
32 19 return res;
33 20 }
34 21  
... ... @@ -47,14 +47,34 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
47 34 unsigned char optsNum = *ptr;
48 35 ptr++;
49 36 for (unsigned char i = 0; i < optsNum; i++) {
50   - string key = deserializeString(ptr);
51   - res[key] = deserializeString(ptr);
  37 + string key = readString(ptr);
  38 + res[key] = readString(ptr);
52 39 }
53 40 return res;
54 41 }
55 42  
56 43 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
57   - uint32_t fsaSize = deserializeUint32(ptr);
  44 + uint32_t fsaSize = readInt32(ptr);
58 45 // static SegrulesDeserializer deserializer;
59 46 SegrulesFSA* res = new SegrulesFSA(ptr);
60 47 ptr += fsaSize;
... ...
nbproject/configurations.xml
... ... @@ -105,7 +105,7 @@
105 105 <buildCommandWorkingDir>build</buildCommandWorkingDir>
106 106 <buildCommand>${MAKE} -f Makefile</buildCommand>
107 107 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
108   - <executablePath>build/morfeusz/morfeusz_analyzer</executablePath>
  108 + <executablePath>build/morfeusz/morfeusz_generator</executablePath>
109 109 </makeTool>
110 110 </makefileType>
111 111 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
... ... @@ -311,7 +311,7 @@
311 311 <ccTool>
312 312 <incDir>
313 313 <pElem>morfeusz</pElem>
314   - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
  314 + <pElem>/usr/lib/jvm/default-java/include</pElem>
315 315 </incDir>
316 316 <preprocessorList>
317 317 <Elem>libjmorfeusz_EXPORTS</Elem>
... ...