Commit aece7355ccc6dec55f688a43c5c8bcd911c7b9ca

Authored by Michał Lenart
1 parent d836a116

nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
@@ -5,7 +5,7 @@ Compilation - prerequisites @@ -5,7 +5,7 @@ Compilation - prerequisites
5 5
6 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family). 6 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family).
7 7
8 -sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip 8 +sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing
9 sudo pip install pyinstaller 9 sudo pip install pyinstaller
10 10
11 For cross compiling: 11 For cross compiling:
fsabuilder/CMakeLists.txt
@@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") @@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
55 list (APPEND PACKAGE_DEPENDS package-python-win-installer) 55 list (APPEND PACKAGE_DEPENDS package-python-win-installer)
56 56
57 #~ add_custom_target (buildfsa-exec ALL 57 #~ add_custom_target (buildfsa-exec ALL
58 - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder  
59 -#~ ) 58 + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
  59 + #~ )
60 #~ 60 #~
61 #~ add_executable (morfeusz_builder IMPORTED) 61 #~ add_executable (morfeusz_builder IMPORTED)
62 #~ add_dependencies (morfeusz_builder buildfsa-exec) 62 #~ add_dependencies (morfeusz_builder buildfsa-exec)
fsabuilder/buildanalyzer.sh
1 #!/bin/bash 1 #!/bin/bash
2 2
3 -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1 3 +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
fsabuilder/buildgenerator.sh
1 #!/bin/bash 1 #!/bin/bash
2 2
3 -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \  
4 - --tagset-file=../input/polimorf.tagset \ 3 +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
  4 + --tagset-file=../input/sgjp-morfeusz.tagset \
5 --segments-file=../input/segmenty.dat \ 5 --segments-file=../input/segmenty.dat \
6 --generator \ 6 --generator \
7 --serialization-method=V2 \ 7 --serialization-method=V2 \
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -41,6 +41,11 @@ class EncodedForm4Generator(object): @@ -41,6 +41,11 @@ class EncodedForm4Generator(object):
41 self.cutLength = bestEncodedForm.cutLength 41 self.cutLength = bestEncodedForm.cutLength
42 self.suffixToAdd = bestEncodedForm.suffixToAdd 42 self.suffixToAdd = bestEncodedForm.suffixToAdd
43 self.prefixToAdd = targetWord[:bestPrefixLength] 43 self.prefixToAdd = targetWord[:bestPrefixLength]
  44 +
  45 +# if fromWord == 'BC':
  46 +# print self.cutLength
  47 +# print self.suffixToAdd
  48 +# print self.prefixToAdd, len(self.prefixToAdd)
44 49
45 class Interpretation4Analyzer(object): 50 class Interpretation4Analyzer(object):
46 51
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -6,7 +6,7 @@ Created on Oct 23, 2013 @@ -6,7 +6,7 @@ Created on Oct 23, 2013
6 6
7 import logging 7 import logging
8 import itertools 8 import itertools
9 -from morfeuszbuilder.utils import serializationUtils 9 +from morfeuszbuilder.utils.serializationUtils import *
10 10
11 class Encoder(object): 11 class Encoder(object):
12 ''' 12 '''
@@ -44,19 +44,6 @@ class Encoder(object): @@ -44,19 +44,6 @@ class Encoder(object):
44 assert typenum >= 0 and typenum < 256 44 assert typenum >= 0 and typenum < 256
45 return bytearray([typenum]) 45 return bytearray([typenum])
46 46
47 - def _encodeEncodedForm(self, form, withCasePattern, withPrefix):  
48 - res = bytearray()  
49 - assert form.cutLength < 256 and form.cutLength >= 0  
50 - if withPrefix:  
51 - res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))  
52 - res.append(0)  
53 - res.append(form.cutLength)  
54 - res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))  
55 - res.append(0)  
56 - if withCasePattern:  
57 - res.extend(self._encodeCasePattern(form.casePattern))  
58 - return res  
59 -  
60 def _encodeCasePattern(self, casePattern): 47 def _encodeCasePattern(self, casePattern):
61 res = bytearray() 48 res = bytearray()
62 if True not in casePattern: 49 if True not in casePattern:
@@ -84,7 +71,7 @@ class Encoder(object): @@ -84,7 +71,7 @@ class Encoder(object):
84 n = len(self.qualifiersMap) 71 n = len(self.qualifiersMap)
85 self.qualifiersMap[key] = n 72 self.qualifiersMap[key] = n
86 assert n < 500 73 assert n < 500
87 - res.extend(serializationUtils.htons(n)) 74 + res.extend(htons(n))
88 return res 75 return res
89 76
90 def _hasUpperPrefix(self, casePattern): 77 def _hasUpperPrefix(self, casePattern):
@@ -102,11 +89,9 @@ class Encoder(object): @@ -102,11 +89,9 @@ class Encoder(object):
102 89
103 def _encodeTagNum(self, tagnum): 90 def _encodeTagNum(self, tagnum):
104 res = bytearray() 91 res = bytearray()
105 -# logging.info((tagnum & 0xFF00) >> 8)  
106 assert tagnum < 65536 and tagnum >= 0 92 assert tagnum < 65536 and tagnum >= 0
107 res.append((tagnum & 0xFF00) >> 8) 93 res.append((tagnum & 0xFF00) >> 8)
108 res.append(tagnum & 0x00FF) 94 res.append(tagnum & 0x00FF)
109 -# logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1])))  
110 return res 95 return res
111 96
112 def _encodeNameNum(self, namenum): 97 def _encodeNameNum(self, namenum):
@@ -129,31 +114,37 @@ class Encoder(object): @@ -129,31 +114,37 @@ class Encoder(object):
129 res.append(list(interp.orthCasePattern)) 114 res.append(list(interp.orthCasePattern))
130 return res 115 return res
131 116
132 - def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId): 117 + def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
133 res = bytearray() 118 res = bytearray()
134 res.extend(self._encodeTypeNum(typenum)) 119 res.extend(self._encodeTypeNum(typenum))
135 encodedInterpsList = bytearray() 120 encodedInterpsList = bytearray()
136 - if withCasePattern: 121 + if isAnalyzer:
137 casePatterns = self._getOrthCasePatterns(interpsList) 122 casePatterns = self._getOrthCasePatterns(interpsList)
138 encodedInterpsList.append(len(casePatterns)) 123 encodedInterpsList.append(len(casePatterns))
139 for casePattern in casePatterns: 124 for casePattern in casePatterns:
140 encodedInterpsList.extend(self._encodeCasePattern(casePattern)) 125 encodedInterpsList.extend(self._encodeCasePattern(casePattern))
141 for interp in sorted(interpsList, key=lambda i: i.getSortKey()): 126 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
142 - if withHomonymId:  
143 - encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))  
144 - encodedInterpsList.append(0)  
145 - if withCasePattern: 127 + if isAnalyzer:
146 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) 128 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
147 - encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))  
148 - encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))  
149 - encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) 129 + else:
  130 + serializeString(interp.homonymId, encodedInterpsList)
  131 + serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
  132 + encodedInterpsList.append(interp.encodedForm.cutLength)
  133 + serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
  134 + if isAnalyzer:
  135 + encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
  136 + encodedInterpsList.extend(htons(interp.tagnum))
  137 + encodedInterpsList.append(interp.namenum)
150 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) 138 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
  139 +
  140 + if interp.encodedForm.suffixToAdd == 'bc':
  141 + print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
151 142
152 - res.extend(serializationUtils.htons(len(encodedInterpsList))) 143 + res.extend(htons(len(encodedInterpsList)))
153 res.extend(encodedInterpsList) 144 res.extend(encodedInterpsList)
154 return res 145 return res
155 146
156 - def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId): 147 + def _doEncodeData(self, interpsList, isAnalyzer):
157 148
158 assert type(interpsList) == frozenset 149 assert type(interpsList) == frozenset
159 150
@@ -167,7 +158,7 @@ class Encoder(object): @@ -167,7 +158,7 @@ class Encoder(object):
167 res.append(firstByte) 158 res.append(firstByte)
168 159
169 for typenum, interpsList in segnum2Interps.iteritems(): 160 for typenum, interpsList in segnum2Interps.iteritems():
170 - res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId)) 161 + res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer))
171 del interpsList 162 del interpsList
172 163
173 return res 164 return res
@@ -181,7 +172,7 @@ class MorphEncoder(Encoder): @@ -181,7 +172,7 @@ class MorphEncoder(Encoder):
181 self.LEMMA_MIXED_CASE = 2 172 self.LEMMA_MIXED_CASE = 2
182 173
183 def encodeData(self, interpsList): 174 def encodeData(self, interpsList):
184 - return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False) 175 + return self._doEncodeData(interpsList, isAnalyzer=True)
185 176
186 class Encoder4Generator(Encoder): 177 class Encoder4Generator(Encoder):
187 178
@@ -189,4 +180,4 @@ class Encoder4Generator(Encoder): @@ -189,4 +180,4 @@ class Encoder4Generator(Encoder):
189 super(Encoder4Generator, self).__init__(False, encoding) 180 super(Encoder4Generator, self).__init__(False, encoding)
190 181
191 def encodeData(self, interpsList): 182 def encodeData(self, interpsList):
192 - return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True) 183 + return self._doEncodeData(interpsList, isAnalyzer=False)
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -43,9 +43,6 @@ class FSA(object): @@ -43,9 +43,6 @@ class FSA(object):
43 # debug 43 # debug
44 if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: 44 if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
45 logging.info(u'%d %s' % (self.n, word)) 45 logging.info(u'%d %s' % (self.n, word))
46 -# logging.info(str(self.register.getStatesNum()))  
47 -# logging.info(str(self.register.getStatesNum()))  
48 - # allWords.append(word)  
49 for label in encodedWord: 46 for label in encodedWord:
50 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 47 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
51 48
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -56,6 +56,7 @@ class TagRule(SegmentRule): @@ -56,6 +56,7 @@ class TagRule(SegmentRule):
56 56
57 def __str__(self): 57 def __str__(self):
58 res = self.segtype 58 res = self.segtype
  59 + res += '(' + str(self.segnum) + ')'
59 if self.shiftOrth: 60 if self.shiftOrth:
60 res += '>' 61 res += '>'
61 return res 62 return res
@@ -70,8 +71,8 @@ class TagRule(SegmentRule): @@ -70,8 +71,8 @@ class TagRule(SegmentRule):
70 class UnaryRule(SegmentRule): 71 class UnaryRule(SegmentRule):
71 72
72 def __init__(self, child, linenum): 73 def __init__(self, child, linenum):
  74 + super(UnaryRule, self).__init__(linenum)
73 self.child = child 75 self.child = child
74 - self.linenum = linenum  
75 assert not child.isSinkRule() 76 assert not child.isSinkRule()
76 77
77 def isShiftOrthRule(self): 78 def isShiftOrthRule(self):
@@ -80,8 +81,8 @@ class UnaryRule(SegmentRule): @@ -80,8 +81,8 @@ class UnaryRule(SegmentRule):
80 class ComplexRule(SegmentRule): 81 class ComplexRule(SegmentRule):
81 82
82 def __init__(self, children, linenum): 83 def __init__(self, children, linenum):
  84 + super(ComplexRule, self).__init__(linenum)
83 self.children = children 85 self.children = children
84 - self.linenum = linenum  
85 assert not any(map(lambda c: c.isSinkRule(), children)) 86 assert not any(map(lambda c: c.isSinkRule(), children))
86 87
87 def addToNFA(self, fsa): 88 def addToNFA(self, fsa):
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,4 @@ class RulesFSA(object): @@ -68,6 +68,4 @@ class RulesFSA(object):
68 res.extend(self.stateData2bytearray(state)) 68 res.extend(self.stateData2bytearray(state))
69 res.extend(self.transitionsData2bytearray(state)) 69 res.extend(self.transitionsData2bytearray(state))
70 70
71 -# logging.info('Segmentation automaton size: %d bytes', len(res))  
72 -# print list(res)  
73 return res 71 return res
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -49,6 +49,7 @@ class RulesManager(object): @@ -49,6 +49,7 @@ class RulesManager(object):
49 res.extend(self._serializeDFA(dfa)) 49 res.extend(self._serializeDFA(dfa))
50 res.extend(self._serializeOptionsMap(self.defaultOptions)) 50 res.extend(self._serializeOptionsMap(self.defaultOptions))
51 logging.info('segmentation rules size: %s bytes', len(res)) 51 logging.info('segmentation rules size: %s bytes', len(res))
  52 +# logging.info([int(x) for x in res])
52 return res 53 return res
53 54
54 def _serializeSeparatorsList(self): 55 def _serializeSeparatorsList(self):
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -63,8 +63,8 @@ class RulesParser(object): @@ -63,8 +63,8 @@ class RulesParser(object):
63 nfa = rulesNFA.RulesNFA() 63 nfa = rulesNFA.RulesNFA()
64 if not firstNFA: 64 if not firstNFA:
65 firstNFA = nfa 65 firstNFA = nfa
66 - section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'  
67 - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False) 66 +# section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
  67 + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
68 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) 68 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
69 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): 69 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
70 if rule.allowsEmptySequence(): 70 if rule.allowsEmptySequence():
@@ -72,8 +72,11 @@ class RulesParser(object): @@ -72,8 +72,11 @@ class RulesParser(object):
72 filename, 72 filename,
73 rule.linenum, 73 rule.linenum,
74 'This rule allows empty segments sequence to be accepted') 74 'This rule allows empty segments sequence to be accepted')
75 - rule.addToNFA(nfa)  
76 -# nfa.debug() 75 + if self.rulesType == RulesParser.PARSE4GENERATOR:
  76 + rule = rule.transformToGeneratorVersion()
  77 + if not rule.isSinkRule():
  78 + rule.addToNFA(nfa)
  79 +# nfa.debug()
77 try: 80 try:
78 dfa = nfa.convertToDFA() 81 dfa = nfa.convertToDFA()
79 res.addDFA(key2Def, dfa) 82 res.addDFA(key2Def, dfa)
@@ -146,10 +149,11 @@ class RulesParser(object): @@ -146,10 +149,11 @@ class RulesParser(object):
146 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3 149 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
147 oneOfRule = delimitedList(unaryRule, delim='|') 150 oneOfRule = delimitedList(unaryRule, delim='|')
148 complexRule = unaryRule ^ oneOfRule 151 complexRule = unaryRule ^ oneOfRule
149 - if self.rulesType == RulesParser.PARSE4ANALYZER:  
150 - concatRule = OneOrMore(complexRule)  
151 - else:  
152 - concatRule = ZeroOrMore(shiftOrthRule) + tagRule 152 + concatRule = OneOrMore(complexRule)
  153 +# if self.rulesType == RulesParser.PARSE4ANALYZER:
  154 +# concatRule = OneOrMore(complexRule)
  155 +# else:
  156 +# concatRule = ZeroOrMore(shiftOrthRule) + tagRule
153 rule << concatRule + Optional(CaselessLiteral('!weak')) 157 rule << concatRule + Optional(CaselessLiteral('!weak'))
154 158
155 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) 159 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -22,3 +22,7 @@ def htonl(n): @@ -22,3 +22,7 @@ def htonl(n):
22 res.append((n & 0x0000FF00) >> 8) 22 res.append((n & 0x0000FF00) >> 8)
23 res.append(n & 0x000000FF) 23 res.append(n & 0x000000FF)
24 return res 24 return res
  25 +
  26 +def serializeString(string, out):
  27 + out.extend(string.encode('utf8'))
  28 + out.append(0)
input/segmenty.dat
@@ -682,5 +682,3 @@ pref_dyw e-+:prefs @@ -682,5 +682,3 @@ pref_dyw e-+:prefs
682 682
683 # ; 683 # ;
684 59 684 59
685 -  
686 -[generator combinations]  
morfeusz/CasePatternHelper.hpp
@@ -62,15 +62,6 @@ public: @@ -62,15 +62,6 @@ public:
62 } 62 }
63 } 63 }
64 64
65 - const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {  
66 - const unsigned char* currPtr = ig.ptr;  
67 - unsigned char casePatternsNum = *currPtr++;  
68 - for (unsigned int i = 0; i < casePatternsNum; i++) {  
69 - deserializeOneCasePattern(currPtr);  
70 - }  
71 - return currPtr;  
72 - }  
73 -  
74 std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { 65 std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
75 std::vector<bool> res; 66 std::vector<bool> res;
76 uint8_t casePatternType = *ptr; 67 uint8_t casePatternType = *ptr;
@@ -103,26 +94,6 @@ public: @@ -103,26 +94,6 @@ public:
103 } 94 }
104 return res; 95 return res;
105 } 96 }
106 -  
107 -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {  
108 -// if (this->caseSensitive) {  
109 -// for (unsigned int i = 0; i < chunks.size(); i++) {  
110 -// const InterpretedChunk& ic = chunks[i];  
111 -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr;  
112 -// std::vector<bool> casePattern;  
113 -// deserializeCasePattern(casePatternPtr, casePattern);  
114 -// if (!checkCasePattern(ic, casePattern)) {  
115 -// return false;  
116 -// }  
117 -// }  
118 -// }  
119 -// return true;  
120 -// }  
121 -  
122 -// void skipCasePattern(const unsigned char*& ptr) const {  
123 -// vector<bool> _dupa;  
124 -// deserializeCasePattern(ptr, _dupa);  
125 -// }  
126 private: 97 private:
127 bool caseSensitive; 98 bool caseSensitive;
128 99
morfeusz/InterpretedChunksDecoder.hpp
@@ -40,18 +40,6 @@ public: @@ -40,18 +40,6 @@ public:
40 40
41 protected: 41 protected:
42 42
43 - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {  
44 - EncodedInterpretation interp;  
45 - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);  
46 - deserializeEncodedForm(ptr, interp.value);  
47 - interp.tag = readInt16(ptr);  
48 - interp.nameClassifier = *ptr++;  
49 - interp.qualifiers = readInt16(ptr);  
50 - return interp;  
51 - }  
52 -  
53 - virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0;  
54 -  
55 const Environment& env; 43 const Environment& env;
56 }; 44 };
57 45
@@ -106,6 +94,16 @@ protected: @@ -106,6 +94,16 @@ protected:
106 assert(encodedForm.casePattern.size() == 0); 94 assert(encodedForm.casePattern.size() == 0);
107 encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); 95 encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
108 } 96 }
  97 +
  98 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
  99 + EncodedInterpretation interp;
  100 + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  101 + deserializeEncodedForm(ptr, interp.value);
  102 + interp.tag = readInt16(ptr);
  103 + interp.nameClassifier = *ptr++;
  104 + interp.qualifiers = readInt16(ptr);
  105 + return interp;
  106 + }
109 private: 107 private:
110 108
111 pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { 109 pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
@@ -176,7 +174,7 @@ public: @@ -176,7 +174,7 @@ public:
176 const unsigned char* currPtr = interpretedChunk.interpsPtr; 174 const unsigned char* currPtr = interpretedChunk.interpsPtr;
177 while (currPtr < interpretedChunk.interpsEndPtr) { 175 while (currPtr < interpretedChunk.interpsEndPtr) {
178 MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); 176 MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
179 - // cerr << mi.toString(false) << endl; 177 +// cerr << mi.toString(false) << endl;
180 // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; 178 // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
181 if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) { 179 if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) {
182 out.push_back(mi); 180 out.push_back(mi);
@@ -203,15 +201,12 @@ private: @@ -203,15 +201,12 @@ private:
203 const InterpretedChunk& chunk, 201 const InterpretedChunk& chunk,
204 const unsigned char*& ptr) const { 202 const unsigned char*& ptr) const {
205 string orth = orthPrefix; 203 string orth = orthPrefix;
206 - string homonymId = (const char*) ptr;  
207 - ptr += strlen((const char*) ptr) + 1;  
208 EncodedInterpretation ei = this->deserializeInterp(ptr); 204 EncodedInterpretation ei = this->deserializeInterp(ptr);
209 this->decodeForm(chunk.originalCodepoints, ei.value, orth); 205 this->decodeForm(chunk.originalCodepoints, ei.value, orth);
210 - // string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId);  
211 return MorphInterpretation( 206 return MorphInterpretation(
212 startNode, endNode, 207 startNode, endNode,
213 orth, lemma, 208 orth, lemma,
214 - homonymId, 209 + ei.homonymId,
215 ei.tag, 210 ei.tag,
216 ei.nameClassifier, 211 ei.nameClassifier,
217 ei.qualifiers, 212 ei.qualifiers,
@@ -233,14 +228,17 @@ private: @@ -233,14 +228,17 @@ private:
233 env.getCharsetConverter().append(cp, res); 228 env.getCharsetConverter().append(cp, res);
234 } 229 }
235 } 230 }
236 -  
237 - void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const {  
238 - encodedForm.prefixToAdd = (const char*) ptr;  
239 - ptr += strlen((const char*) ptr) + 1;  
240 - encodedForm.suffixToCut = *ptr;  
241 - ptr++;  
242 - encodedForm.suffixToAdd = (const char*) ptr;  
243 - ptr += strlen((const char*) ptr) + 1; 231 +
  232 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
  233 + EncodedInterpretation interp;
  234 + interp.homonymId = readString(ptr);
  235 + interp.value.prefixToAdd = readString(ptr);
  236 + interp.value.suffixToCut = readInt8(ptr);
  237 + interp.value.suffixToAdd = readString(ptr);
  238 + interp.tag = readInt16(ptr);
  239 + interp.nameClassifier = readInt8(ptr);
  240 + interp.qualifiers = readInt16(ptr);
  241 + return interp;
244 } 242 }
245 }; 243 };
246 244
morfeusz/Morfeusz.cpp
@@ -18,6 +18,7 @@ @@ -18,6 +18,7 @@
18 #include "charset/CaseConverter.hpp" 18 #include "charset/CaseConverter.hpp"
19 #include "segrules/segrules.hpp" 19 #include "segrules/segrules.hpp"
20 #include "const.hpp" 20 #include "const.hpp"
  21 +#include "deserializationUtils.hpp"
21 #include "charset/utf8.h" 22 #include "charset/utf8.h"
22 23
23 // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba 24 // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba
@@ -40,6 +41,20 @@ options(createDefaultOptions()) { @@ -40,6 +41,20 @@ options(createDefaultOptions()) {
40 generatorEnv.setCaseSensitive(false); 41 generatorEnv.setCaseSensitive(false);
41 } 42 }
42 43
  44 +inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
  45 + if (env.getProcessorType() == ANALYZER) {
  46 + const unsigned char* currPtr = ig.ptr;
  47 + unsigned char casePatternsNum = *currPtr++;
  48 + for (unsigned int i = 0; i < casePatternsNum; i++) {
  49 + env.getCasePatternHelper().deserializeOneCasePattern(currPtr);
  50 + }
  51 + return currPtr;
  52 + }
  53 + else {
  54 + return ig.ptr;
  55 + }
  56 +}
  57 +
43 void Morfeusz::setAnalyzerFile(const string& filename) { 58 void Morfeusz::setAnalyzerFile(const string& filename) {
44 this->analyzerEnv.setFSAFile(filename); 59 this->analyzerEnv.setFSAFile(filename);
45 } 60 }
@@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord( @@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord(
183 it != newSegrulesStates.end(); 198 it != newSegrulesStates.end();
184 ++it) { 199 ++it) {
185 SegrulesState newSegrulesState = *it; 200 SegrulesState newSegrulesState = *it;
186 - const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig); 201 + const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
187 const unsigned char* interpsEndPtr = ig.ptr + ig.size; 202 const unsigned char* interpsEndPtr = ig.ptr + ig.size;
188 InterpretedChunk ic = { 203 InterpretedChunk ic = {
189 ig.type, 204 ig.type,
morfeusz/Qualifiers.cpp
@@ -20,7 +20,6 @@ qualifiers() { @@ -20,7 +20,6 @@ qualifiers() {
20 readTags(currPtr, _dupa); 20 readTags(currPtr, _dupa);
21 _dupa.clear(); 21 _dupa.clear();
22 readTags(currPtr, _dupa); 22 readTags(currPtr, _dupa);
23 -  
24 uint16_t allCombinationsSize = readInt16(currPtr); 23 uint16_t allCombinationsSize = readInt16(currPtr);
25 this->qualifiers.reserve(allCombinationsSize); 24 this->qualifiers.reserve(allCombinationsSize);
26 for (unsigned int i = 0; i < allCombinationsSize; i++) { 25 for (unsigned int i = 0; i < allCombinationsSize; i++) {
morfeusz/deserializationUtils.hpp
@@ -11,14 +11,24 @@ @@ -11,14 +11,24 @@
11 #include "endianness.hpp" 11 #include "endianness.hpp"
12 #include <iostream> 12 #include <iostream>
13 13
  14 +inline unsigned char readInt8(const unsigned char*& currPtr) {
  15 + return *currPtr++;
  16 +}
  17 +
14 inline uint16_t readInt16(const unsigned char*& currPtr) { 18 inline uint16_t readInt16(const unsigned char*& currPtr) {
15 - uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr)); 19 + uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr));
16 currPtr += 2; 20 currPtr += 2;
17 return res; 21 return res;
18 } 22 }
19 23
  24 +inline uint32_t readInt32(const unsigned char*& currPtr) {
  25 + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
  26 + currPtr += 4;
  27 + return res;
  28 +}
  29 +
20 inline std::string readString(const unsigned char*& currPtr) { 30 inline std::string readString(const unsigned char*& currPtr) {
21 - std::string res(reinterpret_cast<const char*>(currPtr)); 31 + std::string res((const char*) currPtr);
22 currPtr += res.length(); 32 currPtr += res.length();
23 currPtr++; 33 currPtr++;
24 return res; 34 return res;
morfeusz/segrules/SegrulesFSA.hpp
@@ -9,7 +9,8 @@ @@ -9,7 +9,8 @@
9 #define SEGRULESFSA_HPP 9 #define SEGRULESFSA_HPP
10 10
11 #include <set> 11 #include <set>
12 -#include "../endianness.hpp" 12 +#include <iostream>
  13 +#include "../deserializationUtils.hpp"
13 14
14 struct SegrulesState { 15 struct SegrulesState {
15 uint16_t offset; 16 uint16_t offset;
@@ -37,8 +38,7 @@ public: @@ -37,8 +38,7 @@ public:
37 38
38 const unsigned char* currPtr = ptr + state.offset; 39 const unsigned char* currPtr = ptr + state.offset;
39 currPtr++; 40 currPtr++;
40 - const unsigned char transitionsNum = *currPtr;  
41 - currPtr++; 41 + const unsigned char transitionsNum = *currPtr++;
42 for (unsigned int i = 0; i < transitionsNum; i++) { 42 for (unsigned int i = 0; i < transitionsNum; i++) {
43 if (*currPtr == segnum) { 43 if (*currPtr == segnum) {
44 newStates.insert(newStates.begin(), this->transition2State(currPtr)); 44 newStates.insert(newStates.begin(), this->transition2State(currPtr));
@@ -58,9 +58,8 @@ private: @@ -58,9 +58,8 @@ private:
58 unsigned char WEAK_FLAG = 2; 58 unsigned char WEAK_FLAG = 2;
59 SegrulesState res; 59 SegrulesState res;
60 transitionPtr++; 60 transitionPtr++;
61 - res.shiftOrthFromPrevious = *transitionPtr;  
62 - transitionPtr++;  
63 - res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); 61 + res.shiftOrthFromPrevious = *transitionPtr++;
  62 + res.offset = readInt16(transitionPtr);
64 res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; 63 res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
65 res.weak = *(ptr + res.offset) & WEAK_FLAG; 64 res.weak = *(ptr + res.offset) & WEAK_FLAG;
66 return res; 65 return res;
morfeusz/segrules/segrules.cpp
@@ -2,25 +2,12 @@ @@ -2,25 +2,12 @@
2 #include "segrules.hpp" 2 #include "segrules.hpp"
3 #include "../fsa/fsa.hpp" 3 #include "../fsa/fsa.hpp"
4 #include "../fsa/const.hpp" 4 #include "../fsa/const.hpp"
  5 +#include "../deserializationUtils.hpp"
5 6
6 using namespace std; 7 using namespace std;
7 8
8 -static inline uint32_t deserializeUint32(const unsigned char*& ptr) {  
9 - uint32_t res = *reinterpret_cast<const uint32_t*>(ptr);  
10 - res = htonl(res);  
11 - ptr += 4;  
12 - return res;  
13 -}  
14 -  
15 -static inline string deserializeString(const unsigned char*& ptr) {  
16 - string res(reinterpret_cast<const char*>(ptr));  
17 - ptr += res.length() + 1;  
18 - return res;  
19 -}  
20 -  
21 static inline void skipSeparatorsList(const unsigned char*& ptr) { 9 static inline void skipSeparatorsList(const unsigned char*& ptr) {
22 - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));  
23 - ptr += 2; 10 + uint16_t listSize = readInt16(ptr);
24 ptr += 4 * listSize; 11 ptr += 4 * listSize;
25 } 12 }
26 13
@@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr @@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr
28 const unsigned char* additionalDataPtr = ptr 15 const unsigned char* additionalDataPtr = ptr
29 + FSA_DATA_OFFSET 16 + FSA_DATA_OFFSET
30 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); 17 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
31 - const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; 18 + const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
32 return res; 19 return res;
33 } 20 }
34 21
@@ -47,14 +47,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { @@ -47,14 +47,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
47 unsigned char optsNum = *ptr; 34 unsigned char optsNum = *ptr;
48 ptr++; 35 ptr++;
49 for (unsigned char i = 0; i < optsNum; i++) { 36 for (unsigned char i = 0; i < optsNum; i++) {
50 - string key = deserializeString(ptr);  
51 - res[key] = deserializeString(ptr); 37 + string key = readString(ptr);
  38 + res[key] = readString(ptr);
52 } 39 }
53 return res; 40 return res;
54 } 41 }
55 42
56 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { 43 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
57 - uint32_t fsaSize = deserializeUint32(ptr); 44 + uint32_t fsaSize = readInt32(ptr);
58 // static SegrulesDeserializer deserializer; 45 // static SegrulesDeserializer deserializer;
59 SegrulesFSA* res = new SegrulesFSA(ptr); 46 SegrulesFSA* res = new SegrulesFSA(ptr);
60 ptr += fsaSize; 47 ptr += fsaSize;
nbproject/configurations.xml
@@ -105,7 +105,7 @@ @@ -105,7 +105,7 @@
105 <buildCommandWorkingDir>build</buildCommandWorkingDir> 105 <buildCommandWorkingDir>build</buildCommandWorkingDir>
106 <buildCommand>${MAKE} -f Makefile</buildCommand> 106 <buildCommand>${MAKE} -f Makefile</buildCommand>
107 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> 107 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
108 - <executablePath>build/morfeusz/morfeusz_analyzer</executablePath> 108 + <executablePath>build/morfeusz/morfeusz_generator</executablePath>
109 </makeTool> 109 </makeTool>
110 </makefileType> 110 </makefileType>
111 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> 111 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
@@ -311,7 +311,7 @@ @@ -311,7 +311,7 @@
311 <ccTool> 311 <ccTool>
312 <incDir> 312 <incDir>
313 <pElem>morfeusz</pElem> 313 <pElem>morfeusz</pElem>
314 - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> 314 + <pElem>/usr/lib/jvm/default-java/include</pElem>
315 </incDir> 315 </incDir>
316 <preprocessorList> 316 <preprocessorList>
317 <Elem>libjmorfeusz_EXPORTS</Elem> 317 <Elem>libjmorfeusz_EXPORTS</Elem>