Commit aece7355ccc6dec55f688a43c5c8bcd911c7b9ca
1 parent
d836a116
nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
21 changed files
with
117 additions
and
139 deletions
README
@@ -5,7 +5,7 @@ Compilation - prerequisites | @@ -5,7 +5,7 @@ Compilation - prerequisites | ||
5 | 5 | ||
6 | This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family). | 6 | This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family). |
7 | 7 | ||
8 | -sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip | 8 | +sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing |
9 | sudo pip install pyinstaller | 9 | sudo pip install pyinstaller |
10 | 10 | ||
11 | For cross compiling: | 11 | For cross compiling: |
fsabuilder/CMakeLists.txt
@@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") | @@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") | ||
55 | list (APPEND PACKAGE_DEPENDS package-python-win-installer) | 55 | list (APPEND PACKAGE_DEPENDS package-python-win-installer) |
56 | 56 | ||
57 | #~ add_custom_target (buildfsa-exec ALL | 57 | #~ add_custom_target (buildfsa-exec ALL |
58 | - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder | ||
59 | -#~ ) | 58 | + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder |
59 | + #~ ) | ||
60 | #~ | 60 | #~ |
61 | #~ add_executable (morfeusz_builder IMPORTED) | 61 | #~ add_executable (morfeusz_builder IMPORTED) |
62 | #~ add_dependencies (morfeusz_builder buildfsa-exec) | 62 | #~ add_dependencies (morfeusz_builder buildfsa-exec) |
fsabuilder/buildanalyzer.sh
1 | #!/bin/bash | 1 | #!/bin/bash |
2 | 2 | ||
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1 | 3 | +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1 |
fsabuilder/buildgenerator.sh
1 | #!/bin/bash | 1 | #!/bin/bash |
2 | 2 | ||
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | ||
4 | - --tagset-file=../input/polimorf.tagset \ | 3 | +python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ |
4 | + --tagset-file=../input/sgjp-morfeusz.tagset \ | ||
5 | --segments-file=../input/segmenty.dat \ | 5 | --segments-file=../input/segmenty.dat \ |
6 | --generator \ | 6 | --generator \ |
7 | --serialization-method=V2 \ | 7 | --serialization-method=V2 \ |
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -41,6 +41,11 @@ class EncodedForm4Generator(object): | @@ -41,6 +41,11 @@ class EncodedForm4Generator(object): | ||
41 | self.cutLength = bestEncodedForm.cutLength | 41 | self.cutLength = bestEncodedForm.cutLength |
42 | self.suffixToAdd = bestEncodedForm.suffixToAdd | 42 | self.suffixToAdd = bestEncodedForm.suffixToAdd |
43 | self.prefixToAdd = targetWord[:bestPrefixLength] | 43 | self.prefixToAdd = targetWord[:bestPrefixLength] |
44 | + | ||
45 | +# if fromWord == 'BC': | ||
46 | +# print self.cutLength | ||
47 | +# print self.suffixToAdd | ||
48 | +# print self.prefixToAdd, len(self.prefixToAdd) | ||
44 | 49 | ||
45 | class Interpretation4Analyzer(object): | 50 | class Interpretation4Analyzer(object): |
46 | 51 |
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -6,7 +6,7 @@ Created on Oct 23, 2013 | @@ -6,7 +6,7 @@ Created on Oct 23, 2013 | ||
6 | 6 | ||
7 | import logging | 7 | import logging |
8 | import itertools | 8 | import itertools |
9 | -from morfeuszbuilder.utils import serializationUtils | 9 | +from morfeuszbuilder.utils.serializationUtils import * |
10 | 10 | ||
11 | class Encoder(object): | 11 | class Encoder(object): |
12 | ''' | 12 | ''' |
@@ -44,19 +44,6 @@ class Encoder(object): | @@ -44,19 +44,6 @@ class Encoder(object): | ||
44 | assert typenum >= 0 and typenum < 256 | 44 | assert typenum >= 0 and typenum < 256 |
45 | return bytearray([typenum]) | 45 | return bytearray([typenum]) |
46 | 46 | ||
47 | - def _encodeEncodedForm(self, form, withCasePattern, withPrefix): | ||
48 | - res = bytearray() | ||
49 | - assert form.cutLength < 256 and form.cutLength >= 0 | ||
50 | - if withPrefix: | ||
51 | - res.extend(self.encodeWord(form.prefixToAdd, lowercase=False)) | ||
52 | - res.append(0) | ||
53 | - res.append(form.cutLength) | ||
54 | - res.extend(self.encodeWord(form.suffixToAdd, lowercase=False)) | ||
55 | - res.append(0) | ||
56 | - if withCasePattern: | ||
57 | - res.extend(self._encodeCasePattern(form.casePattern)) | ||
58 | - return res | ||
59 | - | ||
60 | def _encodeCasePattern(self, casePattern): | 47 | def _encodeCasePattern(self, casePattern): |
61 | res = bytearray() | 48 | res = bytearray() |
62 | if True not in casePattern: | 49 | if True not in casePattern: |
@@ -84,7 +71,7 @@ class Encoder(object): | @@ -84,7 +71,7 @@ class Encoder(object): | ||
84 | n = len(self.qualifiersMap) | 71 | n = len(self.qualifiersMap) |
85 | self.qualifiersMap[key] = n | 72 | self.qualifiersMap[key] = n |
86 | assert n < 500 | 73 | assert n < 500 |
87 | - res.extend(serializationUtils.htons(n)) | 74 | + res.extend(htons(n)) |
88 | return res | 75 | return res |
89 | 76 | ||
90 | def _hasUpperPrefix(self, casePattern): | 77 | def _hasUpperPrefix(self, casePattern): |
@@ -102,11 +89,9 @@ class Encoder(object): | @@ -102,11 +89,9 @@ class Encoder(object): | ||
102 | 89 | ||
103 | def _encodeTagNum(self, tagnum): | 90 | def _encodeTagNum(self, tagnum): |
104 | res = bytearray() | 91 | res = bytearray() |
105 | -# logging.info((tagnum & 0xFF00) >> 8) | ||
106 | assert tagnum < 65536 and tagnum >= 0 | 92 | assert tagnum < 65536 and tagnum >= 0 |
107 | res.append((tagnum & 0xFF00) >> 8) | 93 | res.append((tagnum & 0xFF00) >> 8) |
108 | res.append(tagnum & 0x00FF) | 94 | res.append(tagnum & 0x00FF) |
109 | -# logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1]))) | ||
110 | return res | 95 | return res |
111 | 96 | ||
112 | def _encodeNameNum(self, namenum): | 97 | def _encodeNameNum(self, namenum): |
@@ -129,31 +114,37 @@ class Encoder(object): | @@ -129,31 +114,37 @@ class Encoder(object): | ||
129 | res.append(list(interp.orthCasePattern)) | 114 | res.append(list(interp.orthCasePattern)) |
130 | return res | 115 | return res |
131 | 116 | ||
132 | - def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId): | 117 | + def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer): |
133 | res = bytearray() | 118 | res = bytearray() |
134 | res.extend(self._encodeTypeNum(typenum)) | 119 | res.extend(self._encodeTypeNum(typenum)) |
135 | encodedInterpsList = bytearray() | 120 | encodedInterpsList = bytearray() |
136 | - if withCasePattern: | 121 | + if isAnalyzer: |
137 | casePatterns = self._getOrthCasePatterns(interpsList) | 122 | casePatterns = self._getOrthCasePatterns(interpsList) |
138 | encodedInterpsList.append(len(casePatterns)) | 123 | encodedInterpsList.append(len(casePatterns)) |
139 | for casePattern in casePatterns: | 124 | for casePattern in casePatterns: |
140 | encodedInterpsList.extend(self._encodeCasePattern(casePattern)) | 125 | encodedInterpsList.extend(self._encodeCasePattern(casePattern)) |
141 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): | 126 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
142 | - if withHomonymId: | ||
143 | - encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False)) | ||
144 | - encodedInterpsList.append(0) | ||
145 | - if withCasePattern: | 127 | + if isAnalyzer: |
146 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) | 128 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) |
147 | - encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) | ||
148 | - encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) | ||
149 | - encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) | 129 | + else: |
130 | + serializeString(interp.homonymId, encodedInterpsList) | ||
131 | + serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) | ||
132 | + encodedInterpsList.append(interp.encodedForm.cutLength) | ||
133 | + serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) | ||
134 | + if isAnalyzer: | ||
135 | + encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) | ||
136 | + encodedInterpsList.extend(htons(interp.tagnum)) | ||
137 | + encodedInterpsList.append(interp.namenum) | ||
150 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) | 138 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) |
139 | + | ||
140 | + if interp.encodedForm.suffixToAdd == 'bc': | ||
141 | + print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList] | ||
151 | 142 | ||
152 | - res.extend(serializationUtils.htons(len(encodedInterpsList))) | 143 | + res.extend(htons(len(encodedInterpsList))) |
153 | res.extend(encodedInterpsList) | 144 | res.extend(encodedInterpsList) |
154 | return res | 145 | return res |
155 | 146 | ||
156 | - def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId): | 147 | + def _doEncodeData(self, interpsList, isAnalyzer): |
157 | 148 | ||
158 | assert type(interpsList) == frozenset | 149 | assert type(interpsList) == frozenset |
159 | 150 | ||
@@ -167,7 +158,7 @@ class Encoder(object): | @@ -167,7 +158,7 @@ class Encoder(object): | ||
167 | res.append(firstByte) | 158 | res.append(firstByte) |
168 | 159 | ||
169 | for typenum, interpsList in segnum2Interps.iteritems(): | 160 | for typenum, interpsList in segnum2Interps.iteritems(): |
170 | - res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId)) | 161 | + res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer)) |
171 | del interpsList | 162 | del interpsList |
172 | 163 | ||
173 | return res | 164 | return res |
@@ -181,7 +172,7 @@ class MorphEncoder(Encoder): | @@ -181,7 +172,7 @@ class MorphEncoder(Encoder): | ||
181 | self.LEMMA_MIXED_CASE = 2 | 172 | self.LEMMA_MIXED_CASE = 2 |
182 | 173 | ||
183 | def encodeData(self, interpsList): | 174 | def encodeData(self, interpsList): |
184 | - return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False) | 175 | + return self._doEncodeData(interpsList, isAnalyzer=True) |
185 | 176 | ||
186 | class Encoder4Generator(Encoder): | 177 | class Encoder4Generator(Encoder): |
187 | 178 | ||
@@ -189,4 +180,4 @@ class Encoder4Generator(Encoder): | @@ -189,4 +180,4 @@ class Encoder4Generator(Encoder): | ||
189 | super(Encoder4Generator, self).__init__(False, encoding) | 180 | super(Encoder4Generator, self).__init__(False, encoding) |
190 | 181 | ||
191 | def encodeData(self, interpsList): | 182 | def encodeData(self, interpsList): |
192 | - return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True) | 183 | + return self._doEncodeData(interpsList, isAnalyzer=False) |
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -43,9 +43,6 @@ class FSA(object): | @@ -43,9 +43,6 @@ class FSA(object): | ||
43 | # debug | 43 | # debug |
44 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: | 44 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: |
45 | logging.info(u'%d %s' % (self.n, word)) | 45 | logging.info(u'%d %s' % (self.n, word)) |
46 | -# logging.info(str(self.register.getStatesNum())) | ||
47 | -# logging.info(str(self.register.getStatesNum())) | ||
48 | - # allWords.append(word) | ||
49 | for label in encodedWord: | 46 | for label in encodedWord: |
50 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 | 47 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
51 | 48 |
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -56,6 +56,7 @@ class TagRule(SegmentRule): | @@ -56,6 +56,7 @@ class TagRule(SegmentRule): | ||
56 | 56 | ||
57 | def __str__(self): | 57 | def __str__(self): |
58 | res = self.segtype | 58 | res = self.segtype |
59 | + res += '(' + str(self.segnum) + ')' | ||
59 | if self.shiftOrth: | 60 | if self.shiftOrth: |
60 | res += '>' | 61 | res += '>' |
61 | return res | 62 | return res |
@@ -70,8 +71,8 @@ class TagRule(SegmentRule): | @@ -70,8 +71,8 @@ class TagRule(SegmentRule): | ||
70 | class UnaryRule(SegmentRule): | 71 | class UnaryRule(SegmentRule): |
71 | 72 | ||
72 | def __init__(self, child, linenum): | 73 | def __init__(self, child, linenum): |
74 | + super(UnaryRule, self).__init__(linenum) | ||
73 | self.child = child | 75 | self.child = child |
74 | - self.linenum = linenum | ||
75 | assert not child.isSinkRule() | 76 | assert not child.isSinkRule() |
76 | 77 | ||
77 | def isShiftOrthRule(self): | 78 | def isShiftOrthRule(self): |
@@ -80,8 +81,8 @@ class UnaryRule(SegmentRule): | @@ -80,8 +81,8 @@ class UnaryRule(SegmentRule): | ||
80 | class ComplexRule(SegmentRule): | 81 | class ComplexRule(SegmentRule): |
81 | 82 | ||
82 | def __init__(self, children, linenum): | 83 | def __init__(self, children, linenum): |
84 | + super(ComplexRule, self).__init__(linenum) | ||
83 | self.children = children | 85 | self.children = children |
84 | - self.linenum = linenum | ||
85 | assert not any(map(lambda c: c.isSinkRule(), children)) | 86 | assert not any(map(lambda c: c.isSinkRule(), children)) |
86 | 87 | ||
87 | def addToNFA(self, fsa): | 88 | def addToNFA(self, fsa): |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,4 @@ class RulesFSA(object): | @@ -68,6 +68,4 @@ class RulesFSA(object): | ||
68 | res.extend(self.stateData2bytearray(state)) | 68 | res.extend(self.stateData2bytearray(state)) |
69 | res.extend(self.transitionsData2bytearray(state)) | 69 | res.extend(self.transitionsData2bytearray(state)) |
70 | 70 | ||
71 | -# logging.info('Segmentation automaton size: %d bytes', len(res)) | ||
72 | -# print list(res) | ||
73 | return res | 71 | return res |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -49,6 +49,7 @@ class RulesManager(object): | @@ -49,6 +49,7 @@ class RulesManager(object): | ||
49 | res.extend(self._serializeDFA(dfa)) | 49 | res.extend(self._serializeDFA(dfa)) |
50 | res.extend(self._serializeOptionsMap(self.defaultOptions)) | 50 | res.extend(self._serializeOptionsMap(self.defaultOptions)) |
51 | logging.info('segmentation rules size: %s bytes', len(res)) | 51 | logging.info('segmentation rules size: %s bytes', len(res)) |
52 | +# logging.info([int(x) for x in res]) | ||
52 | return res | 53 | return res |
53 | 54 | ||
54 | def _serializeSeparatorsList(self): | 55 | def _serializeSeparatorsList(self): |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -63,8 +63,8 @@ class RulesParser(object): | @@ -63,8 +63,8 @@ class RulesParser(object): | ||
63 | nfa = rulesNFA.RulesNFA() | 63 | nfa = rulesNFA.RulesNFA() |
64 | if not firstNFA: | 64 | if not firstNFA: |
65 | firstNFA = nfa | 65 | firstNFA = nfa |
66 | - section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations' | ||
67 | - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False) | 66 | +# section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations' |
67 | + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False) | ||
68 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) | 68 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) |
69 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): | 69 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): |
70 | if rule.allowsEmptySequence(): | 70 | if rule.allowsEmptySequence(): |
@@ -72,8 +72,11 @@ class RulesParser(object): | @@ -72,8 +72,11 @@ class RulesParser(object): | ||
72 | filename, | 72 | filename, |
73 | rule.linenum, | 73 | rule.linenum, |
74 | 'This rule allows empty segments sequence to be accepted') | 74 | 'This rule allows empty segments sequence to be accepted') |
75 | - rule.addToNFA(nfa) | ||
76 | -# nfa.debug() | 75 | + if self.rulesType == RulesParser.PARSE4GENERATOR: |
76 | + rule = rule.transformToGeneratorVersion() | ||
77 | + if not rule.isSinkRule(): | ||
78 | + rule.addToNFA(nfa) | ||
79 | +# nfa.debug() | ||
77 | try: | 80 | try: |
78 | dfa = nfa.convertToDFA() | 81 | dfa = nfa.convertToDFA() |
79 | res.addDFA(key2Def, dfa) | 82 | res.addDFA(key2Def, dfa) |
@@ -146,10 +149,11 @@ class RulesParser(object): | @@ -146,10 +149,11 @@ class RulesParser(object): | ||
146 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3 | 149 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3 |
147 | oneOfRule = delimitedList(unaryRule, delim='|') | 150 | oneOfRule = delimitedList(unaryRule, delim='|') |
148 | complexRule = unaryRule ^ oneOfRule | 151 | complexRule = unaryRule ^ oneOfRule |
149 | - if self.rulesType == RulesParser.PARSE4ANALYZER: | ||
150 | - concatRule = OneOrMore(complexRule) | ||
151 | - else: | ||
152 | - concatRule = ZeroOrMore(shiftOrthRule) + tagRule | 152 | + concatRule = OneOrMore(complexRule) |
153 | +# if self.rulesType == RulesParser.PARSE4ANALYZER: | ||
154 | +# concatRule = OneOrMore(complexRule) | ||
155 | +# else: | ||
156 | +# concatRule = ZeroOrMore(shiftOrthRule) + tagRule | ||
153 | rule << concatRule + Optional(CaselessLiteral('!weak')) | 157 | rule << concatRule + Optional(CaselessLiteral('!weak')) |
154 | 158 | ||
155 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) | 159 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -22,3 +22,7 @@ def htonl(n): | @@ -22,3 +22,7 @@ def htonl(n): | ||
22 | res.append((n & 0x0000FF00) >> 8) | 22 | res.append((n & 0x0000FF00) >> 8) |
23 | res.append(n & 0x000000FF) | 23 | res.append(n & 0x000000FF) |
24 | return res | 24 | return res |
25 | + | ||
26 | +def serializeString(string, out): | ||
27 | + out.extend(string.encode('utf8')) | ||
28 | + out.append(0) |
input/segmenty.dat
morfeusz/CasePatternHelper.hpp
@@ -62,15 +62,6 @@ public: | @@ -62,15 +62,6 @@ public: | ||
62 | } | 62 | } |
63 | } | 63 | } |
64 | 64 | ||
65 | - const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const { | ||
66 | - const unsigned char* currPtr = ig.ptr; | ||
67 | - unsigned char casePatternsNum = *currPtr++; | ||
68 | - for (unsigned int i = 0; i < casePatternsNum; i++) { | ||
69 | - deserializeOneCasePattern(currPtr); | ||
70 | - } | ||
71 | - return currPtr; | ||
72 | - } | ||
73 | - | ||
74 | std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { | 65 | std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { |
75 | std::vector<bool> res; | 66 | std::vector<bool> res; |
76 | uint8_t casePatternType = *ptr; | 67 | uint8_t casePatternType = *ptr; |
@@ -103,26 +94,6 @@ public: | @@ -103,26 +94,6 @@ public: | ||
103 | } | 94 | } |
104 | return res; | 95 | return res; |
105 | } | 96 | } |
106 | - | ||
107 | -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | ||
108 | -// if (this->caseSensitive) { | ||
109 | -// for (unsigned int i = 0; i < chunks.size(); i++) { | ||
110 | -// const InterpretedChunk& ic = chunks[i]; | ||
111 | -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | ||
112 | -// std::vector<bool> casePattern; | ||
113 | -// deserializeCasePattern(casePatternPtr, casePattern); | ||
114 | -// if (!checkCasePattern(ic, casePattern)) { | ||
115 | -// return false; | ||
116 | -// } | ||
117 | -// } | ||
118 | -// } | ||
119 | -// return true; | ||
120 | -// } | ||
121 | - | ||
122 | -// void skipCasePattern(const unsigned char*& ptr) const { | ||
123 | -// vector<bool> _dupa; | ||
124 | -// deserializeCasePattern(ptr, _dupa); | ||
125 | -// } | ||
126 | private: | 97 | private: |
127 | bool caseSensitive; | 98 | bool caseSensitive; |
128 | 99 |
morfeusz/InterpretedChunksDecoder.hpp
@@ -40,18 +40,6 @@ public: | @@ -40,18 +40,6 @@ public: | ||
40 | 40 | ||
41 | protected: | 41 | protected: |
42 | 42 | ||
43 | - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | ||
44 | - EncodedInterpretation interp; | ||
45 | - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
46 | - deserializeEncodedForm(ptr, interp.value); | ||
47 | - interp.tag = readInt16(ptr); | ||
48 | - interp.nameClassifier = *ptr++; | ||
49 | - interp.qualifiers = readInt16(ptr); | ||
50 | - return interp; | ||
51 | - } | ||
52 | - | ||
53 | - virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0; | ||
54 | - | ||
55 | const Environment& env; | 43 | const Environment& env; |
56 | }; | 44 | }; |
57 | 45 | ||
@@ -106,6 +94,16 @@ protected: | @@ -106,6 +94,16 @@ protected: | ||
106 | assert(encodedForm.casePattern.size() == 0); | 94 | assert(encodedForm.casePattern.size() == 0); |
107 | encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | 95 | encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); |
108 | } | 96 | } |
97 | + | ||
98 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | ||
99 | + EncodedInterpretation interp; | ||
100 | + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
101 | + deserializeEncodedForm(ptr, interp.value); | ||
102 | + interp.tag = readInt16(ptr); | ||
103 | + interp.nameClassifier = *ptr++; | ||
104 | + interp.qualifiers = readInt16(ptr); | ||
105 | + return interp; | ||
106 | + } | ||
109 | private: | 107 | private: |
110 | 108 | ||
111 | pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { | 109 | pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { |
@@ -176,7 +174,7 @@ public: | @@ -176,7 +174,7 @@ public: | ||
176 | const unsigned char* currPtr = interpretedChunk.interpsPtr; | 174 | const unsigned char* currPtr = interpretedChunk.interpsPtr; |
177 | while (currPtr < interpretedChunk.interpsEndPtr) { | 175 | while (currPtr < interpretedChunk.interpsEndPtr) { |
178 | MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | 176 | MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); |
179 | - // cerr << mi.toString(false) << endl; | 177 | +// cerr << mi.toString(false) << endl; |
180 | // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | 178 | // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; |
181 | if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) { | 179 | if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) { |
182 | out.push_back(mi); | 180 | out.push_back(mi); |
@@ -203,15 +201,12 @@ private: | @@ -203,15 +201,12 @@ private: | ||
203 | const InterpretedChunk& chunk, | 201 | const InterpretedChunk& chunk, |
204 | const unsigned char*& ptr) const { | 202 | const unsigned char*& ptr) const { |
205 | string orth = orthPrefix; | 203 | string orth = orthPrefix; |
206 | - string homonymId = (const char*) ptr; | ||
207 | - ptr += strlen((const char*) ptr) + 1; | ||
208 | EncodedInterpretation ei = this->deserializeInterp(ptr); | 204 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
209 | this->decodeForm(chunk.originalCodepoints, ei.value, orth); | 205 | this->decodeForm(chunk.originalCodepoints, ei.value, orth); |
210 | - // string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId); | ||
211 | return MorphInterpretation( | 206 | return MorphInterpretation( |
212 | startNode, endNode, | 207 | startNode, endNode, |
213 | orth, lemma, | 208 | orth, lemma, |
214 | - homonymId, | 209 | + ei.homonymId, |
215 | ei.tag, | 210 | ei.tag, |
216 | ei.nameClassifier, | 211 | ei.nameClassifier, |
217 | ei.qualifiers, | 212 | ei.qualifiers, |
@@ -233,14 +228,17 @@ private: | @@ -233,14 +228,17 @@ private: | ||
233 | env.getCharsetConverter().append(cp, res); | 228 | env.getCharsetConverter().append(cp, res); |
234 | } | 229 | } |
235 | } | 230 | } |
236 | - | ||
237 | - void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const { | ||
238 | - encodedForm.prefixToAdd = (const char*) ptr; | ||
239 | - ptr += strlen((const char*) ptr) + 1; | ||
240 | - encodedForm.suffixToCut = *ptr; | ||
241 | - ptr++; | ||
242 | - encodedForm.suffixToAdd = (const char*) ptr; | ||
243 | - ptr += strlen((const char*) ptr) + 1; | 231 | + |
232 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | ||
233 | + EncodedInterpretation interp; | ||
234 | + interp.homonymId = readString(ptr); | ||
235 | + interp.value.prefixToAdd = readString(ptr); | ||
236 | + interp.value.suffixToCut = readInt8(ptr); | ||
237 | + interp.value.suffixToAdd = readString(ptr); | ||
238 | + interp.tag = readInt16(ptr); | ||
239 | + interp.nameClassifier = readInt8(ptr); | ||
240 | + interp.qualifiers = readInt16(ptr); | ||
241 | + return interp; | ||
244 | } | 242 | } |
245 | }; | 243 | }; |
246 | 244 |
morfeusz/Morfeusz.cpp
@@ -18,6 +18,7 @@ | @@ -18,6 +18,7 @@ | ||
18 | #include "charset/CaseConverter.hpp" | 18 | #include "charset/CaseConverter.hpp" |
19 | #include "segrules/segrules.hpp" | 19 | #include "segrules/segrules.hpp" |
20 | #include "const.hpp" | 20 | #include "const.hpp" |
21 | +#include "deserializationUtils.hpp" | ||
21 | #include "charset/utf8.h" | 22 | #include "charset/utf8.h" |
22 | 23 | ||
23 | // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba | 24 | // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba |
@@ -40,6 +41,20 @@ options(createDefaultOptions()) { | @@ -40,6 +41,20 @@ options(createDefaultOptions()) { | ||
40 | generatorEnv.setCaseSensitive(false); | 41 | generatorEnv.setCaseSensitive(false); |
41 | } | 42 | } |
42 | 43 | ||
44 | +inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) { | ||
45 | + if (env.getProcessorType() == ANALYZER) { | ||
46 | + const unsigned char* currPtr = ig.ptr; | ||
47 | + unsigned char casePatternsNum = *currPtr++; | ||
48 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | ||
49 | + env.getCasePatternHelper().deserializeOneCasePattern(currPtr); | ||
50 | + } | ||
51 | + return currPtr; | ||
52 | + } | ||
53 | + else { | ||
54 | + return ig.ptr; | ||
55 | + } | ||
56 | +} | ||
57 | + | ||
43 | void Morfeusz::setAnalyzerFile(const string& filename) { | 58 | void Morfeusz::setAnalyzerFile(const string& filename) { |
44 | this->analyzerEnv.setFSAFile(filename); | 59 | this->analyzerEnv.setFSAFile(filename); |
45 | } | 60 | } |
@@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord( | @@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord( | ||
183 | it != newSegrulesStates.end(); | 198 | it != newSegrulesStates.end(); |
184 | ++it) { | 199 | ++it) { |
185 | SegrulesState newSegrulesState = *it; | 200 | SegrulesState newSegrulesState = *it; |
186 | - const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig); | 201 | + const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); |
187 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; | 202 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; |
188 | InterpretedChunk ic = { | 203 | InterpretedChunk ic = { |
189 | ig.type, | 204 | ig.type, |
morfeusz/Qualifiers.cpp
@@ -20,7 +20,6 @@ qualifiers() { | @@ -20,7 +20,6 @@ qualifiers() { | ||
20 | readTags(currPtr, _dupa); | 20 | readTags(currPtr, _dupa); |
21 | _dupa.clear(); | 21 | _dupa.clear(); |
22 | readTags(currPtr, _dupa); | 22 | readTags(currPtr, _dupa); |
23 | - | ||
24 | uint16_t allCombinationsSize = readInt16(currPtr); | 23 | uint16_t allCombinationsSize = readInt16(currPtr); |
25 | this->qualifiers.reserve(allCombinationsSize); | 24 | this->qualifiers.reserve(allCombinationsSize); |
26 | for (unsigned int i = 0; i < allCombinationsSize; i++) { | 25 | for (unsigned int i = 0; i < allCombinationsSize; i++) { |
morfeusz/deserializationUtils.hpp
@@ -11,14 +11,24 @@ | @@ -11,14 +11,24 @@ | ||
11 | #include "endianness.hpp" | 11 | #include "endianness.hpp" |
12 | #include <iostream> | 12 | #include <iostream> |
13 | 13 | ||
14 | +inline unsigned char readInt8(const unsigned char*& currPtr) { | ||
15 | + return *currPtr++; | ||
16 | +} | ||
17 | + | ||
14 | inline uint16_t readInt16(const unsigned char*& currPtr) { | 18 | inline uint16_t readInt16(const unsigned char*& currPtr) { |
15 | - uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr)); | 19 | + uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr)); |
16 | currPtr += 2; | 20 | currPtr += 2; |
17 | return res; | 21 | return res; |
18 | } | 22 | } |
19 | 23 | ||
24 | +inline uint32_t readInt32(const unsigned char*& currPtr) { | ||
25 | + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr)); | ||
26 | + currPtr += 4; | ||
27 | + return res; | ||
28 | +} | ||
29 | + | ||
20 | inline std::string readString(const unsigned char*& currPtr) { | 30 | inline std::string readString(const unsigned char*& currPtr) { |
21 | - std::string res(reinterpret_cast<const char*>(currPtr)); | 31 | + std::string res((const char*) currPtr); |
22 | currPtr += res.length(); | 32 | currPtr += res.length(); |
23 | currPtr++; | 33 | currPtr++; |
24 | return res; | 34 | return res; |
morfeusz/segrules/SegrulesFSA.hpp
@@ -9,7 +9,8 @@ | @@ -9,7 +9,8 @@ | ||
9 | #define SEGRULESFSA_HPP | 9 | #define SEGRULESFSA_HPP |
10 | 10 | ||
11 | #include <set> | 11 | #include <set> |
12 | -#include "../endianness.hpp" | 12 | +#include <iostream> |
13 | +#include "../deserializationUtils.hpp" | ||
13 | 14 | ||
14 | struct SegrulesState { | 15 | struct SegrulesState { |
15 | uint16_t offset; | 16 | uint16_t offset; |
@@ -37,8 +38,7 @@ public: | @@ -37,8 +38,7 @@ public: | ||
37 | 38 | ||
38 | const unsigned char* currPtr = ptr + state.offset; | 39 | const unsigned char* currPtr = ptr + state.offset; |
39 | currPtr++; | 40 | currPtr++; |
40 | - const unsigned char transitionsNum = *currPtr; | ||
41 | - currPtr++; | 41 | + const unsigned char transitionsNum = *currPtr++; |
42 | for (unsigned int i = 0; i < transitionsNum; i++) { | 42 | for (unsigned int i = 0; i < transitionsNum; i++) { |
43 | if (*currPtr == segnum) { | 43 | if (*currPtr == segnum) { |
44 | newStates.insert(newStates.begin(), this->transition2State(currPtr)); | 44 | newStates.insert(newStates.begin(), this->transition2State(currPtr)); |
@@ -58,9 +58,8 @@ private: | @@ -58,9 +58,8 @@ private: | ||
58 | unsigned char WEAK_FLAG = 2; | 58 | unsigned char WEAK_FLAG = 2; |
59 | SegrulesState res; | 59 | SegrulesState res; |
60 | transitionPtr++; | 60 | transitionPtr++; |
61 | - res.shiftOrthFromPrevious = *transitionPtr; | ||
62 | - transitionPtr++; | ||
63 | - res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr)); | 61 | + res.shiftOrthFromPrevious = *transitionPtr++; |
62 | + res.offset = readInt16(transitionPtr); | ||
64 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; | 63 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; |
65 | res.weak = *(ptr + res.offset) & WEAK_FLAG; | 64 | res.weak = *(ptr + res.offset) & WEAK_FLAG; |
66 | return res; | 65 | return res; |
morfeusz/segrules/segrules.cpp
@@ -2,25 +2,12 @@ | @@ -2,25 +2,12 @@ | ||
2 | #include "segrules.hpp" | 2 | #include "segrules.hpp" |
3 | #include "../fsa/fsa.hpp" | 3 | #include "../fsa/fsa.hpp" |
4 | #include "../fsa/const.hpp" | 4 | #include "../fsa/const.hpp" |
5 | +#include "../deserializationUtils.hpp" | ||
5 | 6 | ||
6 | using namespace std; | 7 | using namespace std; |
7 | 8 | ||
8 | -static inline uint32_t deserializeUint32(const unsigned char*& ptr) { | ||
9 | - uint32_t res = *reinterpret_cast<const uint32_t*>(ptr); | ||
10 | - res = htonl(res); | ||
11 | - ptr += 4; | ||
12 | - return res; | ||
13 | -} | ||
14 | - | ||
15 | -static inline string deserializeString(const unsigned char*& ptr) { | ||
16 | - string res(reinterpret_cast<const char*>(ptr)); | ||
17 | - ptr += res.length() + 1; | ||
18 | - return res; | ||
19 | -} | ||
20 | - | ||
21 | static inline void skipSeparatorsList(const unsigned char*& ptr) { | 9 | static inline void skipSeparatorsList(const unsigned char*& ptr) { |
22 | - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); | ||
23 | - ptr += 2; | 10 | + uint16_t listSize = readInt16(ptr); |
24 | ptr += 4 * listSize; | 11 | ptr += 4 * listSize; |
25 | } | 12 | } |
26 | 13 | ||
@@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr | @@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr | ||
28 | const unsigned char* additionalDataPtr = ptr | 15 | const unsigned char* additionalDataPtr = ptr |
29 | + FSA_DATA_OFFSET | 16 | + FSA_DATA_OFFSET |
30 | + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | 17 | + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); |
31 | - const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | 18 | + const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; |
32 | return res; | 19 | return res; |
33 | } | 20 | } |
34 | 21 | ||
@@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { | @@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { | ||
47 | unsigned char optsNum = *ptr; | 34 | unsigned char optsNum = *ptr; |
48 | ptr++; | 35 | ptr++; |
49 | for (unsigned char i = 0; i < optsNum; i++) { | 36 | for (unsigned char i = 0; i < optsNum; i++) { |
50 | - string key = deserializeString(ptr); | ||
51 | - res[key] = deserializeString(ptr); | 37 | + string key = readString(ptr); |
38 | + res[key] = readString(ptr); | ||
52 | } | 39 | } |
53 | return res; | 40 | return res; |
54 | } | 41 | } |
55 | 42 | ||
56 | static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { | 43 | static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) { |
57 | - uint32_t fsaSize = deserializeUint32(ptr); | 44 | + uint32_t fsaSize = readInt32(ptr); |
58 | // static SegrulesDeserializer deserializer; | 45 | // static SegrulesDeserializer deserializer; |
59 | SegrulesFSA* res = new SegrulesFSA(ptr); | 46 | SegrulesFSA* res = new SegrulesFSA(ptr); |
60 | ptr += fsaSize; | 47 | ptr += fsaSize; |
nbproject/configurations.xml
@@ -105,7 +105,7 @@ | @@ -105,7 +105,7 @@ | ||
105 | <buildCommandWorkingDir>build</buildCommandWorkingDir> | 105 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
106 | <buildCommand>${MAKE} -f Makefile</buildCommand> | 106 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
107 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> | 107 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
108 | - <executablePath>build/morfeusz/morfeusz_analyzer</executablePath> | 108 | + <executablePath>build/morfeusz/morfeusz_generator</executablePath> |
109 | </makeTool> | 109 | </makeTool> |
110 | </makefileType> | 110 | </makefileType> |
111 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> | 111 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
@@ -311,7 +311,7 @@ | @@ -311,7 +311,7 @@ | ||
311 | <ccTool> | 311 | <ccTool> |
312 | <incDir> | 312 | <incDir> |
313 | <pElem>morfeusz</pElem> | 313 | <pElem>morfeusz</pElem> |
314 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | 314 | + <pElem>/usr/lib/jvm/default-java/include</pElem> |
315 | </incDir> | 315 | </incDir> |
316 | <preprocessorList> | 316 | <preprocessorList> |
317 | <Elem>libjmorfeusz_EXPORTS</Elem> | 317 | <Elem>libjmorfeusz_EXPORTS</Elem> |