Commit ce75f5c31c40bd0aef3a1fb8ec2919c3ea1b3a90
1 parent: 03c17574
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@8 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 2 changed files with 64 additions and 39 deletions
fsabuilder/fsa/encode.py
fsabuilder/fsa/serializer.py
... | ... | @@ -17,40 +17,47 @@ class Serializer(object): |
17 | 17 | |
18 | 18 | def serialize2CppFile(self, fname): |
19 | 19 | res = [] |
20 | - self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
20 | +# self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
21 | 21 | res.append('const unsigned char DEFAULT_FSA[] = {') |
22 | - for idx, state in enumerate(sorted(self.fsa.initialState.dfs(set()), key=lambda state: state.offset)): | |
23 | - res.append('// state '+str(idx)) | |
24 | - partRes = [] | |
25 | - for byte in self.state2bytearray(state): | |
26 | - partRes.append(hex(byte)) | |
27 | - partRes.append(',') | |
28 | - res.append(' '.join(partRes)) | |
22 | + for byte in self.fsa2bytearray(): | |
23 | + res.append(hex(byte)); | |
24 | + res.append(','); | |
25 | +# for idx, state in enumerate(sorted(self.fsa.initialState.dfs(set()), key=lambda state: state.offset)): | |
26 | +# res.append('// state '+str(idx)) | |
27 | +# partRes = [] | |
28 | +# for byte in self.state2bytearray(state): | |
29 | +# partRes.append(hex(byte)) | |
30 | +# partRes.append(',') | |
31 | +# res.append(' '.join(partRes)) | |
29 | 32 | res.append('}') |
30 | 33 | with open(fname, 'w') as f: |
31 | 34 | f.write('\n'.join(res)) |
32 | 35 | |
33 | 36 | def serialize2BinaryFile(self, fname): |
34 | - res = bytearray() | |
35 | - self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
36 | - for state in sorted(self.fsa.initialState.dfs(set()), key=lambda state: state.offset): | |
37 | -# res.append('// state '+str(idx)) | |
38 | - res.extend(self.state2bytearray(state)) | |
37 | +# res = bytearray() | |
38 | +# self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
39 | +# for state in sorted(self.fsa.initialState.dfs(set()), key=lambda state: state.offset): | |
40 | +# # res.append('// state '+str(idx)) | |
41 | +# res.extend(self.state2bytearray(state)) | |
39 | 42 | with open(fname, 'wb') as f: |
40 | - f.write(res) | |
43 | + f.write(self.fsa2bytearray()) | |
41 | 44 | |
42 | 45 | def getStateSize(self, state): |
43 | 46 | raise NotImplementedError('Not implemented') |
44 | 47 | |
45 | - def fsa2bytearray(self, fsa): | |
48 | + def fsa2bytearray(self): | |
46 | 49 | res = bytearray() |
47 | - fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
48 | - for state in sorted(fsa.initialState.dfs(set()), key=state.offset): | |
50 | + res.extend(self.serializePrologue()) | |
51 | + self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | |
52 | + for state in sorted(self.fsa.initialState.dfs(set()), key=state.offset): | |
49 | 53 | res.extend(self.state2bytearray(state)) |
50 | 54 | return res |
51 | 55 | |
52 | 56 | def state2bytearray(self, state): |
53 | 57 | raise NotImplementedError('Not implemented') |
58 | + | |
59 | + def serializePrologue(self): | |
60 | + raise NotImplementedError('Not implemented') | |
54 | 61 | |
55 | 62 | class SimpleSerializer(Serializer): |
56 | 63 | |
... | ... | @@ -92,9 +99,14 @@ class SimpleSerializer(Serializer): |
92 | 99 | res.append((offset & 0x00FF00) >> 8) |
93 | 100 | res.append((offset & 0xFF0000) >> 16) |
94 | 101 | return res |
102 | + | |
103 | + def serializePrologue(self): | |
104 | + return bytearray() | |
95 | 105 | |
96 | 106 | class VLengthSerializer(Serializer): |
97 | - | |
107 | + | |
108 | + MAGIC_NUMBER = 0x8fc2bc1b | |
109 | + VERSION = 1 | |
98 | 110 | LAST_FLAG = 128 |
99 | 111 | |
100 | 112 | def __init__(self, fsa): |
... | ... | @@ -102,6 +114,29 @@ class VLengthSerializer(Serializer): |
102 | 114 | self.statesTable = list(reversed(fsa.dfs(set()))) |
103 | 115 | self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)]) |
104 | 116 | |
117 | + # labels sorted by popularity | |
118 | + self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label))] | |
119 | + | |
120 | + # popular labels table | |
121 | + self.label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:31]) | |
122 | + | |
123 | + def serializePrologue(self): | |
124 | + res = bytearray() | |
125 | + | |
126 | + # serialize magic number in big-endian order | |
127 | + res.append((VLengthSerializer.MAGIC_NUMBER & 0xFF000000) >> 24) | |
128 | + res.append((VLengthSerializer.MAGIC_NUMBER & 0x00FF0000) >> 16) | |
129 | + res.append((VLengthSerializer.MAGIC_NUMBER & 0x0000FF00) >> 8) | |
130 | + res.append(VLengthSerializer.MAGIC_NUMBER & 0x000000FF) | |
131 | + | |
132 | + # serialize version number | |
133 | + res.append(VLengthSerializer.VERSION) | |
134 | + | |
135 | + # serialize popular labels | |
136 | + for label, freq in self.sortedLabels[:31]: | |
137 | + res.append(label) | |
138 | + | |
139 | + return res | |
105 | 140 | |
106 | 141 | def getStateSize(self, state): |
107 | 142 | return len(self.state2bytearray(state)) |
... | ... | @@ -109,7 +144,7 @@ class VLengthSerializer(Serializer): |
109 | 144 | def getDataSize(self, state): |
110 | 145 | assert type(state.encodedData) == bytearray or not state.isAccepting() |
111 | 146 | return len(state.encodedData) if state.isAccepting() else 0 |
112 | - | |
147 | + | |
113 | 148 | def state2bytearray(self, state): |
114 | 149 | res = bytearray() |
115 | 150 | res.extend(self._stateData2bytearray(state)) |
... | ... | @@ -124,8 +159,6 @@ class VLengthSerializer(Serializer): |
124 | 159 | |
125 | 160 | def _transitionsData2bytearray(self, state): |
126 | 161 | res = bytearray() |
127 | - sortedLabels = list(sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label))) | |
128 | - label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:30]) | |
129 | 162 | transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, _): (-next.freq, -self.label2Count[label])) |
130 | 163 | thisIdx = self.state2Index[state] |
131 | 164 | |
... | ... | @@ -139,11 +172,8 @@ class VLengthSerializer(Serializer): |
139 | 172 | assert nextState.reverseOffset is not None |
140 | 173 | n = len(transitions) - reversedN |
141 | 174 | |
142 | - popularLabel = label2Index[label] < 31 | |
143 | - firstByte = (label2Index[label] + 1) if popularLabel else 0 | |
144 | - | |
145 | -# if state.isAccepting(): | |
146 | -# firstByte |= VLengthSerializer.ACCEPTING_FLAG | |
175 | + popularLabel = self.label2Index[label] < 31 | |
176 | + firstByte = self.label2Index[label] if popularLabel else 31 | |
147 | 177 | |
148 | 178 | last = len(transitions) == n |
149 | 179 | next = last and stateAfterThis == nextState |
... | ... | @@ -162,24 +192,19 @@ class VLengthSerializer(Serializer): |
162 | 192 | if offset >= 256 * 256: |
163 | 193 | offset += 1 |
164 | 194 | offsetSize += 1 |
165 | - assert offset < 256 * 256 * 256 #TODO - przerobić na jakiś porządny wyjątek | |
195 | + assert offset < 256 * 256 * 256 #TODO - przerobic na jakis porzadny wyjatek | |
166 | 196 | |
167 | 197 | firstByte |= (32 * offsetSize) |
168 | 198 | |
169 | 199 | res.append(firstByte) |
170 | 200 | if not popularLabel: |
171 | 201 | res.append(label) |
172 | - if offsetSize >= 1: | |
173 | - res.append(offset & 0x0000FF) | |
174 | - if offsetSize >= 2: | |
175 | - res.append((offset & 0x00FF00) >> 8) | |
202 | + # serialize offset in big-endian order | |
176 | 203 | if offsetSize == 3: |
177 | 204 | res.append((offset & 0xFF0000) >> 16) |
205 | + if offsetSize >= 2: | |
206 | + res.append((offset & 0x00FF00) >> 8) | |
207 | + if offsetSize >= 1: | |
208 | + res.append(offset & 0x0000FF) | |
209 | + | |
178 | 210 | return res |
179 | -# currReverseOffset = nextState.reverseOffset | |
180 | -# res.append(byte) | |
181 | -# offset = nextState.offset | |
182 | -# res.append(offset & 0x0000FF) | |
183 | -# res.append((offset & 0x00FF00) >> 8) | |
184 | -# res.append((offset & 0xFF0000) >> 16) | |
185 | -# return res | |
186 | 211 | \ No newline at end of file |
... | ... |