Commit ce75f5c31c40bd0aef3a1fb8ec2919c3ea1b3a90

Authored by Michał Lenart
1 parent 03c17574

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@8 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

fsabuilder/fsa/encode.py
... ... @@ -10,7 +10,7 @@ class Encoder(object):
10 10 '''
11 11  
12 12  
13   - def __init__(self, encoding='utf8', appendZero):
  13 + def __init__(self, encoding='utf8', appendZero=True):
14 14 '''
15 15 Constructor
16 16 '''
... ...
fsabuilder/fsa/serializer.py
... ... @@ -17,40 +17,47 @@ class Serializer(object):
17 17  
def serialize2CppFile(self, fname):
    """Write the serialized automaton to *fname* as a C++ byte-array definition.

    The output is one token per line: the opening
    ``const unsigned char DEFAULT_FSA[] = {``, then for every byte of
    ``self.fsa2bytearray()`` its ``hex()`` form followed by a ``,`` line,
    and finally the closing ``}``.

    Args:
        fname: path of the text file to create or overwrite.
    """
    # NOTE(review): the emitted definition ends with '}' and no ';' --
    # confirm that whatever includes this file supplies the terminator.
    lines = ['const unsigned char DEFAULT_FSA[] = {']
    for byte in self.fsa2bytearray():
        lines.append(hex(byte))
        lines.append(',')
    lines.append('}')
    # The whole text is built before the file is opened, so a failure while
    # serializing never leaves a half-written file behind.
    with open(fname, 'w') as f:
        f.write('\n'.join(lines))
32 35  
def serialize2BinaryFile(self, fname):
    """Write the raw serialized automaton to *fname*.

    The file content is exactly the bytes returned by
    ``self.fsa2bytearray()``.

    Args:
        fname: path of the binary file to create or overwrite.
    """
    # Serialize before opening: opening with 'wb' truncates the target, so
    # doing it first would destroy an existing file if serialization raised.
    payload = self.fsa2bytearray()
    with open(fname, 'wb') as f:
        f.write(payload)
41 44  
def getStateSize(self, state):
    """Return the size of *state* as used for offset calculation.

    This base implementation is a placeholder: it always raises, and
    concrete serializers are expected to override it.

    Raises:
        NotImplementedError: always.
    """
    message = 'Not implemented'
    raise NotImplementedError(message)
44 47  
def fsa2bytearray(self):
    """Serialize the whole automaton held in ``self.fsa`` into a bytearray.

    Steps, in order:
      1. emit ``self.serializePrologue()``;
      2. let the automaton compute state offsets, using
         ``self.getStateSize`` as the per-state size function;
      3. append ``self.state2bytearray(state)`` for every state reachable
         from the initial state, in ascending ``state.offset`` order.

    Returns:
        bytearray: prologue followed by the encoded states.
    """
    res = bytearray()
    res.extend(self.serializePrologue())
    self.fsa.calculateOffsets(sizeCounter=self.getStateSize)
    # The sort key has to be a callable; the bare ``state.offset`` used
    # before read the loop variable before it was bound and raised.
    reachable = self.fsa.initialState.dfs(set())
    for state in sorted(reachable, key=lambda state: state.offset):
        res.extend(self.state2bytearray(state))
    return res
51 55  
def state2bytearray(self, state):
    """Encode a single *state* as bytes.

    This base implementation is a placeholder: it always raises, and
    concrete serializers are expected to override it.

    Raises:
        NotImplementedError: always.
    """
    message = 'Not implemented'
    raise NotImplementedError(message)
  58 +
def serializePrologue(self):
    """Return the bytes that precede the encoded states in the output.

    This base implementation is a placeholder: it always raises, and
    concrete serializers are expected to override it.

    Raises:
        NotImplementedError: always.
    """
    message = 'Not implemented'
    raise NotImplementedError(message)
54 61  
55 62 class SimpleSerializer(Serializer):
56 63  
... ... @@ -92,9 +99,14 @@ class SimpleSerializer(Serializer):
92 99 res.append((offset & 0x00FF00) >> 8)
93 100 res.append((offset & 0xFF0000) >> 16)
94 101 return res
  102 +
def serializePrologue(self):
    """Return the prologue for this format, which is empty.

    Returns:
        bytearray: a new, zero-length bytearray.
    """
    no_header = bytearray()
    return no_header
95 105  
96 106 class VLengthSerializer(Serializer):
97   -
  107 +
  108 + MAGIC_NUMBER = 0x8fc2bc1b
  109 + VERSION = 1
98 110 LAST_FLAG = 128
99 111  
100 112 def __init__(self, fsa):
... ... @@ -102,6 +114,29 @@ class VLengthSerializer(Serializer):
102 114 self.statesTable = list(reversed(fsa.dfs(set())))
103 115 self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)])
104 116  
  117 + # labels sorted by popularity
  118 + self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label))]
  119 +
  120 + # popular labels table
  121 + self.label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:31])
  122 +
def serializePrologue(self):
    """Build the file header: magic number, format version, popular labels.

    Layout, in order:
      * ``MAGIC_NUMBER`` as 4 bytes, most significant byte first;
      * ``VERSION`` as 1 byte;
      * the first 31 entries of ``self.sortedLabels``, one byte each.

    The constants are read through ``self``, so they resolve to the class
    attributes unless a subclass overrides them.

    Returns:
        bytearray: the encoded prologue.
    """
    res = bytearray()

    # serialize magic number in big-endian order
    magic = self.MAGIC_NUMBER
    for shift in (24, 16, 8, 0):
        res.append((magic >> shift) & 0xFF)

    # serialize version number
    res.append(self.VERSION)

    # serialize popular labels; sortedLabels holds bare labels, so each
    # entry is appended as-is (unpacking it as a (label, freq) pair fails)
    # NOTE(review): with fewer than 31 labels the prologue is shorter and
    # carries no count -- confirm the reader copes with that.
    for label in self.sortedLabels[:31]:
        res.append(label)

    return res
105 140  
def getStateSize(self, state):
    """Return the number of bytes *state* occupies once encoded.

    The size is measured by encoding the state with
    ``self.state2bytearray`` and taking the length of the result.
    """
    encoded = self.state2bytearray(state)
    return len(encoded)
... ... @@ -109,7 +144,7 @@ class VLengthSerializer(Serializer):
def getDataSize(self, state):
    """Return the length of the data attached to *state*.

    Args:
        state: object exposing ``isAccepting()`` and ``encodedData``.

    Returns:
        int: ``len(state.encodedData)`` for an accepting state, else 0.

    Raises:
        AssertionError: if an accepting state's ``encodedData`` is not a
            bytearray.
    """
    accepting = state.isAccepting()
    # isinstance rather than an exact type comparison, so bytearray
    # subclasses are accepted as well.
    assert isinstance(state.encodedData, bytearray) or not accepting
    return len(state.encodedData) if accepting else 0
112   -
  147 +
113 148 def state2bytearray(self, state):
114 149 res = bytearray()
115 150 res.extend(self._stateData2bytearray(state))
... ... @@ -124,8 +159,6 @@ class VLengthSerializer(Serializer):
124 159  
125 160 def _transitionsData2bytearray(self, state):
126 161 res = bytearray()
127   - sortedLabels = list(sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label)))
128   - label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:30])
129 162 transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, _): (-next.freq, -self.label2Count[label]))
130 163 thisIdx = self.state2Index[state]
131 164  
... ... @@ -139,11 +172,8 @@ class VLengthSerializer(Serializer):
139 172 assert nextState.reverseOffset is not None
140 173 n = len(transitions) - reversedN
141 174  
142   - popularLabel = label2Index[label] < 31
143   - firstByte = (label2Index[label] + 1) if popularLabel else 0
144   -
145   -# if state.isAccepting():
146   -# firstByte |= VLengthSerializer.ACCEPTING_FLAG
  175 + popularLabel = self.label2Index[label] < 31
  176 + firstByte = self.label2Index[label] if popularLabel else 31
147 177  
148 178 last = len(transitions) == n
149 179 next = last and stateAfterThis == nextState
... ... @@ -162,24 +192,19 @@ class VLengthSerializer(Serializer):
162 192 if offset >= 256 * 256:
163 193 offset += 1
164 194 offsetSize += 1
165   - assert offset < 256 * 256 * 256 #TODO - przerobić na jakiś porządny wyjątek
  195 + assert offset < 256 * 256 * 256 #TODO - przerobic na jakis porzadny wyjatek
166 196  
167 197 firstByte |= (32 * offsetSize)
168 198  
169 199 res.append(firstByte)
170 200 if not popularLabel:
171 201 res.append(label)
172   - if offsetSize >= 1:
173   - res.append(offset & 0x0000FF)
174   - if offsetSize >= 2:
175   - res.append((offset & 0x00FF00) >> 8)
  202 + # serialize offset in big-endian order
176 203 if offsetSize == 3:
177 204 res.append((offset & 0xFF0000) >> 16)
  205 + if offsetSize >= 2:
  206 + res.append((offset & 0x00FF00) >> 8)
  207 + if offsetSize >= 1:
  208 + res.append(offset & 0x0000FF)
  209 +
178 210 return res
179   -# currReverseOffset = nextState.reverseOffset
180   -# res.append(byte)
181   -# offset = nextState.offset
182   -# res.append(offset & 0x0000FF)
183   -# res.append((offset & 0x00FF00) >> 8)
184   -# res.append((offset & 0xFF0000) >> 16)
185   -# return res
186 211 \ No newline at end of file
... ...