Commit d42f73bcc0bb85658229c6d1c1fa0366a4d5be02
1 parent
e2ef01be
już praktycznie działa zakodowywanie interpretacji morfologicznych tak, jak ma być "docelowo"
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@15 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 6 changed files with 65 additions and 28 deletions
fsabuilder/fsa/buildfsa.py
... | ... | @@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer |
16 | 16 | from visualizer import Visualizer |
17 | 17 | from optparse import OptionParser |
18 | 18 | |
19 | -from pycallgraph import PyCallGraph | |
20 | -from pycallgraph.output import GraphvizOutput | |
21 | - | |
22 | 19 | class OutputFormat(): |
23 | 20 | BINARY = 'BINARY' |
24 | 21 | CPP = 'CPP' |
... | ... | @@ -131,8 +128,7 @@ def _parseOptions(): |
131 | 128 | # exit(1) |
132 | 129 | return opts |
133 | 130 | |
134 | -def _readPolimorfInput(inputFile, tagsetFile, encoder): | |
135 | - tagset = common.Tagset(tagsetFile) | |
131 | +def _readPolimorfInput(inputFile, tagset, encoder): | |
136 | 132 | with codecs.open(inputFile, 'r', 'utf8') as f: |
137 | 133 | for entry in convertinput.convertPolimorf(f, tagset, encoder): |
138 | 134 | yield entry |
... | ... | @@ -167,8 +163,9 @@ def _printStats(fsa): |
167 | 163 | |
168 | 164 | def buildFromPoliMorf(inputFile, tagsetFile): |
169 | 165 | encoder = encode.MorphEncoder() |
170 | - fsa = FSA(encoder) | |
171 | - inputData = _readPolimorfInput(inputFile, tagsetFile, encoder) | |
166 | + tagset = common.Tagset(tagsetFile) | |
167 | + fsa = FSA(encoder, tagset) | |
168 | + inputData = _readPolimorfInput(inputFile, tagset, encoder) | |
172 | 169 | fsa.feed(inputData) |
173 | 170 | _printStats(fsa) |
174 | 171 | return fsa |
... | ... | @@ -230,6 +227,8 @@ def main(opts): |
230 | 227 | if __name__ == '__main__': |
231 | 228 | opts = _parseOptions() |
232 | 229 | if opts.profile: |
230 | + from pycallgraph import PyCallGraph | |
231 | + from pycallgraph.output import GraphvizOutput | |
233 | 232 | with PyCallGraph(output=GraphvizOutput()): |
234 | 233 | main(opts) |
235 | 234 | else: |
... | ... |
fsabuilder/fsa/common.py
... | ... | @@ -23,15 +23,15 @@ class Interpretation(object): |
23 | 23 | root += o |
24 | 24 | else: |
25 | 25 | break |
26 | - cutLength = len(orth) - len(root) | |
26 | + cutLength = len(encoder.encodeWord(orth)) - len(encoder.encodeWord(root)) | |
27 | 27 | self.lemma = Lemma( |
28 | 28 | cutLength=cutLength, |
29 | - suffixToAdd=base[len(root):]) | |
29 | + suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) | |
30 | 30 | self.tagnum = tagnum |
31 | 31 | self.namenum = namenum |
32 | 32 | |
33 | 33 | def getSortKey(self): |
34 | - return (self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum) | |
34 | + return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) | |
35 | 35 | |
36 | 36 | def __eq__(self, other): |
37 | 37 | if isinstance(other, Interpretation): |
... | ... |
fsabuilder/fsa/encode.py
... | ... | @@ -38,8 +38,8 @@ class Encoder(object): |
38 | 38 | |
39 | 39 | class SimpleEncoder(Encoder): |
40 | 40 | |
41 | - def __init__(self, encoding='utf8', appendZero=False): | |
42 | - super(SimpleEncoder, self).__init__(encoding, appendZero) | |
41 | + def __init__(self, encoding='utf8'): | |
42 | + super(SimpleEncoder, self).__init__(encoding) | |
43 | 43 | |
44 | 44 | def encodeData(self, data): |
45 | 45 | return bytearray(data, encoding=self.encoding) + bytearray([0]) |
... | ... | @@ -70,7 +70,7 @@ class MorphEncoder(Encoder): |
70 | 70 | res = bytearray() |
71 | 71 | assert lemma.cutLength < 256 and lemma.cutLength >= 0 |
72 | 72 | res.append(lemma.cutLength) |
73 | - res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False)) | |
73 | + res.extend(lemma.suffixToAdd) | |
74 | 74 | res.append(0) |
75 | 75 | return res |
76 | 76 | |
... | ... |
fsabuilder/fsa/fsa.py
... | ... | @@ -14,11 +14,12 @@ class FSA(object): |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | |
17 | - def __init__(self, encoder): | |
17 | + def __init__(self, encoder, tagset=None): | |
18 | 18 | self.encodeWord = encoder.encodeWord |
19 | 19 | self.encodeData = encoder.encodeData |
20 | 20 | self.decodeData = encoder.decodeData |
21 | 21 | self.encodedPrevWord = None |
22 | + self.tagset = tagset | |
22 | 23 | self.initialState = state.State() |
23 | 24 | self.register = register.Register() |
24 | 25 | self.label2Freq = {} |
... | ... |
fsabuilder/fsa/serializer.py
... | ... | @@ -18,6 +18,9 @@ class Serializer(object): |
18 | 18 | def fsa(self): |
19 | 19 | return self._fsa |
20 | 20 | |
21 | + def getVersion(self): | |
22 | + return 9 | |
23 | + | |
21 | 24 | def serialize2CppFile(self, fname): |
22 | 25 | res = [] |
23 | 26 | # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
... | ... | @@ -39,14 +42,48 @@ class Serializer(object): |
39 | 42 | def fsa2bytearray(self): |
40 | 43 | |
41 | 44 | res = bytearray() |
42 | - res.extend(self.serializePrologue()) | |
45 | + res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) | |
43 | 46 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
44 | 47 | logging.debug('SERIALIZE') |
45 | 48 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
46 | 49 | res.extend(self.state2bytearray(state)) |
47 | 50 | return res |
48 | 51 | |
49 | - def serializePrologue(self): | |
52 | + def serializeTags(self, tagsMap): | |
53 | + res = bytearray() | |
54 | + numOfTags = len(tagsMap) | |
55 | + res.extend(self.htons(numOfTags)) | |
56 | + for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): | |
57 | + res.extend(self.htons(tagnum)) | |
58 | + res.extend(self.fsa.encodeWord(tag)) | |
59 | + res.append(0) | |
60 | + return res | |
61 | + | |
62 | + def serializeTagset(self, tagset): | |
63 | + res = bytearray() | |
64 | + if tagset: | |
65 | + res.extend(self.serializeTags(tagset.tag2tagnum)) | |
66 | + res.extend(self.serializeTags(tagset.name2namenum)) | |
67 | + return res | |
68 | + | |
69 | + def htons(self, n): | |
70 | + assert n < 65536 | |
71 | + assert n >= 0 | |
72 | + res = bytearray() | |
73 | + res.append((n & 0x00FF00) >> 8) | |
74 | + res.append(n & 0x0000FF) | |
75 | + return res | |
76 | + | |
77 | + def htonl(self, n): | |
78 | + assert n >= 0 | |
79 | + res = bytearray() | |
80 | + res.append((n & 0xFF000000) >> 24) | |
81 | + res.append((n & 0x00FF0000) >> 16) | |
82 | + res.append((n & 0x0000FF00) >> 8) | |
83 | + res.append(n & 0x000000FF) | |
84 | + return res | |
85 | + | |
86 | + def serializePrologue(self, additionalData=None): | |
50 | 87 | res = bytearray() |
51 | 88 | |
52 | 89 | # serialize magic number in big-endian order |
... | ... | @@ -61,6 +98,15 @@ class Serializer(object): |
61 | 98 | # serialize implementation code |
62 | 99 | res.append(self.getImplementationCode()) |
63 | 100 | |
101 | + # serialize additional data size as a 4-byte big-endian integer | |
102 | + additionalDataSize = len(additionalData) if additionalData else 0 | |
103 | + res.extend(self.htonl(additionalDataSize)) | |
104 | + | |
105 | + # add additional data itself | |
106 | + if additionalDataSize: | |
107 | + assert type(additionalData) == bytearray | |
108 | + res.extend(additionalData) | |
109 | + | |
64 | 110 | return res |
65 | 111 | |
66 | 112 | def state2bytearray(self, state): |
... | ... | @@ -81,9 +127,6 @@ class Serializer(object): |
81 | 127 | def transitionsData2bytearray(self, state): |
82 | 128 | raise NotImplementedError('Not implemented') |
83 | 129 | |
84 | - def getVersion(self): | |
85 | - raise NotImplementedError('Not implemented') | |
86 | - | |
87 | 130 | def getImplementationCode(self): |
88 | 131 | raise NotImplementedError('Not implemented') |
89 | 132 | |
... | ... | @@ -93,9 +136,6 @@ class SimpleSerializer(Serializer): |
93 | 136 | super(SimpleSerializer, self).__init__(fsa) |
94 | 137 | self.ACCEPTING_FLAG = 128 |
95 | 138 | |
96 | - def getVersion(self): | |
97 | - return 8 | |
98 | - | |
99 | 139 | def getImplementationCode(self): |
100 | 140 | return 0 |
101 | 141 | |
... | ... | @@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer): |
141 | 181 | self.ACCEPTING_FLAG = 0b10000000 |
142 | 182 | self.ARRAY_FLAG = 0b01000000 |
143 | 183 | |
144 | - def getVersion(self): | |
145 | - return 8 | |
146 | - | |
147 | 184 | def getImplementationCode(self): |
148 | 185 | return 1 |
149 | 186 | |
... | ... | @@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer): |
302 | 339 | self.ACCEPTING_FLAG = 64 |
303 | 340 | self.LAST_FLAG = 32 |
304 | 341 | |
305 | - def getVersion(self): | |
306 | - return 8 | |
307 | - | |
308 | 342 | def getImplementationCode(self): |
309 | 343 | return 2 |
310 | 344 | |
... | ... |
fsabuilder/fsa/test/testConstruction.py
... | ... | @@ -7,6 +7,7 @@ Created on Oct 8, 2013 |
7 | 7 | import unittest |
8 | 8 | import os |
9 | 9 | from fsa import fsa, visualizer, encode, buildfsa |
10 | +from fsa.serializer import SimpleSerializer | |
10 | 11 | |
11 | 12 | class Test(unittest.TestCase): |
12 | 13 | |
... | ... | @@ -59,6 +60,8 @@ class Test(unittest.TestCase): |
59 | 60 | inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') |
60 | 61 | tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') |
61 | 62 | fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) |
63 | + serializer = SimpleSerializer(fsa) | |
64 | + serializer.serialize2BinaryFile('/tmp/test0.fsa') | |
62 | 65 | # visualizer.Visualizer().visualize(fsa) |
63 | 66 | |
64 | 67 | if __name__ == "__main__": |
... | ... |