Commit d42f73bcc0bb85658229c6d1c1fa0366a4d5be02
1 parent
e2ef01be
już praktycznie działa zakodowywanie interpretacji morfologicznych tak, jak ma być "docelowo"
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@15 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 6 changed files with 65 additions and 28 deletions
fsabuilder/fsa/buildfsa.py
@@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer | @@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer | ||
16 | from visualizer import Visualizer | 16 | from visualizer import Visualizer |
17 | from optparse import OptionParser | 17 | from optparse import OptionParser |
18 | 18 | ||
19 | -from pycallgraph import PyCallGraph | ||
20 | -from pycallgraph.output import GraphvizOutput | ||
21 | - | ||
22 | class OutputFormat(): | 19 | class OutputFormat(): |
23 | BINARY = 'BINARY' | 20 | BINARY = 'BINARY' |
24 | CPP = 'CPP' | 21 | CPP = 'CPP' |
@@ -131,8 +128,7 @@ def _parseOptions(): | @@ -131,8 +128,7 @@ def _parseOptions(): | ||
131 | # exit(1) | 128 | # exit(1) |
132 | return opts | 129 | return opts |
133 | 130 | ||
134 | -def _readPolimorfInput(inputFile, tagsetFile, encoder): | ||
135 | - tagset = common.Tagset(tagsetFile) | 131 | +def _readPolimorfInput(inputFile, tagset, encoder): |
136 | with codecs.open(inputFile, 'r', 'utf8') as f: | 132 | with codecs.open(inputFile, 'r', 'utf8') as f: |
137 | for entry in convertinput.convertPolimorf(f, tagset, encoder): | 133 | for entry in convertinput.convertPolimorf(f, tagset, encoder): |
138 | yield entry | 134 | yield entry |
@@ -167,8 +163,9 @@ def _printStats(fsa): | @@ -167,8 +163,9 @@ def _printStats(fsa): | ||
167 | 163 | ||
168 | def buildFromPoliMorf(inputFile, tagsetFile): | 164 | def buildFromPoliMorf(inputFile, tagsetFile): |
169 | encoder = encode.MorphEncoder() | 165 | encoder = encode.MorphEncoder() |
170 | - fsa = FSA(encoder) | ||
171 | - inputData = _readPolimorfInput(inputFile, tagsetFile, encoder) | 166 | + tagset = common.Tagset(tagsetFile) |
167 | + fsa = FSA(encoder, tagset) | ||
168 | + inputData = _readPolimorfInput(inputFile, tagset, encoder) | ||
172 | fsa.feed(inputData) | 169 | fsa.feed(inputData) |
173 | _printStats(fsa) | 170 | _printStats(fsa) |
174 | return fsa | 171 | return fsa |
@@ -230,6 +227,8 @@ def main(opts): | @@ -230,6 +227,8 @@ def main(opts): | ||
230 | if __name__ == '__main__': | 227 | if __name__ == '__main__': |
231 | opts = _parseOptions() | 228 | opts = _parseOptions() |
232 | if opts.profile: | 229 | if opts.profile: |
230 | + from pycallgraph import PyCallGraph | ||
231 | + from pycallgraph.output import GraphvizOutput | ||
233 | with PyCallGraph(output=GraphvizOutput()): | 232 | with PyCallGraph(output=GraphvizOutput()): |
234 | main(opts) | 233 | main(opts) |
235 | else: | 234 | else: |
fsabuilder/fsa/common.py
@@ -23,15 +23,15 @@ class Interpretation(object): | @@ -23,15 +23,15 @@ class Interpretation(object): | ||
23 | root += o | 23 | root += o |
24 | else: | 24 | else: |
25 | break | 25 | break |
26 | - cutLength = len(orth) - len(root) | 26 | + cutLength = len(encoder.encodeWord(orth)) - len(encoder.encodeWord(root)) |
27 | self.lemma = Lemma( | 27 | self.lemma = Lemma( |
28 | cutLength=cutLength, | 28 | cutLength=cutLength, |
29 | - suffixToAdd=base[len(root):]) | 29 | + suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) |
30 | self.tagnum = tagnum | 30 | self.tagnum = tagnum |
31 | self.namenum = namenum | 31 | self.namenum = namenum |
32 | 32 | ||
33 | def getSortKey(self): | 33 | def getSortKey(self): |
34 | - return (self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum) | 34 | + return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) |
35 | 35 | ||
36 | def __eq__(self, other): | 36 | def __eq__(self, other): |
37 | if isinstance(other, Interpretation): | 37 | if isinstance(other, Interpretation): |
fsabuilder/fsa/encode.py
@@ -38,8 +38,8 @@ class Encoder(object): | @@ -38,8 +38,8 @@ class Encoder(object): | ||
38 | 38 | ||
39 | class SimpleEncoder(Encoder): | 39 | class SimpleEncoder(Encoder): |
40 | 40 | ||
41 | - def __init__(self, encoding='utf8', appendZero=False): | ||
42 | - super(SimpleEncoder, self).__init__(encoding, appendZero) | 41 | + def __init__(self, encoding='utf8'): |
42 | + super(SimpleEncoder, self).__init__(encoding) | ||
43 | 43 | ||
44 | def encodeData(self, data): | 44 | def encodeData(self, data): |
45 | return bytearray(data, encoding=self.encoding) + bytearray([0]) | 45 | return bytearray(data, encoding=self.encoding) + bytearray([0]) |
@@ -70,7 +70,7 @@ class MorphEncoder(Encoder): | @@ -70,7 +70,7 @@ class MorphEncoder(Encoder): | ||
70 | res = bytearray() | 70 | res = bytearray() |
71 | assert lemma.cutLength < 256 and lemma.cutLength >= 0 | 71 | assert lemma.cutLength < 256 and lemma.cutLength >= 0 |
72 | res.append(lemma.cutLength) | 72 | res.append(lemma.cutLength) |
73 | - res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False)) | 73 | + res.extend(lemma.suffixToAdd) |
74 | res.append(0) | 74 | res.append(0) |
75 | return res | 75 | return res |
76 | 76 |
fsabuilder/fsa/fsa.py
@@ -14,11 +14,12 @@ class FSA(object): | @@ -14,11 +14,12 @@ class FSA(object): | ||
14 | ''' | 14 | ''' |
15 | 15 | ||
16 | 16 | ||
17 | - def __init__(self, encoder): | 17 | + def __init__(self, encoder, tagset=None): |
18 | self.encodeWord = encoder.encodeWord | 18 | self.encodeWord = encoder.encodeWord |
19 | self.encodeData = encoder.encodeData | 19 | self.encodeData = encoder.encodeData |
20 | self.decodeData = encoder.decodeData | 20 | self.decodeData = encoder.decodeData |
21 | self.encodedPrevWord = None | 21 | self.encodedPrevWord = None |
22 | + self.tagset = tagset | ||
22 | self.initialState = state.State() | 23 | self.initialState = state.State() |
23 | self.register = register.Register() | 24 | self.register = register.Register() |
24 | self.label2Freq = {} | 25 | self.label2Freq = {} |
fsabuilder/fsa/serializer.py
@@ -18,6 +18,9 @@ class Serializer(object): | @@ -18,6 +18,9 @@ class Serializer(object): | ||
18 | def fsa(self): | 18 | def fsa(self): |
19 | return self._fsa | 19 | return self._fsa |
20 | 20 | ||
21 | + def getVersion(self): | ||
22 | + return 9 | ||
23 | + | ||
21 | def serialize2CppFile(self, fname): | 24 | def serialize2CppFile(self, fname): |
22 | res = [] | 25 | res = [] |
23 | # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | 26 | # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
@@ -39,14 +42,48 @@ class Serializer(object): | @@ -39,14 +42,48 @@ class Serializer(object): | ||
39 | def fsa2bytearray(self): | 42 | def fsa2bytearray(self): |
40 | 43 | ||
41 | res = bytearray() | 44 | res = bytearray() |
42 | - res.extend(self.serializePrologue()) | 45 | + res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) |
43 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | 46 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
44 | logging.debug('SERIALIZE') | 47 | logging.debug('SERIALIZE') |
45 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): | 48 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
46 | res.extend(self.state2bytearray(state)) | 49 | res.extend(self.state2bytearray(state)) |
47 | return res | 50 | return res |
48 | 51 | ||
49 | - def serializePrologue(self): | 52 | + def serializeTags(self, tagsMap): |
53 | + res = bytearray() | ||
54 | + numOfTags = len(tagsMap) | ||
55 | + res.extend(self.htons(numOfTags)) | ||
56 | + for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): | ||
57 | + res.extend(self.htons(tagnum)) | ||
58 | + res.extend(self.fsa.encodeWord(tag)) | ||
59 | + res.append(0) | ||
60 | + return res | ||
61 | + | ||
62 | + def serializeTagset(self, tagset): | ||
63 | + res = bytearray() | ||
64 | + if tagset: | ||
65 | + res.extend(self.serializeTags(tagset.tag2tagnum)) | ||
66 | + res.extend(self.serializeTags(tagset.name2namenum)) | ||
67 | + return res | ||
68 | + | ||
69 | + def htons(self, n): | ||
70 | + assert n < 65536 | ||
71 | + assert n >= 0 | ||
72 | + res = bytearray() | ||
73 | + res.append((n & 0x00FF00) >> 8) | ||
74 | + res.append(n & 0x0000FF) | ||
75 | + return res | ||
76 | + | ||
77 | + def htonl(self, n): | ||
78 | + assert n >= 0 | ||
79 | + res = bytearray() | ||
80 | + res.append((n & 0xFF000000) >> 24) | ||
81 | + res.append((n & 0x00FF0000) >> 16) | ||
82 | + res.append((n & 0x0000FF00) >> 8) | ||
83 | + res.append(n & 0x000000FF) | ||
84 | + return res | ||
85 | + | ||
86 | + def serializePrologue(self, additionalData=None): | ||
50 | res = bytearray() | 87 | res = bytearray() |
51 | 88 | ||
52 | # serialize magic number in big-endian order | 89 | # serialize magic number in big-endian order |
@@ -61,6 +98,15 @@ class Serializer(object): | @@ -61,6 +98,15 @@ class Serializer(object): | ||
61 | # serialize implementation code | 98 | # serialize implementation code |
62 | res.append(self.getImplementationCode()) | 99 | res.append(self.getImplementationCode()) |
63 | 100 | ||
101 | + # serialize additional data size in 4-byte big-endian | ||
102 | + additionalDataSize = len(additionalData) if additionalData else 0 | ||
103 | + res.extend(self.htonl(additionalDataSize)) | ||
104 | + | ||
105 | + # add additional data itself | ||
106 | + if additionalDataSize: | ||
107 | + assert type(additionalData) == bytearray | ||
108 | + res.extend(additionalData) | ||
109 | + | ||
64 | return res | 110 | return res |
65 | 111 | ||
66 | def state2bytearray(self, state): | 112 | def state2bytearray(self, state): |
@@ -81,9 +127,6 @@ class Serializer(object): | @@ -81,9 +127,6 @@ class Serializer(object): | ||
81 | def transitionsData2bytearray(self, state): | 127 | def transitionsData2bytearray(self, state): |
82 | raise NotImplementedError('Not implemented') | 128 | raise NotImplementedError('Not implemented') |
83 | 129 | ||
84 | - def getVersion(self): | ||
85 | - raise NotImplementedError('Not implemented') | ||
86 | - | ||
87 | def getImplementationCode(self): | 130 | def getImplementationCode(self): |
88 | raise NotImplementedError('Not implemented') | 131 | raise NotImplementedError('Not implemented') |
89 | 132 | ||
@@ -93,9 +136,6 @@ class SimpleSerializer(Serializer): | @@ -93,9 +136,6 @@ class SimpleSerializer(Serializer): | ||
93 | super(SimpleSerializer, self).__init__(fsa) | 136 | super(SimpleSerializer, self).__init__(fsa) |
94 | self.ACCEPTING_FLAG = 128 | 137 | self.ACCEPTING_FLAG = 128 |
95 | 138 | ||
96 | - def getVersion(self): | ||
97 | - return 8 | ||
98 | - | ||
99 | def getImplementationCode(self): | 139 | def getImplementationCode(self): |
100 | return 0 | 140 | return 0 |
101 | 141 | ||
@@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer): | @@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer): | ||
141 | self.ACCEPTING_FLAG = 0b10000000 | 181 | self.ACCEPTING_FLAG = 0b10000000 |
142 | self.ARRAY_FLAG = 0b01000000 | 182 | self.ARRAY_FLAG = 0b01000000 |
143 | 183 | ||
144 | - def getVersion(self): | ||
145 | - return 8 | ||
146 | - | ||
147 | def getImplementationCode(self): | 184 | def getImplementationCode(self): |
148 | return 1 | 185 | return 1 |
149 | 186 | ||
@@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer): | @@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer): | ||
302 | self.ACCEPTING_FLAG = 64 | 339 | self.ACCEPTING_FLAG = 64 |
303 | self.LAST_FLAG = 32 | 340 | self.LAST_FLAG = 32 |
304 | 341 | ||
305 | - def getVersion(self): | ||
306 | - return 8 | ||
307 | - | ||
308 | def getImplementationCode(self): | 342 | def getImplementationCode(self): |
309 | return 2 | 343 | return 2 |
310 | 344 |
fsabuilder/fsa/test/testConstruction.py
@@ -7,6 +7,7 @@ Created on Oct 8, 2013 | @@ -7,6 +7,7 @@ Created on Oct 8, 2013 | ||
7 | import unittest | 7 | import unittest |
8 | import os | 8 | import os |
9 | from fsa import fsa, visualizer, encode, buildfsa | 9 | from fsa import fsa, visualizer, encode, buildfsa |
10 | +from fsa.serializer import SimpleSerializer | ||
10 | 11 | ||
11 | class Test(unittest.TestCase): | 12 | class Test(unittest.TestCase): |
12 | 13 | ||
@@ -59,6 +60,8 @@ class Test(unittest.TestCase): | @@ -59,6 +60,8 @@ class Test(unittest.TestCase): | ||
59 | inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') | 60 | inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') |
60 | tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') | 61 | tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') |
61 | fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) | 62 | fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) |
63 | + serializer = SimpleSerializer(fsa) | ||
64 | + serializer.serialize2BinaryFile('/tmp/test0.fsa') | ||
62 | # visualizer.Visualizer().visualize(fsa) | 65 | # visualizer.Visualizer().visualize(fsa) |
63 | 66 | ||
64 | if __name__ == "__main__": | 67 | if __name__ == "__main__": |