|
1
2
3
|
'''
Created on Oct 23, 2013

@author: mlenart
'''
|
|
7
|
import logging
|
|
8
|
import itertools
|
|
9
|
from morfeuszbuilder.utils.serializationUtils import *
|
|
10
|
|
|
11
12
13
14
15
16
|
class Encoder(object):
    '''
    Base class turning words and interpretation sets into bytearrays.

    Subclasses supply encodeData() (and optionally decodeData()).  The
    LEMMA_ONLY_LOWER, LEMMA_UPPER_PREFIX and LEMMA_MIXED_CASE markers used
    by _encodeCasePattern() are not defined here; a subclass that encodes
    case patterns has to set them on the instance.
    '''

    def __init__(self, lowercase, encoding='utf8'):
        '''
        Constructor

        lowercase -- when true, encodeWord() and word2SortKey() lowercase
                     their input before encoding it
        encoding  -- codec name used for every str <-> bytes conversion
        '''
        self.lowercase = lowercase
        self.encoding = encoding
        # Qualifier set -> number, assigned in order of first appearance by
        # _encodeQualifiers(); the empty set is pre-registered as 0.
        self.qualifiersMap = {frozenset(): 0}

    def encodeWord(self, word, lowercase=True):
        '''
        Return word as a bytearray in self.encoding.

        The word is lowercased first only if both self.lowercase and the
        lowercase argument are true.
        '''
        assert isinstance(word, unicode)
        text = word.lower() if self.lowercase and lowercase else word
        return bytearray(text, self.encoding)

    def encodeData(self, data):
        '''
        Abstract: subclasses return the serialized form of data.
        '''
        raise NotImplementedError()

    def decodeData(self, rawData):
        '''
        Abstract: subclasses return the object encoded in rawData.
        '''
        # The exception has to be raised; returning the instance would let
        # callers treat it as decoded data.
        raise NotImplementedError()

    def decodeWord(self, rawWord):
        '''
        Decode rawWord with self.encoding after stripping NUL bytes from
        both ends.
        '''
        return unicode(str(rawWord).strip('\x00'), self.encoding)

    def word2SortKey(self, word):
        '''
        Return the encoded (and, if self.lowercase, lowercased) word.
        '''
        normalizedWord = word.lower() if self.lowercase else word
        return normalizedWord.encode(self.encoding)

    def _encodeTypeNum(self, typenum):
        '''
        Encode typenum as a single byte.
        '''
        assert 0 <= typenum < 256
        return bytearray([typenum])

    def _encodeCasePattern(self, casePattern):
        '''
        Encode a sequence of per-character uppercase flags.

        Layout of the result:
          no True in the pattern -> [LEMMA_ONLY_LOWER]
          uppercase prefix only  -> [LEMMA_UPPER_PREFIX, prefix length]
          anything else          -> [LEMMA_MIXED_CASE, count, idx, idx, ...]
        where the indexes are the positions of the truthy flags.
        '''
        res = bytearray()
        if True not in casePattern:
            res.append(self.LEMMA_ONLY_LOWER)
        elif self._hasUpperPrefix(casePattern):
            res.append(self.LEMMA_UPPER_PREFIX)
            res.append(self._getUpperPrefixLength(casePattern))
        else:
            # Every index is stored in one byte.
            assert len(casePattern) < 256
            upperIdxs = [idx for idx, isUpper in enumerate(casePattern) if isUpper]
            res.append(self.LEMMA_MIXED_CASE)
            res.append(len(upperIdxs))
            res.extend(upperIdxs)
        return res

    def _encodeQualifiers(self, qualifiers):
        '''
        Return htons(n), where n is the number assigned to this set of
        qualifiers; unseen sets get the next free number.
        '''
        res = bytearray()
        # len() is evaluated before the insertion, so a new key receives
        # the current map size as its number.
        n = self.qualifiersMap.setdefault(frozenset(qualifiers), len(self.qualifiersMap))
        assert n < 500
        res.extend(htons(n))
        return res

    def _hasUpperPrefix(self, casePattern):
        '''
        Tell whether casePattern is a run of truthy flags followed only by
        falsy ones (either run may be empty).
        '''
        seenLower = False
        for isUpper in casePattern:
            if not isUpper:
                seenLower = True
            elif seenLower:
                # An uppercase flag after a lowercase one breaks the prefix shape.
                return False
        return True

    def _getUpperPrefixLength(self, casePattern):
        '''
        Return the length of the leading run of truthy flags.
        '''
        assert self._hasUpperPrefix(casePattern)
        for idx, isUpper in enumerate(casePattern):
            if not isUpper:
                return idx
        return len(casePattern)

    def _encodeTagNum(self, tagnum):
        '''
        Encode tagnum as two bytes, most significant byte first.
        '''
        res = bytearray()
        assert 0 <= tagnum < 65536
        res.append((tagnum & 0xFF00) >> 8)
        res.append(tagnum & 0x00FF)
        return res

    def _encodeNameNum(self, namenum):
        '''
        Encode namenum as a single byte.
        '''
        assert 0 <= namenum < 256
        return bytearray([namenum])

    def _groupInterpsByType(self, interpsList):
        '''
        Return a dict mapping each typenum to the list of interpretations
        carrying it.
        '''
        res = {}
        for interp in interpsList:
            res.setdefault(interp.typenum, []).append(interp)
        return res

    def _getOrthCasePatterns(self, interpsList):
        '''
        Return the orthCasePattern of every interpretation as a list, or an
        empty list as soon as one of the patterns contains no True.
        '''
        res = []
        for interp in interpsList:
            if True not in interp.orthCasePattern:
                return []
            res.append(list(interp.orthCasePattern))
        return res

    def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
        '''
        Encode all interpretations sharing one typenum.

        Result: the typenum byte, htons(length of the payload), then the
        payload.  For the analyzer the payload starts with the number of
        orth case patterns followed by those patterns; the per-interpretation
        records follow in getSortKey() order.
        '''
        res = bytearray()
        res.extend(self._encodeTypeNum(typenum))
        encodedInterpsList = bytearray()
        if isAnalyzer:
            casePatterns = self._getOrthCasePatterns(interpsList)
            encodedInterpsList.append(len(casePatterns))
            for casePattern in casePatterns:
                encodedInterpsList.extend(self._encodeCasePattern(casePattern))
        # NOTE(review): the original indentation was lost; the extent of the
        # if/else branches below is reconstructed from statement order --
        # confirm against the pristine file.
        for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
            if isAnalyzer:
                encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
            else:
                encodedInterpsList.extend(serializeString(interp.homonymId))
                encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd))
            encodedInterpsList.append(interp.encodedForm.cutLength)
            encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd))
            if isAnalyzer:
                encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
            encodedInterpsList.extend(htons(interp.tagnum))
            encodedInterpsList.append(interp.namenum)
            encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))

        res.extend(htons(len(encodedInterpsList)))
        res.extend(encodedInterpsList)
        return res

    def _doEncodeData(self, interpsList, isAnalyzer):
        '''
        Encode a frozenset of interpretations.

        Result: one byte holding the number of distinct typenums, then one
        _encodeInterps4Type() record per typenum, in dict iteration order.
        '''
        assert isinstance(interpsList, frozenset)
        segnum2Interps = self._groupInterpsByType(interpsList)
        res = bytearray()
        firstByte = len(segnum2Interps)
        assert firstByte < 256
        assert firstByte > 0
        res.append(firstByte)
        for typenum, typedInterps in segnum2Interps.iteritems():
            res.extend(self._encodeInterps4Type(typenum, typedInterps, isAnalyzer))
        return res
|
|
162
163
164
165
|
class MorphEncoder(Encoder):
    '''
    Lowercasing encoder that serializes interpretations with
    isAnalyzer=True and defines the case-pattern marker bytes.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        encoding -- codec name forwarded to the base class
        '''
        super(MorphEncoder, self).__init__(lowercase=True, encoding=encoding)
        # Marker bytes written by _encodeCasePattern() in front of each
        # encoded case pattern.
        self.LEMMA_ONLY_LOWER = 0
        self.LEMMA_UPPER_PREFIX = 1
        self.LEMMA_MIXED_CASE = 2

    def encodeData(self, interpsList):
        '''
        Return interpsList serialized by _doEncodeData() with isAnalyzer=True.
        '''
        isAnalyzer = True
        return self._doEncodeData(interpsList, isAnalyzer)
|
|
173
174
175
176
|
class Encoder4Generator(Encoder):
    '''
    Non-lowercasing encoder that serializes interpretations with
    isAnalyzer=False.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        encoding -- codec name forwarded to the base class
        '''
        super(Encoder4Generator, self).__init__(lowercase=False, encoding=encoding)

    def encodeData(self, interpsList):
        '''
        Return interpsList serialized by _doEncodeData() with isAnalyzer=False.
        '''
        isAnalyzer = False
        return self._doEncodeData(interpsList, isAnalyzer)
|