convertinput.py
1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
'''
Created on Oct 23, 2013
@author: mlenart
'''
import sys
import fileinput
import logging
from encode import Encoder
def _encodeInterp(orth, base, tag, name):
removePrefix = 0
root = u''
for o, b in zip(orth, base):
if o == b:
root += o
else:
break
removeSuffixNum = len(orth) - len(root)
addSuffix = base[len(root):]
return u'+'.join([
chr(ord('A')+removePrefix) + chr(ord('A')+removeSuffixNum) + addSuffix,
tag,
name])
def _parsePolimorf(inputLines):
    """Parse tab-separated dictionary lines into (orth, encoded interp) pairs.

    Each non-empty line (after stripping newline characters from both ends)
    must consist of exactly four tab-separated fields: orth, base, tag, name.
    Lines that are empty after stripping are skipped.

    :param inputLines: iterable of text lines.
    :return: generator of ``(orth, encodedInterpretation)`` tuples.
    :raises ValueError: when a non-empty line does not split into exactly
        four fields (raised by the tuple unpacking).
    """
    for rawLine in inputLines:
        record = rawLine.strip(u'\n')
        if not record:
            continue
        orth, base, tag, name = record.split(u'\t')
        encoded = _encodeInterp(orth, base, tag, name)
        yield (orth, encoded)
def _sortAndMergeParsedInput(inputData, key=lambda k: k):
logging.info('sorting input...')
entries = list(inputData)
entries.sort(key=key)
logging.info('done sorting')
prevOrth = None
prevInterps = None
for orth, interp in entries:
if prevOrth and prevOrth == orth:
prevInterps.append(interp)
else:
if prevOrth:
yield (prevOrth, sorted(set(prevInterps)))
prevOrth = orth
prevInterps = [interp]
def convertPolimorf(inputLines, sortKey=lambda k: k):
    """Parse dictionary lines and yield merged (orth, interps) entries.

    The lines are parsed with ``_parsePolimorf`` and the resulting pairs are
    passed to ``_sortAndMergeParsedInput`` together with ``sortKey``.

    :param inputLines: iterable of text lines.
    :param sortKey: sort key applied to each parsed ``(orth, interp)`` pair.
    :return: generator of ``(orth, interps)`` tuples.
    """
    parsedPairs = _parsePolimorf(inputLines)
    for mergedEntry in _sortAndMergeParsedInput(parsedPairs, key=sortKey):
        orth, interps = mergedEntry
        yield orth, interps
def _decodeInputLines(rawInputLines, encoding):
for line in rawInputLines:
yield line.decode(encoding)
if __name__ == '__main__':
encoder = Encoder()
for orth, interps in convertPolimorf(_decodeInputLines(fileinput.input(), 'utf8'), lambda (orth, interp): encoder.word2SortKey(orth)):
print u'\t'.join([orth, u'|'.join(interps)]).encode('utf8')