convertinput.py
3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
'''
Created on Oct 23, 2013
@author: mlenart
'''
import logging
from common import Interpretation
from morfeuszbuilder.fsa.common import Interpretation4Generator
def _mergeEntries(inputLines):
prevKey = None
prevInterps = None
for key, interp in inputLines:
key = key.lower()
assert key
if prevKey and prevKey == key:
prevInterps.append(interp)
else:
if prevKey:
yield (prevKey, frozenset(prevInterps))
prevKey = key
prevInterps = [interp]
yield (prevKey, frozenset(prevInterps))
class PolimorfConverter4Analyzer(object):
    '''
    Turns raw dictionary lines (tab-separated: orth, base, tag, name) into
    (orth, frozenset of Interpretation) entries, merged by lowercased orth.

    The pipeline is: partially parse -> sort -> fully parse -> merge.
    '''

    def __init__(self, tagset, encoder, inputEncoding, segmentRulesManager):
        '''
        tagset -- object exposing tag2tagnum and name2namenum mappings
        encoder -- object exposing word2SortKey(unicode word)
        inputEncoding -- name of the encoding of the input lines
        segmentRulesManager -- object exposing lexeme2SegmentTypeNum(base, tag)
        '''
        self.tagset = tagset
        self.encoder = encoder
        self.inputEncoding = inputEncoding
        self.segmentRulesManager = segmentRulesManager

    # we do it the ugly way (parse to plain text) because it is way more memory-efficient
    def _partiallyParseLines(self, inputLines):
        '''
        Replace tag and name with their numbers, add the segment type number
        and yield every entry as a single encoded, tab-separated line.

        Raises KeyError for a tag or name unknown to the tagset and
        ValueError for a line that does not have exactly 4 fields.
        '''
        for line in inputLines:
            line = line.decode(self.inputEncoding).strip('\n')
            orth, base, tag, name = line.split(u'\t')
            tagnum = self.tagset.tag2tagnum[tag]
            namenum = self.tagset.name2namenum[name]
            typenum = self.segmentRulesManager.lexeme2SegmentTypeNum(base, tag)
            # tab is used as the separator because the fields come from
            # splitting on tabs, so they can never contain one; a space
            # could appear inside orth or base and break re-parsing
            yield '%s\t%s\t%d\t%d\t%d' % (
                orth.encode(self.inputEncoding),
                base.encode(self.inputEncoding),
                tagnum, namenum, typenum)

    # input lines are encoded and partially parsed
    def _sortLines(self, inputLines):
        '''
        Return a list of the lines sorted by the encoder's sort key of orth
        (the first field).
        '''
        # decode with the same encoding the lines were encoded with
        encoding = self.inputEncoding
        word2SortKey = self.encoder.word2SortKey
        return sorted(
            inputLines,
            key=lambda line: word2SortKey(line.split('\t', 1)[0].decode(encoding)))

    def _reallyParseLines(self, inputLines):
        '''
        Parse partially parsed lines into (orth, Interpretation) pairs.
        Empty lines are skipped.
        '''
        for line in inputLines:
            line = line.decode(self.inputEncoding).strip(u'\n')
            if line:
                orth, base, tagnum, namenum, typenum = line.split(u'\t')
                tagnum = int(tagnum)
                namenum = int(namenum)
                typenum = int(typenum)
                yield (orth, Interpretation(orth, base, tagnum, namenum, typenum))

    def convert(self, inputLines):
        '''
        Run the whole pipeline on inputLines and return a generator of
        (lowercased orth, frozenset of Interpretation) entries.
        '''
        partiallyParsed = self._partiallyParseLines(inputLines)
        sortedLines = self._sortLines(partiallyParsed)
        return _mergeEntries(self._reallyParseLines(sortedLines))
class PolimorfConverter4Generator(object):
    '''
    Turns raw dictionary lines (tab-separated: orth, base, tag, name) into
    (base, frozenset of Interpretation4Generator) entries, merged by
    lowercased base.

    The pipeline is: partially parse -> sort -> fully parse -> merge.
    '''

    def __init__(self, tagset, encoder, inputEncoding='utf8'):
        '''
        tagset -- object exposing tag2tagnum and name2namenum mappings
        encoder -- object exposing word2SortKey(unicode word)
        inputEncoding -- name of the encoding of the input lines
        '''
        self.tagset = tagset
        self.encoder = encoder
        self.inputEncoding = inputEncoding

    # we do it the ugly way (parse to plain text) because it is way more memory-efficient
    def _partiallyParseLines(self, inputLines):
        '''
        Replace tag and name with their numbers and yield every entry as
        a single encoded, tab-separated line.

        Raises KeyError for a tag or name unknown to the tagset and
        ValueError for a line that does not have exactly 4 fields.
        '''
        for line in inputLines:
            line = line.decode(self.inputEncoding).strip('\n')
            orth, base, tag, name = line.split(u'\t')
            tagnum = self.tagset.tag2tagnum[tag]
            namenum = self.tagset.name2namenum[name]
            # tab is used as the separator because the fields come from
            # splitting on tabs, so they can never contain one; a space
            # could appear inside orth or base and break re-parsing
            yield '%s\t%s\t%d\t%d' % (
                orth.encode(self.inputEncoding),
                base.encode(self.inputEncoding),
                tagnum, namenum)

    # input lines are encoded and partially parsed
    def _sortLines(self, inputLines):
        '''
        Return a list of the lines sorted by the encoder's sort key of base
        (the second field); ties are broken by the whole line.
        '''
        # decode with the same encoding the lines were encoded with
        encoding = self.inputEncoding
        word2SortKey = self.encoder.word2SortKey
        return sorted(
            inputLines,
            key=lambda line: (word2SortKey(line.split('\t')[1].decode(encoding)), line))

    def _reallyParseLines(self, inputLines):
        '''
        Parse partially parsed lines into (base, Interpretation4Generator)
        pairs. Empty lines are skipped.
        '''
        for line in inputLines:
            line = line.decode(self.inputEncoding).strip(u'\n')
            if line:
                orth, base, tagnum, namenum = line.split(u'\t')
                tagnum = int(tagnum)
                namenum = int(namenum)
                yield (base, Interpretation4Generator(orth, base, tagnum, namenum))

    def convert(self, inputLines):
        '''
        Run the whole pipeline on inputLines and return a generator of
        (lowercased base, frozenset of Interpretation4Generator) entries.
        '''
        partiallyParsed = self._partiallyParseLines(inputLines)
        sortedLines = self._sortLines(partiallyParsed)
        return _mergeEntries(self._reallyParseLines(sortedLines))