Commit 3fa5c3388953574a4614746fd748465ff9656f25

Authored by Michał Lenart
1 parent bbe81abc

dodane łączenie identycznych wierszy w jeden

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@142 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -121,15 +121,17 @@ class PolimorfConverter4Generator(object): @@ -121,15 +121,17 @@ class PolimorfConverter4Generator(object):
121 return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split(' ')[1].decode('utf8')), line)) 121 return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split(' ')[1].decode('utf8')), line))
122 122
123 def _reallyParseLines(self, inputLines): 123 def _reallyParseLines(self, inputLines):
  124 + prevLine = None
124 for line in inputLines: 125 for line in inputLines:
125 line = line.decode(self.inputEncoding).strip(u'\n') 126 line = line.decode(self.inputEncoding).strip(u'\n')
126 - if line: 127 + if line and line != prevLine:
127 orth, base, tagnum, namenum, typenum, homonymId = line.split(u' ') 128 orth, base, tagnum, namenum, typenum, homonymId = line.split(u' ')
128 # print orth.encode('utf8'), base.encode('utf8'), homonymId 129 # print orth.encode('utf8'), base.encode('utf8'), homonymId
129 tagnum = int(tagnum) 130 tagnum = int(tagnum)
130 namenum = int(namenum) 131 namenum = int(namenum)
131 typenum = int(typenum) 132 typenum = int(typenum)
132 yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum, homonymId)) 133 yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum, homonymId))
  134 + prevLine = line
133 135
134 def convert(self, inputLines): 136 def convert(self, inputLines):
135 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))), lowercase=False) 137 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))), lowercase=False)