Commit 3fa5c3388953574a4614746fd748465ff9656f25
1 parent
bbe81abc
dodane łączenie identycznych wierszy w jeden
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@142 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
1 changed file
with
3 additions
and
1 deletion
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -121,15 +121,17 @@ class PolimorfConverter4Generator(object): |
121 | 121 | return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split(' ')[1].decode('utf8')), line)) |
122 | 122 | |
123 | 123 | def _reallyParseLines(self, inputLines): |
124 | + prevLine = None | |
124 | 125 | for line in inputLines: |
125 | 126 | line = line.decode(self.inputEncoding).strip(u'\n') |
126 | - if line: | |
127 | + if line and line != prevLine: | |
127 | 128 | orth, base, tagnum, namenum, typenum, homonymId = line.split(u' ') |
128 | 129 | # print orth.encode('utf8'), base.encode('utf8'), homonymId |
129 | 130 | tagnum = int(tagnum) |
130 | 131 | namenum = int(namenum) |
131 | 132 | typenum = int(typenum) |
132 | 133 | yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum, homonymId)) |
134 | + prevLine = line | |
133 | 135 | |
134 | 136 | def convert(self, inputLines): |
135 | 137 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))), lowercase=False) |
... | ... |