Commit 3fa5c3388953574a4614746fd748465ff9656f25
1 parent
bbe81abc
dodane łączenie identycznych wierszy w jeden
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@142 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
1 changed file
with
3 additions
and
1 deletion
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -121,15 +121,17 @@ class PolimorfConverter4Generator(object): | @@ -121,15 +121,17 @@ class PolimorfConverter4Generator(object): | ||
121 | return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split(' ')[1].decode('utf8')), line)) | 121 | return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split(' ')[1].decode('utf8')), line)) |
122 | 122 | ||
123 | def _reallyParseLines(self, inputLines): | 123 | def _reallyParseLines(self, inputLines): |
124 | + prevLine = None | ||
124 | for line in inputLines: | 125 | for line in inputLines: |
125 | line = line.decode(self.inputEncoding).strip(u'\n') | 126 | line = line.decode(self.inputEncoding).strip(u'\n') |
126 | - if line: | 127 | + if line and line != prevLine: |
127 | orth, base, tagnum, namenum, typenum, homonymId = line.split(u' ') | 128 | orth, base, tagnum, namenum, typenum, homonymId = line.split(u' ') |
128 | # print orth.encode('utf8'), base.encode('utf8'), homonymId | 129 | # print orth.encode('utf8'), base.encode('utf8'), homonymId |
129 | tagnum = int(tagnum) | 130 | tagnum = int(tagnum) |
130 | namenum = int(namenum) | 131 | namenum = int(namenum) |
131 | typenum = int(typenum) | 132 | typenum = int(typenum) |
132 | yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum, homonymId)) | 133 | yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum, homonymId)) |
134 | + prevLine = line | ||
133 | 135 | ||
134 | def convert(self, inputLines): | 136 | def convert(self, inputLines): |
135 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))), lowercase=False) | 137 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))), lowercase=False) |