Commit ed66703baa59e9e5c2a15e39bf6d2b2a8e9ec71e
1 parent a4444480
- improve memory usage of the automaton builder
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@31 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 2 changed files with 25 additions and 12 deletions
fsabuilder/fsa/buildfsa.py
@@ -129,7 +129,7 @@ def _parseOptions():
     return opts
 
 def _readPolimorfInput(inputFile, tagset, encoder):
-    with codecs.open(inputFile, 'r', 'utf8') as f:
+    with open(inputFile, 'r') as f:
         for entry in convertinput.convertPolimorf(f, tagset, encoder):
             yield entry
 
@@ -166,7 +166,9 @@ def buildFromPoliMorf(inputFile, tagsetFile):
     tagset = common.Tagset(tagsetFile)
     fsa = FSA(encoder, tagset)
     inputData = _readPolimorfInput(inputFile, tagset, encoder)
+
     fsa.feed(inputData)
+
     _printStats(fsa)
     return fsa
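Note on the open() change: with plain open(), _readPolimorfInput now passes raw byte lines to convertPolimorf, and decoding to unicode happens only per line inside convertinput.py (see the second file below). A minimal sketch of the difference, assuming Python 2 as in the code above; 'input.tab' is a made-up file name:

import codecs

# Old behaviour: codecs.open decodes eagerly, so every line read is held as a
# unicode object for as long as it is referenced.
with codecs.open('input.tab', 'r', 'utf8') as f:
    uline = f.readline()              # unicode

# New behaviour: plain open keeps raw UTF-8 byte strings, which are typically
# smaller in memory; the consumer decodes only when it actually parses the line.
with open('input.tab', 'r') as f:
    bline = f.readline()              # str (bytes)
    uline = bline.decode('utf8')      # decode on demand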
fsabuilder/fsa/convertinput.py
@@ -398,23 +398,30 @@ tag2typenum = {
     'burk': 9,
 }
 
+def _partiallyParseLines(inputLines, tagset):
+    for line in inputLines:
+        line = line.decode('utf8').strip('\n')
+        orth, base, tag, name = line.split(u'\t')
+        tagnum = tagset.tag2tagnum[tag]
+        namenum = tagset.name2namenum[name]
+        typenum = tag2typenum.get(tag, 0)
+        yield '%s %s %d %d %d' % (orth.encode('utf8'), base.encode('utf8'), tagnum, namenum, typenum)
+
 def _sortLines(inputLines, encoder):
-    logging.info('sorting input...')
     lines = list(inputLines)
-    logging.info('done read data into list')
-    lines.sort(key=lambda line: encoder.word2SortKey(line.split('\t')[0]))
-    logging.info('done sorting')
+    lines.sort(key=lambda line: encoder.word2SortKey(line.split(' ')[0].decode('utf8')))
     return lines
+#    return sorted(inputLines, key=lambda line: encoder.word2SortKey(line.split(' ')[0].decode('utf8')))
 
-def _parseLines(inputLines, tagset, encoder):
+def _reallyParseLines(inputLines):
     for line in inputLines:
-        line = line.strip(u'\n')
+        line = line.decode('utf8').strip(u'\n')
         if line:
             # print line
-            orth, base, tag, name = line.split(u'\t')
-            tagnum = tagset.tag2tagnum[tag]
-            namenum = tagset.name2namenum[name]
-            typenum = tag2typenum.get(tag, 0)
+            orth, base, tagnum, namenum, typenum = line.split(u' ')
+            tagnum = int(tagnum)
+            namenum = int(namenum)
+            typenum = int(typenum)
             yield (orth, Interpretation(orth, base, tagnum, namenum, typenum))
 
 def _mergeEntries(inputLines):
@@ -433,5 +440,9 @@ def _mergeEntries(inputLines):
     yield (prevOrth, frozenset(prevInterps))
 
 def convertPolimorf(inputLines, tagset, encoder):
-    for orth, interps in _mergeEntries(_parseLines(_sortLines(inputLines, encoder), tagset, encoder)):
+    for orth, interps in _mergeEntries(_reallyParseLines(_sortLines(_partiallyParseLines(inputLines, tagset), encoder))):
         yield orth, interps
+
+# def convertPolimorf(inputLines, tagset, encoder):
+#     for orth, interps in _mergeEntries(_parseLines(_sortLines(inputLines, encoder), tagset, encoder)):
+#         yield orth, interps
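The reworked pipeline in convertPolimorf sorts compact, space-separated byte strings carrying numeric tag/name/type ids, and only rebuilds full Interpretation objects after the sort, so the sort buffer holds one short str per input line instead of full unicode lines with textual tags and names. A minimal self-contained sketch of the same idea, assuming Python 2; the TAGS/NAMES tables, the toy input line, and the plain byte-order sort are made-up stand-ins for the real tagset and encoder.word2SortKey:

TAGS = {'subst:sg:nom:m1': 1}    # hypothetical tag -> id table
NAMES = {'pospolita': 0}         # hypothetical name -> id table

def partiallyParse(rawLines):
    # Stage 1: turn each UTF-8, tab-separated input line into a compact byte
    # string with numeric ids, so sorting holds small str objects only.
    for raw in rawLines:
        orth, base, tag, name = raw.decode('utf8').strip('\n').split(u'\t')
        yield '%s %s %d %d' % (orth.encode('utf8'), base.encode('utf8'),
                               TAGS[tag], NAMES[name])

def reallyParse(compactLines):
    # Stage 2 (after sorting): decode the compact form back into full entries.
    for line in compactLines:
        orth, base, tagnum, namenum = line.decode('utf8').split(u' ')
        yield (orth, base, int(tagnum), int(namenum))

raw = ['kot\tkot\tsubst:sg:nom:m1\tpospolita\n']   # toy input line
compact = sorted(partiallyParse(raw))              # sort the compact strings
print list(reallyParse(compact))                   # [(u'kot', u'kot', 1, 0)]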