Commit ed66703baa59e9e5c2a15e39bf6d2b2a8e9ec71e

Authored by Michał Lenart
1 parent a4444480

- poprawienie zużycia pamięci przez budowniczego automatów [EN: reduced the memory usage of the automaton builder]

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@31 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/buildfsa.py
... ... @@ -129,7 +129,7 @@ def _parseOptions():
129 129 return opts
130 130  
131 131 def _readPolimorfInput(inputFile, tagset, encoder):
132   - with codecs.open(inputFile, 'r', 'utf8') as f:
  132 + with open(inputFile, 'r') as f:
133 133 for entry in convertinput.convertPolimorf(f, tagset, encoder):
134 134 yield entry
135 135  
... ... @@ -166,7 +166,9 @@ def buildFromPoliMorf(inputFile, tagsetFile):
166 166 tagset = common.Tagset(tagsetFile)
167 167 fsa = FSA(encoder, tagset)
168 168 inputData = _readPolimorfInput(inputFile, tagset, encoder)
  169 +
169 170 fsa.feed(inputData)
  171 +
170 172 _printStats(fsa)
171 173 return fsa
172 174  
... ...
fsabuilder/fsa/convertinput.py
... ... @@ -398,23 +398,30 @@ tag2typenum = {
398 398 'burk': 9,
399 399 }
400 400  
  401 +def _partiallyParseLines(inputLines, tagset):
  402 + for line in inputLines:
  403 + line = line.decode('utf8').strip('\n')
  404 + orth, base, tag, name = line.split(u'\t')
  405 + tagnum = tagset.tag2tagnum[tag]
  406 + namenum = tagset.name2namenum[name]
  407 + typenum = tag2typenum.get(tag, 0)
  408 + yield '%s %s %d %d %d' % (orth.encode('utf8'), base.encode('utf8'), tagnum, namenum, typenum)
  409 +
401 410 def _sortLines(inputLines, encoder):
402   - logging.info('sorting input...')
403 411 lines = list(inputLines)
404   - logging.info('done read data into list')
405   - lines.sort(key=lambda line: encoder.word2SortKey(line.split('\t')[0]))
406   - logging.info('done sorting')
  412 + lines.sort(key=lambda line: encoder.word2SortKey(line.split(' ')[0].decode('utf8')))
407 413 return lines
  414 +# return sorted(inputLines, key=lambda line: encoder.word2SortKey(line.split(' ')[0].decode('utf8')))
408 415  
409   -def _parseLines(inputLines, tagset, encoder):
  416 +def _reallyParseLines(inputLines):
410 417 for line in inputLines:
411   - line = line.strip(u'\n')
  418 + line = line.decode('utf8').strip(u'\n')
412 419 if line:
413 420 # print line
414   - orth, base, tag, name = line.split(u'\t')
415   - tagnum = tagset.tag2tagnum[tag]
416   - namenum = tagset.name2namenum[name]
417   - typenum = tag2typenum.get(tag, 0)
  421 + orth, base, tagnum, namenum, typenum = line.split(u' ')
  422 + tagnum = int(tagnum)
  423 + namenum = int(namenum)
  424 + typenum = int(typenum)
418 425 yield (orth, Interpretation(orth, base, tagnum, namenum, typenum))
419 426  
420 427 def _mergeEntries(inputLines):
... ... @@ -433,5 +440,9 @@ def _mergeEntries(inputLines):
433 440 yield (prevOrth, frozenset(prevInterps))
434 441  
435 442 def convertPolimorf(inputLines, tagset, encoder):
436   - for orth, interps in _mergeEntries(_parseLines(_sortLines(inputLines, encoder), tagset, encoder)):
  443 + for orth, interps in _mergeEntries(_reallyParseLines(_sortLines(_partiallyParseLines(inputLines, tagset), encoder))):
437 444 yield orth, interps
  445 +
  446 +# def convertPolimorf(inputLines, tagset, encoder):
  447 +# for orth, interps in _mergeEntries(_parseLines(_sortLines(inputLines, encoder), tagset, encoder)):
  448 +# yield orth, interps
... ...