Commit 96182ff07bfb4683c51c56fc23b351f34b71dc5b
1 parent: 7508ece1
- drobne poprawki wydajnościowe
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/trunk@276 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 4 changed files with 43 additions and 54 deletions
morfeusz/MorfeuszImpl.cpp
@@ -342,9 +342,9 @@ namespace morfeusz { | @@ -342,9 +342,9 @@ namespace morfeusz { | ||
342 | } | 342 | } |
343 | bool caseMatches = env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, reader.getWordStartPtr(), reader.getCurrPtr(), ig); | 343 | bool caseMatches = env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, reader.getWordStartPtr(), reader.getCurrPtr(), ig); |
344 | if (caseMatches || options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) { | 344 | if (caseMatches || options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) { |
345 | - | ||
346 | - SegrulesState newSegrulesState = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, isAtWhitespace); | ||
347 | - if (!newSegrulesState.sink) { | 345 | + SegrulesState newSegrulesState = SegrulesState::FAILED_STATE; |
346 | + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, isAtWhitespace, newSegrulesState); | ||
347 | + if (!newSegrulesState.failed) { | ||
348 | InterpretedChunk ic( | 348 | InterpretedChunk ic( |
349 | createChunk(ig, reader, newSegrulesState.shiftOrthFromPrevious, homonymId)); | 349 | createChunk(ig, reader, newSegrulesState.shiftOrthFromPrevious, homonymId)); |
350 | 350 | ||
@@ -356,27 +356,11 @@ namespace morfeusz { | @@ -356,27 +356,11 @@ namespace morfeusz { | ||
356 | newSegrulesState, | 356 | newSegrulesState, |
357 | ic); | 357 | ic); |
358 | } | 358 | } |
359 | -// if (!newSegrulesStates.empty()) { | ||
360 | -// for (unsigned int i = 0; i < newSegrulesStates.size(); i++) { | ||
361 | -// const SegrulesState& newSegrulesState = newSegrulesStates[i]; | ||
362 | -// | ||
363 | -// InterpretedChunk ic( | ||
364 | -// createChunk(ig, reader, newSegrulesState.shiftOrthFromPrevious, homonymId)); | ||
365 | -// | ||
366 | -// processInterpretedChunk( | ||
367 | -// env, | ||
368 | -// reader, | ||
369 | -// isAtWhitespace, | ||
370 | -// caseMatches, | ||
371 | -// newSegrulesState, | ||
372 | -// ic); | ||
373 | -// } | ||
374 | -// newSegrulesStates.resize(0); | ||
375 | -// } | ||
376 | else if (this->options.debug) { | 359 | else if (this->options.debug) { |
377 | std::cerr << "NOT ACCEPTING (segmentation)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl; | 360 | std::cerr << "NOT ACCEPTING (segmentation)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl; |
378 | } | 361 | } |
379 | - } else if (this->options.debug) { | 362 | + } |
363 | + else if (this->options.debug) { | ||
380 | std::cerr << "NOT ACCEPTING (case)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl; | 364 | std::cerr << "NOT ACCEPTING (case)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl; |
381 | } | 365 | } |
382 | } | 366 | } |
morfeusz/segrules/SegrulesFSA.cpp
@@ -8,41 +8,44 @@ using namespace std; | @@ -8,41 +8,44 @@ using namespace std; | ||
8 | 8 | ||
9 | namespace morfeusz { | 9 | namespace morfeusz { |
10 | 10 | ||
11 | - SegrulesState SegrulesState::SINK_STATE = { | 11 | + SegrulesState SegrulesState::FAILED_STATE = { |
12 | 0, // offset | 12 | 0, // offset |
13 | false, // accepting | 13 | false, // accepting |
14 | false, // weak | 14 | false, // weak |
15 | false, // shift orth | 15 | false, // shift orth |
16 | - true // sink | 16 | + true, // sink |
17 | + true, // failed | ||
17 | }; | 18 | }; |
18 | 19 | ||
19 | - SegrulesState SegrulesFSA::proceedToNext( | 20 | + void SegrulesFSA::proceedToNext( |
20 | const unsigned char segnum, | 21 | const unsigned char segnum, |
21 | const SegrulesState& state, | 22 | const SegrulesState& state, |
22 | - bool atEndOfWord) const { | 23 | + bool atEndOfWord, |
24 | + SegrulesState& resState) const { | ||
25 | + assert(!state.failed); | ||
23 | if (state.offset == 0) { | 26 | if (state.offset == 0) { |
24 | - return doProceedFromInitialState(segnum, atEndOfWord); | 27 | + doProceedFromInitialState(segnum, atEndOfWord, resState); |
25 | } else { | 28 | } else { |
26 | - return doProceedFromNonInitialState(segnum, state, atEndOfWord); | 29 | + doProceedFromNonInitialState(segnum, state, atEndOfWord, resState); |
27 | } | 30 | } |
28 | } | 31 | } |
29 | 32 | ||
30 | - SegrulesState SegrulesFSA::doProceedFromInitialState( | 33 | + void SegrulesFSA::doProceedFromInitialState( |
31 | const unsigned char segnum, | 34 | const unsigned char segnum, |
32 | - bool atEndOfWord) const { | 35 | + bool atEndOfWord, |
36 | + SegrulesState& resState) const { | ||
33 | const SegrulesState& newState = initialTransitions[segnum]; | 37 | const SegrulesState& newState = initialTransitions[segnum]; |
34 | if ((atEndOfWord && newState.accepting) | 38 | if ((atEndOfWord && newState.accepting) |
35 | || (!atEndOfWord && !newState.sink)) { | 39 | || (!atEndOfWord && !newState.sink)) { |
36 | - return newState; | ||
37 | - } else { | ||
38 | - return SegrulesState::SINK_STATE; | 40 | + resState = newState; |
39 | } | 41 | } |
40 | } | 42 | } |
41 | 43 | ||
42 | - SegrulesState SegrulesFSA::doProceedFromNonInitialState( | 44 | + void SegrulesFSA::doProceedFromNonInitialState( |
43 | const unsigned char segnum, | 45 | const unsigned char segnum, |
44 | const SegrulesState& state, | 46 | const SegrulesState& state, |
45 | - bool atEndOfWord) const { | 47 | + bool atEndOfWord, |
48 | + SegrulesState& resState) const { | ||
46 | const unsigned char* currPtr = ptr + state.offset + 1; | 49 | const unsigned char* currPtr = ptr + state.offset + 1; |
47 | const unsigned char transitionsNum = *currPtr++; | 50 | const unsigned char transitionsNum = *currPtr++; |
48 | for (int i = 0; i < transitionsNum; i++) { | 51 | for (int i = 0; i < transitionsNum; i++) { |
@@ -50,14 +53,11 @@ namespace morfeusz { | @@ -50,14 +53,11 @@ namespace morfeusz { | ||
50 | SegrulesState newState = this->transition2State(currPtr); | 53 | SegrulesState newState = this->transition2State(currPtr); |
51 | if ((atEndOfWord && newState.accepting) | 54 | if ((atEndOfWord && newState.accepting) |
52 | || (!atEndOfWord && !newState.sink)) { | 55 | || (!atEndOfWord && !newState.sink)) { |
53 | - return newState; | ||
54 | - } else { | ||
55 | - return SegrulesState::SINK_STATE; | 56 | + resState = newState; |
56 | } | 57 | } |
57 | } | 58 | } |
58 | currPtr += 4; | 59 | currPtr += 4; |
59 | } | 60 | } |
60 | - return SegrulesState::SINK_STATE; | ||
61 | } | 61 | } |
62 | 62 | ||
63 | SegrulesState SegrulesFSA::transition2State(const unsigned char* transitionPtr) const { | 63 | SegrulesState SegrulesFSA::transition2State(const unsigned char* transitionPtr) const { |
@@ -69,12 +69,13 @@ namespace morfeusz { | @@ -69,12 +69,13 @@ namespace morfeusz { | ||
69 | res.offset = readInt16(transitionPtr); | 69 | res.offset = readInt16(transitionPtr); |
70 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; | 70 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; |
71 | res.weak = *(ptr + res.offset) & WEAK_FLAG; | 71 | res.weak = *(ptr + res.offset) & WEAK_FLAG; |
72 | - res.sink = !res.accepting && *(ptr + res.offset + 1) == 0; | 72 | + res.sink = *(ptr + res.offset + 1) == 0; |
73 | + res.failed = !res.accepting && res.sink; | ||
73 | return res; | 74 | return res; |
74 | } | 75 | } |
75 | 76 | ||
76 | vector< SegrulesState > SegrulesFSA::createInitialTransitionsVector() { | 77 | vector< SegrulesState > SegrulesFSA::createInitialTransitionsVector() { |
77 | - vector< SegrulesState > res(256, SegrulesState()); | 78 | + vector< SegrulesState > res(256, SegrulesState::FAILED_STATE); |
78 | const unsigned char* currPtr = ptr + initialState.offset + 1; | 79 | const unsigned char* currPtr = ptr + initialState.offset + 1; |
79 | const unsigned char transitionsNum = *currPtr++; | 80 | const unsigned char transitionsNum = *currPtr++; |
80 | for (int i = 0; i < transitionsNum; i++) { | 81 | for (int i = 0; i < transitionsNum; i++) { |
morfeusz/segrules/SegrulesFSA.hpp
@@ -20,8 +20,9 @@ struct SegrulesState { | @@ -20,8 +20,9 @@ struct SegrulesState { | ||
20 | bool weak; | 20 | bool weak; |
21 | bool shiftOrthFromPrevious; | 21 | bool shiftOrthFromPrevious; |
22 | bool sink; | 22 | bool sink; |
23 | + bool failed; | ||
23 | 24 | ||
24 | - static SegrulesState SINK_STATE; | 25 | + static SegrulesState FAILED_STATE; |
25 | }; | 26 | }; |
26 | 27 | ||
27 | inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) { | 28 | inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) { |
@@ -32,15 +33,16 @@ class SegrulesFSA { | @@ -32,15 +33,16 @@ class SegrulesFSA { | ||
32 | public: | 33 | public: |
33 | 34 | ||
34 | SegrulesFSA(const unsigned char* ptr) : initialState(), ptr(ptr), initialTransitions() { | 35 | SegrulesFSA(const unsigned char* ptr) : initialState(), ptr(ptr), initialTransitions() { |
35 | - SegrulesState state = {0, false, false, false, false}; | 36 | + SegrulesState state = {0, false, false, false, false, false}; |
36 | initialState = state; | 37 | initialState = state; |
37 | initialTransitions = createInitialTransitionsVector(); | 38 | initialTransitions = createInitialTransitionsVector(); |
38 | } | 39 | } |
39 | 40 | ||
40 | - SegrulesState proceedToNext( | 41 | + void proceedToNext( |
41 | const unsigned char segnum, | 42 | const unsigned char segnum, |
42 | const SegrulesState& state, | 43 | const SegrulesState& state, |
43 | - bool atEndOfWord) const; | 44 | + bool atEndOfWord, |
45 | + SegrulesState& resState) const; | ||
44 | 46 | ||
45 | virtual ~SegrulesFSA() { | 47 | virtual ~SegrulesFSA() { |
46 | } | 48 | } |
@@ -54,14 +56,16 @@ private: | @@ -54,14 +56,16 @@ private: | ||
54 | 56 | ||
55 | std::vector< SegrulesState > createInitialTransitionsVector(); | 57 | std::vector< SegrulesState > createInitialTransitionsVector(); |
56 | 58 | ||
57 | - SegrulesState doProceedFromInitialState( | 59 | + void doProceedFromInitialState( |
58 | const unsigned char segnum, | 60 | const unsigned char segnum, |
59 | - bool atEndOfWord) const; | 61 | + bool atEndOfWord, |
62 | + SegrulesState& resState) const; | ||
60 | 63 | ||
61 | - SegrulesState doProceedFromNonInitialState( | 64 | + void doProceedFromNonInitialState( |
62 | const unsigned char segnum, | 65 | const unsigned char segnum, |
63 | const SegrulesState& state, | 66 | const SegrulesState& state, |
64 | - bool atEndOfWord) const; | 67 | + bool atEndOfWord, |
68 | + SegrulesState& resState) const; | ||
65 | }; | 69 | }; |
66 | 70 | ||
67 | } | 71 | } |
profile.sh
1 | #!/bin/bash | 1 | #!/bin/bash |
2 | 2 | ||
3 | -rm -rf profbuild | ||
4 | -mkdir -p profbuild | ||
5 | -cd profbuild | ||
6 | -cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_CXX_FLAGS="-g -O2" -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" .. | ||
7 | -make | 3 | +#~ rm -rf profbuild |
4 | +#~ mkdir -p profbuild | ||
5 | +#~ cd profbuild | ||
6 | +#~ cmake -D INPUT_DICTIONARIES=/home/wkieras/input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_CXX_FLAGS="-g -O2" -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" .. | ||
7 | +#~ make | ||
8 | rm -f /tmp/morfeusz.prof | 8 | rm -f /tmp/morfeusz.prof |
9 | export LD_PRELOAD="/usr/lib/libprofiler.so" | 9 | export LD_PRELOAD="/usr/lib/libprofiler.so" |
10 | export CPUPROFILE="/tmp/morfeusz.prof" | 10 | export CPUPROFILE="/tmp/morfeusz.prof" |
11 | -morfeusz/morfeusz_analyzer -i /home/wkieras/output/sgjp_analyzer.fsa < /mnt/storage/morfeusz/sents10k > /dev/null | 11 | +morfeusz/morfeusz_analyzer --dict sgjp --dict-dir /home/mlenart/opt/morfeusz/buildall/Linux-i386-false/_CPack_Packages/Linux/DEB/morfeusz2-2.0.0_sgjp-Linux-i386/usr/share/morfeusz/dictionaries < /mnt/storage/morfeusz/sents10k > /dev/null |
12 | ### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof | 12 | ### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof |