Commit 7dc1167ee08974171266fee7e2d05b773da02153

Authored by Michał Lenart
1 parent aaf322b3

rozprawienie się z wielkością liter, poprawa jakości pakietu debianowego, zmiana…

… nazwy skryptu do budowania automatu na morfeusz_builder

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@153 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
... ... @@ -5,7 +5,7 @@ project (Morfeusz)
5 5 set (Morfeusz_VERSION_MAJOR 2)
6 6 set (Morfeusz_VERSION_MINOR 0)
7 7 set (Morfeusz_VERSION_PATCH 0)
8   -set (CMAKE_BUILD_TYPE "Debug")
  8 +#~ set (CMAKE_BUILD_TYPE "Release")
9 9  
10 10 enable_testing()
11 11  
... ... @@ -30,13 +30,13 @@ set (PROJECT_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morf
30 30 ### USER DEFINED VARIABLES
31 31  
32 32 # INPUT_DICTIONARY_CPP
33   -set (INPUT_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_fsa.cpp")
34   -set (INPUT_SYNTH_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_synth_fsa.cpp")
  33 +set (INPUT_DICTIONARY_CPP "default_fsa.cpp")
  34 +set (INPUT_SYNTH_DICTIONARY_CPP "default_synth_fsa.cpp")
35 35 if ("${INPUT_DICTIONARIES}" STREQUAL "")
36 36 if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE")
37 37 set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt)
38 38 else ()
39   - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab")
  39 + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab")
40 40 endif ()
41 41 endif ()
42 42  
... ... @@ -98,10 +98,10 @@ set (CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
98 98  
99 99  
100 100 # the RPATH to be used when installing, but only if it's not a system directory
101   -list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir)
102   -if ("${isSystemDir}" STREQUAL "-1")
103   - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
104   -endif ()
  101 +#~ list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir)
  102 +#~ if ("${isSystemDir}" STREQUAL "-1")
  103 + #~ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
  104 +#~ endif ()
105 105  
106 106 ########## Configure CPack ##########
107 107  
... ... @@ -110,14 +110,16 @@ set (CPACK_PACKAGE_FILE_NAME "morfeusz2-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}-
110 110 set (CPACK_PACKAGE_VENDOR "Michał Lenart")
111 111 set (CPACK_PACKAGE_CONTACT "michal.lenart@ipipan.waw.pl")
112 112 set (CPACK_PACKAGE_DESCRIPTION_SUMMARY "Morphological analyzer for the Polish language.")
  113 +set (CPACK_PACKAGE_DESCRIPTION "This is the second version of Morfeusz. Among numerous improvements it has better segmentation and case-sensitive lemmas handling. It also makes it possible to perform morphological synthesis and to use your own dictionary")
  114 +set (CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/License.txt")
113 115 set (CPACK_OUTPUT_FILE_PREFIX "${TARGET_DIR}")
114 116  
115 117 if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
116 118 set (CPACK_GENERATOR "DEB" "TGZ")
117 119 #debian
118 120 set (CPACK_DEBIAN_PACKAGE_NAME "morfeusz2")
119   - set (CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT}")
120   - set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6)")
  121 + set (CPACK_DEBIAN_PACKAGE_MAINTAINER "Michał Lenart <${CPACK_PACKAGE_CONTACT}>")
  122 + set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6), libc6")
121 123 set (CPACK_DEBIAN_PACKAGE_ARCHITECTURE "${ARCHITECTURE}")
122 124 elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
123 125 if (${ARCHITECTURE} MATCHES "amd64")
... ... @@ -149,15 +151,15 @@ add_subdirectory (fsabuilder)
149 151 ########## add tests ##########
150 152  
151 153 macro (test_build_and_recognize fname method)
152   - add_test (TestBuild-${method}-${fname} python fsabuilder/buildfsa.py --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method})
153   - add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/buildfsa.py --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method})
  154 + add_test (TestBuild-${method}-${fname} python fsabuilder/morfeusz_builder --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method})
  155 + add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/morfeusz_builder --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method})
154 156 add_test (TestRecognize-${method}-${fname} morfeusz/test_recognize_dict /tmp/test-${method}-${fname}.fsa testfiles/${fname})
155 157 # add_test (TestNOTRecognize-${method}-${fname} fsa/test_not_recognize /tmp/test-${method}-${fname}.fsa testfiles/out_of_dict)
156 158 # add_test (TestSpeed-${method}-${fname} fsa/test_speed /tmp/test-${method}-${fname}.fsa testfiles/speed_test_data)
157 159 endmacro ()
158 160  
159 161 macro (test_result_equals inputFilename requiredOutputFilename encoding)
160   - # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/buildfsa.py -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE)
  162 + # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/morfeusz_builder -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE)
161 163 add_test (TestResultEquals-${inputFilename}-${requiredOutputFilename} morfeusz/test_result_equals ${inputFilename} ${requiredOutputFilename} ${encoding})
162 164 endmacro ()
163 165  
... ...
License.txt 0 → 100644
  1 +This is the Morfeusz license file.
... ...
fsabuilder/.settings/org.eclipse.core.resources.prefs
1 1 eclipse.preferences.version=1
2 2 encoding//morfeuszbuilder/fsa/test/testConstruction.py=utf-8
3 3 encoding//morfeuszbuilder/segrules/preprocessor.py=utf-8
4   -encoding/buildfsa.py=utf-8
  4 +encoding/morfeusz_builder=utf-8
... ...
fsabuilder/CMakeLists.txt
... ... @@ -16,15 +16,6 @@ add_custom_command (OUTPUT ${SETUP_PY}
16 16  
17 17 add_custom_target (builder-setup DEPENDS ${SETUP_PY})
18 18  
19   -#~ add_custom_target (buildfsa-exec ALL
20   - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/buildfsa.py
21   -#~ )
22   -#~
23   -#~ add_executable (buildfsa IMPORTED)
24   -#~ add_dependencies (buildfsa buildfsa-exec)
25   -#~ set_property (TARGET buildfsa PROPERTY IMPORTED_LOCATION "${DIST_PATH}")
26   -#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/buildfsa" DESTINATION bin)
27   -
28 19 if (${UNIX})
29 20 add_custom_target (install-builder
30 21 COMMAND python ${SETUP_PY} install --home=${CMAKE_INSTALL_PREFIX}
... ... @@ -62,6 +53,15 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
62 53 DEPENDS builder-setup
63 54 )
64 55 list (APPEND PACKAGE_DEPENDS package-python-win-installer)
  56 +
  57 + #~ add_custom_target (buildfsa-exec ALL
  58 + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
  59 +#~ )
  60 +#~
  61 +#~ add_executable (morfeusz_builder IMPORTED)
  62 +#~ add_dependencies (morfeusz_builder buildfsa-exec)
  63 +#~ set_property (TARGET morfeusz_builder PROPERTY IMPORTED_LOCATION "${DIST_PATH}")
  64 +#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/morfeusz_builder" DESTINATION bin)
65 65 endif ()
66 66  
67 67 add_custom_target(package-builder DEPENDS ${PACKAGE_DEPENDS})
... ...
fsabuilder/buildanalyzer.sh
1 1 #!/bin/bash
2 2  
3   -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
  3 +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
4 4 --tagset-file=../input/polimorf.tagset \
5 5 --segments-file=../input/segmenty.dat \
6 6 --analyzer \
... ...
fsabuilder/buildgenerator.sh
1 1 #!/bin/bash
2 2  
3   -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
  3 +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
4 4 --tagset-file=../input/polimorf.tagset \
5 5 --segments-file=../input/segmenty.dat \
6 6 --generator \
... ...
fsabuilder/buildfsa.py renamed to fsabuilder/morfeusz_builder
... ... @@ -12,21 +12,11 @@ import codecs
12 12 from morfeuszbuilder.fsa import encode
13 13 from morfeuszbuilder.fsa import convertinput
14 14 from morfeuszbuilder.fsa.fsa import FSA
15   -from morfeuszbuilder.fsa.serializer import Serializer
  15 +from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod
16 16 from morfeuszbuilder.tagset.tagset import Tagset
17 17 from morfeuszbuilder.segrules import rulesParser
18 18 from optparse import OptionParser
19 19  
20   -# class InputFormat():
21   -# ENCODED = 'ENCODED'
22   -# POLIMORF = 'POLIMORF'
23   -# PLAIN = 'PLAIN'
24   -
25   -class SerializationMethod():
26   - SIMPLE = 'SIMPLE'
27   - V1 = 'V1'
28   - V2 = 'V2'
29   -
30 20 def _checkOption(opt, parser, msg):
31 21 if opt is None:
32 22 print >> sys.stderr, msg
... ... @@ -46,6 +36,8 @@ def _checkOpen(filename, mode):
46 36 try:
47 37 with open(filename, mode) as _:
48 38 pass
  39 + if 'w' in mode:
  40 + os.remove(filename)
49 41 except IOError as ex:
50 42 print >> sys.stderr, str(ex)
51 43 exit(1)
... ... @@ -238,7 +230,6 @@ def main(opts):
238 230 fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg)
239 231 else:
240 232 fsa, qualifiersMap = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager)
241   - print qualifiersMap
242 233 if opts.trainFile:
243 234 logging.info('training with '+opts.trainFile+' ...')
244 235 fsa.train(_readTrainData(opts.trainFile))
... ...
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -21,6 +21,7 @@ class EncodedForm(object):
21 21 self.cutLength = len(fromWord) - len(root)
22 22 self.suffixToAdd = targetWord[len(root):]
23 23 self.casePattern = [c == c.upper() and c != c.lower() for c in root]
  24 +# print fromWord.encode('utf8'), targetWord.encode('utf8'), self.casePattern
24 25  
25 26 class EncodedForm4Generator(object):
26 27  
... ... @@ -54,7 +55,7 @@ class Interpretation4Analyzer(object):
54 55 return (
55 56 self.encodedForm.cutLength,
56 57 tuple(self.encodedForm.suffixToAdd),
57   - tuple(self.encodedForm.casePattern),
  58 + tuple(self.encodedForm.casePattern),
58 59 self.tagnum,
59 60 self.namenum)
60 61  
... ...
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -120,16 +120,13 @@ class Encoder(object):
120 120 res[interp.typenum].append(interp)
121 121 return res
122 122  
123   - def _getMostLiberalCasePattern(self, interpsList):
124   - res = None
  123 + def _getCasePatterns(self, interpsList):
  124 + res = []
125 125 for interp in interpsList:
126   - if res is None:
127   - res = list(interp.encodedForm.casePattern)
  126 + if not True in interp.encodedForm.casePattern:
  127 + return []
128 128 else:
129   - while len(interp.encodedForm.casePattern) > len(res):
130   - res.append(False)
131   - for idx, (case1, case2) in enumerate(itertools.izip_longest(res, interp.encodedForm.casePattern, fillvalue=False)):
132   - res[idx] = case1 and case2
  129 + res.append(list(interp.encodedForm.casePattern))
133 130 return res
134 131  
135 132 def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId):
... ... @@ -137,7 +134,10 @@ class Encoder(object):
137 134 res.extend(self._encodeTypeNum(typenum))
138 135 encodedInterpsList = bytearray()
139 136 if withCasePattern:
140   - encodedInterpsList.extend(self._encodeCasePattern(self._getMostLiberalCasePattern(interpsList)))
  137 + casePatterns = self._getCasePatterns(interpsList)
  138 + encodedInterpsList.append(len(casePatterns))
  139 + for casePattern in casePatterns:
  140 + encodedInterpsList.extend(self._encodeCasePattern(casePattern))
141 141 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
142 142 if withHomonymId:
143 143 encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -8,6 +8,11 @@ import logging
8 8 from state import State
9 9 from morfeuszbuilder.utils.serializationUtils import *
10 10  
  11 +class SerializationMethod(object):
  12 + SIMPLE = 'SIMPLE'
  13 + V1 = 'V1'
  14 + V2 = 'V2'
  15 +
11 16 class Serializer(object):
12 17  
13 18 MAGIC_NUMBER = 0x8fc2bc1b
... ... @@ -20,7 +25,6 @@ class Serializer(object):
20 25  
21 26 @staticmethod
22 27 def getSerializer(serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData):
23   - from buildfsa import SerializationMethod
24 28 res = {
25 29 SerializationMethod.SIMPLE: SimpleSerializer,
26 30 SerializationMethod.V1: VLengthSerializer1,
... ... @@ -37,7 +41,7 @@ class Serializer(object):
37 41  
38 42 # get the Morfeusz file format version that is being encoded
39 43 def getVersion(self):
40   - return 14
  44 + return 15
41 45  
42 46 def serialize2CppFile(self, fname, isGenerator, headerFilename="data/default_fsa.hpp"):
43 47 res = []
... ...
fsabuilder/setup.py.in
... ... @@ -8,5 +8,5 @@ if __name__ == '__main__':
8 8 description='Finite state automata builder for Morfeusz.',
9 9 version='${MORFEUSZBUILDER_VERSION}',
10 10 packages=['morfeuszbuilder', 'morfeuszbuilder.fsa', 'morfeuszbuilder.tagset', 'morfeuszbuilder.segrules', 'morfeuszbuilder.utils'],
11   - scripts = ['buildfsa.py'],
  11 + scripts = ['morfeusz_builder'],
12 12 requires=['pyparsing'])
... ...
morfeusz/CMakeLists.txt
... ... @@ -3,13 +3,13 @@
3 3 ########## generate default dictionary data #################
4 4 add_custom_command (
5 5 OUTPUT "${INPUT_DICTIONARY_CPP}"
6   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg
  6 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg
7 7 DEPENDS "${INPUT_DICTIONARY}"
8 8 COMMENT "Building default dictionary C++ file"
9 9 )
10 10 add_custom_command (
11 11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
12   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2
  12 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2
13 13 DEPENDS "${INPUT_DICTIONARY}"
14 14 COMMENT "Building default dictionary C++ file"
15 15 )
... ...
morfeusz/CasePatternHelper.hpp
... ... @@ -9,7 +9,7 @@
9 9 #define CASEPATTERNHELPER_HPP
10 10  
11 11 #include <vector>
12   -#include "InterpretedChunk.hpp"
  12 +#include "InterpsGroup.hpp"
13 13  
14 14 const uint8_t LEMMA_ONLY_LOWER = 0;
15 15 const uint8_t LEMMA_UPPER_PREFIX = 1;
... ... @@ -39,28 +39,40 @@ public:
39 39 }
40 40 return true;
41 41 }
42   -
43   -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
44   -// if (this->caseSensitive) {
45   -// for (unsigned int i = 0; i < chunks.size(); i++) {
46   -// const InterpretedChunk& ic = chunks[i];
47   -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
48   -// std::vector<bool> casePattern;
49   -// deserializeCasePattern(casePatternPtr, casePattern);
50   -// if (!checkCasePattern(ic, casePattern)) {
51   -// return false;
52   -// }
53   -// }
54   -// }
55   -// return true;
56   -// }
57   -
58   - void skipCasePattern(const unsigned char*& ptr) const {
59   - vector<bool> _dupa;
60   - deserializeCasePattern(ptr, _dupa);
  42 +
  43 + bool checkInterpsGroupCasePatterns(
  44 + const std::vector<uint32_t>& lowercaseCodepoints,
  45 + const std::vector<uint32_t>& originalCodepoints,
  46 + const InterpsGroup& ig) const {
  47 + const unsigned char* currPtr = ig.ptr;
  48 + unsigned char casePatternsNum = *currPtr++;
  49 + if (casePatternsNum == 0) {
  50 + return true;
  51 + }
  52 + else {
  53 + for (unsigned int i = 0; i < casePatternsNum; i++) {
  54 + if (checkCasePattern(
  55 + lowercaseCodepoints,
  56 + originalCodepoints,
  57 + deserializeOneCasePattern(currPtr))) {
  58 + return true;
  59 + }
  60 + }
  61 + return false;
  62 + }
61 63 }
62   -
63   - void deserializeCasePattern(const unsigned char*& ptr, std::vector<bool>& res) const {
  64 +
  65 + const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {
  66 + const unsigned char* currPtr = ig.ptr;
  67 + unsigned char casePatternsNum = *currPtr++;
  68 + for (unsigned int i = 0; i < casePatternsNum; i++) {
  69 + deserializeOneCasePattern(currPtr);
  70 + }
  71 + return currPtr;
  72 + }
  73 +
  74 + std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
  75 + std::vector<bool> res;
64 76 uint8_t casePatternType = *ptr;
65 77 ptr++;
66 78 uint8_t prefixLength;
... ... @@ -89,9 +101,31 @@ public:
89 101 }
90 102 break;
91 103 }
  104 + return res;
92 105 }
  106 +
  107 +// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
  108 +// if (this->caseSensitive) {
  109 +// for (unsigned int i = 0; i < chunks.size(); i++) {
  110 +// const InterpretedChunk& ic = chunks[i];
  111 +// const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
  112 +// std::vector<bool> casePattern;
  113 +// deserializeCasePattern(casePatternPtr, casePattern);
  114 +// if (!checkCasePattern(ic, casePattern)) {
  115 +// return false;
  116 +// }
  117 +// }
  118 +// }
  119 +// return true;
  120 +// }
  121 +
  122 +// void skipCasePattern(const unsigned char*& ptr) const {
  123 +// vector<bool> _dupa;
  124 +// deserializeCasePattern(ptr, _dupa);
  125 +// }
93 126 private:
94 127 bool caseSensitive;
  128 +
95 129 };
96 130  
97 131 #endif /* CASEPATTERNHELPER_HPP */
... ...
morfeusz/EncodedGeneratorInterpretation.hpp deleted
1   -/*
2   - * File: EncodedGeneratorInterpretation.hpp
3   - * Author: mlenart
4   - *
5   - * Created on 20 styczeń 2014, 17:15
6   - */
7   -
8   -#ifndef ENCODEDGENERATORINTERPRETATION_HPP
9   -#define ENCODEDGENERATORINTERPRETATION_HPP
10   -
11   -/*
12   - * Orth in a compressed format (as in an automaton)
13   - */
14   -struct EncodedOrth {
15   - int suffixToCut;
16   - std::string suffixToAdd;
17   - std::string prefixToAdd;
18   -};
19   -
20   -/*
21   - * Internal representation of an interpretation - with orth encoded
22   - */
23   -struct EncodedGeneratorInterpretation {
24   - EncodedOrth orth;
25   - int tag;
26   - int nameClassifier;
27   -};
28   -
29   -#endif /* ENCODEDGENERATORINTERPRETATION_HPP */
30   -
morfeusz/InflexionGraph.cpp
... ... @@ -13,9 +13,9 @@ void InflexionGraph::addStartEdge(const Edge& e) {
13 13 if (this->graph.empty()) {
14 14 assert(this->node2ChunkStartPtr.empty());
15 15 this->graph.push_back(vector<Edge>());
16   - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr);
  16 + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr);
17 17 }
18   - assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr);
  18 + assert(this->node2ChunkStartPtr[0] == e.chunk.textStartPtr);
19 19 this->graph[0].push_back(e);
20 20 }
21 21  
... ... @@ -24,7 +24,7 @@ void InflexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) {
24 24 assert(startNode == this->graph.size());
25 25 if (startNode == this->graph.size()) {
26 26 this->graph.push_back(vector<Edge>());
27   - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr);
  27 + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr);
28 28 }
29 29 this->graph[startNode].push_back(e);
30 30 }
... ... @@ -98,7 +98,7 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) {
98 98 vector<Edge>& edges = this->graph.at(node);
99 99 for (unsigned int i = 0; i < edges.size(); i++) {
100 100 Edge& e = edges[i];
101   - InflexionGraph::PathElement pathElem(e.chunk.chunkStartPtr, e.chunk.interpsGroup.type);
  101 + InflexionGraph::PathElement pathElem(e.chunk.textStartPtr, e.chunk.segmentType);
102 102 if (e.nextNode != this->graph.size()) {
103 103 set<Path> possiblePaths = this->getPossiblePaths(e.nextNode);
104 104 vector<Path> nextPaths(possiblePaths.begin(), possiblePaths.end());
... ... @@ -116,9 +116,9 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) {
116 116 static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const InflexionGraph::Edge& e) {
117 117 for (unsigned int i = 0; i < edges.size(); i++) {
118 118 const InflexionGraph::Edge& e1 = edges[i];
119   - if (e1.chunk.chunkStartPtr == e.chunk.chunkStartPtr
  119 + if (e1.chunk.textStartPtr == e.chunk.textStartPtr
120 120 && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints
121   - && e1.chunk.interpsGroup.type == e.chunk.interpsGroup.type
  121 + && e1.chunk.segmentType == e.chunk.segmentType
122 122 && e1.nextNode == e.nextNode) {
123 123 return true;
124 124 }
... ...
morfeusz/InterpretedChunk.hpp
... ... @@ -12,11 +12,13 @@
12 12 #include "InterpsGroup.hpp"
13 13  
14 14 struct InterpretedChunk {
15   - const char* chunkStartPtr;
16   - const char* chunkEndPtr;
  15 + unsigned char segmentType;
  16 + const char* textStartPtr;
  17 + const char* textEndPtr;
17 18 std::vector<uint32_t> originalCodepoints;
18 19 std::vector<uint32_t> lowercaseCodepoints;
19   - InterpsGroup interpsGroup;
  20 + const unsigned char* interpsPtr;
  21 + const unsigned char* interpsEndPtr;
20 22 bool shiftOrth;
21 23 bool orthWasShifted;
22 24 std::vector<InterpretedChunk> prefixChunks;
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -69,9 +69,8 @@ public:
69 69 string lemmaPrefix;
70 70 if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) {
71 71 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
72   - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr;
73   - env.getCasePatternHelper().skipCasePattern(currPtr);
74   - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) {
  72 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  73 + while (currPtr < interpretedChunk.interpsEndPtr) {
75 74 this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out);
76 75 }
77 76 }
... ... @@ -104,7 +103,7 @@ protected:
104 103 encodedForm.suffixToAdd = (const char*) ptr;
105 104 ptr += strlen((const char*) ptr) + 1;
106 105 assert(encodedForm.casePattern.size() == 0);
107   - env.getCasePatternHelper().deserializeCasePattern(ptr, encodedForm.casePattern);
  106 + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
108 107 }
109 108 private:
110 109  
... ... @@ -126,8 +125,8 @@ private:
126 125 std::vector<MorphInterpretation>& out) const {
127 126 string lemma = lemmaPrefix;
128 127 EncodedInterpretation ei = this->deserializeInterp(ptr);
  128 + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
129 129 if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.value.casePattern)) {
130   - this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
131 130 pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
132 131 out.push_back(MorphInterpretation(
133 132 startNode, endNode,
... ... @@ -144,9 +143,9 @@ private:
144 143 for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
145 144 const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
146 145 orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
147   - const unsigned char* ptr = prefixChunk.interpsGroup.ptr;
  146 + const unsigned char* ptr = prefixChunk.interpsPtr;
148 147 std::vector<MorphInterpretation> mi;
149   - env.getCasePatternHelper().skipCasePattern(ptr);
  148 +// env.getCasePatternHelper().skipCasePattern(ptr);
150 149 this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi);
151 150 if (!mi.empty()) {
152 151 lemmaPrefix += mi[0].getLemma();
... ... @@ -173,8 +172,8 @@ public:
173 172 string lemma;
174 173 convertPrefixes(interpretedChunk, orthPrefix, lemma);
175 174 lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
176   - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr;
177   - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) {
  175 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  176 + while (currPtr < interpretedChunk.interpsEndPtr) {
178 177 MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
179 178 // cerr << mi.toString(false) << endl;
180 179 // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
... ... @@ -190,7 +189,7 @@ private:
190 189 for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
191 190 const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
192 191 lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
193   - const unsigned char* ptr = prefixChunk.interpsGroup.ptr;
  192 + const unsigned char* ptr = prefixChunk.interpsPtr;
194 193 MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
195 194 orthPrefix += mi.getOrth();
196 195 }
... ...
morfeusz/Morfeusz.cpp
... ... @@ -108,7 +108,7 @@ static inline void doShiftOrth(InterpretedChunk&amp; from, InterpretedChunk&amp; to) {
108 108 from.prefixChunks.end());
109 109 to.prefixChunks.push_back(from);
110 110 from.orthWasShifted = true;
111   - to.chunkStartPtr = from.chunkStartPtr;
  111 + to.textStartPtr = from.textStartPtr;
112 112 }
113 113  
114 114 static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
... ... @@ -120,7 +120,7 @@ static inline string debugInterpsGroup(unsigned char type, const char* startPtr,
120 120 static inline string debugAccum(vector<InterpretedChunk>& accum) {
121 121 stringstream res;
122 122 for (unsigned int i = 0; i < accum.size(); i++) {
123   - res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
  123 + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);
124 124 // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
125 125 }
126 126 return res.str();
... ... @@ -168,33 +168,37 @@ void Morfeusz::doProcessOneWord(
168 168 vector<InterpsGroup> val(state.getValue());
169 169 for (unsigned int i = 0; i < val.size(); i++) {
170 170 InterpsGroup& ig = val[i];
171   - vector<bool> casePattern;
172   -// env.getCasePatternHelper().skipCasePattern(ig.ptr);
173   - const unsigned char* casePatternPtr = ig.ptr;
174   - env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern);
  171 + // vector<bool> casePattern;
  172 + // env.getCasePatternHelper().skipCasePattern(ig.ptr);
  173 + // const unsigned char* casePatternPtr = ig.ptr;
  174 + // env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern);
175 175 if (this->options.debug) {
176 176 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
177 177 }
178   - if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) {
179   -// if (true) {
180   - // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
181   - set<SegrulesState> newSegrulesStates;
182   - env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
183   - if (this->options.debug && newSegrulesStates.empty()) {
184   - cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
185   - }
186   - // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
  178 + // if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) {
  179 + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
  180 + set<SegrulesState> newSegrulesStates;
  181 + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
  182 + if (this->options.debug && newSegrulesStates.empty()) {
  183 + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
  184 + }
  185 + if (!newSegrulesStates.empty() && env.getCasePatternHelper().checkInterpsGroupCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
  186 +
187 187 for (
188 188 set<SegrulesState>::iterator it = newSegrulesStates.begin();
189 189 it != newSegrulesStates.end();
190 190 ++it) {
191 191 SegrulesState newSegrulesState = *it;
  192 + const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig);
  193 + const unsigned char* interpsEndPtr = ig.ptr + ig.size;
192 194 InterpretedChunk ic = {
  195 + ig.type,
193 196 inputStart,
194 197 currInput,
195 198 originalCodepoints,
196 199 normalizedCodepoints,
197   - ig,
  200 + interpsPtr,
  201 + interpsEndPtr,
198 202 newSegrulesState.shiftOrthFromPrevious,
199 203 false,
200 204 vector<InterpretedChunk>(),
... ...
morfeusz/fsa/const.cpp
... ... @@ -2,7 +2,7 @@
2 2 #include "const.hpp"
3 3  
4 4 extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
5   -extern const uint8_t VERSION_NUM = 14;
  5 +extern const uint8_t VERSION_NUM = 15;
6 6  
7 7 extern const unsigned int VERSION_NUM_OFFSET = 4;
8 8 extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
... ...
morfeusz/fsa/fsa_impl.hpp
... ... @@ -66,13 +66,13 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
66 66  
67 67 uint32_t magicNumber = ntohl(*((const uint32_t*) ptr));
68 68 if (magicNumber != MAGIC_NUMBER) {
69   - throw FSAException("Invalid magic number");
  69 + throw FSAException("Invalid file format");
70 70 }
71 71  
72 72 uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET);
73 73 if (versionNum != VERSION_NUM) {
74 74 std::ostringstream oss;
75   - oss << "Invalid version number: " << versionNum << ", should be: " << VERSION_NUM;
  75 + oss << "Invalid file format version number: " << (int) versionNum << ", should be: " << (int) VERSION_NUM;
76 76 throw FSAException(oss.str());
77 77 }
78 78  
... ...