Commit 7dc1167ee08974171266fee7e2d05b773da02153
1 parent
aaf322b3
rozprawienie się z wielkością liter, poprawa jakości pakietu debianowego, zmiana nazwy skryptu do budowania automatu na morfeusz_builder git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@153 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
20 changed files
with
150 additions
and
142 deletions
CMakeLists.txt
... | ... | @@ -5,7 +5,7 @@ project (Morfeusz) |
5 | 5 | set (Morfeusz_VERSION_MAJOR 2) |
6 | 6 | set (Morfeusz_VERSION_MINOR 0) |
7 | 7 | set (Morfeusz_VERSION_PATCH 0) |
8 | -set (CMAKE_BUILD_TYPE "Debug") | |
8 | +#~ set (CMAKE_BUILD_TYPE "Release") | |
9 | 9 | |
10 | 10 | enable_testing() |
11 | 11 | |
... | ... | @@ -30,13 +30,13 @@ set (PROJECT_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morf |
30 | 30 | ### USER DEFINED VARIABLES |
31 | 31 | |
32 | 32 | # INPUT_DICTIONARY_CPP |
33 | -set (INPUT_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_fsa.cpp") | |
34 | -set (INPUT_SYNTH_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_synth_fsa.cpp") | |
33 | +set (INPUT_DICTIONARY_CPP "default_fsa.cpp") | |
34 | +set (INPUT_SYNTH_DICTIONARY_CPP "default_synth_fsa.cpp") | |
35 | 35 | if ("${INPUT_DICTIONARIES}" STREQUAL "") |
36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
40 | 40 | endif () |
41 | 41 | endif () |
42 | 42 | |
... | ... | @@ -98,10 +98,10 @@ set (CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) |
98 | 98 | |
99 | 99 | |
100 | 100 | # the RPATH to be used when installing, but only if it's not a system directory |
101 | -list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) | |
102 | -if ("${isSystemDir}" STREQUAL "-1") | |
103 | - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") | |
104 | -endif () | |
101 | +#~ list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) | |
102 | +#~ if ("${isSystemDir}" STREQUAL "-1") | |
103 | + #~ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") | |
104 | +#~ endif () | |
105 | 105 | |
106 | 106 | ########## Configure CPack ########## |
107 | 107 | |
... | ... | @@ -110,14 +110,16 @@ set (CPACK_PACKAGE_FILE_NAME "morfeusz2-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}- |
110 | 110 | set (CPACK_PACKAGE_VENDOR "Michał Lenart") |
111 | 111 | set (CPACK_PACKAGE_CONTACT "michal.lenart@ipipan.waw.pl") |
112 | 112 | set (CPACK_PACKAGE_DESCRIPTION_SUMMARY "Morphological analyzer for the Polish language.") |
113 | +set (CPACK_PACKAGE_DESCRIPTION "This is the second version of Morfeusz. Among numerous improvements it has better segmentation and case-sensitive lemmas handling. It also makes it possible to perform morphological synthesis and to use your own dictionary") | |
114 | +set (CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/License.txt") | |
113 | 115 | set (CPACK_OUTPUT_FILE_PREFIX "${TARGET_DIR}") |
114 | 116 | |
115 | 117 | if (${CMAKE_SYSTEM_NAME} MATCHES "Linux") |
116 | 118 | set (CPACK_GENERATOR "DEB" "TGZ") |
117 | 119 | #debian |
118 | 120 | set (CPACK_DEBIAN_PACKAGE_NAME "morfeusz2") |
119 | - set (CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT}") | |
120 | - set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6)") | |
121 | + set (CPACK_DEBIAN_PACKAGE_MAINTAINER "Michał Lenart <${CPACK_PACKAGE_CONTACT}>") | |
122 | + set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6), libc6") | |
121 | 123 | set (CPACK_DEBIAN_PACKAGE_ARCHITECTURE "${ARCHITECTURE}") |
122 | 124 | elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
123 | 125 | if (${ARCHITECTURE} MATCHES "amd64") |
... | ... | @@ -149,15 +151,15 @@ add_subdirectory (fsabuilder) |
149 | 151 | ########## add tests ########## |
150 | 152 | |
151 | 153 | macro (test_build_and_recognize fname method) |
152 | - add_test (TestBuild-${method}-${fname} python fsabuilder/buildfsa.py --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method}) | |
153 | - add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/buildfsa.py --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
154 | + add_test (TestBuild-${method}-${fname} python fsabuilder/morfeusz_builder --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method}) | |
155 | + add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/morfeusz_builder --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
154 | 156 | add_test (TestRecognize-${method}-${fname} morfeusz/test_recognize_dict /tmp/test-${method}-${fname}.fsa testfiles/${fname}) |
155 | 157 | # add_test (TestNOTRecognize-${method}-${fname} fsa/test_not_recognize /tmp/test-${method}-${fname}.fsa testfiles/out_of_dict) |
156 | 158 | # add_test (TestSpeed-${method}-${fname} fsa/test_speed /tmp/test-${method}-${fname}.fsa testfiles/speed_test_data) |
157 | 159 | endmacro () |
158 | 160 | |
159 | 161 | macro (test_result_equals inputFilename requiredOutputFilename encoding) |
160 | - # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/buildfsa.py -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE) | |
162 | + # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/morfeusz_builder -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE) | |
161 | 163 | add_test (TestResultEquals-${inputFilename}-${requiredOutputFilename} morfeusz/test_result_equals ${inputFilename} ${requiredOutputFilename} ${encoding}) |
162 | 164 | endmacro () |
163 | 165 | |
... | ... |
License.txt
0 → 100644
1 | +This is the Morfeusz license file. | |
... | ... |
fsabuilder/.settings/org.eclipse.core.resources.prefs
fsabuilder/CMakeLists.txt
... | ... | @@ -16,15 +16,6 @@ add_custom_command (OUTPUT ${SETUP_PY} |
16 | 16 | |
17 | 17 | add_custom_target (builder-setup DEPENDS ${SETUP_PY}) |
18 | 18 | |
19 | -#~ add_custom_target (buildfsa-exec ALL | |
20 | - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/buildfsa.py | |
21 | -#~ ) | |
22 | -#~ | |
23 | -#~ add_executable (buildfsa IMPORTED) | |
24 | -#~ add_dependencies (buildfsa buildfsa-exec) | |
25 | -#~ set_property (TARGET buildfsa PROPERTY IMPORTED_LOCATION "${DIST_PATH}") | |
26 | -#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/buildfsa" DESTINATION bin) | |
27 | - | |
28 | 19 | if (${UNIX}) |
29 | 20 | add_custom_target (install-builder |
30 | 21 | COMMAND python ${SETUP_PY} install --home=${CMAKE_INSTALL_PREFIX} |
... | ... | @@ -62,6 +53,15 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
62 | 53 | DEPENDS builder-setup |
63 | 54 | ) |
64 | 55 | list (APPEND PACKAGE_DEPENDS package-python-win-installer) |
56 | + | |
57 | + #~ add_custom_target (buildfsa-exec ALL | |
58 | + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder | |
59 | +#~ ) | |
60 | +#~ | |
61 | +#~ add_executable (morfeusz_builder IMPORTED) | |
62 | +#~ add_dependencies (morfeusz_builder buildfsa-exec) | |
63 | +#~ set_property (TARGET morfeusz_builder PROPERTY IMPORTED_LOCATION "${DIST_PATH}") | |
64 | +#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/morfeusz_builder" DESTINATION bin) | |
65 | 65 | endif () |
66 | 66 | |
67 | 67 | add_custom_target(package-builder DEPENDS ${PACKAGE_DEPENDS}) |
... | ... |
fsabuilder/buildanalyzer.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
3 | +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | 4 | --tagset-file=../input/polimorf.tagset \ |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --analyzer \ |
... | ... |
fsabuilder/buildgenerator.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
3 | +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | 4 | --tagset-file=../input/polimorf.tagset \ |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --generator \ |
... | ... |
fsabuilder/buildfsa.py renamed to fsabuilder/morfeusz_builder
... | ... | @@ -12,21 +12,11 @@ import codecs |
12 | 12 | from morfeuszbuilder.fsa import encode |
13 | 13 | from morfeuszbuilder.fsa import convertinput |
14 | 14 | from morfeuszbuilder.fsa.fsa import FSA |
15 | -from morfeuszbuilder.fsa.serializer import Serializer | |
15 | +from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod | |
16 | 16 | from morfeuszbuilder.tagset.tagset import Tagset |
17 | 17 | from morfeuszbuilder.segrules import rulesParser |
18 | 18 | from optparse import OptionParser |
19 | 19 | |
20 | -# class InputFormat(): | |
21 | -# ENCODED = 'ENCODED' | |
22 | -# POLIMORF = 'POLIMORF' | |
23 | -# PLAIN = 'PLAIN' | |
24 | - | |
25 | -class SerializationMethod(): | |
26 | - SIMPLE = 'SIMPLE' | |
27 | - V1 = 'V1' | |
28 | - V2 = 'V2' | |
29 | - | |
30 | 20 | def _checkOption(opt, parser, msg): |
31 | 21 | if opt is None: |
32 | 22 | print >> sys.stderr, msg |
... | ... | @@ -46,6 +36,8 @@ def _checkOpen(filename, mode): |
46 | 36 | try: |
47 | 37 | with open(filename, mode) as _: |
48 | 38 | pass |
39 | + if 'w' in mode: | |
40 | + os.remove(filename) | |
49 | 41 | except IOError as ex: |
50 | 42 | print >> sys.stderr, str(ex) |
51 | 43 | exit(1) |
... | ... | @@ -238,7 +230,6 @@ def main(opts): |
238 | 230 | fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg) |
239 | 231 | else: |
240 | 232 | fsa, qualifiersMap = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager) |
241 | - print qualifiersMap | |
242 | 233 | if opts.trainFile: |
243 | 234 | logging.info('training with '+opts.trainFile+' ...') |
244 | 235 | fsa.train(_readTrainData(opts.trainFile)) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -21,6 +21,7 @@ class EncodedForm(object): |
21 | 21 | self.cutLength = len(fromWord) - len(root) |
22 | 22 | self.suffixToAdd = targetWord[len(root):] |
23 | 23 | self.casePattern = [c == c.upper() and c != c.lower() for c in root] |
24 | +# print fromWord.encode('utf8'), targetWord.encode('utf8'), self.casePattern | |
24 | 25 | |
25 | 26 | class EncodedForm4Generator(object): |
26 | 27 | |
... | ... | @@ -54,7 +55,7 @@ class Interpretation4Analyzer(object): |
54 | 55 | return ( |
55 | 56 | self.encodedForm.cutLength, |
56 | 57 | tuple(self.encodedForm.suffixToAdd), |
57 | - tuple(self.encodedForm.casePattern), | |
58 | + tuple(self.encodedForm.casePattern), | |
58 | 59 | self.tagnum, |
59 | 60 | self.namenum) |
60 | 61 | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -120,16 +120,13 @@ class Encoder(object): |
120 | 120 | res[interp.typenum].append(interp) |
121 | 121 | return res |
122 | 122 | |
123 | - def _getMostLiberalCasePattern(self, interpsList): | |
124 | - res = None | |
123 | + def _getCasePatterns(self, interpsList): | |
124 | + res = [] | |
125 | 125 | for interp in interpsList: |
126 | - if res is None: | |
127 | - res = list(interp.encodedForm.casePattern) | |
126 | + if not True in interp.encodedForm.casePattern: | |
127 | + return [] | |
128 | 128 | else: |
129 | - while len(interp.encodedForm.casePattern) > len(res): | |
130 | - res.append(False) | |
131 | - for idx, (case1, case2) in enumerate(itertools.izip_longest(res, interp.encodedForm.casePattern, fillvalue=False)): | |
132 | - res[idx] = case1 and case2 | |
129 | + res.append(list(interp.encodedForm.casePattern)) | |
133 | 130 | return res |
134 | 131 | |
135 | 132 | def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId): |
... | ... | @@ -137,7 +134,10 @@ class Encoder(object): |
137 | 134 | res.extend(self._encodeTypeNum(typenum)) |
138 | 135 | encodedInterpsList = bytearray() |
139 | 136 | if withCasePattern: |
140 | - encodedInterpsList.extend(self._encodeCasePattern(self._getMostLiberalCasePattern(interpsList))) | |
137 | + casePatterns = self._getCasePatterns(interpsList) | |
138 | + encodedInterpsList.append(len(casePatterns)) | |
139 | + for casePattern in casePatterns: | |
140 | + encodedInterpsList.extend(self._encodeCasePattern(casePattern)) | |
141 | 141 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
142 | 142 | if withHomonymId: |
143 | 143 | encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False)) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -8,6 +8,11 @@ import logging |
8 | 8 | from state import State |
9 | 9 | from morfeuszbuilder.utils.serializationUtils import * |
10 | 10 | |
11 | +class SerializationMethod(object): | |
12 | + SIMPLE = 'SIMPLE' | |
13 | + V1 = 'V1' | |
14 | + V2 = 'V2' | |
15 | + | |
11 | 16 | class Serializer(object): |
12 | 17 | |
13 | 18 | MAGIC_NUMBER = 0x8fc2bc1b |
... | ... | @@ -20,7 +25,6 @@ class Serializer(object): |
20 | 25 | |
21 | 26 | @staticmethod |
22 | 27 | def getSerializer(serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData): |
23 | - from buildfsa import SerializationMethod | |
24 | 28 | res = { |
25 | 29 | SerializationMethod.SIMPLE: SimpleSerializer, |
26 | 30 | SerializationMethod.V1: VLengthSerializer1, |
... | ... | @@ -37,7 +41,7 @@ class Serializer(object): |
37 | 41 | |
38 | 42 | # get the Morfeusz file format version that is being encoded |
39 | 43 | def getVersion(self): |
40 | - return 14 | |
44 | + return 15 | |
41 | 45 | |
42 | 46 | def serialize2CppFile(self, fname, isGenerator, headerFilename="data/default_fsa.hpp"): |
43 | 47 | res = [] |
... | ... |
fsabuilder/setup.py.in
... | ... | @@ -8,5 +8,5 @@ if __name__ == '__main__': |
8 | 8 | description='Finite state automata builder for Morfeusz.', |
9 | 9 | version='${MORFEUSZBUILDER_VERSION}', |
10 | 10 | packages=['morfeuszbuilder', 'morfeuszbuilder.fsa', 'morfeuszbuilder.tagset', 'morfeuszbuilder.segrules', 'morfeuszbuilder.utils'], |
11 | - scripts = ['buildfsa.py'], | |
11 | + scripts = ['morfeusz_builder'], | |
12 | 12 | requires=['pyparsing']) |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -3,13 +3,13 @@ |
3 | 3 | ########## generate default dictionary data ################# |
4 | 4 | add_custom_command ( |
5 | 5 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
6 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | |
6 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | |
7 | 7 | DEPENDS "${INPUT_DICTIONARY}" |
8 | 8 | COMMENT "Building default dictionary C++ file" |
9 | 9 | ) |
10 | 10 | add_custom_command ( |
11 | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | |
12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | |
13 | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | 14 | COMMENT "Building default dictionary C++ file" |
15 | 15 | ) |
... | ... |
morfeusz/CasePatternHelper.hpp
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | #define CASEPATTERNHELPER_HPP |
10 | 10 | |
11 | 11 | #include <vector> |
12 | -#include "InterpretedChunk.hpp" | |
12 | +#include "InterpsGroup.hpp" | |
13 | 13 | |
14 | 14 | const uint8_t LEMMA_ONLY_LOWER = 0; |
15 | 15 | const uint8_t LEMMA_UPPER_PREFIX = 1; |
... | ... | @@ -39,28 +39,40 @@ public: |
39 | 39 | } |
40 | 40 | return true; |
41 | 41 | } |
42 | - | |
43 | -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | |
44 | -// if (this->caseSensitive) { | |
45 | -// for (unsigned int i = 0; i < chunks.size(); i++) { | |
46 | -// const InterpretedChunk& ic = chunks[i]; | |
47 | -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | |
48 | -// std::vector<bool> casePattern; | |
49 | -// deserializeCasePattern(casePatternPtr, casePattern); | |
50 | -// if (!checkCasePattern(ic, casePattern)) { | |
51 | -// return false; | |
52 | -// } | |
53 | -// } | |
54 | -// } | |
55 | -// return true; | |
56 | -// } | |
57 | - | |
58 | - void skipCasePattern(const unsigned char*& ptr) const { | |
59 | - vector<bool> _dupa; | |
60 | - deserializeCasePattern(ptr, _dupa); | |
42 | + | |
43 | + bool checkInterpsGroupCasePatterns( | |
44 | + const std::vector<uint32_t>& lowercaseCodepoints, | |
45 | + const std::vector<uint32_t>& originalCodepoints, | |
46 | + const InterpsGroup& ig) const { | |
47 | + const unsigned char* currPtr = ig.ptr; | |
48 | + unsigned char casePatternsNum = *currPtr++; | |
49 | + if (casePatternsNum == 0) { | |
50 | + return true; | |
51 | + } | |
52 | + else { | |
53 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | |
54 | + if (checkCasePattern( | |
55 | + lowercaseCodepoints, | |
56 | + originalCodepoints, | |
57 | + deserializeOneCasePattern(currPtr))) { | |
58 | + return true; | |
59 | + } | |
60 | + } | |
61 | + return false; | |
62 | + } | |
61 | 63 | } |
62 | - | |
63 | - void deserializeCasePattern(const unsigned char*& ptr, std::vector<bool>& res) const { | |
64 | + | |
65 | + const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const { | |
66 | + const unsigned char* currPtr = ig.ptr; | |
67 | + unsigned char casePatternsNum = *currPtr++; | |
68 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | |
69 | + deserializeOneCasePattern(currPtr); | |
70 | + } | |
71 | + return currPtr; | |
72 | + } | |
73 | + | |
74 | + std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { | |
75 | + std::vector<bool> res; | |
64 | 76 | uint8_t casePatternType = *ptr; |
65 | 77 | ptr++; |
66 | 78 | uint8_t prefixLength; |
... | ... | @@ -89,9 +101,31 @@ public: |
89 | 101 | } |
90 | 102 | break; |
91 | 103 | } |
104 | + return res; | |
92 | 105 | } |
106 | + | |
107 | +// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | |
108 | +// if (this->caseSensitive) { | |
109 | +// for (unsigned int i = 0; i < chunks.size(); i++) { | |
110 | +// const InterpretedChunk& ic = chunks[i]; | |
111 | +// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | |
112 | +// std::vector<bool> casePattern; | |
113 | +// deserializeCasePattern(casePatternPtr, casePattern); | |
114 | +// if (!checkCasePattern(ic, casePattern)) { | |
115 | +// return false; | |
116 | +// } | |
117 | +// } | |
118 | +// } | |
119 | +// return true; | |
120 | +// } | |
121 | + | |
122 | +// void skipCasePattern(const unsigned char*& ptr) const { | |
123 | +// vector<bool> _dupa; | |
124 | +// deserializeCasePattern(ptr, _dupa); | |
125 | +// } | |
93 | 126 | private: |
94 | 127 | bool caseSensitive; |
128 | + | |
95 | 129 | }; |
96 | 130 | |
97 | 131 | #endif /* CASEPATTERNHELPER_HPP */ |
... | ... |
morfeusz/EncodedGeneratorInterpretation.hpp deleted
1 | -/* | |
2 | - * File: EncodedGeneratorInterpretation.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 20 styczeń 2014, 17:15 | |
6 | - */ | |
7 | - | |
8 | -#ifndef ENCODEDGENERATORINTERPRETATION_HPP | |
9 | -#define ENCODEDGENERATORINTERPRETATION_HPP | |
10 | - | |
11 | -/* | |
12 | - * Orth in a compressed format (as in an automaton) | |
13 | - */ | |
14 | -struct EncodedOrth { | |
15 | - int suffixToCut; | |
16 | - std::string suffixToAdd; | |
17 | - std::string prefixToAdd; | |
18 | -}; | |
19 | - | |
20 | -/* | |
21 | - * Internal representation of an interpretation - with orth encoded | |
22 | - */ | |
23 | -struct EncodedGeneratorInterpretation { | |
24 | - EncodedOrth orth; | |
25 | - int tag; | |
26 | - int nameClassifier; | |
27 | -}; | |
28 | - | |
29 | -#endif /* ENCODEDGENERATORINTERPRETATION_HPP */ | |
30 | - |
morfeusz/InflexionGraph.cpp
... | ... | @@ -13,9 +13,9 @@ void InflexionGraph::addStartEdge(const Edge& e) { |
13 | 13 | if (this->graph.empty()) { |
14 | 14 | assert(this->node2ChunkStartPtr.empty()); |
15 | 15 | this->graph.push_back(vector<Edge>()); |
16 | - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); | |
16 | + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr); | |
17 | 17 | } |
18 | - assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); | |
18 | + assert(this->node2ChunkStartPtr[0] == e.chunk.textStartPtr); | |
19 | 19 | this->graph[0].push_back(e); |
20 | 20 | } |
21 | 21 | |
... | ... | @@ -24,7 +24,7 @@ void InflexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { |
24 | 24 | assert(startNode == this->graph.size()); |
25 | 25 | if (startNode == this->graph.size()) { |
26 | 26 | this->graph.push_back(vector<Edge>()); |
27 | - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); | |
27 | + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr); | |
28 | 28 | } |
29 | 29 | this->graph[startNode].push_back(e); |
30 | 30 | } |
... | ... | @@ -98,7 +98,7 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) { |
98 | 98 | vector<Edge>& edges = this->graph.at(node); |
99 | 99 | for (unsigned int i = 0; i < edges.size(); i++) { |
100 | 100 | Edge& e = edges[i]; |
101 | - InflexionGraph::PathElement pathElem(e.chunk.chunkStartPtr, e.chunk.interpsGroup.type); | |
101 | + InflexionGraph::PathElement pathElem(e.chunk.textStartPtr, e.chunk.segmentType); | |
102 | 102 | if (e.nextNode != this->graph.size()) { |
103 | 103 | set<Path> possiblePaths = this->getPossiblePaths(e.nextNode); |
104 | 104 | vector<Path> nextPaths(possiblePaths.begin(), possiblePaths.end()); |
... | ... | @@ -116,9 +116,9 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) { |
116 | 116 | static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const InflexionGraph::Edge& e) { |
117 | 117 | for (unsigned int i = 0; i < edges.size(); i++) { |
118 | 118 | const InflexionGraph::Edge& e1 = edges[i]; |
119 | - if (e1.chunk.chunkStartPtr == e.chunk.chunkStartPtr | |
119 | + if (e1.chunk.textStartPtr == e.chunk.textStartPtr | |
120 | 120 | && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints |
121 | - && e1.chunk.interpsGroup.type == e.chunk.interpsGroup.type | |
121 | + && e1.chunk.segmentType == e.chunk.segmentType | |
122 | 122 | && e1.nextNode == e.nextNode) { |
123 | 123 | return true; |
124 | 124 | } |
... | ... |
morfeusz/InterpretedChunk.hpp
... | ... | @@ -12,11 +12,13 @@ |
12 | 12 | #include "InterpsGroup.hpp" |
13 | 13 | |
14 | 14 | struct InterpretedChunk { |
15 | - const char* chunkStartPtr; | |
16 | - const char* chunkEndPtr; | |
15 | + unsigned char segmentType; | |
16 | + const char* textStartPtr; | |
17 | + const char* textEndPtr; | |
17 | 18 | std::vector<uint32_t> originalCodepoints; |
18 | 19 | std::vector<uint32_t> lowercaseCodepoints; |
19 | - InterpsGroup interpsGroup; | |
20 | + const unsigned char* interpsPtr; | |
21 | + const unsigned char* interpsEndPtr; | |
20 | 22 | bool shiftOrth; |
21 | 23 | bool orthWasShifted; |
22 | 24 | std::vector<InterpretedChunk> prefixChunks; |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -69,9 +69,8 @@ public: |
69 | 69 | string lemmaPrefix; |
70 | 70 | if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { |
71 | 71 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
72 | - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
73 | - env.getCasePatternHelper().skipCasePattern(currPtr); | |
74 | - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
72 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
73 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
75 | 74 | this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out); |
76 | 75 | } |
77 | 76 | } |
... | ... | @@ -104,7 +103,7 @@ protected: |
104 | 103 | encodedForm.suffixToAdd = (const char*) ptr; |
105 | 104 | ptr += strlen((const char*) ptr) + 1; |
106 | 105 | assert(encodedForm.casePattern.size() == 0); |
107 | - env.getCasePatternHelper().deserializeCasePattern(ptr, encodedForm.casePattern); | |
106 | + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
108 | 107 | } |
109 | 108 | private: |
110 | 109 | |
... | ... | @@ -126,8 +125,8 @@ private: |
126 | 125 | std::vector<MorphInterpretation>& out) const { |
127 | 126 | string lemma = lemmaPrefix; |
128 | 127 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
128 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
129 | 129 | if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.value.casePattern)) { |
130 | - this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
131 | 130 | pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); |
132 | 131 | out.push_back(MorphInterpretation( |
133 | 132 | startNode, endNode, |
... | ... | @@ -144,9 +143,9 @@ private: |
144 | 143 | for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { |
145 | 144 | const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; |
146 | 145 | orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); |
147 | - const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
146 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
148 | 147 | std::vector<MorphInterpretation> mi; |
149 | - env.getCasePatternHelper().skipCasePattern(ptr); | |
148 | +// env.getCasePatternHelper().skipCasePattern(ptr); | |
150 | 149 | this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi); |
151 | 150 | if (!mi.empty()) { |
152 | 151 | lemmaPrefix += mi[0].getLemma(); |
... | ... | @@ -173,8 +172,8 @@ public: |
173 | 172 | string lemma; |
174 | 173 | convertPrefixes(interpretedChunk, orthPrefix, lemma); |
175 | 174 | lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
176 | - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
177 | - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
175 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
176 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
178 | 177 | MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); |
179 | 178 | // cerr << mi.toString(false) << endl; |
180 | 179 | // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; |
... | ... | @@ -190,7 +189,7 @@ private: |
190 | 189 | for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { |
191 | 190 | const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; |
192 | 191 | lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); |
193 | - const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
192 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
194 | 193 | MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); |
195 | 194 | orthPrefix += mi.getOrth(); |
196 | 195 | } |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -108,7 +108,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
108 | 108 | from.prefixChunks.end()); |
109 | 109 | to.prefixChunks.push_back(from); |
110 | 110 | from.orthWasShifted = true; |
111 | - to.chunkStartPtr = from.chunkStartPtr; | |
111 | + to.textStartPtr = from.textStartPtr; | |
112 | 112 | } |
113 | 113 | |
114 | 114 | static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { |
... | ... | @@ -120,7 +120,7 @@ static inline string debugInterpsGroup(unsigned char type, const char* startPtr, |
120 | 120 | static inline string debugAccum(vector<InterpretedChunk>& accum) { |
121 | 121 | stringstream res; |
122 | 122 | for (unsigned int i = 0; i < accum.size(); i++) { |
123 | - res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); | |
123 | + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | |
124 | 124 | // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; |
125 | 125 | } |
126 | 126 | return res.str(); |
... | ... | @@ -168,33 +168,37 @@ void Morfeusz::doProcessOneWord( |
168 | 168 | vector<InterpsGroup> val(state.getValue()); |
169 | 169 | for (unsigned int i = 0; i < val.size(); i++) { |
170 | 170 | InterpsGroup& ig = val[i]; |
171 | - vector<bool> casePattern; | |
172 | -// env.getCasePatternHelper().skipCasePattern(ig.ptr); | |
173 | - const unsigned char* casePatternPtr = ig.ptr; | |
174 | - env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern); | |
171 | + // vector<bool> casePattern; | |
172 | + // env.getCasePatternHelper().skipCasePattern(ig.ptr); | |
173 | + // const unsigned char* casePatternPtr = ig.ptr; | |
174 | + // env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern); | |
175 | 175 | if (this->options.debug) { |
176 | 176 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
177 | 177 | } |
178 | - if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) { | |
179 | -// if (true) { | |
180 | - // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
181 | - set<SegrulesState> newSegrulesStates; | |
182 | - env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
183 | - if (this->options.debug && newSegrulesStates.empty()) { | |
184 | - cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | |
185 | - } | |
186 | - // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; | |
178 | + // if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) { | |
179 | + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
180 | + set<SegrulesState> newSegrulesStates; | |
181 | + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
182 | + if (this->options.debug && newSegrulesStates.empty()) { | |
183 | + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | |
184 | + } | |
185 | + if (!newSegrulesStates.empty() && env.getCasePatternHelper().checkInterpsGroupCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { | |
186 | + | |
187 | 187 | for ( |
188 | 188 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); |
189 | 189 | it != newSegrulesStates.end(); |
190 | 190 | ++it) { |
191 | 191 | SegrulesState newSegrulesState = *it; |
192 | + const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig); | |
193 | + const unsigned char* interpsEndPtr = ig.ptr + ig.size; | |
192 | 194 | InterpretedChunk ic = { |
195 | + ig.type, | |
193 | 196 | inputStart, |
194 | 197 | currInput, |
195 | 198 | originalCodepoints, |
196 | 199 | normalizedCodepoints, |
197 | - ig, | |
200 | + interpsPtr, | |
201 | + interpsEndPtr, | |
198 | 202 | newSegrulesState.shiftOrthFromPrevious, |
199 | 203 | false, |
200 | 204 | vector<InterpretedChunk>(), |
... | ... |
morfeusz/fsa/const.cpp
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | #include "const.hpp" |
3 | 3 | |
4 | 4 | extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; |
5 | -extern const uint8_t VERSION_NUM = 14; | |
5 | +extern const uint8_t VERSION_NUM = 15; | |
6 | 6 | |
7 | 7 | extern const unsigned int VERSION_NUM_OFFSET = 4; |
8 | 8 | extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; |
... | ... |
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -66,13 +66,13 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
66 | 66 | |
67 | 67 | uint32_t magicNumber = ntohl(*((const uint32_t*) ptr)); |
68 | 68 | if (magicNumber != MAGIC_NUMBER) { |
69 | - throw FSAException("Invalid magic number"); | |
69 | + throw FSAException("Invalid file format"); | |
70 | 70 | } |
71 | 71 | |
72 | 72 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); |
73 | 73 | if (versionNum != VERSION_NUM) { |
74 | 74 | std::ostringstream oss; |
75 | - oss << "Invalid version number: " << versionNum << ", should be: " << VERSION_NUM; | |
75 | + oss << "Invalid file format version number: " << (int) versionNum << ", should be: " << (int) VERSION_NUM; | |
76 | 76 | throw FSAException(oss.str()); |
77 | 77 | } |
78 | 78 | |
... | ... |