diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f0a411..f664787 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ set (Morfeusz_VERSION_MAJOR 2) set (Morfeusz_VERSION_MINOR 0) set (Morfeusz_VERSION_PATCH 0) set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}") -if (NOT ${VERSION_SUFFIX} STREQUAL "") +if (VERSION_SUFFIX) set (Morfeusz_VERSION_TWEAK "${VERSION_SUFFIX}") set (Morfeusz_VERSION "${Morfeusz_VERSION}_${Morfeusz_VERSION_TWEAK}") endif () @@ -148,13 +148,11 @@ include (CPack) file (COPY fsabuilder testfiles input DESTINATION .) configure_file ( - "${PROJECT_SOURCE_DIR}/morfeusz/MorfeuszConfig.hpp.in" - "${PROJECT_BINARY_DIR}/morfeusz/MorfeuszConfig.hpp" + "${PROJECT_SOURCE_DIR}/morfeusz/MorfeuszVersion.hpp.in" + "${PROJECT_BINARY_DIR}/MorfeuszVersion.hpp" ) - -###### add main sources ######## - include_directories("${PROJECT_BINARY_DIR}" ) +###### add main sources ######## add_subdirectory (morfeusz) add_subdirectory (fsabuilder) diff --git a/README b/README index a24ea25..32ff73d 100644 --- a/README +++ b/README @@ -109,7 +109,7 @@ Create separate build directory, for example build-darwin. Run cross-compilation: ---------------------- -cmake -D CROSSMORFEUSZ_ROOT=<path_to_crossmorfeusz_dir> -DCMAKE_TOOLCHAIN_FILE=../morfeusz/Toolchain-xxx.cmake .. +cmake -D CROSSMORFEUSZ_ROOT=<path_to_crossmorfeusz_dir> -DCMAKE_TOOLCHAIN_FILE=../toolchains/Toolchain-xxx.cmake .. make make package package-java package-python diff --git a/buildAll.sh b/buildAll.sh index f8c700d..366801b 100755 --- a/buildAll.sh +++ b/buildAll.sh @@ -30,12 +30,12 @@ function build { srcDir=`pwd` buildDir=buildall/$os-$arch targetDir=$srcDir/target - toolchain=$srcDir/morfeusz/Toolchain-$os-$arch.cmake + toolchain=$srcDir/toolchains/Toolchain-$os-$arch.cmake echo "Will use $toolchain toolchain" rm -rf $buildDir - rm -rf $targetDir + #~ rm -rf $targetDir mkdir -p $buildDir mkdir -p $targetDir cd $buildDir @@ -68,7 +68,7 @@ function log { export -f build export -f log -rm -rf log +rm -rf log target mkdir -p log buildDictionaries 2>&1 | log All all @@ -79,6 +79,6 @@ buildDictionaries 2>&1 | log All all echo "build Windows amd64 package package-java 2>&1 | log Windows amd64" echo "build Windows i386 package package-java 2>&1 | log Windows i386" echo "build Darwin amd64 package package-java 2>&1 | log Darwin amd64" -} | xargs -n1 -P8 -d$'\n' bash -c +} | xargs -n1 -P5 -d$'\n' bash -c diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 0c9b004..6d18687 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -25,8 +25,6 @@ add_custom_target ( dictionaries DEPENDS analyzer-dictionary generator-dictionar include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ) -# add_custom_target (dupa DEPENDS "${INPUT_DICTIONARY_CPP}") - #### build ##### set(SRC_FILES @@ -42,37 +40,39 @@ set(SRC_FILES InflexionGraph.cpp charset/TextReader.cpp charset/CharsetConverter.cpp - charset/CaseConverter.cpp - charset/caseconv.cpp + case/CaseConverter.cpp + case/caseconv.cpp charset/conversion_tables.cpp cli/cli.cpp segrules/segrules.cpp segrules/SegrulesFSA.cpp - CasePatternHelper.cpp - decoder/InterpretedChunksDecoder.cpp - decoder/InterpretedChunksDecoder4Analyzer.cpp - decoder/InterpretedChunksDecoder4Generator.cpp - deserializer/InterpsGroupsReader.cpp - deserializer/MorphDeserializer.cpp + case/CasePatternHelper.cpp + deserialization/morphInterps/InterpretedChunksDecoder.cpp + deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.cpp + deserialization/morphInterps/InterpretedChunksDecoder4Generator.cpp + deserialization/InterpsGroupsReader.cpp + deserialization/MorphDeserializer.cpp ) set(INCLUDE_FILES const.hpp data/default_fsa.hpp + Environment.hpp Tagset.hpp Qualifiers.hpp fsa/const.hpp MorphInterpretation.hpp Morfeusz.hpp + MorfeuszVersion.hpp InflexionGraph.hpp charset/CharsetConverter.hpp charset/TextReader.hpp - charset/CaseConverter.hpp - charset/caseconv.hpp + case/CaseConverter.hpp + case/caseconv.hpp charset/conversion_tables.hpp cli/cli.hpp segrules/segrules.hpp - deserializer/MorphDeserializer.cpp + deserialization/MorphDeserializer.cpp ) add_library (libmorfeusz SHARED ${SRC_FILES}) @@ -82,24 +82,15 @@ set_target_properties (libmorfeusz PROPERTIES OUTPUT_NAME "morfeusz2") add_executable (morfeusz_analyzer morfeusz_analyzer.cpp) add_executable (morfeusz_generator morfeusz_generator.cpp) -add_executable (test_result_equals test_result_equals.cpp) -add_executable (test_recognize_dict test_recognize_dict.cpp) +add_executable (test_result_equals test/test_result_equals.cpp) +add_executable (test_recognize_dict test/test_recognize_dict.cpp) target_link_libraries (morfeusz_analyzer libmorfeusz) target_link_libraries (morfeusz_generator libmorfeusz) target_link_libraries (test_result_equals libmorfeusz) target_link_libraries (test_recognize_dict libmorfeusz) -if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") - target_link_libraries (libmorfeusz ws2_32) - set (TARGET_LIB_DIR bin) -else () - set (TARGET_LIB_DIR lib) -endif () - -add_subdirectory (java) -add_subdirectory (python) -add_subdirectory (perl) +add_subdirectory (wrappers) if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") add_custom_target (morfeusz-repair-library @@ -108,6 +99,13 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") add_dependencies (morfeusz_analyzer morfeusz-repair-library) endif () +if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") + target_link_libraries (libmorfeusz ws2_32) + set (TARGET_LIB_DIR bin) +else () + set (TARGET_LIB_DIR lib) +endif () + install (FILES ${INCLUDE_FILES} DESTINATION include/morfeusz) install (TARGETS libmorfeusz DESTINATION ${TARGET_LIB_DIR}) install (TARGETS morfeusz_analyzer morfeusz_generator DESTINATION bin) diff --git a/morfeusz/Environment.cpp b/morfeusz/Environment.cpp index da23154..6886992 100644 --- a/morfeusz/Environment.cpp +++ b/morfeusz/Environment.cpp @@ -8,14 +8,11 @@ #include <vector> #include <algorithm> #include "Environment.hpp" -#include "decoder/InterpretedChunksDecoder.hpp" -#include "deserializer/MorphDeserializer.hpp" +#include "deserialization/MorphDeserializer.hpp" #include "exceptions.hpp" -#include "decoder/InterpretedChunksDecoder4Analyzer.hpp" -#include "decoder/InterpretedChunksDecoder4Generator.hpp" - -//class InterpretedChunksDecoder4Analyzer; -//class InterpretedChunksDecoder4Generator; +#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp" +#include "deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp" +#include "deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp" static Deserializer<InterpsGroupsReader>& initializeDeserializer(MorfeuszProcessorType processorType) { static Deserializer<InterpsGroupsReader> *analyzerDeserializer diff --git a/morfeusz/Environment.hpp b/morfeusz/Environment.hpp index 20aad95..553f55b 100644 --- a/morfeusz/Environment.hpp +++ b/morfeusz/Environment.hpp @@ -13,56 +13,142 @@ class InterpretedChunksDecoder; class CasePatternHelper; -#include "charset/CaseConverter.hpp" +#include "case/CaseConverter.hpp" #include "charset/CharsetConverter.hpp" #include "fsa/fsa.hpp" #include "segrules/segrules.hpp" #include "const.hpp" #include "Tagset.hpp" -//#include "InterpretedChunksDecoder.hpp" #include "InterpsGroup.hpp" -#include "CasePatternHelper.hpp" +#include "case/CasePatternHelper.hpp" #include "Qualifiers.hpp" -#include "deserializer/InterpsGroupsReader.hpp" +#include "deserialization/InterpsGroupsReader.hpp" struct InterpsGroup; typedef FSA<InterpsGroupsReader> FSAType; -//typedef FSA< std::vector<InterpsGroup > > FSAType; +/** + * This class contains data required for morphological analysis/synthesis. + * It contains references to dictionary automaton, charset converter, tagset data etc. + * All of these can be changed by setters, changing Morfeusz behavior (different dictionary, charset, and other options). + */ class Environment { public: + /** + * Creates default environment with given initial charset, processor type (analyzer/generator) and default dictionary data ptr. + * + * @param charset + * @param morfeuszProcessor + * @param fileStartPtr + */ Environment( MorfeuszCharset charset, MorfeuszProcessorType morfeuszProcessor, const unsigned char* fileStartPtr); + /** + * Sets charset for this environment. + * + * @param charset + */ void setCharset(MorfeuszCharset charset); + /** + * Sets case sensitivity options. + * + * @param caseSensitive - if true, interpretations not matching case will be discarded. + */ void setCaseSensitive(bool caseSensitive); + /** + * Gets charset converter that is currently used by this environment. + * Changed by setting charset. + * + * @return - reference to charset converter. + */ const CharsetConverter& getCharsetConverter() const; + /** + * Returns case converter that is currently used by this environment. + * Changed by setting case sensitivity option. + * + * @return - reference to case converter. + */ const CaseConverter& getCaseConverter() const; + /** + * Sets new tagset for this environment. + * + * @param tagset + */ void setTagset(const Tagset& tagset); + + /** + * Gets currently used tagset. + * + * @return + */ const Tagset& getTagset() const; + /** + * Sets binary dictionary file used by this environment. + * + * @param filename - filename of the dictionary + */ void setFSAFile(const std::string& filename); + /** + * Sets segmentation rules option. + * + * @param option + * @param value + */ void setSegrulesOption(const std::string& option, const std::string& value); + /** + * Gets segmentation rules automaton. + * + * @return + */ const SegrulesFSA& getCurrentSegrulesFSA() const; + /** + * Gets dictionary automaton. + * + * @return + */ const FSAType& getFSA() const; + /** + * Returns decoder that converts interpretations to external format. + * @return + */ const InterpretedChunksDecoder& getInterpretedChunksDecoder() const; + /** + * Gets processor type (info if this is analyzer or generator environment) + * @return + */ MorfeuszProcessorType getProcessorType() const; + /** + * Return current case pattern helper + * + * @return + */ const CasePatternHelper& getCasePatternHelper() const; + /** + * Return current qualifiers helper. + * @return + */ const Qualifiers& getQualifiersHelper() const; + /** + * Returns true iff given codepoint denotes a separator char for ign handling. + * @param codepoint + * @return + */ bool isSeparator(uint32_t codepoint) const; virtual ~Environment(); diff --git a/morfeusz/InflexionGraph.hpp b/morfeusz/InflexionGraph.hpp index 99e8c19..171ed41 100644 --- a/morfeusz/InflexionGraph.hpp +++ b/morfeusz/InflexionGraph.hpp @@ -13,6 +13,10 @@ #include <utility> #include "InterpretedChunk.hpp" +/** + * This class build inflection graph (indexes the nodes, takes into account segments marked as "weak"). + * Takes care to make the number of nodes as little as possible. + */ class InflexionGraph { public: @@ -24,30 +28,60 @@ public: InterpretedChunk chunk; unsigned int nextNode; }; - + + /** + * Adds new path to the graph. + * + * @param path + * @param weak + */ void addPath(const std::vector<InterpretedChunk>& path, bool weak); // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); + /** + * Return current graph. + * + * @return + */ const std::vector< std::vector<InflexionGraph::Edge> >& getTheGraph(); + /** + * True iff the graph is empty. + * + * @return + */ bool empty() const; + /** + * Clears the graph. + */ void clear(); - - // virtual ~FlexionGraph(); private: typedef std::pair<const char*, int> PathElement; typedef std::set<PathElement> Path; + /** + * Adds an edge that starts a chunk. + * + * @param e + */ void addStartEdge(const Edge& e); - + + /** + * Adds non-starting edge. + * @param startNode + * @param e + */ void addMiddleEdge(unsigned int startNode, const Edge& e); + /** + * Minimizes the graph so it contains as little number of nodes as possible. + */ void minimizeGraph(); - + bool canMergeNodes(unsigned int node1, unsigned int node2); void doMergeNodes(unsigned int node1, unsigned int node2); diff --git a/morfeusz/InterpretedChunk.hpp b/morfeusz/InterpretedChunk.hpp index b2264a1..0bbc589 100644 --- a/morfeusz/InterpretedChunk.hpp +++ b/morfeusz/InterpretedChunk.hpp @@ -11,16 +11,59 @@ #include <vector> #include "InterpsGroup.hpp" +/** + * Denotes a part of text that has some not-yet-deserialized interpretations attached to it. + */ struct InterpretedChunk { + + /** + * The type of segment for this chunk. + */ unsigned char segmentType; + + /** + * Pointer to start of this chunks text + */ const char* textStartPtr; + + /** + * Pointer to end of this chunks text (exclusive) + */ const char* textEndPtr; + + /** + * Pointer to the start of this chunks binary data. + */ const unsigned char* interpsGroupPtr; + + /** + * Pointer to the end of this chunks binary data (exclusive) + */ const unsigned char* interpsEndPtr; + + /** + * true iff this chunk shifts orth to the one right to it (it is "A" in "A> B") + */ bool shiftOrth; + + /** + * true iff this chunk has attached data from its prefix chunk (when it is "B" segment in "A> B" segmentation rule) + */ bool orthWasShifted; + + /** + * Number of codepoints this chunks consists of. + */ int codepointsNum; + + /** + * Chunks that are in the prefix segments (those with ">" in segmentation rules, ie. "dig>* dig") + */ std::vector<InterpretedChunk> prefixChunks; + + /** + * Homonym id specified by the user. + */ std::string requiredHomonymId; }; diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp index 5fb5966..7021e84 100644 --- a/morfeusz/InterpsGroup.hpp +++ b/morfeusz/InterpsGroup.hpp @@ -10,6 +10,10 @@ #include <stdint.h> +/** + * A structure representing one segment + * with pointer to its interpretations, case patterns etc. + */ struct InterpsGroup { unsigned char type; uint16_t size; diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index c0b80a5..f7a5fe8 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -11,15 +11,13 @@ #include "utils.hpp" #include "data/default_fsa.hpp" #include "Morfeusz.hpp" -#include "decoder/InterpretedChunksDecoder.hpp" +#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp" #include "charset/CharsetConverter.hpp" #include "charset/charset_utils.hpp" -#include "charset/CaseConverter.hpp" +#include "case/CaseConverter.hpp" #include "segrules/segrules.hpp" #include "const.hpp" -#include "deserializationUtils.hpp" #include "charset/utf8.h" -#include "compressionByteUtils.hpp" // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 78dc278..1dbc488 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -13,12 +13,11 @@ #include <vector> #include <map> #include <set> -#include "EncodedInterpretation.hpp" #include "fsa/fsa.hpp" #include "MorphInterpretation.hpp" #include "InterpsGroup.hpp" +#include "case/CaseConverter.hpp" #include "charset/CharsetConverter.hpp" -#include "charset/CaseConverter.hpp" #include "charset/TextReader.hpp" #include "InterpretedChunk.hpp" #include "InflexionGraph.hpp" @@ -30,8 +29,8 @@ #include "segrules/segrules.hpp" #include "segrules/SegrulesFSA.hpp" -#include "deserializer/InterpsGroupsReader.hpp" -#include "deserializer/MorphDeserializer.hpp" +#include "deserialization/InterpsGroupsReader.hpp" +#include "deserialization/MorphDeserializer.hpp" class Morfeusz; class ResultsIterator; diff --git a/morfeusz/MorfeuszOptions.hpp b/morfeusz/MorfeuszOptions.hpp index cf975a6..444a19e 100644 --- a/morfeusz/MorfeuszOptions.hpp +++ b/morfeusz/MorfeuszOptions.hpp @@ -10,6 +10,9 @@ #include "const.hpp" +/** + * Represents options for Morfeusz analyzer/generator. + */ struct MorfeuszOptions { bool caseSensitive; MorfeuszCharset encoding; diff --git a/morfeusz/MorfeuszConfig.hpp.in b/morfeusz/MorfeuszVersion.hpp.in index f91543d..6bdfb2f 100644 --- a/morfeusz/MorfeuszConfig.hpp.in +++ b/morfeusz/MorfeuszVersion.hpp.in @@ -5,6 +5,4 @@ * Created on November 29, 2013, 10:03 PM */ -#define Morfeusz_VERSION_MAJOR @Morfeusz_VERSION_MAJOR@ -#define Morfeusz_VERSION_MINOR @Morfeusz_VERSION_MINOR@ - +#define MORFEUSZ_VERSION "@Morfeusz_VERSION@" diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp deleted file mode 100644 index 2cde5ac..0000000 --- a/morfeusz/MorphDeserializer.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * File: MorphDeserializer.cpp - * Author: mlenart - * - * Created on 12 listopad 2013, 15:31 - */ - -#include <map> -#include <algorithm> -#include "MorphDeserializer.hpp" -#include "EncodedInterpretation.hpp" -#include "InterpsGroup.hpp" -#include "deserializationUtils.hpp" - -MorphDeserializer::MorphDeserializer() { -} - -MorphDeserializer::~MorphDeserializer() { -} - -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { - const unsigned char* currPtr = ptr; - uint8_t interpTypesNum = readInt8(currPtr); - interps.clear(); - interps.reserve(interpTypesNum); - for (unsigned int i = 0; i < interpTypesNum; i++) { - InterpsGroup ig; - ig.type = readInt8(currPtr); - ig.size = readInt16(currPtr); - ig.ptr = currPtr; - currPtr += ig.size; - interps.push_back(ig); - } - return currPtr - ptr; -} diff --git a/morfeusz/MorphInterpretation.cpp b/morfeusz/MorphInterpretation.cpp index ce3ac87..d554ee9 100644 --- a/morfeusz/MorphInterpretation.cpp +++ b/morfeusz/MorphInterpretation.cpp @@ -8,7 +8,6 @@ #include <string> #include <sstream> #include "MorphInterpretation.hpp" -#include "EncodedInterpretation.hpp" #include "const.hpp" using namespace std; @@ -32,11 +31,13 @@ tagnum(tagnum), namenum(namenum), tag(env.getTagset().getTag(tagnum, env.getCharsetConverter())), name(env.getTagset().getName(namenum, env.getCharsetConverter())), -qualifiers(env.getQualifiersHelper().getQualifiers(qualifiersNum)) { +qualifiers(&env.getQualifiersHelper().getQualifiers(qualifiersNum)) { } +static const vector<std::string> emptyQualifiers; + MorphInterpretation::MorphInterpretation() : startNode(), endNode(), @@ -47,7 +48,7 @@ tagnum(), namenum(), tag(), name(), -qualifiers(){ +qualifiers(&emptyQualifiers){ } @@ -65,7 +66,7 @@ namenum(0), // qualifiersNum(0), tag(env.getTagset().getTag(0, env.getCharsetConverter())), name(env.getTagset().getName(0, env.getCharsetConverter())), -qualifiers() { +qualifiers(&emptyQualifiers) { } @@ -126,7 +127,7 @@ const std::string& MorphInterpretation::getName() const { } const vector<string>& MorphInterpretation::getQualifiers() const { - return this->qualifiers; + return *this->qualifiers; } static inline string getQualifiersStr(const MorphInterpretation& mi) { @@ -157,7 +158,7 @@ std::string MorphInterpretation::toString(bool includeNodeNumbers) const { if (!name.empty()) { res << "," << name; } - if (!qualifiers.empty()) { + if (!qualifiers->empty()) { res << "," << getQualifiersStr(*this); } return res.str(); diff --git a/morfeusz/MorphInterpretation.hpp b/morfeusz/MorphInterpretation.hpp index b637a3c..8426a89 100644 --- a/morfeusz/MorphInterpretation.hpp +++ b/morfeusz/MorphInterpretation.hpp @@ -13,10 +13,12 @@ class Environment; #include "Tagset.hpp" -#include "EncodedInterpretation.hpp" #include "charset/CharsetConverter.hpp" #include "Environment.hpp" +/** + * Morphological interpretation as seen by the user in the analysis/generation results. + */ class MorphInterpretation { public: MorphInterpretation( @@ -59,7 +61,7 @@ private: int namenum; std::string tag; std::string name; - std::vector<std::string> qualifiers; + const std::vector<std::string>* qualifiers; }; #endif /* MORPHINTERPRETATION_HPP */ diff --git a/morfeusz/Qualifiers.cpp b/morfeusz/Qualifiers.cpp index 9f6f949..4d3f81e 100644 --- a/morfeusz/Qualifiers.cpp +++ b/morfeusz/Qualifiers.cpp @@ -7,7 +7,7 @@ #include <iostream> #include "Qualifiers.hpp" -#include "deserializationUtils.hpp" +#include "deserialization/deserializationUtils.hpp" #include "fsa/const.hpp" using namespace std; @@ -32,13 +32,9 @@ qualifiers() { } } -vector<string> Qualifiers::getQualifiers(int n) const { +const vector<string>& Qualifiers::getQualifiers(int n) const { return this->qualifiers.at(n); } -unsigned int Qualifiers::getQualifiersNum() const { - return (unsigned int) this->qualifiers.size(); -} - Qualifiers::~Qualifiers() { } diff --git a/morfeusz/Qualifiers.hpp b/morfeusz/Qualifiers.hpp index 989d699..0e2f109 100644 --- a/morfeusz/Qualifiers.hpp +++ b/morfeusz/Qualifiers.hpp @@ -12,11 +12,21 @@ #include <string> #include <stdint.h> +/** + * Helper class used for decoding qualifiers set number into a vector of strings. + * + * @param ptr + */ class Qualifiers { public: explicit Qualifiers(const unsigned char* ptr); - std::vector<std::string> getQualifiers(int n) const; - unsigned int getQualifiersNum() const; + + /** + * Returns vector of qualifiers represented as strings. + * @param n - the index in qualifiers tab. + * @return - vector of qualifiers represented as strings. + */ + const std::vector<std::string>& getQualifiers(int n) const; virtual ~Qualifiers(); private: std::vector< std::vector<std::string> > qualifiers; diff --git a/morfeusz/Tagset.cpp b/morfeusz/Tagset.cpp index 0ab4a10..b598021 100644 --- a/morfeusz/Tagset.cpp +++ b/morfeusz/Tagset.cpp @@ -3,8 +3,7 @@ #include "Tagset.hpp" #include "fsa/const.hpp" #include "utils.hpp" -#include "endianness.hpp" -#include "deserializationUtils.hpp" +#include "deserialization/deserializationUtils.hpp" using namespace std; diff --git a/morfeusz/Tagset.hpp b/morfeusz/Tagset.hpp index 4bcc5b5..4cc7cc0 100644 --- a/morfeusz/Tagset.hpp +++ b/morfeusz/Tagset.hpp @@ -12,11 +12,34 @@ #include <vector> #include "charset/CharsetConverter.hpp" +/** + * Represents a tagset + */ class Tagset { public: + /** + * Constructs a tagset from binary data. + * + * @param fsaData - pointer to the beginning of automaton data. + */ explicit Tagset(const unsigned char* fsaData); -// Tagset(const Tagset& tagset); + + /** + * Returns tag (denoted by its index) as a string. + * + * @param tagNum - tag index in the tagset. + * @param charsetConverter - the charset converter used to convert from tagset internal encoding (UTF-8) into target encoding. + * @return - the tag encoded as string. + */ const std::string getTag(const int tagNum, const CharsetConverter& charsetConverter) const; + + /** + * Returns named entity type (denoted by its index) as a string. + * + * @param nameNum - name index in the tagset. + * @param charsetConverter - the charset converter used to convert from tagset internal encoding (UTF-8) into target encoding. + * @return - the named entity type encoded as string. + */ const std::string getName(const int nameNum, const CharsetConverter& charsetConverter) const; private: std::vector<std::string> tags; diff --git a/morfeusz/charset/CaseConverter.cpp b/morfeusz/case/CaseConverter.cpp index 740915d..740915d 100644 --- a/morfeusz/charset/CaseConverter.cpp +++ b/morfeusz/case/CaseConverter.cpp diff --git a/morfeusz/charset/CaseConverter.hpp b/morfeusz/case/CaseConverter.hpp index 99c19da..99c19da 100644 --- a/morfeusz/charset/CaseConverter.hpp +++ b/morfeusz/case/CaseConverter.hpp diff --git a/morfeusz/CasePatternHelper.cpp b/morfeusz/case/CasePatternHelper.cpp index 4a9c1b2..4a9c1b2 100644 --- a/morfeusz/CasePatternHelper.cpp +++ b/morfeusz/case/CasePatternHelper.cpp diff --git a/morfeusz/CasePatternHelper.hpp b/morfeusz/case/CasePatternHelper.hpp index 6b9b0ea..27ee131 100644 --- a/morfeusz/CasePatternHelper.hpp +++ b/morfeusz/case/CasePatternHelper.hpp @@ -10,12 +10,15 @@ #include <vector> #include "InterpsGroup.hpp" -#include "CasePatternHelper.hpp" -#include "compressionByteUtils.hpp" +#include "deserialization/morphInterps/compressionByteUtils.hpp" #include "Environment.hpp" class Environment; +/** + * Utility class used to for case-sensitive interpretations filtering + * (ie. to filter out "berlin" and keep "Berlin") + */ class CasePatternHelper { public: @@ -23,10 +26,23 @@ public: } + /** + * Set if this case pattern helper cares about case-sensitivity + * + * @param caseSensitive + */ void setCaseSensitive(bool caseSensitive) { this->caseSensitive = caseSensitive; } - + + /** + * Check if given word matches given case pattern + * + * @param lowercaseCodepoints - codepoints of checked word converter to lowercase + * @param originalCodepoints - codepoints of checked word + * @param casePattern - vector representing case pattern ( ie. [False, True] for "mBank") + * @return - true iff word denoted by given codepoints matches given case pattern + */ bool checkCasePattern( const std::vector<uint32_t>& lowercaseCodepoints, const std::vector<uint32_t>& originalCodepoints, @@ -41,12 +57,28 @@ public: return true; } + /** + * Check if given word has a chance of matching any of case patterns in given interps group. + * + * @param env - environment + * @param orthStart - pointer to start of word + * @param orthEnd - pointer to end of word + * @param ig - interps group + * @return - true iff word encoded from orthStart to orthEnd + * matches at least one of the interp group's morph interpretation's case pattern. + */ bool checkInterpsGroupOrthCasePatterns( const Environment& env, const char* orthStart, const char* orthEnd, const InterpsGroup& ig) const; + /** + * Deserializes case pattern encoded at given pointer. + * + * @param ptr + * @return - case pattern + */ static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr); private: bool caseSensitive; diff --git a/morfeusz/charset/caseconv.cpp b/morfeusz/case/caseconv.cpp index 67252d7..67252d7 100644 --- a/morfeusz/charset/caseconv.cpp +++ b/morfeusz/case/caseconv.cpp diff --git a/morfeusz/charset/caseconv.hpp b/morfeusz/case/caseconv.hpp index 4858cec..9e870d2 100644 --- a/morfeusz/charset/caseconv.hpp +++ b/morfeusz/case/caseconv.hpp @@ -8,6 +8,10 @@ #ifndef CASECONV_HPP #define CASECONV_HPP +/* + * Case conversion tables + */ + extern const unsigned int TO_LOWERCASE_TABLE_SIZE; extern const unsigned int EXT_TO_LOWERCASE_TABLE_SIZE; extern const uint32_t TO_LOWERCASE_TABLE[]; diff --git a/morfeusz/charset/CharsetConverter.cpp b/morfeusz/charset/CharsetConverter.cpp index 0db86bf..b588562 100644 --- a/morfeusz/charset/CharsetConverter.cpp +++ b/morfeusz/charset/CharsetConverter.cpp @@ -4,7 +4,7 @@ #include <algorithm> #include <inttypes.h> #include <iostream> -#include "../endianness.hpp" +#include "deserialization/endianness.hpp" #include "utf8.h" #include "CharsetConverter.hpp" #include "conversion_tables.hpp" diff --git a/morfeusz/charset/TextReader.hpp b/morfeusz/charset/TextReader.hpp index f74c10b..7fe8d49 100644 --- a/morfeusz/charset/TextReader.hpp +++ b/morfeusz/charset/TextReader.hpp @@ -8,7 +8,7 @@ #ifndef TEXTREADER_HPP #define TEXTREADER_HPP -#include "../Environment.hpp" +#include "Environment.hpp" class TextReader { public: diff --git a/morfeusz/outputUtils.hpp b/morfeusz/cli/outputUtils.hpp index 6ef2587..6ef2587 100644 --- a/morfeusz/outputUtils.hpp +++ b/morfeusz/cli/outputUtils.hpp diff --git a/morfeusz/deserializer/InterpsGroupsReader.cpp b/morfeusz/deserialization/InterpsGroupsReader.cpp index 17d0eca..4fbd977 100644 --- a/morfeusz/deserializer/InterpsGroupsReader.cpp +++ b/morfeusz/deserialization/InterpsGroupsReader.cpp @@ -6,7 +6,7 @@ */ #include "InterpsGroupsReader.hpp" -#include "../deserializationUtils.hpp" +#include "deserialization/deserializationUtils.hpp" InterpsGroupsReader::InterpsGroupsReader() : currPtr(NULL), endPtr(NULL) { diff --git a/morfeusz/deserializer/InterpsGroupsReader.hpp b/morfeusz/deserialization/InterpsGroupsReader.hpp index 08df711..08df711 100644 --- a/morfeusz/deserializer/InterpsGroupsReader.hpp +++ b/morfeusz/deserialization/InterpsGroupsReader.hpp diff --git a/morfeusz/deserializer/MorphDeserializer.cpp b/morfeusz/deserialization/MorphDeserializer.cpp index ed9fd70..44c01ca 100644 --- a/morfeusz/deserializer/MorphDeserializer.cpp +++ b/morfeusz/deserialization/MorphDeserializer.cpp @@ -6,7 +6,7 @@ */ #include "MorphDeserializer.hpp" -#include "../deserializationUtils.hpp" +#include "deserialization/deserializationUtils.hpp" MorphDeserializer::MorphDeserializer() { } diff --git a/morfeusz/deserializer/MorphDeserializer.hpp b/morfeusz/deserialization/MorphDeserializer.hpp index 3ffaa03..3ffaa03 100644 --- a/morfeusz/deserializer/MorphDeserializer.hpp +++ b/morfeusz/deserialization/MorphDeserializer.hpp diff --git a/morfeusz/deserializationUtils.hpp b/morfeusz/deserialization/deserializationUtils.hpp index 8f072ed..8f072ed 100644 --- a/morfeusz/deserializationUtils.hpp +++ b/morfeusz/deserialization/deserializationUtils.hpp diff --git a/morfeusz/endianness.hpp b/morfeusz/deserialization/endianness.hpp index 62a7717..62a7717 100644 --- a/morfeusz/endianness.hpp +++ b/morfeusz/deserialization/endianness.hpp diff --git a/morfeusz/EncodedInterpretation.hpp b/morfeusz/deserialization/morphInterps/EncodedInterpretation.hpp index aa54703..aa54703 100644 --- a/morfeusz/EncodedInterpretation.hpp +++ b/morfeusz/deserialization/morphInterps/EncodedInterpretation.hpp diff --git a/morfeusz/decoder/InterpretedChunksDecoder.cpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder.cpp index 8075cf1..8075cf1 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder.cpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder.cpp diff --git a/morfeusz/decoder/InterpretedChunksDecoder.hpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder.hpp index 94676a6..8b59a43 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder.hpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder.hpp @@ -16,11 +16,10 @@ #include "EncodedInterpretation.hpp" #include "InterpretedChunk.hpp" #include "EncodedInterpretation.hpp" -#include "charset/CaseConverter.hpp" +#include "case/CaseConverter.hpp" #include "Environment.hpp" #include "MorphInterpretation.hpp" -#include "CasePatternHelper.hpp" -#include "deserializationUtils.hpp" +#include "case/CasePatternHelper.hpp" #include "compressionByteUtils.hpp" #include "const.hpp" diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.cpp index 1da1bd9..1da1bd9 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.cpp diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp index 79bc2de..79bc2de 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Generator.cpp index f9264bc..f9264bc 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Generator.cpp diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp index f2a3b3f..f2a3b3f 100644 --- a/morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp +++ b/morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp diff --git a/morfeusz/compressionByteUtils.hpp b/morfeusz/deserialization/morphInterps/compressionByteUtils.hpp index d6d3a20..d6d3a20 100644 --- a/morfeusz/compressionByteUtils.hpp +++ b/morfeusz/deserialization/morphInterps/compressionByteUtils.hpp diff --git a/morfeusz/fsa/cfsa1_impl.hpp b/morfeusz/fsa/cfsa1_impl.hpp index a25b0aa..86920ee 100644 --- a/morfeusz/fsa/cfsa1_impl.hpp +++ b/morfeusz/fsa/cfsa1_impl.hpp @@ -12,7 +12,7 @@ #include <climits> #include "fsa.hpp" -#include "../deserializationUtils.hpp" +#include "../deserialization/deserializationUtils.hpp" static const unsigned char CFSA1_ACCEPTING_FLAG = 128; //static const unsigned char CFSA1_ARRAY_FLAG = 64; diff --git a/morfeusz/fsa/cfsa2_impl.hpp b/morfeusz/fsa/cfsa2_impl.hpp index dc1b901..780cbbc 100644 --- a/morfeusz/fsa/cfsa2_impl.hpp +++ b/morfeusz/fsa/cfsa2_impl.hpp @@ -13,7 +13,7 @@ #include <iostream> #include "fsa.hpp" #include "../utils.hpp" -#include "../endianness.hpp" +#include "../deserialization/endianness.hpp" static const unsigned char HAS_REMAINING_FLAG = 128; static const unsigned char ACCEPTING_FLAG = 64; diff --git a/morfeusz/fsa/fsa_impl.hpp b/morfeusz/fsa/fsa_impl.hpp index e62a1ba..cd7e27c 100644 --- a/morfeusz/fsa/fsa_impl.hpp +++ b/morfeusz/fsa/fsa_impl.hpp @@ -17,7 +17,7 @@ #include <sstream> #include "const.hpp" #include "../utils.hpp" -#include "../endianness.hpp" +#include "../deserialization/endianness.hpp" //using namespace std; //static const unsigned int FSA_OFFSET = 6; diff --git a/morfeusz/java/dupa b/morfeusz/java/dupa deleted file mode 100644 index a0fcd34..0000000 --- a/morfeusz/java/dupa +++ /dev/null @@ -1,36 +0,0 @@ - -# SWIG -#set(CMAKE_SWIG_OUTDIR swig) -#FIND_PACKAGE(SWIG REQUIRED) -FIND_PACKAGE(JNI REQUIRED) -#INCLUDE(${SWIG_USE_FILE}) -include(UseJava) - -# SWIG Java -include_directories (${JAVA_INCLUDE_PATH}) -include_directories (..) - -set (SWIG_JAVA_OUTFILE swigJAVA.cpp) -# set (JAVA_WRAPPER_FILE ${CMAKE_SHARED_LIBRARY_PREFIX}morfeusz${CMAKE_SHARED_LIBRARY_SUFFIX}) -add_custom_command ( - OUTPUT ${SWIG_JAVA_OUTFILE} - COMMAND swig -java -c++ -package pl.waw.ipipan.morfeusz -o ${SWIG_JAVA_OUTFILE} -outdir ${CMAKE_SOURCE_DIR}/jmorfeusz/src/main/java/pl/waw/ipipan/morfeusz ${CMAKE_SOURCE_DIR}/morfeusz/morfeusz.i - DEPENDS libmorfeusz -) -#set (CMAKE_SHARED_LINKER_FLAGS "-s -Os -static-libstdc++ -static-libgcc") -add_library (jmorfeusz SHARED ${SWIG_JAVA_OUTFILE}) -target_link_libraries (jmorfeusz ${JAVA_LIBRARIES} libmorfeusz) -add_dependencies (jmorfeusz ${SWIG_JAVA_OUTFILE}) - -#set (CMAKE_SWIG_FLAGS -package pl.waw.ipipan.morfeusz) -#set (CMAKE_SWIG_OUTDIR ${CMAKE_SOURCE_DIR}/jmorfeusz/src/main/java/pl/waw/ipipan/morfeusz) - -#set_source_files_properties (../morfeusz.i PROPERTIES CPLUSPLUS ON) -#SWIG_ADD_MODULE(jmorfeusz java ../morfeusz.i) -#SWIG_LINK_LIBRARIES(jmorfeusz ${JAVA_LIBRARIES}) -#SWIG_LINK_LIBRARIES(jmorfeusz libmorfeusz) - -#if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") -# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -# set (CMAKE_SHARED_LINKER_FLAGS "-s -Os -static-libstdc++ -static-libgcc") -#endif () diff --git a/morfeusz/morfeusz_analyzer.cpp b/morfeusz/morfeusz_analyzer.cpp index 028e6ad..7d2df30 100644 --- a/morfeusz/morfeusz_analyzer.cpp +++ b/morfeusz/morfeusz_analyzer.cpp @@ -12,14 +12,16 @@ #include "fsa/fsa.hpp" #include "Tagset.hpp" #include "Morfeusz.hpp" +#include "MorfeuszVersion.hpp" #include "const.hpp" #include "cli/cli.hpp" -#include "outputUtils.hpp" +#include "cli/outputUtils.hpp" using namespace std; int main(int argc, const char** argv) { + cerr << "Morfeusz analyzer, version: " << MORFEUSZ_VERSION << endl; ez::ezOptionParser& opt = *getOptions(argc, argv, ANALYZER); Morfeusz morfeusz; initializeMorfeusz(opt, morfeusz); diff --git a/morfeusz/morfeusz_generator.cpp b/morfeusz/morfeusz_generator.cpp index e98f40d..2b1cbe3 100644 --- a/morfeusz/morfeusz_generator.cpp +++ b/morfeusz/morfeusz_generator.cpp @@ -11,13 +11,15 @@ #include "fsa/fsa.hpp" #include "Tagset.hpp" #include "Morfeusz.hpp" +#include "MorfeuszVersion.hpp" #include "const.hpp" #include "cli/cli.hpp" -#include "outputUtils.hpp" +#include "cli/outputUtils.hpp" using namespace std; int main(int argc, const char** argv) { + cerr << "Morfeusz generator, version: " << MORFEUSZ_VERSION << endl; ez::ezOptionParser& opt = *getOptions(argc, argv, GENERATOR); Morfeusz morfeusz; initializeMorfeusz(opt, morfeusz); diff --git a/morfeusz/segrules/SegrulesFSA.hpp b/morfeusz/segrules/SegrulesFSA.hpp index 633bb34..d236e38 100644 --- a/morfeusz/segrules/SegrulesFSA.hpp +++ b/morfeusz/segrules/SegrulesFSA.hpp @@ -10,7 +10,7 @@ #include <set> #include <iostream> -#include "../deserializationUtils.hpp" +#include "../deserialization/deserializationUtils.hpp" struct SegrulesState { uint16_t offset; diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp index e820176..daf53ee 100644 --- a/morfeusz/segrules/segrules.cpp +++ b/morfeusz/segrules/segrules.cpp @@ -1,8 +1,8 @@ #include "segrules.hpp" -#include "../fsa/fsa.hpp" -#include "../fsa/const.hpp" -#include "../deserializationUtils.hpp" +#include "fsa/fsa.hpp" +#include "fsa/const.hpp" +#include "deserialization/deserializationUtils.hpp" using namespace std; diff --git a/morfeusz/consoleUtils.hpp b/morfeusz/test/consoleUtils.hpp index 4513ad9..4513ad9 100644 --- a/morfeusz/consoleUtils.hpp +++ b/morfeusz/test/consoleUtils.hpp diff --git a/morfeusz/test_recognize_dict.cpp b/morfeusz/test/test_recognize_dict.cpp index 5f16ba0..ca8d2ff 100644 --- a/morfeusz/test_recognize_dict.cpp +++ b/morfeusz/test/test_recognize_dict.cpp @@ -8,7 +8,6 @@ //#include <cstdlib> #include <sstream> #include <iostream> -#include "EncodedInterpretation.hpp" #include "utils.hpp" #include "Morfeusz.hpp" #include "MorphInterpretation.hpp" diff --git a/morfeusz/test_result_equals.cpp b/morfeusz/test/test_result_equals.cpp index 98b83b7..98b83b7 100644 --- a/morfeusz/test_result_equals.cpp +++ b/morfeusz/test/test_result_equals.cpp diff --git a/morfeusz/test_synth_dict.cpp b/morfeusz/test_synth_dict.cpp deleted file mode 100644 index 4573ae1..0000000 --- a/morfeusz/test_synth_dict.cpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * File: test_synth_dict.cpp - * Author: mlenart - * - * Created on 21 styczeń 2014, 12:00 - */ - -#include <cstdlib> - -using namespace std; - -/* - * - */ -int main(int argc, char** argv) { - - return 0; -} - diff --git a/morfeusz/wrappers/CMakeLists.txt b/morfeusz/wrappers/CMakeLists.txt new file mode 100644 index 0000000..e3b49ef --- /dev/null +++ b/morfeusz/wrappers/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory (java) +add_subdirectory (python) +add_subdirectory (perl) \ No newline at end of file diff --git a/morfeusz/java/CMakeLists.txt b/morfeusz/wrappers/java/CMakeLists.txt index e58cc0e..c416afc 100644 --- a/morfeusz/java/CMakeLists.txt +++ b/morfeusz/wrappers/java/CMakeLists.txt @@ -6,14 +6,14 @@ find_package(Java REQUIRED) include_directories (${JAVA_INCLUDE_PATH}) include_directories (${JAVA_INCLUDE_PATH2}) -include_directories (..) +include_directories (${CMAKE_SOURCE_DIR}/morfeusz) set (SWIG_JAVA_OUTFILE "${CMAKE_CURRENT_BINARY_DIR}/swigJAVA.cpp") file (COPY pl DESTINATION .) set (JAVA_SRC_DIR "${CMAKE_CURRENT_BINARY_DIR}/pl/waw/ipipan/morfeusz") add_custom_command ( OUTPUT ${SWIG_JAVA_OUTFILE} - COMMAND swig -java -c++ -package pl.waw.ipipan.morfeusz -o ${SWIG_JAVA_OUTFILE} -outdir ${JAVA_SRC_DIR} ${CMAKE_SOURCE_DIR}/morfeusz/morfeusz.i + COMMAND swig -java -c++ -package pl.waw.ipipan.morfeusz -o ${SWIG_JAVA_OUTFILE} -outdir ${JAVA_SRC_DIR} ${CMAKE_SOURCE_DIR}/morfeusz/wrappers/morfeusz.i DEPENDS libmorfeusz ) add_custom_target(generate_java_wrapper ALL diff --git a/morfeusz/java/README b/morfeusz/wrappers/java/README index 575a5ff..575a5ff 100644 --- a/morfeusz/java/README +++ b/morfeusz/wrappers/java/README diff --git a/morfeusz/java/pl/waw/ipipan/morfeusz/app/App.java b/morfeusz/wrappers/java/pl/waw/ipipan/morfeusz/app/App.java index aa58dd9..aa58dd9 100644 --- a/morfeusz/java/pl/waw/ipipan/morfeusz/app/App.java +++ b/morfeusz/wrappers/java/pl/waw/ipipan/morfeusz/app/App.java diff --git a/morfeusz/java/pl/waw/ipipan/morfeusz/app/MorfeuszUtils.java b/morfeusz/wrappers/java/pl/waw/ipipan/morfeusz/app/MorfeuszUtils.java index 86022a9..86022a9 100644 --- a/morfeusz/java/pl/waw/ipipan/morfeusz/app/MorfeuszUtils.java +++ b/morfeusz/wrappers/java/pl/waw/ipipan/morfeusz/app/MorfeuszUtils.java diff --git a/morfeusz/morfeusz.i b/morfeusz/wrappers/morfeusz.i index 354bf41..ec93798 100644 --- a/morfeusz/morfeusz.i +++ b/morfeusz/wrappers/morfeusz.i @@ -91,10 +91,10 @@ import java.io.IOException; %ignore Tagset::Tagset(const unsigned char* fsaData); -%include "Morfeusz.hpp" -%include "MorphInterpretation.hpp" -%include "const.hpp" -%include "exceptions.hpp" +%include "../Morfeusz.hpp" +%include "../MorphInterpretation.hpp" +%include "../const.hpp" +%include "../exceptions.hpp" // instantiate vector of interpretations namespace std { diff --git a/morfeusz/perl/CMakeLists.txt b/morfeusz/wrappers/perl/CMakeLists.txt index 0ccf02c..58ee51e 100644 --- a/morfeusz/perl/CMakeLists.txt +++ b/morfeusz/wrappers/perl/CMakeLists.txt @@ -6,7 +6,7 @@ if (NOT CMAKE_CROSSCOMPILING) find_package (PerlLibs REQUIRED) include_directories (${PERL_INCLUDE_PATH}) - include_directories (..) + include_directories (../..) set (CMAKE_SWIG_FLAGS "") diff --git a/morfeusz/python/CMakeLists.txt b/morfeusz/wrappers/python/CMakeLists.txt index 42950c1..4ef7793 100644 --- a/morfeusz/python/CMakeLists.txt +++ b/morfeusz/wrappers/python/CMakeLists.txt @@ -9,14 +9,14 @@ set (PYMORFEUSZ_VERSION "0.1.0") # SWIG Java INCLUDE_DIRECTORIES (${PYTHON_INCLUDE_PATH}) -INCLUDE_DIRECTORIES (..) +INCLUDE_DIRECTORIES (../..) set (SWIG_PYTHON_OUTFILE_CXX "${CMAKE_CURRENT_BINARY_DIR}/swigPYTHON.cpp") set (SWIG_PYTHON_OUTFILE_PY "${CMAKE_CURRENT_BINARY_DIR}/morfeusz2.py") add_custom_command ( OUTPUT "${SWIG_PYTHON_OUTFILE_CXX}" "${SWIG_PYTHON_OUTFILE_PY}" - COMMAND swig -python -c++ -o "${SWIG_PYTHON_OUTFILE_CXX}" "${CMAKE_SOURCE_DIR}/morfeusz/morfeusz.i" + COMMAND swig -python -c++ -o "${SWIG_PYTHON_OUTFILE_CXX}" "${CMAKE_SOURCE_DIR}/morfeusz/wrappers/morfeusz.i" DEPENDS libmorfeusz ) add_custom_target (generate_python_wrapper diff --git a/morfeusz/python/setup.py.in b/morfeusz/wrappers/python/setup.py.in index 856ff15..856ff15 100644 --- a/morfeusz/python/setup.py.in +++ b/morfeusz/wrappers/python/setup.py.in diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index 54102cd..e3c06ab 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -5,6 +5,14 @@ displayName="build" projectFiles="true" root="build"> + <logicalFolder name="morfeusz" displayName="morfeusz" projectFiles="true"> + <logicalFolder name="wrappers" displayName="wrappers" projectFiles="true"> + <logicalFolder name="java" displayName="java" projectFiles="true"> + <itemPath>build/morfeusz/wrappers/java/swigJAVA.cpp</itemPath> + </logicalFolder> + <itemPath>build/morfeusz/wrappers/morfeuszPERL_wrap.cxx</itemPath> + </logicalFolder> + </logicalFolder> <itemPath>build/default_fsa.cpp</itemPath> <itemPath>build/default_synth_fsa.cpp</itemPath> </logicalFolder> @@ -17,22 +25,25 @@ <itemPath>build1/morfeusz/java/swigJAVA.cpp</itemPath> </logicalFolder> <df root="morfeusz" name="0"> - <df name="charset"> + <df name="case"> <in>CaseConverter.cpp</in> + <in>CasePatternHelper.cpp</in> + <in>caseconv.cpp</in> + </df> + <df name="charset"> <in>CharsetConverter.cpp</in> <in>TextReader.cpp</in> - <in>caseconv.cpp</in> <in>conversion_tables.cpp</in> </df> <df name="cli"> <in>cli.cpp</in> </df> - <df name="decoder"> - <in>InterpretedChunksDecoder.cpp</in> - <in>InterpretedChunksDecoder4Analyzer.cpp</in> - <in>InterpretedChunksDecoder4Generator.cpp</in> - </df> - <df name="deserializer"> + <df name="deserialization"> + <df name="morphInterps"> + <in>InterpretedChunksDecoder.cpp</in> + <in>InterpretedChunksDecoder4Analyzer.cpp</in> + <in>InterpretedChunksDecoder4Generator.cpp</in> + </df> <in>InterpsGroupsReader.cpp</in> <in>MorphDeserializer.cpp</in> </df> @@ -46,20 +57,19 @@ <in>SegrulesFSA.cpp</in> <in>segrules.cpp</in> </df> - <in>CasePatternHelper.cpp</in> + <df name="test"> + <in>test_recognize_dict.cpp</in> + <in>test_result_equals.cpp</in> + </df> <in>Environment.cpp</in> <in>InflexionGraph.cpp</in> <in>Morfeusz.cpp</in> - <in>MorphDeserializer.cpp</in> <in>MorphInterpretation.cpp</in> <in>Qualifiers.cpp</in> <in>Tagset.cpp</in> <in>const.cpp</in> - <in>main.cpp</in> <in>morfeusz_analyzer.cpp</in> <in>morfeusz_generator.cpp</in> - <in>test_recognize_dict.cpp</in> - <in>test_result_equals.cpp</in> </df> <logicalFolder name="morfeusz" displayName="morfeusz" @@ -101,10 +111,9 @@ <rebuildPropChanged>false</rebuildPropChanged> </toolsSet> <flagsDictionary> - <element flagsID="0" commonFlags="-O2 -std=c++98"/> - <element flagsID="1" commonFlags="-O2 -std=c++98 -fPIC"/> - <element flagsID="2" commonFlags="-std=c++98 -O3"/> - <element flagsID="3" commonFlags="-std=c++98 -O3 -fPIC"/> + <element flagsID="0" commonFlags="-std=c++98 -O3"/> + <element flagsID="1" commonFlags="-std=c++98 -O3 -fPIC"/> + <element flagsID="2" commonFlags="3"/> </flagsDictionary> <codeAssistance> </codeAssistance> @@ -114,18 +123,49 @@ <buildCommand>${MAKE} -f Makefile</buildCommand> <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> <executablePath>build/morfeusz/morfeusz_analyzer</executablePath> + <ccTool flags="1"> + <incDir> + <pElem>build</pElem> + <pElem>morfeusz</pElem> + </incDir> + <preprocessorList> + <Elem>NDEBUG</Elem> + </preprocessorList> + </ccTool> </makeTool> </makefileType> <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool> + <incDir> + <pElem>build/morfeusz</pElem> + </incDir> + <preprocessorList> + <Elem>__PIC__=2</Elem> + <Elem>__pic__=2</Elem> + <Elem>libmorfeusz_EXPORTS</Elem> + </preprocessorList> + <undefinedList> + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> + </undefinedList> + </ccTool> </item> <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool> + <incDir> + <pElem>build/morfeusz</pElem> + </incDir> + <preprocessorList> + <Elem>__PIC__=2</Elem> + <Elem>__pic__=2</Elem> + <Elem>libmorfeusz_EXPORTS</Elem> + </preprocessorList> + <undefinedList> + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> + </undefinedList> + </ccTool> </item> <item path="build/morfeusz/default_fsa.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> - <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> - </incDir> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> @@ -136,17 +176,13 @@ tool="1" flavor2="4"> <ccTool flags="1"> - <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> - </incDir> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> </item> <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="3"> + <ccTool flags="1"> </ccTool> </item> <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" @@ -155,14 +191,11 @@ flavor2="8"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> <pElem>/usr/lib/jvm/default-java/include</pElem> <pElem>build/morfeusz/java</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>_OPTIMIZE__=1</Elem> <Elem>__PIC__=2</Elem> <Elem>__pic__=2</Elem> @@ -181,8 +214,6 @@ flavor2="4"> <ccTool flags="1"> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>/usr/lib/perl/5.14/CORE</pElem> <pElem>build/morfeusz/perl</pElem> </incDir> @@ -197,14 +228,11 @@ flavor2="8"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> <pElem>/usr/include/python2.7</pElem> <pElem>build/morfeusz/python</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>_OPTIMIZE__=1</Elem> <Elem>__PIC__=2</Elem> <Elem>__pic__=2</Elem> @@ -218,22 +246,40 @@ </ccTool> </item> <item path="build/morfeusz/python/swigPYTHON.cpp" + ex="true" + tool="3" + flavor2="4"> + </item> + <item path="build/morfeusz/wrappers/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> + </item> + <item path="build/morfeusz/wrappers/morfeuszPERL_wrap.cxx" + ex="false" + tool="1" + flavor2="4"> + <ccTool flags="1"> + <incDir> + <pElem>/usr/lib/perl/5.14/CORE</pElem> + <pElem>build/morfeusz/wrappers/perl</pElem> + </incDir> + <preprocessorList> + <Elem>morfeusz_perl_EXPORTS</Elem> + </preprocessorList> + </ccTool> </item> <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> <pElem>morfeusz/build/morfeusz</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> @@ -241,13 +287,10 @@ <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> <pElem>morfeusz/build/morfeusz</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> @@ -255,34 +298,32 @@ <folder path="0"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> </incDir> </ccTool> </folder> - <folder path="0/charset"> + <folder path="0/case"> <ccTool> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> </folder> - <folder path="0/cli"> + <folder path="0/charset"> <ccTool> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> </folder> - <folder path="0/decoder"> + <folder path="0/cli"> <ccTool> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> </folder> - <folder path="0/deserializer"> + <folder path="0/deserialization"> <ccTool> <preprocessorList> <Elem>libmorfeusz_EXPORTS</Elem> @@ -303,35 +344,24 @@ </preprocessorList> </ccTool> </folder> - <folder path="build"> + <folder path="build/morfeusz/wrappers/java"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> - <pElem>build/morfeusz</pElem> + <pElem>/usr/lib/jvm/default-java/include</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> - <Elem>__PIC__=2</Elem> - <Elem>__pic__=2</Elem> - <Elem>libmorfeusz_EXPORTS</Elem> + <Elem>libjmorfeusz_EXPORTS</Elem> </preprocessorList> - <undefinedList> - <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> - </undefinedList> </ccTool> </folder> <folder path="java"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> <pElem>build1</pElem> <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libjmorfeusz_EXPORTS</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> @@ -340,12 +370,9 @@ <folder path="morfeusz/java"> <ccTool> <incDir> - <pElem>build</pElem> - <pElem>morfeusz</pElem> - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> + <pElem>/usr/lib/jvm/default-java/include</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libjmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> @@ -353,21 +380,10 @@ <folder path="morfeusz/python"> <ccTool> <incDir> - <pElem>morfeusz</pElem> <pElem>/usr/include/python2.7</pElem> </incDir> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> </ccTool> </folder> - <item path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> - <preprocessorList> - <Elem>libmorfeusz_EXPORTS</Elem> - </preprocessorList> - </ccTool> - </item> <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> <preprocessorList> @@ -383,17 +399,8 @@ </ccTool> </item> <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="3"> - <preprocessorList> - <Elem>NDEBUG</Elem> - <Elem>libmorfeusz_EXPORTS</Elem> - </preprocessorList> - </ccTool> - </item> - <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="3"> + <ccTool flags="2"> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> @@ -419,36 +426,36 @@ </preprocessorList> </ccTool> </item> - <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/case/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> </ccTool> </item> - <item path="morfeusz/charset/CharsetConverter.cpp" + <item path="morfeusz/case/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="3"> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> - </ccTool> - </item> - <item path="morfeusz/charset/TextReader.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> </ccTool> </item> - <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/case/caseconv.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> </ccTool> </item> + <item path="morfeusz/charset/CharsetConverter.cpp" + ex="false" + tool="1" + flavor2="4"> + </item> + <item path="morfeusz/charset/TextReader.cpp" ex="false" tool="1" flavor2="4"> + </item> <item path="morfeusz/charset/conversion_tables.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> - </ccTool> </item> <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> @@ -457,29 +464,31 @@ </preprocessorList> </ccTool> </item> - <item path="morfeusz/decoder/InterpretedChunksDecoder.cpp" + <item path="morfeusz/deserialization/InterpsGroupsReader.cpp" ex="false" tool="1" flavor2="4"> </item> - <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp" + <item path="morfeusz/deserialization/MorphDeserializer.cpp" ex="false" tool="1" flavor2="4"> </item> - <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp" + <item path="morfeusz/deserialization/morphInterps/InterpretedChunksDecoder.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> - <item path="morfeusz/deserializer/InterpsGroupsReader.cpp" + <item path="morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> </ccTool> </item> - <item path="morfeusz/deserializer/MorphDeserializer.cpp" + <item path="morfeusz/deserialization/morphInterps/InterpretedChunksDecoder4Generator.cpp" ex="false" tool="1" flavor2="4"> @@ -496,9 +505,6 @@ <incDir> <pElem>build/fsa</pElem> </incDir> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> </ccTool> </item> <item path="morfeusz/fsa/test_recognize.cpp" ex="false" tool="1" flavor2="8"> @@ -506,9 +512,6 @@ <incDir> <pElem>build/fsa</pElem> </incDir> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> </ccTool> </item> <item path="morfeusz/fsa/test_speed.cpp" ex="false" tool="1" flavor2="8"> @@ -516,43 +519,33 @@ <incDir> <pElem>build/fsa</pElem> </incDir> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> - </ccTool> - </item> - <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="4"> - <ccTool> - <preprocessorList> - <Elem>NDEBUG</Elem> - <Elem>_OPTIMIZE__=1</Elem> - <Elem>libmorfeusz_EXPORTS</Elem> - </preprocessorList> - <undefinedList> - <Elem>__NO_INLINE__</Elem> - </undefinedList> </ccTool> </item> <item path="morfeusz/morfeusz_analyzer.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="2"> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> + <ccTool flags="0"> </ccTool> </item> <item path="morfeusz/morfeusz_generator.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="0"> + </ccTool> </item> <item path="morfeusz/segrules/SegrulesFSA.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> - </ccTool> </item> <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> - </ccTool> </item> - <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/test/test_recognize_dict.cpp" + ex="false" + tool="1" + flavor2="4"> + <ccTool flags="0"> + </ccTool> </item> - <item path="morfeusz/test_result_equals.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/test/test_result_equals.cpp" + ex="false" + tool="1" + flavor2="4"> + <ccTool flags="0"> + </ccTool> </item> </conf> </confs> diff --git a/profile.sh b/profile.sh index b16f96c..c607808 100755 --- a/profile.sh +++ b/profile.sh @@ -4,9 +4,9 @@ rm -rf profbuild mkdir -p profbuild cd profbuild cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_CXX_FLAGS="-g -O2" -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" .. -make -j4 +make rm -f /tmp/morfeusz.prof export LD_PRELOAD="/usr/lib/libprofiler.so" export CPUPROFILE="/tmp/morfeusz.prof" -morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents30k > /dev/null +morfeusz/morfeusz_analyzer -i /home/wkieras/output/sgjp_analyzer.fsa < /mnt/storage/morfeusz/sents10k > /dev/null ### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof diff --git a/morfeusz/Toolchain-Darwin-amd64.cmake b/toolchains/Toolchain-Darwin-amd64.cmake index f35ac3d..f35ac3d 100644 --- a/morfeusz/Toolchain-Darwin-amd64.cmake +++ b/toolchains/Toolchain-Darwin-amd64.cmake diff --git a/morfeusz/Toolchain-Linux-amd64.cmake b/toolchains/Toolchain-Linux-amd64.cmake index 8ebcce5..8ebcce5 100644 --- a/morfeusz/Toolchain-Linux-amd64.cmake +++ b/toolchains/Toolchain-Linux-amd64.cmake diff --git a/morfeusz/Toolchain-Linux-i386.cmake b/toolchains/Toolchain-Linux-i386.cmake index 2c648af..2c648af 100644 --- a/morfeusz/Toolchain-Linux-i386.cmake +++ b/toolchains/Toolchain-Linux-i386.cmake diff --git a/morfeusz/Toolchain-Windows-amd64.cmake b/toolchains/Toolchain-Windows-amd64.cmake index cf742eb..cf742eb 100644 --- a/morfeusz/Toolchain-Windows-amd64.cmake +++ b/toolchains/Toolchain-Windows-amd64.cmake diff --git a/morfeusz/Toolchain-Windows-i386.cmake b/toolchains/Toolchain-Windows-i386.cmake index 0686d7b..0686d7b 100644 --- a/morfeusz/Toolchain-Windows-i386.cmake +++ b/toolchains/Toolchain-Windows-i386.cmake