// Morfeusz.hpp 5.54 KB
/* 
 * File:   Morfeusz.hpp
 * Author: mlenart
 *
 * Created on November 13, 2013, 5:21 PM
 */

#ifndef MORFEUSZ_HPP
#define	MORFEUSZ_HPP

#include <string>
#include <list>
#include <vector>
#include <map>
#include <set>
#include "EncodedInterpretation.hpp"
#include "fsa/fsa.hpp"
#include "MorphInterpretation.hpp"
#include "InterpsGroup.hpp"
#include "charset/CharsetConverter.hpp"
#include "charset/CaseConverter.hpp"
#include "InterpretedChunk.hpp"
#include "InflexionGraph.hpp"
#include "MorfeuszOptions.hpp"
#include "const.hpp"
#include "exceptions.hpp"
#include "Environment.hpp"

#include "segrules/segrules.hpp"
#include "segrules/SegrulesFSA.hpp"

// Forward declarations: Morfeusz and ResultsIterator declare each other as
// friends and Morfeusz methods return ResultsIterator, so both names have to
// be known before either class definition.
class Morfeusz;
class ResultsIterator;

// Shorthand for the State template instantiated with a vector of InterpsGroup.
// NOTE(review): State is presumably the automaton state type from fsa/fsa.hpp - confirm.
typedef State< std::vector<InterpsGroup > > StateType;

/**
 * Performs morphological analysis (analyze methods) and synthesis (generate methods).
 * 
 * It is NOT thread-safe,
 * but it is possible to use a separate Morfeusz instance for each concurrent thread.
 */
class Morfeusz {
public:
    
    /**
     * Create a new instance of the Morfeusz class.
     */
    Morfeusz();
    
    /**
     * Set the file used for morphological analysis.
     * 
     * @param filename - name of the file to use
     */
    void setAnalyzerFile(const std::string& filename);
    
    /**
     * Set the file used for morphological synthesis.
     * 
     * @param filename - name of the file to use
     */
    void setGeneratorFile(const std::string& filename);
    
    /**
     * Destroys the Morfeusz object.
     */
    virtual ~Morfeusz();
    
    /**
     * Analyze the given text and return the results as an iterator.
     * 
     * @param text - text for morphological analysis
     * @return - iterator over morphological analysis results
     */
    ResultsIterator analyze(const std::string& text) const;
    
    /**
     * Perform morphological analysis on the given text and put the results in a vector.
     * 
     * @param text - text to be analyzed
     * @param result - results vector
     */
    void analyze(const std::string& text, std::vector<MorphInterpretation>& result) const;
    
    /**
     * Perform morphological synthesis on the given lemma and return the results as an iterator.
     * 
     * @param lemma - lemma for morphological synthesis
     * @return - iterator over morphological synthesis results
     */
    ResultsIterator generate(const std::string& lemma) const;
    
    /**
     * Perform morphological synthesis on the given lemma and return the results as an iterator.
     * Limit results to interpretations with the specified tag.
     * 
     * @param lemma - lemma for morphological synthesis
     * @param tagnum - number of the tag the result interpretations are limited to
     * @return - iterator over morphological synthesis results
     */
    ResultsIterator generate(const std::string& lemma, int tagnum) const;

    /**
     * Perform morphological synthesis on the given lemma and put the results in a vector.
     * 
     * @param lemma - lemma for morphological synthesis
     * @param result - results vector
     */
    void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const;
    
    /**
     * Perform morphological synthesis on the given lemma and put the results in a vector.
     * Limit results to interpretations with the specified tag.
     * 
     * @param lemma - lemma for morphological synthesis
     * @param tagnum - number of the tag the result interpretations are limited to
     * @param result - results vector
     */
    void generate(const std::string& lemma, int tagnum, std::vector<MorphInterpretation>& result) const;

    /**
     * Set the encoding for input and output string objects.
     * 
     * @param encoding - encoding to use
     */
    void setCharset(MorfeuszCharset encoding);
    
    /**
     * Set the aggl segmentation option value.
     * 
     * @param aggl - new option value
     */
    void setAggl(const std::string& aggl);
    
    /**
     * Set the praet segmentation option value.
     * 
     * @param praet - new option value
     */
    void setPraet(const std::string& praet);
    
    /**
     * If set to true, the case of characters in the analyzed text must match
     * the case in the recognized forms from the dictionary.
     * 
     * @param caseSensitive - new option value
     */
    void setCaseSensitive(bool caseSensitive);
    
    /**
     * Set the debug option value.
     * 
     * @param debug - new option value
     */
    void setDebug(bool debug);

    // ResultsIterator is granted access to the private members below.
    friend class ResultsIterator;
private:
    
    /**
     * Build a new string from the input, prepared for further processing.
     * NOTE(review): the exact transformation is defined in the implementation
     * file and is not visible in this header - confirm there.
     * 
     * @param input - string to prepare
     * @return - the prepared string
     */
    std::string prepareStringToProcess(const std::string& input) const;

    /**
     * Process one word of the input.
     * NOTE(review): behaviour is inferred from the name and signature only -
     * confirm against the implementation.
     * 
     * @param env - environment to work with
     * @param inputData - current position in the input; passed by reference,
     *                    so the callee is able to move it (presumably past the processed word)
     * @param inputEnd - end of the input
     * @param startNodeNum - NOTE(review): presumably number of the node the word starts at - confirm
     * @param result - vector the results are put into
     * @param insideIgnHandler - false by default; NOTE(review): presumably true
     *                           only for calls made from handleIgnChunk - confirm
     */
    void processOneWord(
            const Environment& env,
            const char*& inputData,
            const char* inputEnd,
            int startNodeNum,
            std::vector<MorphInterpretation>& result,
            bool insideIgnHandler=false) const;

    /**
     * Lower-level counterpart of processOneWord working on interpreted chunks
     * and an inflexion graph.
     * NOTE(review): behaviour is inferred from the name and signature only -
     * confirm against the implementation.
     * 
     * @param env - environment to work with
     * @param inputData - current position in the input; passed by reference,
     *                    so the callee is able to move it
     * @param inputEnd - end of the input
     * @param segrulesState - segmentation rules state, passed by value
     * @param accum - vector of interpreted chunks, modifiable by the callee
     * @param graph - inflexion graph, modifiable by the callee
     */
    void doProcessOneWord(
            const Environment& env,
            const char*& inputData,
            const char* inputEnd,
            SegrulesState segrulesState,
            std::vector<InterpretedChunk>& accum,
            InflexionGraph& graph) const;
    
    /**
     * Handle a chunk of the input delimited by inputStart and inputEnd.
     * NOTE(review): "ign" presumably stands for an unrecognized (ignotium)
     * fragment - confirm against the implementation.
     * 
     * @param env - environment to work with
     * @param inputStart - start of the chunk
     * @param inputEnd - end of the chunk
     * @param startNodeNum - NOTE(review): presumably number of the node the chunk starts at - confirm
     * @param results - vector the results are put into
     */
    void handleIgnChunk(
        const Environment& env,
        const char* inputStart,
        const char* inputEnd,
        int startNodeNum,
        std::vector<MorphInterpretation>& results) const;

    /**
     * Append an ignotium entry for the given word to the results.
     * NOTE(review): the shape of the appended entry is defined in the
     * implementation file - confirm there.
     * 
     * @param env - environment to work with
     * @param word - word the entry is created for
     * @param startNodeNum - NOTE(review): presumably number of the node the word starts at - confirm
     * @param results - vector the entry is appended to
     */
    void appendIgnotiumToResults(
            const Environment& env,
            const std::string& word,
            int startNodeNum,
            std::vector<MorphInterpretation>& results) const;
    
    // NOTE(review): presumably the environment used by the analyze methods - confirm.
    Environment analyzerEnv;
    // NOTE(review): presumably the environment used by the generate methods - confirm.
    Environment generatorEnv;
    // NOTE(review): presumably holds the values set through the set* option methods - confirm.
    MorfeuszOptions options;
};

/**
 * Iterator type returned by the iterator-returning Morfeusz::analyze and
 * Morfeusz::generate overloads.
 * 
 * The constructor is private and Morfeusz is a friend, so instances are
 * obtained from a Morfeusz object rather than constructed directly.
 */
class ResultsIterator {
public:
    /**
     * @return - the next interpretation, returned by value
     * NOTE(review): behaviour when hasNext() is false is not visible in this header - confirm.
     */
    MorphInterpretation getNext();
    /**
     * @return - true if there is another interpretation to be returned by getNext()
     */
    bool hasNext();
    // Morfeusz needs access to the private constructor.
    friend class Morfeusz;
private:
    /**
     * Create an iterator from a vector of interpretations.
     * NOTE(review): single-argument constructor is not marked explicit, so a
     * vector converts implicitly inside Morfeusz - confirm this is intended.
     * 
     * @param res - interpretations to iterate over
     */
    ResultsIterator(const std::vector<MorphInterpretation>& res);
    // NOTE(review): purpose not visible in this header; non-owning pointer by the look of it - confirm.
    const char* rawInput;
    // NOTE(review): presumably the interpretations not yet returned by getNext() - confirm.
    std::list<MorphInterpretation> resultsBuffer;
    // NOTE(review): purpose not visible in this header - confirm.
    int startNode;
};

#endif	/* MORFEUSZ_HPP */