Maca.hs 6.75 KB
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


-- | The module provides interface for the Maca analysis tool.
-- See <http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki>
-- for more information about the analyser.


module NLP.Concraft.Polish.Maca
(
  MacaPool
, newMacaPool
, macaPar
) where


import           Control.Applicative ((<$>))
import           Control.Monad (void, forever, guard, replicateM, unless)
import           Control.Concurrent
import           Control.Exception
import           System.Process
import           System.IO
import           Data.Function (on)
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Control.Monad.State.Strict as S
import qualified Control.Monad.Trans.Maybe as M
import           Control.Monad.Trans.Class (lift)

import           NLP.Concraft.Polish.Morphosyntax hiding (restore)
import qualified NLP.Concraft.Polish.Format.Plain as Plain


----------------------------
-- Maca instance
----------------------------


-- TODO: We don't have to use channels here.  Maximum one element
-- should be present in the input/output channel.
    

-- | Input channel.
type In = Chan T.Text


-- | Output channel.
type Out = Chan [Sent Tag]


-- | Maca communication channels.
newtype Maca = Maca (In, Out)


-- | Run Maca instance.
newMaca :: IO Maca
newMaca = do
    inCh  <- newChan
    outCh <- newChan
    void $ runMacaOn inCh outCh
    return $ Maca (inCh, outCh)


-- | Run Maca server on given channels.
-- TODO: Should check, if maca works.  In particular, if morfeusz is available.
runMacaOn :: In -> Out -> IO ThreadId
runMacaOn inCh outCh = mask $ \restore -> forkIO (do
    let cmd  = "maca-analyse"
        args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"]
    (Just inh, Just outh, Just errh, pid) <-
        createProcess (proc cmd args){ std_in  = CreatePipe
                                     , std_out = CreatePipe
                                     , std_err = CreatePipe }

    let excHandler = do
            let tryIO = try :: IO a -> IO (Either IOException a)
            void $ tryIO $ do
                err <- hGetContents errh
                unless (all C.isSpace err) $ do
                    putStr "Maca error: "
                    putStrLn err
            hClose inh; hClose outh; hClose errh
            terminateProcess pid
            waitForProcess pid

    -- TODO: Document, why LineBuffering is needed here.
    hSetBuffering outh LineBuffering
    flip onException excHandler $ restore $ forever $ do

        -- Take element from the input channel.
        txt <- readChan inCh
        -- putStr "REQUEST: "
        -- print txt

        -- Write text to maca stdin.
        -- TODO: Handle the "empty" case?
        T.hPutStr inh txt; hFlush inh

        -- Read maca response and put it in the output channel.
        writeChan outCh =<< readMacaResponse outh (textWeight txt)
    )


readMacaResponse :: Handle -> Int -> IO [Sent Tag]
readMacaResponse h n
    | n <= 0    = return []
    | otherwise = do
        x  <- readMacaSent h
        xs <- readMacaResponse h (n - sentWeight x)
        return (x : xs)


readMacaSent :: Handle -> IO (Sent Tag)
readMacaSent h =
    Plain.parseSent . L.unlines <$> getTxt
  where
    getTxt = do
        x <- L.hGetLine h
        if L.null x
            then return []
            else (x:) <$> getTxt


----------------------------
-- Client
----------------------------
    

-- | Analyse paragraph with Maca.
doMacaPar :: Maca -> T.Text -> IO [Sent Tag]
doMacaPar (Maca (inCh, outCh)) par = do
    let par' = T.intercalate "  " (T.lines par) `T.append` "\n"
    writeChan inCh par'
    restoreSpaces par <$> readChan outCh


-- | Restore info abouts spaces from a text and insert them
-- to a parsed paragraph.
restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag]
restoreSpaces par sents =
    S.evalState (mapM onSent sents) (0, chunks)
  where
    -- For each space chunk in the paragraph compute
    -- total weight of earlier chunks.
    parts   = T.groupBy ((==) `on` C.isSpace) par
    weights = scanl1 (+) (map textWeight parts)
    chunks  = filter (T.any C.isSpace . fst) (zip parts weights)

    -- Stateful monadic computation which modifies spaces
    -- assigned to individual segments.
    onSent = mapM onWord
    onWord seg = do
        n <- addWeight seg
        s <- popSpace n
        let word' = (word seg) { space = s }
        return $ seg { word = word' }

    -- Add weight of the segment to the current weight.
    addWeight seg = S.state $ \(n, xs) ->
        let m = n + segWeight seg
        in (m, (m, xs))

    -- Pop space from the stack if its weight is lower than
    -- the current one.
    popSpace n = fmap (maybe None id) . M.runMaybeT $ do
        spaces  <- lift $ S.gets snd
        (sp, m) <- liftMaybe $ maybeHead spaces
        guard $ m < n
        lift $ S.modify $ \(n', xs) -> (n', tail xs)
        return $ toSpace sp
    liftMaybe = M.MaybeT . return
    maybeHead xs = case xs of
        (x:_)   -> Just x
        []      -> Nothing

    -- Parse strings representation of a Space.
    toSpace x
        | has '\n'  = NewLine 
        | has ' '   = Space 
        | otherwise = None
        where has c = maybe False (const True) (T.find (==c) x)


----------------------------
-- Pool
----------------------------


-- | A pool of Maca instances.
newtype MacaPool = MacaPool (Chan Maca)


-- | Run Maca server.
newMacaPool
    :: Int          -- ^ Number of Maca instances
    -> IO MacaPool
newMacaPool n = do
    chan  <- newChan
    macas <- replicateM n newMaca
    writeList2Chan chan macas
    return $ MacaPool chan


popMaca :: MacaPool -> IO Maca
popMaca (MacaPool c) = readChan c


putMaca :: Maca -> MacaPool -> IO ()
putMaca x (MacaPool c) = writeChan c x


-- | Analyse paragraph with Maca.  The function is thread-safe.  As a
-- pre-processing step, all non-printable characters are removed from
-- the input (based on empirical observations, Maca behaves likewise).
macaPar :: MacaPool -> T.Text -> IO [Sent Tag]
macaPar pool par0 = do
    let par = T.filter C.isPrint par0
    maca <- popMaca pool
    doMacaPar maca par `finally` putMaca maca pool


------------------------------------------------------------
-- Weight: a number of non-space characters
------------------------------------------------------------


-- | A weight of a text.
textWeight :: T.Text -> Int
-- textWeight = T.length . T.filter C.isAlphaNum
textWeight = T.length . T.filter (not . C.isSpace)
-- textWeight = T.length . T.filter ((&&) <$> not . C.isSpace <*> C.isPrint)


-- | A weight of a segment.
segWeight :: Seg t -> Int
segWeight = textWeight . orth . word


-- | A weight of a sentence.
sentWeight :: Sent t -> Int
sentWeight = sum . map segWeight