Blame view

concraft/src/NLP/Concraft/Polish/Maca.hs 6.75 KB
Jan Lupa authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


-- | The module provides interface for the Maca analysis tool.
-- See <http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki>
-- for more information about the analyser.


module NLP.Concraft.Polish.Maca
(
  MacaPool
, newMacaPool
, macaPar
) where


import           Control.Applicative ((<$>))
import           Control.Monad (void, forever, guard, replicateM, unless)
import           Control.Concurrent
import           Control.Exception
import           System.Process
import           System.IO
import           Data.Function (on)
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Control.Monad.State.Strict as S
import qualified Control.Monad.Trans.Maybe as M
import           Control.Monad.Trans.Class (lift)

import           NLP.Concraft.Polish.Morphosyntax hiding (restore)
import qualified NLP.Concraft.Polish.Format.Plain as Plain


----------------------------
-- Maca instance
----------------------------


-- TODO: We don't have to use channels here.  Maximum one element
-- should be present in the input/output channel.


-- | Input channel.
type In = Chan T.Text


-- | Output channel.
type Out = Chan [Sent Tag]


-- | Maca communication channels.
newtype Maca = Maca (In, Out)


-- | Run Maca instance.
newMaca :: IO Maca
newMaca = do
    inCh  <- newChan
    outCh <- newChan
    void $ runMacaOn inCh outCh
    return $ Maca (inCh, outCh)


-- | Run Maca server on given channels.
-- TODO: Should check, if maca works.  In particular, if morfeusz is available.
runMacaOn :: In -> Out -> IO ThreadId
runMacaOn inCh outCh = mask $ \restore -> forkIO (do
    let cmd  = "maca-analyse"
        args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"]
    (Just inh, Just outh, Just errh, pid) <-
        createProcess (proc cmd args){ std_in  = CreatePipe
                                     , std_out = CreatePipe
                                     , std_err = CreatePipe }

    let excHandler = do
            let tryIO = try :: IO a -> IO (Either IOException a)
            void $ tryIO $ do
                err <- hGetContents errh
                unless (all C.isSpace err) $ do
                    putStr "Maca error: "
                    putStrLn err
            hClose inh; hClose outh; hClose errh
            terminateProcess pid
            waitForProcess pid

    -- TODO: Document, why LineBuffering is needed here.
    hSetBuffering outh LineBuffering
    flip onException excHandler $ restore $ forever $ do

        -- Take element from the input channel.
        txt <- readChan inCh
        -- putStr "REQUEST: "
        -- print txt

        -- Write text to maca stdin.
        -- TODO: Handle the "empty" case?
        T.hPutStr inh txt; hFlush inh

        -- Read maca response and put it in the output channel.
        writeChan outCh =<< readMacaResponse outh (textWeight txt)
    )


readMacaResponse :: Handle -> Int -> IO [Sent Tag]
readMacaResponse h n
    | n <= 0    = return []
    | otherwise = do
        x  <- readMacaSent h
        xs <- readMacaResponse h (n - sentWeight x)
        return (x : xs)


readMacaSent :: Handle -> IO (Sent Tag)
readMacaSent h =
    Plain.parseSent . L.unlines <$> getTxt
  where
    getTxt = do
        x <- L.hGetLine h
        if L.null x
            then return []
            else (x:) <$> getTxt


----------------------------
-- Client
----------------------------


-- | Analyse paragraph with Maca.
doMacaPar :: Maca -> T.Text -> IO [Sent Tag]
doMacaPar (Maca (inCh, outCh)) par = do
    let par' = T.intercalate "  " (T.lines par) `T.append` "\n"
    writeChan inCh par'
    restoreSpaces par <$> readChan outCh


-- | Restore info abouts spaces from a text and insert them
-- to a parsed paragraph.
restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag]
restoreSpaces par sents =
    S.evalState (mapM onSent sents) (0, chunks)
  where
    -- For each space chunk in the paragraph compute
    -- total weight of earlier chunks.
    parts   = T.groupBy ((==) `on` C.isSpace) par
    weights = scanl1 (+) (map textWeight parts)
    chunks  = filter (T.any C.isSpace . fst) (zip parts weights)

    -- Stateful monadic computation which modifies spaces
    -- assigned to individual segments.
    onSent = mapM onWord
    onWord seg = do
        n <- addWeight seg
        s <- popSpace n
        let word' = (word seg) { space = s }
        return $ seg { word = word' }

    -- Add weight of the segment to the current weight.
    addWeight seg = S.state $ \(n, xs) ->
        let m = n + segWeight seg
        in (m, (m, xs))

    -- Pop space from the stack if its weight is lower than
    -- the current one.
    popSpace n = fmap (maybe None id) . M.runMaybeT $ do
        spaces  <- lift $ S.gets snd
        (sp, m) <- liftMaybe $ maybeHead spaces
        guard $ m < n
        lift $ S.modify $ \(n', xs) -> (n', tail xs)
        return $ toSpace sp
    liftMaybe = M.MaybeT . return
    maybeHead xs = case xs of
        (x:_)   -> Just x
        []      -> Nothing

    -- Parse strings representation of a Space.
    toSpace x
        | has '\n'  = NewLine 
        | has ' '   = Space 
        | otherwise = None
        where has c = maybe False (const True) (T.find (==c) x)


----------------------------
-- Pool
----------------------------


-- | A pool of Maca instances.
newtype MacaPool = MacaPool (Chan Maca)


-- | Run Maca server.
newMacaPool
    :: Int          -- ^ Number of Maca instances
    -> IO MacaPool
newMacaPool n = do
    chan  <- newChan
    macas <- replicateM n newMaca
    writeList2Chan chan macas
    return $ MacaPool chan


popMaca :: MacaPool -> IO Maca
popMaca (MacaPool c) = readChan c


putMaca :: Maca -> MacaPool -> IO ()
putMaca x (MacaPool c) = writeChan c x


-- | Analyse paragraph with Maca.  The function is thread-safe.  As a
-- pre-processing step, all non-printable characters are removed from
-- the input (based on empirical observations, Maca behaves likewise).
macaPar :: MacaPool -> T.Text -> IO [Sent Tag]
macaPar pool par0 = do
    let par = T.filter C.isPrint par0
    maca <- popMaca pool
    doMacaPar maca par `finally` putMaca maca pool


------------------------------------------------------------
-- Weight: a number of non-space characters
------------------------------------------------------------


-- | A weight of a text.
textWeight :: T.Text -> Int
-- textWeight = T.length . T.filter C.isAlphaNum
textWeight = T.length . T.filter (not . C.isSpace)
-- textWeight = T.length . T.filter ((&&) <$> not . C.isSpace <*> C.isPrint)


-- | A weight of a segment.
segWeight :: Seg t -> Int
segWeight = textWeight . orth . word


-- | A weight of a sentence.
sentWeight :: Sent t -> Int
sentWeight = sum . map segWeight