Commit dc750b8bdd1d673aad5a0a1dda49fee85bfea1af

Authored by Wojciech Jaworski
1 parent 96883977

Włączenie concrafta do ENIAMintegration

concraft/.gitignore deleted
1   -dist
2   -cabal-dev
3   -*.o
4   -*.hi
5   -*.chi
6   -*.chs.h
7   -.virthualenv
8   -*.swp
9   -.stack-work
concraft/LICENSE deleted
1   -Copyright (c) 2013 Jakub Waszczuk
2   -All rights reserved.
3   -
4   -Redistribution and use in source and binary forms, with or without
5   -modification, are permitted provided that the following conditions
6   -are met:
7   -
8   - * Redistributions of source code must retain the above copyright
9   - notice, this list of conditions and the following disclaimer.
10   -
11   - * Redistributions in binary form must reproduce the above
12   - copyright notice, this list of conditions and the following
13   - disclaimer in the documentation and/or other materials provided
14   - with the distribution.
15   -
16   -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17   -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18   -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19   -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20   -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21   -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22   -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23   -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24   -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26   -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
concraft/README.md deleted
1   -The instructions below apply to modern systems with the following caveats:
2   -Before running stack install it might be necessary to execute stack setup in order to install the correct version of ghc.
3   -
4   -Maca can be installed using the instructions on nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki/InstallOnUbuntu11 with the following exceptions:
5   -The package libboost-all-dev can be installed in place of libboost1.42-all-dev as the latter might be unavailable.
6   -Instead of using adding Bartosz Zaborowski's repository one can download Morfeusz directly from
7   -ppa.launchpad.net/bartosz-zaborowski/nlp/ubuntu/pool/main/m/morfeusz-sgjp/morfeusz-sgjp_0.81-1~precise_amd64.deb
8   -Before installing SFST it is necessary to modify the file maca/third_party/SFST-1.2/SFST/src/fst.C
9   -In lines 445 and 449 the identifier it must be replaced with iter (or any other non-conflicting identifier).
10   -
11   -After Concraft is successfully installed its executable can be found in ~/.local/bin
12   -
13   -Below are the contents of the original README file.
14   -
15   -Concraft-pl
16   -===========
17   -
18   -This package provides a morphosyntactic tagger for the Polish language.
19   -The tool combines the following components into a pipeline:
20   -* A morphosyntactic segmentation and analysis tool [Maca][maca],
21   -* A morphosyntactic disambiguation library [Concraft][concraft],
22   -
23   -<!---
24   -* A simple, frequency-driven lemmatiser (TODO). Until the lemmatiser component
25   - is implemented, the tagger may output multiple interpretations (all related
26   - to the same morphosyntactic tag, but with different lemmas) in some cases.
27   --->
28   -
29   -As for now, the tagger doesn't provide any lemmatisation capabilities.
30   -As a result, it may output multiple interpretations (all related
31   -to the same morphosyntactic tag, but with different lemmas) for some known
32   -words, while for the out-of-vocabulary words it just outputs orthographic
33   -forms as lemmas.
34   -
35   -See the [homepage][homepage] if you wish to download a pre-trained
36   -model for the Polish language.
37   -
38   -
39   -Installation
40   -============
41   -
42   -It is recommanded to install Concraft-pl using the
43   -[Haskell Tool Stack][stack], which you will need to downoload and
44   -install on your machine beforehand. Then clone this repository into
45   -a local directory and use `stack` to install the library by running:
46   -
47   - stack install
48   -
49   -Unless you plan to use a custom preprocessing pipeline or run
50   -[Maca][maca] on a different machine (see section
51   -[Tagging analysed data](#tagging-analysed-data)), you will also need
52   -the [Maca][maca] tool. A detailed [installation guide][maca-install]
53   -can be found on the [Maca][maca] homepage.
54   -
55   -
56   -Data format
57   -===========
58   -
59   -The current version of Concraft-pl works on a simple `plain` text format supported by
60   -the [Corpus2][corpus2] tools. You will have to install these tools when you install
61   -[Maca][maca] anyway, so you can use them to convert the output generated
62   -by Concraft-pl to one of other formats supported by [Corpus2][corpus2].
63   -
64   -
65   -Training
66   -========
67   -
68   -If you have the training material with disambiguation annotations (stored in the
69   -`plain` text format) you can train the Concraft-pl model yourself.
70   -
71   - concraft-pl train train.plain -e eval.plain -o model.gz
72   -
73   -Concraft-pl uses the [NKJP][nkjp] [morphosyntactic tagset definition](config/nkjp-tagset.cfg)
74   -by default. It will also reanalyse the input data before the actual training. If you want
75   -to change this behaviour, use the `--tagset` and `--noana` command-line options.
76   -
77   -Consider using [runtime system options][ghc-rts]. You can speed up processing
78   -by making use of multiple cores by using the `-N` option. The `-s` option will
79   -produce the runtime statistics, such as the time spent in the garbage collector.
80   -If the program is spending too much time collecting garbage, you can try to
81   -increase the allocation area size with the `-A` option. If you have a big
82   -dataset and it doesn't fit in the computer memory, use the `--disk` flag.
83   -For example, to train the model using four threads and 256M allocation area
84   -size, run:
85   -
86   - concraft-pl train train.plain -e eval.plain -o model.gz +RTS -N4 -A256M -s
87   -
88   -Run `concraft-pl train --help` to learn more about the program arguments and
89   -possible training options.
90   -
91   -Finally, you may consider pruning the resultant model in order to reduce its size.
92   -Features with values close to 0 (in log-domain) have little effect on the modeled
93   -probability and, therefore, it should be safe to discard them.
94   -
95   - concraft-pl prune -t 0.05 input-model.gz pruned-model.gz
96   -
97   -
98   -Tagging
99   -=======
100   -
101   -Once you have a Concraft-pl model you can use the following command tag `input.txt` file:
102   -
103   - concraft-pl tag model.gz < input.txt > output.plain
104   -
105   -The input file is first divided into paragraphs (the tool interprets empty lines
106   -as paragraph ending markers). After that, [Maca][maca] is used to segment and analyse
107   -each paragraph. Finally, [Concraft][concraft] module is used to disambiguate each
108   -sentence in the [Maca][maca] output.
109   -
110   -With the `--marginals` option enabled, Concraft-pl will output marginal probabilities
111   -corresponding to individual tags (determined on the basis of the disambiguation model)
112   -instead of `disamb` markers.
113   -
114   -Run `concraft-pl tag --help` to learn more about possible tagging options.
115   -
116   -
117   -Server
118   -======
119   -
120   -Concraft-pl provides also a client/server mode. It is handy when, for example,
121   -you need to tag a large collection of small files. Loading Concraft-pl model
122   -from a disk takes considerable amount of time which makes the tagging method
123   -described above very slow in such a setting.
124   -
125   -To start the Concraft-pl server, run:
126   -
127   - concraft-pl server --inmodel model.gz
128   -
129   -You can supply a custom port number using a `--port` option. For example,
130   -to run the server on the `10101` port, use the following command:
131   -
132   - concraft-pl server --inmodel model.gz --port 10101
133   -
134   -To use the server in a multi-threaded environment, you need to specify the
135   -`-N` [RTS][ghc-rts] option. A set of options which usually yields good
136   -server performance is presented in the following example:
137   -
138   - concraft-pl server --inmodel model.gz +RTS -N -A4M -qg1 -I0
139   -
140   -Run `concraft-pl server --help` to learn more about possible server-mode options.
141   -
142   -The client mode works just like the tagging mode. The only difference is that,
143   -instead of supplying your client with a model, you need to specify the port number
144   -(in case you used a custom one when starting the server; otherwise, the default
145   -port number will be used).
146   -
147   - concraft-pl client --port 10101 < input.txt > output.plain
148   -
149   -Run `concraft-pl client --help` to learn more about possible client-mode options.
150   -
151   -
152   -Tagging analysed data
153   -=====================
154   -
155   -In some situations you might want to feed Concraft-pl with a previously
156   -analysed data. Perhaps your Maca instance is installed on a different
157   -machine, or maybe you want to use Concraft-pl with a custom
158   -preprocessing pipeline.
159   -
160   -If you want to use a preprocessing pipeline significantly different from
161   -the standard one (Maca), you should first train your own Concraft model.
162   -To train the model on analysed data use the `--noana` training flag.
163   -
164   -Use the same `--noana` flag when you want to tag analysed data.
165   -Input format should be the same as the output format.
166   -This option is currently not supported in the client/server mode.
167   -
168   -*Remember to use the same preprocessing pipeline (segmentation + analysis) for both
169   -training and disambiguation. Inconsistencies between training material and input
170   -data may severely harm the quality of disambiguation.*
171   -
172   -
173   -[stack]: http://docs.haskellstack.org "Haskell Tool Stack"
174   -[homepage]: http://zil.ipipan.waw.pl/Concraft "Homepage"
175   -[concraft]: https://github.com/kawu/concraft "Concraft"
176   -[hackage-repo]: http://hackage.haskell.org/package/concraft-pl "Concraft-pl Hackage repository"
177   -[maca]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki "Maca"
178   -[maca-install]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki#Download-and-install-MACA "Maca installation guide"
179   -[corpus2]: http://nlp.pwr.wroc.pl/redmine/projects/corpus2/wiki "Corpus2"
180   -[ghc]: http://www.haskell.org/ghc "Glasgow Haskell Compiler"
181   -[ghc-rts]: http://www.haskell.org/ghc/docs/latest/html/users_guide/runtime-control.html "GHC runtime system options"
182   -[cabal]: http://www.haskell.org/cabal "Cabal"
183   -[haskell-platform]: http://www.haskell.org/platform "Haskell Platform"
184   -[nkjp]: http://nkjp.pl/index.php?page=0&lang=1 "NKJP"
concraft/changelog deleted
1   --*-change-log-*-
2   -
3   -0.7.5 Aug 2016
4   - * Start using `stack` to avoid dependency problems
5   -
6   -0.7.4 Nov 2014
7   - * Ignore non-printable characters for compatibility with Maca
8   -
9   -0.7.0 Nov 2013
10   - * Output guessing results for unknown words
11   - * Allow "--noana" option in the client/server mode
12   - * Add option to output marginal probabilities
13   - * Add separate mode for model pruning
14   -
15   -0.6.0 Nov 2013
16   - * Output wordform instead of "None" as a lemma for unknown words
17   -
18   -0.5.0 Nov 2013
19   - * Restore the orth[-2] feature in the observation schema
concraft/concraft-pl.cabal deleted
1   -name: concraft-pl
2   -version: 0.7.5
3   -synopsis: Morphological tagger for Polish
4   -description:
5   - A morphological tagger for Polish based on the concraft library.
6   -license: BSD3
7   -license-file: LICENSE
8   -cabal-version: >= 1.6
9   -copyright: Copyright (c) 2013 Jakub Waszczuk
10   -author: Jakub Waszczuk
11   -maintainer: waszczuk.kuba@gmail.com
12   -stability: experimental
13   -category: Natural Language Processing
14   -homepage: http://zil.ipipan.waw.pl/Concraft
15   -build-type: Simple
16   -
17   -extra-source-files: changelog
18   -
19   -data-files: config/nkjp-tagset.cfg
20   -
21   -library
22   - hs-source-dirs: src
23   -
24   - build-depends:
25   - base >= 4 && < 5
26   - , concraft >= 0.9 && < 0.10
27   - , tagset-positional >= 0.3 && < 0.4
28   - , sgd >= 0.3.3 && < 0.4
29   - , containers >= 0.4 && < 0.6
30   - , bytestring >= 0.9 && < 0.11
31   - , text >= 0.11 && < 1.3
32   - , aeson >= 0.6 && < 0.9
33   - , binary >= 0.5 && < 0.8
34   - , process >= 1.1 && < 1.3
35   - , mtl >= 2.0 && < 2.3
36   - , transformers >= 0.2 && < 0.5
37   - , network >= 2.3 && < 2.7
38   - , lazy-io >= 0.1 && < 0.2
39   - , split >= 0.2 && < 0.3
40   -
41   - exposed-modules:
42   - NLP.Concraft.Polish
43   - , NLP.Concraft.Polish.Maca
44   - , NLP.Concraft.Polish.Morphosyntax
45   - , NLP.Concraft.Polish.Request
46   - , NLP.Concraft.Polish.Server
47   -
48   - other-modules:
49   - NLP.Concraft.Polish.Format.Plain
50   -
51   - ghc-options: -Wall -O2
52   -
53   -source-repository head
54   - type: git
55   - location: https://github.com/kawu/concraft-pl.git
56   -
57   -executable concraft-pl
58   - build-depends:
59   - cmdargs >= 0.10 && < 0.11
60   - hs-source-dirs: src, tools
61   - main-is: concraft-pl.hs
62   - ghc-options: -Wall -O2 -threaded -rtsopts
concraft/config/nkjp-tagset.cfg deleted
1   -# NKJP tagset definition
2   -
3   -[ATTR]
4   -
5   -# Attributes and their values
6   -nmb = sg pl
7   -cas = nom gen dat acc inst loc voc
8   -gnd = m1 m2 m3 f n # n1 n2 n3 p1 p2 p3
9   -per = pri sec ter
10   -deg = pos com sup
11   -asp = imperf perf
12   -ngt = aff neg
13   -acm = congr rec
14   -acn = akc nakc
15   -ppr = npraep praep
16   -agg = agl nagl
17   -vlc = nwok wok
18   -dot = pun npun
19   -# sentiment = spos sneg
20   -
21   -[RULE]
22   -
23   -# Parsing rules definitions.
24   -
25   -adja =
26   -adjp =
27   -adjc =
28   -conj =
29   -comp =
30   -interp =
31   -pred =
32   -xxx =
33   -adv = [deg]
34   -imps = asp
35   -inf = asp
36   -pant = asp
37   -pcon = asp
38   -qub = [vlc]
39   -prep = cas [vlc]
40   -siebie = cas
41   -subst = nmb cas gnd
42   -depr = nmb cas gnd
43   -ger = nmb cas gnd asp ngt
44   -ppron12 = nmb cas gnd per [acn]
45   -ppron3 = nmb cas gnd per [acn] [ppr]
46   -num = nmb cas gnd acm
47   -numcol = nmb cas gnd acm
48   -adj = nmb cas gnd deg
49   -pact = nmb cas gnd asp ngt
50   -ppas = nmb cas gnd asp ngt
51   -winien = nmb gnd asp
52   -praet = nmb gnd asp [agg]
53   -bedzie = nmb per asp
54   -fin = nmb per asp
55   -impt = nmb per asp
56   -aglt = nmb per asp vlc
57   -
58   -# `ign` is a special tag and we treat in a special way.
59   -# ign =
60   -
61   -brev = dot
62   -burk =
63   -interj =
64   -
concraft/eval/cv.sh deleted
1   -# Cross-validation script.
2   -
3   -
4   -# Root data directory
5   -data=$1
6   -
7   -# NKJP tagset configuration
8   -nkjp_tagset=$2
9   -
10   -
11   -echo ""
12   -echo "Data directory: "$1
13   -echo "Tagset file: "$2
14   -shift
15   -shift
16   -echo "Training arguments: ""$@"
17   -echo ""
18   -
19   -
20   -for i in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10"
21   -do
22   - echo $i
23   -
24   - # Construct training material.
25   - echo -n "" > $data/train/train$i.plain
26   - for j in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10"
27   - do
28   - if [ $i != $j ]; then
29   - echo "> "$j
30   - cat $data/folds/test$j.plain >> $data/train/train$i.plain
31   - fi
32   - done
33   -
34   - # Train tagger and tag eval file.
35   - concraft-pl train --tagset=$nkjp_tagset $data/train/train$i.plain -e $data/folds/test$i.plain -o $data/concraft/model$i.gz "$@"
36   - concraft-pl tag $data/concraft/model$i.gz < $data/text/test$i.txt > $data/tagged/test$i.plain
37   -
38   - echo -e "\nSTATS\n"
39   - concraft-pl compare --tagset=$nkjp_tagset $data/folds/test$i.plain $data/tagged/test$i.plain
40   - echo -e "\nEND STATS\n"
41   -
42   - # Delete the training material, we will not need it again.
43   - rm $data/train/train$i.plain
44   -done
concraft/nkjp-model-0.2.gz deleted
No preview for this file type
concraft/notes/binary.txt deleted
1   -Pamięć zajmowana przez poszczególne elementy binarnej reprezentacji modelu:
2   -
3   -15M ./codec.bin
4   -305M ./actualModel.bin
5   -202M ./featMap.bin
6   -130M ./featMap-otherMap.bin
7   -73M ./featMap-transMaps.bin
8   -104M ./valuesVect.bin
9   -
10   -Sortując zgodnie z rozmiarami (tylko elemety najniższego poziomu):
11   -
12   -130M ./featMap-otherMap.bin
13   -104M ./valuesVect.bin
14   -73M ./featMap-transMaps.bin
15   -15M ./codec.bin
16   -
17   -Trzeba wziąć pod uwagę, jakie są różnice pomiędzy rozmiarami
18   -zwykłej oraz binarnej reprezentacji poszczególnych struktur danych.
19   -Przykładowo, w przypadku wektora różnica ta prawdopodobnie jest
20   -niewielka. W przypadku map różnica jest prawdopodobnie duża.
21   -
22   -Rozmiar mapy `otherMap` to 3390538 par (klucz, wartość).
23   -Będziemy go oznaczali przez N. Zgodnie z
24   -http://blog.johantibell.com/2011/06/memory-footprints-of-some-common-data.html
25   -rozmiar takie mapy w pamięci można szacować jako (na systemie 64-bitowym):
26   -
27   - 6N words + size of keys & values
28   -
29   - 6N words = ~163 MB
30   - size of keys = N * 4 words = ~108 MB
31   - size of vals = N words = ~27 MB
32   -
33   - W sumie = ~300MB !
concraft/notes/notes.txt deleted
1   -Plany i pomysły na dalszy rozwój Concrafta.
2   -
3   -
4   -[HIGH]
5   -
6   -
7   -* Umożliwienie użytkownikowi określenia dodatkowych
8   - cech schematu obserwacji, spoza zestawu oferowanego
9   - przez bibliotekę podstawową. [HARD]
10   -
11   -
12   -[MEDIUM]
13   -
14   -
15   -* Trenowanie modelu bezpośrednio na korpusie NKJP. [EASY]
16   -
17   -
18   -* Cześć konfiguracji należałoby przenieść do plików konfiguracyjnych.
19   - Dotyczy to np. konfiguracji trenowania -- parametry SGD bez większych
20   - problemów mogłyby być zdefiniowane w postaci plików konfiguracyjnych.
21   - [MEDIUM]
22   -
23   -
24   -* Powyższy pomysł jest w bardziej ogólny. Chcielibyśmy dać użytkownikowi
25   - możliwość konfigurowania nie tylko parametrów SGD, ale również
26   - - schematu obserwacji
27   - - podziału na warstwy
28   - W szczególności, użytkownik powinien mieć możliwość zdefiniowania
29   - osobnego schematu obserwacji dla różnych warstw.
30   -
31   - PYTANIE: Czy obecnie tworzona jest jeden kodek dla wszystkich typów
32   - obserwacji? Zdecydowanie powinno tak być!
33   -
34   -
35   -* Fajnie byłoby użyć na poziomie kodeka biblioteki DAWG, dzięki czemu
36   - model byłby bardziej skompresowany. [MEDIUM/HARD]
37   -
38   - ALE: W postaci binarnej kodek zajmuje jedyne 15MB, w porównaniu do
39   - tego pamięć zajmowana przez inne części modelu jest ogromna!
40   -
41   -
42   -* Zmiana interfejsu biblioteki podstawowej: zamiast klasy
43   - Word można dodać np. "środowisko zdania". [HARD]
44   -
45   -
46   -[LOW]
47   -
48   -
49   -* Obsługa dodatkowych formatów. [MEDIUM]
50   - <- Mało ważne chociażby dlatego, że jeśli już Maca jest dostępna,
51   - to może być też wykorzystana do konwersji danych wyjściowych
52   - do innego formatu.
concraft/notes/plan.txt deleted
1   -Kroki prowadzące do opublikowania nowej wersji Concrafta w sieci.
2   -
3   -1. Zmergować gałąź 0.5 z gałęzią master w repozytorium https://github.com/kawu/concraft. DONE
4   -
5   -2. Zaktualizować opis na stronie http://zil.ipipan.waw.pl/Concraft. DONE
6   - W szczególności, należy umieścić tam nowy model. DONE
7   -
8   -[Dodatki]
9   -
10   -A. Obecny model 0.5 wytrenowany jest na 90% korpusu milionowego NKJP.
11   - Kiedyś później należałoby wytrenować model na całym korpusie.
concraft/src/NLP/Concraft/Polish.hs deleted
1   -{-# LANGUAGE OverloadedStrings #-}
2   -{-# LANGUAGE RecordWildCards #-}
3   -
4   -
5   -module NLP.Concraft.Polish
6   -(
7   --- * Model
8   - C.Concraft
9   -, C.saveModel
10   -, C.loadModel
11   -
12   --- * Tagging
13   -, tag
14   -, marginals
15   -
16   --- * Analysis
17   -, macaPar
18   -
19   --- * Training
20   -, TrainConf (..)
21   -, train
22   -
23   --- * Pruning
24   -, C.prune
25   -
26   --- -- * Analysis
27   --- , anaSent
28   --- , reAnaPar
29   -) where
30   -
31   -
32   -import Control.Applicative ((<$>))
33   -import qualified Data.Text.Lazy as L
34   -import qualified Data.Set as S
35   -
36   -import qualified Data.Tagset.Positional as P
37   -import qualified Numeric.SGD as SGD
38   -
39   -import qualified NLP.Concraft.Morphosyntax as X
40   -import qualified NLP.Concraft.Schema as S
41   -import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
42   -import qualified NLP.Concraft.Guess as G
43   -import qualified NLP.Concraft.Disamb as D
44   -import qualified NLP.Concraft as C
45   --- import qualified NLP.Concraft.Analysis as A
46   -
47   -import NLP.Concraft.Polish.Morphosyntax hiding (tag)
48   -import NLP.Concraft.Polish.Maca
49   -
50   -
51   --------------------------------------------------
52   --- Default configuration
53   --------------------------------------------------
54   -
55   -
56   --- | Default configuration for the guessing observation schema.
57   -guessSchemaDefault :: SchemaConf
58   -guessSchemaDefault = S.nullConf
59   - { lowPrefixesC = entryWith [1, 2] [0]
60   - , lowSuffixesC = entryWith [1, 2] [0]
61   - , knownC = entry [0]
62   - , begPackedC = entry [0] }
63   -
64   -
65   --- | Default configuration for the guessing observation schema.
66   -disambSchemaDefault :: SchemaConf
67   -disambSchemaDefault = S.nullConf
68   - { lowOrthC = entry [-2, -1, 0, 1]
69   - , lowPrefixesC = oov $ entryWith [1, 2, 3] [0]
70   - , lowSuffixesC = oov $ entryWith [1, 2, 3] [0]
71   - , begPackedC = oov $ entry [0] }
72   - where
73   - oov (Just body) = Just $ body { S.oovOnly = True }
74   - oov Nothing = Nothing
75   -
76   -
77   --- | Default tiered tagging configuration.
78   -tiersDefault :: [D.Tier]
79   -tiersDefault =
80   - [tier1, tier2]
81   - where
82   - tier1 = D.Tier True $ S.fromList ["cas", "per"]
83   - tier2 = D.Tier False $ S.fromList
84   - [ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
85   - , "acn", "ppr", "agg", "vlc", "dot" ]
86   -
87   -
88   --------------------------------------------------
89   --- Tagging
90   --------------------------------------------------
91   -
92   -
93   --- | Tag the analysed sentence.
94   -tag :: C.Concraft -> Sent Tag -> Sent Tag
95   -tag concraft sent =
96   - [ select' gs t seg
97   - | (seg, gs, t) <- zip3 sent gss ts ]
98   - where
99   - tagset = C.tagset concraft
100   - packed = packSent tagset sent
101   - tagged = C.tag concraft packed
102   - gss = map (map showTag . S.toList . fst) tagged
103   - ts = map (showTag . snd) tagged
104   - showTag = P.showTag tagset
105   -
106   -
107   --- | Tag the sentence with marginal probabilities.
108   -marginals :: C.Concraft -> Sent Tag -> Sent Tag
109   -marginals concraft sent
110   - = map (uncurry selectWMap)
111   - $ zip wmaps sent
112   - where
113   - tagset = C.tagset concraft
114   - packed = packSent tagset sent
115   - wmaps = map
116   - (X.mapWMap showTag)
117   - (C.marginals concraft packed)
118   - showTag = P.showTag tagset
119   -
120   -
121   --------------------------------------------------
122   --- Training
123   --------------------------------------------------
124   -
125   -
126   --- | Training configuration.
127   -data TrainConf = TrainConf {
128   - -- | Tagset.
129   - tagset :: P.Tagset
130   - -- | SGD parameters.
131   - , sgdArgs :: SGD.SgdArgs
132   - -- | Perform reanalysis.
133   - , reana :: Bool
134   - -- | Store SGD dataset on disk.
135   - , onDisk :: Bool
136   - -- | Numer of guessed tags for each word.
137   - , guessNum :: Int
138   - -- | `G.r0T` parameter.
139   - , r0 :: G.R0T }
140   -
141   --- | Train concraft model.
142   --- TODO: It should be possible to supply the two training procedures with
143   --- different SGD arguments.
144   -train
145   - :: TrainConf
146   - -> IO [SentO Tag] -- ^ Training data
147   - -> IO [SentO Tag] -- ^ Evaluation data
148   - -> IO C.Concraft
149   -train TrainConf{..} train0 eval0 = do
150   -
151   - pool <- newMacaPool 1
152   - let ana = anaSent tagset pool
153   - train1 = map (packSentO tagset) <$> train0
154   - eval1 = map (packSentO tagset) <$> eval0
155   -
156   - if reana
157   - then doReana ana train1 eval1
158   - else noReana train1 eval1
159   -
160   - where
161   -
162   - doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf
163   - noReana tr ev = C.train tagset guessNum guessConf disambConf
164   - (map X.segs <$> tr) (map X.segs <$> ev)
165   -
166   - guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
167   - disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk
168   -
169   -
170   --------------------------------------------------
171   --- Re-analysis
172   --------------------------------------------------
173   -
174   -
175   --- | Analyse the given sentence with Maca.
176   -anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag)
177   -anaSent tagset pool
178   - = fmap (packSent tagset . concat)
179   - . macaPar pool . L.toStrict
180   -
181   -
182   --- -- | Reanalyse the input paragraph (lazy IO).
183   --- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag]
184   --- reAnaPar tagset inp = do
185   --- pool <- newMacaPool 1
186   --- A.reAnaPar tagset (anaSent pool) inp
concraft/src/NLP/Concraft/Polish/Format/Plain.hs deleted
1   -{-# LANGUAGE OverloadedStrings #-}
2   -{-# LANGUAGE RecordWildCards #-}
3   -
4   --- | Simple format for morphosyntax representation which
5   --- with no spaces within and that one of the tags indicates
6   --- unknown words.
7   -
8   -module NLP.Concraft.Polish.Format.Plain
9   -(
10   --- * Parsing
11   - parsePlain
12   -, parsePara
13   -, parseSent
14   -
15   --- * Printing
16   -, ShowCfg (..)
17   -, showPlain
18   -, showPara
19   -, showSent
20   -) where
21   -
22   -import Data.Monoid (Monoid, mappend, mconcat)
23   -import Data.Maybe (catMaybes)
24   -import Data.List (groupBy)
25   -import Data.String (IsString)
26   -import qualified Data.Char as C
27   -import qualified Data.Map as M
28   -import qualified Data.Text as T
29   -import qualified Data.Text.Lazy as L
30   -import qualified Data.Text.Lazy.Builder as L
31   -import qualified Data.Text.Lazy.Read as R
32   -import Text.Printf (printf)
33   -
34   -import qualified NLP.Concraft.Morphosyntax as X
35   -import NLP.Concraft.Polish.Morphosyntax
36   -
37   --- | Parse the text in the plain format.
38   -parsePlain :: L.Text -> [[Sent Tag]]
39   -parsePlain =
40   - map parsePara' . groupBy f . L.splitOn "\n\n"
41   - where
42   - f _ xs = case L.uncons xs of
43   - Nothing -> False
44   - Just (x, _) -> not (C.isSpace x)
45   -
46   --- | Parse the paragraph in the plain format.
47   -parsePara :: L.Text -> [Sent Tag]
48   -parsePara = parsePara' . L.splitOn "\n\n"
49   -
50   --- | Parse paragraph already divided into sentence chunks.
51   -parsePara' :: [L.Text] -> [Sent Tag]
52   -parsePara' = map (parseSent . L.strip) . filter (not.isEmpty)
53   -
54   --- | Identify empty chunks of text.
55   -isEmpty :: L.Text -> Bool
56   -isEmpty = L.all C.isSpace
57   -
58   --- | Parse the sentence in the plain format.
59   -parseSent :: L.Text -> Sent Tag
60   -parseSent
61   - = map parseWord
62   - . groupBy (\_ x -> cond x)
63   - . L.lines
64   - where
65   - cond = ("\t" `L.isPrefixOf`)
66   -
67   -parseWord :: [L.Text] -> Seg Tag
68   -parseWord xs = Seg
69   - (Word _orth _space _known)
70   - _interps
71   - where
72   - (_orth, _space) = parseHeader (head xs)
73   - ys = map parseInterp (tail xs)
74   - _known = not (Nothing `elem` ys)
75   - -- _interps = M.fromListWith max (catMaybes ys)
76   - _interps = X.mkWMap $ catMaybes ys
77   -
78   -parseInterp :: L.Text -> Maybe (Interp Tag, Double)
79   -parseInterp =
80   - doIt . tail . L.splitOn "\t"
81   - where
82   - doIt [form, tag]
83   - | tag == ign = Nothing
84   - | otherwise = Just $
85   - (mkInterp form tag, 0)
86   - doIt [form, tag, "disamb"] =
87   - Just (mkInterp form tag, 1)
88   - doIt [form, tag, weight] = case R.double weight of
89   - Left er -> error $ "parseInterp (weight):" ++ show er
90   - Right w -> Just (mkInterp form tag, fst w)
91   - doIt xs = error $ "parseInterp: " ++ show xs
92   - mkInterp form tag = Interp (L.toStrict form) (L.toStrict tag)
93   -
94   -parseHeader :: L.Text -> (T.Text, Space)
95   -parseHeader xs =
96   - let [_orth, space] = L.splitOn "\t" xs
97   - in (L.toStrict _orth, parseSpace space)
98   -
99   --- TODO: Should we represent newlines and spaces in the `Space` data type?
100   -parseSpace :: L.Text -> Space
101   -parseSpace "none" = None
102   -parseSpace "space" = Space
103   -parseSpace "spaces" = Space -- Multiple spaces
104   -parseSpace "newline" = NewLine
105   -parseSpace "newlines" = NewLine -- Multiple newlines
106   -parseSpace xs = error ("parseSpace: " ++ L.unpack xs)
107   -
108   ------------
109   --- Printing
110   ------------
111   -
112   --- | Printing configuration.
113   -data ShowCfg = ShowCfg {
114   - -- | Print weights instead of 'disamb' tags.
115   - showWsCfg :: Bool }
116   -
117   --- | Show the plain data.
118   -showPlain :: ShowCfg ->[[Sent Tag]] -> L.Text
119   -showPlain cfg =
120   - L.intercalate "\n" . map (showPara cfg)
121   -
122   --- | Show the paragraph.
123   -showPara :: ShowCfg -> [Sent Tag] -> L.Text
124   -showPara cfg = L.toLazyText . mconcat . map (\xs -> buildSent cfg xs <> "\n")
125   -
126   --- | Show the sentence.
127   -showSent :: ShowCfg -> Sent Tag -> L.Text
128   -showSent cfg xs = L.toLazyText $ buildSent cfg xs
129   -
130   -buildSent :: ShowCfg -> Sent Tag -> L.Builder
131   -buildSent cfg = mconcat . map (buildWord cfg)
132   -
133   -buildWord :: ShowCfg -> Seg Tag -> L.Builder
134   -buildWord cfg Seg{..}
135   - = L.fromText orth <> "\t"
136   - <> buildSpace space <> "\n"
137   - <> buildKnown orth known
138   - <> buildInterps cfg interps
139   - where Word{..} = word
140   -
141   -buildInterps :: ShowCfg -> X.WMap (Interp Tag) -> L.Builder
142   -buildInterps ShowCfg{..} interps = mconcat
143   - [ "\t" <> buildBase interp <>
144   - "\t" <> buildTag interp <> buildDmb dmb
145   - | (interp, dmb) <- M.toList (X.unWMap interps) ]
146   - where
147   - buildTag = L.fromText . tag
148   - buildBase = L.fromText . base
149   - buildDmb = case showWsCfg of
150   - True -> \x -> between "\t" "\n"
151   - $ L.fromString
152   - $ printf "%.3f" x
153   - False -> \x -> if x > 0
154   - then "\tdisamb\n"
155   - else "\n"
156   - between x y z = x <> z <> y
157   -
158   -buildSpace :: Space -> L.Builder
159   -buildSpace None = "none"
160   -buildSpace Space = "space"
161   -buildSpace NewLine = "newline"
162   -
163   -buildKnown :: T.Text -> Bool -> L.Builder
164   -buildKnown _ True = ""
165   -buildKnown lemma False
166   - = "\t" <> L.fromText lemma
167   - <> "\t" <> L.fromText ign
168   - <> "\n"
169   -
170   -
171   ------------
172   --- Utils
173   ------------
174   -
175   -
176   --- | An infix synonym for 'mappend'.
177   -(<>) :: Monoid m => m -> m -> m
178   -(<>) = mappend
179   -{-# INLINE (<>) #-}
180   -
181   -
182   --- | Tag which indicates unknown words.
183   -ign :: IsString a => a
184   -ign = "ign"
185   -{-# INLINE ign #-}
concraft/src/NLP/Concraft/Polish/Maca.hs deleted
1   -{-# LANGUAGE OverloadedStrings #-}
2   -{-# LANGUAGE RecordWildCards #-}
3   -
4   -
5   --- | The module provides interface for the Maca analysis tool.
6   --- See <http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki>
7   --- for more information about the analyser.
8   -
9   -
10   -module NLP.Concraft.Polish.Maca
11   -(
12   - MacaPool
13   -, newMacaPool
14   -, macaPar
15   -) where
16   -
17   -
18   -import Control.Applicative ((<$>))
19   -import Control.Monad (void, forever, guard, replicateM, unless)
20   -import Control.Concurrent
21   -import Control.Exception
22   -import System.Process
23   -import System.IO
24   -import Data.Function (on)
25   -import qualified Data.Char as C
26   -import qualified Data.Text as T
27   -import qualified Data.Text.IO as T
28   -import qualified Data.Text.Lazy as L
29   -import qualified Data.Text.Lazy.IO as L
30   -import qualified Control.Monad.State.Strict as S
31   -import qualified Control.Monad.Trans.Maybe as M
32   -import Control.Monad.Trans.Class (lift)
33   -
34   -import NLP.Concraft.Polish.Morphosyntax hiding (restore)
35   -import qualified NLP.Concraft.Polish.Format.Plain as Plain
36   -
37   -
38   -----------------------------
39   --- Maca instance
40   -----------------------------
41   -
42   -
43   --- TODO: We don't have to use channels here. Maximum one element
44   --- should be present in the input/output channel.
45   -
46   -
47   --- | Input channel.
48   -type In = Chan T.Text
49   -
50   -
51   --- | Output channel.
52   -type Out = Chan [Sent Tag]
53   -
54   -
55   --- | Maca communication channels.
56   -newtype Maca = Maca (In, Out)
57   -
58   -
59   --- | Run Maca instance.
60   -newMaca :: IO Maca
61   -newMaca = do
62   - inCh <- newChan
63   - outCh <- newChan
64   - void $ runMacaOn inCh outCh
65   - return $ Maca (inCh, outCh)
66   -
67   -
68   --- | Run Maca server on given channels.
69   --- TODO: Should check, if maca works. In particular, if morfeusz is available.
70   -runMacaOn :: In -> Out -> IO ThreadId
71   -runMacaOn inCh outCh = mask $ \restore -> forkIO (do
72   - let cmd = "maca-analyse"
73   - args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"]
74   - (Just inh, Just outh, Just errh, pid) <-
75   - createProcess (proc cmd args){ std_in = CreatePipe
76   - , std_out = CreatePipe
77   - , std_err = CreatePipe }
78   -
79   - let excHandler = do
80   - let tryIO = try :: IO a -> IO (Either IOException a)
81   - void $ tryIO $ do
82   - err <- hGetContents errh
83   - unless (all C.isSpace err) $ do
84   - putStr "Maca error: "
85   - putStrLn err
86   - hClose inh; hClose outh; hClose errh
87   - terminateProcess pid
88   - waitForProcess pid
89   -
90   - -- TODO: Document, why LineBuffering is needed here.
91   - hSetBuffering outh LineBuffering
92   - flip onException excHandler $ restore $ forever $ do
93   -
94   - -- Take element from the input channel.
95   - txt <- readChan inCh
96   - -- putStr "REQUEST: "
97   - -- print txt
98   -
99   - -- Write text to maca stdin.
100   - -- TODO: Handle the "empty" case?
101   - T.hPutStr inh txt; hFlush inh
102   -
103   - -- Read maca response and put it in the output channel.
104   - writeChan outCh =<< readMacaResponse outh (textWeight txt)
105   - )
106   -
107   -
108   -readMacaResponse :: Handle -> Int -> IO [Sent Tag]
109   -readMacaResponse h n
110   - | n <= 0 = return []
111   - | otherwise = do
112   - x <- readMacaSent h
113   - xs <- readMacaResponse h (n - sentWeight x)
114   - return (x : xs)
115   -
116   -
117   -readMacaSent :: Handle -> IO (Sent Tag)
118   -readMacaSent h =
119   - Plain.parseSent . L.unlines <$> getTxt
120   - where
121   - getTxt = do
122   - x <- L.hGetLine h
123   - if L.null x
124   - then return []
125   - else (x:) <$> getTxt
126   -
127   -
128   -----------------------------
129   --- Client
130   -----------------------------
131   -
132   -
133   --- | Analyse paragraph with Maca.
134   -doMacaPar :: Maca -> T.Text -> IO [Sent Tag]
135   -doMacaPar (Maca (inCh, outCh)) par = do
136   - let par' = T.intercalate " " (T.lines par) `T.append` "\n"
137   - writeChan inCh par'
138   - restoreSpaces par <$> readChan outCh
139   -
140   -
141   --- | Restore info abouts spaces from a text and insert them
142   --- to a parsed paragraph.
143   -restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag]
144   -restoreSpaces par sents =
145   - S.evalState (mapM onSent sents) (0, chunks)
146   - where
147   - -- For each space chunk in the paragraph compute
148   - -- total weight of earlier chunks.
149   - parts = T.groupBy ((==) `on` C.isSpace) par
150   - weights = scanl1 (+) (map textWeight parts)
151   - chunks = filter (T.any C.isSpace . fst) (zip parts weights)
152   -
153   - -- Stateful monadic computation which modifies spaces
154   - -- assigned to individual segments.
155   - onSent = mapM onWord
156   - onWord seg = do
157   - n <- addWeight seg
158   - s <- popSpace n
159   - let word' = (word seg) { space = s }
160   - return $ seg { word = word' }
161   -
162   - -- Add weight of the segment to the current weight.
163   - addWeight seg = S.state $ \(n, xs) ->
164   - let m = n + segWeight seg
165   - in (m, (m, xs))
166   -
167   - -- Pop space from the stack if its weight is lower than
168   - -- the current one.
169   - popSpace n = fmap (maybe None id) . M.runMaybeT $ do
170   - spaces <- lift $ S.gets snd
171   - (sp, m) <- liftMaybe $ maybeHead spaces
172   - guard $ m < n
173   - lift $ S.modify $ \(n', xs) -> (n', tail xs)
174   - return $ toSpace sp
175   - liftMaybe = M.MaybeT . return
176   - maybeHead xs = case xs of
177   - (x:_) -> Just x
178   - [] -> Nothing
179   -
180   - -- Parse strings representation of a Space.
181   - toSpace x
182   - | has '\n' = NewLine
183   - | has ' ' = Space
184   - | otherwise = None
185   - where has c = maybe False (const True) (T.find (==c) x)
186   -
187   -
188   -----------------------------
189   --- Pool
190   -----------------------------
191   -
192   -
193   --- | A pool of Maca instances.
194   -newtype MacaPool = MacaPool (Chan Maca)
195   -
196   -
197   --- | Run Maca server.
198   -newMacaPool
199   - :: Int -- ^ Number of Maca instances
200   - -> IO MacaPool
201   -newMacaPool n = do
202   - chan <- newChan
203   - macas <- replicateM n newMaca
204   - writeList2Chan chan macas
205   - return $ MacaPool chan
206   -
207   -
208   -popMaca :: MacaPool -> IO Maca
209   -popMaca (MacaPool c) = readChan c
210   -
211   -
212   -putMaca :: Maca -> MacaPool -> IO ()
213   -putMaca x (MacaPool c) = writeChan c x
214   -
215   -
216   --- | Analyse paragraph with Maca. The function is thread-safe. As a
217   --- pre-processing step, all non-printable characters are removed from
218   --- the input (based on empirical observations, Maca behaves likewise).
219   -macaPar :: MacaPool -> T.Text -> IO [Sent Tag]
220   -macaPar pool par0 = do
221   - let par = T.filter C.isPrint par0
222   - maca <- popMaca pool
223   - doMacaPar maca par `finally` putMaca maca pool
224   -
225   -
226   -------------------------------------------------------------
227   --- Weight: a number of non-space characters
228   -------------------------------------------------------------
229   -
230   -
231   --- | A weight of a text.
232   -textWeight :: T.Text -> Int
233   --- textWeight = T.length . T.filter C.isAlphaNum
234   -textWeight = T.length . T.filter (not . C.isSpace)
235   --- textWeight = T.length . T.filter ((&&) <$> not . C.isSpace <*> C.isPrint)
236   -
237   -
238   --- | A weight of a segment.
239   -segWeight :: Seg t -> Int
240   -segWeight = textWeight . orth . word
241   -
242   -
243   --- | A weight of a sentence.
244   -sentWeight :: Sent t -> Int
245   -sentWeight = sum . map segWeight
concraft/src/NLP/Concraft/Polish/Morphosyntax.hs deleted
1   -{-# LANGUAGE OverloadedStrings #-}
2   -{-# LANGUAGE RecordWildCards #-}
3   -{-# LANGUAGE TupleSections #-}
4   -
5   -
6   --- | Morphosyntax data layer in Polish.
7   -
8   -
9   -module NLP.Concraft.Polish.Morphosyntax
10   -(
11   --- * Tag
12   - Tag
13   -
14   --- * Segment
15   -, Seg (..)
16   -, Word (..)
17   -, Interp (..)
18   -, Space (..)
19   -, select
20   -, select'
21   -, selectWMap
22   -
23   --- * Sentence
24   -, Sent
25   -, SentO (..)
26   -, restore
27   -, withOrig
28   -
29   --- * Conversion
30   -, packSeg
31   -, packSent
32   -, packSentO
33   -) where
34   -
35   -
36   -import Control.Applicative ((<$>), (<*>))
37   -import Control.Arrow (first)
38   -import Data.Maybe (catMaybes)
39   -import Data.Aeson
40   -import Data.Binary (Binary, put, get, putWord8, getWord8)
41   -import qualified Data.Aeson as Aeson
42   -import qualified Data.Set as S
43   -import qualified Data.Map as M
44   -import qualified Data.Text as T
45   -import qualified Data.Text.Lazy as L
46   -import qualified Data.Tagset.Positional as P
47   -
48   -import qualified NLP.Concraft.Morphosyntax as X
49   -
50   -
51   --- | A textual representation of a morphosyntactic tag.
52   -type Tag = T.Text
53   -
54   -
55   ---------------------------------
56   --- Segment
57   ---------------------------------
58   -
59   -
60   --- | A segment consists of a word and a set of morphosyntactic interpretations.
61   -data Seg t = Seg
62   - { word :: Word
63   - -- | Interpretations of the token, each interpretation annotated
64   - -- with a /disamb/ Boolean value (if 'True', the interpretation
65   - -- is correct within the context).
66   - , interps :: X.WMap (Interp t) }
67   - deriving (Show, Eq, Ord)
68   -
69   -
70   -instance (Ord t, Binary t) => Binary (Seg t) where
71   - put Seg{..} = put word >> put interps
72   - get = Seg <$> get <*> get
73   -
74   -
75   --- | A word.
76   -data Word = Word
77   - { orth :: T.Text
78   - , space :: Space
79   - , known :: Bool }
80   - deriving (Show, Eq, Ord)
81   -
82   -
83   -instance X.Word Word where
84   - orth = orth
85   - oov = not.known
86   -
87   -
88   -instance ToJSON Word where
89   - toJSON Word{..} = object
90   - [ "orth" .= orth
91   - , "space" .= space
92   - , "known" .= known ]
93   -
94   -
95   -instance FromJSON Word where
96   - parseJSON (Object v) = Word
97   - <$> v .: "orth"
98   - <*> v .: "space"
99   - <*> v .: "known"
100   - parseJSON _ = error "parseJSON [Word]"
101   -
102   -
103   -instance Binary Word where
104   - put Word{..} = put orth >> put space >> put known
105   - get = Word <$> get <*> get <*> get
106   -
107   -
108   --- | A morphosyntactic interpretation.
109   -data Interp t = Interp
110   - { base :: T.Text
111   - , tag :: t }
112   - deriving (Show, Eq, Ord)
113   -
114   -
115   -instance (Ord t, Binary t) => Binary (Interp t) where
116   - put Interp{..} = put base >> put tag
117   - get = Interp <$> get <*> get
118   -
119   -
120   --- | No space, space or newline.
121   --- TODO: Perhaps we should use a bit more informative data type.
122   -data Space
123   - = None
124   - | Space
125   - | NewLine
126   - deriving (Show, Eq, Ord)
127   -
128   -
129   -instance Binary Space where
130   - put x = case x of
131   - None -> putWord8 1
132   - Space -> putWord8 2
133   - NewLine -> putWord8 3
134   - get = getWord8 >>= \x -> return $ case x of
135   - 1 -> None
136   - 2 -> Space
137   - _ -> NewLine
138   -
139   -
140   -instance ToJSON Space where
141   - toJSON x = Aeson.String $ case x of
142   - None -> "none"
143   - Space -> "space"
144   - NewLine -> "newline"
145   -
146   -
147   -instance FromJSON Space where
148   - parseJSON (Aeson.String x) = return $ case x of
149   - "none" -> None
150   - "space" -> Space
151   - "newline" -> NewLine
152   - _ -> error "parseJSON [Space]"
153   - parseJSON _ = error "parseJSON [Space]"
154   -
155   -
156   --- | Select one chosen interpretation.
157   -select :: Ord a => a -> Seg a -> Seg a
158   -select = select' []
159   -
160   -
161   --- | Select multiple interpretations and one chosen interpretation.
162   -select' :: Ord a => [a] -> a -> Seg a -> Seg a
163   -select' ys x = selectWMap . X.mkWMap $ (x, 1) : map (,0) ys
164   -
165   -
166   --- | Select interpretations.
167   -selectWMap :: Ord a => X.WMap a -> Seg a -> Seg a
168   -selectWMap wMap seg =
169   - seg { interps = newInterps }
170   - where
171   - wSet = S.fromList . map tag . M.keys . X.unWMap . interps $ seg
172   - newInterps = X.mkWMap $
173   - [ case M.lookup (tag interp) (X.unWMap wMap) of
174   - Just x -> (interp, x)
175   - Nothing -> (interp, 0)
176   - | interp <- (M.keys . X.unWMap) (interps seg) ]
177   - ++ catMaybes
178   - [ if tag `S.member` wSet
179   - then Nothing
180   - else Just (Interp lemma tag, x)
181   - | let lemma = orth $ word seg -- Default base form
182   - , (tag, x) <- M.toList (X.unWMap wMap) ]
183   -
184   -
185   ---------------------------------
186   --- Sentence
187   ---------------------------------
188   -
189   -
190   --- | A sentence.
191   -type Sent t = [Seg t]
192   -
193   -
194   --- | A sentence.
195   -data SentO t = SentO
196   - { segs :: [Seg t]
197   - , orig :: L.Text }
198   -
199   -
200   --- | Restore textual representation of a sentence.
201   --- The function is not very accurate, it could be improved
202   --- if we enrich representation of a space.
203   -restore :: Sent t -> L.Text
204   -restore =
205   - let wordStr Word{..} = [spaceStr space, orth]
206   - spaceStr None = ""
207   - spaceStr Space = " "
208   - spaceStr NewLine = "\n"
209   - in L.fromChunks . concatMap (wordStr . word)
210   -
211   -
212   --- | Use `restore` to translate `Sent` to a `SentO`.
213   -withOrig :: Sent t -> SentO t
214   -withOrig s = SentO
215   - { segs = s
216   - , orig = restore s }
217   -
218   -
219   ----------------------------
220   --- Conversion
221   ----------------------------
222   -
223   -
224   --- | Convert a segment to a segment from a core library.
225   -packSeg_ :: Ord a => Seg a -> X.Seg Word a
226   -packSeg_ Seg{..}
227   - = X.Seg word
228   - $ X.mkWMap
229   - $ map (first tag)
230   - $ M.toList
231   - $ X.unWMap interps
232   -
233   -
234   --- | Convert a segment to a segment from a core library.
235   -packSeg :: P.Tagset -> Seg Tag -> X.Seg Word P.Tag
236   -packSeg tagset = X.mapSeg (P.parseTag tagset) . packSeg_
237   -
238   -
239   --- | Convert a sentence to a sentence from a core library.
240   -packSent :: P.Tagset -> Sent Tag -> X.Sent Word P.Tag
241   -packSent = map . packSeg
242   -
243   -
244   --- | Convert a sentence to a sentence from a core library.
245   -packSentO :: P.Tagset -> SentO Tag -> X.SentO Word P.Tag
246   -packSentO tagset s = X.SentO
247   - { segs = packSent tagset (segs s)
248   - , orig = orig s }
concraft/src/NLP/Concraft/Polish/Request.hs deleted
1   -{-# LANGUAGE RecordWildCards #-}
2   -{-# LANGUAGE GeneralizedNewtypeDeriving #-}
3   -
4   -
5   -module NLP.Concraft.Polish.Request
6   -(
7   --- * Request
8   - Request (..)
9   -, Config (..)
10   --- ** Short
11   -, Short (..)
12   -, short
13   --- ** Long
14   -, Long (..)
15   -, long
16   -) where
17   -
18   -
19   -import Control.Applicative ((<$>), (<*>))
20   -import qualified Control.Monad.LazyIO as LazyIO
21   -import qualified Data.Char as Char
22   -import qualified Data.List.Split as Split
23   -import qualified Data.Text as T
24   -import qualified Data.Text.Lazy as L
25   -import qualified Data.Binary as B
26   -
27   -import NLP.Concraft.Polish
28   -import NLP.Concraft.Polish.Maca
29   -import NLP.Concraft.Polish.Morphosyntax hiding (tag)
30   -
31   -
32   --------------------------------------------------
33   --- Configuration
34   --------------------------------------------------
35   -
36   -
37   --- | A request with configuration.
38   -data Request t = Request {
39   - -- | The actuall request.
40   - rqBody :: t
41   - -- | Request configuration.
42   - , rqConf :: Config }
43   -
44   -
45   -instance B.Binary t => B.Binary (Request t) where
46   - put Request{..} = B.put rqBody >> B.put rqConf
47   - get = Request <$> B.get <*> B.get
48   -
49   -
50   --- | Tagging configuration.
51   -newtype Config = Config {
52   - -- | Tag with marginal probabilities.
53   - tagProbs :: Bool
54   - } deriving (B.Binary)
55   -
56   -
57   --------------------------------------------------
58   --- Short request
59   --------------------------------------------------
60   -
61   -
62   --- | A short request.
63   -data Short
64   - = Short T.Text
65   - | Par [Sent Tag]
66   -
67   -
68   -instance B.Binary Short where
69   - put (Short x) = B.putWord8 0 >> B.put x
70   - put (Par x) = B.putWord8 1 >> B.put x
71   - get = B.getWord8 >>= \x -> case x of
72   - 0 -> Short <$> B.get
73   - _ -> Par <$> B.get
74   -
75   -
76   --- | Process the short request.
77   -short :: MacaPool -> Concraft -> Request Short -> IO [Sent Tag]
78   -short pool concraft Request{..} = case rqBody of
79   - Short x -> map (tagit concraft) <$> macaPar pool x
80   - Par x -> return $ map (tagit concraft) x
81   - where
82   - tagit = if tagProbs rqConf then marginals else tag
83   -
84   -
85   --------------------------------------------------
86   --- Long request
87   --------------------------------------------------
88   -
89   -
90   --- | A request to parse a long text.
91   -data Long
92   - = Long L.Text
93   - | Doc [[Sent Tag]]
94   -
95   -
96   -instance B.Binary Long where
97   - put (Long x) = B.putWord8 0 >> B.put x
98   - put (Doc x) = B.putWord8 1 >> B.put x
99   - get = B.getWord8 >>= \x -> case x of
100   - 0 -> Long <$> B.get
101   - _ -> Doc <$> B.get
102   -
103   -
104   --- | Process the long request given the processor for the
105   --- short request.
106   -long :: (Request Short -> IO a) -> Request Long -> IO [a]
107   -long handler Request{..} = case rqBody of
108   - Long inp ->
109   - LazyIO.mapM f . map L.unlines
110   - . Split.splitWhen (L.all Char.isSpace)
111   - . L.lines $ inp
112   - Doc inp -> LazyIO.mapM g inp
113   - where
114   - f x = handler . r $ Short $ L.toStrict x
115   - g x = handler . r $ Par x
116   - r x = Request {rqBody = x, rqConf = rqConf}
concraft/src/NLP/Concraft/Polish/Server.hs deleted
1   -{-# LANGUAGE OverloadedStrings #-}
2   -{-# LANGUAGE RecordWildCards #-}
3   -
4   -
5   -module NLP.Concraft.Polish.Server
6   -(
7   --- * Server
8   - runConcraftServer
9   -
10   --- * Client
11   -, submit
12   -) where
13   -
14   -
15   -import Control.Applicative ((<$>))
16   -import Control.Monad (forever, void)
17   -import Control.Concurrent (forkIO)
18   -import System.IO (Handle, hFlush)
19   -import qualified Network as N
20   -import qualified Data.Binary as B
21   -import qualified Data.ByteString.Lazy as BS
22   -
23   -import NLP.Concraft.Polish.Morphosyntax hiding (tag)
24   -import NLP.Concraft.Polish.Maca
25   -import qualified NLP.Concraft.Polish as C
26   -import qualified NLP.Concraft.Polish.Request as R
27   -
28   -
29   --------------------------------------------------
30   --- Server
31   --------------------------------------------------
32   -
33   -
34   --- | Run a Concraft server on a given port.
35   -runConcraftServer :: MacaPool -> C.Concraft -> N.PortID -> IO ()
36   -runConcraftServer pool concraft port = N.withSocketsDo $ do
37   - sock <- N.listenOn port
38   - forever $ sockHandler pool concraft sock
39   -
40   -
41   --- | Read and process short requests from the socket.
42   -sockHandler :: MacaPool -> C.Concraft -> N.Socket -> IO ()
43   -sockHandler pool concraft sock = do
44   - (handle, _, _) <- N.accept sock
45   - -- putStrLn "Connection established"
46   - void $ forkIO $ do
47   - -- putStrLn "Waiting for input..."
48   - inp <- recvMsg handle
49   - -- putStr "> " >> T.putStrLn inp
50   - out <- R.short pool concraft inp
51   - -- putStr "No. of sentences: " >> print (length out)
52   - sendMsg handle out
53   -
54   -
55   --------------------------------------------------
56   --- Client
57   --------------------------------------------------
58   -
59   -
60   --- | Submit the given request.
61   -submit :: N.HostName -> N.PortID -> R.Request R.Short -> IO [Sent Tag]
62   -submit host port inp = do
63   - handle <- N.connectTo host port
64   - -- putStrLn "Connection established"
65   - -- putStr "Send request: " >> T.putStrLn inp
66   - sendMsg handle inp
67   - recvMsg handle
68   -
69   -
70   --------------------------------------------------
71   --- Communication
72   --------------------------------------------------
73   -
74   -
75   -sendMsg :: B.Binary a => Handle -> a -> IO ()
76   -sendMsg h msg = do
77   - let x = B.encode msg
78   - n = fromIntegral $ BS.length x
79   - sendInt h n
80   - BS.hPut h x
81   - hFlush h
82   -
83   -
84   -recvMsg :: B.Binary a => Handle -> IO a
85   -recvMsg h = do
86   - n <- recvInt h
87   - B.decode <$> BS.hGet h n
88   -
89   -
90   -sendInt :: Handle -> Int -> IO ()
91   -sendInt h x = BS.hPut h (B.encode x)
92   -
93   -
94   -recvInt :: Handle -> IO Int
95   -recvInt h = B.decode <$> BS.hGet h 8
96   -
97   -
98   --- -------------------------------------------------
99   --- -- Stream binary encoding
100   --- -------------------------------------------------
101   ---
102   ---
103   --- newtype Stream a = Stream { unstream :: [a] }
104   ---
105   ---
106   --- instance B.Binary a => B.Binary (Stream a) where
107   --- put (Stream []) = B.putWord8 0
108   --- put (Stream (x:xs)) = B.putWord8 1 >> B.put x >> B.put (Stream xs)
109   --- get = error "use lazyDecodeStream insted"
110   ---
111   ---
112   --- getMaybe :: B.Binary a => B.Get (Maybe a)
113   --- getMaybe = do
114   --- t <- B.getWord8
115   --- case t of
116   --- 0 -> return Nothing
117   --- _ -> fmap Just B.get
118   ---
119   ---
120   --- step :: B.Binary a => (ByteString, Int64) -> Maybe (a, (ByteString, Int64))
121   --- step (xs, offset) = case B.runGetState getMaybe xs offset of
122   --- (Just v, ys, newOffset) -> Just (v, (ys, newOffset))
123   --- _ -> Nothing
124   ---
125   ---
126   --- lazyDecodeList :: B.Binary a => ByteString -> [a]
127   --- lazyDecodeList xs = unfoldr step (xs, 0)
128   ---
129   ---
130   --- lazyDecodeStream :: B.Binary a => ByteString -> Stream a
131   --- lazyDecodeStream = Stream . lazyDecodeList
concraft/stack.yaml deleted
1   -# For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md
2   -
3   -# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2)
4   -resolver: lts-1.15
5   -
6   -# Local packages, usually specified by relative directory name
7   -packages:
8   -- '.'
9   -
10   -# Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3)
11   -extra-deps:
12   -- concraft-0.9.4
13   -- lazy-io-0.1.0
14   -- sgd-0.3.7
15   -- tagset-positional-0.3.0
16   -- crf-chain1-constrained-0.3.2
17   -- crf-chain2-tiers-0.2.4
18   -- data-lens-2.10.7
19   -- monad-codec-0.2.0
20   -- monad-ox-0.3.0
21   -- vector-binary-0.1.1
22   -- temporary-1.1.2.5
23   -
24   -# Override default flag values for local packages and extra-deps
25   -flags: {}
26   -
27   -# Extra package databases containing global packages
28   -extra-package-dbs: []
29   -
30   -# Control whether we use the GHC we find on the path
31   -# system-ghc: true
32   -
33   -# Require a specific version of stack, using version ranges
34   -# require-stack-version: -any # Default
35   -# require-stack-version: >= 1.0.0
36   -
37   -# Override the architecture used by stack, especially useful on Windows
38   -# arch: i386
39   -# arch: x86_64
40   -
41   -# Extra directories used by stack for building
42   -# extra-include-dirs: [/path/to/dir]
43   -# extra-lib-dirs: [/path/to/dir]
concraft/tools/concraft-pl.hs deleted
1   -{-# LANGUAGE DeriveDataTypeable #-}
2   -{-# LANGUAGE ScopedTypeVariables #-}
3   -{-# LANGUAGE RecordWildCards #-}
4   -
5   -
6   -import Control.Applicative ((<$>))
7   -import Control.Monad (unless)
8   -import System.Console.CmdArgs
9   -import System.IO (hFlush, stdout)
10   -import qualified Network as N
11   -import qualified Numeric.SGD as SGD
12   -import qualified Data.Text.Lazy as L
13   -import qualified Data.Text.Lazy.IO as L
14   -import Data.Tagset.Positional (parseTagset)
15   -import GHC.Conc (numCapabilities)
16   -
17   -import qualified NLP.Concraft.Morphosyntax.Accuracy as Acc
18   -import qualified NLP.Concraft.Guess as Guess
19   -
20   -import qualified NLP.Concraft.Polish.Maca as Maca
21   -import qualified NLP.Concraft.Polish as C
22   -import qualified NLP.Concraft.Polish.Request as R
23   -import qualified NLP.Concraft.Polish.Server as S
24   -import qualified NLP.Concraft.Polish.Morphosyntax as X
25   -import qualified NLP.Concraft.Polish.Format.Plain as P
26   -
27   -import Paths_concraft_pl (version, getDataFileName)
28   -import Data.Version (showVersion)
29   -
30   -
31   --- | Default port number.
32   -portDefault :: Int
33   -portDefault = 10089
34   -
35   -
36   ----------------------------------------
37   --- Command line options
38   ----------------------------------------
39   -
40   -
41   --- | Data formats.
42   -data Format = Plain deriving (Data, Typeable, Show)
43   -
44   -
45   --- | A description of the Concraft-pl tool
46   -concraftDesc :: String
47   -concraftDesc = "Concraft-pl " ++ showVersion version
48   -
49   -
50   -data Concraft
51   - = Train
52   - { trainPath :: FilePath
53   - , evalPath :: Maybe FilePath
54   - , format :: Format
55   - , tagsetPath :: Maybe FilePath
56   - , noAna :: Bool
57   - -- , discardHidden :: Bool
58   - , iterNum :: Double
59   - , batchSize :: Int
60   - , regVar :: Double
61   - , gain0 :: Double
62   - , tau :: Double
63   - , disk :: Bool
64   - , outModel :: FilePath
65   - , guessNum :: Int
66   - , r0 :: Guess.R0T }
67   - | Tag
68   - { inModel :: FilePath
69   - , noAna :: Bool
70   - , format :: Format
71   - , marginals :: Bool }
72   - -- , guessNum :: Int }
73   - | Server
74   - { inModel :: FilePath
75   - , port :: Int }
76   - | Client
77   - { noAna :: Bool
78   - , format :: Format
79   - , marginals :: Bool
80   - , host :: String
81   - , port :: Int }
82   - | Compare
83   - { tagsetPath :: Maybe FilePath
84   - , refPath :: FilePath
85   - , otherPath :: FilePath
86   - , format :: Format }
87   - | Prune
88   - { inModel :: FilePath
89   - , outModel :: FilePath
90   - , threshold :: Double }
91   --- | ReAna
92   --- { format :: Format }
93   - deriving (Data, Typeable, Show)
94   -
95   -
96   -trainMode :: Concraft
97   -trainMode = Train
98   - { trainPath = def &= argPos 1 &= typ "TRAIN-FILE"
99   - , evalPath = def &= typFile &= help "Evaluation file"
100   - , tagsetPath = def &= typFile &= help "Tagset definition file"
101   - , format = enum [Plain &= help "Plain format"]
102   - , noAna = False &= help "Do not perform reanalysis"
103   - -- , discardHidden = False &= help "Discard hidden features"
104   - , iterNum = 20 &= help "Number of SGD iterations"
105   - , batchSize = 50 &= help "Batch size"
106   - , regVar = 10.0 &= help "Regularization variance"
107   - , gain0 = 1.0 &= help "Initial gain parameter"
108   - , tau = 5.0 &= help "Initial tau parameter"
109   - , disk = False &= help "Store SGD dataset on disk"
110   - , outModel = def &= typFile &= help "Output Model file"
111   - , guessNum = 10 &= help "Number of guessed tags for each unknown word"
112   - , r0 = Guess.OovChosen &= help "R0 construction method" }
113   -
114   -
115   -tagMode :: Concraft
116   -tagMode = Tag
117   - { inModel = def &= argPos 0 &= typ "MODEL-FILE"
118   - , noAna = False &= help "Do not analyse input text"
119   - , format = enum [Plain &= help "Plain format"]
120   - , marginals = False &= help "Tag with marginal probabilities" }
121   - -- , guessNum = 10 &= help "Number of guessed tags for each unknown word" }
122   -
123   -
124   -serverMode :: Concraft
125   -serverMode = Server
126   - { inModel = def &= argPos 0 &= typ "MODEL-FILE"
127   - , port = portDefault &= help "Port number" }
128   -
129   -
130   -clientMode :: Concraft
131   -clientMode = Client
132   - { noAna = False &= help "Do not perform reanalysis"
133   - , port = portDefault &= help "Port number"
134   - , host = "localhost" &= help "Server host name"
135   - , format = enum [Plain &= help "Plain output format"]
136   - , marginals = False &= help "Tag with marginal probabilities" }
137   -
138   -
139   -compareMode :: Concraft
140   -compareMode = Compare
141   - { refPath = def &= argPos 1 &= typ "REFERENCE-FILE"
142   - , otherPath = def &= argPos 2 &= typ "OTHER-FILE"
143   - , tagsetPath = def &= typFile &= help "Tagset definition file"
144   - , format = enum [Plain &= help "Plain format"] }
145   -
146   -
147   -pruneMode :: Concraft
148   -pruneMode = Prune
149   - { inModel = def &= argPos 0 &= typ "INPUT-MODEL"
150   - , outModel = def &= argPos 1 &= typ "OUTPUT-MODEL"
151   - , threshold = 0.05 &=
152   - help "Remove disambiguation features below the threshold" }
153   -
154   -
155   --- reAnaMode :: Concraft
156   --- reAnaMode = ReAna
157   --- { format = enum [Plain &= help "Plain format"] }
158   -
159   -
160   -argModes :: Mode (CmdArgs Concraft)
161   -argModes = cmdArgsMode $ modes
162   - [trainMode, tagMode, serverMode, clientMode, compareMode, pruneMode]
163   - &= summary concraftDesc
164   - &= program "concraft-pl"
165   -
166   -
167   ----------------------------------------
168   --- Main
169   ----------------------------------------
170   -
171   -
172   -main :: IO ()
173   -main = exec =<< cmdArgsRun argModes
174   -
175   -
176   -exec :: Concraft -> IO ()
177   -
178   -
179   -exec Train{..} = do
180   - tagsetPath' <- case tagsetPath of
181   - Nothing -> getDataFileName "config/nkjp-tagset.cfg"
182   - Just x -> return x
183   - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath'
184   - let train0 = parseFileO format trainPath
185   - let eval0 = parseFileO' format evalPath
186   - concraft <- C.train (trainConf tagset) train0 eval0
187   - unless (null outModel) $ do
188   - putStrLn $ "\nSaving model in " ++ outModel ++ "..."
189   - C.saveModel outModel concraft
190   - where
191   - sgdArgs = SGD.SgdArgs
192   - { SGD.batchSize = batchSize
193   - , SGD.regVar = regVar
194   - , SGD.iterNum = iterNum
195   - , SGD.gain0 = gain0
196   - , SGD.tau = tau }
197   - trainConf tagset = C.TrainConf
198   - { tagset = tagset
199   - , sgdArgs = sgdArgs
200   - , reana = not noAna
201   - , onDisk = disk
202   - , guessNum = guessNum
203   - , r0 = r0 }
204   -
205   -
206   -exec Tag{..} = do
207   - cft <- C.loadModel inModel
208   - pool <- Maca.newMacaPool numCapabilities
209   - inp <- L.getContents
210   - out <- R.long (R.short pool cft) $ rq $ if noAna
211   - then R.Doc $ parseText format inp
212   - else R.Long inp
213   - L.putStr $ showData showCfg out
214   - where
215   - rq x = R.Request
216   - { R.rqBody = x
217   - , R.rqConf = rqConf }
218   - rqConf = R.Config
219   - { R.tagProbs = marginals }
220   - showCfg = ShowCfg
221   - { formatCfg = format
222   - , showWsCfg = marginals }
223   -
224   -
225   -exec Server{..} = do
226   - putStr "Loading model..." >> hFlush stdout
227   - concraft <- C.loadModel inModel
228   - putStrLn " done"
229   - pool <- Maca.newMacaPool numCapabilities
230   - let portNum = N.PortNumber $ fromIntegral port
231   - putStrLn $ "Listening on port " ++ show port
232   - S.runConcraftServer pool concraft portNum
233   -
234   -
235   -exec Client{..} = do
236   - let portNum = N.PortNumber $ fromIntegral port
237   - inp <- L.getContents
238   - out <- R.long (S.submit host portNum) $ rq $ if noAna
239   - then R.Doc $ parseText format inp
240   - else R.Long inp
241   - L.putStr $ showData showCfg out
242   - where
243   - rq x = R.Request
244   - { R.rqBody = x
245   - , R.rqConf = rqConf }
246   - rqConf = R.Config
247   - { R.tagProbs = marginals }
248   - showCfg = ShowCfg
249   - { formatCfg = format
250   - , showWsCfg = marginals }
251   -
252   -
253   -exec Compare{..} = do
254   - tagsetPath' <- case tagsetPath of
255   - Nothing -> getDataFileName "config/nkjp-tagset.cfg"
256   - Just x -> return x
257   - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath'
258   - let convert = map (X.packSeg tagset) . concat
259   - xs <- convert <$> parseFile format refPath
260   - ys <- convert <$> parseFile format otherPath
261   - let s = Acc.weakLB tagset xs ys
262   - putStrLn $ "Number of segments in reference file: " ++ show (Acc.gold s)
263   - putStrLn $ "Number of correct tags: " ++ show (Acc.good s)
264   - putStrLn $ "Weak accuracy lower bound: " ++ show (Acc.accuracy s)
265   -
266   -
267   -exec Prune{..} = do
268   - cft <- C.loadModel inModel
269   - C.saveModel outModel $ C.prune threshold cft
270   -
271   -
272   --- exec ReAna{..} = do
273   --- inp <- parseText format <$> L.getContents
274   --- out <- showData format <$>
275   -
276   -
277   ----------------------------------------
278   --- Reading files
279   ----------------------------------------
280   -
281   -
282   -parseFileO' :: Format -> Maybe FilePath -> IO [X.SentO X.Tag]
283   -parseFileO' format path = case path of
284   - Nothing -> return []
285   - Just pt -> parseFileO format pt
286   -
287   -
288   -parseFileO :: Format -> FilePath -> IO [X.SentO X.Tag]
289   -parseFileO format path = parseParaO format <$> L.readFile path
290   -
291   -
292   -parseFile :: Format -> FilePath -> IO [X.Sent X.Tag]
293   -parseFile format path = parsePara format <$> L.readFile path
294   -
295   -
296   ----------------------------------------
297   --- Parsing text
298   ----------------------------------------
299   -
300   -
301   --- parseTextO :: Format -> L.Text -> [[X.SentO X.Tag]]
302   --- parseTextO format = map (map X.withOrig) . parseText format
303   -
304   -
305   -parseParaO :: Format -> L.Text -> [X.SentO X.Tag]
306   -parseParaO format = map X.withOrig . parsePara format
307   -
308   -
309   ----------------------------------------
310   --- Parsing (format dependent)
311   ----------------------------------------
312   -
313   -
314   -parseText :: Format -> L.Text -> [[X.Sent X.Tag]]
315   -parseText Plain = P.parsePlain
316   -
317   -
318   -parsePara :: Format -> L.Text -> [X.Sent X.Tag]
319   -parsePara Plain = P.parsePara
320   -
321   -
322   ----------------------------------------
323   --- Showing (format dependent)
324   ----------------------------------------
325   -
326   -
327   -data ShowCfg = ShowCfg {
328   - -- | The format used.
329   - formatCfg :: Format
330   - -- | Show weights?
331   - , showWsCfg :: Bool }
332   -
333   -
334   -showData :: ShowCfg -> [[X.Sent X.Tag]] -> L.Text
335   -showData ShowCfg{..} = P.showPlain (P.ShowCfg {P.showWsCfg = showWsCfg})
integration/.gitignore 0 → 100644
  1 +test
... ...
integration/ENIAMpreIntegration.ml
... ... @@ -25,11 +25,48 @@ let mate_parser_enabled = ref false
25 25 let swigra_enabled = ref false
26 26 let polfie_enabled = ref false
27 27  
  28 +let concraft_model_filename = "../tools/concraft/nkjp-model-0.2.gz"
  29 +let concraft_server_pid = ref (-1)
28 30  
29   -(*
30   -UWAGA: Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń:
31   -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz
32   -*)
  31 +let concraft_exists () =
  32 + let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl")
  33 + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  34 + let close_check () = Unix.close_process_full (check_in, check_out, check_err) in
  35 + try
  36 + ignore @@ input_line check_in;
  37 + ignore @@ close_check ();
  38 + true
  39 + with End_of_file -> ignore @@ close_check (); false
  40 +
  41 +let wait_for_server () =
  42 + let rec wait s a =
  43 + try Unix.connect s a
  44 + with e -> Unix.sleep 1; wait s a in
  45 + let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in
  46 + let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in
  47 + wait s a;
  48 + Unix.shutdown s Unix.SHUTDOWN_SEND;
  49 + Unix.close s
  50 +
  51 +let start_server m =
  52 + let client_out, server_out = Unix.pipe () in
  53 + let client_err, server_err = Unix.pipe () in
  54 + let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|]
  55 + Unix.stdin server_out server_err in
  56 + List.iter Unix.close [client_out; server_out; client_err; server_err];
  57 + wait_for_server ();
  58 + pid
  59 +
  60 +let stop_server pid =
  61 + Unix.kill pid Sys.sigint
  62 +
  63 +let initialize () =
  64 + if !concraft_enabled then (
  65 + if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else
  66 + if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else
  67 + print_endline "Starting Concraft Server";
  68 + concraft_server_pid := start_server concraft_model_filename;
  69 + print_endline "Server started")
33 70  
34 71 let read_whole_channel c =
35 72 let r = ref [] in
... ... @@ -40,6 +77,16 @@ let read_whole_channel c =
40 77 !r
41 78 with End_of_file -> List.rev (!r)
42 79  
  80 +let concraft_parse s =
  81 + let concraft_in, concraft_out, concraft_err =
  82 + Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
  83 + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  84 + let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
  85 + let result = read_whole_channel concraft_in in
  86 + ignore (Unix.close_process_full (concraft_in, concraft_out, concraft_err));
  87 + if err_msg <> "" then failwith err_msg else
  88 + result
  89 +
43 90 let rec process_concraft_result orth lemma interp others rev = function
44 91 [] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
45 92 | "" :: l -> process_concraft_result orth lemma interp others rev l
... ... @@ -52,27 +99,6 @@ let rec process_concraft_result orth lemma interp others rev = function
52 99 | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
53 100 | _ -> failwith ("process_concraft_result: " ^ line))
54 101  
55   -let concraft_parse s =
56   - let concraft_in, concraft_out, concraft_err =
57   - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
58   - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
59   - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
60   - let result = read_whole_channel concraft_in in
61   - if err_msg <> "" then failwith err_msg else
62   - process_concraft_result "" "" "" [] [] result
63   -
64   -(*let rec load_concraft_sentence white orth rev ic =
65   - (* print_endline "load_concraft_sentence 1"; *)
66   - (* print_endline ("concraft error message: " ^ input_line concraft_err); *)
67   - let s = input_line ic in
68   - (* print_endline ("load_concraft_sentence: " ^ s); *)
69   - if s = "" then List.rev rev else
70   - match Xstring.split_delim "\t" s with
71   - [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic
72   - | [""; lemma; interp] -> load_concraft_sentence white orth rev ic
73   - | [orth; white] -> load_concraft_sentence white orth rev ic
74   - | _ -> failwith ("load_concraft_sentence: " ^ s)*)
75   -
76 102 let make_token (orth,l) =
77 103 if l = [] then failwith "make_token 1" else
78 104 let lemma,interp = List.hd l in
... ... @@ -82,10 +108,8 @@ let make_token (orth,l) =
82 108 {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)}
83 109  
84 110 let parse_mate tokens pbeg s =
85   - (* print_endline ("parse_mate: " ^ s); *)
86   - (* Printf.fprintf concraft_out "%s\n\n%!" s;
87   - let l = load_concraft_sentence "" "" [] concraft_in in *)
88   - let l = concraft_parse s in
  111 + let result = concraft_parse s in
  112 + let l = process_concraft_result "" "" "" [] [] result in
89 113 let l = Xlist.map l make_token in
90 114 let l = {empty_token_env with token = Interp "<conll_root>"} :: l in
91 115 let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
... ...
integration/TODO
1 1 - uporządkować położenie info_sentences i podzielić na część dotyczącą formatu i część dotyczącą korpusu
  2 +
  3 +- w ENIAMpreIntegration.parse_sentence zdania dla mate parsera tworzone są również wtedy, gdy są w cudzysłowach
... ...
integration/concraft_test.ml deleted
1   -
2   -(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *)
3   -(*let concraft_in, concraft_out, concraft_err =
4   - Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz"
5   - [|"PATH=" ^ Sys.getenv "PATH"|]
6   -
7   -let _ =
8   - print_endline "out";
9   - Printf.fprintf concraft_out "Ala ma kota.\n\n%!";
10   - print_endline "in";
11   - print_endline ("concraft error message: " ^ input_line concraft_err);
12   - ()*)
13   -
14   -(**********************)
15   -
16   -(*
17   -Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń:
18   -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz
19   -*)
20   -
21   -let read_whole_channel c =
22   - let r = ref [] in
23   - try
24   - while true do
25   - r := (input_line c) :: !r
26   - done;
27   - !r
28   - with End_of_file -> List.rev (!r)
29   -
30   -(* Gdy serwer jest już włączony na concraft_err trafia komunikat:
31   -concraft-pl: bind: resource busy (Address already in use)
32   -w przeciwnym przypadku się program wiesza się na czytaniu concraft_in
33   -*)
34   -
35   -(*let _ =
36   - print_endline "Starting concraft server 1";
37   - let concraft_in, concraft_out, concraft_err =
38   - Unix.open_process_full "concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz"
39   - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
40   - (* let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
41   - let result = read_whole_channel concraft_in in *)
42   - print_endline "Starting concraft server 2";
43   - print_endline (input_line concraft_err);
44   - print_endline "Starting concraft server 3";
45   - print_endline (input_line concraft_err);
46   - print_endline "Starting concraft server 3";
47   - (* print_endline err_msg;
48   - print_endline "Starting concraft server 3";
49   - print_endline (String.concat "\n" result);
50   - print_endline "Starting concraft server 4"; *)
51   - ()*)
52   -
53   -let rec process_concraft_result orth lemma interp others rev = function
54   - [] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
55   - | "" :: l -> process_concraft_result orth lemma interp others rev l
56   - | line :: l ->
57   - (match Xstring.split_delim "\t" line with
58   - [orth2;s] when s = "none" || s = "space" ->
59   - if orth = "" then process_concraft_result orth2 lemma interp others rev l
60   - else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
61   - | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
62   - | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
63   - | _ -> failwith ("process_concraft_result: " ^ line))
64   -
65   -let concraft_parse s =
66   - let concraft_in, concraft_out, concraft_err =
67   - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
68   - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
69   - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
70   - let result = read_whole_channel concraft_in in
71   - if err_msg <> "" then failwith err_msg else
72   - process_concraft_result "" "" "" [] [] result
73   -
74   -let print_parsed_tokens l =
75   - Xlist.iter l (fun (orth,l) ->
76   - if l = [] then failwith "print_parsed_tokens" else
77   - let lemma,interp = List.hd l in
78   - print_endline (orth ^ "\t" ^ lemma ^ "\t" ^ interp))
79   -
80   -let _ =
81   - print_parsed_tokens (concraft_parse "Ala ma kota.");
82   - print_parsed_tokens (concraft_parse "Szpak frunie.");
83   - print_parsed_tokens (concraft_parse "Miałem miał.");
84   - print_parsed_tokens (concraft_parse "Kiedyś miałem kota.");
85   - print_parsed_tokens (concraft_parse "Kiadyś kupiłem kota.");
86   - ()
integration/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa #eniam-integration.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml
... ... @@ -24,12 +24,9 @@ eniam-integration.cma: $(SOURCES)
24 24 eniam-integration.cmxa: $(SOURCES)
25 25 ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^
26 26  
27   -test: test.ml $(SOURCES)
  27 +test: test.ml
28 28 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
29 29  
30   -concraft_test: concraft_test.ml
31   - $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml
32   -
33 30 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
34 31  
35 32 .mll.ml:
... ...
integration/test.ml
... ... @@ -26,60 +26,17 @@ let test_strings = [
26 26 "Szpak powiedział: „Frunę. Kiszę.”";
27 27 ]
28 28  
29   -let wait_for_server () =
30   - let rec wait s a =
31   - try Unix.connect s a
32   - with _ -> Unix.sleep 1; wait s a in
33   - let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in
34   - let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in
35   - wait s a;
36   - Unix.shutdown s Unix.SHUTDOWN_SEND;
37   - Unix.close s
38   -
39   -let start_server () =
40   - let client_out, server_out = Unix.pipe () in
41   - let client_err, server_err = Unix.pipe () in
42   - let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; "nkjp-model-0.2.gz"|]
43   - Unix.stdin server_out server_err in
44   - List.iter Unix.close [client_out; server_out; client_err; server_err];
45   - wait_for_server ();
46   - pid
47   -
48   -let stop_server pid =
49   - Unix.kill pid Sys.sigint
50   -
51   -let tag s =
52   - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
53   - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|]
54   -
55   -let _ =
56   - print_endline "Starting Server";
57   - let pid = start_server () in
58   - print_endline "Ready";
59   - let concraft_in, concraft_out, concraft_err = tag "Ala ma kota." in
60   - try
61   - while true do
62   - print_endline @@ input_line concraft_in
63   - done
64   - with End_of_file -> ();
65   - try
66   - while true do
67   - print_endline @@ "concraft error message: " ^ input_line concraft_err
68   - done
69   - with End_of_file -> ();
70   - ignore @@ Unix.close_process_full (concraft_in, concraft_out, concraft_err);
71   - stop_server pid
72   -
73   -(*
74 29 let _ =
  30 + ENIAMpreIntegration.concraft_enabled := true;
75 31 ENIAMpreIntegration.mate_parser_enabled := true;
  32 + ENIAMpreIntegration.initialize ();
76 33 print_endline "Testy wbudowane";
77 34 Xlist.iter test_strings (fun s ->
78 35 print_endline ("\nTEST: " ^ s);
79 36 let text,tokens = ENIAMsubsyntax.parse_text s in
80 37 let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in
81 38 (* print_endline (ENIAMtokenizer.xml_of tokens); *)
82   - print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
  39 + print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens);
83 40 print_endline "";
84 41 print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));
85 42 (* print_endline "Testy użytkownika.";
... ... @@ -92,5 +49,5 @@ let _ =
92 49 print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
93 50 s := read_line ()
94 51 done;*)
  52 + ENIAMpreIntegration.stop_server !ENIAMpreIntegration.concraft_server_pid;
95 53 ()
96   -*)
... ...