Commit dc750b8bdd1d673aad5a0a1dda49fee85bfea1af
1 parent
96883977
Włączenie concrafta do ENIAMintegration
Showing
25 changed files
with
62 additions
and
2162 deletions
concraft/.gitignore deleted
concraft/LICENSE deleted
1 | -Copyright (c) 2013 Jakub Waszczuk | |
2 | -All rights reserved. | |
3 | - | |
4 | -Redistribution and use in source and binary forms, with or without | |
5 | -modification, are permitted provided that the following conditions | |
6 | -are met: | |
7 | - | |
8 | - * Redistributions of source code must retain the above copyright | |
9 | - notice, this list of conditions and the following disclaimer. | |
10 | - | |
11 | - * Redistributions in binary form must reproduce the above | |
12 | - copyright notice, this list of conditions and the following | |
13 | - disclaimer in the documentation and/or other materials provided | |
14 | - with the distribution. | |
15 | - | |
16 | -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 | -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 | -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 | -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
20 | -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
21 | -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
22 | -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
23 | -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
24 | -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
25 | -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
26 | -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
concraft/README.md deleted
1 | -The instructions below apply to modern systems with the following caveats: | |
2 | -Before running stack install it might be necessary to execute stack setup in order to install the correct version of ghc. | |
3 | - | |
4 | -Maca can be installed using the instructions on nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki/InstallOnUbuntu11 with the following exceptions: | |
5 | -The package libboost-all-dev can be installed in place of libboost1.42-all-dev as the latter might be unavailable. | |
6 | -Instead of using adding Bartosz Zaborowski's repository one can download Morfeusz directly from | |
7 | -ppa.launchpad.net/bartosz-zaborowski/nlp/ubuntu/pool/main/m/morfeusz-sgjp/morfeusz-sgjp_0.81-1~precise_amd64.deb | |
8 | -Before installing SFST it is necessary to modify the file maca/third_party/SFST-1.2/SFST/src/fst.C | |
9 | -In lines 445 and 449 the identifier it must be replaced with iter (or any other non-conflicting identifier). | |
10 | - | |
11 | -After Concraft is successfully installed its executable can be found in ~/.local/bin | |
12 | - | |
13 | -Below are the contents of the original README file. | |
14 | - | |
15 | -Concraft-pl | |
16 | -=========== | |
17 | - | |
18 | -This package provides a morphosyntactic tagger for the Polish language. | |
19 | -The tool combines the following components into a pipeline: | |
20 | -* A morphosyntactic segmentation and analysis tool [Maca][maca], | |
21 | -* A morphosyntactic disambiguation library [Concraft][concraft], | |
22 | - | |
23 | -<!--- | |
24 | -* A simple, frequency-driven lemmatiser (TODO). Until the lemmatiser component | |
25 | - is implemented, the tagger may output multiple interpretations (all related | |
26 | - to the same morphosyntactic tag, but with different lemmas) in some cases. | |
27 | ---> | |
28 | - | |
29 | -As for now, the tagger doesn't provide any lemmatisation capabilities. | |
30 | -As a result, it may output multiple interpretations (all related | |
31 | -to the same morphosyntactic tag, but with different lemmas) for some known | |
32 | -words, while for the out-of-vocabulary words it just outputs orthographic | |
33 | -forms as lemmas. | |
34 | - | |
35 | -See the [homepage][homepage] if you wish to download a pre-trained | |
36 | -model for the Polish language. | |
37 | - | |
38 | - | |
39 | -Installation | |
40 | -============ | |
41 | - | |
42 | -It is recommanded to install Concraft-pl using the | |
43 | -[Haskell Tool Stack][stack], which you will need to downoload and | |
44 | -install on your machine beforehand. Then clone this repository into | |
45 | -a local directory and use `stack` to install the library by running: | |
46 | - | |
47 | - stack install | |
48 | - | |
49 | -Unless you plan to use a custom preprocessing pipeline or run | |
50 | -[Maca][maca] on a different machine (see section | |
51 | -[Tagging analysed data](#tagging-analysed-data)), you will also need | |
52 | -the [Maca][maca] tool. A detailed [installation guide][maca-install] | |
53 | -can be found on the [Maca][maca] homepage. | |
54 | - | |
55 | - | |
56 | -Data format | |
57 | -=========== | |
58 | - | |
59 | -The current version of Concraft-pl works on a simple `plain` text format supported by | |
60 | -the [Corpus2][corpus2] tools. You will have to install these tools when you install | |
61 | -[Maca][maca] anyway, so you can use them to convert the output generated | |
62 | -by Concraft-pl to one of other formats supported by [Corpus2][corpus2]. | |
63 | - | |
64 | - | |
65 | -Training | |
66 | -======== | |
67 | - | |
68 | -If you have the training material with disambiguation annotations (stored in the | |
69 | -`plain` text format) you can train the Concraft-pl model yourself. | |
70 | - | |
71 | - concraft-pl train train.plain -e eval.plain -o model.gz | |
72 | - | |
73 | -Concraft-pl uses the [NKJP][nkjp] [morphosyntactic tagset definition](config/nkjp-tagset.cfg) | |
74 | -by default. It will also reanalyse the input data before the actual training. If you want | |
75 | -to change this behaviour, use the `--tagset` and `--noana` command-line options. | |
76 | - | |
77 | -Consider using [runtime system options][ghc-rts]. You can speed up processing | |
78 | -by making use of multiple cores by using the `-N` option. The `-s` option will | |
79 | -produce the runtime statistics, such as the time spent in the garbage collector. | |
80 | -If the program is spending too much time collecting garbage, you can try to | |
81 | -increase the allocation area size with the `-A` option. If you have a big | |
82 | -dataset and it doesn't fit in the computer memory, use the `--disk` flag. | |
83 | -For example, to train the model using four threads and 256M allocation area | |
84 | -size, run: | |
85 | - | |
86 | - concraft-pl train train.plain -e eval.plain -o model.gz +RTS -N4 -A256M -s | |
87 | - | |
88 | -Run `concraft-pl train --help` to learn more about the program arguments and | |
89 | -possible training options. | |
90 | - | |
91 | -Finally, you may consider pruning the resultant model in order to reduce its size. | |
92 | -Features with values close to 0 (in log-domain) have little effect on the modeled | |
93 | -probability and, therefore, it should be safe to discard them. | |
94 | - | |
95 | - concraft-pl prune -t 0.05 input-model.gz pruned-model.gz | |
96 | - | |
97 | - | |
98 | -Tagging | |
99 | -======= | |
100 | - | |
101 | -Once you have a Concraft-pl model you can use the following command tag `input.txt` file: | |
102 | - | |
103 | - concraft-pl tag model.gz < input.txt > output.plain | |
104 | - | |
105 | -The input file is first divided into paragraphs (the tool interprets empty lines | |
106 | -as paragraph ending markers). After that, [Maca][maca] is used to segment and analyse | |
107 | -each paragraph. Finally, [Concraft][concraft] module is used to disambiguate each | |
108 | -sentence in the [Maca][maca] output. | |
109 | - | |
110 | -With the `--marginals` option enabled, Concraft-pl will output marginal probabilities | |
111 | -corresponding to individual tags (determined on the basis of the disambiguation model) | |
112 | -instead of `disamb` markers. | |
113 | - | |
114 | -Run `concraft-pl tag --help` to learn more about possible tagging options. | |
115 | - | |
116 | - | |
117 | -Server | |
118 | -====== | |
119 | - | |
120 | -Concraft-pl provides also a client/server mode. It is handy when, for example, | |
121 | -you need to tag a large collection of small files. Loading Concraft-pl model | |
122 | -from a disk takes considerable amount of time which makes the tagging method | |
123 | -described above very slow in such a setting. | |
124 | - | |
125 | -To start the Concraft-pl server, run: | |
126 | - | |
127 | - concraft-pl server --inmodel model.gz | |
128 | - | |
129 | -You can supply a custom port number using a `--port` option. For example, | |
130 | -to run the server on the `10101` port, use the following command: | |
131 | - | |
132 | - concraft-pl server --inmodel model.gz --port 10101 | |
133 | - | |
134 | -To use the server in a multi-threaded environment, you need to specify the | |
135 | -`-N` [RTS][ghc-rts] option. A set of options which usually yields good | |
136 | -server performance is presented in the following example: | |
137 | - | |
138 | - concraft-pl server --inmodel model.gz +RTS -N -A4M -qg1 -I0 | |
139 | - | |
140 | -Run `concraft-pl server --help` to learn more about possible server-mode options. | |
141 | - | |
142 | -The client mode works just like the tagging mode. The only difference is that, | |
143 | -instead of supplying your client with a model, you need to specify the port number | |
144 | -(in case you used a custom one when starting the server; otherwise, the default | |
145 | -port number will be used). | |
146 | - | |
147 | - concraft-pl client --port 10101 < input.txt > output.plain | |
148 | - | |
149 | -Run `concraft-pl client --help` to learn more about possible client-mode options. | |
150 | - | |
151 | - | |
152 | -Tagging analysed data | |
153 | -===================== | |
154 | - | |
155 | -In some situations you might want to feed Concraft-pl with a previously | |
156 | -analysed data. Perhaps your Maca instance is installed on a different | |
157 | -machine, or maybe you want to use Concraft-pl with a custom | |
158 | -preprocessing pipeline. | |
159 | - | |
160 | -If you want to use a preprocessing pipeline significantly different from | |
161 | -the standard one (Maca), you should first train your own Concraft model. | |
162 | -To train the model on analysed data use the `--noana` training flag. | |
163 | - | |
164 | -Use the same `--noana` flag when you want to tag analysed data. | |
165 | -Input format should be the same as the output format. | |
166 | -This option is currently not supported in the client/server mode. | |
167 | - | |
168 | -*Remember to use the same preprocessing pipeline (segmentation + analysis) for both | |
169 | -training and disambiguation. Inconsistencies between training material and input | |
170 | -data may severely harm the quality of disambiguation.* | |
171 | - | |
172 | - | |
173 | -[stack]: http://docs.haskellstack.org "Haskell Tool Stack" | |
174 | -[homepage]: http://zil.ipipan.waw.pl/Concraft "Homepage" | |
175 | -[concraft]: https://github.com/kawu/concraft "Concraft" | |
176 | -[hackage-repo]: http://hackage.haskell.org/package/concraft-pl "Concraft-pl Hackage repository" | |
177 | -[maca]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki "Maca" | |
178 | -[maca-install]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki#Download-and-install-MACA "Maca installation guide" | |
179 | -[corpus2]: http://nlp.pwr.wroc.pl/redmine/projects/corpus2/wiki "Corpus2" | |
180 | -[ghc]: http://www.haskell.org/ghc "Glasgow Haskell Compiler" | |
181 | -[ghc-rts]: http://www.haskell.org/ghc/docs/latest/html/users_guide/runtime-control.html "GHC runtime system options" | |
182 | -[cabal]: http://www.haskell.org/cabal "Cabal" | |
183 | -[haskell-platform]: http://www.haskell.org/platform "Haskell Platform" | |
184 | -[nkjp]: http://nkjp.pl/index.php?page=0&lang=1 "NKJP" |
concraft/changelog deleted
1 | --*-change-log-*- | |
2 | - | |
3 | -0.7.5 Aug 2016 | |
4 | - * Start using `stack` to avoid dependency problems | |
5 | - | |
6 | -0.7.4 Nov 2014 | |
7 | - * Ignore non-printable characters for compatibility with Maca | |
8 | - | |
9 | -0.7.0 Nov 2013 | |
10 | - * Output guessing results for unknown words | |
11 | - * Allow "--noana" option in the client/server mode | |
12 | - * Add option to output marginal probabilities | |
13 | - * Add separate mode for model pruning | |
14 | - | |
15 | -0.6.0 Nov 2013 | |
16 | - * Output wordform instead of "None" as a lemma for unknown words | |
17 | - | |
18 | -0.5.0 Nov 2013 | |
19 | - * Restore the orth[-2] feature in the observation schema |
concraft/concraft-pl.cabal deleted
1 | -name: concraft-pl | |
2 | -version: 0.7.5 | |
3 | -synopsis: Morphological tagger for Polish | |
4 | -description: | |
5 | - A morphological tagger for Polish based on the concraft library. | |
6 | -license: BSD3 | |
7 | -license-file: LICENSE | |
8 | -cabal-version: >= 1.6 | |
9 | -copyright: Copyright (c) 2013 Jakub Waszczuk | |
10 | -author: Jakub Waszczuk | |
11 | -maintainer: waszczuk.kuba@gmail.com | |
12 | -stability: experimental | |
13 | -category: Natural Language Processing | |
14 | -homepage: http://zil.ipipan.waw.pl/Concraft | |
15 | -build-type: Simple | |
16 | - | |
17 | -extra-source-files: changelog | |
18 | - | |
19 | -data-files: config/nkjp-tagset.cfg | |
20 | - | |
21 | -library | |
22 | - hs-source-dirs: src | |
23 | - | |
24 | - build-depends: | |
25 | - base >= 4 && < 5 | |
26 | - , concraft >= 0.9 && < 0.10 | |
27 | - , tagset-positional >= 0.3 && < 0.4 | |
28 | - , sgd >= 0.3.3 && < 0.4 | |
29 | - , containers >= 0.4 && < 0.6 | |
30 | - , bytestring >= 0.9 && < 0.11 | |
31 | - , text >= 0.11 && < 1.3 | |
32 | - , aeson >= 0.6 && < 0.9 | |
33 | - , binary >= 0.5 && < 0.8 | |
34 | - , process >= 1.1 && < 1.3 | |
35 | - , mtl >= 2.0 && < 2.3 | |
36 | - , transformers >= 0.2 && < 0.5 | |
37 | - , network >= 2.3 && < 2.7 | |
38 | - , lazy-io >= 0.1 && < 0.2 | |
39 | - , split >= 0.2 && < 0.3 | |
40 | - | |
41 | - exposed-modules: | |
42 | - NLP.Concraft.Polish | |
43 | - , NLP.Concraft.Polish.Maca | |
44 | - , NLP.Concraft.Polish.Morphosyntax | |
45 | - , NLP.Concraft.Polish.Request | |
46 | - , NLP.Concraft.Polish.Server | |
47 | - | |
48 | - other-modules: | |
49 | - NLP.Concraft.Polish.Format.Plain | |
50 | - | |
51 | - ghc-options: -Wall -O2 | |
52 | - | |
53 | -source-repository head | |
54 | - type: git | |
55 | - location: https://github.com/kawu/concraft-pl.git | |
56 | - | |
57 | -executable concraft-pl | |
58 | - build-depends: | |
59 | - cmdargs >= 0.10 && < 0.11 | |
60 | - hs-source-dirs: src, tools | |
61 | - main-is: concraft-pl.hs | |
62 | - ghc-options: -Wall -O2 -threaded -rtsopts |
concraft/config/nkjp-tagset.cfg deleted
1 | -# NKJP tagset definition | |
2 | - | |
3 | -[ATTR] | |
4 | - | |
5 | -# Attributes and their values | |
6 | -nmb = sg pl | |
7 | -cas = nom gen dat acc inst loc voc | |
8 | -gnd = m1 m2 m3 f n # n1 n2 n3 p1 p2 p3 | |
9 | -per = pri sec ter | |
10 | -deg = pos com sup | |
11 | -asp = imperf perf | |
12 | -ngt = aff neg | |
13 | -acm = congr rec | |
14 | -acn = akc nakc | |
15 | -ppr = npraep praep | |
16 | -agg = agl nagl | |
17 | -vlc = nwok wok | |
18 | -dot = pun npun | |
19 | -# sentiment = spos sneg | |
20 | - | |
21 | -[RULE] | |
22 | - | |
23 | -# Parsing rules definitions. | |
24 | - | |
25 | -adja = | |
26 | -adjp = | |
27 | -adjc = | |
28 | -conj = | |
29 | -comp = | |
30 | -interp = | |
31 | -pred = | |
32 | -xxx = | |
33 | -adv = [deg] | |
34 | -imps = asp | |
35 | -inf = asp | |
36 | -pant = asp | |
37 | -pcon = asp | |
38 | -qub = [vlc] | |
39 | -prep = cas [vlc] | |
40 | -siebie = cas | |
41 | -subst = nmb cas gnd | |
42 | -depr = nmb cas gnd | |
43 | -ger = nmb cas gnd asp ngt | |
44 | -ppron12 = nmb cas gnd per [acn] | |
45 | -ppron3 = nmb cas gnd per [acn] [ppr] | |
46 | -num = nmb cas gnd acm | |
47 | -numcol = nmb cas gnd acm | |
48 | -adj = nmb cas gnd deg | |
49 | -pact = nmb cas gnd asp ngt | |
50 | -ppas = nmb cas gnd asp ngt | |
51 | -winien = nmb gnd asp | |
52 | -praet = nmb gnd asp [agg] | |
53 | -bedzie = nmb per asp | |
54 | -fin = nmb per asp | |
55 | -impt = nmb per asp | |
56 | -aglt = nmb per asp vlc | |
57 | - | |
58 | -# `ign` is a special tag and we treat in a special way. | |
59 | -# ign = | |
60 | - | |
61 | -brev = dot | |
62 | -burk = | |
63 | -interj = | |
64 | - |
concraft/eval/cv.sh deleted
1 | -# Cross-validation script. | |
2 | - | |
3 | - | |
4 | -# Root data directory | |
5 | -data=$1 | |
6 | - | |
7 | -# NKJP tagset configuration | |
8 | -nkjp_tagset=$2 | |
9 | - | |
10 | - | |
11 | -echo "" | |
12 | -echo "Data directory: "$1 | |
13 | -echo "Tagset file: "$2 | |
14 | -shift | |
15 | -shift | |
16 | -echo "Training arguments: ""$@" | |
17 | -echo "" | |
18 | - | |
19 | - | |
20 | -for i in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" | |
21 | -do | |
22 | - echo $i | |
23 | - | |
24 | - # Construct training material. | |
25 | - echo -n "" > $data/train/train$i.plain | |
26 | - for j in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" | |
27 | - do | |
28 | - if [ $i != $j ]; then | |
29 | - echo "> "$j | |
30 | - cat $data/folds/test$j.plain >> $data/train/train$i.plain | |
31 | - fi | |
32 | - done | |
33 | - | |
34 | - # Train tagger and tag eval file. | |
35 | - concraft-pl train --tagset=$nkjp_tagset $data/train/train$i.plain -e $data/folds/test$i.plain -o $data/concraft/model$i.gz "$@" | |
36 | - concraft-pl tag $data/concraft/model$i.gz < $data/text/test$i.txt > $data/tagged/test$i.plain | |
37 | - | |
38 | - echo -e "\nSTATS\n" | |
39 | - concraft-pl compare --tagset=$nkjp_tagset $data/folds/test$i.plain $data/tagged/test$i.plain | |
40 | - echo -e "\nEND STATS\n" | |
41 | - | |
42 | - # Delete the training material, we will not need it again. | |
43 | - rm $data/train/train$i.plain | |
44 | -done |
concraft/nkjp-model-0.2.gz deleted
No preview for this file type
concraft/notes/binary.txt deleted
1 | -Pamięć zajmowana przez poszczególne elementy binarnej reprezentacji modelu: | |
2 | - | |
3 | -15M ./codec.bin | |
4 | -305M ./actualModel.bin | |
5 | -202M ./featMap.bin | |
6 | -130M ./featMap-otherMap.bin | |
7 | -73M ./featMap-transMaps.bin | |
8 | -104M ./valuesVect.bin | |
9 | - | |
10 | -Sortując zgodnie z rozmiarami (tylko elemety najniższego poziomu): | |
11 | - | |
12 | -130M ./featMap-otherMap.bin | |
13 | -104M ./valuesVect.bin | |
14 | -73M ./featMap-transMaps.bin | |
15 | -15M ./codec.bin | |
16 | - | |
17 | -Trzeba wziąć pod uwagę, jakie są różnice pomiędzy rozmiarami | |
18 | -zwykłej oraz binarnej reprezentacji poszczególnych struktur danych. | |
19 | -Przykładowo, w przypadku wektora różnica ta prawdopodobnie jest | |
20 | -niewielka. W przypadku map różnica jest prawdopodobnie duża. | |
21 | - | |
22 | -Rozmiar mapy `otherMap` to 3390538 par (klucz, wartość). | |
23 | -Będziemy go oznaczali przez N. Zgodnie z | |
24 | -http://blog.johantibell.com/2011/06/memory-footprints-of-some-common-data.html | |
25 | -rozmiar takie mapy w pamięci można szacować jako (na systemie 64-bitowym): | |
26 | - | |
27 | - 6N words + size of keys & values | |
28 | - | |
29 | - 6N words = ~163 MB | |
30 | - size of keys = N * 4 words = ~108 MB | |
31 | - size of vals = N words = ~27 MB | |
32 | - | |
33 | - W sumie = ~300MB ! |
concraft/notes/notes.txt deleted
1 | -Plany i pomysły na dalszy rozwój Concrafta. | |
2 | - | |
3 | - | |
4 | -[HIGH] | |
5 | - | |
6 | - | |
7 | -* Umożliwienie użytkownikowi określenia dodatkowych | |
8 | - cech schematu obserwacji, spoza zestawu oferowanego | |
9 | - przez bibliotekę podstawową. [HARD] | |
10 | - | |
11 | - | |
12 | -[MEDIUM] | |
13 | - | |
14 | - | |
15 | -* Trenowanie modelu bezpośrednio na korpusie NKJP. [EASY] | |
16 | - | |
17 | - | |
18 | -* Cześć konfiguracji należałoby przenieść do plików konfiguracyjnych. | |
19 | - Dotyczy to np. konfiguracji trenowania -- parametry SGD bez większych | |
20 | - problemów mogłyby być zdefiniowane w postaci plików konfiguracyjnych. | |
21 | - [MEDIUM] | |
22 | - | |
23 | - | |
24 | -* Powyższy pomysł jest w bardziej ogólny. Chcielibyśmy dać użytkownikowi | |
25 | - możliwość konfigurowania nie tylko parametrów SGD, ale również | |
26 | - - schematu obserwacji | |
27 | - - podziału na warstwy | |
28 | - W szczególności, użytkownik powinien mieć możliwość zdefiniowania | |
29 | - osobnego schematu obserwacji dla różnych warstw. | |
30 | - | |
31 | - PYTANIE: Czy obecnie tworzona jest jeden kodek dla wszystkich typów | |
32 | - obserwacji? Zdecydowanie powinno tak być! | |
33 | - | |
34 | - | |
35 | -* Fajnie byłoby użyć na poziomie kodeka biblioteki DAWG, dzięki czemu | |
36 | - model byłby bardziej skompresowany. [MEDIUM/HARD] | |
37 | - | |
38 | - ALE: W postaci binarnej kodek zajmuje jedyne 15MB, w porównaniu do | |
39 | - tego pamięć zajmowana przez inne części modelu jest ogromna! | |
40 | - | |
41 | - | |
42 | -* Zmiana interfejsu biblioteki podstawowej: zamiast klasy | |
43 | - Word można dodać np. "środowisko zdania". [HARD] | |
44 | - | |
45 | - | |
46 | -[LOW] | |
47 | - | |
48 | - | |
49 | -* Obsługa dodatkowych formatów. [MEDIUM] | |
50 | - <- Mało ważne chociażby dlatego, że jeśli już Maca jest dostępna, | |
51 | - to może być też wykorzystana do konwersji danych wyjściowych | |
52 | - do innego formatu. |
concraft/notes/plan.txt deleted
1 | -Kroki prowadzące do opublikowania nowej wersji Concrafta w sieci. | |
2 | - | |
3 | -1. Zmergować gałąź 0.5 z gałęzią master w repozytorium https://github.com/kawu/concraft. DONE | |
4 | - | |
5 | -2. Zaktualizować opis na stronie http://zil.ipipan.waw.pl/Concraft. DONE | |
6 | - W szczególności, należy umieścić tam nowy model. DONE | |
7 | - | |
8 | -[Dodatki] | |
9 | - | |
10 | -A. Obecny model 0.5 wytrenowany jest na 90% korpusu milionowego NKJP. | |
11 | - Kiedyś później należałoby wytrenować model na całym korpusie. |
concraft/src/NLP/Concraft/Polish.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish | |
6 | -( | |
7 | --- * Model | |
8 | - C.Concraft | |
9 | -, C.saveModel | |
10 | -, C.loadModel | |
11 | - | |
12 | --- * Tagging | |
13 | -, tag | |
14 | -, marginals | |
15 | - | |
16 | --- * Analysis | |
17 | -, macaPar | |
18 | - | |
19 | --- * Training | |
20 | -, TrainConf (..) | |
21 | -, train | |
22 | - | |
23 | --- * Pruning | |
24 | -, C.prune | |
25 | - | |
26 | --- -- * Analysis | |
27 | --- , anaSent | |
28 | --- , reAnaPar | |
29 | -) where | |
30 | - | |
31 | - | |
32 | -import Control.Applicative ((<$>)) | |
33 | -import qualified Data.Text.Lazy as L | |
34 | -import qualified Data.Set as S | |
35 | - | |
36 | -import qualified Data.Tagset.Positional as P | |
37 | -import qualified Numeric.SGD as SGD | |
38 | - | |
39 | -import qualified NLP.Concraft.Morphosyntax as X | |
40 | -import qualified NLP.Concraft.Schema as S | |
41 | -import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith) | |
42 | -import qualified NLP.Concraft.Guess as G | |
43 | -import qualified NLP.Concraft.Disamb as D | |
44 | -import qualified NLP.Concraft as C | |
45 | --- import qualified NLP.Concraft.Analysis as A | |
46 | - | |
47 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
48 | -import NLP.Concraft.Polish.Maca | |
49 | - | |
50 | - | |
51 | -------------------------------------------------- | |
52 | --- Default configuration | |
53 | -------------------------------------------------- | |
54 | - | |
55 | - | |
56 | --- | Default configuration for the guessing observation schema. | |
57 | -guessSchemaDefault :: SchemaConf | |
58 | -guessSchemaDefault = S.nullConf | |
59 | - { lowPrefixesC = entryWith [1, 2] [0] | |
60 | - , lowSuffixesC = entryWith [1, 2] [0] | |
61 | - , knownC = entry [0] | |
62 | - , begPackedC = entry [0] } | |
63 | - | |
64 | - | |
65 | --- | Default configuration for the guessing observation schema. | |
66 | -disambSchemaDefault :: SchemaConf | |
67 | -disambSchemaDefault = S.nullConf | |
68 | - { lowOrthC = entry [-2, -1, 0, 1] | |
69 | - , lowPrefixesC = oov $ entryWith [1, 2, 3] [0] | |
70 | - , lowSuffixesC = oov $ entryWith [1, 2, 3] [0] | |
71 | - , begPackedC = oov $ entry [0] } | |
72 | - where | |
73 | - oov (Just body) = Just $ body { S.oovOnly = True } | |
74 | - oov Nothing = Nothing | |
75 | - | |
76 | - | |
77 | --- | Default tiered tagging configuration. | |
78 | -tiersDefault :: [D.Tier] | |
79 | -tiersDefault = | |
80 | - [tier1, tier2] | |
81 | - where | |
82 | - tier1 = D.Tier True $ S.fromList ["cas", "per"] | |
83 | - tier2 = D.Tier False $ S.fromList | |
84 | - [ "nmb", "gnd", "deg", "asp" , "ngt", "acm" | |
85 | - , "acn", "ppr", "agg", "vlc", "dot" ] | |
86 | - | |
87 | - | |
88 | -------------------------------------------------- | |
89 | --- Tagging | |
90 | -------------------------------------------------- | |
91 | - | |
92 | - | |
93 | --- | Tag the analysed sentence. | |
94 | -tag :: C.Concraft -> Sent Tag -> Sent Tag | |
95 | -tag concraft sent = | |
96 | - [ select' gs t seg | |
97 | - | (seg, gs, t) <- zip3 sent gss ts ] | |
98 | - where | |
99 | - tagset = C.tagset concraft | |
100 | - packed = packSent tagset sent | |
101 | - tagged = C.tag concraft packed | |
102 | - gss = map (map showTag . S.toList . fst) tagged | |
103 | - ts = map (showTag . snd) tagged | |
104 | - showTag = P.showTag tagset | |
105 | - | |
106 | - | |
107 | --- | Tag the sentence with marginal probabilities. | |
108 | -marginals :: C.Concraft -> Sent Tag -> Sent Tag | |
109 | -marginals concraft sent | |
110 | - = map (uncurry selectWMap) | |
111 | - $ zip wmaps sent | |
112 | - where | |
113 | - tagset = C.tagset concraft | |
114 | - packed = packSent tagset sent | |
115 | - wmaps = map | |
116 | - (X.mapWMap showTag) | |
117 | - (C.marginals concraft packed) | |
118 | - showTag = P.showTag tagset | |
119 | - | |
120 | - | |
121 | -------------------------------------------------- | |
122 | --- Training | |
123 | -------------------------------------------------- | |
124 | - | |
125 | - | |
126 | --- | Training configuration. | |
127 | -data TrainConf = TrainConf { | |
128 | - -- | Tagset. | |
129 | - tagset :: P.Tagset | |
130 | - -- | SGD parameters. | |
131 | - , sgdArgs :: SGD.SgdArgs | |
132 | - -- | Perform reanalysis. | |
133 | - , reana :: Bool | |
134 | - -- | Store SGD dataset on disk. | |
135 | - , onDisk :: Bool | |
136 | - -- | Numer of guessed tags for each word. | |
137 | - , guessNum :: Int | |
138 | - -- | `G.r0T` parameter. | |
139 | - , r0 :: G.R0T } | |
140 | - | |
141 | --- | Train concraft model. | |
142 | --- TODO: It should be possible to supply the two training procedures with | |
143 | --- different SGD arguments. | |
144 | -train | |
145 | - :: TrainConf | |
146 | - -> IO [SentO Tag] -- ^ Training data | |
147 | - -> IO [SentO Tag] -- ^ Evaluation data | |
148 | - -> IO C.Concraft | |
149 | -train TrainConf{..} train0 eval0 = do | |
150 | - | |
151 | - pool <- newMacaPool 1 | |
152 | - let ana = anaSent tagset pool | |
153 | - train1 = map (packSentO tagset) <$> train0 | |
154 | - eval1 = map (packSentO tagset) <$> eval0 | |
155 | - | |
156 | - if reana | |
157 | - then doReana ana train1 eval1 | |
158 | - else noReana train1 eval1 | |
159 | - | |
160 | - where | |
161 | - | |
162 | - doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf | |
163 | - noReana tr ev = C.train tagset guessNum guessConf disambConf | |
164 | - (map X.segs <$> tr) (map X.segs <$> ev) | |
165 | - | |
166 | - guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0 | |
167 | - disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk | |
168 | - | |
169 | - | |
170 | -------------------------------------------------- | |
171 | --- Re-analysis | |
172 | -------------------------------------------------- | |
173 | - | |
174 | - | |
175 | --- | Analyse the given sentence with Maca. | |
176 | -anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag) | |
177 | -anaSent tagset pool | |
178 | - = fmap (packSent tagset . concat) | |
179 | - . macaPar pool . L.toStrict | |
180 | - | |
181 | - | |
182 | --- -- | Reanalyse the input paragraph (lazy IO). | |
183 | --- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag] | |
184 | --- reAnaPar tagset inp = do | |
185 | --- pool <- newMacaPool 1 | |
186 | --- A.reAnaPar tagset (anaSent pool) inp |
concraft/src/NLP/Concraft/Polish/Format/Plain.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | --- | Simple format for morphosyntax representation which | |
5 | --- with no spaces within and that one of the tags indicates | |
6 | --- unknown words. | |
7 | - | |
8 | -module NLP.Concraft.Polish.Format.Plain | |
9 | -( | |
10 | --- * Parsing | |
11 | - parsePlain | |
12 | -, parsePara | |
13 | -, parseSent | |
14 | - | |
15 | --- * Printing | |
16 | -, ShowCfg (..) | |
17 | -, showPlain | |
18 | -, showPara | |
19 | -, showSent | |
20 | -) where | |
21 | - | |
22 | -import Data.Monoid (Monoid, mappend, mconcat) | |
23 | -import Data.Maybe (catMaybes) | |
24 | -import Data.List (groupBy) | |
25 | -import Data.String (IsString) | |
26 | -import qualified Data.Char as C | |
27 | -import qualified Data.Map as M | |
28 | -import qualified Data.Text as T | |
29 | -import qualified Data.Text.Lazy as L | |
30 | -import qualified Data.Text.Lazy.Builder as L | |
31 | -import qualified Data.Text.Lazy.Read as R | |
32 | -import Text.Printf (printf) | |
33 | - | |
34 | -import qualified NLP.Concraft.Morphosyntax as X | |
35 | -import NLP.Concraft.Polish.Morphosyntax | |
36 | - | |
37 | --- | Parse the text in the plain format. | |
38 | -parsePlain :: L.Text -> [[Sent Tag]] | |
39 | -parsePlain = | |
40 | - map parsePara' . groupBy f . L.splitOn "\n\n" | |
41 | - where | |
42 | - f _ xs = case L.uncons xs of | |
43 | - Nothing -> False | |
44 | - Just (x, _) -> not (C.isSpace x) | |
45 | - | |
46 | --- | Parse the paragraph in the plain format. | |
47 | -parsePara :: L.Text -> [Sent Tag] | |
48 | -parsePara = parsePara' . L.splitOn "\n\n" | |
49 | - | |
50 | --- | Parse paragraph already divided into sentence chunks. | |
51 | -parsePara' :: [L.Text] -> [Sent Tag] | |
52 | -parsePara' = map (parseSent . L.strip) . filter (not.isEmpty) | |
53 | - | |
54 | --- | Identify empty chunks of text. | |
55 | -isEmpty :: L.Text -> Bool | |
56 | -isEmpty = L.all C.isSpace | |
57 | - | |
58 | --- | Parse the sentence in the plain format. | |
59 | -parseSent :: L.Text -> Sent Tag | |
60 | -parseSent | |
61 | - = map parseWord | |
62 | - . groupBy (\_ x -> cond x) | |
63 | - . L.lines | |
64 | - where | |
65 | - cond = ("\t" `L.isPrefixOf`) | |
66 | - | |
67 | -parseWord :: [L.Text] -> Seg Tag | |
68 | -parseWord xs = Seg | |
69 | - (Word _orth _space _known) | |
70 | - _interps | |
71 | - where | |
72 | - (_orth, _space) = parseHeader (head xs) | |
73 | - ys = map parseInterp (tail xs) | |
74 | - _known = not (Nothing `elem` ys) | |
75 | - -- _interps = M.fromListWith max (catMaybes ys) | |
76 | - _interps = X.mkWMap $ catMaybes ys | |
77 | - | |
78 | -parseInterp :: L.Text -> Maybe (Interp Tag, Double) | |
79 | -parseInterp = | |
80 | - doIt . tail . L.splitOn "\t" | |
81 | - where | |
82 | - doIt [form, tag] | |
83 | - | tag == ign = Nothing | |
84 | - | otherwise = Just $ | |
85 | - (mkInterp form tag, 0) | |
86 | - doIt [form, tag, "disamb"] = | |
87 | - Just (mkInterp form tag, 1) | |
88 | - doIt [form, tag, weight] = case R.double weight of | |
89 | - Left er -> error $ "parseInterp (weight):" ++ show er | |
90 | - Right w -> Just (mkInterp form tag, fst w) | |
91 | - doIt xs = error $ "parseInterp: " ++ show xs | |
92 | - mkInterp form tag = Interp (L.toStrict form) (L.toStrict tag) | |
93 | - | |
94 | -parseHeader :: L.Text -> (T.Text, Space) | |
95 | -parseHeader xs = | |
96 | - let [_orth, space] = L.splitOn "\t" xs | |
97 | - in (L.toStrict _orth, parseSpace space) | |
98 | - | |
99 | --- TODO: Should we represent newlines and spaces in the `Space` data type? | |
100 | -parseSpace :: L.Text -> Space | |
101 | -parseSpace "none" = None | |
102 | -parseSpace "space" = Space | |
103 | -parseSpace "spaces" = Space -- Multiple spaces | |
104 | -parseSpace "newline" = NewLine | |
105 | -parseSpace "newlines" = NewLine -- Multiple newlines | |
106 | -parseSpace xs = error ("parseSpace: " ++ L.unpack xs) | |
107 | - | |
108 | ------------ | |
109 | --- Printing | |
110 | ------------ | |
111 | - | |
112 | --- | Printing configuration. | |
113 | -data ShowCfg = ShowCfg { | |
114 | - -- | Print weights instead of 'disamb' tags. | |
115 | - showWsCfg :: Bool } | |
116 | - | |
117 | --- | Show the plain data. | |
118 | -showPlain :: ShowCfg ->[[Sent Tag]] -> L.Text | |
119 | -showPlain cfg = | |
120 | - L.intercalate "\n" . map (showPara cfg) | |
121 | - | |
122 | --- | Show the paragraph. | |
123 | -showPara :: ShowCfg -> [Sent Tag] -> L.Text | |
124 | -showPara cfg = L.toLazyText . mconcat . map (\xs -> buildSent cfg xs <> "\n") | |
125 | - | |
126 | --- | Show the sentence. | |
127 | -showSent :: ShowCfg -> Sent Tag -> L.Text | |
128 | -showSent cfg xs = L.toLazyText $ buildSent cfg xs | |
129 | - | |
130 | -buildSent :: ShowCfg -> Sent Tag -> L.Builder | |
131 | -buildSent cfg = mconcat . map (buildWord cfg) | |
132 | - | |
133 | -buildWord :: ShowCfg -> Seg Tag -> L.Builder | |
134 | -buildWord cfg Seg{..} | |
135 | - = L.fromText orth <> "\t" | |
136 | - <> buildSpace space <> "\n" | |
137 | - <> buildKnown orth known | |
138 | - <> buildInterps cfg interps | |
139 | - where Word{..} = word | |
140 | - | |
141 | -buildInterps :: ShowCfg -> X.WMap (Interp Tag) -> L.Builder | |
142 | -buildInterps ShowCfg{..} interps = mconcat | |
143 | - [ "\t" <> buildBase interp <> | |
144 | - "\t" <> buildTag interp <> buildDmb dmb | |
145 | - | (interp, dmb) <- M.toList (X.unWMap interps) ] | |
146 | - where | |
147 | - buildTag = L.fromText . tag | |
148 | - buildBase = L.fromText . base | |
149 | - buildDmb = case showWsCfg of | |
150 | - True -> \x -> between "\t" "\n" | |
151 | - $ L.fromString | |
152 | - $ printf "%.3f" x | |
153 | - False -> \x -> if x > 0 | |
154 | - then "\tdisamb\n" | |
155 | - else "\n" | |
156 | - between x y z = x <> z <> y | |
157 | - | |
158 | -buildSpace :: Space -> L.Builder | |
159 | -buildSpace None = "none" | |
160 | -buildSpace Space = "space" | |
161 | -buildSpace NewLine = "newline" | |
162 | - | |
163 | -buildKnown :: T.Text -> Bool -> L.Builder | |
164 | -buildKnown _ True = "" | |
165 | -buildKnown lemma False | |
166 | - = "\t" <> L.fromText lemma | |
167 | - <> "\t" <> L.fromText ign | |
168 | - <> "\n" | |
169 | - | |
170 | - | |
171 | ------------ | |
172 | --- Utils | |
173 | ------------ | |
174 | - | |
175 | - | |
176 | --- | An infix synonym for 'mappend'. | |
177 | -(<>) :: Monoid m => m -> m -> m | |
178 | -(<>) = mappend | |
179 | -{-# INLINE (<>) #-} | |
180 | - | |
181 | - | |
182 | --- | Tag which indicates unknown words. | |
183 | -ign :: IsString a => a | |
184 | -ign = "ign" | |
185 | -{-# INLINE ign #-} |
concraft/src/NLP/Concraft/Polish/Maca.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | --- | The module provides interface for the Maca analysis tool. | |
6 | --- See <http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki> | |
7 | --- for more information about the analyser. | |
8 | - | |
9 | - | |
10 | -module NLP.Concraft.Polish.Maca | |
11 | -( | |
12 | - MacaPool | |
13 | -, newMacaPool | |
14 | -, macaPar | |
15 | -) where | |
16 | - | |
17 | - | |
18 | -import Control.Applicative ((<$>)) | |
19 | -import Control.Monad (void, forever, guard, replicateM, unless) | |
20 | -import Control.Concurrent | |
21 | -import Control.Exception | |
22 | -import System.Process | |
23 | -import System.IO | |
24 | -import Data.Function (on) | |
25 | -import qualified Data.Char as C | |
26 | -import qualified Data.Text as T | |
27 | -import qualified Data.Text.IO as T | |
28 | -import qualified Data.Text.Lazy as L | |
29 | -import qualified Data.Text.Lazy.IO as L | |
30 | -import qualified Control.Monad.State.Strict as S | |
31 | -import qualified Control.Monad.Trans.Maybe as M | |
32 | -import Control.Monad.Trans.Class (lift) | |
33 | - | |
34 | -import NLP.Concraft.Polish.Morphosyntax hiding (restore) | |
35 | -import qualified NLP.Concraft.Polish.Format.Plain as Plain | |
36 | - | |
37 | - | |
38 | ----------------------------- | |
39 | --- Maca instance | |
40 | ----------------------------- | |
41 | - | |
42 | - | |
43 | --- TODO: We don't have to use channels here. Maximum one element | |
44 | --- should be present in the input/output channel. | |
45 | - | |
46 | - | |
47 | --- | Input channel. | |
48 | -type In = Chan T.Text | |
49 | - | |
50 | - | |
51 | --- | Output channel. | |
52 | -type Out = Chan [Sent Tag] | |
53 | - | |
54 | - | |
55 | --- | Maca communication channels. | |
56 | -newtype Maca = Maca (In, Out) | |
57 | - | |
58 | - | |
59 | --- | Run Maca instance. | |
60 | -newMaca :: IO Maca | |
61 | -newMaca = do | |
62 | - inCh <- newChan | |
63 | - outCh <- newChan | |
64 | - void $ runMacaOn inCh outCh | |
65 | - return $ Maca (inCh, outCh) | |
66 | - | |
67 | - | |
68 | --- | Run Maca server on given channels. | |
69 | --- TODO: Should check, if maca works. In particular, if morfeusz is available. | |
70 | -runMacaOn :: In -> Out -> IO ThreadId | |
71 | -runMacaOn inCh outCh = mask $ \restore -> forkIO (do | |
72 | - let cmd = "maca-analyse" | |
73 | - args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"] | |
74 | - (Just inh, Just outh, Just errh, pid) <- | |
75 | - createProcess (proc cmd args){ std_in = CreatePipe | |
76 | - , std_out = CreatePipe | |
77 | - , std_err = CreatePipe } | |
78 | - | |
79 | - let excHandler = do | |
80 | - let tryIO = try :: IO a -> IO (Either IOException a) | |
81 | - void $ tryIO $ do | |
82 | - err <- hGetContents errh | |
83 | - unless (all C.isSpace err) $ do | |
84 | - putStr "Maca error: " | |
85 | - putStrLn err | |
86 | - hClose inh; hClose outh; hClose errh | |
87 | - terminateProcess pid | |
88 | - waitForProcess pid | |
89 | - | |
90 | - -- TODO: Document, why LineBuffering is needed here. | |
91 | - hSetBuffering outh LineBuffering | |
92 | - flip onException excHandler $ restore $ forever $ do | |
93 | - | |
94 | - -- Take element from the input channel. | |
95 | - txt <- readChan inCh | |
96 | - -- putStr "REQUEST: " | |
97 | - -- print txt | |
98 | - | |
99 | - -- Write text to maca stdin. | |
100 | - -- TODO: Handle the "empty" case? | |
101 | - T.hPutStr inh txt; hFlush inh | |
102 | - | |
103 | - -- Read maca response and put it in the output channel. | |
104 | - writeChan outCh =<< readMacaResponse outh (textWeight txt) | |
105 | - ) | |
106 | - | |
107 | - | |
108 | -readMacaResponse :: Handle -> Int -> IO [Sent Tag] | |
109 | -readMacaResponse h n | |
110 | - | n <= 0 = return [] | |
111 | - | otherwise = do | |
112 | - x <- readMacaSent h | |
113 | - xs <- readMacaResponse h (n - sentWeight x) | |
114 | - return (x : xs) | |
115 | - | |
116 | - | |
117 | -readMacaSent :: Handle -> IO (Sent Tag) | |
118 | -readMacaSent h = | |
119 | - Plain.parseSent . L.unlines <$> getTxt | |
120 | - where | |
121 | - getTxt = do | |
122 | - x <- L.hGetLine h | |
123 | - if L.null x | |
124 | - then return [] | |
125 | - else (x:) <$> getTxt | |
126 | - | |
127 | - | |
128 | ----------------------------- | |
129 | --- Client | |
130 | ----------------------------- | |
131 | - | |
132 | - | |
133 | --- | Analyse paragraph with Maca. | |
134 | -doMacaPar :: Maca -> T.Text -> IO [Sent Tag] | |
135 | -doMacaPar (Maca (inCh, outCh)) par = do | |
136 | - let par' = T.intercalate " " (T.lines par) `T.append` "\n" | |
137 | - writeChan inCh par' | |
138 | - restoreSpaces par <$> readChan outCh | |
139 | - | |
140 | - | |
141 | --- | Restore info abouts spaces from a text and insert them | |
142 | --- to a parsed paragraph. | |
143 | -restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag] | |
144 | -restoreSpaces par sents = | |
145 | - S.evalState (mapM onSent sents) (0, chunks) | |
146 | - where | |
147 | - -- For each space chunk in the paragraph compute | |
148 | - -- total weight of earlier chunks. | |
149 | - parts = T.groupBy ((==) `on` C.isSpace) par | |
150 | - weights = scanl1 (+) (map textWeight parts) | |
151 | - chunks = filter (T.any C.isSpace . fst) (zip parts weights) | |
152 | - | |
153 | - -- Stateful monadic computation which modifies spaces | |
154 | - -- assigned to individual segments. | |
155 | - onSent = mapM onWord | |
156 | - onWord seg = do | |
157 | - n <- addWeight seg | |
158 | - s <- popSpace n | |
159 | - let word' = (word seg) { space = s } | |
160 | - return $ seg { word = word' } | |
161 | - | |
162 | - -- Add weight of the segment to the current weight. | |
163 | - addWeight seg = S.state $ \(n, xs) -> | |
164 | - let m = n + segWeight seg | |
165 | - in (m, (m, xs)) | |
166 | - | |
167 | - -- Pop space from the stack if its weight is lower than | |
168 | - -- the current one. | |
169 | - popSpace n = fmap (maybe None id) . M.runMaybeT $ do | |
170 | - spaces <- lift $ S.gets snd | |
171 | - (sp, m) <- liftMaybe $ maybeHead spaces | |
172 | - guard $ m < n | |
173 | - lift $ S.modify $ \(n', xs) -> (n', tail xs) | |
174 | - return $ toSpace sp | |
175 | - liftMaybe = M.MaybeT . return | |
176 | - maybeHead xs = case xs of | |
177 | - (x:_) -> Just x | |
178 | - [] -> Nothing | |
179 | - | |
180 | - -- Parse strings representation of a Space. | |
181 | - toSpace x | |
182 | - | has '\n' = NewLine | |
183 | - | has ' ' = Space | |
184 | - | otherwise = None | |
185 | - where has c = maybe False (const True) (T.find (==c) x) | |
186 | - | |
187 | - | |
188 | ----------------------------- | |
189 | --- Pool | |
190 | ----------------------------- | |
191 | - | |
192 | - | |
193 | --- | A pool of Maca instances. | |
194 | -newtype MacaPool = MacaPool (Chan Maca) | |
195 | - | |
196 | - | |
197 | --- | Run Maca server. | |
198 | -newMacaPool | |
199 | - :: Int -- ^ Number of Maca instances | |
200 | - -> IO MacaPool | |
201 | -newMacaPool n = do | |
202 | - chan <- newChan | |
203 | - macas <- replicateM n newMaca | |
204 | - writeList2Chan chan macas | |
205 | - return $ MacaPool chan | |
206 | - | |
207 | - | |
208 | -popMaca :: MacaPool -> IO Maca | |
209 | -popMaca (MacaPool c) = readChan c | |
210 | - | |
211 | - | |
212 | -putMaca :: Maca -> MacaPool -> IO () | |
213 | -putMaca x (MacaPool c) = writeChan c x | |
214 | - | |
215 | - | |
216 | --- | Analyse paragraph with Maca. The function is thread-safe. As a | |
217 | --- pre-processing step, all non-printable characters are removed from | |
218 | --- the input (based on empirical observations, Maca behaves likewise). | |
219 | -macaPar :: MacaPool -> T.Text -> IO [Sent Tag] | |
220 | -macaPar pool par0 = do | |
221 | - let par = T.filter C.isPrint par0 | |
222 | - maca <- popMaca pool | |
223 | - doMacaPar maca par `finally` putMaca maca pool | |
224 | - | |
225 | - | |
226 | ------------------------------------------------------------- | |
227 | --- Weight: a number of non-space characters | |
228 | ------------------------------------------------------------- | |
229 | - | |
230 | - | |
231 | --- | A weight of a text. | |
232 | -textWeight :: T.Text -> Int | |
233 | --- textWeight = T.length . T.filter C.isAlphaNum | |
234 | -textWeight = T.length . T.filter (not . C.isSpace) | |
235 | --- textWeight = T.length . T.filter ((&&) <$> not . C.isSpace <*> C.isPrint) | |
236 | - | |
237 | - | |
238 | --- | A weight of a segment. | |
239 | -segWeight :: Seg t -> Int | |
240 | -segWeight = textWeight . orth . word | |
241 | - | |
242 | - | |
243 | --- | A weight of a sentence. | |
244 | -sentWeight :: Sent t -> Int | |
245 | -sentWeight = sum . map segWeight |
concraft/src/NLP/Concraft/Polish/Morphosyntax.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | -{-# LANGUAGE TupleSections #-} | |
4 | - | |
5 | - | |
6 | --- | Morphosyntax data layer in Polish. | |
7 | - | |
8 | - | |
9 | -module NLP.Concraft.Polish.Morphosyntax | |
10 | -( | |
11 | --- * Tag | |
12 | - Tag | |
13 | - | |
14 | --- * Segment | |
15 | -, Seg (..) | |
16 | -, Word (..) | |
17 | -, Interp (..) | |
18 | -, Space (..) | |
19 | -, select | |
20 | -, select' | |
21 | -, selectWMap | |
22 | - | |
23 | --- * Sentence | |
24 | -, Sent | |
25 | -, SentO (..) | |
26 | -, restore | |
27 | -, withOrig | |
28 | - | |
29 | --- * Conversion | |
30 | -, packSeg | |
31 | -, packSent | |
32 | -, packSentO | |
33 | -) where | |
34 | - | |
35 | - | |
36 | -import Control.Applicative ((<$>), (<*>)) | |
37 | -import Control.Arrow (first) | |
38 | -import Data.Maybe (catMaybes) | |
39 | -import Data.Aeson | |
40 | -import Data.Binary (Binary, put, get, putWord8, getWord8) | |
41 | -import qualified Data.Aeson as Aeson | |
42 | -import qualified Data.Set as S | |
43 | -import qualified Data.Map as M | |
44 | -import qualified Data.Text as T | |
45 | -import qualified Data.Text.Lazy as L | |
46 | -import qualified Data.Tagset.Positional as P | |
47 | - | |
48 | -import qualified NLP.Concraft.Morphosyntax as X | |
49 | - | |
50 | - | |
51 | --- | A textual representation of a morphosyntactic tag. | |
52 | -type Tag = T.Text | |
53 | - | |
54 | - | |
55 | --------------------------------- | |
56 | --- Segment | |
57 | --------------------------------- | |
58 | - | |
59 | - | |
60 | --- | A segment consists of a word and a set of morphosyntactic interpretations. | |
61 | -data Seg t = Seg | |
62 | - { word :: Word | |
63 | - -- | Interpretations of the token, each interpretation annotated | |
64 | - -- with a /disamb/ Boolean value (if 'True', the interpretation | |
65 | - -- is correct within the context). | |
66 | - , interps :: X.WMap (Interp t) } | |
67 | - deriving (Show, Eq, Ord) | |
68 | - | |
69 | - | |
70 | -instance (Ord t, Binary t) => Binary (Seg t) where | |
71 | - put Seg{..} = put word >> put interps | |
72 | - get = Seg <$> get <*> get | |
73 | - | |
74 | - | |
75 | --- | A word. | |
76 | -data Word = Word | |
77 | - { orth :: T.Text | |
78 | - , space :: Space | |
79 | - , known :: Bool } | |
80 | - deriving (Show, Eq, Ord) | |
81 | - | |
82 | - | |
83 | -instance X.Word Word where | |
84 | - orth = orth | |
85 | - oov = not.known | |
86 | - | |
87 | - | |
88 | -instance ToJSON Word where | |
89 | - toJSON Word{..} = object | |
90 | - [ "orth" .= orth | |
91 | - , "space" .= space | |
92 | - , "known" .= known ] | |
93 | - | |
94 | - | |
95 | -instance FromJSON Word where | |
96 | - parseJSON (Object v) = Word | |
97 | - <$> v .: "orth" | |
98 | - <*> v .: "space" | |
99 | - <*> v .: "known" | |
100 | - parseJSON _ = error "parseJSON [Word]" | |
101 | - | |
102 | - | |
103 | -instance Binary Word where | |
104 | - put Word{..} = put orth >> put space >> put known | |
105 | - get = Word <$> get <*> get <*> get | |
106 | - | |
107 | - | |
108 | --- | A morphosyntactic interpretation. | |
109 | -data Interp t = Interp | |
110 | - { base :: T.Text | |
111 | - , tag :: t } | |
112 | - deriving (Show, Eq, Ord) | |
113 | - | |
114 | - | |
115 | -instance (Ord t, Binary t) => Binary (Interp t) where | |
116 | - put Interp{..} = put base >> put tag | |
117 | - get = Interp <$> get <*> get | |
118 | - | |
119 | - | |
120 | --- | No space, space or newline. | |
121 | --- TODO: Perhaps we should use a bit more informative data type. | |
122 | -data Space | |
123 | - = None | |
124 | - | Space | |
125 | - | NewLine | |
126 | - deriving (Show, Eq, Ord) | |
127 | - | |
128 | - | |
129 | -instance Binary Space where | |
130 | - put x = case x of | |
131 | - None -> putWord8 1 | |
132 | - Space -> putWord8 2 | |
133 | - NewLine -> putWord8 3 | |
134 | - get = getWord8 >>= \x -> return $ case x of | |
135 | - 1 -> None | |
136 | - 2 -> Space | |
137 | - _ -> NewLine | |
138 | - | |
139 | - | |
140 | -instance ToJSON Space where | |
141 | - toJSON x = Aeson.String $ case x of | |
142 | - None -> "none" | |
143 | - Space -> "space" | |
144 | - NewLine -> "newline" | |
145 | - | |
146 | - | |
147 | -instance FromJSON Space where | |
148 | - parseJSON (Aeson.String x) = return $ case x of | |
149 | - "none" -> None | |
150 | - "space" -> Space | |
151 | - "newline" -> NewLine | |
152 | - _ -> error "parseJSON [Space]" | |
153 | - parseJSON _ = error "parseJSON [Space]" | |
154 | - | |
155 | - | |
156 | --- | Select one chosen interpretation. | |
157 | -select :: Ord a => a -> Seg a -> Seg a | |
158 | -select = select' [] | |
159 | - | |
160 | - | |
161 | --- | Select multiple interpretations and one chosen interpretation. | |
162 | -select' :: Ord a => [a] -> a -> Seg a -> Seg a | |
163 | -select' ys x = selectWMap . X.mkWMap $ (x, 1) : map (,0) ys | |
164 | - | |
165 | - | |
166 | --- | Select interpretations. | |
167 | -selectWMap :: Ord a => X.WMap a -> Seg a -> Seg a | |
168 | -selectWMap wMap seg = | |
169 | - seg { interps = newInterps } | |
170 | - where | |
171 | - wSet = S.fromList . map tag . M.keys . X.unWMap . interps $ seg | |
172 | - newInterps = X.mkWMap $ | |
173 | - [ case M.lookup (tag interp) (X.unWMap wMap) of | |
174 | - Just x -> (interp, x) | |
175 | - Nothing -> (interp, 0) | |
176 | - | interp <- (M.keys . X.unWMap) (interps seg) ] | |
177 | - ++ catMaybes | |
178 | - [ if tag `S.member` wSet | |
179 | - then Nothing | |
180 | - else Just (Interp lemma tag, x) | |
181 | - | let lemma = orth $ word seg -- Default base form | |
182 | - , (tag, x) <- M.toList (X.unWMap wMap) ] | |
183 | - | |
184 | - | |
185 | --------------------------------- | |
186 | --- Sentence | |
187 | --------------------------------- | |
188 | - | |
189 | - | |
190 | --- | A sentence. | |
191 | -type Sent t = [Seg t] | |
192 | - | |
193 | - | |
194 | --- | A sentence. | |
195 | -data SentO t = SentO | |
196 | - { segs :: [Seg t] | |
197 | - , orig :: L.Text } | |
198 | - | |
199 | - | |
200 | --- | Restore textual representation of a sentence. | |
201 | --- The function is not very accurate, it could be improved | |
202 | --- if we enrich representation of a space. | |
203 | -restore :: Sent t -> L.Text | |
204 | -restore = | |
205 | - let wordStr Word{..} = [spaceStr space, orth] | |
206 | - spaceStr None = "" | |
207 | - spaceStr Space = " " | |
208 | - spaceStr NewLine = "\n" | |
209 | - in L.fromChunks . concatMap (wordStr . word) | |
210 | - | |
211 | - | |
212 | --- | Use `restore` to translate `Sent` to a `SentO`. | |
213 | -withOrig :: Sent t -> SentO t | |
214 | -withOrig s = SentO | |
215 | - { segs = s | |
216 | - , orig = restore s } | |
217 | - | |
218 | - | |
219 | ---------------------------- | |
220 | --- Conversion | |
221 | ---------------------------- | |
222 | - | |
223 | - | |
224 | --- | Convert a segment to a segment from a core library. | |
225 | -packSeg_ :: Ord a => Seg a -> X.Seg Word a | |
226 | -packSeg_ Seg{..} | |
227 | - = X.Seg word | |
228 | - $ X.mkWMap | |
229 | - $ map (first tag) | |
230 | - $ M.toList | |
231 | - $ X.unWMap interps | |
232 | - | |
233 | - | |
234 | --- | Convert a segment to a segment from a core library. | |
235 | -packSeg :: P.Tagset -> Seg Tag -> X.Seg Word P.Tag | |
236 | -packSeg tagset = X.mapSeg (P.parseTag tagset) . packSeg_ | |
237 | - | |
238 | - | |
239 | --- | Convert a sentence to a sentence from a core library. | |
240 | -packSent :: P.Tagset -> Sent Tag -> X.Sent Word P.Tag | |
241 | -packSent = map . packSeg | |
242 | - | |
243 | - | |
244 | --- | Convert a sentence to a sentence from a core library. | |
245 | -packSentO :: P.Tagset -> SentO Tag -> X.SentO Word P.Tag | |
246 | -packSentO tagset s = X.SentO | |
247 | - { segs = packSent tagset (segs s) | |
248 | - , orig = orig s } |
concraft/src/NLP/Concraft/Polish/Request.hs deleted
1 | -{-# LANGUAGE RecordWildCards #-} | |
2 | -{-# LANGUAGE GeneralizedNewtypeDeriving #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish.Request | |
6 | -( | |
7 | --- * Request | |
8 | - Request (..) | |
9 | -, Config (..) | |
10 | --- ** Short | |
11 | -, Short (..) | |
12 | -, short | |
13 | --- ** Long | |
14 | -, Long (..) | |
15 | -, long | |
16 | -) where | |
17 | - | |
18 | - | |
19 | -import Control.Applicative ((<$>), (<*>)) | |
20 | -import qualified Control.Monad.LazyIO as LazyIO | |
21 | -import qualified Data.Char as Char | |
22 | -import qualified Data.List.Split as Split | |
23 | -import qualified Data.Text as T | |
24 | -import qualified Data.Text.Lazy as L | |
25 | -import qualified Data.Binary as B | |
26 | - | |
27 | -import NLP.Concraft.Polish | |
28 | -import NLP.Concraft.Polish.Maca | |
29 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
30 | - | |
31 | - | |
32 | -------------------------------------------------- | |
33 | --- Configuration | |
34 | -------------------------------------------------- | |
35 | - | |
36 | - | |
37 | --- | A request with configuration. | |
38 | -data Request t = Request { | |
39 | - -- | The actuall request. | |
40 | - rqBody :: t | |
41 | - -- | Request configuration. | |
42 | - , rqConf :: Config } | |
43 | - | |
44 | - | |
45 | -instance B.Binary t => B.Binary (Request t) where | |
46 | - put Request{..} = B.put rqBody >> B.put rqConf | |
47 | - get = Request <$> B.get <*> B.get | |
48 | - | |
49 | - | |
50 | --- | Tagging configuration. | |
51 | -newtype Config = Config { | |
52 | - -- | Tag with marginal probabilities. | |
53 | - tagProbs :: Bool | |
54 | - } deriving (B.Binary) | |
55 | - | |
56 | - | |
57 | -------------------------------------------------- | |
58 | --- Short request | |
59 | -------------------------------------------------- | |
60 | - | |
61 | - | |
62 | --- | A short request. | |
63 | -data Short | |
64 | - = Short T.Text | |
65 | - | Par [Sent Tag] | |
66 | - | |
67 | - | |
68 | -instance B.Binary Short where | |
69 | - put (Short x) = B.putWord8 0 >> B.put x | |
70 | - put (Par x) = B.putWord8 1 >> B.put x | |
71 | - get = B.getWord8 >>= \x -> case x of | |
72 | - 0 -> Short <$> B.get | |
73 | - _ -> Par <$> B.get | |
74 | - | |
75 | - | |
76 | --- | Process the short request. | |
77 | -short :: MacaPool -> Concraft -> Request Short -> IO [Sent Tag] | |
78 | -short pool concraft Request{..} = case rqBody of | |
79 | - Short x -> map (tagit concraft) <$> macaPar pool x | |
80 | - Par x -> return $ map (tagit concraft) x | |
81 | - where | |
82 | - tagit = if tagProbs rqConf then marginals else tag | |
83 | - | |
84 | - | |
85 | -------------------------------------------------- | |
86 | --- Long request | |
87 | -------------------------------------------------- | |
88 | - | |
89 | - | |
90 | --- | A request to parse a long text. | |
91 | -data Long | |
92 | - = Long L.Text | |
93 | - | Doc [[Sent Tag]] | |
94 | - | |
95 | - | |
96 | -instance B.Binary Long where | |
97 | - put (Long x) = B.putWord8 0 >> B.put x | |
98 | - put (Doc x) = B.putWord8 1 >> B.put x | |
99 | - get = B.getWord8 >>= \x -> case x of | |
100 | - 0 -> Long <$> B.get | |
101 | - _ -> Doc <$> B.get | |
102 | - | |
103 | - | |
104 | --- | Process the long request given the processor for the | |
105 | --- short request. | |
106 | -long :: (Request Short -> IO a) -> Request Long -> IO [a] | |
107 | -long handler Request{..} = case rqBody of | |
108 | - Long inp -> | |
109 | - LazyIO.mapM f . map L.unlines | |
110 | - . Split.splitWhen (L.all Char.isSpace) | |
111 | - . L.lines $ inp | |
112 | - Doc inp -> LazyIO.mapM g inp | |
113 | - where | |
114 | - f x = handler . r $ Short $ L.toStrict x | |
115 | - g x = handler . r $ Par x | |
116 | - r x = Request {rqBody = x, rqConf = rqConf} |
concraft/src/NLP/Concraft/Polish/Server.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish.Server | |
6 | -( | |
7 | --- * Server | |
8 | - runConcraftServer | |
9 | - | |
10 | --- * Client | |
11 | -, submit | |
12 | -) where | |
13 | - | |
14 | - | |
15 | -import Control.Applicative ((<$>)) | |
16 | -import Control.Monad (forever, void) | |
17 | -import Control.Concurrent (forkIO) | |
18 | -import System.IO (Handle, hFlush) | |
19 | -import qualified Network as N | |
20 | -import qualified Data.Binary as B | |
21 | -import qualified Data.ByteString.Lazy as BS | |
22 | - | |
23 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
24 | -import NLP.Concraft.Polish.Maca | |
25 | -import qualified NLP.Concraft.Polish as C | |
26 | -import qualified NLP.Concraft.Polish.Request as R | |
27 | - | |
28 | - | |
29 | -------------------------------------------------- | |
30 | --- Server | |
31 | -------------------------------------------------- | |
32 | - | |
33 | - | |
34 | --- | Run a Concraft server on a given port. | |
35 | -runConcraftServer :: MacaPool -> C.Concraft -> N.PortID -> IO () | |
36 | -runConcraftServer pool concraft port = N.withSocketsDo $ do | |
37 | - sock <- N.listenOn port | |
38 | - forever $ sockHandler pool concraft sock | |
39 | - | |
40 | - | |
41 | --- | Read and process short requests from the socket. | |
42 | -sockHandler :: MacaPool -> C.Concraft -> N.Socket -> IO () | |
43 | -sockHandler pool concraft sock = do | |
44 | - (handle, _, _) <- N.accept sock | |
45 | - -- putStrLn "Connection established" | |
46 | - void $ forkIO $ do | |
47 | - -- putStrLn "Waiting for input..." | |
48 | - inp <- recvMsg handle | |
49 | - -- putStr "> " >> T.putStrLn inp | |
50 | - out <- R.short pool concraft inp | |
51 | - -- putStr "No. of sentences: " >> print (length out) | |
52 | - sendMsg handle out | |
53 | - | |
54 | - | |
55 | -------------------------------------------------- | |
56 | --- Client | |
57 | -------------------------------------------------- | |
58 | - | |
59 | - | |
60 | --- | Submit the given request. | |
61 | -submit :: N.HostName -> N.PortID -> R.Request R.Short -> IO [Sent Tag] | |
62 | -submit host port inp = do | |
63 | - handle <- N.connectTo host port | |
64 | - -- putStrLn "Connection established" | |
65 | - -- putStr "Send request: " >> T.putStrLn inp | |
66 | - sendMsg handle inp | |
67 | - recvMsg handle | |
68 | - | |
69 | - | |
70 | -------------------------------------------------- | |
71 | --- Communication | |
72 | -------------------------------------------------- | |
73 | - | |
74 | - | |
75 | -sendMsg :: B.Binary a => Handle -> a -> IO () | |
76 | -sendMsg h msg = do | |
77 | - let x = B.encode msg | |
78 | - n = fromIntegral $ BS.length x | |
79 | - sendInt h n | |
80 | - BS.hPut h x | |
81 | - hFlush h | |
82 | - | |
83 | - | |
84 | -recvMsg :: B.Binary a => Handle -> IO a | |
85 | -recvMsg h = do | |
86 | - n <- recvInt h | |
87 | - B.decode <$> BS.hGet h n | |
88 | - | |
89 | - | |
90 | -sendInt :: Handle -> Int -> IO () | |
91 | -sendInt h x = BS.hPut h (B.encode x) | |
92 | - | |
93 | - | |
94 | -recvInt :: Handle -> IO Int | |
95 | -recvInt h = B.decode <$> BS.hGet h 8 | |
96 | - | |
97 | - | |
98 | --- ------------------------------------------------- | |
99 | --- -- Stream binary encoding | |
100 | --- ------------------------------------------------- | |
101 | --- | |
102 | --- | |
103 | --- newtype Stream a = Stream { unstream :: [a] } | |
104 | --- | |
105 | --- | |
106 | --- instance B.Binary a => B.Binary (Stream a) where | |
107 | --- put (Stream []) = B.putWord8 0 | |
108 | --- put (Stream (x:xs)) = B.putWord8 1 >> B.put x >> B.put (Stream xs) | |
109 | --- get = error "use lazyDecodeStream insted" | |
110 | --- | |
111 | --- | |
112 | --- getMaybe :: B.Binary a => B.Get (Maybe a) | |
113 | --- getMaybe = do | |
114 | --- t <- B.getWord8 | |
115 | --- case t of | |
116 | --- 0 -> return Nothing | |
117 | --- _ -> fmap Just B.get | |
118 | --- | |
119 | --- | |
120 | --- step :: B.Binary a => (ByteString, Int64) -> Maybe (a, (ByteString, Int64)) | |
121 | --- step (xs, offset) = case B.runGetState getMaybe xs offset of | |
122 | --- (Just v, ys, newOffset) -> Just (v, (ys, newOffset)) | |
123 | --- _ -> Nothing | |
124 | --- | |
125 | --- | |
126 | --- lazyDecodeList :: B.Binary a => ByteString -> [a] | |
127 | --- lazyDecodeList xs = unfoldr step (xs, 0) | |
128 | --- | |
129 | --- | |
130 | --- lazyDecodeStream :: B.Binary a => ByteString -> Stream a | |
131 | --- lazyDecodeStream = Stream . lazyDecodeList |
concraft/stack.yaml deleted
1 | -# For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md | |
2 | - | |
3 | -# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) | |
4 | -resolver: lts-1.15 | |
5 | - | |
6 | -# Local packages, usually specified by relative directory name | |
7 | -packages: | |
8 | -- '.' | |
9 | - | |
10 | -# Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) | |
11 | -extra-deps: | |
12 | -- concraft-0.9.4 | |
13 | -- lazy-io-0.1.0 | |
14 | -- sgd-0.3.7 | |
15 | -- tagset-positional-0.3.0 | |
16 | -- crf-chain1-constrained-0.3.2 | |
17 | -- crf-chain2-tiers-0.2.4 | |
18 | -- data-lens-2.10.7 | |
19 | -- monad-codec-0.2.0 | |
20 | -- monad-ox-0.3.0 | |
21 | -- vector-binary-0.1.1 | |
22 | -- temporary-1.1.2.5 | |
23 | - | |
24 | -# Override default flag values for local packages and extra-deps | |
25 | -flags: {} | |
26 | - | |
27 | -# Extra package databases containing global packages | |
28 | -extra-package-dbs: [] | |
29 | - | |
30 | -# Control whether we use the GHC we find on the path | |
31 | -# system-ghc: true | |
32 | - | |
33 | -# Require a specific version of stack, using version ranges | |
34 | -# require-stack-version: -any # Default | |
35 | -# require-stack-version: >= 1.0.0 | |
36 | - | |
37 | -# Override the architecture used by stack, especially useful on Windows | |
38 | -# arch: i386 | |
39 | -# arch: x86_64 | |
40 | - | |
41 | -# Extra directories used by stack for building | |
42 | -# extra-include-dirs: [/path/to/dir] | |
43 | -# extra-lib-dirs: [/path/to/dir] |
concraft/tools/concraft-pl.hs deleted
1 | -{-# LANGUAGE DeriveDataTypeable #-} | |
2 | -{-# LANGUAGE ScopedTypeVariables #-} | |
3 | -{-# LANGUAGE RecordWildCards #-} | |
4 | - | |
5 | - | |
6 | -import Control.Applicative ((<$>)) | |
7 | -import Control.Monad (unless) | |
8 | -import System.Console.CmdArgs | |
9 | -import System.IO (hFlush, stdout) | |
10 | -import qualified Network as N | |
11 | -import qualified Numeric.SGD as SGD | |
12 | -import qualified Data.Text.Lazy as L | |
13 | -import qualified Data.Text.Lazy.IO as L | |
14 | -import Data.Tagset.Positional (parseTagset) | |
15 | -import GHC.Conc (numCapabilities) | |
16 | - | |
17 | -import qualified NLP.Concraft.Morphosyntax.Accuracy as Acc | |
18 | -import qualified NLP.Concraft.Guess as Guess | |
19 | - | |
20 | -import qualified NLP.Concraft.Polish.Maca as Maca | |
21 | -import qualified NLP.Concraft.Polish as C | |
22 | -import qualified NLP.Concraft.Polish.Request as R | |
23 | -import qualified NLP.Concraft.Polish.Server as S | |
24 | -import qualified NLP.Concraft.Polish.Morphosyntax as X | |
25 | -import qualified NLP.Concraft.Polish.Format.Plain as P | |
26 | - | |
27 | -import Paths_concraft_pl (version, getDataFileName) | |
28 | -import Data.Version (showVersion) | |
29 | - | |
30 | - | |
31 | --- | Default port number. | |
32 | -portDefault :: Int | |
33 | -portDefault = 10089 | |
34 | - | |
35 | - | |
36 | ---------------------------------------- | |
37 | --- Command line options | |
38 | ---------------------------------------- | |
39 | - | |
40 | - | |
41 | --- | Data formats. | |
42 | -data Format = Plain deriving (Data, Typeable, Show) | |
43 | - | |
44 | - | |
45 | --- | A description of the Concraft-pl tool | |
46 | -concraftDesc :: String | |
47 | -concraftDesc = "Concraft-pl " ++ showVersion version | |
48 | - | |
49 | - | |
50 | -data Concraft | |
51 | - = Train | |
52 | - { trainPath :: FilePath | |
53 | - , evalPath :: Maybe FilePath | |
54 | - , format :: Format | |
55 | - , tagsetPath :: Maybe FilePath | |
56 | - , noAna :: Bool | |
57 | - -- , discardHidden :: Bool | |
58 | - , iterNum :: Double | |
59 | - , batchSize :: Int | |
60 | - , regVar :: Double | |
61 | - , gain0 :: Double | |
62 | - , tau :: Double | |
63 | - , disk :: Bool | |
64 | - , outModel :: FilePath | |
65 | - , guessNum :: Int | |
66 | - , r0 :: Guess.R0T } | |
67 | - | Tag | |
68 | - { inModel :: FilePath | |
69 | - , noAna :: Bool | |
70 | - , format :: Format | |
71 | - , marginals :: Bool } | |
72 | - -- , guessNum :: Int } | |
73 | - | Server | |
74 | - { inModel :: FilePath | |
75 | - , port :: Int } | |
76 | - | Client | |
77 | - { noAna :: Bool | |
78 | - , format :: Format | |
79 | - , marginals :: Bool | |
80 | - , host :: String | |
81 | - , port :: Int } | |
82 | - | Compare | |
83 | - { tagsetPath :: Maybe FilePath | |
84 | - , refPath :: FilePath | |
85 | - , otherPath :: FilePath | |
86 | - , format :: Format } | |
87 | - | Prune | |
88 | - { inModel :: FilePath | |
89 | - , outModel :: FilePath | |
90 | - , threshold :: Double } | |
91 | --- | ReAna | |
92 | --- { format :: Format } | |
93 | - deriving (Data, Typeable, Show) | |
94 | - | |
95 | - | |
96 | -trainMode :: Concraft | |
97 | -trainMode = Train | |
98 | - { trainPath = def &= argPos 1 &= typ "TRAIN-FILE" | |
99 | - , evalPath = def &= typFile &= help "Evaluation file" | |
100 | - , tagsetPath = def &= typFile &= help "Tagset definition file" | |
101 | - , format = enum [Plain &= help "Plain format"] | |
102 | - , noAna = False &= help "Do not perform reanalysis" | |
103 | - -- , discardHidden = False &= help "Discard hidden features" | |
104 | - , iterNum = 20 &= help "Number of SGD iterations" | |
105 | - , batchSize = 50 &= help "Batch size" | |
106 | - , regVar = 10.0 &= help "Regularization variance" | |
107 | - , gain0 = 1.0 &= help "Initial gain parameter" | |
108 | - , tau = 5.0 &= help "Initial tau parameter" | |
109 | - , disk = False &= help "Store SGD dataset on disk" | |
110 | - , outModel = def &= typFile &= help "Output Model file" | |
111 | - , guessNum = 10 &= help "Number of guessed tags for each unknown word" | |
112 | - , r0 = Guess.OovChosen &= help "R0 construction method" } | |
113 | - | |
114 | - | |
115 | -tagMode :: Concraft | |
116 | -tagMode = Tag | |
117 | - { inModel = def &= argPos 0 &= typ "MODEL-FILE" | |
118 | - , noAna = False &= help "Do not analyse input text" | |
119 | - , format = enum [Plain &= help "Plain format"] | |
120 | - , marginals = False &= help "Tag with marginal probabilities" } | |
121 | - -- , guessNum = 10 &= help "Number of guessed tags for each unknown word" } | |
122 | - | |
123 | - | |
124 | -serverMode :: Concraft | |
125 | -serverMode = Server | |
126 | - { inModel = def &= argPos 0 &= typ "MODEL-FILE" | |
127 | - , port = portDefault &= help "Port number" } | |
128 | - | |
129 | - | |
130 | -clientMode :: Concraft | |
131 | -clientMode = Client | |
132 | - { noAna = False &= help "Do not perform reanalysis" | |
133 | - , port = portDefault &= help "Port number" | |
134 | - , host = "localhost" &= help "Server host name" | |
135 | - , format = enum [Plain &= help "Plain output format"] | |
136 | - , marginals = False &= help "Tag with marginal probabilities" } | |
137 | - | |
138 | - | |
139 | -compareMode :: Concraft | |
140 | -compareMode = Compare | |
141 | - { refPath = def &= argPos 1 &= typ "REFERENCE-FILE" | |
142 | - , otherPath = def &= argPos 2 &= typ "OTHER-FILE" | |
143 | - , tagsetPath = def &= typFile &= help "Tagset definition file" | |
144 | - , format = enum [Plain &= help "Plain format"] } | |
145 | - | |
146 | - | |
147 | -pruneMode :: Concraft | |
148 | -pruneMode = Prune | |
149 | - { inModel = def &= argPos 0 &= typ "INPUT-MODEL" | |
150 | - , outModel = def &= argPos 1 &= typ "OUTPUT-MODEL" | |
151 | - , threshold = 0.05 &= | |
152 | - help "Remove disambiguation features below the threshold" } | |
153 | - | |
154 | - | |
155 | --- reAnaMode :: Concraft | |
156 | --- reAnaMode = ReAna | |
157 | --- { format = enum [Plain &= help "Plain format"] } | |
158 | - | |
159 | - | |
160 | -argModes :: Mode (CmdArgs Concraft) | |
161 | -argModes = cmdArgsMode $ modes | |
162 | - [trainMode, tagMode, serverMode, clientMode, compareMode, pruneMode] | |
163 | - &= summary concraftDesc | |
164 | - &= program "concraft-pl" | |
165 | - | |
166 | - | |
167 | ---------------------------------------- | |
168 | --- Main | |
169 | ---------------------------------------- | |
170 | - | |
171 | - | |
172 | -main :: IO () | |
173 | -main = exec =<< cmdArgsRun argModes | |
174 | - | |
175 | - | |
176 | -exec :: Concraft -> IO () | |
177 | - | |
178 | - | |
179 | -exec Train{..} = do | |
180 | - tagsetPath' <- case tagsetPath of | |
181 | - Nothing -> getDataFileName "config/nkjp-tagset.cfg" | |
182 | - Just x -> return x | |
183 | - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath' | |
184 | - let train0 = parseFileO format trainPath | |
185 | - let eval0 = parseFileO' format evalPath | |
186 | - concraft <- C.train (trainConf tagset) train0 eval0 | |
187 | - unless (null outModel) $ do | |
188 | - putStrLn $ "\nSaving model in " ++ outModel ++ "..." | |
189 | - C.saveModel outModel concraft | |
190 | - where | |
191 | - sgdArgs = SGD.SgdArgs | |
192 | - { SGD.batchSize = batchSize | |
193 | - , SGD.regVar = regVar | |
194 | - , SGD.iterNum = iterNum | |
195 | - , SGD.gain0 = gain0 | |
196 | - , SGD.tau = tau } | |
197 | - trainConf tagset = C.TrainConf | |
198 | - { tagset = tagset | |
199 | - , sgdArgs = sgdArgs | |
200 | - , reana = not noAna | |
201 | - , onDisk = disk | |
202 | - , guessNum = guessNum | |
203 | - , r0 = r0 } | |
204 | - | |
205 | - | |
206 | -exec Tag{..} = do | |
207 | - cft <- C.loadModel inModel | |
208 | - pool <- Maca.newMacaPool numCapabilities | |
209 | - inp <- L.getContents | |
210 | - out <- R.long (R.short pool cft) $ rq $ if noAna | |
211 | - then R.Doc $ parseText format inp | |
212 | - else R.Long inp | |
213 | - L.putStr $ showData showCfg out | |
214 | - where | |
215 | - rq x = R.Request | |
216 | - { R.rqBody = x | |
217 | - , R.rqConf = rqConf } | |
218 | - rqConf = R.Config | |
219 | - { R.tagProbs = marginals } | |
220 | - showCfg = ShowCfg | |
221 | - { formatCfg = format | |
222 | - , showWsCfg = marginals } | |
223 | - | |
224 | - | |
225 | -exec Server{..} = do | |
226 | - putStr "Loading model..." >> hFlush stdout | |
227 | - concraft <- C.loadModel inModel | |
228 | - putStrLn " done" | |
229 | - pool <- Maca.newMacaPool numCapabilities | |
230 | - let portNum = N.PortNumber $ fromIntegral port | |
231 | - putStrLn $ "Listening on port " ++ show port | |
232 | - S.runConcraftServer pool concraft portNum | |
233 | - | |
234 | - | |
235 | -exec Client{..} = do | |
236 | - let portNum = N.PortNumber $ fromIntegral port | |
237 | - inp <- L.getContents | |
238 | - out <- R.long (S.submit host portNum) $ rq $ if noAna | |
239 | - then R.Doc $ parseText format inp | |
240 | - else R.Long inp | |
241 | - L.putStr $ showData showCfg out | |
242 | - where | |
243 | - rq x = R.Request | |
244 | - { R.rqBody = x | |
245 | - , R.rqConf = rqConf } | |
246 | - rqConf = R.Config | |
247 | - { R.tagProbs = marginals } | |
248 | - showCfg = ShowCfg | |
249 | - { formatCfg = format | |
250 | - , showWsCfg = marginals } | |
251 | - | |
252 | - | |
253 | -exec Compare{..} = do | |
254 | - tagsetPath' <- case tagsetPath of | |
255 | - Nothing -> getDataFileName "config/nkjp-tagset.cfg" | |
256 | - Just x -> return x | |
257 | - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath' | |
258 | - let convert = map (X.packSeg tagset) . concat | |
259 | - xs <- convert <$> parseFile format refPath | |
260 | - ys <- convert <$> parseFile format otherPath | |
261 | - let s = Acc.weakLB tagset xs ys | |
262 | - putStrLn $ "Number of segments in reference file: " ++ show (Acc.gold s) | |
263 | - putStrLn $ "Number of correct tags: " ++ show (Acc.good s) | |
264 | - putStrLn $ "Weak accuracy lower bound: " ++ show (Acc.accuracy s) | |
265 | - | |
266 | - | |
267 | -exec Prune{..} = do | |
268 | - cft <- C.loadModel inModel | |
269 | - C.saveModel outModel $ C.prune threshold cft | |
270 | - | |
271 | - | |
272 | --- exec ReAna{..} = do | |
273 | --- inp <- parseText format <$> L.getContents | |
274 | --- out <- showData format <$> | |
275 | - | |
276 | - | |
277 | ---------------------------------------- | |
278 | --- Reading files | |
279 | ---------------------------------------- | |
280 | - | |
281 | - | |
282 | -parseFileO' :: Format -> Maybe FilePath -> IO [X.SentO X.Tag] | |
283 | -parseFileO' format path = case path of | |
284 | - Nothing -> return [] | |
285 | - Just pt -> parseFileO format pt | |
286 | - | |
287 | - | |
288 | -parseFileO :: Format -> FilePath -> IO [X.SentO X.Tag] | |
289 | -parseFileO format path = parseParaO format <$> L.readFile path | |
290 | - | |
291 | - | |
292 | -parseFile :: Format -> FilePath -> IO [X.Sent X.Tag] | |
293 | -parseFile format path = parsePara format <$> L.readFile path | |
294 | - | |
295 | - | |
296 | ---------------------------------------- | |
297 | --- Parsing text | |
298 | ---------------------------------------- | |
299 | - | |
300 | - | |
301 | --- parseTextO :: Format -> L.Text -> [[X.SentO X.Tag]] | |
302 | --- parseTextO format = map (map X.withOrig) . parseText format | |
303 | - | |
304 | - | |
305 | -parseParaO :: Format -> L.Text -> [X.SentO X.Tag] | |
306 | -parseParaO format = map X.withOrig . parsePara format | |
307 | - | |
308 | - | |
309 | ---------------------------------------- | |
310 | --- Parsing (format dependent) | |
311 | ---------------------------------------- | |
312 | - | |
313 | - | |
314 | -parseText :: Format -> L.Text -> [[X.Sent X.Tag]] | |
315 | -parseText Plain = P.parsePlain | |
316 | - | |
317 | - | |
318 | -parsePara :: Format -> L.Text -> [X.Sent X.Tag] | |
319 | -parsePara Plain = P.parsePara | |
320 | - | |
321 | - | |
322 | ---------------------------------------- | |
323 | --- Showing (format dependent) | |
324 | ---------------------------------------- | |
325 | - | |
326 | - | |
327 | -data ShowCfg = ShowCfg { | |
328 | - -- | The format used. | |
329 | - formatCfg :: Format | |
330 | - -- | Show weights? | |
331 | - , showWsCfg :: Bool } | |
332 | - | |
333 | - | |
334 | -showData :: ShowCfg -> [[X.Sent X.Tag]] -> L.Text | |
335 | -showData ShowCfg{..} = P.showPlain (P.ShowCfg {P.showWsCfg = showWsCfg}) |
integration/.gitignore
0 → 100644
1 | +test | |
... | ... |
integration/ENIAMpreIntegration.ml
... | ... | @@ -25,11 +25,48 @@ let mate_parser_enabled = ref false |
25 | 25 | let swigra_enabled = ref false |
26 | 26 | let polfie_enabled = ref false |
27 | 27 | |
28 | +let concraft_model_filename = "../tools/concraft/nkjp-model-0.2.gz" | |
29 | +let concraft_server_pid = ref (-1) | |
28 | 30 | |
29 | -(* | |
30 | -UWAGA: Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń: | |
31 | -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz | |
32 | -*) | |
31 | +let concraft_exists () = | |
32 | + let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl") | |
33 | + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
34 | + let close_check () = Unix.close_process_full (check_in, check_out, check_err) in | |
35 | + try | |
36 | + ignore @@ input_line check_in; | |
37 | + ignore @@ close_check (); | |
38 | + true | |
39 | + with End_of_file -> ignore @@ close_check (); false | |
40 | + | |
41 | +let wait_for_server () = | |
42 | + let rec wait s a = | |
43 | + try Unix.connect s a | |
44 | + with e -> Unix.sleep 1; wait s a in | |
45 | + let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in | |
46 | + let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in | |
47 | + wait s a; | |
48 | + Unix.shutdown s Unix.SHUTDOWN_SEND; | |
49 | + Unix.close s | |
50 | + | |
51 | +let start_server m = | |
52 | + let client_out, server_out = Unix.pipe () in | |
53 | + let client_err, server_err = Unix.pipe () in | |
54 | + let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|] | |
55 | + Unix.stdin server_out server_err in | |
56 | + List.iter Unix.close [client_out; server_out; client_err; server_err]; | |
57 | + wait_for_server (); | |
58 | + pid | |
59 | + | |
60 | +let stop_server pid = | |
61 | + Unix.kill pid Sys.sigint | |
62 | + | |
63 | +let initialize () = | |
64 | + if !concraft_enabled then ( | |
65 | + if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else | |
66 | + if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else | |
67 | + print_endline "Starting Concraft Server"; | |
68 | + concraft_server_pid := start_server concraft_model_filename; | |
69 | + print_endline "Server started") | |
33 | 70 | |
34 | 71 | let read_whole_channel c = |
35 | 72 | let r = ref [] in |
... | ... | @@ -40,6 +77,16 @@ let read_whole_channel c = |
40 | 77 | !r |
41 | 78 | with End_of_file -> List.rev (!r) |
42 | 79 | |
80 | +let concraft_parse s = | |
81 | + let concraft_in, concraft_out, concraft_err = | |
82 | + Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
83 | + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
84 | + let err_msg = String.concat "\n" (read_whole_channel concraft_err) in | |
85 | + let result = read_whole_channel concraft_in in | |
86 | + ignore (Unix.close_process_full (concraft_in, concraft_out, concraft_err)); | |
87 | + if err_msg <> "" then failwith err_msg else | |
88 | + result | |
89 | + | |
43 | 90 | let rec process_concraft_result orth lemma interp others rev = function |
44 | 91 | [] -> List.rev ((orth,(lemma,interp) :: others) :: rev) |
45 | 92 | | "" :: l -> process_concraft_result orth lemma interp others rev l |
... | ... | @@ -52,27 +99,6 @@ let rec process_concraft_result orth lemma interp others rev = function |
52 | 99 | | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l |
53 | 100 | | _ -> failwith ("process_concraft_result: " ^ line)) |
54 | 101 | |
55 | -let concraft_parse s = | |
56 | - let concraft_in, concraft_out, concraft_err = | |
57 | - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
58 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
59 | - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in | |
60 | - let result = read_whole_channel concraft_in in | |
61 | - if err_msg <> "" then failwith err_msg else | |
62 | - process_concraft_result "" "" "" [] [] result | |
63 | - | |
64 | -(*let rec load_concraft_sentence white orth rev ic = | |
65 | - (* print_endline "load_concraft_sentence 1"; *) | |
66 | - (* print_endline ("concraft error message: " ^ input_line concraft_err); *) | |
67 | - let s = input_line ic in | |
68 | - (* print_endline ("load_concraft_sentence: " ^ s); *) | |
69 | - if s = "" then List.rev rev else | |
70 | - match Xstring.split_delim "\t" s with | |
71 | - [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic | |
72 | - | [""; lemma; interp] -> load_concraft_sentence white orth rev ic | |
73 | - | [orth; white] -> load_concraft_sentence white orth rev ic | |
74 | - | _ -> failwith ("load_concraft_sentence: " ^ s)*) | |
75 | - | |
76 | 102 | let make_token (orth,l) = |
77 | 103 | if l = [] then failwith "make_token 1" else |
78 | 104 | let lemma,interp = List.hd l in |
... | ... | @@ -82,10 +108,8 @@ let make_token (orth,l) = |
82 | 108 | {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)} |
83 | 109 | |
84 | 110 | let parse_mate tokens pbeg s = |
85 | - (* print_endline ("parse_mate: " ^ s); *) | |
86 | - (* Printf.fprintf concraft_out "%s\n\n%!" s; | |
87 | - let l = load_concraft_sentence "" "" [] concraft_in in *) | |
88 | - let l = concraft_parse s in | |
111 | + let result = concraft_parse s in | |
112 | + let l = process_concraft_result "" "" "" [] [] result in | |
89 | 113 | let l = Xlist.map l make_token in |
90 | 114 | let l = {empty_token_env with token = Interp "<conll_root>"} :: l in |
91 | 115 | let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in |
... | ... |
integration/TODO
integration/concraft_test.ml deleted
1 | - | |
2 | -(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *) | |
3 | -(*let concraft_in, concraft_out, concraft_err = | |
4 | - Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz" | |
5 | - [|"PATH=" ^ Sys.getenv "PATH"|] | |
6 | - | |
7 | -let _ = | |
8 | - print_endline "out"; | |
9 | - Printf.fprintf concraft_out "Ala ma kota.\n\n%!"; | |
10 | - print_endline "in"; | |
11 | - print_endline ("concraft error message: " ^ input_line concraft_err); | |
12 | - ()*) | |
13 | - | |
14 | -(**********************) | |
15 | - | |
16 | -(* | |
17 | -Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń: | |
18 | -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz | |
19 | -*) | |
20 | - | |
21 | -let read_whole_channel c = | |
22 | - let r = ref [] in | |
23 | - try | |
24 | - while true do | |
25 | - r := (input_line c) :: !r | |
26 | - done; | |
27 | - !r | |
28 | - with End_of_file -> List.rev (!r) | |
29 | - | |
30 | -(* Gdy serwer jest już włączony na concraft_err trafia komunikat: | |
31 | -concraft-pl: bind: resource busy (Address already in use) | |
32 | -w przeciwnym przypadku się program wiesza się na czytaniu concraft_in | |
33 | -*) | |
34 | - | |
35 | -(*let _ = | |
36 | - print_endline "Starting concraft server 1"; | |
37 | - let concraft_in, concraft_out, concraft_err = | |
38 | - Unix.open_process_full "concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz" | |
39 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
40 | - (* let err_msg = String.concat "\n" (read_whole_channel concraft_err) in | |
41 | - let result = read_whole_channel concraft_in in *) | |
42 | - print_endline "Starting concraft server 2"; | |
43 | - print_endline (input_line concraft_err); | |
44 | - print_endline "Starting concraft server 3"; | |
45 | - print_endline (input_line concraft_err); | |
46 | - print_endline "Starting concraft server 3"; | |
47 | - (* print_endline err_msg; | |
48 | - print_endline "Starting concraft server 3"; | |
49 | - print_endline (String.concat "\n" result); | |
50 | - print_endline "Starting concraft server 4"; *) | |
51 | - ()*) | |
52 | - | |
53 | -let rec process_concraft_result orth lemma interp others rev = function | |
54 | - [] -> List.rev ((orth,(lemma,interp) :: others) :: rev) | |
55 | - | "" :: l -> process_concraft_result orth lemma interp others rev l | |
56 | - | line :: l -> | |
57 | - (match Xstring.split_delim "\t" line with | |
58 | - [orth2;s] when s = "none" || s = "space" -> | |
59 | - if orth = "" then process_concraft_result orth2 lemma interp others rev l | |
60 | - else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l | |
61 | - | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l | |
62 | - | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l | |
63 | - | _ -> failwith ("process_concraft_result: " ^ line)) | |
64 | - | |
65 | -let concraft_parse s = | |
66 | - let concraft_in, concraft_out, concraft_err = | |
67 | - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
68 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
69 | - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in | |
70 | - let result = read_whole_channel concraft_in in | |
71 | - if err_msg <> "" then failwith err_msg else | |
72 | - process_concraft_result "" "" "" [] [] result | |
73 | - | |
74 | -let print_parsed_tokens l = | |
75 | - Xlist.iter l (fun (orth,l) -> | |
76 | - if l = [] then failwith "print_parsed_tokens" else | |
77 | - let lemma,interp = List.hd l in | |
78 | - print_endline (orth ^ "\t" ^ lemma ^ "\t" ^ interp)) | |
79 | - | |
80 | -let _ = | |
81 | - print_parsed_tokens (concraft_parse "Ala ma kota."); | |
82 | - print_parsed_tokens (concraft_parse "Szpak frunie."); | |
83 | - print_parsed_tokens (concraft_parse "Miałem miał."); | |
84 | - print_parsed_tokens (concraft_parse "Kiedyś miałem kota."); | |
85 | - print_parsed_tokens (concraft_parse "Kiadyś kupiłem kota."); | |
86 | - () |
integration/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa #eniam-integration.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml |
... | ... | @@ -24,12 +24,9 @@ eniam-integration.cma: $(SOURCES) |
24 | 24 | eniam-integration.cmxa: $(SOURCES) |
25 | 25 | ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^ |
26 | 26 | |
27 | -test: test.ml $(SOURCES) | |
27 | +test: test.ml | |
28 | 28 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml |
29 | 29 | |
30 | -concraft_test: concraft_test.ml | |
31 | - $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml | |
32 | - | |
33 | 30 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
34 | 31 | |
35 | 32 | .mll.ml: |
... | ... |
integration/test.ml
... | ... | @@ -26,60 +26,17 @@ let test_strings = [ |
26 | 26 | "Szpak powiedział: „Frunę. Kiszę.”"; |
27 | 27 | ] |
28 | 28 | |
29 | -let wait_for_server () = | |
30 | - let rec wait s a = | |
31 | - try Unix.connect s a | |
32 | - with _ -> Unix.sleep 1; wait s a in | |
33 | - let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in | |
34 | - let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in | |
35 | - wait s a; | |
36 | - Unix.shutdown s Unix.SHUTDOWN_SEND; | |
37 | - Unix.close s | |
38 | - | |
39 | -let start_server () = | |
40 | - let client_out, server_out = Unix.pipe () in | |
41 | - let client_err, server_err = Unix.pipe () in | |
42 | - let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; "nkjp-model-0.2.gz"|] | |
43 | - Unix.stdin server_out server_err in | |
44 | - List.iter Unix.close [client_out; server_out; client_err; server_err]; | |
45 | - wait_for_server (); | |
46 | - pid | |
47 | - | |
48 | -let stop_server pid = | |
49 | - Unix.kill pid Sys.sigint | |
50 | - | |
51 | -let tag s = | |
52 | - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
53 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] | |
54 | - | |
55 | -let _ = | |
56 | - print_endline "Starting Server"; | |
57 | - let pid = start_server () in | |
58 | - print_endline "Ready"; | |
59 | - let concraft_in, concraft_out, concraft_err = tag "Ala ma kota." in | |
60 | - try | |
61 | - while true do | |
62 | - print_endline @@ input_line concraft_in | |
63 | - done | |
64 | - with End_of_file -> (); | |
65 | - try | |
66 | - while true do | |
67 | - print_endline @@ "concraft error message: " ^ input_line concraft_err | |
68 | - done | |
69 | - with End_of_file -> (); | |
70 | - ignore @@ Unix.close_process_full (concraft_in, concraft_out, concraft_err); | |
71 | - stop_server pid | |
72 | - | |
73 | -(* | |
74 | 29 | let _ = |
30 | + ENIAMpreIntegration.concraft_enabled := true; | |
75 | 31 | ENIAMpreIntegration.mate_parser_enabled := true; |
32 | + ENIAMpreIntegration.initialize (); | |
76 | 33 | print_endline "Testy wbudowane"; |
77 | 34 | Xlist.iter test_strings (fun s -> |
78 | 35 | print_endline ("\nTEST: " ^ s); |
79 | 36 | let text,tokens = ENIAMsubsyntax.parse_text s in |
80 | 37 | let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in |
81 | 38 | (* print_endline (ENIAMtokenizer.xml_of tokens); *) |
82 | - print_endline (ENIAMsubsyntaxStringOf.tokens tokens); | |
39 | + print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens); | |
83 | 40 | print_endline ""; |
84 | 41 | print_endline (ENIAMsubsyntaxStringOf.text "" tokens text)); |
85 | 42 | (* print_endline "Testy użytkownika."; |
... | ... | @@ -92,5 +49,5 @@ let _ = |
92 | 49 | print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; |
93 | 50 | s := read_line () |
94 | 51 | done;*) |
52 | + ENIAMpreIntegration.stop_server !ENIAMpreIntegration.concraft_server_pid; | |
95 | 53 | () |
96 | -*) | |
... | ... |