multiservice-documentation.tex
66.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
\documentclass[runningheads,a4paper]{book}
\usepackage[OT4]{fontenc}
\usepackage{graphicx}
\usepackage[utf8]{inputenc}
\usepackage[polish, english]{babel}
\usepackage{url}
\usepackage{alltt}
\usepackage{xcolor}
\usepackage[top=3cm, bottom=3cm, left=2cm, right=2cm]{geometry}
\usepackage{hyperref}
\hypersetup{
colorlinks,
linkcolor={red!50!black},
citecolor={blue!50!black},
urlcolor={blue!80!black}
}
\usepackage{tikz}
\usepackage{enumitem}
\usepackage{comment}
\definecolor{fill}{RGB}{176,255,241}
\definecolor{font}{RGB}{138,0,0}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw=none,inner sep=1.5pt,fill=fill, text=font, font=\bfseries] (char) {#1};}}
\begin{document}
\mainmatter
\title{Multiservice documentation v. 0.2}
\author{Mateusz Kopeć\\
Institute of Computer Science, Polish Academy of Sciences \\
\url{m.kopec@ipipan.waw.pl}}
\maketitle
\tableofcontents
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{General Information}
\textsc{Multiservice} is a robust linguistic Web service for Polish, combining several mature offline linguistic tools in a common online platform. Packaged TEI P5-based annotation is used as representation and interchange format for the service. In contrast to most frequent approaches, the architecture supports asynchronous handling of requests to enable processing large amounts of text.
Chapters 1--3 of this manual describe information valuable for the Multiservice users, Chapters 4--6 for Multiservice developers.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Usage}
You have two ways to use \textsc{Multiservice}:
\begin{enumerate}
\item use web service maintained by ICS PAS,
\item set up your own installation of the web service (manual in Chapter \ref{ch:installation}).
\end{enumerate}
Whichever you choose, you may communicate with that web service in one of three ways:
\begin{enumerate}
\item via online web client (\textsc{MultiserviceDemo}, see Chapter \ref{ch:multiservice-demo}),
\item via console web client (see Chapter \ref{ch:clients}),
\item via web service API, creating your own client (start with example clients described in Chapter \ref{ch:clients}).
\end{enumerate}
There is an online web client connected to the web service maintained by ICS PAS, available at \url{http://multiservice.nlp.ipipan.waw.pl}. This is probably the simplest way to test \textsc{Multiservice} and see it in action. However, if you have to process texts automatically, you should use one of the console clients or create one of your own. For performance reasons it may also be necessary to set up your own web service back-end locally, instead of using the one maintained by ICS PAS.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Integrated Tools}
Offline versions of all integrated tools have been used by the linguistic community in Poland for several years and they proved their suitability and efficiency for linguistic engineering tasks. They constitute the basic building blocks of many local processing chains, but have never been made available online in a consistent manner (in a common infrastructure and format). Until now all integrated tools are open source and all are actively maintained and developed. Table \ref{tab:tools} shows currently available tools. More detailed description of them is given later in this chapter.
\begin{table}
\centering
\begin{tabular}{l|p{2.5cm}|p{2.5cm}|p{2.7cm}}
Service name & Required layers & Provided layers & Available options \\ \hline\hline
\textsc{Pantera} & - & segmentation, morphosyntax & useGuesser \\\hline
\textsc{Concraft} & - & segmentation, morphosyntax & - \\\hline
\textsc{WCRFT} & - & segmentation, morphosyntax & - \\\hline
\textsc{WMBT} & - & segmentation, morphosyntax & - \\\hline
\textsc{PoliTa} & - & segmentation, morphosyntax & - \\\hline
\textsc{Spejd} & segmentation, morphosyntax & words, groups & - \\\hline
\textsc{Sentipejd} & segmentation, morphosyntax & words & - \\\hline
\textsc{OpenTextSummarizer} & - & summary & ratio \\\hline
\textsc{ŚwietlickaSummarizer} & morphosyntax & summary & minPercent, maxPercent\\\hline
\textsc{LakonSummarizer} & morphosyntax & summary & ratio, length \\\hline
\textsc{DependencyParser} & morphosyntax & dependency parse & - \\\hline
\textsc{Ruler} & segmentation, morphosyntax, mentions & coreference & - \\\hline
\textsc{Bartek} & segmentation, morphosyntax, mentions & coreference & - \\\hline
\textsc{Nerf} & segmentation, morphosyntax & names & - \\\hline
\textsc{MentionDetector} & segmentation, morphosyntax & mentions & - \\
\end{tabular}
\caption{Available tools}
\label{tab:tools}
\end{table}
Most tools are associated with their configurations/models. These are available to download at \url{http://zil.ipipan.waw.pl/Multiservice}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Chain construction rules}
\label{sec:chains}
Tools presented in Table \ref{tab:tools} may be combined in a processing chain. Processing chain is a list of NLP tools, which are applied in sequence to input data, augmenting it with additional information, stored in separate `layers'.
An example of such chain would be to apply part of speech tagger \textsc{Pantera} to raw text data and then apply \textsc{DependencyParser}. As seen in the Table \ref{tab:tools}, \textsc{Pantera} doesn't have any required layers, which means it may be applied to raw text. \textsc{Pantera} adds segmentation and morphosyntax layers (storing segmentation of the text to tokens and their morphosyntactic interpretations, respectively) to the text. This allows \textsc{DependencyParser} to be run, as it requires the morphosyntactic layer to be present in its input (parser won't work without any tool providing this layer run beforehand). The output of \textsc{DependencyParser} is dependency parse layer, which is added to the original input text.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Part of speech taggers}
Several part of speech taggers are available in \textsc{Multiservice}. They all rely on common morphosyntactic analyzer for Polish, namely \textsc{Morfeusz}\footnote{\url{http://sgjp.pl/morfeusz/}}. It provides positional tags starting with part of speech information followed by values of morphosyntactic categories corresponding to the given part of speech. Currently we use version 1.0 of \textsc{Morfeusz}, using linguistic data coming from \textsc{PoliMorf}\footnote{\url{http://zil.ipipan.waw.pl/PoliMorf}} dictionary.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Pantera}
\textsc{Pantera} is a morphosyntactic rule-based Brill tagger of Polish. It uses an optimised version of Brill’s algorithm adapted for specifics of inflectional languages. The tagging is performed in two steps, with a smaller set of morphosyntactic categories disambiguated in the first run (part of speech, case, person) and the remaining ones in the second run. Due to free word order nature of Polish the original set of rule templates as proposed by Brill has been extended to cover larger contexts. \textsc{Pantera} is freely available at \url{http://code.google.com/p/pantera-tagger/}, you may also see \url{http://zil.ipipan.waw.pl/PANTERA}.
Current \textsc{Pantera} model was trained on version 1.2 of 1-million word subcorpus of the National Corpus of Polish, available at \url{http://clip.ipipan.waw.pl/NationalCorpusOfPolish}. Training was done using the corpus reanalyzed\footnote{according to the procedure described at \url{http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki/Training}} with Polimorf morphological dictionary, with \texttt{threshold=6}.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Concraft}
\textsc{Concraft} is a morphosyntactic tagger for the Polish language. The tool combines the following components into a pipeline:
\begin{itemize}
\item a morphosyntactic segmentation and analysis tool \textsc{Maca}\footnote{\url{http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki}},
\item a morphosyntactic disambiguation library \textsc{Concraft}\footnote{\url{https://github.com/kawu/concraft}}.
\end{itemize}
As for now, the tagger doesn't provide any lemmatisation capabilities. As a result, it may output multiple interpretations (all related to the same morphosyntactic tag, but with different lemmas) for some known words, while for the out-of-vocabulary words it just outputs orthographic forms as lemmas. More information at \textsc{Concraft} webpage at \url{http://zil.ipipan.waw.pl/Concraft}.
Current \textsc{Concraft} model was trained on version 1.2 of 1-million word subcorpus of the National Corpus of Polish, available at \url{http://clip.ipipan.waw.pl/NationalCorpusOfPolish}. Training was done using the corpus reanalyzed\footnote{according to the procedure described at \url{http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki/Training}} with Polimorf morphological dictionary.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{WCRFT}
\textsc{WCRFT} (Wrocław CRF Tagger) is a simple morpho-syntactic tagger for Polish producing state-of-the-art results. The tagger combines tiered tagging, conditional random fields (CRF) and features tailored for inflective languages written in WCCL. The algorithm and code are inspired by Wrocław Memory-Based Tagger. \textsc{WCRFT} uses CRF++ API as the underlying CRF implementation. Tiered tagging is assumed. Grammatical class is disambiguated first, then subsequent attributes (as defined in a config file) are taken care of. Each attribute is treated with a separate CRF and may be supplied a different set of feature templates. For details of the underlying algorithms, as well as tagger evaluation, please refer to the tagger website at \url{http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki/WCRFT1}.
Current \textsc{WCRFT} model was trained on version 1.2 of 1-million word subcorpus of the National Corpus of Polish, available at \url{http://clip.ipipan.waw.pl/NationalCorpusOfPolish}. Training was done using the corpus reanalyzed\footnote{according to the procedure described at \url{http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki/Training}} with Polimorf morphological dictionary.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{WMBT}
\textsc{WMBT} (Wrocław Memory-Based Tagger) is a simple morpho-syntactic tagger for Polish producing state-of-the-art results. \textsc{WMBT} uses TiMBL API as the underlying Memory-Based Learning implementation. The features for classification are generated by using WCCL. \textsc{WMBT} uses a tiered tagging approach. Grammatical class is disambiguated first, then subsequent attributes (as defined in a config file) are taken care of. Each attribute may be supplied a different set of features. More information available at tagger webpage: \url{http://nlp.pwr.wroc.pl/redmine/projects/wmbt/wiki}.
Current \textsc{WMBT} model was trained on version 1.2 of 1-million word subcorpus of the National Corpus of Polish, available at \url{http://clip.ipipan.waw.pl/NationalCorpusOfPolish}. Training was done using the corpus reanalyzed\footnote{according to the procedure described at \url{http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki/Training}} with Polimorf morphological dictionary.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{PoliTa}
\textsc{PoliTa} is a meta-tagger, which uses many individual POS taggers to combine their decisions and produce a potentially better output than any of the individual methods by themselves. Its current implementation uses \textsc{Multiservice} itself to run several other taggers and then combine their results into single version. More information available at \url{http://zil.ipipan.waw.pl/PoliTa}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Parsers/shallow parsers}
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Spejd}
\textsc{Spejd} is an engine for shallow parsing using cascade grammars, able to co-operate with TaKIPI for tokenization, segmentation, lemmatization and morphologic analysis. Parsing rules are defined using cascade regular grammars which match against orthographic forms or morphological interpretations of particular words. Spejd’s specification language is used, which supports a variety of actions to perform on the matching fragments: accepting and rejecting morphological interpretations, agreement of entire tags or particular grammatical categories, grouping (syntactic and semantic head may be specified independently). Users may provide custom rules or may use one of the provided sample rule sets. More information available at \url{http://zil.ipipan.waw.pl/Spejd/}.
Current \textsc{Spejd} configuration uses grammar of Polish (version 1.0), developed by K. Głowińska within \textsc{NKJP}, available at \url{http://clip.ipipan.waw.pl/LRT?action=AttachFile&do=view&target=gramatyka_Spejd_NKJP_1.0.zip}.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Sentipejd}
\textsc{Sentipejd} is a \textsc{Spejd} grammar detecting sentiment carrying expressions in text. It requires a slightly modified \textsc{Spejd} version, currently not publicly available. \textsc{Sentipejd} consists of:
\begin{itemize}
\item shallow sentiment rules as described in an article: \textit{Shallow parsing in sentiment analysis of product reviews} by Aleksander Buczyński and Aleksander Wawer.
\item sentiment dictionary available from Polish sentiment dictionary, available at \url{http://zil.ipipan.waw.pl/SlownikWydzwieku}.
\end{itemize}
The \textsc{Sentipejd} webpage is available here: \url{http://zil.ipipan.waw.pl/Sentipejd}.
Current \textsc{Sentipejd} version uses grammar available at \url{http://zil.ipipan.waw.pl/Multiservice}.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{DependencyParser}
\textsc{DependencyParser} is trained on the Polish Dependency Bank (PDB, Pol. Składnica zależnościowa\footnote{\url{http://zil.ipipan.waw.pl/Sk\%C5\%82adnica}}) with the publicly available parsing system -- \textsc{MaltParser}\footnote{\url{http://maltparser.org/}}. \textsc{MaltParser} is a transition-based dependency parser that uses a deterministic parsing algorithm. The deterministic parsing algorithm builds a dependency structure of an input sentence based on transitions (shift-reduce actions) predicted by a classifier. The classifier learns to predict the next transition given training data and the parse history. Information about parser performance and Polish dependency relation types is available at \url{http://zil.ipipan.waw.pl/PolishDependencyParser}.
Current model of \textsc{DependencyParser} was taken from its webpage.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Summarizers}
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{OpenTextSummarizer}
\textsc{OpenTextSummarizer} is an open source tool for summarising texts. The program reads a text and decides which sentences are important and which are not. It ships with Ubuntu, Fedora and other Linux distributions. OTS supports many (25+) languages, which are configured in XML files. Several academic publications have benchmarked and praised it. More information is available at its webpage \url{http://libots.sourceforge.net/}.
Current configuration of \textsc{OpenTextSummarizer} is simply selecting \texttt{pl} dictionary. We use default version from Ubuntu repositories.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{ŚwietlickaSummarizer}
\textsc{ŚwietlickaSummarizer} is a tool for creating short text summaries. It utilises a text extraction method, i.e.\ the output consists of sentences from the original text. The tool uses a number of machine learning algorithms, including neural networks, linear regression, Bayesian networks and decision trees. The output sentences are chosen based on different signals, such as the length of the sentence, its position in the text structure and the properties of the words it contains. The system was trained specifically for newspaper articles in Polish. It is possible, however, to adjust it for other kinds of documents and languages. More information available at: \url{http://clip.ipipan.waw.pl/Summarizer}.
Current configuration of \textsc{ŚwietlickaSummarizer} is an ensemble of classifiers, which was found to perform best by the tool's author in her master's thesis.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{LakonSummarizer}
\textsc{Lakon} is part of Adam Dudczak's master thesis, aimed at the evaluation of newspaper article summarization techniques based on sentence selection. More information about it at its webpage: \url{http://www.cs.put.poznan.pl/dweiss/research/lakon/}.
Current configuration of \textsc{Lakon} uses location-based features for sentence selections, as it proved to be most effective choice in evaluation conducted by the tool's author.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Coreference resolvers}
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Ruler}
\textsc{Ruler} facilitates the automatic clustering of mentions into coreferent clusters using a simple deterministic rule-based method. You may find more about it at \url{http://zil.ipipan.waw.pl/Ruler}.
Its current configuration in \textsc{Multiservice} uses version 1.2 of \textsc{Ruler}.
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Bartek}
\textsc{Bartek} is a statistical coreference resolver, it uses a machine learnt method. Bartek comes with default models trained on the full Polish Coreference Corpus and contains compiled resources extracted
from Polish Wikipedia and plWordnet. You may read more about it at \url{http://zil.ipipan.waw.pl/Bartek}.
\textsc{Bartek} in \textsc{Multiservice} is in version 1.1. It uses default models attached to that version.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Named entity recognizers}
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{Nerf}
\textsc{Nerf} is a statistical named entity recognition tool based on linear-chain conditional random fields. It recognises embedded structures of named entities consistent with the type hierarchy used in the National Corpus of Polish. You may read more about it at: \url{http://zil.ipipan.waw.pl/Nerf}.
In \textsc{Multiservice} we use the Haskell implementation, with the model trained on version 1.1 of 1-million word subcorpus of the National Corpus of Polish.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Other}
% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\subsubsection{MentionDetector}
\textsc{MentionDetector} is a tool for performing mention detection for coreference resolution. It uses the output of a tagger, a shallow parser and a named entity recogniser to collect and filter mentions. The named entity recogniser and the shallow parser are only optional prerequisites, but they help provide better final results. Additionally, \textsc{MentionDetector} detects zero subject mentions by itself.
You may read more about \textsc{MentionDetector} at \url{http://zil.ipipan.waw.pl/MentionDetector}.
Its current configuration in \textsc{Multiservice} uses version 1.2, with default zero subject detection model trained using the full Polish Coreference Corpus, version 0.92.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Multiservice Demo}
\label{ch:multiservice-demo}
\textsc{Multiservice Demo} is an example web-based human interface for \textsc{Multiservice}. It may be installed on any web server to provide an interface for any \textsc{Multiservice} back-end. There is an instance of it available at \url{http://multiservice.nlp.ipipan.waw.pl/}, connected to the back-end hosted by ICS PAS.
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{img/multi_start.png}
\caption{\textsc{Multiservice Demo} starting page}
\label{fig:multi_start}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{User manual}
The home screen of the \textsc{Multiservice Demo} is presented in Figure \ref{fig:multi_start}. In fact it is the only screen of this application, but it has several tabs which give different views. First of all, the language of the interface may be changed by clicking the chosen flag \circled{1}. Then, as this application is designed to demonstrate NLP tools, we need to specify which tools we want to run and on which input; therefore we start in the \texttt{Create request} tab \circled{2}.
The input may be raw text, which we are supposed to enter at the bottom of the page, in the \texttt{Input text} lower tab \circled{6}, replacing the sample text starting with ``To będzie...''. Another option is to use the second lower tab named \texttt{Input URL} and paste the URL of the page we want to summarize.
The only thing left to do is to specify the NLP tools to use. We have to obey the rules presented in Section \ref{sec:chains} during the chain construction. If you want to use one of the predefined chains, choose an option via the \texttt{Select predefined chain of actions} dropdown \circled{4}. If you do so, the example chain will show up in the table on the right side of the dropdown \circled{7}. The tools in the chain are listed from the top, i.e.\ the first tool in the chain is at the top of the list. If a tool has options to specify, you may do so in the \texttt{Options} column. You may also remove the last element of the chain by clicking the \texttt{X} symbol next to it in the right part of the chain table. New tools may be added via the dropdown menu named \texttt{Add new action at the end of the chain} \circled{3}, which obeys the chain construction rules, i.e.\ it shows only tools for which the requirements are already met in the existing chain.
If you want to construct a chain from scratch, simply start with the \texttt{Add new action at the end of the chain} \circled{3} dropdown and add each tool in the correct sequence to the chain. If you do not know the prerequisites of an NLP tool, you may take a look at the \texttt{Help} tab \circled{2}, which contains information similar to Table \ref{tab:tools}.
Finally, we may press the \texttt{Run} button \circled{5} and wait (usually up to a few seconds) to get the processing results. In the meantime a popup will appear, showing the current request status. If the request succeeds, you will automatically be taken to the \texttt{Result of last request} tab, which is shown in Figure \ref{fig:multi_result}.
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{img/multi_result.png}
\caption{\textsc{Multiservice Demo} results page}
\label{fig:multi_result}
\end{figure}
This tab visualises the outputs of the selected NLP tools applied to the given input. A set of tabs \circled{1}--\circled{9} allows switching between various output layers. The number of layers present may vary depending on the chain of tools selected. One tab always present is the \texttt{JSON} tab, which shows the raw output of the \textsc{Multiservice} back-end, i.e.\ all output data in \textsc{JSON} format, which is visualized in a more accessible way in the other tabs.
\begin{enumerate}[label=\protect\circled{\arabic*}]
\setcounter{enumi}{1}
\item \texttt{Segmentation} layer presents segmentation of input text into sentences (\texttt{sent} annotation) and words (\texttt{seg} annotation).
\item \texttt{Morphosyntax} layer shows above each word its selected part of speech. Additionally, when we hover over a word, we can see all morphosyntactic interpretations, with the one selected by the tagger shown in bold.
\item \texttt{Words} layer shows so-called syntactic words, which connect words from segmentation into larger units. A broader part of speech tag is annotated above each unit.
\item \texttt{Groups} layer visualises syntactic groups, produced by a shallow parser. Group types are shown above groups.
\item \texttt{Named entities} layer shows named entities, with their types annotated above.
\item \texttt{Summary} layer presents final summary of the input as raw text.
\item \texttt{Dependency parse} layer presents dependency parse relations, together with relation types. Only one sentence is visible. To see a parse of selected sentence, it has to be clicked in the list below visualization.
\item \texttt{Mentions} layer shows mentions, which are input to coreference resolver.
\item \texttt{Coreference} layer presents coreference groups. To see which mentions a given mention is grouped with, we have to hover over its annotation. The whole group will then be highlighted.
\end{enumerate}
Any bugs or problems with this demo can be reported via the \texttt{Report a bug} tab \circled{2}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Implementation details}
\textsc{Multiservice Demo} is a \textsc{Django}\footnote{\url{https://www.djangoproject.com/}} application. \textsc{Django} is a high-level \textsc{Python} framework. It may be run on a web server using \textsc{FCGI} or \textsc{WSGI} interfaces. User interface is created with \textsc{jQuery UI}\footnote{\url{https://jqueryui.com/}}. Visualization of linguistic data is done with help of \textsc{brat}\footnote{\url{http://brat.nlplab.org/}} visualization tool, with some customizations. \textsc{Multiservice Demo} uses \textsc{SQLlite}\footnote{\url{https://www.sqlite.org/}} database for internal storage.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Clients}
\label{ch:clients}
There are currently three sample client applications for \textsc{Multiservice}, available to download at \url{http://git.nlp.ipipan.waw.pl/multiservice/clients}. They should be used as a starting point for your own implementation of \textsc{Multiservice} client. Two of them are written in Python and one in Java. They are presented in detail in the following sections.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Python Thrift client}
\label{sec:thrift-python-client}
\textsc{Thrift} is a software framework for interchanging data between software written in different programming languages. It is used internally by \textsc{Multiservice} to interchange data, therefore it is fully supported as either input or output format.
Python Thrift client is a single file: \texttt{thrift\_client.py}. It consumes text to process from its standard input, constructs a Thrift object with that text and sends it to \textsc{Multiservice} to be processed with a chain of tools specified as the program arguments. After processing, text comes back as a set of Thrift objects containing all the automatic annotation. They are converted to JSON format and printed to standard output.
If you want to have maximum capabilities, this client is the best way to start (or writing a Thrift client in another programming language), because you may send partially annotated data to \textsc{Multiservice} and receive output as objects in memory, which may be easily worked with. See the source code of the client to learn how the Thrift request is created. Thrift data structures are defined in the main repository at \url{http://git.nlp.ipipan.waw.pl/multiservice/multiservice/tree/master/core/thrift}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Installation}
This client requires Python 2.7 interpreter installed, as well as the following Python libraries:
\begin{itemize}
\item jsonpickle
\item thrift
\item multiservice-0.1
\end{itemize}
The first two are available in public Python repositories, the last one has its sources published in the \textsc{Multiservice} back-end repository: \url{http://git.nlp.ipipan.waw.pl/multiservice/multiservice/tree/master/core/PyUtils}. For convenience, its compiled version is also available next to the \texttt{thrift\_client.py} file in the \textsc{Multiservice} clients repository.
For example, in Ubuntu you may install all requirements using the following commands (we assume you downloaded \texttt{multiservice-0.1-py2.7.egg} file alongside \texttt{thrift\_client.py}):
\begin{verbatim}
sudo apt-get install python-setuptools
sudo easy_install jsonpickle
sudo easy_install thrift
sudo easy_install multiservice-0.1-py2.7.egg
\end{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Usage}
The client uses standard input, so we may for example use it like:
\begin{verbatim}
echo "Ala ma kota." | python thrift_client.py Concraft Nerf Spejd
\end{verbatim}
or to process text from file named \texttt{input.txt}:
\begin{verbatim}
python thrift_client.py Concraft Nerf Spejd < input.txt
\end{verbatim}
Space-separated arguments after \texttt{thrift\_client.py} specify the target processing chain. Optionally, you may specify the \textsc{Multiservice} back-end host and port, using \texttt{--host} and \texttt{--port} parameters. By default, requests are sent to ICS PAS back end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Python SOAP client}
\label{sec:soap-python-client}
The second type of Python client uses the SOAP web service definition protocol, with the Web Services Description Language (WSDL) file present at \url{http://ws.multiservice.nlp.ipipan.waw.pl/WebService-1.0-SNAPSHOT/ClarinWS?wsdl}. There are libraries in many programming languages which, using such a file, generate the code of client applications.
Python SOAP client is a single file: \texttt{soap\_client.py}. It consumes text to process from its standard input, constructs a SOAP request with that text and sends it to \textsc{Multiservice} to be processed with a chain of tools specified as the program arguments. After processing, text comes back as an XML response containing all the automatic annotation. The output comes in the Packaged TEI format (see Section \ref{sec:pac_tei}). Unfortunately, the current implementation only produces morphosyntax and segmentation layers; the other ones will not be returned.
Theoretically, using the SOAP protocol it should be possible to send partially annotated data in Packaged TEI format to \textsc{Multiservice} and process it further; however, given that the only layers possible to output are morphosyntax and segmentation, it does not make much sense.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Installation}
This client requires Python 2.7 interpreter installed, as well as the following Python libraries:
\begin{itemize}
\item suds\footnote{\url{https://fedorahosted.org/suds/}}
\end{itemize}
It is available in public Python repositories. For example, in Ubuntu you may install all requirements using the following commands:
\begin{verbatim}
sudo apt-get install python-setuptools
sudo easy_install suds
\end{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Usage}
The client uses standard input, so we may for example use it like:
\begin{verbatim}
echo "Ala ma kota." | python soap_client.py Concraft Nerf Spejd
\end{verbatim}
or to process text from file named \texttt{input.txt}:
\begin{verbatim}
python soap_client.py Concraft Nerf Spejd < input.txt
\end{verbatim}
Space-separated arguments after \texttt{soap\_client.py} specify the target processing chain. Optionally, you may specify the \textsc{Multiservice} back-end host and port, using \texttt{--host} and \texttt{--port} parameters. By default, requests are sent to ICS PAS back end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Java SOAP client}
\label{sec:soap-java-client}
Java SOAP client uses the SOAP web service definition protocol, with the Web Services Description Language (WSDL) file present at \url{http://ws.multiservice.nlp.ipipan.waw.pl/WebService-1.0-SNAPSHOT/ClarinWS?wsdl}. There are libraries in many programming languages which, using such a file, generate the code of client applications. Such a procedure was used to generate the code of this client application, using the \textsc{Apache Axis2} library.
Similarly to the Python SOAP client, the Java SOAP client consumes text to process from its standard input, constructs a SOAP request with that text and sends it to \textsc{Multiservice} to be processed with a chain of tools specified as the program arguments. After processing, text comes back as an XML response containing all the automatic annotation. The output comes in the Packaged TEI format (see Section \ref{sec:pac_tei}). Unfortunately, the current implementation only produces morphosyntax and segmentation layers; the other ones will not be returned.
Theoretically, using the SOAP protocol it should be possible to send partially annotated data in Packaged TEI format to \textsc{Multiservice} and process it further; however, given that the only layers possible to output are morphosyntax and segmentation, it does not make much sense.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Installation}
This client requires Maven and Java Development Kit in version at least 1.7 installed (for compilation, for usage of compiled \texttt{.jar} file, Java Runtime Environment is sufficient). To compile the \texttt{.jar}, execute the command:
\begin{verbatim}
mvn clean install
\end{verbatim}
from the \texttt{java} directory, where \texttt{pom.xml} file is present. The \texttt{.jar} is in the \texttt{target} folder and is named \texttt{client-1.0-SNAPSHOT.one-jar.jar}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Usage}
The client uses standard input, so we may for example use it like:
\begin{verbatim}
echo "Ala ma kota." | java -jar client-1.0-SNAPSHOT.one-jar.jar Concraft Nerf Spejd
\end{verbatim}
or to process text from file named \texttt{input.txt}:
\begin{verbatim}
java -jar client-1.0-SNAPSHOT.one-jar.jar Concraft Nerf Spejd < input.txt
\end{verbatim}
Space-separated arguments after \texttt{client-1.0-SNAPSHOT.one-jar.jar} specify the target processing chain. Requests are sent to ICS PAS back end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Packaged TEI format}
\label{sec:pac_tei}
Packaged TEI P5-based Linguistic Representation is mostly compatible with the \textsc{NKJP} representation described here: \url{http://nlp.ipipan.waw.pl/TEI4NKJP/}. The main difference is that all the files (headers, \texttt{text.xml}, \texttt{segmentation.xml}, \texttt{ann\_morphosyntax.xml} etc.) are ``packed'' into one file. That is, instead of multiple files following this pattern:
\begin{verbatim}
<teiCorpus>
<teiHeader>
... corpus `global` header ...
</teiHeader>
<TEI>
<teiHeader>
... local header ...
</teiHeader>
<text>
... some annotation layer data (ex. segmentation)...
</text>
</TEI>
</teiCorpus>
\end{verbatim}
you have one file like that:
\begin{verbatim}
<teiCorpus>
<teiHeader>
... Multiservice `global` header (could be the same as in NKJP_header.xml) ...
</teiHeader>
<TEI>
<teiHeader>
... header specific for single annotation layer (could be the same as in header.xml) ...
</teiHeader>
<text>
... some annotation layer data (ex. segmentation - the same as ann_segmentation.xml) ...
</text>
</TEI>
<TEI>
... some other annotation layer ...
</TEI>
</teiCorpus>
\end{verbatim}
The easiest way to parse Packaged TEI P5-based Linguistic Representation is to use the \textsc{TeiAPI} Java library. More information about \textsc{TeiAPI} (including Maven artifacts) is available at \url{http://zil.ipipan.waw.pl/TeiAPI}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Architecture}
To be able to process large amounts of text, requests sent to the Web service are handled in an asynchronous manner. Invoking one of the available methods results in returning the request token (identifier) which can be used to check the request status and retrieve the result when processing completes. This design was directly inspired by the ``TaKIPI Web Service'': \url{http://nlp.pwr.wroc.pl/clarin/ws/takipi/} prepared by ICS PAS and WrocUT.
This chapter is under development.
\begin{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Available operations}
Following basic request manipulating methods are available:
\begin{itemize}
\item analyzeChain(text, requestParts, inputFormat, outputFormat) – enqueue a request to perform given list of operations (requestParts) on the given text. Each chain part consists of operation type (that is linguistic function such as tagging or shallow parsing), requested service (internal tool) name (e.g. Takipi, Pantera, etc.) and a map of properties specific to the provided tool. In this way one may request performing several operations at once (e.g. „tag text with Pantera, then perform shallow parsing with Spejd, possibly using tagging information provided by Pantera”). Returns a unique token of the request.
\item getStatus(token) – returns a status of request with the given token (currently may be one of: PENDING, IN PROGRESS, DONE or FAILED).
\item getResult(token) – if the status of request is DONE, returns request result. If the request is FAILED returns an error message.
\end{itemize}
There are more methods defined by the Web Service and they are covered in detail here[tu był link do usage]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The Web Service}
The Web Service is the only part directly seen by the client. Whenever the analyzeChain function is invoked it stores request data in the database, queries for a daemon that is able to process the whole chain of operations (daemons that are not currently working are preferred) and sends it the request identifier.
getStatus and getResult functions simply query the database for information about the request.
The service communicates with SOAP messages and is implemented in Java using JAX-WS technology.
The source code is in `core/WebService` subproject.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Daemons}
Each daemon is a Java application listening on specified address and port, retrieving request identifiers to process (sent by Web Service) from the server socket.
A daemon may integrate multiple internal tools and invoke them to process request chains. When finished, it saves the result in the
database. Obviously multiple daemons with different configurations may be run.
The source code is in `Daemon` subproject.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Database Backend}
The database is responsible for exchanging data between Web Service and daemons. The most important database tables are:
\begin{itemize}
\item ServiceInfo – contains information about internal tools (services) and operation names it provides
\item DaemonInfo – daemons with their network locations (for communication with Web Service) and information about services they provide,
\item Request – contains text, status, result and list of request parts,
\item RequestPart – a linguistic function (e.g. shallow parsing) and service (tool) name (e.g. “Spejd”) to perform it.
\end{itemize}
Currently an underlying DBMS is PostreSQL and it is accessed by daemons and the Multiservice with Java Persistence API 2.0 (using EclipseLink library).
The source code for JPA classes mapping above-mentioned database tables is in the `Commons` subproject.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Cykl życia zlecenia}
Są dwa rodzaje zleceń:
\begin{itemize}
\item StringRequest - zarówno na wejściu, jak i na wyjściu jest napis. W polach inputSettings i outputSettings są ustawiane konwertery formatu wejściowego oraz wyjściowego (np. dla XML-a).
\item ObjectRequest - zarówno na wejściu, jak i na wyjściu jest od razu obiekt thriftowy TText
\end{itemize}
Zadania dla Klienta (patrz core/thrift/multiservice.thrift):
\begin{enumerate}
\item 1. Stworzyć obiekt typu StringRequest lub ObjectRequest
\item 2. Wykonać metodę putStringRequest lub putObjectRequest i otrzymać token
\item 3. Wywoływać co jakiś czas getRequestStatus dopóki wynik inny niż FAILED lub DONE. Inne możliwe statusy to PENDING (jeszcze nie zaczęto przetwarzania) oraz IN\_PROGRESS.
\item 4. Jeśli status == DONE, wywołać metodę getResultString lub getResultObject, jeśli status == FAILED, wywołać metodę getException, by zobaczyć co się stało
\item 5. Po pobraniu wyniku zlecenie jest kasowane i otrzymuje status DUMPED.
\end{enumerate}
Zadania dla RequestManagera (serwera) (patrz core/thrift/multiservice.thrift):
\begin{enumerate}
\item 1. (tylko dla putStringRequest) Wywołać jakiś InputService (patrz core/thrift/subservices.thrift) zdefiniowany w request.inputSettings i uzyskać za jego pomocą obiekt typu TText.
\item 2. (gdy otrzymaliśmy putStringRequest jak i putObjectRequest). Dla kolejnych elementów listy request.processingChain znajdujemy odpowiedni AnnotatingService i go wywołujemy. AnnotatingService uzupełnia zawartość obiektu TText o wyniki działania jakiegoś podserwisu (np. uzupełni listę obiektów TNamedEntity jest jest to rozpoznawacz nazw własnych).
\item 3. (tylko dla putStringRequest) Wywołać jakiś OutputService (patrz core/thrift/subservices.thrift) zdefiniowany w request.outputSettings i uzyskać za jego pomocą obiekt typu String (np. XML-a).
\end{enumerate}
Interaction with the web service, regardless of communication method choice, consists of the following steps:
\begin{enumerate}
\item The user sends to the service a processing request, specifying input text and a chain of chosen NLP tools, altogether with optional parameters for them.
\item The web service generates a token for the request (further used to operate on the given request with the service), stores the request in the queue and returns the token to the user.
\item The user keeps querying the service about the status of execution of a request identified with a given token until the status shows that the execution stopped because of error or ended successfully.
\item If the execution was successful, the user retrieves the result from the service, otherwise receives the error message instead.
\end{enumerate}
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Code organisation}
\textsc{Multiservice} code is distributed in two git repositories:
\begin{itemize}
\item Main (back-end, subservices, web service, demo web client): \url{http://git.nlp.ipipan.waw.pl/multiservice/multiservice}
\item Command line clients: \url{http://git.nlp.ipipan.waw.pl/multiservice/clients}
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Main repository contents}
\begin{itemize}
\item \texttt{config} -- contains sample \textsc{Multiservice} configuration.
\begin{itemize}
\item \texttt{config.xml} -- file describing locations of available subservices.
\end{itemize}
\item \texttt{core} -- main components of \textsc{Multiservice}.
\begin{itemize}
\item \texttt{RequestManager} - najgłówniejszy serwis przyjmujący wszystkie zlecenia. Odbiera zlecenia ze świata (np. od Dema) i zleca je podserwisom. Jest napisany w Javie.
\item \texttt{thrift} - źródła thriftowe (do generowania kodu pośredniczącego). Zawiera thriftowe definicje obiektów do wymiany danych pomiędzy podserwisami i RequestManagerem. Więcej patrz tutaj: http://diwakergupta.github.io/thrift-missing-guide/
\item \texttt{MultiserviceDemo} - interfejs przeglądarkowy napisany w Django, wywołuje RequestManagera i ładnie pokazuje wyniki.
\item \texttt{WebService} - webserwis napisany w technologii JAX-RS, umożliwia odpalanie serwisu jako webserwisu typu SOAP.
\item \texttt{CppUtils} - thriftowy interfejs skonwertowany C++ (przez thrift/generate-all.sh) + różne wspomagacze (np. AnnotatingServer, który ułatwia tworzenie własnego serwera do anotacji)
\item \texttt{JavaUtils} - jw. ale w Javie
\item \texttt{PyUtils} - jw. ale w Pythonie (2.7)
\end{itemize}
\item \texttt{doc} -- documentation (this file).
\item \texttt{scripts} -- scripts for installing and running \textsc{Multiservice}.
\item \texttt{subservices} -- annotating subservices (implementing interfaces from \texttt{core/thrift/subservices.thrift}).
\begin{itemize}
\item \texttt{cpp} -- subservices written in C++.
\begin{itemize}
\item \texttt{PanteraService} -- service implementing the \textsc{Pantera} tagger.
\item \texttt{SpejdService} -- service implementing the \textsc{Spejd} shallow parser.
\end{itemize}
\item \texttt{haskell} -- subservices written in Haskell.
\begin{itemize}
\item \texttt{concraft-multiservice-master} -- service implementing the \textsc{Concraft} tagger.
\item \texttt{nerf-multiservice-master} -- service implementing the \textsc{Nerf} named entity recogniser.
\end{itemize}
\item \texttt{java} -- subservices written in Java.
\begin{itemize}
\item \texttt{BartekService} -- service implementing the \textsc{Bartek} coreference resolver.
\item \texttt{MentionDetectorService} -- service implementing the \textsc{MentionDetector}.
\item \texttt{ServiceWrapper} -- Java wrapper for C++ subservices.
\item \texttt{ConversionService} -- service offering conversions between plain text and TEI formats.
\item \texttt{OpenTextSummarizerService} -- service implementing the \textsc{OpenTextSummarizer}.
\item \texttt{SwietlickaSummarizerService} -- service implementing the \textsc{SwietlickaSummarizer}.
\item \texttt{DependencyParserService} -- service implementing the \textsc{DependencyParser}.
\item \texttt{RulerService} -- service implementing the \textsc{Ruler} coreference resolver.
\item \texttt{UrlParser} -- service able to parse webpage at given URL and return it as a Thrift text.
\item \texttt{LakonSummarizerService} -- service implementing the \textsc{Lakon} summarizer.
\item \texttt{SampleService} -- sample service, not used in production.
\end{itemize}
\item \texttt{python} -- subservices written in Python.
\begin{itemize}
\item \texttt{PolitaService} -- service implementing the \textsc{PoliTa} tagger.
\item \texttt{TEIWriterService} -- deprecated service implementing conversion of annotated data to TEI format (replaced by \texttt{java/ConversionService}).
\item \texttt{WCRFTService} -- service implementing the \textsc{WCRFT} tagger.
\item \texttt{WMBTService} -- service implementing the \textsc{WMBT} tagger.
\end{itemize}
\end{itemize}
\item \texttt{third\_party} -- sources and installation scripts for third party requirements.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Command line clients repository contents}
\begin{itemize}
\item \texttt{java} -- Java SOAP client (see Section \ref{sec:soap-java-client}).
\item \texttt{python} -- Python clients.
\begin{itemize}
\item \texttt{multiservice-0.1-py2.7.egg} -- compiled library for python Thrift client.
\item \texttt{soap\_client.py} -- python SOAP client (see Section \ref{sec:soap-python-client}).
\item \texttt{thrift\_client.py} -- python Thrift client (see Section \ref{sec:thrift-python-client}).
\end{itemize}
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Installation and administration}
\label{ch:installation}
This chapter presents how the \textsc{Multiservice} back-end with web service and web client can be installed and administered on an Ubuntu Server 14.04 system. Superuser privileges are needed; however, the installation may be manually tweaked to run without them. Users of other systems have to go through the scripts line by line and implement their own equivalent commands.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Installation}
Installation is very straightforward: we first clone the git repository and run \texttt{installAll.sh} script.
\begin{verbatim}
sudo apt-get -y install git
git clone http://git.nlp.ipipan.waw.pl/multiservice/multiservice.git
cd multiservice/scripts
./installAll.sh
\end{verbatim}
This may take some time, probably over an hour. First part of the script downloads subservice models and configuration settings from \url{http://zil.ipipan.waw.pl/Multiservice} webpage. Then, third party components are installed:
\begin{itemize}
\item \textsc{Morfeusz}
\item \textsc{Pantera}
\item \textsc{Spejd}
\item \textsc{Maca}
\item \textsc{Concraft}
\item \textsc{WCRFT}
\item \textsc{WMBT}
\item \textsc{Thrift}
\end{itemize}
After that the script installs all subservices, the back-end management system (\texttt{RequestManager}), the SOAP webservice and \textsc{MultiserviceDemo} (see Chapter \ref{ch:multiservice-demo}). Regarding subservices, it performs:
\begin{itemize}
\item \texttt{./configure \&\& make \&\& sudo make install} command for all C++ subservices,
\item \texttt{mvn package install -DskipTests=true} command for all Java subservices,
\item \texttt{sudo python setup.py install} command for all Python subservices,
\item \texttt{sudo cabal install --global} command for all Haskell subservices.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{\textsc{MultiserviceDemo} configuration}
We should adjust \textsc{MultiserviceDemo} settings (in file \texttt{core/MultiserwisDemo/MultiserwisDemo/settings.py}) to reflect our server address, by changing the following parameters:
\begin{itemize}
\item \texttt{ROOT\_URL} -- url at which demo will be hosted; for ICS PAS instance, it is \url{http://multiservice.nlp.ipipan.waw.pl/}.
\item \texttt{STATIC\_URL} -- url at which static files for demo will be hosted; for ICS PAS instance, it is \url{http://multiservice.nlp.ipipan.waw.pl/static/}.
\item \texttt{SECRET\_KEY} -- we should add a unique value.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Administration}
\subsection{Logging}
Log files are placed in \texttt{log} folder. Each subservice or component has its own pair of log files.
\subsection{Starting}
To start everything, we should call:
\begin{verbatim}
cd scripts && ./runAll.sh
\end{verbatim}
We should add this also to CRON, if we want the service to start automatically upon reboot.
\subsection{Restarting}
We can restart everything by running the same command as we used for starting:
\begin{verbatim}
cd scripts && ./runAll.sh
\end{verbatim}
To restart a single subservice or component, we need to identify the script that starts it, called by the \texttt{runAll.sh} script. Then we can run this script to restart the given service. Start scripts first kill any daemons occupying their ports. Important: the environment variables \texttt{MULTISERVICE\_LOG} and \texttt{PATH} should be initialised as in \texttt{runAll.sh}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Development of new subservices}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{General rules}
Subservices should not contain the code generated by Thrift from the common data structures definition file. Because of that, they should import libraries with them during their build. Sections for all currently used subservice programming languages present that in more detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{C++}
C++ subservices should be placed in the \texttt{subservices/cpp} folder. They should be managed by \texttt{autotools}. Installation script executes for each one of them:
\begin{verbatim}
./configure && make && make install
\end{verbatim}
\subsubsection{PanteraService and ServiceWrapper}
Because the \textsc{Pantera} tagger is not a very reliable tool (it often results in segmentation faults), the \texttt{pantera\_service} C++ program is wrapped into a Java application called \texttt{ServiceWrapper}. \texttt{ServiceWrapper} is at the same time a standard subservice, as well as a client, which calls another subservice (both implement the \texttt{AnnotatingService} interface from \texttt{core/thrift/subservices.thrift}). If the execution of the called subservice ends with an error, \texttt{ServiceWrapper} restarts that subservice and tries to use it once more.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Java}
Java subservices should be placed in the \texttt{subservices/java} folder. They should be managed by \texttt{maven}. Installation script executes for each one of them:
\begin{verbatim}
mvn package install -DskipTests=true
\end{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Python}
Python subservices should be placed in the \texttt{subservices/python} folder. They should be managed by \texttt{setuptools}. Installation script executes for each one of them:
\begin{verbatim}
python setup.py install
\end{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Haskell}
Haskell subservices should be placed in the \texttt{subservices/haskell} folder. Installation script executes for each one of them:
\begin{verbatim}
cabal install --global
\end{verbatim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Adding new service}
To add a new subservice we need to implement a Thrift server with the \texttt{AnnotatingService} interface from \texttt{core/thrift/subservices.thrift} file. Examples of simple thrift services and clients are available at \url{http://thrift.apache.org/}.
This chapter is under development.
\begin{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{C++}
Przykładowym serwisem jest PanteraService. Zakłada on, że w systemie jest zainstalowana biblioteka libmultiservice (sprawdza za pomocą autotools, patrz configure.ac). Biblioteka ta jest dostarczana wraz z projektem core/CppUtils i zawiera źródła w C++ wygenerowane na podstawie wszystkich thriftowych interfejsów z katalogu core/thrift. Można ją zainstalować wywołując ./configure \&\& make \&\& make install w katalogu core/CppUtils lub odpalić skrypt ./installAll.sh w katalogu głównym (patrz też “Ogólne zasady”).
Biblioteka ta zawiera również klasę AnnotatingServer pomocną przy tworzeniu serwera.
Najważniejsze elementy plików \texttt{pantera\_service.cpp} and \texttt{PanteraServiceHandler.cpp} are presented in Figures \ref{panteraservice} and \ref{PanteraServiceHandler}.
\begin{figure}
\begin{verbatim}
// stworzenie obiektu, który obsługuje thriftowe metody
AnnotatingServiceIf* handler = new PanteraServiceHandler(enginePath, tagset);
// stworzenie serwera
AnnotatingServer server(handler, port);
// uruchomienie serwera
server.serve();
\end{verbatim}
\caption{pantera\_service.cpp}
\label{panteraservice}
\end{figure}
\begin{figure}
\begin{verbatim}
// realizuje metodę “annotate” z interfejsu AnnotatingService
void PanteraServiceHandler::annotate(
TText& resText,
const TText& text,
const map<string, string>& options) {
time_duration td;
…
// uruchom tagowanie morfoskładniowe
vector<NLPCommon::DefaultLexeme> taggingRes;
pantera->tag(
...,
convertTaggingOptions(options),
taggingRes);
ptime t2(microsec_clock::local_time());
// skonwertuj wynik panterowy na thriftowy
convertTaggingRes(taggingRes, resText);
// ustaw nagłówki
setAnnotationHeader(
resText,
TAnnotationLayer::SEGMENTATION, t2 - t1);
setAnnotationHeader(
resText,
TAnnotationLayer::MORPHOSYNTAX, t2 - t1);
resText.annotationDetails.hasMorphosyntaxDisambiguated = true;
resText.annotationDetails.hasSegmentsDisambiguated = true;
}
\end{verbatim}
\caption{PanteraServiceHandler.cpp}
\label{PanteraServiceHandler}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Java}
Najlepiej stworzyć projekt na podstawie subservices/java/SampleService będącego implementacją ``najgłupszego możliwego'' parsera zależnościowego.
Zdecydowanie warto też zajrzeć np. do subservices/java/LakonService (lub jakiegoś innego javowego serwisu), żeby zobaczyć pełniejszy przykład.
Żeby działało, musi być zainstalowany w lokalnym mavenowym repozytorium projekt w katalogu core/JavaUtils (wystarczy zrobić na nim mvn install lub odpalić skrypt ./installAll.sh w katalogu głównym) (patrz też ``Ogólne zasady'').
\subsection{Python}
Również są przykładowe projekty --- w katalogu subservices/python. Najlepiej stworzyć nowy podserwis na podstawie któregoś z nich. Aby działało, trzeba odpalić python setup.py install w katalogu core/PyUtils lub skrypt ./installAll.sh w katalogu głównym.
\subsection{Dodawanie nowego konwertera}
Dodawanie konwertera odbywa się analogicznie do dodawania serwisu anotującego. Sam serwer działa tak samo, jedyna różnica to konieczność implementacji serwisu InputService lub OutputService zamiast AnnotatingService.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Dodanie nowego serwisu do webserwisu}
TODO
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Dodanie nowego serwisu do Dema}
\label{sec:add_to_demo}
W pliku \texttt{core/MultiserviceDemo/MultiserviceDemo/servicesHelper.py} trzeba dodać informację o nowym serwisie w zmiennej \texttt{services}. Ewentualnie można też dodać jakiś łańcuch (w \texttt{predefinedChains}).
\begin{alltt}
predefinedChains = [
[RequestPart('Pantera')],
[RequestPart('Pantera'), RequestPart('Spejd')],
\textcolor{red}{[RequestPart(MójNowyTager), RequestPart('Spejd')],}
...,
]
services = [
ServiceInfo(
name='Pantera',
serviceType=ServiceType.ANNOTATING,
requiredLayers=[],
providedLayers=[TAnnotationLayer.SEGMENTATION, TAnnotationLayer.MORPHOSYNTAX],
availableOptions=[]),
\textcolor{red}{ServiceInfo(
name='MójNowyTager',
serviceType=ServiceType.ANNOTATING,
requiredLayers=[],
providedLayers=[TAnnotationLayer.SEGMENTATION, TAnnotationLayer.MORPHOSYNTAX],
availableOptions=[]),}
ServiceInfo(
name='Spejd',
serviceType=ServiceType.ANNOTATING,
requiredLayers=[TAnnotationLayer.SEGMENTATION, TAnnotationLayer.MORPHOSYNTAX],
providedLayers=[TAnnotationLayer.WORDS, TAnnotationLayer.GROUPS],
availableOptions=[]),
...
]
\end{alltt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Opcje dla nowego serwisu w Demie}
Niektóre serwisy przyjmują opcje (np. oczekiwany rozmiar wyniku dla streszczaczy). W pliku \texttt{core/MultiserviceDemo/MultiserviceDemo/servicesHelper.py} dla każdego serwisu są one zdefiniowane w zmiennej \texttt{availableOptions}. Każdy element ma być następującej postaci:
\begin{verbatim}
OptionInfo(
name='nazwaOpcji',
type=OptionType.INT,
defaultValue='...'
)
\end{verbatim}
Można podać \texttt{defaultValue=None} --- oznacza to, że domyślna wartość ma być nieokreślona. Dozwolone pola \texttt{OptionType} są takie, jak zdefiniowano w \texttt{core/thrift/multiservice.thrift} (obsługiwane jest \texttt{INT, FLOAT, STRING, BOOLEAN}).
Obsługa wyświetlania opcji na liście jest w pliku (funkcja \texttt{getOptionsListElement}): \texttt{MultiserviceDemo/static/js/requestCreation.js}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Dodawanie nowego typu serwisu}
Przy dodawaniu nowego typu serwisu konieczne są kroki przy dodawaniu nowego serwisu, opisane wyżej, a dodatkowo to, co jest opisane w tej sekcji.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Dodanie nowego typu thriftowego}
Nowy typ thriftowy dodajemy do pliku \texttt{core/thrift/types.thrift}. Można dodawać zarówno do TSentence (tak jak np. list<TToken>), jak i do TText (jak np. summary) --- zależnie od tego, co ten typ reprezentuje. Każde kolejne pole w danej strukturze powinno być kolejną liczbą naturalną, np.:
\begin{alltt}
struct TSentence {
1: string id,
2: list<TToken> tokens,
3: list<TToken> rejectedTokens,
4: list<TSyntacticWord> words,
5: list<TSyntacticGroup> groups,
6: list<TNamedEntity> names,
7: list<DependencyParseNode> dependencyParse,
8: list<TMention> mentions,
9: list<TSentimentTag> sentimentTags,
\textcolor{red}{10: list<TJakiśNowyTyp> listaNowychObiektów}
}
\end{alltt}
Nowy typ powinien być reprezentowany przez jakąś warstwę anotacji (AnnotationLayer), którą też należy dodać:
\begin{alltt}
enum TAnnotationLayer {
SEGMENTATION,
MORPHOSYNTAX,
WORDS,
GROUPS,
NAMES,
SUMMARY,
DEPENDENCY_PARSE,
MENTIONS,
COREFERENCE,
SENTIMENT,
\textcolor{red}{JAKAŚ_NOWA_WARSTWA}
}
\end{alltt}
Dzięki temu będzie można (trzeba) dodawać nagłówki właściwe dla tej nowej warstwy anotacji (w \texttt{TText::annotationHeaders}).
Analogicznie trzeba dodać informację o nowej warstwie do dema. W pliku \texttt{core/MultiserviceDemo/MultiserviceDemo/static/js/multiservice.js}:
\begin{alltt}
layerNumber2Name = {
0 : 'SEGMENTATION',
1 : 'MORPHOSYNTAX',
2 : 'WORDS',
3 : 'GROUPS',
4 : 'NAMES',
5 : 'SUMMARY',
6 : 'DEPENDENCY_PARSE',
7 : 'MENTIONS',
8 : 'COREFERENCE',
9 : 'SENTIMENT',
\textcolor{red}{10: 'JAKAŚ_NOWA_WARSTWA'}
}
\end{alltt}
Przykłady powinny wystarczyć, w razie problemów szczegóły jak definiować nowe obiekty są tutaj: \url{http://diwakergupta.github.io/thrift-missing-guide/}
\subsection{Wygenerowanie i przeinstalowanie nowej wersji źródeł thriftowych}
Wystarczy wykonać:
\begin{verbatim}
./generateThrift.sh
./installAll.sh
\end{verbatim}
Ewentualnie można zainstalować projekty CppUtils, JavaUtils, PyUtils każdy osobno.
Warning: after layers are changed, some subservices may stop working and their sources need to be adjusted (in addition to Thrift code generation). The only subservice known to have such issues is \texttt{concraft-multiservice-master}. As it is a tagger, it is run as the first element of the chain and has to construct appropriate empty structures.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Dodawanie wizualizacji nowego typu serwisu}
Several steps are needed:
\begin{itemize}
\item Add the new service to the \textsc{MultiserviceDemo} (see Section~\ref{sec:add_to_demo}).
\end{itemize}
W pliku \texttt{core/MultiserviceDemo/MultiserviceDemo/static/js/multiservice.js} dodać odpowiedni wpis do zmiennej \texttt{layer2Visualization}:
\begin{alltt}
var layer2Visualization = {
'SEGMENTATION' : [{
label : 'Segmentation',
elementId : 'bratSegmentationVisualization',
visualizationType : VisualizationType.TEXT,
visualizationFunction : bratVisualizeSegmentation
}],
\textcolor{red}{'MOJA_NOWA_WARSTWA' : [{
label : 'My New Layer',
elementId : 'myNewLayerVisualization',
visualizationType : VisualizationType.TEXT,
visualizationFunction : bratVisualizeMyNewLayer
}],}
...
};
\end{alltt}
W pliku \texttt{core/MultiserviceDemo/MultiserviceDemo/static/js/brat\_visualization.js}:
\begin{itemize}
\item zadeklarować zmienną \texttt{myNewLayerCollData}:
\begin{alltt}
...
var mentionsCollData;
var coreferenceCollData;
var namesCollData;
\textcolor{red}{var myNewLayerCollData;}
\end{alltt}
\item Zainicjalizować zmienną \texttt{myNewLayerCollData} w funkcji \texttt{bratInitialize} (korzystając z przykładów dla pozostałych warstw, np. patrz \texttt{namesCollData}).
\item Napisać treść funkcji \texttt{bratVisualizeMyNewLayer} (np. na wzór \texttt{bratVisualizeNames}).
\item Trzeba będzie pewnie dopisać też parę funkcji pomocniczych i zmodyfikować trochę istniejących.
\end{itemize}
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}