Commit e8f36ddb80503c8fc4b08b4b92ae313ce7236871

Authored by Wojciech Jaworski
1 parent fab3e07a

Uruchomienie Świgry

.gitignore
... ... @@ -8,3 +8,6 @@
8 8 *.log
9 9 *.tex.backup
10 10 tools/mate-tools/dist/*
  11 +tools/swigra/disambiguator-pcfg/*.pyc
  12 +tools/swigra/parser/httpd/forest-disamb.xml
  13 +tools/swigra/parser/httpd/forest.xml
... ...
integration/ENIAMpreIntegration.ml
... ... @@ -32,6 +32,11 @@ let mate_parser_path = "../tools/mate-tools/"
32 32 let mate_in = ref stdin
33 33 let mate_out = ref stdout
34 34  
  35 +let swigra_path = "../tools/swigra/parser"
  36 +let swigra_in = ref stdin
  37 +let swigra_out = ref stdout
  38 +let swigra_err = ref stdin
  39 +
35 40 let concraft_exists () =
36 41 let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl")
37 42 [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
... ... @@ -42,7 +47,7 @@ let concraft_exists () =
42 47 true
43 48 with End_of_file -> ignore @@ close_check (); false
44 49  
45   -let wait_for_server () =
  50 +let wait_for_concraft_server () =
46 51 let rec wait s a =
47 52 try Unix.connect s a
48 53 with e -> Unix.sleep 1; wait s a in
... ... @@ -52,32 +57,50 @@ let wait_for_server () =
52 57 Unix.shutdown s Unix.SHUTDOWN_SEND;
53 58 Unix.close s
54 59  
55   -let start_server m =
  60 +let start_concraft_server m =
56 61 let client_out, server_out = Unix.pipe () in
57 62 let client_err, server_err = Unix.pipe () in
58 63 let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|]
59 64 Unix.stdin server_out server_err in
60 65 List.iter Unix.close [client_out; server_out; client_err; server_err];
61   - wait_for_server ();
  66 + wait_for_concraft_server ();
62 67 pid
63 68  
64   -let stop_server pid =
  69 +let stop_concraft_server pid =
65 70 Unix.kill pid Sys.sigint
66 71  
  72 +let start_swigra_server dir =
  73 + let serv_in, serv_out, serv_err = Unix.open_process_full ("cd " ^ dir ^ "; ./swigra -w")
  74 + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  75 + ignore @@ input_line serv_in;
  76 + serv_in, serv_out, serv_err
  77 +
  78 +let stop_swigra_server (serv_in, serv_out, serv_err) =
  79 + output_string serv_out "halt.\n";
  80 + ignore @@ Unix.close_process_full (serv_in, serv_out, serv_err)
  81 +
67 82 let initialize () =
68 83 if !concraft_enabled then (
69 84 if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else
70 85 if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else
71 86 print_endline "Starting Concraft Server";
72   - concraft_server_pid := start_server concraft_model_filename;
  87 + concraft_server_pid := start_concraft_server concraft_model_filename;
73 88 print_endline "Server started");
74 89 if !mate_parser_enabled then (
75 90 let m_in, m_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*)
76 91 Unix.open_process ("java -jar " ^ mate_parser_path ^ "dist/anna-3.5.jar -model " ^
77 92 mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") in
78 93 mate_in := m_in;
79   - mate_out := m_out)
  94 + mate_out := m_out);
  95 + if !swigra_enabled then (
  96 + let a,b,c = start_swigra_server swigra_path in
  97 + swigra_in := a;
  98 + swigra_out := b;
  99 + swigra_err := c)
80 100  
  101 +let stop_servers () =
  102 + if !concraft_enabled then stop_concraft_server !concraft_server_pid;
  103 + if !swigra_enabled then stop_swigra_server (!swigra_in, !swigra_out, !swigra_err)
81 104  
82 105 let read_whole_channel c =
83 106 let r = ref [] in
... ... @@ -155,18 +178,41 @@ let rec parse_mate_sentence tokens pbeg s =
155 178 print_endline "parse_mate2 5";
156 179 DepSentence[paths1;paths2]
157 180  
  181 +let curl_swigra s =
  182 + let curl_in, curl_out, curl_err = Unix.open_process_full ("curl 'http://localhost:3333/swigra' --data-urlencode 'q=" ^ s ^ "'")
  183 + [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  184 + try
  185 + while true do
  186 + ignore @@ input_line curl_in
  187 + done
  188 + with End_of_file -> ignore @@ Unix.close_process_full (curl_in, curl_out, curl_err)
  189 +
  190 +let print_swigra_xml dir =
  191 + let xml_in = open_in @@ dir ^ "/httpd/forest-disamb.xml" in
  192 + try
  193 + while true do
  194 + print_endline @@ input_line xml_in
  195 + done
  196 + with End_of_file -> close_in xml_in
  197 +
  198 +let parse_swigra_sentence s =
  199 + curl_swigra s;
  200 + let xml = Xml.parse_file (swigra_path ^ "/httpd/forest-disamb.xml") in
  201 + print_swigra_xml swigra_path;
  202 + RawSentence s
  203 +
158 204 let compare_mode (x,_) (y,_) = compare_mode x y
159 205  
160 206 let rec parse_sentence mode tokens pbeg = function
161 207 RawSentence s ->
162 208 [Raw,RawSentence s] @
163 209 (if !mate_parser_enabled && !concraft_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @
164   - (if !swigra_enabled then [Swigra,RawSentence s] else []) @
  210 + (if !swigra_enabled then [Swigra,parse_swigra_sentence s] else []) @
165 211 (if !polfie_enabled then [POLFIE,RawSentence s] else [])
166 212 | StructSentence(paths,last) -> [mode,StructSentence(paths,last)]
167 213 | DepSentence(paths) -> [mode,DepSentence paths]
168 214 | QuotedSentences sentences ->
169   - let sentences =Xlist.rev_map sentences (fun p ->
  215 + let sentences = Xlist.rev_map sentences (fun p ->
170 216 let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *)
171 217 let sentence = match sentence with
172 218 [_,s] -> s
... ...
integration/makefile
... ... @@ -29,6 +29,10 @@ eniam-integration.cmxa: $(SOURCES)
29 29 test: test.ml
30 30 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
31 31  
  32 +# swigra_test: swigra_test.ml
  33 +# ocamlopt -o swigra_test $(OCAMLOPTFLAGS) swigra_test.ml
  34 +
  35 +
32 36 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
33 37  
34 38 .mll.ml:
... ...
integration/test.ml
... ... @@ -19,16 +19,17 @@
19 19  
20 20 let test_strings = [
21 21 "Szpak frunie.";
22   - "Kot np. miauczy.";
  22 + (* "Kot np. miauczy.";
23 23 "Ala ma kota.";
24 24 "Ale mają kota:";
25 25 "Szpak frunie. Kot miauczy.";
26   - "Szpak powiedział: „Frunę. Kiszę.”";
  26 + "Szpak powiedział: „Frunę. Kiszę.”"; *)
27 27 ]
28 28  
29 29 let _ =
30   - ENIAMpreIntegration.concraft_enabled := true;
31   - ENIAMpreIntegration.mate_parser_enabled := true;
  30 + (* ENIAMpreIntegration.concraft_enabled := true;
  31 + ENIAMpreIntegration.mate_parser_enabled := true; *)
  32 + ENIAMpreIntegration.swigra_enabled := true;
32 33 ENIAMsubsyntax.initialize ();
33 34 ENIAMpreIntegration.initialize ();
34 35 print_endline "Testy wbudowane";
... ... @@ -50,5 +51,5 @@ let _ =
50 51 print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
51 52 s := read_line ()
52 53 done;*)
53   - ENIAMpreIntegration.stop_server !ENIAMpreIntegration.concraft_server_pid;
  54 + ENIAMpreIntegration.stop_servers ();
54 55 ()
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -394,7 +394,9 @@ let digit_patterns4 = [
394 394  
395 395 let url_patterns1 = [
396 396 [L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
  397 + [L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
397 398 [L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
  399 + [L; S "."; L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
398 400 [L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
399 401 [L; S "."; L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
400 402 [L; S "."; L; S "."; L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
... ... @@ -447,9 +449,8 @@ let url_patterns2 = [
447 449 [L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
448 450 [L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
449 451 [L; S "."; L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
450   - [L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  452 + [L; D "intnum"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
451 453 [L; S "."; L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
452   - [L; S "-"; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
453 454 [O "http"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
454 455 [O "https"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
455 456 ]
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -381,7 +381,7 @@ let recognize_stem poss_s_beg has_sufix i letters =
381 381 else
382 382 if Xlist.size letters = 1 then
383 383 if first_capital letters then Variant[
384   - Token{t with token=SmallLetter orth; weight=cs_weight; attrs=CS :: t.attrs};
  384 + Token{t with token=SmallLetter(merge (lowercase_first letters)); weight=cs_weight; attrs=CS :: t.attrs};
385 385 Token{t with token=CapLetter(orth,merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}]
386 386 else Token{t with token=SmallLetter orth}
387 387 else
... ... @@ -790,6 +790,14 @@ let rec recognize_sign_group poss_s_beg i = function
790 790 Seq[Token{empty_token_env with orth=":";beg=i;len=factor;next=i+factor;token=Interp ":"; attrs=[MaybeCS]};
791 791 Token{empty_token_env with orth="]";beg=i+factor;len=factor;next=i+2*factor;token=Interp "]"; attrs=[MaybeCS]}]],i2,l,false
792 792 | (Sign ";") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "]") :: []) l (make_lemma (";]","sinterj"))
  793 + | (Sign ":") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "P") :: []) l (make_lemma (":P","sinterj"))
  794 + | (Sign ";") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "P") :: []) l (make_lemma (";P","sinterj"))
  795 + | (Sign ":") :: (Sign "-") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "P") :: []) l (make_lemma (":-P","sinterj"))
  796 + | (Sign ";") :: (Sign "-") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "-") :: (Sign "P") :: []) l (make_lemma (";-P","sinterj"))
  797 + | (Sign ":") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "D") :: []) l (make_lemma (":D","sinterj"))
  798 + | (Sign ";") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "D") :: []) l (make_lemma (";D","sinterj"))
  799 + | (Sign ":") :: (Sign "-") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "D") :: []) l (make_lemma (":-D","sinterj"))
  800 + | (Sign ";") :: (Sign "-") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "-") :: (Sign "D") :: []) l (make_lemma (";-D","sinterj"))
793 801 | (Sign "]") :: l -> create_sign_token poss_s_beg i [Sign "]"] l (Interp "]")
794 802 | (Sign "[") :: l -> create_sign_token poss_s_beg i [Sign "["] l (Interp "[")
795 803 | (Sign ":") :: l ->
... ... @@ -983,7 +991,7 @@ let rec recognize_sign_group poss_s_beg i = function
983 991 | (Sign "^") :: (Sign "^") :: l -> create_sign_token poss_s_beg i [Sign "^";Sign "^"] l (make_lemma ("^^","sinterj"))
984 992 | (Sign "^") :: l -> create_sign_token poss_s_beg i [Sign "^"] l (Symbol "^")
985 993 | (Sign "|") :: l -> create_sign_token poss_s_beg i [Sign "|"] l (Symbol "|")
986   - | (Sign "&") :: l ->
  994 + | (Sign "&") :: l ->
987 995 let t,i = create_empty_sign_token i [Sign "&"] in
988 996 Variant[Token{t with token=Symbol "&"};Token{t with token=make_lemma ("&","conj")}],i,l,false
989 997 | (Sign "=") :: l -> create_sign_token poss_s_beg i [Sign "="] l (Symbol "=")
... ...
tokenizer/test.ml
... ... @@ -68,15 +68,21 @@ let test_strings = [
68 68 "( Głosujmy !)";
69 69 "„Dialog”"; *)
70 70 (* "x br." *)
71   - "ponad 388 tys. ludzi";
72   - "ponad 388 tys. km.2";
  71 + (* "ponad 388 tys. ludzi";
  72 + "ponad 388 tys. km.2"; *)
73 73 (* "(PTTK Żyrardów, tel. 0-46 855-45-26)";
74 74 "40-045 Katowice, ul. Astrów 7, tel. (032) 51 30 86, tel. i faks 51 86 28, 517 193, 518 609"; *)
75   - "przeciętnie 7,5 tys. kibiców";
  75 + (* "przeciętnie 7,5 tys. kibiców";
76 76 "0,4mln";
77 77 "8,8665tys.";
78 78 "70-75 tys.";
79   - "70-75tys.";
  79 + "70-75tys."; *)
  80 + (* "myjni \"A-Car Auto\" Myjnia"; *)
  81 + "nowaka@lp2.pl";
  82 + (* "poufale :P) Im"; *)
  83 + (*"Piłsudskiego 12 A Konstancin-Jeziorna";
  84 + "Mary Mary";
  85 + "TTTTTTTK TTTTTTTK"; *)
80 86 ]
81 87  
82 88 let _ =
... ...