Commit 5b9fa813664ddc56e2dc9baee016feb22ca3969e

Authored by Wojciech Jaworski
2 parents b3b2268e 305543d8

Merge branch 'corpora' of ssh://git.nlp.ipipan.waw.pl:8888/wojciech.jaworski/ENIAM into corpora

corpora/freq_test.ml 0 → 100644
  1 +
  2 +open Xstd
  3 +
  4 +let _ =
  5 + let l = File.load_tab "../resources/NKJP1M/NKJP1M_frequencies.tab" (function
  6 + [orth; lemma; interp; freq] -> orth, lemma, interp, int_of_string freq
  7 + | _ -> failwith "load_frequencies") in
  8 + let qmap = Xlist.fold l StringQMap.empty (fun qmap (orth, lemma, interp, freq) ->
  9 + StringQMap.add_val qmap (lemma ^ "\t" ^ interp) freq) in
  10 + StringQMap.iter qmap (fun k v -> Printf.printf "%d\t%s\n" v k)
... ...
corpora/makefile
... ... @@ -13,6 +13,8 @@ all:
13 13 lib:
14 14 $(OCAMLOPT) -linkall -a -o corpora.cmxa $(INCLUDES) $(MODS)
15 15  
  16 +freq_test:
  17 + $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml
16 18  
17 19 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
18 20  
... ... @@ -35,4 +37,4 @@ lib:
35 37 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
36 38  
37 39 clean:
38   - rm -f *~ *.cm[oix] *.o generate corpora
  40 + rm -f *~ *.cm[oix] *.o generate corpora freq_test
... ...
parser/exec.ml
... ... @@ -137,36 +137,6 @@ let rec translate_text = function
137 137 | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) ->
138 138 translate_mode mode, translate_text text))
139 139  
140   -let string_of_mode = function
141   - Raw -> "Raw"
142   - | Struct -> "Struct"
143   - | CONLL -> "CONLL"
144   - | ENIAM -> "ENIAM"
145   - | Mate -> "Mate"
146   -
147   -let rec string_of_sentence = function
148   - RawSentence s -> sprintf "RawSentence(%s)" s
149   - | StructSentence(paths,last) -> sprintf "StructSentence(%s,%d)" (Visualization.string_of_paths1 paths) last
150   - | ORSentence _ -> failwith "string_of_sentence: ni"
151   - | AltSentence l -> sprintf "AltSentence([\n %s])" (String.concat ";\n " (Xlist.map l (fun (mode,sentence) ->
152   - string_of_mode mode ^ ", " ^ string_of_sentence sentence)))
153   - | _ -> failwith "string_of_sentence: ni"
154   -
155   -let rec string_of_paragraph = function
156   - RawParagraph s -> sprintf "RawParagraph(%s)" s
157   - | StructParagraph sentences ->
158   - sprintf "StructParagraph([\n %s])" (String.concat ";\n " (Xlist.map sentences (fun p ->
159   - sprintf "{pid=%s; pbeg=%d; plen=%d; psentence=%s}" p.pid p.pbeg p.plen (string_of_sentence p.psentence))))
160   - | AltParagraph l -> sprintf "AltParagraph(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,paragraph) ->
161   - string_of_mode mode ^ ", " ^ string_of_paragraph paragraph)))
162   -
163   -let rec string_of_text = function
164   - RawText s -> sprintf "RawText(%s)" s
165   - | StructText(paragraphs,next_id) ->
166   - sprintf "StructText([\n %s],%d)" (String.concat ";\n " (Xlist.map paragraphs string_of_paragraph)) next_id
167   - | AltText l -> sprintf "AltText(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,text) ->
168   - string_of_mode mode ^ ", " ^ string_of_text text)))
169   -
170 140 let eniam_parse_sentence timeout test_only_flag paths last next_id =
171 141 let result = empty_eniam_parse_result in
172 142 let time2 = time_fun () in
... ...
parser/pipe.ml
... ... @@ -163,8 +163,10 @@ let process_conll_corpus filename =
163 163 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
164 164 Xlist.iter corpus (fun query ->
165 165 let result = Exec.process_query ic oc 3000. false "x" query 10 in
166   - printf "input_text:\n%s\n" (Exec.string_of_text result.input_text);
167   - printf "pre_text:\n%s\n" (Exec.string_of_text result.pre_text);
  166 + Visualization.print_html_text "results/" "input_text" result.input_text;
  167 + Visualization.print_html_text "results/" "pre_text" result.pre_text;
  168 + (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text);
  169 + printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *)
168 170 Exec.print_result stdout result;
169 171 (* Visualization.print_paths "results/" "paths" result.paths; *)
170 172 ());
... ...
parser/visualization.ml
... ... @@ -60,12 +60,12 @@ let paths_to_string_indexed (paths,last,next_id) =
60 60 ^ Printf.sprintf "\nlast=%d next_id=%d" last next_id
61 61  
62 62 let string_of_token_record1 t =
63   - (* sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;lnode=%d;rnode=%d;conll_id=%s;conll_super=%s;conll_label=%s;attrs=[%s]}"
  63 + sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;lnode=%d;rnode=%d;conll_id=%s;conll_super=%s;conll_label=%s;attrs=[%s]}"
64 64 t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token)
65 65 t.PreTypes.id t.PreTypes.lnode t.PreTypes.rnode t.PreTypes.conll_id t.PreTypes.conll_super t.PreTypes.conll_label
66   - (String.concat ";" t.PreTypes.attrs) *)
67   - sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s}"
68   - t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token)
  66 + (String.concat ";" t.PreTypes.attrs)
  67 + (* sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s}"
  68 + t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) *)
69 69  
70 70 let string_of_paths1 paths =
71 71 String.concat "\n " (Xlist.map paths string_of_token_record1)
... ... @@ -619,3 +619,106 @@ let print_other_result file cg_bin_path query result =
619 619 fprintf file "\n<H3>%s</H3>\n" query;
620 620 fprintf file "\n<P>%s\n" (generate_status_message result result.status);
621 621 fprintf file "%s\n" page_trailer
  622 +
  623 +let string_of_mode = function
  624 + Raw -> "Raw"
  625 + | Struct -> "Struct"
  626 + | CONLL -> "CONLL"
  627 + | ENIAM -> "ENIAM"
  628 + | Mate -> "Mate"
  629 +
  630 +let rec string_of_sentence = function
  631 + RawSentence s -> sprintf "RawSentence(%s)" s
  632 + | StructSentence(paths,last) -> sprintf "StructSentence(%s,%d)" (string_of_paths1 paths) last
  633 + | ORSentence _ -> failwith "string_of_sentence: ni"
  634 + | AltSentence l -> sprintf "AltSentence([\n %s])" (String.concat ";\n " (Xlist.map l (fun (mode,sentence) ->
  635 + string_of_mode mode ^ ", " ^ string_of_sentence sentence)))
  636 + | _ -> failwith "string_of_sentence: ni"
  637 +
  638 +let rec string_of_paragraph = function
  639 + RawParagraph s -> sprintf "RawParagraph(%s)" s
  640 + | StructParagraph sentences ->
  641 + sprintf "StructParagraph([\n %s])" (String.concat ";\n " (Xlist.map sentences (fun p ->
  642 + sprintf "{pid=%s; pbeg=%d; plen=%d; psentence=%s}" p.pid p.pbeg p.plen (string_of_sentence p.psentence))))
  643 + | AltParagraph l -> sprintf "AltParagraph(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,paragraph) ->
  644 + string_of_mode mode ^ ", " ^ string_of_paragraph paragraph)))
  645 +
  646 +let rec string_of_text = function
  647 + RawText s -> sprintf "RawText(%s)" s
  648 + | StructText(paragraphs,next_id) ->
  649 + sprintf "StructText([\n %s],%d)" (String.concat ";\n " (Xlist.map paragraphs string_of_paragraph)) next_id
  650 + | AltText l -> sprintf "AltText(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,text) ->
  651 + string_of_mode mode ^ ", " ^ string_of_text text)))
  652 +
  653 +let html_header = (* Opening part of a generated HTML page: doctype, <head> metadata, and the opening <body>/<center> tags that html_trailer closes. *)
  654 +"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
  655 +<html>
  656 + <head>
  657 + <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\">
  658 + <TITLE>ENIAM: Kategorialny Parser Składniowo-Semantyczny</TITLE>
  659 + <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
  660 + </head>
  661 +
  662 + <body>
  663 + <center>"
  664 +
  665 +let html_trailer = (* Closing part of a generated HTML page: closes the <center>, <body> and <html> elements opened by html_header. *)
  666 +"</center>
  667 + </body>
  668 +</html>"
  669 +
  670 +let escape_html s =
  671 + Int.fold 0 (String.length s - 1) "" (fun t i ->
  672 + match String.sub s i 1 with
  673 + "<" -> t ^ "&lt;"
  674 + | ">" -> t ^ "&gt;"
  675 + | "&" -> t ^ "&amp;"
  676 + | c -> t ^ c)
  677 +
  678 +let html_of_struct_sentence paths last =
  679 + "<table><tr><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td><td><b>id</b></td><td><b>lnode</b></td><td><b>rnode</b></td><td><b>conll_id</b></td><td><b>conll_super</b></td><td><b>conll_label</b></td></td><td><b>attrs</b></td></tr>" ^
  680 + String.concat "\n" (Xlist.map paths (fun t ->
  681 + sprintf "<tr><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>"
  682 + t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (escape_html (string_of_token t.PreTypes.token))
  683 + t.PreTypes.id t.PreTypes.lnode t.PreTypes.rnode t.PreTypes.conll_id t.PreTypes.conll_super t.PreTypes.conll_label
  684 + (String.concat "; " t.PreTypes.attrs))) ^
  685 + sprintf "<tr><td></td><td>%d</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>" last ^
  686 + "</table>"
  687 +
  688 +let rec html_of_sentence = function
  689 + RawSentence s -> s
  690 + | StructSentence(paths,last) -> html_of_struct_sentence paths last
  691 + | ORSentence _ -> failwith "html_of_sentence: ni"
  692 + | AltSentence l ->
  693 + "<table border=1>" ^
  694 + String.concat "\n" (Xlist.map l (fun (mode,sentence) ->
  695 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence sentence))) ^
  696 + "</table>"
  697 + | _ -> failwith "html_of_sentence: ni"
  698 +
  699 +let rec html_of_paragraph = function
  700 + RawParagraph s -> s
  701 + | StructParagraph sentences ->
  702 + String.concat "<BR>\n" (Xlist.map sentences (fun p ->
  703 + sprintf "pid=%s pbeg=%d plen=%d<BR>%s" p.pid p.pbeg p.plen (html_of_sentence p.psentence)))
  704 + | AltParagraph l ->
  705 + "<table border=2>" ^
  706 + String.concat "\n" (Xlist.map l (fun (mode,paragraph) ->
  707 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_paragraph paragraph))) ^
  708 + "</table>"
  709 +
  710 +let rec html_of_text = function
  711 + RawText s -> s
  712 + | StructText(paragraphs,next_id) ->
  713 + sprintf "next_id=%d<BR>\n%s" next_id (String.concat "<BR>\n" (Xlist.map paragraphs html_of_paragraph))
  714 + | AltText l ->
  715 + "<table border=3>" ^
  716 + String.concat "\n" (Xlist.map l (fun (mode,text) ->
  717 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text text))) ^
  718 + "</table>"
  719 +
  720 +let print_html_text path name text =
  721 + File.file_out (path ^ name ^ ".html") (fun file ->
  722 + fprintf file "%s\n" html_header;
  723 + fprintf file "%s\n" (html_of_text text);
  724 + fprintf file "%s\n" html_trailer)
... ...
pre/preProcessing.ml
... ... @@ -556,9 +556,9 @@ let parse query next_id =
556 556 let paths = translate_digs paths in
557 557 let paths = PreWordnet.assign_senses paths in
558 558 (* print_endline "a14"; *)
559   - let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *)
560   -(* print_endline "a15"; *)
561 559 let paths = assign_valence paths in
  560 +(* print_endline "a15"; *)
  561 + let paths = combine_interps paths in
562 562 (* print_endline "a16"; *)
563 563 let paths = disambiguate_senses paths in
564 564 let paths = assign_simplified_valence paths in
... ...