Commit 5b9fa813664ddc56e2dc9baee016feb22ca3969e
Merge branch 'corpora' of ssh://git.nlp.ipipan.waw.pl:8888/wojciech.jaworski/ENIAM into corpora
Showing
6 changed files
with
126 additions
and
39 deletions
corpora/freq_test.ml
0 → 100644
1 | + | |
2 | +open Xstd | |
3 | + | |
(* Loads the NKJP1M frequency table, feeds every row's frequency into a
   StringQMap keyed by "lemma<TAB>interp", and prints one
   "<value>\t<lemma>\t<interp>" line per key to stdout.
   NOTE(review): StringQMap.add_val presumably accumulates the values per
   key; confirm against Xstd. Rows that do not have exactly four columns
   raise Failure "load_frequencies". *)
let _ =
  let parse_row = function
    | [orth; lemma; interp; freq] -> orth, lemma, interp, int_of_string freq
    | _ -> failwith "load_frequencies" in
  let rows = File.load_tab "../resources/NKJP1M/NKJP1M_frequencies.tab" parse_row in
  (* The orthographic form is not part of the key, so it is ignored here. *)
  let add_row acc (_, lemma, interp, freq) =
    StringQMap.add_val acc (lemma ^ "\t" ^ interp) freq in
  let totals = Xlist.fold rows StringQMap.empty add_row in
  StringQMap.iter totals (fun key count -> Printf.printf "%d\t%s\n" count key)
... | ... |
corpora/makefile
... | ... | @@ -13,6 +13,8 @@ all: |
13 | 13 | lib: |
14 | 14 | $(OCAMLOPT) -linkall -a -o corpora.cmxa $(INCLUDES) $(MODS) |
15 | 15 | |
# Build the freq_test executable. Listing freq_test.ml as a prerequisite makes
# make relink the binary when that source changes; without any prerequisite an
# existing freq_test file is always considered up to date.
freq_test: freq_test.ml
	$(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml
16 | 18 | |
17 | 19 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
18 | 20 | |
... | ... | @@ -35,4 +37,4 @@ lib: |
35 | 37 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
36 | 38 | |
37 | 39 | clean: |
38 | - rm -f *~ *.cm[oix] *.o generate corpora | |
40 | + rm -f *~ *.cm[oix] *.o generate corpora freq_test | |
... | ... |
parser/exec.ml
... | ... | @@ -137,36 +137,6 @@ let rec translate_text = function |
137 | 137 | | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
138 | 138 | translate_mode mode, translate_text text)) |
139 | 139 | |
140 | -let string_of_mode = function | |
141 | - Raw -> "Raw" | |
142 | - | Struct -> "Struct" | |
143 | - | CONLL -> "CONLL" | |
144 | - | ENIAM -> "ENIAM" | |
145 | - | Mate -> "Mate" | |
146 | - | |
147 | -let rec string_of_sentence = function | |
148 | - RawSentence s -> sprintf "RawSentence(%s)" s | |
149 | - | StructSentence(paths,last) -> sprintf "StructSentence(%s,%d)" (Visualization.string_of_paths1 paths) last | |
150 | - | ORSentence _ -> failwith "string_of_sentence: ni" | |
151 | - | AltSentence l -> sprintf "AltSentence([\n %s])" (String.concat ";\n " (Xlist.map l (fun (mode,sentence) -> | |
152 | - string_of_mode mode ^ ", " ^ string_of_sentence sentence))) | |
153 | - | _ -> failwith "string_of_sentence: ni" | |
154 | - | |
155 | -let rec string_of_paragraph = function | |
156 | - RawParagraph s -> sprintf "RawParagraph(%s)" s | |
157 | - | StructParagraph sentences -> | |
158 | - sprintf "StructParagraph([\n %s])" (String.concat ";\n " (Xlist.map sentences (fun p -> | |
159 | - sprintf "{pid=%s; pbeg=%d; plen=%d; psentence=%s}" p.pid p.pbeg p.plen (string_of_sentence p.psentence)))) | |
160 | - | AltParagraph l -> sprintf "AltParagraph(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,paragraph) -> | |
161 | - string_of_mode mode ^ ", " ^ string_of_paragraph paragraph))) | |
162 | - | |
163 | -let rec string_of_text = function | |
164 | - RawText s -> sprintf "RawText(%s)" s | |
165 | - | StructText(paragraphs,next_id) -> | |
166 | - sprintf "StructText([\n %s],%d)" (String.concat ";\n " (Xlist.map paragraphs string_of_paragraph)) next_id | |
167 | - | AltText l -> sprintf "AltText(\n %s)" (String.concat "\n " (Xlist.map l (fun (mode,text) -> | |
168 | - string_of_mode mode ^ ", " ^ string_of_text text))) | |
169 | - | |
170 | 140 | let eniam_parse_sentence timeout test_only_flag paths last next_id = |
171 | 141 | let result = empty_eniam_parse_result in |
172 | 142 | let time2 = time_fun () in |
... | ... |
parser/pipe.ml
... | ... | @@ -163,8 +163,10 @@ let process_conll_corpus filename = |
163 | 163 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
164 | 164 | Xlist.iter corpus (fun query -> |
165 | 165 | let result = Exec.process_query ic oc 3000. false "x" query 10 in |
166 | - printf "input_text:\n%s\n" (Exec.string_of_text result.input_text); | |
167 | - printf "pre_text:\n%s\n" (Exec.string_of_text result.pre_text); | |
166 | + Visualization.print_html_text "results/" "input_text" result.input_text; | |
167 | + Visualization.print_html_text "results/" "pre_text" result.pre_text; | |
168 | + (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text); | |
169 | + printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *) | |
168 | 170 | Exec.print_result stdout result; |
169 | 171 | (* Visualization.print_paths "results/" "paths" result.paths; *) |
170 | 172 | ()); |
... | ... |
parser/visualization.ml
... | ... | @@ -60,12 +60,12 @@ let paths_to_string_indexed (paths,last,next_id) = |
60 | 60 | ^ Printf.sprintf "\nlast=%d next_id=%d" last next_id |
61 | 61 | |
62 | 62 | let string_of_token_record1 t = |
63 | - (* sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;lnode=%d;rnode=%d;conll_id=%s;conll_super=%s;conll_label=%s;attrs=[%s]}" | |
63 | + sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;lnode=%d;rnode=%d;conll_id=%s;conll_super=%s;conll_label=%s;attrs=[%s]}" | |
64 | 64 | t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) |
65 | 65 | t.PreTypes.id t.PreTypes.lnode t.PreTypes.rnode t.PreTypes.conll_id t.PreTypes.conll_super t.PreTypes.conll_label |
66 | - (String.concat ";" t.PreTypes.attrs) *) | |
67 | - sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s}" | |
68 | - t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) | |
66 | + (String.concat ";" t.PreTypes.attrs) | |
67 | + (* sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s}" | |
68 | + t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) *) | |
69 | 69 | |
70 | 70 | let string_of_paths1 paths = |
71 | 71 | String.concat "\n " (Xlist.map paths string_of_token_record1) |
... | ... | @@ -619,3 +619,106 @@ let print_other_result file cg_bin_path query result = |
619 | 619 | fprintf file "\n<H3>%s</H3>\n" query; |
620 | 620 | fprintf file "\n<P>%s\n" (generate_status_message result result.status); |
621 | 621 | fprintf file "%s\n" page_trailer |
622 | + | |
(* Returns the constructor name of a processing mode as a string. *)
let string_of_mode mode =
  match mode with
  | Raw -> "Raw"
  | Struct -> "Struct"
  | CONLL -> "CONLL"
  | ENIAM -> "ENIAM"
  | Mate -> "Mate"
629 | + | |
(* Textual (debug) rendering of a sentence.
   Raw, Struct and Alt sentences are rendered; ORSentence and every other
   constructor raise Failure "string_of_sentence: ni". *)
let rec string_of_sentence sentence =
  match sentence with
  | RawSentence s -> sprintf "RawSentence(%s)" s
  | StructSentence(paths,last) ->
      let rendered_paths = string_of_paths1 paths in
      sprintf "StructSentence(%s,%d)" rendered_paths last
  | AltSentence l ->
      (* Each alternative is printed as "<mode>, <sentence>". *)
      let render_alt (mode,alt) = string_of_mode mode ^ ", " ^ string_of_sentence alt in
      sprintf "AltSentence([\n %s])" (String.concat ";\n " (Xlist.map l render_alt))
  | ORSentence _ -> failwith "string_of_sentence: ni"
  | _ -> failwith "string_of_sentence: ni"
637 | + | |
(* Textual (debug) rendering of a paragraph: raw text, a list of sentence
   records (pid, pbeg, plen, psentence), or mode-tagged alternatives. *)
let rec string_of_paragraph paragraph =
  match paragraph with
  | RawParagraph s -> sprintf "RawParagraph(%s)" s
  | StructParagraph sentences ->
      let rendered = Xlist.map sentences (fun p ->
        sprintf "{pid=%s; pbeg=%d; plen=%d; psentence=%s}"
          p.pid p.pbeg p.plen (string_of_sentence p.psentence)) in
      sprintf "StructParagraph([\n %s])" (String.concat ";\n " rendered)
  | AltParagraph l ->
      let render_alt (mode,alt) = string_of_mode mode ^ ", " ^ string_of_paragraph alt in
      sprintf "AltParagraph(\n %s)" (String.concat "\n " (Xlist.map l render_alt))
645 | + | |
(* Textual (debug) rendering of a whole text: raw text, a list of paragraphs
   together with next_id, or mode-tagged alternatives. *)
let rec string_of_text text =
  match text with
  | RawText s -> sprintf "RawText(%s)" s
  | StructText(paragraphs,next_id) ->
      let rendered = Xlist.map paragraphs string_of_paragraph in
      sprintf "StructText([\n %s],%d)" (String.concat ";\n " rendered) next_id
  | AltText l ->
      let render_alt (mode,alt) = string_of_mode mode ^ ", " ^ string_of_text alt in
      sprintf "AltText(\n %s)" (String.concat "\n " (Xlist.map l render_alt))
652 | + | |
(* Opening part of a generated HTML page, up to and including the <center>
   tag that wraps the page content. The charset label is spelled "utf-8",
   the registered name of the encoding, rather than "utf8". *)
let html_header =
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
<html>
 <head>
 <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf-8\">
 <TITLE>ENIAM: Kategorialny Parser Składniowo-Semantyczny</TITLE>
 <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
 </head>

 <body>
 <center>"

(* Closing part of a generated HTML page; closes the tags opened by
   html_header. *)
let html_trailer =
"</center>
 </body>
</html>"
669 | + | |
(* [escape_html s] returns [s] with the HTML metacharacters '<', '>' and '&'
   replaced by the entities "&lt;", "&gt;" and "&amp;"; all other bytes
   (including multi-byte UTF-8 sequences) are copied unchanged.
   A Buffer is used so the cost is linear in the length of [s] instead of
   re-copying the accumulated string on every character. *)
let escape_html s =
  let buf = Buffer.create (String.length s) in
  String.iter (function
    | '<' -> Buffer.add_string buf "&lt;"
    | '>' -> Buffer.add_string buf "&gt;"
    | '&' -> Buffer.add_string buf "&amp;"
    | c -> Buffer.add_char buf c) s;
  Buffer.contents buf
677 | + | |
(* Renders the token records of a structured sentence as an HTML table:
   one header row, one row per token record in [paths], and a final row
   that carries only [last] in the "beg" column.
   Every string field is passed through escape_html so that markup
   characters in the data cannot break the table; the header row no longer
   contains the stray closing </td> after the conll_label cell. *)
let html_of_struct_sentence paths last =
  "<table><tr><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td><td><b>id</b></td><td><b>lnode</b></td><td><b>rnode</b></td><td><b>conll_id</b></td><td><b>conll_super</b></td><td><b>conll_label</b></td><td><b>attrs</b></td></tr>" ^
  String.concat "\n" (Xlist.map paths (fun t ->
    sprintf "<tr><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>"
      (escape_html t.PreTypes.orth) t.PreTypes.beg t.PreTypes.len t.PreTypes.next
      (escape_html (string_of_token t.PreTypes.token))
      t.PreTypes.id t.PreTypes.lnode t.PreTypes.rnode
      (escape_html t.PreTypes.conll_id) (escape_html t.PreTypes.conll_super)
      (escape_html t.PreTypes.conll_label)
      (escape_html (String.concat "; " t.PreTypes.attrs)))) ^
  sprintf "<tr><td></td><td>%d</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>" last ^
  "</table>"
687 | + | |
(* HTML rendering of a sentence.
   - RawSentence: the text, HTML-escaped so that '<', '>' and '&' in the
     sentence are shown literally instead of being parsed as markup.
   - StructSentence: the token table built by html_of_struct_sentence.
   - AltSentence: a two-column table of (mode, rendered alternative).
   ORSentence and any other constructor raise
   Failure "html_of_sentence: ni". *)
let rec html_of_sentence = function
    RawSentence s -> escape_html s
  | StructSentence(paths,last) -> html_of_struct_sentence paths last
  | ORSentence _ -> failwith "html_of_sentence: ni"
  | AltSentence l ->
      "<table border=1>" ^
      String.concat "\n" (Xlist.map l (fun (mode,sentence) ->
        sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence sentence))) ^
      "</table>"
  | _ -> failwith "html_of_sentence: ni"
698 | + | |
(* HTML rendering of a paragraph.
   - RawParagraph: the text, HTML-escaped.
   - StructParagraph: for each sentence record, a "pid=... pbeg=... plen=..."
     line followed by the rendered sentence; records are separated by <BR>.
     The pid string is escaped as well.
   - AltParagraph: a two-column table of (mode, rendered alternative). *)
let rec html_of_paragraph = function
    RawParagraph s -> escape_html s
  | StructParagraph sentences ->
      String.concat "<BR>\n" (Xlist.map sentences (fun p ->
        sprintf "pid=%s pbeg=%d plen=%d<BR>%s" (escape_html p.pid) p.pbeg p.plen (html_of_sentence p.psentence)))
  | AltParagraph l ->
      "<table border=2>" ^
      String.concat "\n" (Xlist.map l (fun (mode,paragraph) ->
        sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_paragraph paragraph))) ^
      "</table>"
709 | + | |
(* HTML rendering of a whole text.
   - RawText: the text, HTML-escaped so that markup characters in the input
     are displayed rather than interpreted.
   - StructText: a "next_id=..." line followed by the rendered paragraphs,
     separated by <BR>.
   - AltText: a two-column table of (mode, rendered alternative). *)
let rec html_of_text = function
    RawText s -> escape_html s
  | StructText(paragraphs,next_id) ->
      sprintf "next_id=%d<BR>\n%s" next_id (String.concat "<BR>\n" (Xlist.map paragraphs html_of_paragraph))
  | AltText l ->
      "<table border=3>" ^
      String.concat "\n" (Xlist.map l (fun (mode,text) ->
        sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text text))) ^
      "</table>"
719 | + | |
(* Writes [text] as an HTML page to the file [path ^ name ^ ".html"]:
   html_header, then the rendering produced by html_of_text, then
   html_trailer, each followed by a newline. *)
let print_html_text path name text =
  let filename = path ^ name ^ ".html" in
  File.file_out filename (fun file ->
    let write_line s = fprintf file "%s\n" s in
    write_line html_header;
    (* The body is rendered only after the header has been written. *)
    write_line (html_of_text text);
    write_line html_trailer)
... | ... |
pre/preProcessing.ml
... | ... | @@ -556,9 +556,9 @@ let parse query next_id = |
556 | 556 | let paths = translate_digs paths in |
557 | 557 | let paths = PreWordnet.assign_senses paths in |
558 | 558 | (* print_endline "a14"; *) |
559 | - let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) | |
560 | -(* print_endline "a15"; *) | |
561 | 559 | let paths = assign_valence paths in |
560 | +(* print_endline "a15"; *) | |
561 | + let paths = combine_interps paths in | |
562 | 562 | (* print_endline "a16"; *) |
563 | 563 | let paths = disambiguate_senses paths in |
564 | 564 | let paths = assign_simplified_valence paths in |
... | ... |