generowanie wyjścia dla subsyntax

Wojciech Jaworski
1 parent b06dc862
Showing 8 changed files with 239 additions and 13 deletions
LCGparser/test.ml
semsources/src stycz/dzieło stycz1.pdf
subsyntax/ENIAMsubsyntaxGraphOf.ml
subsyntax/ENIAMsubsyntaxHTMLof.ml
subsyntax/ENIAMsubsyntaxXMLof.ml
subsyntax/makefile
subsyntax/test.ml
tokenizer/ENIAMtokens.ml
@@ -45,8 +45,8 @@ let examples = [
     0, 1, "Jakiego","jaki","adj",Raised(WithVar("case",With[Atom "gen"; Atom "acc"],"A",ImpSet(ImpSet(Tensor[Atom "cp"; Atom "int"; Atom "jaki"],
                                                                                              [Forward,Imp(Tensor[Atom "ip"],Forward,Tensor[Atom "np"; AVar "case"])]),
                                                                                              [Forward,Imp(Tensor[Atom "np"; AVar "case"],Backward,Tensor[Atom "adjp"; AVar "case"])])));
-    (* 1, 2, "kota","kot","subst",  Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Maybe(Tensor[Atom "adjp"; AVar "case"])]))); *)
-    1, 2, "kota","kot","subst",  Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]])));
+    1, 2, "kota","kot","subst",  Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Maybe(Tensor[Atom "adjp"; AVar "case"])])));
+    (* 1, 2, "kota","kot","subst",  Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]]))); *)
     2, 3, "Ala","Ala","subst",   Basic(Tensor[Atom "np"; Atom "nom"]);
     3, 4, "ma","mieć","fin",     Basic(ImpSet(Tensor[Atom "ip"],[Both,Tensor[Atom "np"; Atom "nom"];Both,Tensor[Atom "np"; Atom "acc"]]));
     (* 3, 4, "kota","kot","subst",  Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]]))); *)
+(*
+ *  ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAMsubsyntaxTypes
+open Printf
+
+(* Escapes [s] for use inside a double-quoted DOT label: quotes and
+   backslashes are protected and raw newlines become the DOT "\n" escape. *)
+let escape_dot_label s =
+  let buf = Buffer.create (String.length s + 8) in
+  String.iter (function
+      '"' -> Buffer.add_string buf "\\\""
+    | '\\' -> Buffer.add_string buf "\\\\"
+    | '\n' -> Buffer.add_string buf "\\n"
+    | c -> Buffer.add_char buf c) s;
+  Buffer.contents buf
+
+(* Writes the token graph to [path ^ name ^ ".gv"] and renders it with
+   Graphviz into [path ^ name ^ ".png"].
+   Every token whose ENIAMtokens.get_lemma is non-empty becomes an edge
+   from its [beg] to its [next] position, labelled with orth and lemma.
+   [path] is prepended verbatim, so it must be empty or end with '/'.
+   The working directory is left untouched: dot is called with the full
+   (shell-quoted) file names instead of changing directory back and forth. *)
+let print_tokens path name tokens =
+  let gv_file = path ^ name ^ ".gv" in
+  let png_file = path ^ name ^ ".png" in
+  File.file_out gv_file (fun file ->
+    fprintf file "digraph G {\n";
+    Xlist.iter tokens (fun t ->
+      let lemma = ENIAMtokens.get_lemma t.ENIAMtokenizerTypes.token in
+      if lemma <> "" then
+        fprintf file "  %d -> %d [label=\"%s\\n%s\"]\n"
+          t.ENIAMtokenizerTypes.beg t.ENIAMtokenizerTypes.next
+          (escape_dot_label t.ENIAMtokenizerTypes.orth) (escape_dot_label lemma));
+    fprintf file "}\n");
+  let command = "dot -Tpng " ^ Filename.quote gv_file ^ " -o " ^ Filename.quote png_file in
+  (* Rendering stays best-effort, but a failure is no longer silent. *)
+  if Sys.command command <> 0 then
+    prerr_endline ("ENIAMsubsyntaxGraphOf.print_tokens: command failed: " ^ command)
+(*
+ *  ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAMsubsyntaxTypes
+open Printf
+
+(* Opening part of every generated HTML page: doctype, head with the
+   content-type, title and language declarations, and the start of a
+   centred body.  The charset is spelled "utf-8", the registered name. *)
+let html_header =
+"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
+<html>
+  <head>
+	<META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf-8\">
+	<TITLE>ENIAM: Kategorialny Parser Składniowo-Semantyczny</TITLE>
+	<META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
+  </head>
+
+  <body>
+ <center>"
+
+let html_trailer = (* Closing counterpart of html_header: ends the centred block, the body and the document. *)
+"</center>
+  </body>
+</html>"
+
+(* Replaces the HTML metacharacters '<', '>' and '&' in [s] with their
+   entities; every other byte (multi-byte UTF-8 included) is copied as is.
+   The result is built in a Buffer, so the cost is linear in the length
+   of [s] instead of quadratic as with repeated string concatenation. *)
+let escape_html s =
+  let buf = Buffer.create (String.length s + 16) in
+  String.iter (function
+      '<' -> Buffer.add_string buf "&lt;"
+    | '>' -> Buffer.add_string buf "&gt;"
+    | '&' -> Buffer.add_string buf "&amp;"
+    | c -> Buffer.add_char buf c) s;
+  Buffer.contents buf
+
+(* Renders a structured sentence as an HTML table with one row per
+   (id, lnode, rnode) triple of [paths], in sorted order, followed by a
+   final row that shows [last] in the lnode column.
+   [tokens] is the token array indexed by id.  Both orth and the printed
+   token are HTML-escaped, so tokens such as "<" or "&" cannot break the
+   markup. *)
+let html_of_struct_sentence tokens paths last =
+  let row (id,lnode,rnode) =
+    let t = ExtArray.get tokens id in
+    sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td></tr>"
+      (escape_html t.ENIAMtokenizerTypes.orth)
+      (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode in
+  "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>lnode</b></td><td><b>rnode</b></td></tr>" ^
+  String.concat "\n" (Xlist.map (List.sort compare paths) row) ^
+  sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^
+  "</table>"
+
+(* Renders a dependency sentence as an HTML table.
+   [paths] is an array indexed by conll_id whose cells are
+   (id, super, label) triples; [tokens] is the token array indexed by id.
+   Rows appear in increasing conll_id order.  orth, the printed token and
+   the label are HTML-escaped before being inserted into the markup. *)
+let html_of_dep_sentence tokens paths =
+  let row conll_id (id,super,label) =
+    let t = ExtArray.get tokens id in
+    sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>"
+      (escape_html t.ENIAMtokenizerTypes.orth)
+      (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token))
+      id conll_id super (escape_html label) in
+  "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^
+  String.concat "\n" (Array.to_list (Array.mapi row paths)) ^
+  "</table>"
+
+(* Renders the whole token array as an HTML table, one row per token id
+   in increasing order, with columns id, orth, beg, len, next, token and
+   attrs (attributes joined with "; ").
+   The header row no longer carries a stray closing </td>, and orth and
+   attrs are HTML-escaped in the same way as the printed token. *)
+let html_of_tokens tokens =
+  let row id =
+    let t = ExtArray.get tokens id in
+    sprintf "<tr><td>%d</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%s</td></tr>"
+      id (escape_html t.ENIAMtokenizerTypes.orth)
+      t.ENIAMtokenizerTypes.beg t.ENIAMtokenizerTypes.len t.ENIAMtokenizerTypes.next
+      (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token))
+      (escape_html (String.concat "; " t.ENIAMtokenizerTypes.attrs)) in
+  "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td><td><b>attrs</b></td></tr>" ^
+  String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> row id :: l))) ^
+  "</table>"
+
+(* Renders a sentence as HTML.
+   - RawSentence: the text itself, HTML-escaped.
+   - StructSentence / DepSentence: the corresponding table.
+   - QuotedSentences: each nested sentence preceded by a line with its
+     pid, pbeg, plen and pnext, separated by <BR>.
+   - AltSentence: a bordered table with one row per (mode, sentence).
+   [path] is not used here; it is only passed on to recursive calls. *)
+let rec html_of_sentence path tokens = function
+    RawSentence s -> escape_html s
+  | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last
+  | DepSentence paths -> html_of_dep_sentence tokens paths
+  | QuotedSentences sentences ->
+      String.concat "<BR>\n" (Xlist.map sentences (fun p ->
+        sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" (escape_html p.pid) p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence)))
+  | AltSentence l ->
+     "<table border=1>" ^
+     String.concat "\n" (Xlist.map l (fun (mode,sentence) ->
+       sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_sentence path tokens sentence))) ^
+     "</table>"
+
+(* Renders a paragraph as HTML.
+   - RawParagraph: the text itself, HTML-escaped.
+   - StructParagraph: each sentence preceded by a line with its pid,
+     pbeg, plen and pnext, separated by <BR>.
+   - AltParagraph: a bordered table with one row per (mode, paragraph).
+   [path] is only forwarded to html_of_sentence and to recursive calls. *)
+let rec html_of_paragraph path tokens = function
+    RawParagraph s -> escape_html s
+  | StructParagraph sentences ->
+      String.concat "<BR>\n" (Xlist.map sentences (fun p ->
+        sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" (escape_html p.pid) p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence)))
+  | AltParagraph l ->
+     "<table border=2>" ^
+     String.concat "\n" (Xlist.map l (fun (mode,paragraph) ->
+       sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_paragraph path tokens paragraph))) ^
+     "</table>"
+
+(* Renders a whole text as HTML.
+   - RawText: the text itself, HTML-escaped.
+   - StructText: its paragraphs, separated by <BR>.
+   - AltText: a bordered table with one row per (mode, text).
+   [path] is only forwarded to html_of_paragraph and to recursive calls. *)
+let rec html_of_text path tokens = function
+    RawText s -> escape_html s
+  | StructText paragraphs ->
+      String.concat "<BR>\n" (Xlist.map paragraphs (html_of_paragraph path tokens))
+  | AltText l ->
+     "<table border=3>" ^
+     String.concat "\n" (Xlist.map l (fun (mode,text) ->
+       sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_text path tokens text))) ^
+     "</table>"
+
+(* Writes a complete HTML page to [path ^ name ^ ".html"]: the page
+   header, the rendering of [text], the token table and the page trailer. *)
+let print_html_text path name tokens text =
+  let filename = path ^ name ^ ".html" in
+  File.file_out filename (fun file ->
+    let emit suffix s = output_string file (s ^ suffix) in
+    emit "\n" html_header;
+    emit "<BR>\n" (html_of_text path tokens text);
+    emit "<BR>\n" (html_of_tokens tokens);
+(*    fprintf file "%s<BR>\n" (html_of_tokens_simple_valence tokens);
+    fprintf file "%s<BR>\n" (html_of_tokens_valence tokens);*)
+    emit "\n" html_trailer)
+(*
+ *  ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAMsubsyntaxTypes
+open Printf
+
+(* Name of an annotation mode, as stored in the "mode" XML attribute. *)
+let mode m =
+  match m with
+  | Raw -> "Raw"
+  | Struct -> "Struct"
+  | CONLL -> "CONLL"
+  | ENIAM -> "ENIAM"
+  | Mate -> "Mate"
+  | Swigra -> "Swigra"
+  | POLFIE -> "POLFIE"
+(* Builds the <tokens> element holding one child per token of the
+   array [t], in increasing id order.
+   The children come from ENIAMtokens.xml_of_token_record, which is
+   assumed to return a single Xml node for (id, token record) -- its
+   definition is not visible here, TODO confirm.
+   The previous version applied a string literal as a function and never
+   consed onto the accumulator, so it could not type-check. *)
+let tokens t =
+  Xml.Element("tokens",[],List.rev (Int.fold 0 (ExtArray.size t - 1) [] (fun l id ->
+    ENIAMtokens.xml_of_token_record id (ExtArray.get t id) :: l)))
+
+(* Converts the dependency array [paths] into a list of <edge> elements
+   in increasing conll_id order.  Each cell is an (id, super, label)
+   triple; the "super" attribute is omitted when super is -1 and the
+   "label" attribute is omitted when the label is empty.
+   The previous fold returned a bare element instead of consing it onto
+   the accumulator list. *)
+let xml_of_dep_sentence paths =
+  Array.to_list (Array.mapi (fun conll_id (id,super,label) ->
+    Xml.Element("edge",["conll_id",string_of_int conll_id;"id",string_of_int id] @
+      (if super = (-1) then [] else ["super",string_of_int super]) @
+      (if label = "" then [] else ["label",label]),[])) paths)
+
+(* Converts one (id, lnode, rnode) triple into a childless <edge>
+   element carrying the three numbers as attributes. *)
+let xml_of_edge (id,lnode,rnode) =
+  let attrs = [
+    "id",string_of_int id;
+    "lnode",string_of_int lnode;
+    "rnode",string_of_int rnode] in
+  Xml.Element("edge",attrs,[])
+
+(* Attribute list for a mode name: empty when [m] is the empty string,
+   otherwise the single pair ("mode", m). *)
+let set_mode m =
+  match m with
+    "" -> []
+  | _ -> ["mode",m]
+
+(* Converts a sentence into XML.  [m] is the mode name to record on the
+   element ("" for none, see set_mode).
+   - RawSentence: the text as PCData.
+   - StructSentence: a "last" attribute plus one <edge> per path.
+   - DepSentence: a "size" attribute plus one <edge> per array cell.
+   - QuotedSentences: one <Sentence> per nested sentence with its id,
+     beg, len and next attributes.
+   - AltSentence: one child per (mode, sentence) alternative.
+   Integer fields are converted with string_of_int, since attribute
+   values are strings, and the constructor is spelled Xml.Element
+   throughout. *)
+let rec sentence m = function
+    RawSentence s -> Xml.Element("RawSentence",set_mode m,[Xml.PCData s])
+  | StructSentence(paths,last) ->
+      Xml.Element("StructSentence",(set_mode m) @ ["last",string_of_int last],Xlist.map paths xml_of_edge)
+  | DepSentence paths -> Xml.Element("DepSentence",
+          (set_mode m) @ ["size",string_of_int (Array.length paths)],xml_of_dep_sentence paths)
+  | QuotedSentences sentences ->
+      Xml.Element("QuotedSentences",set_mode m,Xlist.map sentences (fun p ->
+        Xml.Element("Sentence",["id",p.pid;"beg",string_of_int p.pbeg;"len",string_of_int p.plen;"next",string_of_int p.pnext],[sentence "" p.psentence])))
+  | AltSentence l -> Xml.Element("AltSentence",set_mode m,Xlist.map l (fun (m,t) -> sentence (mode m) t))
+
+(* Converts a paragraph into XML.  [m] is the mode name to record on the
+   element ("" for none, see set_mode).
+   - RawParagraph: the text as PCData.
+   - StructParagraph: one <Sentence> per sentence with its id, beg, len
+     and next attributes.
+   - AltParagraph: one child per (mode, paragraph) alternative.
+   Integer fields are converted with string_of_int, and the constructor
+   is spelled Xml.Element throughout. *)
+let rec paragraph m = function
+    RawParagraph s -> Xml.Element("RawParagraph",set_mode m,[Xml.PCData s])
+  | StructParagraph sentences ->
+      Xml.Element("StructParagraph",set_mode m,Xlist.map sentences (fun p ->
+        Xml.Element("Sentence",["id",p.pid;"beg",string_of_int p.pbeg;"len",string_of_int p.plen;"next",string_of_int p.pnext],[sentence "" p.psentence])))
+  | AltParagraph l -> Xml.Element("AltParagraph",set_mode m,Xlist.map l (fun (m,t) -> paragraph (mode m) t))
+
+(* Converts a whole text into XML.  [m] is the mode name to record on
+   the element ("" for none, see set_mode).
+   - RawText: the text as PCData.
+   - StructText: one child per paragraph.
+   - AltText: one child per (mode, text) alternative.
+   The recursive call now receives the alternative [t]; before, it was
+   only partially applied and produced a function instead of a node. *)
+let rec text m = function
+    RawText s -> Xml.Element("RawText",set_mode m,[Xml.PCData s])
+  | StructText paragraphs -> Xml.Element("StructText",set_mode m,Xlist.map paragraphs (paragraph ""))
+  | AltText l -> Xml.Element("AltText",set_mode m,Xlist.map l (fun (m,t) -> text (mode m) t))
@@ -6,15 +6,15 @@ OCAMLFLAGS=$(INCLUDES) -g
 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
 INSTALLDIR=`ocamlc -where`/eniam
  
-SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
+SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
  
 all: eniam-subsyntax.cma eniam-subsyntax.cmxa
  
 install: all
 	mkdir -p $(INSTALLDIR)
 	cp eniam-subsyntax.cmxa eniam-subsyntax.a eniam-subsyntax.cma $(INSTALLDIR)
-	cp ENIAMsubsyntaxTypes.cmi ENIAMsubsyntaxStringOf.cmi ENIAMpaths.cmi ENIAM_MWE.cmi ENIAMsentences.cmi ENIAMsubsyntax.cmi $(INSTALLDIR)
-	cp ENIAMsubsyntaxTypes.cmx ENIAMsubsyntaxStringOf.cmx ENIAMpaths.cmx ENIAM_MWE.cmx ENIAMsentences.cmx ENIAMsubsyntax.cmx $(INSTALLDIR)
+	cp ENIAMsubsyntaxTypes.cmi ENIAMsubsyntaxStringOf.cmi ENIAMsubsyntaxHTMLof.cmi ENIAMsubsyntaxGraphOf.cmi ENIAMpaths.cmi ENIAM_MWE.cmi ENIAMsentences.cmi ENIAMsubsyntax.cmi $(INSTALLDIR)
+	cp ENIAMsubsyntaxTypes.cmx ENIAMsubsyntaxStringOf.cmx ENIAMsubsyntaxHTMLof.cmx ENIAMsubsyntaxGraphOf.cmx ENIAMpaths.cmx ENIAM_MWE.cmx ENIAMsentences.cmx ENIAMsubsyntax.cmx $(INSTALLDIR)
 	mkdir -p /usr/share/eniam/subsyntax
 	cp resources/*  /usr/share/eniam/subsyntax
  
@@ -19,10 +19,10 @@
  
  
 let test_strings = [
-  "Szpak frunie.";
+  (* "Szpak frunie."; *)
   "Kot np. miauczy.";
-  "Ala ma kota.";
-  "Ale mają kota:"
+  (* "Ala ma kota.";
+  "Ale mają kota:" *)
   (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
   (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *)
   (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *)
@@ -31,17 +31,19 @@ let test_strings = [
 ]
  
 let test_strings2 = [
-  "Szpak frunie. Kot miauczy.";
-  "Szpak powiedział: „Frunę. Kiszę.”";
+(*  "Szpak frunie. Kot miauczy.";
+  "Szpak powiedział: „Frunę. Śpiewam.”";*)
+  (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
   ]
  
 let _ =
   print_endline "Testy wbudowane";
   Xlist.iter test_strings (fun s ->
     print_endline ("\nTEST: " ^ s);
-    let paths = ENIAMsubsyntax.parse s in
+    let tokens = ENIAMsubsyntax.parse s in
     (* print_endline (ENIAMtokenizer.xml_of tokens); *)
-    print_endline (ENIAMpaths.to_string (paths,0)));
+    print_endline (ENIAMpaths.to_string (tokens,0));
+    ENIAMsubsyntaxGraphOf.print_tokens "results/" "test" tokens);
   print_endline "Testy wbudowane 2";
   Xlist.iter test_strings2 (fun s ->
     print_endline ("\nTEST: " ^ s);
@@ -49,7 +51,8 @@ let _ =
     (* print_endline (ENIAMtokenizer.xml_of tokens); *)
     print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
     print_endline "";
-    print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));
+    print_endline (ENIAMsubsyntaxStringOf.text "" tokens text);
+    ENIAMsubsyntaxHTMLof.print_html_text "results/" "test" tokens text);
 (*  print_endline "Testy użytkownika.";
   print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
   let s = ref (read_line ()) in
@@ -91,6 +91,11 @@ let get_orth = function
   | Interp orth  -> orth
   | _ -> ""(*failwith "get_orth"*)
  
+(* Text used to describe a token: the orth of an Interp token, or the
+   lemma and category of a Lemma / Proper token separated by a newline.
+   Every other kind of token yields the empty string.
+   The function is not recursive, so it is defined without [rec]. *)
+let get_lemma = function
+    ENIAMtokenizerTypes.Interp orth -> orth
+  | ENIAMtokenizerTypes.Lemma(lemma,cat,_) -> lemma ^ "\n" ^ cat
+  | ENIAMtokenizerTypes.Proper(lemma,cat,_,_) -> lemma ^ "\n" ^ cat
+  | _ -> ""
  
 let months = StringSet.of_list ["1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09"; "10"; "11"; "12"]
 let hours = StringSet.of_list ["0"; "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "00"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09";