Commit 8bb00e36b6a350b91a1d6e499537ec6293eef3ea

Authored by Wojciech Jaworski
1 parent b06dc862

generowanie wyjścia dla subsyntax

LCGparser/test.ml
... ... @@ -45,8 +45,8 @@ let examples = [
45 45 0, 1, "Jakiego","jaki","adj",Raised(WithVar("case",With[Atom "gen"; Atom "acc"],"A",ImpSet(ImpSet(Tensor[Atom "cp"; Atom "int"; Atom "jaki"],
46 46 [Forward,Imp(Tensor[Atom "ip"],Forward,Tensor[Atom "np"; AVar "case"])]),
47 47 [Forward,Imp(Tensor[Atom "np"; AVar "case"],Backward,Tensor[Atom "adjp"; AVar "case"])])));
48   - (* 1, 2, "kota","kot","subst", Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Maybe(Tensor[Atom "adjp"; AVar "case"])]))); *)
49   - 1, 2, "kota","kot","subst", Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]])));
  48 + 1, 2, "kota","kot","subst", Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Maybe(Tensor[Atom "adjp"; AVar "case"])])));
  49 + (* 1, 2, "kota","kot","subst", Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]]))); *)
50 50 2, 3, "Ala","Ala","subst", Basic(Tensor[Atom "np"; Atom "nom"]);
51 51 3, 4, "ma","mieć","fin", Basic(ImpSet(Tensor[Atom "ip"],[Both,Tensor[Atom "np"; Atom "nom"];Both,Tensor[Atom "np"; Atom "acc"]]));
52 52 (* 3, 4, "kota","kot","subst", Basic(WithVar("case",With[Atom "gen"; Atom "acc"],"B",ImpSet(Tensor[Atom "np"; AVar "case"],[Backward,Tensor[Atom "adjp"; AVar "case"]]))); *)
... ...
semsources/src stycz/dzieło stycz1.pdf deleted
No preview for this file type
subsyntax/ENIAMsubsyntaxGraphOf.ml 0 → 100644
  1 +(*
  2 + * ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMsubsyntaxTypes
  21 +open Printf
  22 +
  23 +let print_tokens path name tokens =
  24 + File.file_out (path ^ name ^ ".gv") (fun file ->
  25 + fprintf file "digraph G {\n";
  26 + Xlist.iter tokens (fun t ->
  27 + let lemma = ENIAMtokens.get_lemma t.ENIAMtokenizerTypes.token in
  28 + if lemma <> "" then fprintf file " %d -> %d [label=\"%s\\n%s\"]\n" t.ENIAMtokenizerTypes.beg t.ENIAMtokenizerTypes.next t.ENIAMtokenizerTypes.orth lemma);
  29 + fprintf file "}\n");
  30 + Sys.chdir path;
  31 + ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png"));
  32 + String.iter (function '/' -> Sys.chdir ".." | _ -> ()) path
... ...
subsyntax/ENIAMsubsyntaxHTMLof.ml 0 → 100644
  1 +(*
  2 + * ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMsubsyntaxTypes
  21 +open Printf
  22 +
  23 +let html_header =
  24 +"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
  25 +<html>
  26 + <head>
  27 + <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\">
  28 + <TITLE>ENIAM: Kategorialny Parser Składniowo-Semantyczny</TITLE>
  29 + <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
  30 + </head>
  31 +
  32 + <body>
  33 + <center>"
  34 +
  35 +let html_trailer =
  36 +"</center>
  37 + </body>
  38 +</html>"
  39 +
  40 +let escape_html s =
  41 + Int.fold 0 (String.length s - 1) "" (fun t i ->
  42 + match String.sub s i 1 with
  43 + "<" -> t ^ "&lt;"
  44 + | ">" -> t ^ "&gt;"
  45 + | "&" -> t ^ "&amp;"
  46 + | c -> t ^ c)
  47 +
  48 +let html_of_struct_sentence tokens paths last =
  49 + "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>lnode</b></td><td><b>rnode</b></td></tr>" ^
  50 + String.concat "\n" (Xlist.map (List.sort compare paths) (fun (id,lnode,rnode) ->
  51 + let t = ExtArray.get tokens id in
  52 + sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td></tr>"
  53 + t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^
  54 + sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^
  55 + "</table>"
  56 +
  57 +let html_of_dep_sentence tokens paths =
  58 + "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^
  59 + String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id ->
  60 + let id,super,label = paths.(conll_id) in
  61 + let t = ExtArray.get tokens id in
  62 + (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>"
  63 + t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^
  64 + "</table>"
  65 +
  66 +let html_of_tokens tokens =
  67 + "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^
  68 + String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id ->
  69 + let t = ExtArray.get tokens id in
  70 + (sprintf "<tr><td>%d</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td><td>%s</td></tr>"
  71 + id t.ENIAMtokenizerTypes.orth t.ENIAMtokenizerTypes.beg t.ENIAMtokenizerTypes.len t.ENIAMtokenizerTypes.next (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token))
  72 + (String.concat "; " t.ENIAMtokenizerTypes.attrs)) :: l))) ^
  73 + "</table>"
  74 +
  75 +let rec html_of_sentence path tokens = function
  76 + RawSentence s -> s
  77 + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last
  78 + | DepSentence paths -> html_of_dep_sentence tokens paths
  79 + | QuotedSentences sentences ->
  80 + String.concat "<BR>\n" (Xlist.map sentences (fun p ->
  81 + sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" p.pid p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence)))
  82 + | AltSentence l -> (*print_endline "AltSentence";*)
  83 + "<table border=1>" ^
  84 + String.concat "\n" (Xlist.map l (fun (mode,sentence) ->
  85 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_sentence path tokens sentence))) ^
  86 + "</table>"
  87 + (* | _ -> failwith "html_of_sentence: ni" *)
  88 +
  89 +let rec html_of_paragraph path tokens = function
  90 + RawParagraph s -> (*print_endline "RawParagraph";*) s
  91 + | StructParagraph sentences -> (*print_endline "StructParagraph";*)
  92 + String.concat "<BR>\n" (Xlist.map sentences (fun p ->
  93 + sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" p.pid p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence)))
  94 + | AltParagraph l -> (*print_endline "AltParagraph";*)
  95 + "<table border=2>" ^
  96 + String.concat "\n" (Xlist.map l (fun (mode,paragraph) ->
  97 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_paragraph path tokens paragraph))) ^
  98 + "</table>"
  99 +
  100 +let rec html_of_text path tokens = function
  101 + RawText s -> s
  102 + | StructText paragraphs ->
  103 + String.concat "<BR>\n" (Xlist.map paragraphs (html_of_paragraph path tokens))
  104 + | AltText l ->
  105 + "<table border=3>" ^
  106 + String.concat "\n" (Xlist.map l (fun (mode,text) ->
  107 + sprintf "<tr><td>%s</td><td>%s</td></tr>" (ENIAMsubsyntaxStringOf.mode mode) (html_of_text path tokens text))) ^
  108 + "</table>"
  109 +
  110 +let print_html_text path name tokens text =
  111 + File.file_out (path ^ name ^ ".html") (fun file ->
  112 + fprintf file "%s\n" html_header;
  113 + fprintf file "%s<BR>\n" (html_of_text path tokens text);
  114 + fprintf file "%s<BR>\n" (html_of_tokens tokens);
  115 +(* fprintf file "%s<BR>\n" (html_of_tokens_simple_valence tokens);
  116 + fprintf file "%s<BR>\n" (html_of_tokens_valence tokens);*)
  117 + fprintf file "%s\n" html_trailer)
... ...
subsyntax/ENIAMsubsyntaxXMLof.ml 0 → 100644
  1 +(*
  2 + * ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMsubsyntaxTypes
  21 +open Printf
  22 +
  23 +let mode = function
  24 + Raw -> "Raw"
  25 + | Struct -> "Struct"
  26 + | CONLL -> "CONLL"
  27 + | ENIAM -> "ENIAM"
  28 + | Mate -> "Mate"
  29 + | Swigra -> "Swigra"
  30 + | POLFIE -> "POLFIE"
  31 +
  32 +let tokens t =
  33 + Xml.Element("tokens",[], "\n" (List.rev (Int.fold 0 (ExtArray.size t - 1) [] (fun l id ->
  34 + ENIAMtokens.xml_of_token_record id (ExtArray.get t id)))))
  35 +
  36 +let xml_of_dep_sentence paths =
  37 + Xlist.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id ->
  38 + let id,super,label = paths.(conll_id) in
  39 + Xml.Element("edge",["conll_id",string_of_int conll_id;"id",string_of_int id] @
  40 + (if super = (-1) then [] else ["super",string_of_int super]) @
  41 + (if label = "" then [] else ["label",label]),[])))
  42 +
  43 +let xml_of_edge (id,lnode,rnode) =
  44 + Xml.Element("edge",["id",string_of_int id;"lnode",string_of_int lnode;"rnode",string_of_int rnode],[])
  45 +
  46 +let set_mode m =
  47 + if m = "" then [] else ["mode",m]
  48 +
  49 +let rec sentence m = function
  50 + RawSentence s -> Xml.Element("RawSentence",set_mode m,[Xml.PCData s])
  51 + | StructSentence(paths,last) -> Xml.Element("StructSentence",(set_mode m) @ ["last",last],Xlist.map paths xml_of_edge)
  52 + | DepSentence paths -> Xml.Element("DepSentence",
  53 + (set_mode m) @ ["size",string_of_int (Array.length paths)],xml_of_dep_sentence paths)
  54 + | QuotedSentences sentences ->
  55 + Xml.Element("QuotedSentences",set_mode m,Xlist.map sentences (fun p ->
  56 + Xml.Element("Sentence",["id",p.pid;"beg",p.pbeg;"len",p.plen;"next",p.pnext],[sentence "" p.psentence])))
  57 + | AltSentence l -> XmlElement("AltSentence",set_mode m,Xlist.map l (fun (m,t) -> sentence (mode m) t))
  58 +
  59 +let rec paragraph m = function
  60 + RawParagraph s -> Xml.Element("RawParagraph",set_mode m,[Xml.PCData s])
  61 + | StructParagraph sentences ->
  62 + XmlElement("StructParagraph",set_mode m,Xlist.map sentences (fun p ->
  63 + Xml.Element("Sentence",["id",p.pid;"beg",p.pbeg;"len",p.plen;"next",p.pnext],[sentence "" p.psentence])))
  64 + | AltParagraph l -> XmlElement("AltParagraph",set_mode m,Xlist.map l (fun (m,t) -> paragraph (mode m) t))
  65 +
  66 +let rec text m = function
  67 + RawText s -> Xml.Element("RawText",set_mode m,[Xml.PCData s])
  68 + | StructText paragraphs -> XmlElement("StructText",set_mode m,Xlist.map paragraphs (paragraph ""))
  69 + | AltText l -> XmlElement("AltText",set_mode m,Xlist.map l (fun (m,t) -> text (mode m)))
... ...
subsyntax/makefile
... ... @@ -6,15 +6,15 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
  9 +SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
10 10  
11 11 all: eniam-subsyntax.cma eniam-subsyntax.cmxa
12 12  
13 13 install: all
14 14 mkdir -p $(INSTALLDIR)
15 15 cp eniam-subsyntax.cmxa eniam-subsyntax.a eniam-subsyntax.cma $(INSTALLDIR)
16   - cp ENIAMsubsyntaxTypes.cmi ENIAMsubsyntaxStringOf.cmi ENIAMpaths.cmi ENIAM_MWE.cmi ENIAMsentences.cmi ENIAMsubsyntax.cmi $(INSTALLDIR)
17   - cp ENIAMsubsyntaxTypes.cmx ENIAMsubsyntaxStringOf.cmx ENIAMpaths.cmx ENIAM_MWE.cmx ENIAMsentences.cmx ENIAMsubsyntax.cmx $(INSTALLDIR)
  16 + cp ENIAMsubsyntaxTypes.cmi ENIAMsubsyntaxStringOf.cmi ENIAMsubsyntaxHTMLof.cmi ENIAMsubsyntaxGraphOf.cmi ENIAMpaths.cmi ENIAM_MWE.cmi ENIAMsentences.cmi ENIAMsubsyntax.cmi $(INSTALLDIR)
  17 + cp ENIAMsubsyntaxTypes.cmx ENIAMsubsyntaxStringOf.cmx ENIAMsubsyntaxHTMLof.cmx ENIAMsubsyntaxGraphOf.cmx ENIAMpaths.cmx ENIAM_MWE.cmx ENIAMsentences.cmx ENIAMsubsyntax.cmx $(INSTALLDIR)
18 18 mkdir -p /usr/share/eniam/subsyntax
19 19 cp resources/* /usr/share/eniam/subsyntax
20 20  
... ...
subsyntax/test.ml
... ... @@ -19,10 +19,10 @@
19 19  
20 20  
21 21 let test_strings = [
22   - "Szpak frunie.";
  22 + (* "Szpak frunie."; *)
23 23 "Kot np. miauczy.";
24   - "Ala ma kota.";
25   - "Ale mają kota:"
  24 + (* "Ala ma kota.";
  25 + "Ale mają kota:" *)
26 26 (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
27 27 (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *)
28 28 (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *)
... ... @@ -31,17 +31,19 @@ let test_strings = [
31 31 ]
32 32  
33 33 let test_strings2 = [
34   - "Szpak frunie. Kot miauczy.";
35   - "Szpak powiedział: „Frunę. Kiszę.”";
  34 +(* "Szpak frunie. Kot miauczy.";
  35 + "Szpak powiedział: „Frunę. Śpiewam.”";*)
  36 + (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
36 37 ]
37 38  
38 39 let _ =
39 40 print_endline "Testy wbudowane";
40 41 Xlist.iter test_strings (fun s ->
41 42 print_endline ("\nTEST: " ^ s);
42   - let paths = ENIAMsubsyntax.parse s in
  43 + let tokens = ENIAMsubsyntax.parse s in
43 44 (* print_endline (ENIAMtokenizer.xml_of tokens); *)
44   - print_endline (ENIAMpaths.to_string (paths,0)));
  45 + print_endline (ENIAMpaths.to_string (tokens,0));
  46 + ENIAMsubsyntaxGraphOf.print_tokens "results/" "test" tokens);
45 47 print_endline "Testy wbudowane 2";
46 48 Xlist.iter test_strings2 (fun s ->
47 49 print_endline ("\nTEST: " ^ s);
... ... @@ -49,7 +51,8 @@ let _ =
49 51 (* print_endline (ENIAMtokenizer.xml_of tokens); *)
50 52 print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
51 53 print_endline "";
52   - print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));
  54 + print_endline (ENIAMsubsyntaxStringOf.text "" tokens text);
  55 + ENIAMsubsyntaxHTMLof.print_html_text "results/" "test" tokens text);
53 56 (* print_endline "Testy użytkownika.";
54 57 print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
55 58 let s = ref (read_line ()) in
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -91,6 +91,11 @@ let get_orth = function
91 91 | Interp orth -> orth
92 92 | _ -> ""(*failwith "get_orth"*)
93 93  
  94 +let rec get_lemma = function
  95 + ENIAMtokenizerTypes.Interp orth -> orth
  96 + | ENIAMtokenizerTypes.Lemma(lemma,cat,_) -> lemma ^ "\n" ^ cat
  97 + | ENIAMtokenizerTypes.Proper(lemma,cat,_,_) -> lemma ^ "\n" ^ cat
  98 + | _ -> ""
94 99  
95 100 let months = StringSet.of_list ["1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09"; "10"; "11"; "12"]
96 101 let hours = StringSet.of_list ["0"; "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "00"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09";
... ...