diff --git a/config b/config new file mode 100644 index 0000000..c12fc32 --- /dev/null +++ b/config @@ -0,0 +1,26 @@ +# General resource path +RESOURCES_PATH=../resources/ + +# Localization of Walenty in TEI format +WALENTY=/usr/share/walenty/walenty_20160412.xml + +# Port number for pre server +PRE_PORT=3258 + +# Host name for pre server +PRE_HOST=localhost + +# Path to the directory for parsed sentences +RESULTS_PATH=../results/ + +# Maximum number of generated solutions +MAX_NO_SOLUTIONS=10 + +# LCG parser timeout in seconds +LCG_TIMEOUT=100 + +# LCG parser memory size (maximum number of nodes of parsed term) +LCG_NO_NODES=10000000 + +# Number of parser processes +NO_PROCESSES=4 diff --git a/morphology/dict.ml b/morphology/dict.ml index bb1e1df..0b1495f 100644 --- a/morphology/dict.ml +++ b/morphology/dict.ml @@ -315,9 +315,9 @@ let exceptional_lemmata = StringSet.of_list ([ ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*) - ] @ File.load_lines "data/obce.tab" (* @ - File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ - File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab" *)) + ] @ File.load_lines "../morphology/data/obce.tab" (* @ + File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @ + File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *)) let remove_exceptional_lemmata dict = Xlist.fold dict [] (fun dict entry -> diff --git a/morphology/inflexion.a b/morphology/inflexion.a new file mode 100644 index 0000000..e9fdc93 --- /dev/null +++ b/morphology/inflexion.a diff --git a/morphology/inflexion.cmxa b/morphology/inflexion.cmxa new file mode 100644 index 0000000..9fe1bc0 --- /dev/null +++ b/morphology/inflexion.cmxa diff --git a/morphology/ruleGenerator.ml b/morphology/ruleGenerator.ml index 1e4b98b..02b2950 100644 --- a/morphology/ruleGenerator.ml +++ b/morphology/ruleGenerator.ml @@ -2,7 +2,7 @@ open Xstd open Printf open Types -let alternation_map = Rules.alternation_map +(* let alternation_map = Rules.alternation_map *) let rule_types = Xlist.fold [ (* Xlist.map (StringMap.find alternation_map "obce_ch") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{x}ych\t{x}"; diff --git a/morphology/rules.ml b/morphology/rules.ml index 63d07d8..0653427 100644 --- a/morphology/rules.ml +++ b/morphology/rules.ml @@ -44,7 +44,7 @@ let load_alternations filename = | _ -> failwith "load_alternations") in (name,List.rev alts) :: alternations -let alternations () = load_alternations "data/alternations.dic" +let alternations () = load_alternations "../morphology/data/alternations.dic" let revert_alternations l = Xlist.map l (fun a -> {a with afind=a.aset; aset=a.afind}) @@ -76,9 +76,9 @@ let load_pref_rules filename = | _ -> failwith "load_pref_rules") in (name,List.rev rules) :: pref_rules -let rules () = load_suf_rules "data/rules.dic" -let rev_rules () = load_suf_rules "data/rev_rules.dic" -let pref_rules () = load_pref_rules "data/pref_rules.dic" +let rules () = load_suf_rules "../morphology/data/rules.dic" +let rev_rules () = load_suf_rules "../morphology/data/rev_rules.dic" +let pref_rules () = load_pref_rules "../morphology/data/pref_rules.dic" let load_freq_rules filename = File.fold_tab filename [] (fun rules -> function @@ -120,7 +120,7 @@ let rule_map alternation_map rev_alternation_map rules rev_rules pref_rules = let map = Xlist.fold rev_rules map (fun map (k,v) -> StringMap.add map k (prepare_rev_rules rev_alternation_map v)) in Xlist.fold pref_rules map (fun map (k,v) -> StringMap.add map k (prepare_pref_rules v)) -let schemata () = File.load_tab "data/schemata.dic" (fun l -> l) +let schemata () = File.load_tab "../morphology/data/schemata.dic" (fun l -> l) (**********************************************************************************************) @@ -236,7 +236,7 @@ module InterpTree = struct end -let interp_tree () = InterpTree.create (load_interp_rules "data/interp_rules.dic") +let interp_tree () = InterpTree.create (load_interp_rules "../morphology/data/interp_rules.dic") (**********************************************************************************************) diff --git a/parser/.gitignore b/parser/.gitignore new file mode 100644 index 0000000..d5cbca7 --- /dev/null +++ b/parser/.gitignore @@ -0,0 +1,2 @@ +pipe +results/* diff --git a/parser/makefile b/parser/makefile index 20f4b94..d2b7e7b 100755 --- a/parser/makefile +++ b/parser/makefile @@ -1,32 +1,32 @@ OCAMLC=ocamlc OCAMLOPT=ocamlopt OCAMLDEP=ocamldep -#INCLUDES=-I +xml-light -I +xlib -I ../../lib/latexvis -I ../lib/xt -I ../../Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre -INCLUDES=--I +xml-light -I +xlib -I ../pre +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre +#INCLUDES=-I +xml-light -I +xlib -I ../pre OCAMLFLAGS=$(INCLUDES) -g -#OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa latexvis.cmxa nkjp.cmxa -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa latexvis.cmxa #nkjp.cmxa +#OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa PRE= ../pre/paths.ml ../pre/walTypes.ml ../pre/preTypes.ml ../pre/walStringOf.ml -#LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexOf.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml -LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml +LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexOf.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml +#LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml DISAMB= disambSelPref.ml disambLemma.ml -#SEM= semGraph.ml semTypes.ml semStringOf.ml semLatexOf.ml semMmlOf.ml semMrl.ml -SEM= semGraph.ml semTypes.ml semStringOf.ml semMmlOf.ml semMrl.ml -EXEC= execTypes.ml visualization.ml exec.ml - -all: -# $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) pipe.ml +SEM= semGraph.ml semTypes.ml semStringOf.ml semLatexOf.ml semMmlOf.ml semMrl.ml +#SEM= semGraph.ml semTypes.ml semStringOf.ml semMmlOf.ml semMrl.ml +EXEC= execTypes.ml visualization.ml exec.ml + +all: + $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) pipe.ml # $(OCAMLOPT) -o server $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) server.ml -# $(OCAMLOPT) -o parser.cgi $(OCAMLOPTFLAGS) $(PRE) LCGtypes.ml LCGstringOf.ml semTypes.ml semMmlOf.ml execTypes.ml visualization.ml webInterface.ml - $(OCAMLOPT) -o eniam.distr $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) overseer.ml - $(OCAMLOPT) -o eniam.worker $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) worker.ml -# $(OCAMLOPT) -o parser.api $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) apiInterface.ml - $(OCAMLOPT) -o eniam $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) singleInterface.ml - -# pipe: -# $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(PROC_F) -# +# $(OCAMLOPT) -o parser.cgi $(OCAMLOPTFLAGS) $(PRE) LCGtypes.ml LCGstringOf.ml semTypes.ml semMmlOf.ml execTypes.ml visualization.ml webInterface.ml +# $(OCAMLOPT) -o eniam.distr $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) overseer.ml +# $(OCAMLOPT) -o eniam.worker $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) worker.ml +# $(OCAMLOPT) -o parser.api $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) apiInterface.ml +# $(OCAMLOPT) -o eniam $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) singleInterface.ml + +# pipe: +# $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(PROC_F) +# # of_xml: # $(OCAMLOPT) -o of_xml $(OCAMLOPTFLAGS) LCGofXml.ml diff --git a/parser/visualization.ml b/parser/visualization.ml index f999b70..5648f2f 100644 --- a/parser/visualization.ml +++ b/parser/visualization.ml @@ -22,11 +22,11 @@ open Xstd open Printf open PreTypes -let string_of_interps interps = +let string_of_interps interps = String.concat "|" (Xlist.map interps (fun interp -> (String.concat ":" (Xlist.map interp (fun interp2 -> (String.concat "." interp2)))))) - + let rec string_of_token = function PreTypes.SmallLetter orth -> sprintf "SmallLetter(%s)" orth | PreTypes.CapLetter(orth,lc) -> sprintf "CapLetter(%s,%s)" orth lc @@ -43,22 +43,22 @@ let rec string_of_token = function | PreTypes.Proper(lemma,cat,interps,senses) -> sprintf "Proper(%s,%s,%s,%s)" lemma cat (string_of_interps interps) (String.concat "|" senses) | PreTypes.Compound(sense,l) -> sprintf "Compound(%s,[%s])" sense (String.concat ";" (Xlist.map l string_of_token)) -let rec spaces i = +let rec spaces i = if i = 0 then "" else " " ^ spaces (i-1) - + let rec string_of_tokens i = function - PreTypes.Token t -> sprintf "%s{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;weight=%.2f;attrs=[%s];\n%s senses=[%s];\n%s valence=[%s];\n%s simple_valence=[%s];lroles=%s,%s}" (spaces i) t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) + PreTypes.Token t -> sprintf "%s{orth=%s;beg=%d;len=%d;next=%d;token=%s;id=%d;weight=%.2f;attrs=[%s];\n%s senses=[%s];\n%s valence=[%s];\n%s simple_valence=[%s];lroles=%s,%s}" (spaces i) t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (string_of_token t.PreTypes.token) t.PreTypes.id t.PreTypes.weight (String.concat ";" t.PreTypes.attrs) (spaces i) (String.concat ";" (Xlist.map t.PreTypes.senses (fun (sense,hipero,weight) -> sprintf "%s[%s]%.2f" sense (String.concat "," hipero) weight))) (spaces i) (String.concat ";" (Xlist.map t.PreTypes.valence (WalStringOf.fnum_frame ""))) (spaces i) (String.concat ";" (Xlist.map t.PreTypes.simple_valence (WalStringOf.fnum_frame ""))) (fst t.lroles) (snd t.lroles) | PreTypes.Variant l -> sprintf "%sVariant[\n%s]" (spaces i) (String.concat ";\n" (Xlist.map l (string_of_tokens (i+1)))) - | PreTypes.Seq l -> sprintf "%sSeq[\n%s]" (spaces i) (String.concat ";\n" (Xlist.map l (string_of_tokens (i+1)))) - -let paths_to_string_indexed (paths,last,next_id) = - String.concat "\n" (Xlist.map paths (fun (i,j,t) -> + | PreTypes.Seq l -> sprintf "%sSeq[\n%s]" (spaces i) (String.concat ";\n" (Xlist.map l (string_of_tokens (i+1)))) + +let paths_to_string_indexed (paths,last,next_id) = + String.concat "\n" (Xlist.map paths (fun (i,j,t) -> Printf.sprintf "%2d %2d %s" i j (string_of_tokens 0 (PreTypes.Token t)))) ^ Printf.sprintf "\nlast=%d next_id=%d" last next_id - -let rec xml_of_graph = function + +let rec xml_of_graph = function Node t -> Xml.Element("node",["pred",t.pred;"cat",t.cat;"weight",string_of_float t.weight;"id",string_of_int t.id],[ Xml.Element("gs",[],[xml_of_graph t.gs]); Xml.Element("agf",[],[Xml.PCData (WalStringOf.gf t.agf)]); @@ -75,11 +75,11 @@ let rec xml_of_graph = function Xml.Element("relations",[],[xml_of_graph c.cx_relations])]) | Relation(r,a,t) -> Xml.Element("relation",[],[ Xml.Element("role",[],[xml_of_graph r]); - Xml.Element("role_attr",[],[xml_of_graph r]); + Xml.Element("role_attr",[],[xml_of_graph r]); xml_of_graph t]) | RevRelation(r,a,t) -> Xml.Element("revrelation",[],[ Xml.Element("role",[],[xml_of_graph r]); - Xml.Element("role_attr",[],[xml_of_graph r]); + Xml.Element("role_attr",[],[xml_of_graph r]); xml_of_graph t]) | SingleRelation(r) -> Xml.Element("singlerelation",[],[xml_of_graph r]) | Tuple l -> Xml.Element("tuple",[],Xlist.map l xml_of_graph) @@ -90,18 +90,18 @@ let rec xml_of_graph = function | Ref i -> Xml.Element("ref",["id",string_of_int i],[]) | Morf _ -> Xml.Element("dot",[],[]) (* FIXME!!! *) | t -> failwith ("xml_of_graph: " ^ LCGstringOf.linear_term 0 t) - -let print_xml_graph path name references = + +let print_xml_graph path name references = let l = Int.fold 0 (Array.length references - 1) [] (fun l i -> (i, xml_of_graph references.(i)) :: l) in let xml = Xml.Element("graph",[],Xlist.rev_map l (fun (i,xml) -> Xml.Element("graph_node",["id",string_of_int i],[xml]))) in - File.file_out (path ^ name ^ ".xml") (fun file -> + File.file_out (path ^ name ^ ".xml") (fun file -> fprintf file "%s\n" (Xml.to_string_fmt xml)) -let print_xml_tree path name tree = +let print_xml_tree path name tree = let xml = xml_of_graph tree in - File.file_out (path ^ name ^ ".xml") (fun file -> + File.file_out (path ^ name ^ ".xml") (fun file -> fprintf file "%s\n" (Xml.to_string_fmt xml)) @@ -112,14 +112,14 @@ let rec get_refs rev = function | Variant(e,l) -> Xlist.fold l rev (fun rev (i,t) -> get_refs rev t) | Dot -> rev | _ -> (*failwith*)print_endline "get_refs"; rev - -let escape_string s = + +let escape_string s = Int.fold 0 (String.length s - 1) "" (fun t i -> match String.sub s i 1 with "<" -> t ^ "〈" | ">" -> t ^ "〉" | c -> t ^ c) - + let string_of_node t = let l = [ "PRED",Val t.pred;"CAT",Val t.cat;"ID",Val (string_of_int t.id);"WEIGHT",Val (string_of_float t.weight);"GS",t.gs; @@ -130,64 +130,64 @@ let string_of_node t = "{ " ^ String.concat " | " (Xlist.map l (fun (e,t) -> "{ " ^ e ^ " | " ^ escape_string (LCGstringOf.linear_term 0 t) ^ " }")) ^ " }" let single_rel_id_count = ref 0 - + let get_single_rel_id () = let id = !single_rel_id_count in incr single_rel_id_count; "s" ^ string_of_int id - + let print_edge file label upper id = - if upper <> "" then + if upper <> "" then if label = "" then fprintf file " %s -> %s\n" upper id else fprintf file " %s -> %s [label=\"%s\"]\n" upper id label - -(*let rec print_graph_rec2 file edge upper = function + +(*let rec print_graph_rec2 file edge upper = function Tuple l -> Xlist.iter l (print_graph_rec2 file edge upper) - | Node t -> + | Node t -> let id = get_single_rel_id () in fprintf file " %s [label=\"%s\"]\n" id (string_of_node t); print_edge file edge upper id; print_graph_rec2 file "" id t.args - | Concept t -> + | Concept t -> let id = get_single_rel_id () in - fprintf file " %s [shape=box,label=\"%s %s\"]\n" id - (LCGchart.string_of_linear_term 0 t.c_sense) + fprintf file " %s [shape=box,label=\"%s %s\"]\n" id + (LCGchart.string_of_linear_term 0 t.c_sense) (if t.c_name=Dot then "" else "„" ^ LCGchart.string_of_linear_term 0 t.c_name ^ "”"); (* FIXME *) print_edge file edge upper id; print_graph_rec2 file "" id t.c_relations - | SingleRelation(role) -> + | SingleRelation(role) -> let id = get_single_rel_id () in fprintf file " %s [shape=circle,label=\"%s\"]\n" id (LCGchart.string_of_linear_term 0 role); if upper <> "" then fprintf file " %s -> %s\n" upper id - | Variant(e,l) -> + | Variant(e,l) -> fprintf file " %s [shape=diamond]\n" e; print_edge file edge upper e; Xlist.iter l (fun (i,t) -> print_graph_rec2 file i e t) | Dot -> () | Ref i -> print_edge file edge upper ("x" ^ string_of_int i) | t -> failwith ("print_graph_rec2: " ^ LCGchart.string_of_linear_term 0 t) *) - + let rec string_of_quant_rec quant = function Tuple l -> Xlist.fold l quant string_of_quant_rec | Variant(e,l) -> (LCGstringOf.linear_term 0 (Variant(e,l))) :: quant | Dot -> quant | Val s -> s :: quant | _ -> failwith "string_of_quant_rec" - + let string_of_quant t = let l = string_of_quant_rec [] t in let s = String.concat " " l in if s = "" then "" else "<I>" ^ s ^ "</I> " - -let rec print_graph_rec file edge upper id = function - Node t -> + +let rec print_graph_rec file edge upper id = function + Node t -> fprintf file " %s [label=\"%s\"]\n" id (string_of_node t); print_edge file edge upper id; print_graph_rec2 file "" id t.args - | Concept t -> - fprintf file " %s [shape=box,label=<%s%s %s>]\n" id - (string_of_quant t.c_quant) - (LCGstringOf.linear_term 0 t.c_sense) + | Concept t -> + fprintf file " %s [shape=box,label=<%s%s %s>]\n" id + (string_of_quant t.c_quant) + (LCGstringOf.linear_term 0 t.c_sense) (if t.c_name=Dot then "" else "„" ^ LCGstringOf.linear_term 0 t.c_name ^ "”"); (* FIXME *) print_edge file edge upper id; print_graph_rec2 file "" id t.c_relations @@ -197,7 +197,7 @@ let rec print_graph_rec file edge upper id = function print_edge file edge upper id; print_graph_rec2 file "" id t.cx_contents; print_graph_rec2 file "" id t.cx_relations; - | Relation(role,role_attr,t) -> + | Relation(role,role_attr,t) -> fprintf file " %s [shape=circle,label=\"%s\\n%s\"]\n" id (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); print_edge file edge upper id; print_graph_rec2 file "" id t @@ -205,31 +205,31 @@ let rec print_graph_rec file edge upper id = function fprintf file " %s [shape=circle,label=\"%s\\n%s\"]\n" id (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); print_edge file edge upper id; print_graph_rec2 file "" id t - | SingleRelation(role) -> + | SingleRelation(role) -> fprintf file " %s [shape=circle,label=\"%s\"]\n" id (LCGstringOf.linear_term 0 role); print_edge file edge upper id - | AddRelation(t,role,role_attr,s) -> + | AddRelation(t,role,role_attr,s) -> fprintf file " %s [shape=circle,label=\"AddRelation\\n%s\\n%s\"]\n" id role role_attr; print_edge file edge upper id; print_graph_rec2 file "" id t; print_graph_rec2 file "" id s; - | SetContextName(s,t) -> + | SetContextName(s,t) -> fprintf file " %s [shape=circle,label=\"SetContextName\\n%s\"]\n" id s; print_edge file edge upper id; print_graph_rec2 file "" id t - | RemoveRelation t -> + | RemoveRelation t -> fprintf file " %s [shape=circle,label=\"RemoveRelation\"]\n" id; print_edge file edge upper id; print_graph_rec2 file "" id t - | Variant(e,l) -> + | Variant(e,l) -> fprintf file " %s [shape=diamond,label=\"%s\"]\n" id e; print_edge file edge upper id; Xlist.iter l (fun (i,t) -> print_graph_rec2 file i id t) - | Choice choice -> + | Choice choice -> fprintf file " %s [shape=Mdiamond,label=\"%s\"]\n" id ""; print_edge file edge upper id; StringMap.iter choice (fun ei t -> print_graph_rec2 file ei id t) - | Val s -> + | Val s -> fprintf file " %s [shape=box,label=\"%s\"]\n" id s; print_edge file edge upper id | Dot -> () @@ -237,63 +237,63 @@ let rec print_graph_rec file edge upper id = function print_edge file edge upper id*) | Ref i -> print_edge file edge upper ("x" ^ string_of_int i) | t -> failwith ("print_graph_rec: " ^ LCGstringOf.linear_term 0 t) - + and print_graph_rec2 file edge upper = function Tuple l -> Xlist.iter l (print_graph_rec2 file edge upper) | t -> print_graph_rec file edge upper (get_single_rel_id ()) t - + (*let rec print_graph_rec file is_rev upper i = function (* FIXME: dokończyć is_rev *) - Node t -> + Node t -> (* let orth = if t.id = 0 then "" else.(t.id).PreTypes.orth in fprintf file " %s [label=\"%s\\n%s\\n%s:%s\"]\n" i (LCGstringOf.linear_term 0 t.gs) orth t.pred t.cat;*) fprintf file " %s [label=\"%s\"]\n" i (string_of_node t); - if upper <> "" then + if upper <> "" then if is_rev then fprintf file " %s -> %s\n" i upper else fprintf file " %s -> %s\n" upper i; print_graph_rec file false i i t.args - | Concept t -> - fprintf file " %s [shape=box,label=\"%s %s\"]\n" ("c" ^ i) - (LCGstringOf.linear_term 0 t.c_sense) + | Concept t -> + fprintf file " %s [shape=box,label=\"%s %s\"]\n" ("c" ^ i) + (LCGstringOf.linear_term 0 t.c_sense) (if t.c_name=Dot then "" else "„" ^ LCGstringOf.linear_term 0 t.c_name ^ "”"); (* FIXME *) - if upper <> "" then + if upper <> "" then if is_rev then fprintf file " %s -> %s\n" ("c" ^ i) upper else fprintf file " %s -> %s\n" upper ("c" ^ i); print_graph_rec file false ("c" ^ i) i t.c_relations | Context t -> fprintf file " %s [shape=Msquare,label=\"\"]\n" ("i" ^ i); - if upper <> "" then + if upper <> "" then if is_rev then fprintf file " %s -> %s\n" ("i" ^ i) upper else fprintf file " %s -> %s\n" upper ("i" ^ i); print_graph_rec file false ("i" ^ i) i t.cx_contents - | SingleRelation(role) -> + | SingleRelation(role) -> let id = get_single_rel_id () in fprintf file " %s [shape=circle,label=\"%s\"]\n" id (LCGstringOf.linear_term 0 role); if upper <> "" then fprintf file " %s -> %s\n" upper id - | Relation(role,role_attr,t) -> + | Relation(role,role_attr,t) -> fprintf file " %s [shape=circle,label=\"%s\\n%s\"]\n" i (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); if upper <> "" then fprintf file " %s -> %s\n" upper i; print_graph_rec file false i i t - | RevRelation(role,role_attr,t) -> + | RevRelation(role,role_attr,t) -> fprintf file " %s [shape=circle,label=\"%s\\n%s\"]\n" i (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); if upper <> "" then fprintf file " %s -> %s\n" i upper; print_graph_rec file true i i t | Tuple l -> Xlist.iter l (print_graph_rec file is_rev upper i) - | Variant(e,l) -> + | Variant(e,l) -> fprintf file " %s [shape=diamond]\n" e; if upper <> "" then fprintf file " %s -> %s\n" upper e; Xlist.iter l (fun (i2,t) -> print_graph_rec file false e ("x" ^ i ^ "y" ^ i2) t) | Dot -> () | Ref i2 -> fprintf file " %s -> %d\n" upper i2 | t -> failwith ("print_graph_rec: " ^ LCGstringOf.linear_term 0 t)*) - -let print_graph path name references = + +let print_graph path name references = single_rel_id_count := 0; - File.file_out (path ^ name ^ ".gv") (fun file -> + File.file_out (path ^ name ^ ".gv") (fun file -> fprintf file "digraph G {\n node [shape=record]\n"; Int.iter 0 (Array.length references - 1) (fun i -> print_graph_rec file (*false*) "" "" ("x" ^ string_of_int i) references.(i)); -(* Int.iter 0 (Array.length references - 1) (fun i -> +(* Int.iter 0 (Array.length references - 1) (fun i -> match references.(i) with - Node t -> + Node t -> fprintf file " %d [label=\"%s\"]\n" i (string_of_node t); let refs = get_refs [] t.args in Xlist.iter refs (fun r -> @@ -304,31 +304,31 @@ let print_graph path name references = ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png")); Sys.chdir ".." -let id_counter = ref 0 - +let id_counter = ref 0 + let print_edge2 file edge_rev edge_label edge_head edge_tail upper id = let edge_head,edge_tail,upper,id = if edge_rev then edge_tail,edge_head,id,upper else edge_head,edge_tail,upper,id in - let l = + let l = (if edge_label = "" then [] else ["label=\"" ^ edge_label ^ "\""]) @ (if edge_head = "" then [] else ["ltail=\"" ^ edge_head ^ "\""]) @ (if edge_tail = "" then [] else ["lhead=\"" ^ edge_tail ^ "\""]) in - if upper <> 0 then + if upper <> 0 then if l = [] then fprintf file " %d -> %d\n" upper id else fprintf file " %d -> %d [%s]\n" upper id (String.concat "," l) - -let rec print_graph2_rec file edge_rev edge_label edge_head upper = function - Node t -> + +let rec print_graph2_rec file edge_rev edge_label edge_head upper = function + Node t -> let id = !id_counter in incr id_counter; fprintf file " %d [label=\"%s\"]\n" id (string_of_node t); print_edge2 file edge_rev edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t.args - | Concept t -> + | Concept t -> let id = !id_counter in incr id_counter; - fprintf file " %d [shape=box,label=<%s%s %s>]\n" id - (string_of_quant t.c_quant) - (LCGstringOf.linear_term 0 t.c_sense) + fprintf file " %d [shape=box,label=<%s%s %s>]\n" id + (string_of_quant t.c_quant) + (LCGstringOf.linear_term 0 t.c_sense) (if t.c_name=Dot then "" else "„" ^ LCGstringOf.linear_term 0 t.c_name ^ "”"); (* FIXME *) print_edge2 file edge_rev edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t.c_relations @@ -338,64 +338,64 @@ let rec print_graph2_rec file edge_rev edge_label edge_head upper = function if t.cx_sense = Dot then fprintf file " subgraph cluster%d {\nlabel=\"\"\n" id else fprintf file " subgraph cluster%d {\nlabel=\"%s\"\n" id (LCGstringOf.linear_term 0 t.cx_sense); print_graph2_rec file false "" "" 0 t.cx_contents; - fprintf file " }\n"; + fprintf file " }\n"; print_edge2 file edge_rev edge_label edge_head ("cluster" ^ string_of_int id) upper (id+1); print_graph2_rec file false "" ("cluster" ^ string_of_int id) (id+1) t.cx_relations; - | Relation(role,role_attr,t) -> + | Relation(role,role_attr,t) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"%s\\n%s\"]\n" id (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); print_edge2 file false edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t - | RevRelation(role,role_attr,t) -> + | RevRelation(role,role_attr,t) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"%s\\n%s\"]\n" id (LCGstringOf.linear_term 0 role) (LCGstringOf.linear_term 0 role_attr); print_edge2 file true edge_label edge_head "" upper id; print_graph2_rec file true "" "" id t - | SingleRelation(role) -> + | SingleRelation(role) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"%s\"]\n" id (LCGstringOf.linear_term 0 role); print_edge2 file false edge_label edge_head "" upper id - | AddRelation(t,role,role_attr,s) -> + | AddRelation(t,role,role_attr,s) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"AddRelation\\n%s\\n%s\"]\n" id role role_attr; print_edge2 file edge_rev edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t; print_graph2_rec file false "" "" id s - | RemoveRelation t -> + | RemoveRelation t -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"RemoveRelation\"]\n" id; print_edge2 file edge_rev edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t - | SetContextName(s,t) -> + | SetContextName(s,t) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=circle,label=\"SetContextName\\n%s\"]\n" id s; print_edge2 file edge_rev edge_label edge_head "" upper id; print_graph2_rec file false "" "" id t; | Tuple l -> Xlist.iter l (print_graph2_rec file edge_rev edge_label edge_head upper) - | Variant(e,l) -> + | Variant(e,l) -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=diamond,label=\"%s\"]\n" id e; print_edge2 file edge_rev edge_label edge_head "" upper id; Xlist.iter l (fun (i,t) -> print_graph2_rec file edge_rev i "" id t) - | Val s -> + | Val s -> let id = !id_counter in incr id_counter; fprintf file " %d [shape=box,label=\"%s\"]\n" id s; print_edge2 file edge_rev edge_label edge_head "" upper id | Dot -> () | t -> failwith ("print_graph_rec: " ^ LCGstringOf.linear_term 0 t) - -let print_graph2 path name query t = + +let print_graph2 path name query t = (* print_endline *) id_counter := 1; - File.file_out (path ^ name ^ ".gv") (fun file -> + File.file_out (path ^ name ^ ".gv") (fun file -> fprintf file "digraph G {\n compound=true\n node [shape=record]\n"; print_graph2_rec file false "" "" 0 t; fprintf file "label=\"%s\"\n }\n" query); @@ -407,11 +407,11 @@ let rec get_lemma = function PreTypes.Interp orth -> orth | PreTypes.Lemma(lemma,cat,_) -> lemma ^ "\n" ^ cat | _ -> "" - -let print_paths path name paths = - File.file_out (path ^ name ^ ".gv") (fun file -> + +let print_paths path name paths = + File.file_out (path ^ name ^ ".gv") (fun file -> fprintf file "digraph G {\n"; - Array.iter (fun t -> + Array.iter (fun t -> let lemma = get_lemma t.PreTypes.token in if lemma <> "" then fprintf file " %d -> %d [label=\"%s\\n%s\"]\n" t.PreTypes.beg t.PreTypes.next t.PreTypes.orth lemma) paths; fprintf file "}\n"); @@ -419,39 +419,39 @@ let print_paths path name paths = ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png")); Sys.chdir ".." -let rec print_tree_rec2 file paths edge upper = function +let rec print_tree_rec2 file paths edge upper = function Tuple l -> Xlist.iter l (print_tree_rec2 file paths edge upper) - | Variant(e,l) -> + | Variant(e,l) -> fprintf file " %s [shape=diamond]\n" e; print_edge file edge upper e; Xlist.iter l (fun (i,t) -> print_tree_rec2 file paths i e t) | Dot -> () | Ref i -> print_edge file edge upper ("x" ^ string_of_int i) | t -> failwith ("print_tree_rec: " ^ LCGstringOf.linear_term 0 t) - -let rec print_tree_rec file paths edge upper id = function - Node t -> + +let rec print_tree_rec file paths edge upper id = function + Node t -> let orth = if t.id = 0 then "" else paths.(t.id).PreTypes.orth in fprintf file " %s [label=\"%s\\n%s\\n%s:%s\\n%f\"]\n" id (LCGstringOf.linear_term 0 t.gs) orth t.pred t.cat t.weight; print_edge file edge upper id; print_tree_rec2 file paths "" id t.args - | Variant(e,l) -> + | Variant(e,l) -> fprintf file " %s [shape=diamond,label=\"%s\"]\n" id e; print_edge file edge upper id; Xlist.iter l (fun (i,t) -> print_tree_rec file paths i id (id ^ "y" ^ i) t) - | Choice choice -> + | Choice choice -> fprintf file " %s [shape=Mdiamond,label=\"%s\"]\n" id ""; print_edge file edge upper id; StringMap.iter choice (fun ei t -> print_tree_rec file paths ei id (id ^ "b" ^ ei) t) | Dot -> () | t -> failwith ("print_tree_rec: " ^ LCGstringOf.linear_term 0 t) - -let print_tree path name paths references = - File.file_out (path ^ name ^ ".gv") (fun file -> + +let print_tree path name paths references = + File.file_out (path ^ name ^ ".gv") (fun file -> fprintf file "digraph G {\n node [shape=box]\n"; Int.iter 0 (Array.length references - 1) (fun i -> print_tree_rec file paths "" "" ("x" ^ string_of_int i) references.(i)); (* match references.(i) with - Node t -> + Node t -> let orth = if t.id = 0 then "" else paths.(t.id).PreTypes.orth in fprintf file " %d [label=\"%s\\n%s\\n%s:%s\"]\n" i (LCGstringOf.linear_term 0 t.gs) orth t.pred t.cat; let refs = get_refs [] t.args in @@ -463,13 +463,13 @@ let print_tree path name paths references = ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png")); Sys.chdir ".." -(*let print_tree filename paths references = - File.file_out filename (fun file -> +(*let print_tree filename paths references = + File.file_out filename (fun file -> fprintf file "digraph G {\n"; let set = Xlist.fold paths IntSet.empty (fun set t -> IntSet.add (IntSet.add set t.PreTypes.beg) t.PreTypes.next) in IntSet.iter set (fun i -> fprintf file " %d [width=0; height=0; label=\"\"]\n" i); - Xlist.iter paths (fun t -> + Xlist.iter paths (fun t -> let lemma = get_lemma t.PreTypes.token in if lemma <> "" then ( let s = if t.PreTypes.orth = "" then lemma else t.PreTypes.orth ^ "\n" ^ lemma in @@ -480,8 +480,8 @@ let print_tree path name paths references = ignore (Sys.command "dot -Tpng tree.gv -o tree.png"); Sys.chdir ".."*) -(*let print_tree filename paths references = - File.file_out filename (fun file -> +(*let print_tree filename paths references = + File.file_out filename (fun file -> fprintf file "digraph G {\n"; fprintf file " subgraph {\n ordering=out\n"; let same = Xlist.fold (Xlist.sort paths (fun s t -> compare s.PreTypes.beg t.PreTypes.beg)) [] (fun same t -> @@ -494,9 +494,9 @@ let print_tree path name paths references = else same) in fprintf file " }\n"; fprintf file " { rank = same; %s }\n" (String.concat "; " (Xlist.map same (fun i -> sprintf "\"i%d\"" i))); - Int.iter 0 (Array.length references - 1) (fun i -> + Int.iter 0 (Array.length references - 1) (fun i -> match references.(i) with - Node t -> + Node t -> fprintf file " %d [label=\"%s\"]\n" i t.pred; fprintf file " %d -> i%d\n" i t.id; let refs = get_refs [] t.args in @@ -508,9 +508,9 @@ let print_tree path name paths references = ignore (Sys.command "dot -Tpng tree.gv -o tree.png"); Sys.chdir ".."*) -(*let rec schema_latex schema = - "\\begin{tabular}{l}" ^ - String.concat "\\\\" (Xlist.map schema (fun s -> +let rec schema_latex schema = + "\\begin{tabular}{l}" ^ + String.concat "\\\\" (Xlist.map schema (fun s -> LatexMain.escape_string (String.concat "," ( (if s.WalTypes.gf = WalTypes.ARG then [] else [WalStringOf.gf s.WalTypes.gf])@ (if s.WalTypes.role = "" then [] else [s.WalTypes.role])@ @@ -519,32 +519,32 @@ let print_tree path name paths references = "\\end{tabular}" let fnum_frame_latex = function - fnum,WalTypes.Frame(atrs,s) -> + fnum,WalTypes.Frame(atrs,s) -> Printf.sprintf "%d: %s: %s" fnum (LatexMain.escape_string (WalStringOf.frame_atrs atrs)) (schema_latex s) - | fnum,WalTypes.LexFrame(id,p,r,s) -> + | fnum,WalTypes.LexFrame(id,p,r,s) -> Printf.sprintf "%d: %s: %s: %s: %s" fnum id (LatexMain.escape_string (WalStringOf.pos p)) (WalStringOf.restr r) (schema_latex s) - | fnum,WalTypes.ComprepFrame(le,p,r,s) -> + | fnum,WalTypes.ComprepFrame(le,p,r,s) -> Printf.sprintf "%d: %s: %s: %s: %s" fnum le (LatexMain.escape_string (WalStringOf.pos p)) (WalStringOf.restr r) (schema_latex s) - + let print_paths_latex name paths = LatexMain.latex_file_out "results/" name "a0" false (fun file -> fprintf file "\\begin{longtable}{|l|l|l|l|l|l|l|p{4cm}|l|l|l|l|}\n\\hline\north & beg & len & next & token & id & weight & attrs & lroles & senses & simple valence & valence\\\\\n"; Int.iter 0 (Array.length paths - 1) (fun i -> let t = paths.(i) in - fprintf file "%s & %d & %d & %d & %s & %d & %.4f & %s & %s %s &\\begin{tabular}{l|l|p{4cm}}%s\\end{tabular} &\\begin{tabular}{l}%s\\end{tabular} &\\begin{tabular}{l}%s\\end{tabular}\\\\\n\\hline\n" + fprintf file "%s & %d & %d & %d & %s & %d & %.4f & %s & %s %s &\\begin{tabular}{l|l|p{4cm}}%s\\end{tabular} &\\begin{tabular}{l}%s\\end{tabular} &\\begin{tabular}{l}%s\\end{tabular}\\\\\n\\hline\n" t.PreTypes.orth t.PreTypes.beg t.PreTypes.len t.PreTypes.next (LatexMain.escape_string (string_of_token t.PreTypes.token)) t.PreTypes.id t.PreTypes.weight - (String.concat ";" t.PreTypes.attrs) (fst t.PreTypes.lroles) (snd t.PreTypes.lroles) + (String.concat ";" t.PreTypes.attrs) (fst t.PreTypes.lroles) (snd t.PreTypes.lroles) (String.concat "\\\\\n" (Xlist.map t.PreTypes.senses (fun (sense,hipero,weight) -> sprintf "%s & %.2f & %s" sense weight (String.concat "," hipero)))) - (String.concat "\\\\\n\\hline\n" (Xlist.map t.PreTypes.simple_valence (fun x -> fnum_frame_latex x))) + (String.concat "\\\\\n\\hline\n" (Xlist.map t.PreTypes.simple_valence (fun x -> fnum_frame_latex x))) (String.concat "\\\\\n\\hline\n" (Xlist.map t.PreTypes.valence (fun x -> fnum_frame_latex x)))); fprintf file "\\end{longtable}"); - LatexMain.latex_compile_and_clean "results/" name *) - + LatexMain.latex_compile_and_clean "results/" name + let print_mml path name mml = File.file_out (path ^ name ^ ".mml") (fun file -> fprintf file "<!DOCTYPE math PUBLIC \"-//W3C//DTD MathML 2.0//EN\" \"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd\">\n"; fprintf file "%s\n" (Xml.to_string_fmt mml)) - + let page_header path = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"> <html> @@ -556,11 +556,11 @@ let page_header path = <body> <center> - <h1>ENIAM: Kategorialny Parser Składniowo-Semantyczny</h1> - <h3>Podaj tekst:</h3> - <form method=POST action=\"" ^ path ^ "parser.cgi\"> - <p><input type=\"text\" name=\"text0\" value=\"\" size=\"40\"></p> - <p><input type=\"submit\" value=\"Analizuj\" size=\"60\"></p> + <h1>ENIAM: Kategorialny Parser Składniowo-Semantyczny</h1> + <h3>Podaj tekst:</h3> + <form method=POST action=\"" ^ path ^ "parser.cgi\"> + <p><input type=\"text\" name=\"text0\" value=\"\" size=\"40\"></p> + <p><input type=\"submit\" value=\"Analizuj\" size=\"60\"></p> </form>" let page_trailer = @@ -569,25 +569,25 @@ let page_trailer = Copyright © 2016 Institute of Computer Science Polish Academy of Sciences<BR> </center> </body> -</html>" +</html>" let print_webpage file cg_bin_path html_path id query n max_n mml = fprintf file "%s\n" (page_header cg_bin_path); fprintf file "\n<H3>%s</H3>\n" query; - fprintf file "<P>%s %s\n" + fprintf file "<P>%s %s\n" (if n = 1 then "" else sprintf "<A HREF=\"%spage%s_%d.html\">Poprzednia interpretacja</A>" html_path id (n-1)) (if n = max_n then "" else sprintf "<A HREF=\"%spage%s_%d.html\">Następna interpretacja</A>" html_path id (n+1)); fprintf file "<P><IMG SRC=\"%stree%s_%d.png\">\n" html_path id n; fprintf file "<P>%s\n" (Xml.to_string_fmt mml); fprintf file "<P><A HREF=\"%stree%s_%d.xml\">Graf w formacie XML</A>\n" html_path id n; fprintf file "<P><A HREF=\"%sformula%s_%d.mml\">Formuła w formacie MathML</A>\n" html_path id n; - fprintf file "<P>%s %s\n" + fprintf file "<P>%s %s\n" (if n = 1 then "" else sprintf "<A HREF=\"%spage%s_%d.html\">Poprzednia interpretacja</A>" html_path id (n-1)) (if n = max_n then "" else sprintf "<A HREF=\"%spage%s_%d.html\">Następna interpretacja</A>" html_path id (n+1)); fprintf file "%s\n" page_trailer -open ExecTypes - +open ExecTypes + let generate_status_message result = function Idle -> "Server error: " ^ result.msg | PreprocessingError -> "Error during preprocessing: " ^ result.msg @@ -602,13 +602,8 @@ let generate_status_message result = function | NotTranslated -> "Unable to generate logical form" | Parsed -> "parsed" -let print_other_result file cg_bin_path query result = +let print_other_result file cg_bin_path query result = fprintf file "%s\n" (page_header cg_bin_path); fprintf file "\n<H3>%s</H3>\n" query; fprintf file "\n<P>%s\n" (generate_status_message result result.status); fprintf file "%s\n" page_trailer - - - - - \ No newline at end of file diff --git a/pre/inflexion.ml b/pre/inflexion.ml deleted file mode 100644 index eb8a4fc..0000000 --- a/pre/inflexion.ml +++ /dev/null @@ -1,301 +0,0 @@ -(* - * ENIAM: Categorial Syntactic-Semantic Parser for Polish - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - *) - -open Xstd -open Printf - -let load_alt filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - List.rev (Xlist.fold l [] (fun l line -> - if String.length line = 0 then l else - if String.get line 0 = '#' then l else - match Str.split_delim (Str.regexp "\t") line with - [orth; lemma; interp] -> (orth,lemma,interp) :: l - | _ -> failwith ("load_alt: " ^ line))) - -let load_dict filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - List.rev (Xlist.fold l [] (fun l line -> - if String.length line = 0 then l else - if String.get line 0 = '#' then l else - match Str.split_delim (Str.regexp "\t") line with - [stem; lemma_suf2; rule_names] -> (stem,lemma_suf2,Str.split (Str.regexp " ") rule_names) :: l - | _ -> failwith ("load_dict: " ^ line))) - -let load_rules filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - List.rev (Xlist.fold l [] (fun l line -> - if String.length line = 0 then l else - if String.get line 0 = '#' then l else - match Str.split_delim (Str.regexp "\t") line with - [rule_name; quantity; lemma_suf; orth_suf; interp] -> (rule_name,int_of_string quantity,lemma_suf,orth_suf,interp) :: l - | _ -> failwith ("load_rules: " ^ line))) - -let make_rules_map rules = - Xlist.fold rules StringMap.empty (fun rules (rule_name,quantity,lemma_suf,orth_suf,interp) -> - let rules2 = try StringMap.find rules orth_suf with Not_found -> StringMap.empty in - let rules2 = StringMap.add rules2 rule_name (lemma_suf,interp) in - StringMap.add rules orth_suf rules2) - -module OrderedChar = struct - - type t = char - - let compare = compare - -end - -module CharMap = Xmap.Make(OrderedChar) - -type char_tree = N of char_tree CharMap.t * (string * string list) list * (string * string) list - (* następne możliwości * (lemma_suf2 * lista reguł) list * lista alt *) - -type char_tree_suf = M of char_tree_suf CharMap.t * (string * int * string * string) list - (* następne możliwości * (rule_name * lemma_suf * interp) list *) - -let empty_char_tree = N(CharMap.empty,[],[]) -let empty_char_tree_suf = M(CharMap.empty,[]) - -let rec add_path_dict stem i n lemma_suf2 rule_names (N(map,rules,alts)) = - if i = n then N(map,(lemma_suf2,rule_names) :: rules,alts) else - let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in - let tree = add_path_dict stem (i+1) n lemma_suf2 rule_names tree in - N(CharMap.add map (String.get stem i) tree,rules,alts) - -let rec add_path_alt stem i n lemma interp (N(map,rules,alts)) = - if i = n then N(map,rules,(lemma,interp) :: alts) else - let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in - let tree = add_path_alt stem (i+1) n lemma interp tree in - N(CharMap.add map (String.get stem i) tree,rules,alts) - -let make_char_tree dict alt = - let tree = Xlist.fold dict empty_char_tree (fun tree (stem,lemma_suf2,rule_names) -> - add_path_dict stem 0 (String.length stem) lemma_suf2 rule_names tree) in - Xlist.fold alt tree (fun tree (orth,lemma,interp) -> - add_path_alt orth 0 (String.length orth) lemma interp tree) - -let rec add_path_rules rule_name quantity orth_suf i lemma_suf interp (M(map,rules)) = - if i = -1 then M(map,(rule_name,quantity,lemma_suf,interp) :: rules) else - let tree = try CharMap.find map (String.get orth_suf i) with Not_found -> empty_char_tree_suf in - let tree = add_path_rules rule_name quantity orth_suf (i-1) lemma_suf interp tree in - M(CharMap.add map (String.get orth_suf i) tree,rules) - -let make_char_tree_suf rules = - let tree = Xlist.fold rules empty_char_tree_suf (fun tree (rule_name,quantity,lemma_suf,orth_suf,interp) -> - add_path_rules rule_name quantity orth_suf (String.length orth_suf - 1) lemma_suf interp tree) in - tree - -let rec find_char_tree_rec i n orth (N(map,rules,alts)) = - if i = n then [orth,"",rules,alts] else - let l = try find_char_tree_rec (i+1) n orth (CharMap.find map (String.get orth i)) with Not_found -> [] in - (String.sub orth 0 i,String.sub orth i (n-i),rules,[]) :: l - -let find_char_tree tree rules orth = - let l = find_char_tree_rec 0 (String.length orth) orth tree in - Xlist.fold l [] (fun found (stem,suf,rule_names,alts) -> - let rules2 = try StringMap.find rules suf with Not_found -> StringMap.empty in - let found = alts @ found in - Xlist.fold rule_names found (fun found (lemma_suf2,rule_names2) -> - Xlist.fold rule_names2 found (fun found rule_name -> - try - let lemma_suf,interp = StringMap.find rules2 rule_name in - let lemma = if lemma_suf2 = "" then stem ^ lemma_suf else stem ^ lemma_suf ^ ":" ^ lemma_suf2 in - (lemma,interp) :: found - with Not_found -> found))) - -let rec find_char_tree_suf_rec i orth (M(map,rules)) = - if i = 0 then Xlist.fold rules [] (fun l (rule_name,quantity,lemma_suf,interp) -> ("", rule_name, quantity, lemma_suf, interp, i) :: l) else - let l = try find_char_tree_suf_rec (i-1) orth (CharMap.find map (String.get orth (i-1))) with Not_found -> [] in - Xlist.fold rules l (fun l (rule_name, quantity, lemma_suf,interp) -> - (String.sub orth 0 i, rule_name, quantity, String.sub orth 0 i ^ lemma_suf, interp, i) :: l) - -let find_char_tree_suf rules_tree stem_map alt_map orth = - let alt_l = Xlist.rev_map (try StringMap.find alt_map orth with Not_found -> []) (fun (lemma,interp) -> lemma,interp,1,[]) in - let l = find_char_tree_suf_rec (String.length orth) orth rules_tree in - let found = Xlist.fold l alt_l (fun found (stem,rule_name,quantity,lemma,interp,i) -> - try - let rule_names = StringMap.find stem_map stem in - if StringSet.mem rule_names rule_name then (lemma,interp,1,[]) :: found else found (* FIXME: czy na pewno nie dodawać reguł niepasujących? to powoduje że lemat tak samo brzmiący a mający inną odmianę nie zostanie rozpoznany *) - with Not_found -> if quantity < 100 || (String.length orth = i && stem = lemma) then found else (lemma,interp,quantity,["lemma not validated"]) :: found) in (* FIXME: ucięcie żadkich reguł powinno być inaczej sterowane *) -(* if found = [] then [orth,"unk",1,["token not found"]] else *) - let found = (orth,"unk",1,["token not found"]) :: found in - let valid = Xlist.fold found [] (fun valid -> function - lemma,interp,quantity,[] -> (lemma,interp,quantity,[]) :: valid - | _ -> valid) in - if valid = [] then found else valid - -let prepare_inflexion alt_filename dict_filename rules_filename = - let alt = load_alt alt_filename in - let dict = load_dict dict_filename in - let rules = load_rules rules_filename in - let tree = make_char_tree dict alt in - let rules = make_rules_map rules in - tree,rules - -let tree,rules = -(* prepare_inflexion (morfeusz_path ^ Paths.alt_adj) (morfeusz_path ^ Paths.dict_adj) (morfeusz_path ^ Paths.rules_adj) *) -(* prepare_inflexion (morfeusz_path ^ Paths.alt_all) (morfeusz_path ^ Paths.dict_all) (morfeusz_path ^ Paths.rules_all) *) - empty_char_tree,StringMap.empty - -let make_alt_map alt = - Xlist.fold alt StringMap.empty (fun alt_map (orth,lemma,interp) -> - StringMap.add_inc alt_map orth [lemma,interp] (fun l -> (lemma,interp) :: l)) - -let prepare_inflexion_suf alt_filename dict_filename rules_filename = - let alt = load_alt alt_filename in - let rules = load_rules rules_filename in - let rules_tree = make_char_tree_suf rules in - let alt_map = make_alt_map alt in - let dict = load_dict dict_filename in - let stem_map = Xlist.fold dict StringMap.empty (fun stem_map (stem,lemma_suf2,rule_names) -> - StringMap.add_inc stem_map stem (StringSet.of_list rule_names) (fun set -> Xlist.fold rule_names set StringSet.add)) in - alt_map,rules_tree,stem_map - -let alt_map,rules_tree,stem_map = - prepare_inflexion_suf (Paths.sgjp_path ^ Paths.alt_all) (Paths.sgjp_path ^ Paths.dict_all) (Paths.sgjp_path ^ Paths.rules_all) - -let check_prefix pat s = - let n = String.length pat in - if n > String.length s then false else - String.sub s 0 n = pat - -let cut_prefix pat s = - let i = String.length pat in - let n = String.length s in - if i >= n then "" else - try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i) - -let check_sufix pat s = - let n = String.length pat in - let m = String.length s in - if n > m then false else - String.sub s (m-n) n = pat - -let cut_sufix pat s = - let i = String.length pat in - let n = String.length s in - try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s) - -let rec select_interp_sufix pat = function - [] -> [] - | (lemma,interp) :: l -> if check_sufix pat interp then (lemma,interp) :: (select_interp_sufix pat l) else select_interp_sufix pat l - -let rec select_interp_sufix_suf pat = function - [] -> [] - | (lemma,interp,quantity,attrs) :: l -> if check_sufix pat interp then (lemma,interp,quantity,attrs) :: (select_interp_sufix_suf pat l) else select_interp_sufix_suf pat l - -let rec remove_interp_sufix pat = function - [] -> [] - | (lemma,interp) :: l -> if check_sufix pat interp then remove_interp_sufix pat l else (lemma,interp) :: (remove_interp_sufix pat l) - -let rec remove_interp_sufix_suf pat = function - [] -> [] - | (lemma,interp,quantity,attrs) :: l -> - if interp = "adv:sup" then (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l) else (* FIXME: zaślepka, wymaga poprawienia algorytmu generowania słowników *) - if check_sufix pat interp then remove_interp_sufix_suf pat l else (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l) - -let get_interpretations orth = - (if check_prefix "naj" orth then select_interp_sufix ":sup" (find_char_tree tree rules (cut_prefix "naj" orth)) else []) @ - (if check_prefix "nie" orth then select_interp_sufix ":neg" (find_char_tree tree rules (cut_prefix "nie" orth)) else []) @ - (remove_interp_sufix ":neg" (remove_interp_sufix ":sup" (find_char_tree tree rules orth))) - -let get_interpretations_suf orth = (* FIXME: nie działa dla adv:sup pisanych z wielkiej litery np Najdoskonalej Najlepiej *) - if orth = "siebie" then ["siebie","siebie:acc.gen",1,[]] else - if orth = "sobie" then ["siebie","siebie:dat.loc",1,[]] else - if orth = "sobą" then ["siebie","siebie:inst",1,[]] else - (if check_prefix "naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "naj" orth)) else []) @ - (if check_prefix "nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "nie" orth)) else []) @ - (if check_prefix "Naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Naj" orth)) else []) @ - (if check_prefix "Nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Nie" orth)) else []) @ - (remove_interp_sufix_suf ":neg" (remove_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map orth))) - -(* Testy *) - -let print_interpretations l = - Xlist.iter (Xlist.sort l compare) (fun (lemma,interp) -> - printf "%s\t%s\n" lemma interp) - -(*let _ = - let l = get_interpretations "życzliwą" in - print_interpretations l; - let l = get_interpretations "żyźniejszego" in - print_interpretations l; - let l = get_interpretations "zwiśli" in - print_interpretations l; - let l = get_interpretations "najzieleńsza" in - print_interpretations l; - let l = get_interpretations "najtandetniejsza" in - print_interpretations l; - let l = get_interpretations "nieżelazny" in - print_interpretations l; - ()*) - -(*let sgjp_filename = "sgjp-20151020.tab" -let polimorf_filename = "polimorf-20151020.tab" - -let _ = - let interp_sel = Morf.load_interp_sel "data/interps.tab" in - print_endline "loading sgjp"; - let sgjp = Morf.load_tab (morfeusz_path ^ sgjp_filename) in - print_endline "loading polimorf"; - let polimorf = Morf.load_tab (morfeusz_path ^ polimorf_filename) in - print_endline "merging"; - let dicts = Morf.merge_dicts [sgjp;polimorf] in - let adj_interp_sel = StringMap.find interp_sel "adj" in - let adj_sup_interp_sel = StringMap.find interp_sel "adj-sup" in -(* let dicts = Morf.remove_prefix dicts "naj" adj_sup_interp_sel in *) - print_endline "preparing queries"; - let queries = StringMap.fold dicts StringMap.empty (fun queries lemma interps -> - let interps = Morf.select_interps interps (adj_interp_sel @ adj_sup_interp_sel) in - StringMap.fold interps queries (fun queries interp orths -> - Xlist.fold orths queries (fun queries orth -> - let s = lemma ^ "\t" ^ interp in - StringMap.add_inc queries orth (StringSet.singleton s) (fun set -> StringSet.add set s)))) in - print_endline "testing"; - StringMap.iter queries (fun orth set -> - let set = Xlist.fold (get_interpretations orth) set (fun set (lemma,interp) -> - let s = lemma ^ "\t" ^ interp in - if StringSet.mem set s then StringSet.remove set s else ( - printf "excessing interpretation: %s\t%s" orth s; - set)) in - if StringSet.is_empty set then () else - StringSet.iter set (fun s -> - printf "lacking interpretation: %s\t%s" orth s))*) - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/pre/makefile b/pre/makefile index 0259df4..af9f7fa 100755 --- a/pre/makefile +++ b/pre/makefile @@ -1,15 +1,15 @@ OCAMLC=ocamlc OCAMLOPT=ocamlopt OCAMLDEP=ocamldep -INCLUDES=-I +xml-light -I +xlib +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../morphology OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa inflexion.cmxa INSTALLDIR=`ocamlc -where` WAL= paths.ml walTypes.ml walStringOf.ml preTypes.ml preWordnet.ml walParser.ml walTEI.ml walFrames.ml -PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml morf.ml inflexion.ml prePaths.ml preMWE.ml preSemantics.ml preProcessing.ml - -all: +PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml prePaths.ml preMWE.ml preSemantics.ml preProcessing.ml + +all: $(OCAMLOPT) -o pre $(OCAMLOPTFLAGS) $(WAL) $(PRE) .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx diff --git a/pre/morf.ml b/pre/morf.ml deleted file mode 100644 index b6218f9..0000000 --- a/pre/morf.ml +++ /dev/null @@ -1,122 +0,0 @@ -(* - * ENIAM: Categorial Syntactic-Semantic Parser for Polish - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - *) - -open Xstd - -let load_tab filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - List.rev (Xlist.fold l [] (fun l line -> - if String.length line = 0 then l else - if String.get line 0 = '#' then l else - match Str.split (Str.regexp "\t") line with - orth :: lemma :: interp :: _ -> (orth,lemma,interp) :: l - | _ -> failwith ("load_tab: " ^ line))) - -let load_tab_full filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - List.rev (Xlist.fold l [] (fun l line -> - if String.length line = 0 then l else - if String.get line 0 = '#' then l else - match Str.split (Str.regexp "\t") line with - [orth; lemma; interp] -> (orth,lemma,interp,"","") :: l - | [orth; lemma; interp; cl] -> (orth,lemma,interp,cl,"") :: l - | [orth; lemma; interp; cl; cl2] -> (orth,lemma,interp,cl,cl2) :: l -(* | orth :: lemma :: interp :: cl :: cl2 -> (orth,lemma,interp,cl,String.concat ";" cl2) :: l *) - | _ -> failwith ("load_tab_full: " ^ line))) - -let merge_dicts l = - Xlist.fold l StringMap.empty (fun dicts tab -> - Xlist.fold tab dicts (fun dicts (orth,lemma,interp) -> - let interps = try StringMap.find dicts lemma with Not_found -> StringMap.empty in - let interps = StringMap.add_inc interps interp [orth] (fun orths -> - if Xlist.mem orths orth then orths else orth :: orths) in - StringMap.add dicts lemma interps)) - -let load_interp_sel filename = - let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - Xlist.fold l StringMap.empty (fun interp_sel line -> - if String.length line = 0 then interp_sel else - if String.get line 0 = '#' then interp_sel else - match Str.split (Str.regexp "\t") line with - [group;interp;label] -> StringMap.add_inc interp_sel group [interp,label] (fun l -> (interp,label) :: l) - | _ -> failwith ("load_interp_sel: " ^ line)) - - -let rec merge_digraph = function - [] -> [] - | "c" :: "h" :: l -> "ch" :: (merge_digraph l) - | "c" :: "z" :: l -> "cz" :: (merge_digraph l) - | "d" :: "z" :: l -> "dz" :: (merge_digraph l) - | "d" :: "ź" :: l -> "dź" :: (merge_digraph l) - | "d" :: "ż" :: l -> "dż" :: (merge_digraph l) - | "r" :: "z" :: l -> "rz" :: (merge_digraph l) - | "s" :: "z" :: l -> "sz" :: (merge_digraph l) - | "b" :: "'" :: l -> "b'" :: (merge_digraph l) - | "f" :: "'" :: l -> "f'" :: (merge_digraph l) - | s :: l -> s :: (merge_digraph l) - -let text_to_chars s = Xunicode.classified_chars_of_utf8_string s -(* (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s)); - let r = ref [] in - UTF8.iter (fun c -> - r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s; - merge_digraph (List.rev (!r))*) - - -let check_prefix pat s = - let n = String.length pat in - if n > String.length s then false else - String.sub s 0 n = pat - -let cut_prefix pat s = - let i = String.length pat in - let n = String.length s in - if i >= n then "" else - try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i) - -let check_sufix pat s = - let n = String.length pat in - let m = String.length s in - if n > m then false else - String.sub s (m-n) n = pat - -let cut_sufix pat s = - let i = String.length pat in - let n = String.length s in - try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s) - -let apply_transform orth (s,t) = - if check_sufix s orth then cut_sufix s orth ^ t else raise Not_found - -let split_colon s = - match Str.split_delim (Str.regexp ":") s with - [s] -> s, "" - | [s;t] -> s, t - | _ -> failwith "split_colon" - -let get_cat s = - match Str.split_delim (Str.regexp ":") s with - cat :: _ -> cat - | _ -> failwith "get_cat" - -let select_interps interps interp_sel = - Xlist.fold interp_sel StringMap.empty (fun new_interps (interp,_) -> - try - StringMap.add new_interps interp (StringMap.find interps interp) - with Not_found -> new_interps) diff --git a/pre/prePaths.ml b/pre/prePaths.ml index 4d8f958..abc931b 100644 --- a/pre/prePaths.ml +++ b/pre/prePaths.ml @@ -20,17 +20,17 @@ open Xstd open PreTypes -let to_string (paths,last) = +let to_string (paths,last) = String.concat "\n" (Xlist.map paths (fun t -> PreTokenizer.string_of_tokens 0 (Token t))) ^ Printf.sprintf "\nlast=%d" last -let to_string_indexed (paths,last) = - String.concat "\n" (Xlist.map paths (fun (i,j,t) -> +let to_string_indexed (paths,last) = + String.concat "\n" (Xlist.map paths (fun (i,j,t) -> Printf.sprintf "%2d %2d %s" i j (PreTokenizer.string_of_tokens 0 (Token t)))) ^ Printf.sprintf "\nlast=%d" last (*let indexed_token_record_to_xml i j t = - let lemma,pos,tags = + let lemma,pos,tags = match t.token with Lemma(lemma,pos,tags) -> lemma,pos,tags | _ -> failwith "indexed_token_record_to_xml" in @@ -46,11 +46,11 @@ let to_string_indexed (paths,last) = Xlist.map t.senses (fun (sense,hipero,weight) -> Xml.Element("sense",["name",sense;"weight",string_of_float weight], Xlist.map hipero (fun s -> Xml.Element("hipero",[],[Xml.PCData s])))))]) - -let to_xml (paths,last) = + +let to_xml (paths,last) = Xml.Element("paths",["last",string_of_int last], Xlist.map paths (fun (i,j,t) -> indexed_token_record_to_xml i j t)) *) - + let compare_token_record p r = let v = compare p.beg r.beg in if v <> 0 then v else @@ -58,30 +58,30 @@ let compare_token_record p r = if v <> 0 then v else compare p r -let sort (paths,last) = +let sort (paths,last) = Xlist.sort paths compare_token_record, last let rec uniq_rec rev = function [] -> List.rev rev | [p] -> List.rev (p :: rev) | p :: r :: l -> if p = r then uniq_rec rev (r :: l) else uniq_rec (p :: rev) (r :: l) - -let uniq (paths,last) = + +let uniq (paths,last) = uniq_rec [] paths, last - + let rec translate_into_paths_rec paths = function Token t -> t :: paths | Seq l -> Xlist.fold l paths translate_into_paths_rec | Variant l -> Xlist.fold l paths translate_into_paths_rec - -let translate_into_paths tokens = - let paths = Xlist.fold tokens [] (fun paths token -> + +let translate_into_paths tokens = + let paths = Xlist.fold tokens [] (fun paths token -> translate_into_paths_rec paths token) in let last = if paths = [] then 0 else (List.hd paths).next in let paths = sort (paths,last) in let paths = uniq paths in - paths - + paths + (**********************************************************************************) let excluded_interps = StringSet.of_list [ @@ -214,7 +214,7 @@ let transformed_interps = Xlist.fold [ "praet:sg:n1.n2:imperf.perf:nagl","praet:sg:n1.n2:imperf.perf"; "praet:sg:n1.n2:imperf:nagl","praet:sg:n1.n2:imperf"; ] StringMap.empty (fun map (k,v) -> StringMap.add map k v) - + let merge_lemmata l = let map = Xlist.fold l StringMap.empty (fun map (lemma,interp,quantity,attrs) -> let interp = if interp = "num:comp" then "numc" else interp in @@ -229,88 +229,88 @@ let merge_lemmata l = StringMap.add_inc map pos [tags] (fun l -> tags :: l))), max 1 (quantity / Xlist.size interps), attrs) in - StringMap.fold map [] (fun l _ (lemma,map,quantity,attrs) -> + StringMap.fold map [] (fun l _ (lemma,map,quantity,attrs) -> StringMap.fold map l (fun l cat interp -> (lemma,cat,interp,quantity,attrs) :: l)) let uppercase lemma cl ll = let n = String.length lemma in let nll = String.length ll in - cl ^ String.sub lemma nll (n - nll) - -let quant_mod quantity = + cl ^ String.sub lemma nll (n - nll) + +let quant_mod quantity = log10 (float quantity) - -let lemmatize_token = function - | {token=AllSmall s} as t -> - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + +let lemmatize_token = function + | {token=AllSmall s} as t -> + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs=attrs @ t.attrs})) - | {token=SmallLetter s} as t -> - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + | {token=SmallLetter s} as t -> + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs=attrs @ t.attrs})) - | {token=FirstCap(s,lower,cl,ll)} as t -> - let l = Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + | {token=FirstCap(s,lower,cl,ll)} as t -> + let l = Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs="notvalidated proper" :: attrs @ t.attrs}) in let b = Xlist.fold l false (fun b t -> if Xlist.mem t.attrs "lemma not validated" || Xlist.mem t.attrs "token not found" then b else true) in - if b then t :: l else - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf lower)) (fun (lemma,cat,interp,quantity,attrs) -> + if b then t :: l else + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations lower)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(uppercase lemma cl ll,cat,interp); weight=t.weight+.(quant_mod quantity); attrs="notvalidated proper" :: "lemmatized as lowercase" :: attrs @ t.attrs})) - | {token=AllCap(s,_,_)} as t -> - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + | {token=AllCap(s,_,_)} as t -> + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs="notvalidated proper" :: attrs @ t.attrs})) - | {token=CapLetter(s,_)} as t -> - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + | {token=CapLetter(s,_)} as t -> + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs="notvalidated proper" :: attrs @ t.attrs})) - | {token=SomeCap s} as t -> - t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations_suf s)) (fun (lemma,cat,interp,quantity,attrs) -> + | {token=SomeCap s} as t -> + t :: (Xlist.map (merge_lemmata (Inflexion.get_interpretations s)) (fun (lemma,cat,interp,quantity,attrs) -> {t with token=Lemma(lemma,cat,interp); weight=t.weight+.(quant_mod quantity); attrs="notvalidated proper" :: attrs @ t.attrs})) | t -> [t] - -let rec lemmatize_rec rev = function - [] -> List.rev rev + +let rec lemmatize_rec rev = function + [] -> List.rev rev | t :: l -> lemmatize_rec (lemmatize_token t @ rev) l let lemmatize (paths,last) = List.rev (Xlist.fold (lemmatize_rec [] paths) [] (fun paths t -> match t.token with - Lemma _ -> if (Xlist.mem t.attrs "lemma not validated" || Xlist.mem t.attrs "token not found") && + Lemma _ -> if (Xlist.mem t.attrs "lemma not validated" || Xlist.mem t.attrs "token not found") && (Xlist.mem t.attrs "required validated lemmatization") then paths else t(*{t with attrs=List.remove "required validated lemmatization" t.attrs}*) :: paths | _ -> t :: paths)), last - + (* TODO: docelowa lematyzacja: - lematyzacja za pomocą półręcznie wytworzonych reguł lematyzacji i listy wyjątków - walidacja lematów za pomocą listy znanych lematów zawierającej lemat, kategorię, rodzaj subst, aspekt verb (obejmuje też walidację akronimów) - rozpoznawanie wyrażeń wielosłownych (mwe i mte) za pomocą listy zawierającej ich lematy i szablony odmiany *) - + (**********************************************************************************) - + (**********************************************************************************) - + (**********************************************************************************) - - - + + + (*let rec get_beg_id = function Token t -> t.beg | Seq(t :: _) -> get_beg_id t | Variant(t :: _) -> get_beg_id t | _ -> failwith "get_beg_id" - + let rec get_end_id = function Token t -> t.beg + t.len | Seq [] -> failwith "get_end_id" | Seq l -> get_end_id (List.hd (List.rev l)) | Variant(t :: _) -> get_end_id t | _ -> failwith "get_end_id"*) - + (*let rec lemmatize_tokens paths next_id = function Token({token=Dig(v,cat)} as t)-> PrePaths.add_edge paths t.beg next_id t.orth v (parse_postags cat) t.beg t.len | Token({token=Lemma(lemma,interp)} as t) -> PrePaths.add_edge paths t.beg next_id t.orth lemma (parse_postags interp) t.beg t.len | Token({token=Interp lemma} as t) -> PrePaths.add_edge paths t.beg next_id t.orth lemma (parse_postags "interp") t.beg t.len - | Token({token=AllSmall s} as t) -> + | Token({token=AllSmall s} as t) -> Xlist.fold (Inflexion.get_interpretations s) paths (fun paths (lemma,postags) -> PrePaths.add_edge paths t.beg next_id t.orth lemma (parse_postags postags) t.beg t.len) | Token({token=FirstCap(s,s2)} as t) -> (* FIXME: dodać wersję z s2 ; uporządkować słownik; dodać akronimy *) @@ -321,19 +321,19 @@ let rec get_end_id = function | Seq(t :: next :: l) -> lemmatize_tokens (lemmatize_tokens paths (get_beg_id next) t) next_id (Seq(next :: l)) | Seq [] -> failwith "lemmatize_tokens" | Variant l -> Xlist.fold l paths (fun paths t -> lemmatize_tokens paths next_id t)*) - + (*let rec lemmatize_paths_tokens paths = function (* FIXME: uzgodnić postać lematów *) Token({token=Dig(v,cat)} as t)-> PrePaths.add_edge paths t.beg t.next t.orth v (parse_postags cat) t.attrs t.beg t.len - | Token({token=Lemma(lemma,interp)} as t) -> + | Token({token=Lemma(lemma,interp)} as t) -> if Xlist.mem t.attrs "lemmatized as lowercase" || Xlist.mem t.attrs "lemma not validated" then paths else (* FIXME *) PrePaths.add_edge paths t.beg t.next t.orth lemma (parse_postags interp) t.attrs t.beg t.len | Token({token=Interp lemma} as t) -> PrePaths.add_edge paths t.beg t.next t.orth lemma (parse_postags "interp") t.attrs t.beg t.len | Token({token=Proper _} as t) -> failwith "lemmatize_paths_tokens: ni" | Token({token=Compound _} as t) -> failwith "lemmatize_paths_tokens: ni" -(* | Token({token=AllSmall s} as t) -> +(* | Token({token=AllSmall s} as t) -> Xlist.fold (Inflexion.get_interpretations s) paths (fun paths (lemma,postags) -> PrePaths.add_edge paths t.beg t.next t.orth lemma (parse_postags postags) t.beg t.len) - | Token({token=SmallLetter s} as t) -> + | Token({token=SmallLetter s} as t) -> Xlist.fold (Inflexion.get_interpretations s) paths (fun paths (lemma,postags) -> PrePaths.add_edge paths t.beg t.next t.orth lemma (parse_postags postags) t.beg t.len) | Token({token=FirstCap(s,s2)} as t) -> (* FIXME: dodać wersję z s2 ; uporządkować słownik; dodać akronimy *) @@ -345,19 +345,19 @@ let rec get_end_id = function | Token _ -> paths | Seq l -> Xlist.fold l paths (fun paths t -> lemmatize_paths_tokens paths t) | Variant l -> Xlist.fold l paths (fun paths t -> lemmatize_paths_tokens paths t) - + (*let rec lemmatize paths = function t :: next :: l -> lemmatize (lemmatize_tokens paths (get_beg_id next) t) (next :: l) | [t] -> lemmatize_tokens paths (get_end_id t) t | [] -> failwith "lemmatize"*) - + let rec lemmatize_paths paths = function t :: l -> lemmatize_paths (lemmatize_paths_tokens paths t) l | [] -> paths *) - + (* FIXME: dodać 'co do' prep:gen *) - - + + (* Dane do przekazania: - lematy i interpretacje: generowanie typów i termów - orths @@ -365,8 +365,8 @@ let rec lemmatize_paths paths = function - struktura grafu: wyróżniki przy niejednoznaczności - sensy wraz z hiperonimami - <indent> *) - -(* + +(* Ala zjadła kota. Ala subst:sg:nom:f imię -> istota kot subst:sg:nom:m2 pospolita - kot 2 - istota 1 istota żywa 1 zwierzę 1 strunowiec 1 czaszkowiec 1 kręgowiec 1 tetrapod 1 owodniowiec 1 ssak 1 ssak żyworodny 1 łożyskowiec 1 ssak drapieżny 1 kot 1 kot 2 @@ -392,10 +392,10 @@ czas 3 godzina 4 do opisu czasu trwania: jednostka czasu 1: godzina 3, sekunda 2, (minuta 4 - nie podłączona) dzień 2, miesiąc 1, rok 1/2 -*) - -(** - +*) + +(** + (* empty *) let empty = IntMap.empty, 0, 0 @@ -406,11 +406,11 @@ let poss_record_empty = {interp=[]; attrs=[]; proper=[]; senses=[]} (* add *) -let dict_add dict lemma postags attrs beg len = +let dict_add dict lemma postags attrs beg len = if postags = [] then dict else let interps = try StringMap.find dict.lemmas lemma with Not_found -> StringMap.empty in let interps = Xlist.fold postags interps (fun interps (pos,tags) -> - StringMap.add_inc interps pos {poss_record_empty with interp=[tags]; attrs=attrs} (fun l -> + StringMap.add_inc interps pos {poss_record_empty with interp=[tags]; attrs=attrs} (fun l -> {l with interp=tags :: l.interp; attrs=StringSet.to_list (StringSet.union (StringSet.of_list l.attrs) (StringSet.of_list attrs))})) in if dict.dbeg <> beg && dict.dbeg <> -1 then failwith "dict_add" else if dict.dlen <> len && dict.dlen <> -1 then failwith "dict_add" else @@ -424,52 +424,52 @@ let add_simple map i j orth lemma postags attrs beg len = let orths = StringMap.add orths orth dict in let map2 = IntMap.add map2 j orths in IntMap.add map i map2 - + let add_edge (map,last,n) i j orth lemma postags attrs beg len = add_simple map i j orth lemma postags attrs beg len, max j last, max j n - + let rec add_path (map,last,n) i j = function [] -> failwith "add_path" - | [orth,lemma,postags,beg,len] -> + | [orth,lemma,postags,beg,len] -> add_simple map i j orth lemma postags [] beg len, last, n - | (orth,lemma,postags,beg,len) :: l -> + | (orth,lemma,postags,beg,len) :: l -> add_path (add_simple map i (n+1) orth lemma postags [] beg len, last, n+1) (n+1) j l -(* +(* let insert (map,last,n) i j orth dict = let map2 = try IntMap.find map i with Not_found -> IntMap.empty in let orths = try IntMap.find map2 j with Not_found -> StringMap.empty in let orths = StringMap.add orths orth dict in - let map2 = IntMap.add map2 j orths in + let map2 = IntMap.add map2 j orths in IntMap.add map i map2, last, n - + let rec insert_path (map,last,n) i j = function [] -> failwith "add_path" - | [orth,dict] -> + | [orth,dict] -> insert (map,last,n) i j orth dict - | (orth,dict) :: l -> + | (orth,dict) :: l -> insert_path (insert (map,last,n+1) i (n+1) orth dict) (n+1) j l - -let set_sentence_begin (map,last,n) i j orth = + +let set_sentence_begin (map,last,n) i j orth = try let map2 = IntMap.find map i in let orths = IntMap.find map2 j in let dict = StringMap.find orths orth in let orths = StringMap.add orths orth {dict with sentence_begin=true} in let map2 = IntMap.add map2 j orths in - IntMap.add map i map2, last, n + IntMap.add map i map2, last, n with Not_found -> failwith "set_sentence_begin" -let set_sentence_end (map,last,n) i j orth = +let set_sentence_end (map,last,n) i j orth = try let map2 = IntMap.find map i in let orths = IntMap.find map2 j in let dict = StringMap.find orths orth in let orths = StringMap.add orths orth {dict with sentence_end=true} in let map2 = IntMap.add map2 j orths in - IntMap.add map i map2, last, n + IntMap.add map i map2, last, n with Not_found -> failwith "set_sentence_end" -let is_sentence_end (map,last,n) i j orth = +let is_sentence_end (map,last,n) i j orth = try let map2 = IntMap.find map i in let orths = IntMap.find map2 j in @@ -503,7 +503,7 @@ let rec find_paths_bound (map,last,n) k i = let tails = find_paths_bound (map,last,n) (k-1) j in StringMap.fold set paths (fun paths s _ -> Xlist.fold tails paths (fun paths tail -> (s :: tail) :: paths))) - + let rec find_paths_rec (map,last,n) i = if i = last then [[]] else if not (IntMap.mem map i) then failwith "find_paths_rec" else @@ -511,14 +511,14 @@ let rec find_paths_rec (map,last,n) i = let tails = find_paths_rec (map,last,n) j in StringMap.fold set paths (fun paths s _ -> Xlist.fold tails paths (fun paths tail -> (s :: tail) :: paths))) - + let find_paths (map,last,n) = find_paths_rec (map,last,n) 0 -*) -let has_lemma orths = +*) +let has_lemma orths = StringMap.fold orths false (fun b _ dict -> if StringMap.is_empty dict.lemmas then b else true) - + let rec no_possible_path_rec map last i = if last = i then false else let map2 = try IntMap.find map i with Not_found -> IntMap.empty in @@ -526,27 +526,27 @@ let rec no_possible_path_rec map last i = if has_lemma orths then b && no_possible_path_rec map last j else b) - + let no_possible_path (map,last,n) = no_possible_path_rec map last 0 -(* +(* let rec match_path_rec map found i rev = function - [] -> (i :: rev) :: found + [] -> (i :: rev) :: found | s :: l -> let map2 = try IntMap.find map i with Not_found -> IntMap.empty in let found2 = IntMap.fold map2 [] (fun found2 j set -> if StringMap.mem set s then j :: found2 else found2) in Xlist.fold found2 found (fun found j -> match_path_rec map found j (i :: rev) l) - + let match_path (map,last,n) = function [] -> failwith "match_path" - | s :: l -> + | s :: l -> let found = IntMap.fold map [] (fun found i map2 -> IntMap.fold map2 found (fun found j set -> if StringMap.mem set s then (i,j) :: found else found)) in Xlist.fold found [] (fun found (i,j) -> match_path_rec map found j [i] l) -let get_matched orths = function +let get_matched orths = function Orth s -> if StringMap.mem orths s then [s] else [] | Pos s -> (*print_endline ("a1 " ^ s);*) StringSet.to_list (StringMap.fold orths StringSet.empty (fun set orth dict -> StringMap.fold dict.lemmas set (fun set lemma interps -> @@ -554,19 +554,19 @@ let get_matched orths = function (* print_endline ("a2 " ^ pos); *) if s = pos then StringSet.add set orth else set)))) (* | All -> orths *) - + let rec match_path_ex_rec map found i rev = function - [] -> ((i,[]) :: rev) :: found + [] -> ((i,[]) :: rev) :: found | s :: l -> let map2 = try IntMap.find map i with Not_found -> IntMap.empty in let found2 = IntMap.fold map2 [] (fun found2 j orths -> let l = get_matched orths s in if l <> [] then (j,l) :: found2 else found2) in Xlist.fold found2 found (fun found (j,l2) -> match_path_ex_rec map found j ((i,l2) :: rev) l) - + let match_path_ex (map,last,n) = function [] -> failwith "match_path_ex" - | s :: l -> + | s :: l -> let found = IntMap.fold map [] (fun found i map2 -> IntMap.fold map2 found (fun found j orths -> let l = get_matched orths s in @@ -578,49 +578,49 @@ let last_node (_,last,_) = last let set_last_node (map,last,n) new_last = map, new_last, n let find (map,last,n) i = - try + try IntMap.fold (IntMap.find map i) [] (fun found j orths -> StringMap.fold orths found (fun found orth _ -> (i,j,orth) :: found)) with Not_found -> [] let find_full (map,last,n) i = - try + try IntMap.fold (IntMap.find map i) [] (fun found j orths -> StringMap.fold orths found (fun found orth dict -> (i,j,orth,dict) :: found)) with Not_found -> [] *) -let fold (map,last,n) s f = +let fold (map,last,n) s f = IntMap.fold map s (fun s i map2 -> IntMap.fold map2 s (fun s j set -> StringMap.fold set s (fun s orth lemmas -> f s orth i j lemmas))) -(* -let map (map,last,n) f = +(* +let map (map,last,n) f = IntMap.map map (fun map2 -> IntMap.map map2 (fun orths -> StringMap.map orths (fun lemmas -> f lemmas))), last, n - -let mapi (map,last,n) f = + +let mapi (map,last,n) f = IntMap.mapi map (fun i map2 -> IntMap.mapi map2 (fun j orths -> StringMap.mapi orths (fun orth lemmas -> f orth i j lemmas))), last, n - -let get_edges (map,_,_) i j = + +let get_edges (map,_,_) i j = IntMap.find (IntMap.find map i) j - -let get_edges_from (map,_,_) i = + +let get_edges_from (map,_,_) i = IntMap.find map i -*) +*) let rec topological_sort_rec map visited l i = - if IntSet.mem visited i then (l,visited) else + if IntSet.mem visited i then (l,visited) else let l, visited = IntMap.fold (try IntMap.find map i with Not_found -> IntMap.empty) (l,IntSet.add visited i) (fun (l,visited) j _ -> topological_sort_rec map visited l j) in i :: l, visited - + let topological_sort (map,last,n) = let l, _ = topological_sort_rec map IntSet.empty [] 0 in let translation, k = Xlist.fold l (IntMap.empty,0) (fun (translation,k) i -> @@ -630,27 +630,27 @@ let topological_sort (map,last,n) = try IntMap.add map2 (IntMap.find translation j) orths with Not_found -> map2) in try IntMap.add map (IntMap.find translation i) map2 with Not_found -> map) in map, (try IntMap.find translation last with Not_found -> failwith "topological_sort 3"), k-1 - + (*let interp_to_string interp = String.concat " " (Xlist.fold interp.interp [] (fun l tags -> (String.concat ":" (Xlist.map tags (String.concat "."))) :: l)) - + let interps_to_string interps = String.concat " " (StringMap.fold interps [] (fun l pos interp -> (pos ^ "[" ^ interp_to_string interp ^ "]") :: l)) - + let lemmas_to_string lemmas = String.concat " " (StringMap.fold lemmas [] (fun l lemma interps -> (lemma ^ "[" ^ interps_to_string interps ^ "]") :: l)) - -let to_string (map,last,n) = + +let to_string (map,last,n) = let l = IntMap.fold map [] (fun l i map2 -> IntMap.fold map2 l (fun l j orths -> - (Printf.sprintf "%5d %5d %s" i j (String.concat " " (StringMap.fold orths [] (fun l2 orths dict -> + (Printf.sprintf "%5d %5d %s" i j (String.concat " " (StringMap.fold orths [] (fun l2 orths dict -> (Printf.sprintf "%s %5d %5d [%s]" orths dict.dbeg dict.dlen (lemmas_to_string dict.lemmas)) :: l2)))) :: l)) in Printf.sprintf "last=%d n=%d\n %s" last n (String.concat "\n " (List.sort compare l))*) - (* -let make_unique_orths (map,last,n) = + (* +let make_unique_orths (map,last,n) = let names = fold (map,last,n) StringQMap.empty (fun names orth _ _ _ -> StringQMap.add names orth) in let names = StringQMap.fold names StringSet.empty (fun names name n -> @@ -658,10 +658,10 @@ let make_unique_orths (map,last,n) = let map,_ = IntMap.fold map (IntMap.empty,StringMap.empty) (fun (map,used) i map2 -> let map2,used = IntMap.fold map2 (IntMap.empty,used) (fun (map2,used) j orths -> let orths,used = StringMap.fold orths (StringMap.empty,used) (fun (orths,used) orth lemmas -> - let orth,used = + let orth,used = if StringSet.mem names orth then - let n = - try StringMap.find used orth + 1 + let n = + try StringMap.find used orth + 1 with Not_found -> 1 in orth ^ "-" ^ string_of_int n, StringMap.add used orth n else orth,used in @@ -670,6 +670,6 @@ let make_unique_orths (map,last,n) = IntMap.add map i map2, used) in map,last,n -*) - -**) \ No newline at end of file +*) + +**)