From 84afa6af17ed75f7b1ffaa90d26a22b8fe209aee Mon Sep 17 00:00:00 2001
From: Wojciech Jaworski <wjaworski@mimuw.edu.pl>
Date: Fri, 23 Mar 2018 12:17:47 +0100
Subject: [PATCH] Uzupełnienia kategorii w leksykonie i wizualizacji
---
LCGlexicon/ENIAM_LCGlexicon.ml | 1 +
LCGlexicon/ENIAM_LCGlexiconTypes.ml | 2 +-
LCGlexicon/ENIAMcategoriesPL.ml | 3 +++
exec/ENIAMvisualization.ml | 181 +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
subsyntax/ENIAMsentences.ml | 2 +-
subsyntax/ENIAMsubsyntax.ml | 17 +++++++++--------
tokenizer/ENIAMtokens.ml | 3 ++-
7 files changed, 18 insertions(+), 191 deletions(-)
diff --git a/LCGlexicon/ENIAM_LCGlexicon.ml b/LCGlexicon/ENIAM_LCGlexicon.ml
index abdd428..d658be3 100644
--- a/LCGlexicon/ENIAM_LCGlexicon.ml
+++ b/LCGlexicon/ENIAM_LCGlexicon.ml
@@ -234,6 +234,7 @@ let make_quantification e rules =
let make_node id orth lemma pos syntax weight cat_list is_raised =
let attrs = Xlist.fold cat_list [] (fun attrs -> function
| Lemma -> attrs
+ | IncludeLemmata -> attrs
| Pos -> attrs
| Pos2 -> attrs
| Cat -> ("CAT",SubstVar "cat") :: attrs
diff --git a/LCGlexicon/ENIAM_LCGlexiconTypes.ml b/LCGlexicon/ENIAM_LCGlexiconTypes.ml
index ed57378..6e84f31 100644
--- a/LCGlexicon/ENIAM_LCGlexiconTypes.ml
+++ b/LCGlexicon/ENIAM_LCGlexiconTypes.ml
@@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce
}
type selector =
- Lemma | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode |
+ Lemma | IncludeLemmata | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode |
Number | Case | Gender | Person | Grad | Praep |
Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem |
Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma |
diff --git a/LCGlexicon/ENIAMcategoriesPL.ml b/LCGlexicon/ENIAMcategoriesPL.ml
index 1b4eb2f..ea5675b 100644
--- a/LCGlexicon/ENIAMcategoriesPL.ml
+++ b/LCGlexicon/ENIAMcategoriesPL.ml
@@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"]
let selector_values = Xlist.fold [
Lemma, [];
+ IncludeLemmata, [];
Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum";
"realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum";
"date";"date-interval";"hour-minute";"hour";"hour-minute-interval";
@@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [
let string_of_selector = function
Lemma -> "lemma"
+ | IncludeLemmata -> "include-lemmata"
(* | NewLemma -> "newlemma" *)
| Pos -> "pos"
| Pos2 -> "pos2"
@@ -494,6 +496,7 @@ let string_of_selectors selectors =
let selector_of_string = function
"lemma" -> Lemma
+ | "include-lemmata" -> IncludeLemmata
(* | NewLemma -> "newlemma" *)
| "pos" -> Pos
| "pos2" -> Pos2
diff --git a/exec/ENIAMvisualization.ml b/exec/ENIAMvisualization.ml
index 60d5d68..a59290c 100644
--- a/exec/ENIAMvisualization.ml
+++ b/exec/ENIAMvisualization.ml
@@ -23,10 +23,6 @@ open Printf
open ENIAMtokenizerTypes
open ENIAMexecTypes
-type marked =
- Chart of (string * string * string list) list
- | Message of string
-
let string_of_status = function
Idle -> "Idle"
| PreprocessingError -> "PreprocessingError"
@@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb
"<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>";
"<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>";
"<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>";
- "<interp>";"<part>";"<unk>";"<building-number>"]
+ "<interp>";"<part>";"<unk>";"<building-number>";"<html-tag>";"<list-item>";"<numcomp>";"<phone-number>";"<postal-code>"]
let cat_tokens_sequence text_fragments g =
let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) ->
@@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g =
| b :: l -> if a = b then b :: l else a :: b :: l) in*)
String.concat " " (Xlist.map l (fun (n,texts) -> texts))
-let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int";
- "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta";
- "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia";
- "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska";
- "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ])
-
-let load_colours_of_cats filename =
- File.fold_tab filename StringMap.empty (fun map -> function
- [cat; colour] -> StringMap.add map cat colour
- | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line))
-
-let colours_of_cats = ref StringMap.empty
-
-let initialize () =
- colours_of_cats := load_colours_of_cats colours_filename
-
-(* let colours_of_cats = Xlist.fold [
- "ChemCompound","#00ffff";
- "ChemFunGroup","#ff00cc";
- "Measure","#ffff00";
- "Contain","#00ff00";
- "Number","#0000ff";
- "Dose","#ff0000";
- (* "","#";
- "","#"; *)
- ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *)
-
-let rec merge_cat_chart rev = function
- (i,j,s,x) :: (m,n,t,y) :: l ->
- (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *)
- if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l)
- else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l)
- | l -> List.rev (l @ rev)
-
-let cat_chart text_fragments g =
- (* print_endline "cat_chart 1"; *)
- let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) ->
- (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *)
- (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in
- let a = Array.make (Array.length g) StringSet.empty in
- Xlist.iter l (fun (node1,node2,cat) ->
- if StringSet.mem excluded_cats cat then () else
- Int.iter node1 (node2 - 1) (fun i ->
- a.(i) <- StringSet.add a.(i) cat));
- let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i ->
- if i >= last then l else
- let cats = List.sort compare (StringSet.to_list a.(i)) in
- (i,i+1,String.concat "|" cats, cats ) :: l)) in
- let l = merge_cat_chart [] l in
- (* print_endline "cat_chart 2"; *)
- List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) ->
- let t = get_text_fragment text_fragments node1 node2 in
- (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *)
- if node1 = node2 then l else
- (t,key,cats) :: l))
-
-let create_styles ll =
- fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function
- | (_,_,Message l) -> map,n
- | (_,_,Chart l) ->
- Xlist.fold l (map,n) (fun (map,n) (_,key,cats) ->
- if StringMap.mem map key || key = "" then map,n else
- let colours = List.rev (Xlist.rev_map cats (fun cat ->
- try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in
- let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour ->
- (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in
- StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1)))
-
-let render_styles styles =
- "<style type=\"text/css\">\n " ^
- String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) ->
- (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name
- (String.concat "," colours)) :: l))) ^
- "</style>"
-
-let assign_style styles (t,key,_) =
- if key = "" then t else
- let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in
- Printf.sprintf "<span class=\"%s\">%s</span>" id t
(* verbosity:
0 -> jedynie informacja o statusie zdania
@@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function
RawText s -> []
| StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens))
| AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text))
-
-let rec to_string2_paragraph verbosity tokens = function
- RawParagraph s -> []
- | StructParagraph sentences ->
- let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in
- List.rev (Xlist.rev_map l (fun t -> "","",Message t))
- | AltParagraph((Name,RawParagraph name) :: l) ->
- let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in
- List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t))
- | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph))
- | ErrorParagraph s -> ["","",Message "SubsyntaxError"]
-
-let rec to_string2_text verbosity tokens = function
- RawText s -> []
- | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens))
- | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text))
-
-let rec skip_tag = function
- ">" :: l -> l
- | s :: l -> skip_tag l
- | [] -> []
-
-let rec check_name_length_rec n rev = function
- "<" :: l -> check_name_length_rec n rev (skip_tag l)
- | [s] -> String.concat "" (List.rev (s :: rev))
- | [] -> String.concat "" (List.rev rev)
- | s :: l ->
- if n > 1 then check_name_length_rec (n-1) (s :: rev) l
- else String.concat "" (List.rev ("…" :: rev))
-
-let check_name_length n s =
- let l = Xunicode.utf8_chars_of_utf8_string s in
- check_name_length_rec n [] l
- (* if String.length s > n then
- String.sub s 0 (n-1) ^ "…"
- else s *)
-
-let to_string2_simplify name_length= function
- name,_,Message s ->
- if name_length <= 0 then s
- else (check_name_length name_length name) ^ "\t" ^ s
- | _ -> failwith "to_string2_simplify"
-
-
-let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) =
- let status_string = string_of_status result.status in
- if result.status = NotParsed then
- [status_string, Chart(cat_chart result.text_fragments result.chart1)]
- else [status_string,Message result.msg]
-
-let rec marked_string_of_sentence verbosity tokens = function
- RawSentence s -> []
- | StructSentence(paths,last) -> []
- | DepSentence paths -> []
- | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result
- | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence))
- | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence))
-
-let rec marked_string_of_paragraph verbosity tokens = function
- RawParagraph s -> []
- | StructParagraph sentences ->
- let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in
- List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t))
- | AltParagraph((Name,RawParagraph name) :: l) ->
- let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in
- List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t))
- | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph))
- | ErrorParagraph s -> ["","SubsyntaxError",Message s]
-
-let rec marked_string_of_text verbosity tokens = function
- RawText s -> []
- | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens))
- | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text))
-
-let print_html_marked_simple_text path name name_length l =
- File.file_out (path ^ name ^ ".html") (fun file ->
- fprintf file "%s\n" html_header;
- (* print_endline "print_html_marked_text 1"; *)
- (* print_endline "print_html_marked_text 2"; *)
- let styles = create_styles l in
- (* print_endline "print_html_marked_text 3"; *)
- fprintf file "%s\n" (render_styles styles);
- if name_length <= 0 then
- Xlist.iter l (function
- name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
- | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
- | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t))
- else (
- fprintf file "<TABLE border=1>\n";
- Xlist.iter l (function
- name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD><TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
- | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
- | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (escape_html t));
- fprintf file "</TABLE>\n");
- (* print_endline "print_html_marked_text 4"; *)
- fprintf file "%s\n" html_trailer)
diff --git a/subsyntax/ENIAMsentences.ml b/subsyntax/ENIAMsentences.ml
index 04eca9a..c3bf4dc 100644
--- a/subsyntax/ENIAMsentences.ml
+++ b/subsyntax/ENIAMsentences.ml
@@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
else found) in
match found with
[x] -> x
- | [] -> failwith "Unable to extract sentences. Check puntuation."
+ | [] -> failwith "Unable to extract sentences. Check punctuation."
| _ -> failwith "find_tokens_in_chart"
(*let find_tokens_in_chart_id tokens chart lnode rnode cat =
diff --git a/subsyntax/ENIAMsubsyntax.ml b/subsyntax/ENIAMsubsyntax.ml
index de59f39..79d3db0 100644
--- a/subsyntax/ENIAMsubsyntax.ml
+++ b/subsyntax/ENIAMsubsyntax.ml
@@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query =
(* print_endline ("parse_text_tokens: " ^ query); *)
let paragraphs = Xstring.split "\n\\|\r" query in
let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
+ let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph ->
+ if par_names_flag then
+ match Xstring.split "\t" paragraph with
+ [name; paragraph] -> name, paragraph
+ | _ -> failwith ("parse_text_tokens: " ^ paragraph)
+ else "", paragraph)) in
let n = if Xlist.size paragraphs = 1 then 0 else 1 in
- let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
+ let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,paragraph) ->
try
- let name, paragraph =
- if par_names_flag then
- match Xstring.split "\t" paragraph with
- [name; paragraph] -> name, paragraph
- | _ -> failwith ("parse_text_tokens: " ^ paragraph)
- else "", paragraph in
(* print_endline paragraph; *)
let paths = parse paragraph in
(* print_endline "parse_text 1"; *)
@@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query =
(AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @
[Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1
with e ->
- (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in
+ (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @
+ [Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in
AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
let parse_text sentence_split_flag par_names_flag query =
diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml
index b958b5e..73e8f5c 100644
--- a/tokenizer/ENIAMtokens.ml
+++ b/tokenizer/ENIAMtokens.ml
@@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function
| (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
| (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
| (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
- | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®")
+ | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (make_lemma ("®","symbol"))
+ | (Sign "™") :: l -> create_sign_token poss_s_beg i [Sign "™"] l (make_lemma ("™","symbol"))
| (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
| (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
| (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†")
--
libgit2 0.22.2