diff --git a/LCGlexicon/ENIAM_LCGlexicon.ml b/LCGlexicon/ENIAM_LCGlexicon.ml index abdd428..d658be3 100644 --- a/LCGlexicon/ENIAM_LCGlexicon.ml +++ b/LCGlexicon/ENIAM_LCGlexicon.ml @@ -234,6 +234,7 @@ let make_quantification e rules = let make_node id orth lemma pos syntax weight cat_list is_raised = let attrs = Xlist.fold cat_list [] (fun attrs -> function | Lemma -> attrs + | IncludeLemmata -> attrs | Pos -> attrs | Pos2 -> attrs | Cat -> ("CAT",SubstVar "cat") :: attrs diff --git a/LCGlexicon/ENIAM_LCGlexiconTypes.ml b/LCGlexicon/ENIAM_LCGlexiconTypes.ml index ed57378..6e84f31 100644 --- a/LCGlexicon/ENIAM_LCGlexiconTypes.ml +++ b/LCGlexicon/ENIAM_LCGlexiconTypes.ml @@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce } type selector = - Lemma | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | + Lemma | IncludeLemmata | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | Number | Case | Gender | Person | Grad | Praep | Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem | Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma | diff --git a/LCGlexicon/ENIAMcategoriesPL.ml b/LCGlexicon/ENIAMcategoriesPL.ml index 1b4eb2f..ea5675b 100644 --- a/LCGlexicon/ENIAMcategoriesPL.ml +++ b/LCGlexicon/ENIAMcategoriesPL.ml @@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"] let selector_values = Xlist.fold [ Lemma, []; + IncludeLemmata, []; Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum"; "realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum"; "date";"date-interval";"hour-minute";"hour";"hour-minute-interval"; @@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [ let string_of_selector = function Lemma -> "lemma" + | IncludeLemmata -> "include-lemmata" (* | NewLemma -> "newlemma" *) | Pos -> "pos" | Pos2 -> "pos2" @@ -494,6 +496,7 @@ let string_of_selectors selectors = let selector_of_string = function "lemma" -> Lemma + | "include-lemmata" -> IncludeLemmata (* | NewLemma -> "newlemma" *) | "pos" -> Pos | "pos2" -> Pos2 diff --git a/exec/ENIAMvisualization.ml b/exec/ENIAMvisualization.ml index 60d5d68..a59290c 100644 --- a/exec/ENIAMvisualization.ml +++ b/exec/ENIAMvisualization.ml @@ -23,10 +23,6 @@ open Printf open ENIAMtokenizerTypes open ENIAMexecTypes -type marked = - Chart of (string * string * string list) list - | Message of string - let string_of_status = function Idle -> "Idle" | PreprocessingError -> "PreprocessingError" @@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>"; "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>"; "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>"; - "<interp>";"<part>";"<unk>";"<building-number>"] + "<interp>";"<part>";"<unk>";"<building-number>";"<html-tag>";"<list-item>";"<numcomp>";"<phone-number>";"<postal-code>"] let cat_tokens_sequence text_fragments g = let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) -> @@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g = | b :: l -> if a = b then b :: l else a :: b :: l) in*) String.concat " " (Xlist.map l (fun (n,texts) -> texts)) -let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int"; - "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta"; - "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia"; - "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska"; - "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ]) - -let load_colours_of_cats filename = - File.fold_tab filename StringMap.empty (fun map -> function - [cat; colour] -> StringMap.add map cat colour - | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line)) - -let colours_of_cats = ref StringMap.empty - -let initialize () = - colours_of_cats := load_colours_of_cats colours_filename - -(* let colours_of_cats = Xlist.fold [ - "ChemCompound","#00ffff"; - "ChemFunGroup","#ff00cc"; - "Measure","#ffff00"; - "Contain","#00ff00"; - "Number","#0000ff"; - "Dose","#ff0000"; - (* "","#"; - "","#"; *) - ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *) - -let rec merge_cat_chart rev = function - (i,j,s,x) :: (m,n,t,y) :: l -> - (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *) - if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l) - else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l) - | l -> List.rev (l @ rev) - -let cat_chart text_fragments g = - (* print_endline "cat_chart 1"; *) - let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) -> - (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *) - (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in - let a = Array.make (Array.length g) StringSet.empty in - Xlist.iter l (fun (node1,node2,cat) -> - if StringSet.mem excluded_cats cat then () else - Int.iter node1 (node2 - 1) (fun i -> - a.(i) <- StringSet.add a.(i) cat)); - let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i -> - if i >= last then l else - let cats = List.sort compare (StringSet.to_list a.(i)) in - (i,i+1,String.concat "|" cats, cats ) :: l)) in - let l = merge_cat_chart [] l in - (* print_endline "cat_chart 2"; *) - List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) -> - let t = get_text_fragment text_fragments node1 node2 in - (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *) - if node1 = node2 then l else - (t,key,cats) :: l)) - -let create_styles ll = - fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function - | (_,_,Message l) -> map,n - | (_,_,Chart l) -> - Xlist.fold l (map,n) (fun (map,n) (_,key,cats) -> - if StringMap.mem map key || key = "" then map,n else - let colours = List.rev (Xlist.rev_map cats (fun cat -> - try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in - let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour -> - (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in - StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1))) - -let render_styles styles = - "<style type=\"text/css\">\n " ^ - String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) -> - (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name - (String.concat "," colours)) :: l))) ^ - "</style>" - -let assign_style styles (t,key,_) = - if key = "" then t else - let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in - Printf.sprintf "<span class=\"%s\">%s</span>" id t (* verbosity: 0 -> jedynie informacja o statusie zdania @@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function RawText s -> [] | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) - -let rec to_string2_paragraph verbosity tokens = function - RawParagraph s -> [] - | StructParagraph sentences -> - let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in - List.rev (Xlist.rev_map l (fun t -> "","",Message t)) - | AltParagraph((Name,RawParagraph name) :: l) -> - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) - | ErrorParagraph s -> ["","",Message "SubsyntaxError"] - -let rec to_string2_text verbosity tokens = function - RawText s -> [] - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens)) - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text)) - -let rec skip_tag = function - ">" :: l -> l - | s :: l -> skip_tag l - | [] -> [] - -let rec check_name_length_rec n rev = function - "<" :: l -> check_name_length_rec n rev (skip_tag l) - | [s] -> String.concat "" (List.rev (s :: rev)) - | [] -> String.concat "" (List.rev rev) - | s :: l -> - if n > 1 then check_name_length_rec (n-1) (s :: rev) l - else String.concat "" (List.rev ("…" :: rev)) - -let check_name_length n s = - let l = Xunicode.utf8_chars_of_utf8_string s in - check_name_length_rec n [] l - (* if String.length s > n then - String.sub s 0 (n-1) ^ "…" - else s *) - -let to_string2_simplify name_length= function - name,_,Message s -> - if name_length <= 0 then s - else (check_name_length name_length name) ^ "\t" ^ s - | _ -> failwith "to_string2_simplify" - - -let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) = - let status_string = string_of_status result.status in - if result.status = NotParsed then - [status_string, Chart(cat_chart result.text_fragments result.chart1)] - else [status_string,Message result.msg] - -let rec marked_string_of_sentence verbosity tokens = function - RawSentence s -> [] - | StructSentence(paths,last) -> [] - | DepSentence paths -> [] - | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result - | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) - | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence)) - -let rec marked_string_of_paragraph verbosity tokens = function - RawParagraph s -> [] - | StructParagraph sentences -> - let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in - List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t)) - | AltParagraph((Name,RawParagraph name) :: l) -> - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) - | ErrorParagraph s -> ["","SubsyntaxError",Message s] - -let rec marked_string_of_text verbosity tokens = function - RawText s -> [] - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens)) - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text)) - -let print_html_marked_simple_text path name name_length l = - File.file_out (path ^ name ^ ".html") (fun file -> - fprintf file "%s\n" html_header; - (* print_endline "print_html_marked_text 1"; *) - (* print_endline "print_html_marked_text 2"; *) - let styles = create_styles l in - (* print_endline "print_html_marked_text 3"; *) - fprintf file "%s\n" (render_styles styles); - if name_length <= 0 then - Xlist.iter l (function - name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); - | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); - | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t)) - else ( - fprintf file "<TABLE border=1>\n"; - Xlist.iter l (function - name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD><TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); - | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); - | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (escape_html t)); - fprintf file "</TABLE>\n"); - (* print_endline "print_html_marked_text 4"; *) - fprintf file "%s\n" html_trailer) diff --git a/subsyntax/ENIAMsentences.ml b/subsyntax/ENIAMsentences.ml index 04eca9a..c3bf4dc 100644 --- a/subsyntax/ENIAMsentences.ml +++ b/subsyntax/ENIAMsentences.ml @@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat = else found) in match found with [x] -> x - | [] -> failwith "Unable to extract sentences. Check puntuation." + | [] -> failwith "Unable to extract sentences. Check punctuation." | _ -> failwith "find_tokens_in_chart" (*let find_tokens_in_chart_id tokens chart lnode rnode cat = diff --git a/subsyntax/ENIAMsubsyntax.ml b/subsyntax/ENIAMsubsyntax.ml index de59f39..79d3db0 100644 --- a/subsyntax/ENIAMsubsyntax.ml +++ b/subsyntax/ENIAMsubsyntax.ml @@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = (* print_endline ("parse_text_tokens: " ^ query); *) let paragraphs = Xstring.split "\n\\|\r" query in let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph -> + if par_names_flag then + match Xstring.split "\t" paragraph with + [name; paragraph] -> name, paragraph + | _ -> failwith ("parse_text_tokens: " ^ paragraph) + else "", paragraph)) in let n = if Xlist.size paragraphs = 1 then 0 else 1 in - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> + let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,paragraph) -> try - let name, paragraph = - if par_names_flag then - match Xstring.split "\t" paragraph with - [name; paragraph] -> name, paragraph - | _ -> failwith ("parse_text_tokens: " ^ paragraph) - else "", paragraph in (* print_endline paragraph; *) let paths = parse paragraph in (* print_endline "parse_text 1"; *) @@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1 with e -> - (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in + (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ + [Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens let parse_text sentence_split_flag par_names_flag query = diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml index b958b5e..73e8f5c 100644 --- a/tokenizer/ENIAMtokens.ml +++ b/tokenizer/ENIAMtokens.ml @@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") - | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®") + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (make_lemma ("®","symbol")) + | (Sign "™") :: l -> create_sign_token poss_s_beg i [Sign "™"] l (make_lemma ("™","symbol")) | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†")