Commit 84afa6af17ed75f7b1ffaa90d26a22b8fe209aee

Authored by Wojciech Jaworski
1 parent ccd5d99e

Uzupełnienia kategorii w leksykonie i wizualizacji

LCGlexicon/ENIAM_LCGlexicon.ml
... ... @@ -234,6 +234,7 @@ let make_quantification e rules =
234 234 let make_node id orth lemma pos syntax weight cat_list is_raised =
235 235 let attrs = Xlist.fold cat_list [] (fun attrs -> function
236 236 | Lemma -> attrs
  237 + | IncludeLemmata -> attrs
237 238 | Pos -> attrs
238 239 | Pos2 -> attrs
239 240 | Cat -> ("CAT",SubstVar "cat") :: attrs
... ...
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce
25 25 }
26 26  
27 27 type selector =
28   - Lemma | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode |
  28 + Lemma | IncludeLemmata | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode |
29 29 Number | Case | Gender | Person | Grad | Praep |
30 30 Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem |
31 31 Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma |
... ...
LCGlexicon/ENIAMcategoriesPL.ml
... ... @@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"]
29 29  
30 30 let selector_values = Xlist.fold [
31 31 Lemma, [];
  32 + IncludeLemmata, [];
32 33 Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum";
33 34 "realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum";
34 35 "date";"date-interval";"hour-minute";"hour";"hour-minute-interval";
... ... @@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [
445 446  
446 447 let string_of_selector = function
447 448 Lemma -> "lemma"
  449 + | IncludeLemmata -> "include-lemmata"
448 450 (* | NewLemma -> "newlemma" *)
449 451 | Pos -> "pos"
450 452 | Pos2 -> "pos2"
... ... @@ -494,6 +496,7 @@ let string_of_selectors selectors =
494 496  
495 497 let selector_of_string = function
496 498 "lemma" -> Lemma
  499 + | "include-lemmata" -> IncludeLemmata
497 500 (* | NewLemma -> "newlemma" *)
498 501 | "pos" -> Pos
499 502 | "pos2" -> Pos2
... ...
exec/ENIAMvisualization.ml
... ... @@ -23,10 +23,6 @@ open Printf
23 23 open ENIAMtokenizerTypes
24 24 open ENIAMexecTypes
25 25  
26   -type marked =
27   - Chart of (string * string * string list) list
28   - | Message of string
29   -
30 26 let string_of_status = function
31 27 Idle -> "Idle"
32 28 | PreprocessingError -> "PreprocessingError"
... ... @@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb
826 822 "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>";
827 823 "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>";
828 824 "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>";
829   - "<interp>";"<part>";"<unk>";"<building-number>"]
  825 + "<interp>";"<part>";"<unk>";"<building-number>";"<html-tag>";"<list-item>";"<numcomp>";"<phone-number>";"<postal-code>"]
830 826  
831 827 let cat_tokens_sequence text_fragments g =
832 828 let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) ->
... ... @@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g =
866 862 | b :: l -> if a = b then b :: l else a :: b :: l) in*)
867 863 String.concat " " (Xlist.map l (fun (n,texts) -> texts))
868 864  
869   -let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int";
870   - "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta";
871   - "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia";
872   - "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska";
873   - "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ])
874   -
875   -let load_colours_of_cats filename =
876   - File.fold_tab filename StringMap.empty (fun map -> function
877   - [cat; colour] -> StringMap.add map cat colour
878   - | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line))
879   -
880   -let colours_of_cats = ref StringMap.empty
881   -
882   -let initialize () =
883   - colours_of_cats := load_colours_of_cats colours_filename
884   -
885   -(* let colours_of_cats = Xlist.fold [
886   - "ChemCompound","#00ffff";
887   - "ChemFunGroup","#ff00cc";
888   - "Measure","#ffff00";
889   - "Contain","#00ff00";
890   - "Number","#0000ff";
891   - "Dose","#ff0000";
892   - (* "","#";
893   - "","#"; *)
894   - ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *)
895   -
896   -let rec merge_cat_chart rev = function
897   - (i,j,s,x) :: (m,n,t,y) :: l ->
898   - (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *)
899   - if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l)
900   - else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l)
901   - | l -> List.rev (l @ rev)
902   -
903   -let cat_chart text_fragments g =
904   - (* print_endline "cat_chart 1"; *)
905   - let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) ->
906   - (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *)
907   - (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in
908   - let a = Array.make (Array.length g) StringSet.empty in
909   - Xlist.iter l (fun (node1,node2,cat) ->
910   - if StringSet.mem excluded_cats cat then () else
911   - Int.iter node1 (node2 - 1) (fun i ->
912   - a.(i) <- StringSet.add a.(i) cat));
913   - let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i ->
914   - if i >= last then l else
915   - let cats = List.sort compare (StringSet.to_list a.(i)) in
916   - (i,i+1,String.concat "|" cats, cats ) :: l)) in
917   - let l = merge_cat_chart [] l in
918   - (* print_endline "cat_chart 2"; *)
919   - List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) ->
920   - let t = get_text_fragment text_fragments node1 node2 in
921   - (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *)
922   - if node1 = node2 then l else
923   - (t,key,cats) :: l))
924   -
925   -let create_styles ll =
926   - fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function
927   - | (_,_,Message l) -> map,n
928   - | (_,_,Chart l) ->
929   - Xlist.fold l (map,n) (fun (map,n) (_,key,cats) ->
930   - if StringMap.mem map key || key = "" then map,n else
931   - let colours = List.rev (Xlist.rev_map cats (fun cat ->
932   - try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in
933   - let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour ->
934   - (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in
935   - StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1)))
936   -
937   -let render_styles styles =
938   - "<style type=\"text/css\">\n " ^
939   - String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) ->
940   - (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name
941   - (String.concat "," colours)) :: l))) ^
942   - "</style>"
943   -
944   -let assign_style styles (t,key,_) =
945   - if key = "" then t else
946   - let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in
947   - Printf.sprintf "<span class=\"%s\">%s</span>" id t
948 865  
949 866 (* verbosity:
950 867 0 -> jedynie informacja o statusie zdania
... ... @@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function
1574 1491 RawText s -> []
1575 1492 | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens))
1576 1493 | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text))
1577   -
1578   -let rec to_string2_paragraph verbosity tokens = function
1579   - RawParagraph s -> []
1580   - | StructParagraph sentences ->
1581   - let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in
1582   - List.rev (Xlist.rev_map l (fun t -> "","",Message t))
1583   - | AltParagraph((Name,RawParagraph name) :: l) ->
1584   - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in
1585   - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t))
1586   - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph))
1587   - | ErrorParagraph s -> ["","",Message "SubsyntaxError"]
1588   -
1589   -let rec to_string2_text verbosity tokens = function
1590   - RawText s -> []
1591   - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens))
1592   - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text))
1593   -
1594   -let rec skip_tag = function
1595   - ">" :: l -> l
1596   - | s :: l -> skip_tag l
1597   - | [] -> []
1598   -
1599   -let rec check_name_length_rec n rev = function
1600   - "<" :: l -> check_name_length_rec n rev (skip_tag l)
1601   - | [s] -> String.concat "" (List.rev (s :: rev))
1602   - | [] -> String.concat "" (List.rev rev)
1603   - | s :: l ->
1604   - if n > 1 then check_name_length_rec (n-1) (s :: rev) l
1605   - else String.concat "" (List.rev ("…" :: rev))
1606   -
1607   -let check_name_length n s =
1608   - let l = Xunicode.utf8_chars_of_utf8_string s in
1609   - check_name_length_rec n [] l
1610   - (* if String.length s > n then
1611   - String.sub s 0 (n-1) ^ "…"
1612   - else s *)
1613   -
1614   -let to_string2_simplify name_length= function
1615   - name,_,Message s ->
1616   - if name_length <= 0 then s
1617   - else (check_name_length name_length name) ^ "\t" ^ s
1618   - | _ -> failwith "to_string2_simplify"
1619   -
1620   -
1621   -let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) =
1622   - let status_string = string_of_status result.status in
1623   - if result.status = NotParsed then
1624   - [status_string, Chart(cat_chart result.text_fragments result.chart1)]
1625   - else [status_string,Message result.msg]
1626   -
1627   -let rec marked_string_of_sentence verbosity tokens = function
1628   - RawSentence s -> []
1629   - | StructSentence(paths,last) -> []
1630   - | DepSentence paths -> []
1631   - | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result
1632   - | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence))
1633   - | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence))
1634   -
1635   -let rec marked_string_of_paragraph verbosity tokens = function
1636   - RawParagraph s -> []
1637   - | StructParagraph sentences ->
1638   - let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in
1639   - List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t))
1640   - | AltParagraph((Name,RawParagraph name) :: l) ->
1641   - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in
1642   - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t))
1643   - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph))
1644   - | ErrorParagraph s -> ["","SubsyntaxError",Message s]
1645   -
1646   -let rec marked_string_of_text verbosity tokens = function
1647   - RawText s -> []
1648   - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens))
1649   - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text))
1650   -
1651   -let print_html_marked_simple_text path name name_length l =
1652   - File.file_out (path ^ name ^ ".html") (fun file ->
1653   - fprintf file "%s\n" html_header;
1654   - (* print_endline "print_html_marked_text 1"; *)
1655   - (* print_endline "print_html_marked_text 2"; *)
1656   - let styles = create_styles l in
1657   - (* print_endline "print_html_marked_text 3"; *)
1658   - fprintf file "%s\n" (render_styles styles);
1659   - if name_length <= 0 then
1660   - Xlist.iter l (function
1661   - name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
1662   - | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
1663   - | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t))
1664   - else (
1665   - fprintf file "<TABLE border=1>\n";
1666   - Xlist.iter l (function
1667   - name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD><TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
1668   - | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles))));
1669   - | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (escape_html t));
1670   - fprintf file "</TABLE>\n");
1671   - (* print_endline "print_html_marked_text 4"; *)
1672   - fprintf file "%s\n" html_trailer)
... ...
subsyntax/ENIAMsentences.ml
... ... @@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
156 156 else found) in
157 157 match found with
158 158 [x] -> x
159   - | [] -> failwith "Unable to extract sentences. Check puntuation."
  159 + | [] -> failwith "Unable to extract sentences. Check punctuation."
160 160 | _ -> failwith "find_tokens_in_chart"
161 161  
162 162 (*let find_tokens_in_chart_id tokens chart lnode rnode cat =
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query =
348 348 (* print_endline ("parse_text_tokens: " ^ query); *)
349 349 let paragraphs = Xstring.split "\n\\|\r" query in
350 350 let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
  351 + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph ->
  352 + if par_names_flag then
  353 + match Xstring.split "\t" paragraph with
  354 + [name; paragraph] -> name, paragraph
  355 + | _ -> failwith ("parse_text_tokens: " ^ paragraph)
  356 + else "", paragraph)) in
351 357 let n = if Xlist.size paragraphs = 1 then 0 else 1 in
352   - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
  358 + let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,paragraph) ->
353 359 try
354   - let name, paragraph =
355   - if par_names_flag then
356   - match Xstring.split "\t" paragraph with
357   - [name; paragraph] -> name, paragraph
358   - | _ -> failwith ("parse_text_tokens: " ^ paragraph)
359   - else "", paragraph in
360 360 (* print_endline paragraph; *)
361 361 let paths = parse paragraph in
362 362 (* print_endline "parse_text 1"; *)
... ... @@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query =
367 367 (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @
368 368 [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1
369 369 with e ->
370   - (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in
  370 + (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @
  371 + [Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in
371 372 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
372 373  
373 374 let parse_text sentence_split_flag par_names_flag query =
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function
1056 1056 | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
1057 1057 | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
1058 1058 | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
1059   - | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®")
  1059 + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (make_lemma ("®","symbol"))
  1060 + | (Sign "™") :: l -> create_sign_token poss_s_beg i [Sign "™"] l (make_lemma ("™","symbol"))
1060 1061 | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
1061 1062 | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
1062 1063 | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†")
... ...