Commit 84afa6af17ed75f7b1ffaa90d26a22b8fe209aee
1 parent
ccd5d99e
Uzupełnienia kategorii w leksykonie i wizualizacji
Showing
7 changed files
with
18 additions
and
191 deletions
LCGlexicon/ENIAM_LCGlexicon.ml
... | ... | @@ -234,6 +234,7 @@ let make_quantification e rules = |
234 | 234 | let make_node id orth lemma pos syntax weight cat_list is_raised = |
235 | 235 | let attrs = Xlist.fold cat_list [] (fun attrs -> function |
236 | 236 | | Lemma -> attrs |
237 | + | IncludeLemmata -> attrs | |
237 | 238 | | Pos -> attrs |
238 | 239 | | Pos2 -> attrs |
239 | 240 | | Cat -> ("CAT",SubstVar "cat") :: attrs |
... | ... |
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... | ... | @@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce |
25 | 25 | } |
26 | 26 | |
27 | 27 | type selector = |
28 | - Lemma | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | | |
28 | + Lemma | IncludeLemmata | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | | |
29 | 29 | Number | Case | Gender | Person | Grad | Praep | |
30 | 30 | Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem | |
31 | 31 | Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma | |
... | ... |
LCGlexicon/ENIAMcategoriesPL.ml
... | ... | @@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"] |
29 | 29 | |
30 | 30 | let selector_values = Xlist.fold [ |
31 | 31 | Lemma, []; |
32 | + IncludeLemmata, []; | |
32 | 33 | Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum"; |
33 | 34 | "realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum"; |
34 | 35 | "date";"date-interval";"hour-minute";"hour";"hour-minute-interval"; |
... | ... | @@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [ |
445 | 446 | |
446 | 447 | let string_of_selector = function |
447 | 448 | Lemma -> "lemma" |
449 | + | IncludeLemmata -> "include-lemmata" | |
448 | 450 | (* | NewLemma -> "newlemma" *) |
449 | 451 | | Pos -> "pos" |
450 | 452 | | Pos2 -> "pos2" |
... | ... | @@ -494,6 +496,7 @@ let string_of_selectors selectors = |
494 | 496 | |
495 | 497 | let selector_of_string = function |
496 | 498 | "lemma" -> Lemma |
499 | + | "include-lemmata" -> IncludeLemmata | |
497 | 500 | (* | NewLemma -> "newlemma" *) |
498 | 501 | | "pos" -> Pos |
499 | 502 | | "pos2" -> Pos2 |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -23,10 +23,6 @@ open Printf |
23 | 23 | open ENIAMtokenizerTypes |
24 | 24 | open ENIAMexecTypes |
25 | 25 | |
26 | -type marked = | |
27 | - Chart of (string * string * string list) list | |
28 | - | Message of string | |
29 | - | |
30 | 26 | let string_of_status = function |
31 | 27 | Idle -> "Idle" |
32 | 28 | | PreprocessingError -> "PreprocessingError" |
... | ... | @@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb |
826 | 822 | "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>"; |
827 | 823 | "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>"; |
828 | 824 | "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>"; |
829 | - "<interp>";"<part>";"<unk>";"<building-number>"] | |
825 | + "<interp>";"<part>";"<unk>";"<building-number>";"<html-tag>";"<list-item>";"<numcomp>";"<phone-number>";"<postal-code>"] | |
830 | 826 | |
831 | 827 | let cat_tokens_sequence text_fragments g = |
832 | 828 | let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) -> |
... | ... | @@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g = |
866 | 862 | | b :: l -> if a = b then b :: l else a :: b :: l) in*) |
867 | 863 | String.concat " " (Xlist.map l (fun (n,texts) -> texts)) |
868 | 864 | |
869 | -let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int"; | |
870 | - "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta"; | |
871 | - "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia"; | |
872 | - "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska"; | |
873 | - "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ]) | |
874 | - | |
875 | -let load_colours_of_cats filename = | |
876 | - File.fold_tab filename StringMap.empty (fun map -> function | |
877 | - [cat; colour] -> StringMap.add map cat colour | |
878 | - | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line)) | |
879 | - | |
880 | -let colours_of_cats = ref StringMap.empty | |
881 | - | |
882 | -let initialize () = | |
883 | - colours_of_cats := load_colours_of_cats colours_filename | |
884 | - | |
885 | -(* let colours_of_cats = Xlist.fold [ | |
886 | - "ChemCompound","#00ffff"; | |
887 | - "ChemFunGroup","#ff00cc"; | |
888 | - "Measure","#ffff00"; | |
889 | - "Contain","#00ff00"; | |
890 | - "Number","#0000ff"; | |
891 | - "Dose","#ff0000"; | |
892 | - (* "","#"; | |
893 | - "","#"; *) | |
894 | - ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *) | |
895 | - | |
896 | -let rec merge_cat_chart rev = function | |
897 | - (i,j,s,x) :: (m,n,t,y) :: l -> | |
898 | - (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *) | |
899 | - if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l) | |
900 | - else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l) | |
901 | - | l -> List.rev (l @ rev) | |
902 | - | |
903 | -let cat_chart text_fragments g = | |
904 | - (* print_endline "cat_chart 1"; *) | |
905 | - let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) -> | |
906 | - (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *) | |
907 | - (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in | |
908 | - let a = Array.make (Array.length g) StringSet.empty in | |
909 | - Xlist.iter l (fun (node1,node2,cat) -> | |
910 | - if StringSet.mem excluded_cats cat then () else | |
911 | - Int.iter node1 (node2 - 1) (fun i -> | |
912 | - a.(i) <- StringSet.add a.(i) cat)); | |
913 | - let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i -> | |
914 | - if i >= last then l else | |
915 | - let cats = List.sort compare (StringSet.to_list a.(i)) in | |
916 | - (i,i+1,String.concat "|" cats, cats ) :: l)) in | |
917 | - let l = merge_cat_chart [] l in | |
918 | - (* print_endline "cat_chart 2"; *) | |
919 | - List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) -> | |
920 | - let t = get_text_fragment text_fragments node1 node2 in | |
921 | - (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *) | |
922 | - if node1 = node2 then l else | |
923 | - (t,key,cats) :: l)) | |
924 | - | |
925 | -let create_styles ll = | |
926 | - fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function | |
927 | - | (_,_,Message l) -> map,n | |
928 | - | (_,_,Chart l) -> | |
929 | - Xlist.fold l (map,n) (fun (map,n) (_,key,cats) -> | |
930 | - if StringMap.mem map key || key = "" then map,n else | |
931 | - let colours = List.rev (Xlist.rev_map cats (fun cat -> | |
932 | - try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in | |
933 | - let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour -> | |
934 | - (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in | |
935 | - StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1))) | |
936 | - | |
937 | -let render_styles styles = | |
938 | - "<style type=\"text/css\">\n " ^ | |
939 | - String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) -> | |
940 | - (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name | |
941 | - (String.concat "," colours)) :: l))) ^ | |
942 | - "</style>" | |
943 | - | |
944 | -let assign_style styles (t,key,_) = | |
945 | - if key = "" then t else | |
946 | - let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in | |
947 | - Printf.sprintf "<span class=\"%s\">%s</span>" id t | |
948 | 865 | |
949 | 866 | (* verbosity: |
950 | 867 | 0 -> jedynie informacja o statusie zdania |
... | ... | @@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function |
1574 | 1491 | RawText s -> [] |
1575 | 1492 | | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) |
1576 | 1493 | | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) |
1577 | - | |
1578 | -let rec to_string2_paragraph verbosity tokens = function | |
1579 | - RawParagraph s -> [] | |
1580 | - | StructParagraph sentences -> | |
1581 | - let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in | |
1582 | - List.rev (Xlist.rev_map l (fun t -> "","",Message t)) | |
1583 | - | AltParagraph((Name,RawParagraph name) :: l) -> | |
1584 | - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in | |
1585 | - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) | |
1586 | - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) | |
1587 | - | ErrorParagraph s -> ["","",Message "SubsyntaxError"] | |
1588 | - | |
1589 | -let rec to_string2_text verbosity tokens = function | |
1590 | - RawText s -> [] | |
1591 | - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens)) | |
1592 | - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text)) | |
1593 | - | |
1594 | -let rec skip_tag = function | |
1595 | - ">" :: l -> l | |
1596 | - | s :: l -> skip_tag l | |
1597 | - | [] -> [] | |
1598 | - | |
1599 | -let rec check_name_length_rec n rev = function | |
1600 | - "<" :: l -> check_name_length_rec n rev (skip_tag l) | |
1601 | - | [s] -> String.concat "" (List.rev (s :: rev)) | |
1602 | - | [] -> String.concat "" (List.rev rev) | |
1603 | - | s :: l -> | |
1604 | - if n > 1 then check_name_length_rec (n-1) (s :: rev) l | |
1605 | - else String.concat "" (List.rev ("…" :: rev)) | |
1606 | - | |
1607 | -let check_name_length n s = | |
1608 | - let l = Xunicode.utf8_chars_of_utf8_string s in | |
1609 | - check_name_length_rec n [] l | |
1610 | - (* if String.length s > n then | |
1611 | - String.sub s 0 (n-1) ^ "…" | |
1612 | - else s *) | |
1613 | - | |
1614 | -let to_string2_simplify name_length= function | |
1615 | - name,_,Message s -> | |
1616 | - if name_length <= 0 then s | |
1617 | - else (check_name_length name_length name) ^ "\t" ^ s | |
1618 | - | _ -> failwith "to_string2_simplify" | |
1619 | - | |
1620 | - | |
1621 | -let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) = | |
1622 | - let status_string = string_of_status result.status in | |
1623 | - if result.status = NotParsed then | |
1624 | - [status_string, Chart(cat_chart result.text_fragments result.chart1)] | |
1625 | - else [status_string,Message result.msg] | |
1626 | - | |
1627 | -let rec marked_string_of_sentence verbosity tokens = function | |
1628 | - RawSentence s -> [] | |
1629 | - | StructSentence(paths,last) -> [] | |
1630 | - | DepSentence paths -> [] | |
1631 | - | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result | |
1632 | - | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) | |
1633 | - | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence)) | |
1634 | - | |
1635 | -let rec marked_string_of_paragraph verbosity tokens = function | |
1636 | - RawParagraph s -> [] | |
1637 | - | StructParagraph sentences -> | |
1638 | - let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in | |
1639 | - List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t)) | |
1640 | - | AltParagraph((Name,RawParagraph name) :: l) -> | |
1641 | - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in | |
1642 | - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) | |
1643 | - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) | |
1644 | - | ErrorParagraph s -> ["","SubsyntaxError",Message s] | |
1645 | - | |
1646 | -let rec marked_string_of_text verbosity tokens = function | |
1647 | - RawText s -> [] | |
1648 | - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens)) | |
1649 | - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text)) | |
1650 | - | |
1651 | -let print_html_marked_simple_text path name name_length l = | |
1652 | - File.file_out (path ^ name ^ ".html") (fun file -> | |
1653 | - fprintf file "%s\n" html_header; | |
1654 | - (* print_endline "print_html_marked_text 1"; *) | |
1655 | - (* print_endline "print_html_marked_text 2"; *) | |
1656 | - let styles = create_styles l in | |
1657 | - (* print_endline "print_html_marked_text 3"; *) | |
1658 | - fprintf file "%s\n" (render_styles styles); | |
1659 | - if name_length <= 0 then | |
1660 | - Xlist.iter l (function | |
1661 | - name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1662 | - | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1663 | - | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t)) | |
1664 | - else ( | |
1665 | - fprintf file "<TABLE border=1>\n"; | |
1666 | - Xlist.iter l (function | |
1667 | - name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD><TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1668 | - | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1669 | - | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (escape_html t)); | |
1670 | - fprintf file "</TABLE>\n"); | |
1671 | - (* print_endline "print_html_marked_text 4"; *) | |
1672 | - fprintf file "%s\n" html_trailer) | |
... | ... |
subsyntax/ENIAMsentences.ml
... | ... | @@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat = |
156 | 156 | else found) in |
157 | 157 | match found with |
158 | 158 | [x] -> x |
159 | - | [] -> failwith "Unable to extract sentences. Check puntuation." | |
159 | + | [] -> failwith "Unable to extract sentences. Check punctuation." | |
160 | 160 | | _ -> failwith "find_tokens_in_chart" |
161 | 161 | |
162 | 162 | (*let find_tokens_in_chart_id tokens chart lnode rnode cat = |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = |
348 | 348 | (* print_endline ("parse_text_tokens: " ^ query); *) |
349 | 349 | let paragraphs = Xstring.split "\n\\|\r" query in |
350 | 350 | let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in |
351 | + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph -> | |
352 | + if par_names_flag then | |
353 | + match Xstring.split "\t" paragraph with | |
354 | + [name; paragraph] -> name, paragraph | |
355 | + | _ -> failwith ("parse_text_tokens: " ^ paragraph) | |
356 | + else "", paragraph)) in | |
351 | 357 | let n = if Xlist.size paragraphs = 1 then 0 else 1 in |
352 | - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> | |
358 | + let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,paragraph) -> | |
353 | 359 | try |
354 | - let name, paragraph = | |
355 | - if par_names_flag then | |
356 | - match Xstring.split "\t" paragraph with | |
357 | - [name; paragraph] -> name, paragraph | |
358 | - | _ -> failwith ("parse_text_tokens: " ^ paragraph) | |
359 | - else "", paragraph in | |
360 | 360 | (* print_endline paragraph; *) |
361 | 361 | let paths = parse paragraph in |
362 | 362 | (* print_endline "parse_text 1"; *) |
... | ... | @@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = |
367 | 367 | (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ |
368 | 368 | [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1 |
369 | 369 | with e -> |
370 | - (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in | |
370 | + (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ | |
371 | + [Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in | |
371 | 372 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens |
372 | 373 | |
373 | 374 | let parse_text sentence_split_flag par_names_flag query = |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function |
1056 | 1056 | | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") |
1057 | 1057 | | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") |
1058 | 1058 | | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") |
1059 | - | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®") | |
1059 | + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (make_lemma ("®","symbol")) | |
1060 | + | (Sign "™") :: l -> create_sign_token poss_s_beg i [Sign "™"] l (make_lemma ("™","symbol")) | |
1060 | 1061 | | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1061 | 1062 | | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1062 | 1063 | | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†") |
... | ... |