Commit 84afa6af17ed75f7b1ffaa90d26a22b8fe209aee
1 parent
ccd5d99e
Uzupełnienia kategorii w leksykonie i wizualizacji
Showing
7 changed files
with
18 additions
and
191 deletions
LCGlexicon/ENIAM_LCGlexicon.ml
@@ -234,6 +234,7 @@ let make_quantification e rules = | @@ -234,6 +234,7 @@ let make_quantification e rules = | ||
234 | let make_node id orth lemma pos syntax weight cat_list is_raised = | 234 | let make_node id orth lemma pos syntax weight cat_list is_raised = |
235 | let attrs = Xlist.fold cat_list [] (fun attrs -> function | 235 | let attrs = Xlist.fold cat_list [] (fun attrs -> function |
236 | | Lemma -> attrs | 236 | | Lemma -> attrs |
237 | + | IncludeLemmata -> attrs | ||
237 | | Pos -> attrs | 238 | | Pos -> attrs |
238 | | Pos2 -> attrs | 239 | | Pos2 -> attrs |
239 | | Cat -> ("CAT",SubstVar "cat") :: attrs | 240 | | Cat -> ("CAT",SubstVar "cat") :: attrs |
LCGlexicon/ENIAM_LCGlexiconTypes.ml
@@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce | @@ -25,7 +25,7 @@ type categories = {lemma: string; pos: string; pos2: string; cat: string; coerce | ||
25 | } | 25 | } |
26 | 26 | ||
27 | type selector = | 27 | type selector = |
28 | - Lemma | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | | 28 | + Lemma | IncludeLemmata | (*NewLemma |*) Pos | Pos2 | Cat | Coerced | Role | SNode | |
29 | Number | Case | Gender | Person | Grad | Praep | | 29 | Number | Case | Gender | Person | Grad | Praep | |
30 | Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem | | 30 | Acm | Aspect | Negation | Mood | Tense | Nsyn | Nsem | Ctype | Mode | Psem | |
31 | Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma | | 31 | Icat | Inumber | Igender | Iperson | Nperson | Ncat | Plemma | |
LCGlexicon/ENIAMcategoriesPL.ml
@@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"] | @@ -29,6 +29,7 @@ let all_persons = ["pri";"sec";"ter"] | ||
29 | 29 | ||
30 | let selector_values = Xlist.fold [ | 30 | let selector_values = Xlist.fold [ |
31 | Lemma, []; | 31 | Lemma, []; |
32 | + IncludeLemmata, []; | ||
32 | Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum"; | 33 | Pos, ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"fixed";"num";"numcomp";"intnum"; |
33 | "realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum"; | 34 | "realnum";"intnum-interval";"realnum-interval";"symbol";"ordnum"; |
34 | "date";"date-interval";"hour-minute";"hour";"hour-minute-interval"; | 35 | "date";"date-interval";"hour-minute";"hour";"hour-minute-interval"; |
@@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [ | @@ -445,6 +446,7 @@ let selector_names = StringSet.of_list [ | ||
445 | 446 | ||
446 | let string_of_selector = function | 447 | let string_of_selector = function |
447 | Lemma -> "lemma" | 448 | Lemma -> "lemma" |
449 | + | IncludeLemmata -> "include-lemmata" | ||
448 | (* | NewLemma -> "newlemma" *) | 450 | (* | NewLemma -> "newlemma" *) |
449 | | Pos -> "pos" | 451 | | Pos -> "pos" |
450 | | Pos2 -> "pos2" | 452 | | Pos2 -> "pos2" |
@@ -494,6 +496,7 @@ let string_of_selectors selectors = | @@ -494,6 +496,7 @@ let string_of_selectors selectors = | ||
494 | 496 | ||
495 | let selector_of_string = function | 497 | let selector_of_string = function |
496 | "lemma" -> Lemma | 498 | "lemma" -> Lemma |
499 | + | "include-lemmata" -> IncludeLemmata | ||
497 | (* | NewLemma -> "newlemma" *) | 500 | (* | NewLemma -> "newlemma" *) |
498 | | "pos" -> Pos | 501 | | "pos" -> Pos |
499 | | "pos2" -> Pos2 | 502 | | "pos2" -> Pos2 |
exec/ENIAMvisualization.ml
@@ -23,10 +23,6 @@ open Printf | @@ -23,10 +23,6 @@ open Printf | ||
23 | open ENIAMtokenizerTypes | 23 | open ENIAMtokenizerTypes |
24 | open ENIAMexecTypes | 24 | open ENIAMexecTypes |
25 | 25 | ||
26 | -type marked = | ||
27 | - Chart of (string * string * string list) list | ||
28 | - | Message of string | ||
29 | - | ||
30 | let string_of_status = function | 26 | let string_of_status = function |
31 | Idle -> "Idle" | 27 | Idle -> "Idle" |
32 | | PreprocessingError -> "PreprocessingError" | 28 | | PreprocessingError -> "PreprocessingError" |
@@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb | @@ -826,7 +822,7 @@ let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<sieb | ||
826 | "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>"; | 822 | "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>"; |
827 | "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>"; | 823 | "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>"; |
828 | "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>"; | 824 | "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>"; |
829 | - "<interp>";"<part>";"<unk>";"<building-number>"] | 825 | + "<interp>";"<part>";"<unk>";"<building-number>";"<html-tag>";"<list-item>";"<numcomp>";"<phone-number>";"<postal-code>"] |
830 | 826 | ||
831 | let cat_tokens_sequence text_fragments g = | 827 | let cat_tokens_sequence text_fragments g = |
832 | let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) -> | 828 | let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) -> |
@@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g = | @@ -866,85 +862,6 @@ let cat_tokens_sequence text_fragments g = | ||
866 | | b :: l -> if a = b then b :: l else a :: b :: l) in*) | 862 | | b :: l -> if a = b then b :: l else a :: b :: l) in*) |
867 | String.concat " " (Xlist.map l (fun (n,texts) -> texts)) | 863 | String.concat " " (Xlist.map l (fun (n,texts) -> texts)) |
868 | 864 | ||
869 | -let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int"; | ||
870 | - "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta"; | ||
871 | - "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia"; | ||
872 | - "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska"; | ||
873 | - "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ]) | ||
874 | - | ||
875 | -let load_colours_of_cats filename = | ||
876 | - File.fold_tab filename StringMap.empty (fun map -> function | ||
877 | - [cat; colour] -> StringMap.add map cat colour | ||
878 | - | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line)) | ||
879 | - | ||
880 | -let colours_of_cats = ref StringMap.empty | ||
881 | - | ||
882 | -let initialize () = | ||
883 | - colours_of_cats := load_colours_of_cats colours_filename | ||
884 | - | ||
885 | -(* let colours_of_cats = Xlist.fold [ | ||
886 | - "ChemCompound","#00ffff"; | ||
887 | - "ChemFunGroup","#ff00cc"; | ||
888 | - "Measure","#ffff00"; | ||
889 | - "Contain","#00ff00"; | ||
890 | - "Number","#0000ff"; | ||
891 | - "Dose","#ff0000"; | ||
892 | - (* "","#"; | ||
893 | - "","#"; *) | ||
894 | - ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *) | ||
895 | - | ||
896 | -let rec merge_cat_chart rev = function | ||
897 | - (i,j,s,x) :: (m,n,t,y) :: l -> | ||
898 | - (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *) | ||
899 | - if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l) | ||
900 | - else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l) | ||
901 | - | l -> List.rev (l @ rev) | ||
902 | - | ||
903 | -let cat_chart text_fragments g = | ||
904 | - (* print_endline "cat_chart 1"; *) | ||
905 | - let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) -> | ||
906 | - (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *) | ||
907 | - (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in | ||
908 | - let a = Array.make (Array.length g) StringSet.empty in | ||
909 | - Xlist.iter l (fun (node1,node2,cat) -> | ||
910 | - if StringSet.mem excluded_cats cat then () else | ||
911 | - Int.iter node1 (node2 - 1) (fun i -> | ||
912 | - a.(i) <- StringSet.add a.(i) cat)); | ||
913 | - let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i -> | ||
914 | - if i >= last then l else | ||
915 | - let cats = List.sort compare (StringSet.to_list a.(i)) in | ||
916 | - (i,i+1,String.concat "|" cats, cats ) :: l)) in | ||
917 | - let l = merge_cat_chart [] l in | ||
918 | - (* print_endline "cat_chart 2"; *) | ||
919 | - List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) -> | ||
920 | - let t = get_text_fragment text_fragments node1 node2 in | ||
921 | - (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *) | ||
922 | - if node1 = node2 then l else | ||
923 | - (t,key,cats) :: l)) | ||
924 | - | ||
925 | -let create_styles ll = | ||
926 | - fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function | ||
927 | - | (_,_,Message l) -> map,n | ||
928 | - | (_,_,Chart l) -> | ||
929 | - Xlist.fold l (map,n) (fun (map,n) (_,key,cats) -> | ||
930 | - if StringMap.mem map key || key = "" then map,n else | ||
931 | - let colours = List.rev (Xlist.rev_map cats (fun cat -> | ||
932 | - try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in | ||
933 | - let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour -> | ||
934 | - (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in | ||
935 | - StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1))) | ||
936 | - | ||
937 | -let render_styles styles = | ||
938 | - "<style type=\"text/css\">\n " ^ | ||
939 | - String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) -> | ||
940 | - (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name | ||
941 | - (String.concat "," colours)) :: l))) ^ | ||
942 | - "</style>" | ||
943 | - | ||
944 | -let assign_style styles (t,key,_) = | ||
945 | - if key = "" then t else | ||
946 | - let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in | ||
947 | - Printf.sprintf "<span class=\"%s\">%s</span>" id t | ||
948 | 865 | ||
949 | (* verbosity: | 866 | (* verbosity: |
950 | 0 -> jedynie informacja o statusie zdania | 867 | 0 -> jedynie informacja o statusie zdania |
@@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function | @@ -1574,99 +1491,3 @@ let rec to_string_text verbosity tokens = function | ||
1574 | RawText s -> [] | 1491 | RawText s -> [] |
1575 | | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) | 1492 | | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) |
1576 | | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) | 1493 | | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) |
1577 | - | ||
1578 | -let rec to_string2_paragraph verbosity tokens = function | ||
1579 | - RawParagraph s -> [] | ||
1580 | - | StructParagraph sentences -> | ||
1581 | - let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in | ||
1582 | - List.rev (Xlist.rev_map l (fun t -> "","",Message t)) | ||
1583 | - | AltParagraph((Name,RawParagraph name) :: l) -> | ||
1584 | - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in | ||
1585 | - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) | ||
1586 | - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) | ||
1587 | - | ErrorParagraph s -> ["","",Message "SubsyntaxError"] | ||
1588 | - | ||
1589 | -let rec to_string2_text verbosity tokens = function | ||
1590 | - RawText s -> [] | ||
1591 | - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens)) | ||
1592 | - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text)) | ||
1593 | - | ||
1594 | -let rec skip_tag = function | ||
1595 | - ">" :: l -> l | ||
1596 | - | s :: l -> skip_tag l | ||
1597 | - | [] -> [] | ||
1598 | - | ||
1599 | -let rec check_name_length_rec n rev = function | ||
1600 | - "<" :: l -> check_name_length_rec n rev (skip_tag l) | ||
1601 | - | [s] -> String.concat "" (List.rev (s :: rev)) | ||
1602 | - | [] -> String.concat "" (List.rev rev) | ||
1603 | - | s :: l -> | ||
1604 | - if n > 1 then check_name_length_rec (n-1) (s :: rev) l | ||
1605 | - else String.concat "" (List.rev ("…" :: rev)) | ||
1606 | - | ||
1607 | -let check_name_length n s = | ||
1608 | - let l = Xunicode.utf8_chars_of_utf8_string s in | ||
1609 | - check_name_length_rec n [] l | ||
1610 | - (* if String.length s > n then | ||
1611 | - String.sub s 0 (n-1) ^ "…" | ||
1612 | - else s *) | ||
1613 | - | ||
1614 | -let to_string2_simplify name_length= function | ||
1615 | - name,_,Message s -> | ||
1616 | - if name_length <= 0 then s | ||
1617 | - else (check_name_length name_length name) ^ "\t" ^ s | ||
1618 | - | _ -> failwith "to_string2_simplify" | ||
1619 | - | ||
1620 | - | ||
1621 | -let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) = | ||
1622 | - let status_string = string_of_status result.status in | ||
1623 | - if result.status = NotParsed then | ||
1624 | - [status_string, Chart(cat_chart result.text_fragments result.chart1)] | ||
1625 | - else [status_string,Message result.msg] | ||
1626 | - | ||
1627 | -let rec marked_string_of_sentence verbosity tokens = function | ||
1628 | - RawSentence s -> [] | ||
1629 | - | StructSentence(paths,last) -> [] | ||
1630 | - | DepSentence paths -> [] | ||
1631 | - | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result | ||
1632 | - | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) | ||
1633 | - | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence)) | ||
1634 | - | ||
1635 | -let rec marked_string_of_paragraph verbosity tokens = function | ||
1636 | - RawParagraph s -> [] | ||
1637 | - | StructParagraph sentences -> | ||
1638 | - let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in | ||
1639 | - List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t)) | ||
1640 | - | AltParagraph((Name,RawParagraph name) :: l) -> | ||
1641 | - let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in | ||
1642 | - List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) | ||
1643 | - | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) | ||
1644 | - | ErrorParagraph s -> ["","SubsyntaxError",Message s] | ||
1645 | - | ||
1646 | -let rec marked_string_of_text verbosity tokens = function | ||
1647 | - RawText s -> [] | ||
1648 | - | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens)) | ||
1649 | - | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text)) | ||
1650 | - | ||
1651 | -let print_html_marked_simple_text path name name_length l = | ||
1652 | - File.file_out (path ^ name ^ ".html") (fun file -> | ||
1653 | - fprintf file "%s\n" html_header; | ||
1654 | - (* print_endline "print_html_marked_text 1"; *) | ||
1655 | - (* print_endline "print_html_marked_text 2"; *) | ||
1656 | - let styles = create_styles l in | ||
1657 | - (* print_endline "print_html_marked_text 3"; *) | ||
1658 | - fprintf file "%s\n" (render_styles styles); | ||
1659 | - if name_length <= 0 then | ||
1660 | - Xlist.iter l (function | ||
1661 | - name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | ||
1662 | - | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | ||
1663 | - | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t)) | ||
1664 | - else ( | ||
1665 | - fprintf file "<TABLE border=1>\n"; | ||
1666 | - Xlist.iter l (function | ||
1667 | - name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD><TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | ||
1668 | - | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | ||
1669 | - | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD><TR>\n" (check_name_length name_length name) status (escape_html t)); | ||
1670 | - fprintf file "</TABLE>\n"); | ||
1671 | - (* print_endline "print_html_marked_text 4"; *) | ||
1672 | - fprintf file "%s\n" html_trailer) |
subsyntax/ENIAMsentences.ml
@@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat = | @@ -156,7 +156,7 @@ let find_tokens_in_chart tokens chart lnode rnode cat = | ||
156 | else found) in | 156 | else found) in |
157 | match found with | 157 | match found with |
158 | [x] -> x | 158 | [x] -> x |
159 | - | [] -> failwith "Unable to extract sentences. Check puntuation." | 159 | + | [] -> failwith "Unable to extract sentences. Check punctuation." |
160 | | _ -> failwith "find_tokens_in_chart" | 160 | | _ -> failwith "find_tokens_in_chart" |
161 | 161 | ||
162 | (*let find_tokens_in_chart_id tokens chart lnode rnode cat = | 162 | (*let find_tokens_in_chart_id tokens chart lnode rnode cat = |
subsyntax/ENIAMsubsyntax.ml
@@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = | @@ -348,15 +348,15 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = | ||
348 | (* print_endline ("parse_text_tokens: " ^ query); *) | 348 | (* print_endline ("parse_text_tokens: " ^ query); *) |
349 | let paragraphs = Xstring.split "\n\\|\r" query in | 349 | let paragraphs = Xstring.split "\n\\|\r" query in |
350 | let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in | 350 | let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in |
351 | + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph -> | ||
352 | + if par_names_flag then | ||
353 | + match Xstring.split "\t" paragraph with | ||
354 | + [name; paragraph] -> name, paragraph | ||
355 | + | _ -> failwith ("parse_text_tokens: " ^ paragraph) | ||
356 | + else "", paragraph)) in | ||
351 | let n = if Xlist.size paragraphs = 1 then 0 else 1 in | 357 | let n = if Xlist.size paragraphs = 1 then 0 else 1 in |
352 | - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> | 358 | + let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,paragraph) -> |
353 | try | 359 | try |
354 | - let name, paragraph = | ||
355 | - if par_names_flag then | ||
356 | - match Xstring.split "\t" paragraph with | ||
357 | - [name; paragraph] -> name, paragraph | ||
358 | - | _ -> failwith ("parse_text_tokens: " ^ paragraph) | ||
359 | - else "", paragraph in | ||
360 | (* print_endline paragraph; *) | 360 | (* print_endline paragraph; *) |
361 | let paths = parse paragraph in | 361 | let paths = parse paragraph in |
362 | (* print_endline "parse_text 1"; *) | 362 | (* print_endline "parse_text 1"; *) |
@@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = | @@ -367,7 +367,8 @@ let parse_text_tokens sentence_split_flag par_names_flag tokens query = | ||
367 | (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ | 367 | (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ |
368 | [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1 | 368 | [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1 |
369 | with e -> | 369 | with e -> |
370 | - (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in | 370 | + (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ |
371 | + [Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in | ||
371 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens | 372 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens |
372 | 373 | ||
373 | let parse_text sentence_split_flag par_names_flag query = | 374 | let parse_text sentence_split_flag par_names_flag query = |
tokenizer/ENIAMtokens.ml
@@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function | @@ -1056,7 +1056,8 @@ let rec recognize_sign_group poss_s_beg i = function | ||
1056 | | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") | 1056 | | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") |
1057 | | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") | 1057 | | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") |
1058 | | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") | 1058 | | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") |
1059 | - | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®") | 1059 | + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (make_lemma ("®","symbol")) |
1060 | + | (Sign "™") :: l -> create_sign_token poss_s_beg i [Sign "™"] l (make_lemma ("™","symbol")) | ||
1060 | | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | 1061 | | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1061 | | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | 1062 | | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1062 | | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†") | 1063 | | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†") |