Commit ccd5d99ec39b0f9792a1e9c077e3592be5713299
1 parent
9d5402d4
nazwy akapitów
Showing
8 changed files
with
226 additions
and
12 deletions
exec/ENIAMexec.ml
... | ... | @@ -34,6 +34,7 @@ let translate_mode = function |
34 | 34 | | ENIAMsubsyntaxTypes.Swigra -> Swigra |
35 | 35 | | ENIAMsubsyntaxTypes.POLFIE -> POLFIE |
36 | 36 | | ENIAMsubsyntaxTypes.Error -> Error |
37 | + | ENIAMsubsyntaxTypes.Name -> Name | |
37 | 38 | |
38 | 39 | let rec translate_sentence = function |
39 | 40 | ENIAMsubsyntaxTypes.RawSentence s -> RawSentence s |
... | ... | @@ -139,10 +140,27 @@ let create_text_fragments tokens paths last = |
139 | 140 | text_fragments.(i) <- map); |
140 | 141 | text_fragments |
141 | 142 | |
143 | +(*let create_beg_positions tokens paths last = | |
144 | + let beg_positions = Array.make last (-1) in | |
145 | + Xlist.iter paths (fun (id,lnode,rnode) -> | |
146 | + let t = ExtArray.get tokens id in | |
147 | + beg_positions.(lnode) <- t.ENIAMtokenizerTypes.beg); | |
148 | + beg_positions | |
149 | + | |
150 | +let create_end_positions tokens paths last = | |
151 | + let end_positions = Array.make last (-1) in | |
152 | + Xlist.iter paths (fun (id,lnode,rnode) -> | |
153 | + let t = ExtArray.get tokens id in | |
154 | + end_positions.(rnode) <- t.ENIAMtokenizerTypes.beg + t.ENIAMtokenizerTypes.len); | |
155 | + end_positions*) | |
156 | + | |
142 | 157 | let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = |
143 | 158 | ENIAM_LCGreductions.reset_variant_label (); |
144 | 159 | let result = {empty_eniam_parse_result with paths_size = Xlist.size paths} in |
145 | - let result = if verbosity = 0 then result else {result with text_fragments=create_text_fragments tokens paths last} in | |
160 | + let result = if verbosity = 0 then result else {result with | |
161 | + text_fragments=create_text_fragments tokens paths last; | |
162 | + (*beg_positions=create_beg_positions tokens paths last; | |
163 | + end_positions=create_end_positions tokens paths last;*)} in | |
146 | 164 | let time1 = time_fun () in |
147 | 165 | try |
148 | 166 | (* print_endline "eniam_parse_sentence 1"; *) |
... | ... |
exec/ENIAMexecTypes.ml
... | ... | @@ -78,7 +78,7 @@ type semantic_processing_result = { |
78 | 78 | } |
79 | 79 | *) |
80 | 80 | type mode = |
81 | - Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | Error | |
81 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | Error | Name | |
82 | 82 | |
83 | 83 | type sentence = |
84 | 84 | RawSentence of string |
... | ... | @@ -309,6 +309,7 @@ let rec fold_text mode s f = function |
309 | 309 | fold_text mode s f text) |
310 | 310 | |
311 | 311 | let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic" (* path of the LCG lexicon file below ENIAM_LCGlexiconTypes.resource_path *) |
312 | +let colours_filename = ENIAMwalTypes.data_path ^ "/colours.tab" (* path of the category-to-colour table below ENIAMwalTypes.data_path; ENIAMvisualization.initialize reads it *) | |
312 | 313 | |
313 | 314 | let lcg_rules = ref ([] : (int * (ENIAM_LCGtypes.linear_term ExtArray.t -> |
314 | 315 | (ENIAM_LCGtypes.SymbolMap.key * ENIAM_LCGtypes.linear_term) list -> |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -23,6 +23,10 @@ open Printf |
23 | 23 | open ENIAMtokenizerTypes |
24 | 24 | open ENIAMexecTypes |
25 | 25 | |
26 | +type marked = (* payload attached to one rendered sentence result *) | |
27 | + Chart of (string * string * string list) list (* (text fragment, style key, categories) triples, the shape cat_chart returns *) | |
28 | + | Message of string (* plain text, e.g. the msg field of a parse result *) | |
29 | + | |
26 | 30 | let string_of_status = function |
27 | 31 | Idle -> "Idle" |
28 | 32 | | PreprocessingError -> "PreprocessingError" |
... | ... | @@ -668,6 +672,7 @@ let string_of_mode = function |
668 | 672 | | Swigra -> "Swigra" |
669 | 673 | | POLFIE -> "POLFIE" |
670 | 674 | | Error -> "Error" |
675 | + | Name -> "Name" | |
671 | 676 | (* |
672 | 677 | (*let rec string_of_sentence = function |
673 | 678 | RawSentence s -> sprintf "RawSentence(%s)" s |
... | ... | @@ -811,7 +816,7 @@ let rec extract_pos_cat vars = function |
811 | 816 | |
812 | 817 | let get_text_fragment text_fragments node1 node2 = |
813 | 818 | try IntMap.find text_fragments.(node1) node2 |
814 | - with Not_found -> "???"(*failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2)*) | |
819 | + with (*Not_found*)_ -> "???"(*failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2)*) | |
815 | 820 | |
816 | 821 | let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<siebie>";"<prep>"; |
817 | 822 | "<num>";"<intnum>";"<realnum>";"<intnum-interval>";"<realnum-interval>";"<symbol>";"<ordnum>"; |
... | ... | @@ -861,6 +866,86 @@ let cat_tokens_sequence text_fragments g = |
861 | 866 | | b :: l -> if a = b then b :: l else a :: b :: l) in*) |
862 | 867 | String.concat " " (Xlist.map l (fun (n,texts) -> texts)) |
863 | 868 | |
869 | +let excluded_cats = StringSet.union omited (StringSet.of_list ["0";"Prep";"s";"BracketSet";"<root>";"by";"nie";"się";"jak";"int"; | |
870 | + "wieś";"ulica";"osada leśna";"część miejscowości";"astr.";"przysiółek";"nazwisko";"część miasta"; | |
871 | + "imię";"geograficzna";"pseudonim";"gmina wiejska";"osada";"firma";"język programowania";"kolonia"; | |
872 | + "instytucja";"gmina miejska";"miasto";"pora roku";"miesiąc";"krój pisma";"gmina miejsko-wiejska"; | |
873 | + "obszar wiejski";"powiat";"organizacja";"dzielnica";"własna";"marka";"przydomek";"hour-minute";"inicjał"; ]) (* categories that cat_chart ignores: everything in omited plus the names listed here *) | |
874 | + | |
875 | +let load_colours_of_cats filename = (* Builds a category -> colour map from the file at filename. *) | |
876 | + File.fold_tab filename StringMap.empty (fun map -> function (* NOTE(review): File.fold_tab is defined elsewhere; it appears to pass each row as a list of tab-separated fields - confirm *) | |
877 | + [cat; colour] -> StringMap.add map cat colour (* a row with exactly two fields binds cat to colour *) | |
878 | + | line -> failwith ("load_colours_of_cats: " ^ String.concat "\t" line)) (* any other row aborts the load; the message echoes the row *) | |
879 | + | |
880 | +let colours_of_cats = ref StringMap.empty (* category -> colour table read by create_styles; stays empty until initialize runs *) | |
881 | + | |
882 | +let initialize () = (* Fills colours_of_cats from the file named by colours_filename. *) | |
883 | + colours_of_cats := load_colours_of_cats colours_filename | |
884 | + | |
885 | +(* let colours_of_cats = Xlist.fold [ | |
886 | + "ChemCompound","#00ffff"; | |
887 | + "ChemFunGroup","#ff00cc"; | |
888 | + "Measure","#ffff00"; | |
889 | + "Contain","#00ff00"; | |
890 | + "Number","#0000ff"; | |
891 | + "Dose","#ff0000"; | |
892 | + (* "","#"; | |
893 | + "","#"; *) | |
894 | + ] StringMap.empty (fun map (cat,colour) -> StringMap.add map cat colour) *) | |
895 | + | |
896 | +let rec merge_cat_chart rev = function (* Fuses runs of neighbouring entries that share a key; rev collects finished entries in reverse order. *) | |
897 | + (i,j,s,x) :: (m,n,t,y) :: l -> (* entries are (start, end, key, payload) as built in cat_chart *) | |
898 | + (* printf "i=%d j=%d s=%s m=%d n=%d t=%s\n%!" i j s m n t; *) | |
899 | + if j=m && s=t then merge_cat_chart rev ((i,n,t,y) :: l) (* contiguous and same key: widen the second entry to start at i; its payload y is the one kept *) | |
900 | + else merge_cat_chart ((i,j,s,x) :: rev) ((m,n,t,y) :: l) (* otherwise the first entry is final *) | |
901 | + | l -> List.rev (l @ rev) (* l holds at most one entry here; putting it in front of rev and reversing restores input order *) | |
902 | + | |
903 | +let cat_chart text_fragments g = (* Produces (text, key, cats) triples telling which categories cover each stretch of the chart g. *) | |
904 | + (* print_endline "cat_chart 1"; *) | |
905 | + let l,last = ENIAM_LCGchart.fold g ([],0) (fun (l,last) (symbol,node1,node2,sem,layer) -> (* for every item ENIAM_LCGchart.fold visits: record its span and category, and track the largest node2 *) | |
906 | + (* printf "node1=%d node2=%d symbol=%s\n" node1 node2 (ENIAM_LCGstringOf.grammar_symbol 0 symbol); *) | |
907 | + (node1,node2,extract_pos_cat [] symbol) :: l, max node2 last) in | |
908 | + let a = Array.make (Array.length g) StringSet.empty in (* a.(i) = set of categories recorded for position i *) | |
909 | + Xlist.iter l (fun (node1,node2,cat) -> | |
910 | + if StringSet.mem excluded_cats cat then () else (* excluded categories contribute nothing *) | |
911 | + Int.iter node1 (node2 - 1) (fun i -> (* NOTE(review): Int.iter is defined elsewhere; presumably it visits node1 .. node2-1 inclusive - confirm *) | |
912 | + a.(i) <- StringSet.add a.(i) cat)); | |
913 | + let l = List.rev (Int.fold 0 (Array.length g - 1) [] (fun l i -> | |
914 | + if i >= last then l else (* positions at or beyond last are dropped *) | |
915 | + let cats = List.sort compare (StringSet.to_list a.(i)) in (* sorted before joining into the key *) | |
916 | + (i,i+1,String.concat "|" cats, cats ) :: l)) in (* one unit-width entry per position; the key is the categories joined with a vertical bar *) | |
917 | + let l = merge_cat_chart [] l in (* fuse neighbouring positions that have the same key *) | |
918 | + (* print_endline "cat_chart 2"; *) | |
919 | + List.rev (Xlist.fold l [] (fun l (node1,node2,key,cats) -> | |
920 | + let t = get_text_fragment text_fragments node1 node2 in (* get_text_fragment yields the ??? placeholder when the lookup fails *) | |
921 | + (* if t = "???" then printf "node1=%d node2=%d key=%s cats=[%s]\n%!" node1 node2 key (String.concat ";" cats); *) | |
922 | + if node1 = node2 then l else (* zero-width spans are dropped *) | |
923 | + (t,key,cats) :: l)) | |
924 | + | |
925 | +let create_styles ll = (* Gives every distinct non-empty chart key in ll a CSS class name and gradient colour stops; returns a map key -> (class name, stops). *) | |
926 | + fst (Xlist.fold ll (StringMap.empty,1) (fun (map,n) -> function (* n is the numeric suffix for the next class name, starting at 1 *) | |
927 | + | (_,_,Message l) -> map,n (* messages need no style *) | |
928 | + | (_,_,Chart l) -> | |
929 | + Xlist.fold l (map,n) (fun (map,n) (_,key,cats) -> | |
930 | + if StringMap.mem map key || key = "" then map,n else (* keys already seen, and the empty key, get no new entry *) | |
931 | + let colours = List.rev (Xlist.rev_map cats (fun cat -> | |
932 | + try StringMap.find !colours_of_cats cat with Not_found -> print_endline ("create_styles: unknown cat " ^ cat); "#ffffff")) in (* categories missing from colours_of_cats are reported on stdout and drawn white *) | |
933 | + let colours,_ = Xlist.fold colours ([],0) (fun (colours,i) colour -> | |
934 | + (Printf.sprintf "%s %dpx,%s %dpx" colour (i*7) colour ((i+1)*7)) :: colours, i+1) in (* the i-th category occupies the band from i*7 px to (i+1)*7 px *) | |
935 | + StringMap.add map key ("B" ^ string_of_int n,List.rev colours),n+1))) (* class names are B1, B2, ... in order of first appearance *) | |
936 | + | |
937 | +let render_styles styles = (* Renders the map built by create_styles as an HTML style element, one CSS rule per class. *) | |
938 | + "<style type=\"text/css\">\n " ^ | |
939 | + String.concat "\n " (List.rev (StringMap.fold styles [] (fun l _ (name,colours) -> (* the map key itself is not needed, only the class name and its stops *) | |
940 | + (Printf.sprintf ".%s { background-image:repeating-linear-gradient(-45deg,%s); }" name | |
941 | + (String.concat "," colours)) :: l))) ^ (* the stops of one class are joined with commas *) | |
942 | + "</style>" | |
943 | + | |
944 | +let assign_style styles (t,key,_) = (* Wraps the text t in a span carrying the class assigned to key. *) | |
945 | + if key = "" then t else (* an empty key is emitted as bare text, without a span *) | |
946 | + let id,_ = try StringMap.find styles key with Not_found -> failwith ("assign_style: " ^ key) in (* fails if styles has no entry for key *) | |
947 | + Printf.sprintf "<span class=\"%s\">%s</span>" id t | |
948 | + | |
864 | 949 | (* verbosity: |
865 | 950 | 0 -> jedynie informacja o statusie zdania |
866 | 951 | 1 -> zawartość struktur danych istotnych dla uzyskanego statusu |
... | ... | @@ -1271,6 +1356,7 @@ let file_prefix_of_mode = function |
1271 | 1356 | | Swigra -> "S" |
1272 | 1357 | | POLFIE -> "P" |
1273 | 1358 | | Error -> "Er" |
1359 | + | Name -> "N" | |
1274 | 1360 | |
1275 | 1361 | let rec html_of_sentence path file_prefix mode img verbosity tokens = function |
1276 | 1362 | RawSentence s -> escape_html s |
... | ... | @@ -1488,3 +1574,99 @@ let rec to_string_text verbosity tokens = function |
1488 | 1574 | RawText s -> [] |
1489 | 1575 | | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) |
1490 | 1576 | | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) |
1577 | + | |
1578 | +let rec to_string2_paragraph verbosity tokens = function (* Flattens a paragraph into (name, status, marked) triples; here the status is always empty and the payload always a Message. *) | |
1579 | + RawParagraph s -> [] (* raw text yields nothing *) | |
1580 | + | StructParagraph sentences -> | |
1581 | + let l = List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) in (* to_string_sentence is defined elsewhere in this file; each string it returns becomes one Message *) | |
1582 | + List.rev (Xlist.rev_map l (fun t -> "","",Message t)) | |
1583 | + | AltParagraph((Name,RawParagraph name) :: l) -> (* a leading Name alternative supplies the paragraph name *) | |
1584 | + let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) in | |
1585 | + List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) (* the name replaces the first component of every triple from the remaining alternatives *) | |
1586 | + | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string2_paragraph verbosity tokens paragraph)) | |
1587 | + | ErrorParagraph s -> ["","",Message "SubsyntaxError"] (* the error text s itself is not included *) | |
1588 | + | |
1589 | +let rec to_string2_text verbosity tokens = function (* Applies to_string2_paragraph to every paragraph of a text and concatenates the results. *) | |
1590 | + RawText s -> [] (* raw text yields nothing *) | |
1591 | + | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string2_paragraph verbosity tokens)) | |
1592 | + | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string2_text verbosity tokens text)) (* results of all alternatives are concatenated *) | |
1593 | + | |
1594 | +let rec skip_tag = function (* Drops list elements up to and including the first > element. *) | |
1595 | + ">" :: l -> l (* found the closing bracket: the rest is returned *) | |
1596 | + | s :: l -> skip_tag l | |
1597 | + | [] -> [] (* no closing bracket: nothing is left *) | |
1598 | + | |
1599 | +let rec check_name_length_rec n rev = function (* Rebuilds a string from a list of characters, leaving out markup and cutting it off with an ellipsis; n is the remaining budget, rev the kept characters in reverse. *) | |
1600 | + "<" :: l -> check_name_length_rec n rev (skip_tag l) (* everything from < through the next > is left out and uses no budget *) | |
1601 | + | [s] -> String.concat "" (List.rev (s :: rev)) (* a final single character is always kept *) | |
1602 | + | [] -> String.concat "" (List.rev rev) | |
1603 | + | s :: l -> | |
1604 | + if n > 1 then check_name_length_rec (n-1) (s :: rev) l (* keep s and spend one unit of budget *) | |
1605 | + else String.concat "" (List.rev ("…" :: rev)) (* budget used up with at least two characters left: they are replaced by an ellipsis *) | |
1606 | + | |
1607 | +let check_name_length n s = (* Shortens s with check_name_length_rec, using n as the length budget. *) | |
1608 | + let l = Xunicode.utf8_chars_of_utf8_string s in (* NOTE(review): helper defined elsewhere; presumably splits s into one string per UTF-8 character - confirm *) | |
1609 | + check_name_length_rec n [] l | |
1610 | + (* if String.length s > n then | |
1611 | + String.sub s 0 (n-1) ^ "…" | |
1612 | + else s *) | |
1613 | + | |
1614 | +let to_string2_simplify name_length= function (* Turns one (name, status, Message) triple into a single line of text. *) | |
1615 | + name,_,Message s -> | |
1616 | + if name_length <= 0 then s (* a non-positive name_length means the name is left out *) | |
1617 | + else (check_name_length name_length name) ^ "\t" ^ s (* shortened name, a tab, then the message *) | |
1618 | + | _ -> failwith "to_string2_simplify" (* Chart payloads are not supported here *) | |
1619 | + | |
1620 | + | |
1621 | +let marked_string_of_eniam_sentence verbosity tokens (result : eniam_parse_result) = (* Returns a one-element list holding a (status, marked) pair for the parse result. *) | |
1622 | + let status_string = string_of_status result.status in | |
1623 | + if result.status = NotParsed then | |
1624 | + [status_string, Chart(cat_chart result.text_fragments result.chart1)] (* unparsed sentences get a category chart built from chart1 *) | |
1625 | + else [status_string,Message result.msg] (* every other status carries the result message *) | |
1626 | + | |
1627 | +let rec marked_string_of_sentence verbosity tokens = function (* Collects (status, marked) pairs from a sentence; only ENIAMSentence results produce output. *) | |
1628 | + RawSentence s -> [] | |
1629 | + | StructSentence(paths,last) -> [] | |
1630 | + | DepSentence paths -> [] | |
1631 | + | ENIAMSentence result -> marked_string_of_eniam_sentence verbosity tokens result | |
1632 | + | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) (* recurse into each quoted sentence *) | |
1633 | + | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> marked_string_of_sentence verbosity tokens sentence)) (* results of all alternatives are concatenated *) | |
1634 | + | |
1635 | +let rec marked_string_of_paragraph verbosity tokens = function (* Collects (name, status, marked) triples from a paragraph. *) | |
1636 | + RawParagraph s -> [] (* raw text yields nothing *) | |
1637 | + | StructParagraph sentences -> | |
1638 | + let l = List.flatten (Xlist.map sentences (fun p -> marked_string_of_sentence verbosity tokens p.sentence)) in | |
1639 | + List.rev (Xlist.rev_map l (fun (s,t) -> "",s,t)) (* the name is left empty at this level *) | |
1640 | + | AltParagraph((Name,RawParagraph name) :: l) -> (* a leading Name alternative supplies the paragraph name *) | |
1641 | + let l = List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) in | |
1642 | + List.rev (Xlist.rev_map l (fun (_,s,t) -> name,s,t)) (* the name replaces the first component of every triple from the remaining alternatives *) | |
1643 | + | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> marked_string_of_paragraph verbosity tokens paragraph)) | |
1644 | + | ErrorParagraph s -> ["","SubsyntaxError",Message s] (* the error text becomes the message under a fixed status *) | |
1645 | + | |
1646 | +let rec marked_string_of_text verbosity tokens = function (* Applies marked_string_of_paragraph to every paragraph of a text and concatenates the results. *) | |
1647 | + RawText s -> [] (* raw text yields nothing *) | |
1648 | + | StructText paragraphs -> List.flatten (Xlist.map paragraphs (marked_string_of_paragraph verbosity tokens)) | |
1649 | + | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> marked_string_of_text verbosity tokens text)) (* results of all alternatives are concatenated *) | |
1650 | + | |
1651 | +let print_html_marked_simple_text path name name_length l = (* Writes the triples in l to the file path ^ name ^ .html: plain lines when name_length <= 0, otherwise a two-column table of shortened paragraph name and result. *) | |
1652 | + File.file_out (path ^ name ^ ".html") (fun file -> | |
1653 | + fprintf file "%s\n" html_header; | |
1654 | + (* print_endline "print_html_marked_text 1"; *) | |
1655 | + (* print_endline "print_html_marked_text 2"; *) | |
1656 | + let styles = create_styles l in (* one CSS class per distinct chart key *) | |
1657 | + (* print_endline "print_html_marked_text 3"; *) | |
1658 | + fprintf file "%s\n" (render_styles styles); | |
1659 | + if name_length <= 0 then | |
1660 | + Xlist.iter l (function (* in the patterns below, name is the paragraph name and shadows the file-name argument *) | |
1661 | + name, "NotParsed", Chart t -> fprintf file "%s<BR>\n" (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1662 | + | name, status, Chart t -> fprintf file "%s: %s<BR>\n" status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1663 | + | name, status, Message t -> fprintf file "%s: %s<BR>\n" status (escape_html t)) (* only message text is passed through escape_html *) | |
1664 | + else ( | |
1665 | + fprintf file "<TABLE border=1>\n"; | |
1666 | + Xlist.iter l (function (* each row is closed with an end tag; the original emitted a second opening TR here *) | |
1667 | + name, "NotParsed", Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s</TD></TR>\n" (check_name_length name_length name) (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1668 | + | name, status, Chart t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD></TR>\n" (check_name_length name_length name) status (String.concat "" (List.rev (Xlist.rev_map t (assign_style styles)))); | |
1669 | + | name, status, Message t -> fprintf file "<TR><TD>%s</TD><TD>%s: %s</TD></TR>\n" (check_name_length name_length name) status (escape_html t)); | |
1670 | + fprintf file "</TABLE>\n"); | |
1671 | + (* print_endline "print_html_marked_text 4"; *) | |
1672 | + fprintf file "%s\n" html_trailer) | |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -344,13 +344,19 @@ let parse query = |
344 | 344 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a19"; *) |
345 | 345 | paths(*, next_id*) |
346 | 346 | |
347 | -let parse_text_tokens sentence_split_flag tokens query = | |
347 | +let parse_text_tokens sentence_split_flag par_names_flag tokens query = | |
348 | 348 | (* print_endline ("parse_text_tokens: " ^ query); *) |
349 | 349 | let paragraphs = Xstring.split "\n\\|\r" query in |
350 | 350 | let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in |
351 | 351 | let n = if Xlist.size paragraphs = 1 then 0 else 1 in |
352 | 352 | let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> |
353 | 353 | try |
354 | + let name, paragraph = | |
355 | + if par_names_flag then | |
356 | + match Xstring.split "\t" paragraph with | |
357 | + [name; paragraph] -> name, paragraph | |
358 | + | _ -> failwith ("parse_text_tokens: " ^ paragraph) | |
359 | + else "", paragraph in | |
354 | 360 | (* print_endline paragraph; *) |
355 | 361 | let paths = parse paragraph in |
356 | 362 | (* print_endline "parse_text 1"; *) |
... | ... | @@ -358,25 +364,26 @@ let parse_text_tokens sentence_split_flag tokens query = |
358 | 364 | let sentences = |
359 | 365 | if sentence_split_flag then ENIAMsentences.split_into_sentences pid paragraph tokens paths |
360 | 366 | else ENIAMsentences.no_split_into_sentences pid paragraph tokens paths in |
361 | - (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1 | |
367 | + (AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ | |
368 | + [Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1 | |
362 | 369 | with e -> |
363 | 370 | (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in |
364 | 371 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens |
365 | 372 | |
366 | -let parse_text sentence_split_flag query = | |
373 | +let parse_text sentence_split_flag par_names_flag query = | |
367 | 374 | (* print_endline ("parse_text: " ^ query); *) |
368 | 375 | let tokens = ExtArray.make 100 empty_token_env in |
369 | 376 | let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) |
370 | - parse_text_tokens sentence_split_flag tokens query | |
377 | + parse_text_tokens sentence_split_flag par_names_flag tokens query | |
371 | 378 | |
372 | 379 | let catch_parse text = (* Runs parse on text and reports failure as a string instead of raising. *) |
373 | 380 | try |
374 | 381 | let tokens = parse text in tokens,"" (* success: the parse result and an empty message *) |
375 | 382 | with e -> [], Printexc.to_string e (* any exception: an empty list and the printed exception *) |
376 | 383 | |
377 | -let catch_parse_text sentence_split_flag text = | |
384 | +let catch_parse_text sentence_split_flag par_names_flag text = | |
378 | 385 | try |
379 | - let text,tokens = parse_text sentence_split_flag text in text,tokens,"" | |
386 | + let text,tokens = parse_text sentence_split_flag par_names_flag text in text,tokens,"" | |
380 | 387 | with e -> |
381 | 388 | RawText text, |
382 | 389 | ExtArray.make 0 empty_token_env, |
... | ... |
subsyntax/ENIAMsubsyntaxStringOf.ml
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -20,7 +20,7 @@ |
20 | 20 | open ENIAMtokenizerTypes |
21 | 21 | |
22 | 22 | type mode = |
23 | - Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | Error | |
23 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | Error | Name | |
24 | 24 | |
25 | 25 | type sentence = |
26 | 26 | RawSentence of string |
... | ... | @@ -83,6 +83,7 @@ let int_of_mode = function |
83 | 83 | | Swigra -> 5 |
84 | 84 | | POLFIE -> 6 |
85 | 85 | | Error -> 7 |
86 | + | Name -> 8 | |
86 | 87 | |
87 | 88 | let compare_mode x y = |
88 | 89 | compare (int_of_mode x) (int_of_mode y) |
... | ... |
subsyntax/interface.ml
... | ... | @@ -24,6 +24,7 @@ let output = ref Text |
24 | 24 | let comm_stdio = ref true |
25 | 25 | let sentence_split = ref Full (* the -s option sets this to Full; main_loop compares it against Full and Partial *) |
26 | 26 | let port = ref 5439 |
27 | +let par_names = ref false (* set by --par-names and cleared by --no-par-names; main_loop passes it to ENIAMsubsyntax.catch_parse_text *) | |
27 | 28 | |
28 | 29 | let spec_list = [ |
29 | 30 | "-s", Arg.Unit (fun () -> sentence_split:=Full), "Split input into sentences (default)"; |
... | ... | @@ -40,6 +41,8 @@ let spec_list = [ |
40 | 41 | "--no-strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=false), "Do not perform strong disambiguation (default)"; |
41 | 42 | "--internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=true), "Relaxed attitude towards interpunction"; |
42 | 43 | "--no-internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=false), "Strict attitude towards interpunction (default)"; |
44 | + "--par-names", Arg.Unit (fun () -> par_names:=true), "Identifiers of paragraphs provided"; | |
45 | + "--no-par-names", Arg.Unit (fun () -> par_names:=false), "No identifiers of paragraphs provided (default)"; | |
43 | 46 | ] |
44 | 47 | |
45 | 48 | let usage_msg = |
... | ... | @@ -68,8 +71,8 @@ let rec main_loop in_chan out_chan = |
68 | 71 | print_endline "input text end"; *) |
69 | 72 | (if !sentence_split = Full || !sentence_split = Partial then |
70 | 73 | let text,tokens,msg = |
71 | - if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true text | |
72 | - else ENIAMsubsyntax.catch_parse_text false text in | |
74 | + if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true !par_names text | |
75 | + else ENIAMsubsyntax.catch_parse_text false !par_names text in | |
73 | 76 | (match !output with |
74 | 77 | Text -> |
75 | 78 | if msg = "" then output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -1059,6 +1059,7 @@ let rec recognize_sign_group poss_s_beg i = function |
1059 | 1059 | | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®") |
1060 | 1060 | | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1061 | 1061 | | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") |
1062 | + | (Sign "†") :: l -> create_sign_token poss_s_beg i [Sign "†"] l (Interp "†") | |
1062 | 1063 | | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s) |
1063 | 1064 | | l -> failwith "recognize_sign_group" |
1064 | 1065 | |
... | ... |