Commit 0c36e02e82975a65c9794c6e55eaa5530fdaa16a
1 parent
e0385166
Rozszerzenie funkcjonalności tokenizera
Showing
14 changed files
with
326 additions
and
120 deletions
LCGlexicon/ENIAM_LCGlexicon.ml
... | ... | @@ -312,6 +312,7 @@ let create_entries rules id orth cats valence lex_entries = |
312 | 312 | (* variable_name_ref := []; *) |
313 | 313 | if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else |
314 | 314 | if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else |
315 | + if (cats.pos2="noun" || cats.pos2="verb" || cats.pos2="adj" || cats.pos2="adv") && cats.cat="X" && not !default_category_flag then l else | |
315 | 316 | let e = get_labels () in |
316 | 317 | (* print_endline "create_entries 1"; *) |
317 | 318 | let rules = find_rules rules cats in |
... | ... |
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... | ... | @@ -79,6 +79,8 @@ let empty_cats = {lemma=""; pos=""; pos2=""; cat="X"; coerced=[]; |
79 | 79 | nsyn=[]; nsem=[]; modes=[]; psem=[]; |
80 | 80 | } |
81 | 81 | |
82 | +let default_category_flag = ref true | |
83 | + | |
82 | 84 | let resource_path = |
83 | 85 | try Sys.getenv "ENIAM_RESOURCE_PATH" |
84 | 86 | with Not_found -> |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -24,7 +24,7 @@ open ENIAMtokenizerTypes |
24 | 24 | open ENIAMexecTypes |
25 | 25 | |
26 | 26 | let string_of_status = function |
27 | - Idle -> "Idle" | |
27 | + Idle -> "Idle" | |
28 | 28 | | PreprocessingError -> "PreprocessingError" |
29 | 29 | | LexiconError -> "LexiconError" |
30 | 30 | | ParseError -> "ParseError" |
... | ... | @@ -786,6 +786,80 @@ let create_latex_dep_chart path name dep_chart = |
786 | 786 | LatexMain.latex_compile_and_clean path name |
787 | 787 | *) |
788 | 788 | |
789 | +let rec extract_pos_cat_internal vars = function | |
790 | + | Atom x -> x | |
791 | + | AVar x -> (try extract_pos_cat_internal vars (Xlist.assoc vars x) with Not_found -> failwith "extract_pos_cat_internal") | |
792 | + | With l -> String.concat "&" (Xlist.map l (extract_pos_cat_internal vars)) | |
793 | + | Zero -> "0" | |
794 | + | Top -> "T" | |
795 | + | |
796 | +let rec extract_pos_cat vars = function | |
797 | + | Tensor [] -> failwith "extract_pos_cat: ni" | |
798 | + | Tensor [pos] -> extract_pos_cat_internal vars pos | |
799 | + | Tensor (Atom "num" :: _) -> "Number" | |
800 | + | Tensor (Atom "prepnp" :: _) -> "Prep" | |
801 | + | Tensor (pos :: cat :: _) -> (*extract_pos_cat_internal vars pos ^ "*" ^*) extract_pos_cat_internal vars cat | |
802 | + | Plus l -> failwith "extract_pos_cat: ni" | |
803 | + | Imp(s,d,t2) -> extract_pos_cat vars s | |
804 | + | One -> failwith "extract_pos_cat: ni" | |
805 | + | ImpSet(s,l) -> extract_pos_cat vars s | |
806 | + | WithVar(v,g,e,s) -> extract_pos_cat ((v,g) :: vars) s | |
807 | + | Star s -> failwith "extract_pos_cat: ni" | |
808 | + | Bracket(lf,rf,s) -> extract_pos_cat vars s | |
809 | + | BracketSet d -> "BracketSet" | |
810 | + | Maybe s -> failwith "extract_pos_cat: ni" | |
811 | + | |
812 | +let get_text_fragment text_fragments node1 node2 = | |
813 | + try IntMap.find text_fragments.(node1) node2 | |
814 | + with Not_found -> "???"(*failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2)*) | |
815 | + | |
816 | +let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<siebie>";"<prep>"; | |
817 | + "<num>";"<intnum>";"<realnum>";"<intnum-interval>";"<realnum-interval>";"<symbol>";"<ordnum>"; | |
818 | + "<date>";"<date-interval>";"<hour-minute>";"<hour>";"<hour-minute-interval>";"<hour-interval>"; | |
819 | + "<year>";"<year-interval>";"<day>";"<day-interval>";"<day-month>";"<day-month-interval>"; | |
820 | + "<month-interval>";"<roman>";"<roman-interval>";"<roman-ordnum>";"<match-result>";"<url>"; | |
821 | + "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>"; | |
822 | + "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>"; | |
823 | + "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>"; | |
824 | + "<interp>";"<part>";"<unk>";"<building-number>"] | |
825 | + | |
826 | +let cat_tokens_sequence text_fragments g = | |
827 | + let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) -> | |
828 | + node1,node2, | |
829 | + (if m < node1 then | |
830 | + if n < node1 then [n, node1, get_text_fragment text_fragments n node1, "null"] | |
831 | + else if n = node1 then [] | |
832 | + else [node1, n, get_text_fragment text_fragments node1 n, "overlap"] | |
833 | + else if m = node1 then | |
834 | + if n < node2 then [m, n, get_text_fragment text_fragments m n, "overlap"] | |
835 | + else if n = node2 then [] | |
836 | + else [node1, node2, get_text_fragment text_fragments node1 node2, "overlap"] | |
837 | + else failwith "cat_tokens_sequence") @ | |
838 | + [node1, node2, get_text_fragment text_fragments node1 node2, extract_pos_cat [] symbol] @ l) in | |
839 | + let map = Xlist.fold l IntMap.empty (fun map (m,n,text,symbol) -> | |
840 | + IntMap.add_inc map (1000000*m+n) [text,symbol] (fun l -> (text,symbol) :: l)) in | |
841 | + let map = IntMap.map map (fun l -> | |
842 | + let t,ov,set = Xlist.fold l ("",false,StringSet.empty) (fun (t,ov,set) (text,symbol) -> | |
843 | + if symbol = "null" then text,ov,set | |
844 | + else if symbol = "overlap" then t,true,set | |
845 | + else if StringSet.mem omited symbol then text,ov,set | |
846 | + else t,ov,StringSet.add set symbol) in | |
847 | + let l = if StringSet.is_empty set then [t] else StringSet.to_list set in | |
848 | + if ov then "OVERLAP{" ^ String.concat " " l ^ "}" else | |
849 | + match l with | |
850 | + [t] -> t | |
851 | + | _ -> "{" ^ String.concat " " l ^ "}") in | |
852 | + let l = List.sort compare (IntMap.fold map [] (fun l k texts -> (k,texts) :: l)) in | |
853 | +(* let l = Xlist.sort l (fun (m1,n1,text1,symbol1) (m2,n2,text2,symbol2) -> | |
854 | + if m1 <> m2 then compare m1 m2 else | |
855 | + if n1 <> n2 then compare n1 n2 else | |
856 | + compare symbol1 symbol2) in | |
857 | + let l = if l = [] then l else | |
858 | + Xlist.fold (List.tl l) [List.hd l] (fun l a -> | |
859 | + match l with | |
860 | + [] -> failwith "cat_tokens_sequence" | |
861 | + | b :: l -> if a = b then b :: l else a :: b :: l) in*) | |
862 | + String.concat " " (Xlist.map l (fun (n,texts) -> texts)) | |
789 | 863 | |
790 | 864 | (* verbosity: |
791 | 865 | 0 -> jedynie informacja o statusie zdania |
... | ... | @@ -796,13 +870,13 @@ let create_latex_dep_chart path name dep_chart = |
796 | 870 | let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam_parse_result) = |
797 | 871 | match result.status with |
798 | 872 | Idle -> "<font color=\"red\">idle</font>\n" |
799 | - | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" result.msg result.paths_size | |
873 | + | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size | |
800 | 874 | | ParseError -> |
801 | 875 | if verbosity = 0 then () else ( |
802 | 876 | ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; |
803 | 877 | ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; |
804 | 878 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2); |
805 | - sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" result.msg result.paths_size ^ | |
879 | + sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^ | |
806 | 880 | (if verbosity = 0 then "" else |
807 | 881 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^ |
808 | 882 | sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^ |
... | ... | @@ -814,7 +888,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
814 | 888 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2); |
815 | 889 | if verbosity = 0 then () else ( |
816 | 890 | ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2); |
817 | - sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" result.msg result.paths_size ^ | |
891 | + sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^ | |
818 | 892 | (if verbosity < 2 then "" else |
819 | 893 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^ |
820 | 894 | sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix) ^ |
... | ... | @@ -840,6 +914,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
840 | 914 | sprintf "<BR><A HREF=\"%s_3_references.pdf\">References 3</A>\n" file_prefix ^ |
841 | 915 | sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^ |
842 | 916 | (if verbosity = 0 then "" else |
917 | + sprintf "<BR>%s\n" (escape_html (cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1))) ^ | |
843 | 918 | sprintf "<BR><A HREF=\"%s_3_chart_selection.pdf\">Chart 3 Selection</A>\n" file_prefix) ^ |
844 | 919 | "" |
845 | 920 | | ReductionError -> |
... | ... | @@ -851,7 +926,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
851 | 926 | ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; |
852 | 927 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3); |
853 | 928 | (if verbosity < 2 then "" else |
854 | - sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" result.msg result.paths_size result.chart_size ^ | |
929 | + sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size ^ | |
855 | 930 | sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^ |
856 | 931 | sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix ^ |
857 | 932 | sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^ |
... | ... | @@ -909,7 +984,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
909 | 984 | Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 result.term4)); |
910 | 985 | Xlatex.latex_compile_and_clean path (file_prefix ^ "_4_term"); |
911 | 986 | ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_4_dependency_tree") "a0" result.dependency_tree4); |
912 | - sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
987 | + sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
913 | 988 | (if verbosity < 2 then "" else |
914 | 989 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^ |
915 | 990 | sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^ |
... | ... | @@ -939,7 +1014,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
939 | 1014 | ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_6b_dependency_tree") result.dependency_tree6b; |
940 | 1015 | ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6a_simple_dependency_tree") result.dependency_tree6a; |
941 | 1016 | ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6b_simple_dependency_tree") result.dependency_tree6b); |
942 | - sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
1017 | + sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
943 | 1018 | (if verbosity < 2 then "" else |
944 | 1019 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^ |
945 | 1020 | sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^ |
... | ... | @@ -1010,7 +1085,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
1010 | 1085 | if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8; |
1011 | 1086 | if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9; |
1012 | 1087 | if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9); |
1013 | - sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
1088 | + sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
1014 | 1089 | (if verbosity = 0 then "" else |
1015 | 1090 | sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^ |
1016 | 1091 | (if result.dependency_tree7 <> [| |] then sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix else "") ^ |
... | ... | @@ -1038,7 +1113,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
1038 | 1113 | if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8; |
1039 | 1114 | if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9; |
1040 | 1115 | if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9)); |
1041 | - sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
1116 | + sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
1042 | 1117 | (if verbosity = 2 then |
1043 | 1118 | sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^ |
1044 | 1119 | (if result.semantic_graph10 <> [| |] then sprintf "<BR><A HREF=\"%s_10_semantic_graph.pdf\">Semantic Graph References 10</A>\n" file_prefix else "") ^ |
... | ... | @@ -1061,7 +1136,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
1061 | 1136 | | SemGraphError2 -> |
1062 | 1137 | if verbosity = 0 then () else ( |
1063 | 1138 | ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11); |
1064 | - sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
1139 | + sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
1065 | 1140 | (if verbosity = 0 then "" else |
1066 | 1141 | sprintf "<BR><IMG SRC=\"%s_11_semantic_graph.png\">\n" file_prefix) ^ |
1067 | 1142 | "" |
... | ... | @@ -1077,7 +1152,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
1077 | 1152 | ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11); |
1078 | 1153 | if verbosity = 0 then () else ( |
1079 | 1154 | ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_12_semantic_graph") "" result.semantic_graph12); |
1080 | - sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^ | |
1155 | + sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^ | |
1081 | 1156 | (if verbosity < 2 then "" else |
1082 | 1157 | sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^ |
1083 | 1158 | sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix ^ |
... | ... | @@ -1386,3 +1461,28 @@ let rec print_main_result_first_page_text cg_bin_path path id tokens = function |
1386 | 1461 | (List.rev (Xlist.fold paragraphs [] find_prev_next_paragraph)) in |
1387 | 1462 | print_main_result_first_page_paragraph cg_bin_path path id tokens prev_next_map (List.hd paragraphs) |
1388 | 1463 | | AltText l -> Xlist.iter l (fun (mode,text) -> print_main_result_first_page_text cg_bin_path path id tokens text) |
1464 | + | |
1465 | +let to_string_eniam_sentence verbosity tokens (result : eniam_parse_result) = | |
1466 | + let status_string = string_of_status result.status in | |
1467 | + if result.status = NotParsed then | |
1468 | + [status_string ^ ": " ^ cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1)] | |
1469 | + else [status_string] | |
1470 | + | |
1471 | +let rec to_string_sentence verbosity tokens = function | |
1472 | + RawSentence s -> [] | |
1473 | + | StructSentence(paths,last) -> [] | |
1474 | + | DepSentence paths -> [] | |
1475 | + | ENIAMSentence result -> to_string_eniam_sentence verbosity tokens result | |
1476 | + | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) | |
1477 | + | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> to_string_sentence verbosity tokens sentence)) | |
1478 | + | |
1479 | +let rec to_string_paragraph verbosity tokens = function | |
1480 | + RawParagraph s -> [] | |
1481 | + | StructParagraph sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence)) | |
1482 | + | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string_paragraph verbosity tokens paragraph)) | |
1483 | + | ErrorParagraph s -> ["SubsyntaxError"] | |
1484 | + | |
1485 | +let rec to_string_text verbosity tokens = function | |
1486 | + RawText s -> [] | |
1487 | + | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens)) | |
1488 | + | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text)) | |
... | ... |
subsyntax/ENIAM_MWE.ml
... | ... | @@ -164,12 +164,13 @@ let get_single_letter_orths paths = |
164 | 164 | IntMap.fold map orths (fun orths _ l -> |
165 | 165 | TokenEnvSet.fold l orths (fun orths t -> |
166 | 166 | match t.token with |
167 | - SmallLetter lemma -> StringSet.add orths lemma | |
167 | + SmallLetter lemma -> (*if lemma <> "g" then*) StringSet.add orths lemma (*else orths*) (* FIXME: !!!! *) | |
168 | 168 | | CapLetter(lemma,_) -> StringSet.add orths lemma |
169 | 169 | | _ -> orths))) |
170 | 170 | |
171 | 171 | let preselect orths lemmas rules l = |
172 | 172 | Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> |
173 | + (* print_endline ("preselect: " ^ lemma); *) | |
173 | 174 | let b = Xlist.fold match_list true (fun b -> function |
174 | 175 | O s -> StringSet.mem orths s && b |
175 | 176 | | L(s,_,_) -> StringSet.mem lemmas s && b |
... | ... | @@ -179,6 +180,7 @@ let preselect orths lemmas rules l = |
179 | 180 | let preselect_dict orths lemmas dict rules = |
180 | 181 | StringSet.fold orths rules (fun rules orth -> |
181 | 182 | try |
183 | + (* print_endline ("preselect_dict: " ^ orth); *) | |
182 | 184 | preselect orths lemmas rules (StringMap.find dict orth) |
183 | 185 | with Not_found -> rules) |
184 | 186 | |
... | ... | @@ -195,7 +197,7 @@ let add_ordnum_rules orths rules = |
195 | 197 | (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules)) |
196 | 198 | |
197 | 199 | let add_quot_rule rules = |
198 | - (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules | |
200 | + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","Interp",[]) :: rules | |
199 | 201 | |
200 | 202 | let add_building_number_rules dig_orths letter_orths rules = |
201 | 203 | StringSet.fold dig_orths rules (fun rules dig1 -> |
... | ... | @@ -215,15 +217,22 @@ let add_building_number_rules dig_orths letter_orths rules = |
215 | 217 | |
216 | 218 | let select_rules paths mwe_dict mwe_dict2 = |
217 | 219 | let orths = get_orths paths in |
220 | + (* print_endline ("ENIAM_MWE.select_rules 1 orths=[" ^ String.concat ";" (StringSet.to_list orths) ^ "]"); *) | |
218 | 221 | let lemmas = get_lemmas paths in |
219 | 222 | let intnum_orths = get_intnum_orths paths in |
220 | - let year_orths = get_year_orths paths in | |
221 | - let letter_orths = get_single_letter_orths paths in | |
223 | + (* let year_orths = get_year_orths paths in *) | |
224 | + (* let letter_orths = get_single_letter_orths paths in *) | |
222 | 225 | let rules = preselect_dict orths lemmas mwe_dict [] in |
226 | + (* print_endline ("ENIAM_MWE.select_rules 1 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
227 | + (* Xlist.iter rules (fun (is_mwe,match_list,lemma,cat,interp) -> print_endline lemma); *) | |
223 | 228 | let rules = preselect_dict2 orths lemmas mwe_dict2 rules in |
229 | + (* print_endline ("ENIAM_MWE.select_rules 2 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
224 | 230 | let rules = add_ordnum_rules intnum_orths rules in |
231 | + (* print_endline ("ENIAM_MWE.select_rules 3 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
225 | 232 | let rules = add_quot_rule rules in |
226 | - let rules = add_building_number_rules year_orths letter_orths rules in | |
233 | + (* print_endline ("ENIAM_MWE.select_rules 4 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
234 | + (* let rules = add_building_number_rules year_orths letter_orths rules in *) (* FIXME !!!! *) | |
235 | + (* print_endline ("ENIAM_MWE.select_rules 5 |rules|=" ^ string_of_int (Xlist.size rules) ^ " |year_orths|=" ^ string_of_int (StringSet.size year_orths) ^ " |letter_orths|=" ^ string_of_int (StringSet.size letter_orths)); *) | |
227 | 236 | rules |
228 | 237 | |
229 | 238 | let rec check_interp sels = function |
... | ... | @@ -306,7 +315,8 @@ let create_token is_mwe (matching:token_env list) sels lemma cat interp = (* FIX |
306 | 315 | beg=beg; |
307 | 316 | len=len; |
308 | 317 | next=t.next; |
309 | - token=Lemma(lemma,cat,[Xlist.map interp (function | |
318 | + token=if cat = "Interp" then Interp lemma else | |
319 | + Lemma(lemma,cat,[Xlist.map interp (function | |
310 | 320 | S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) |
311 | 321 | | V s -> Xstring.split "\\." s |
312 | 322 | | G -> ["_"])]); |
... | ... | @@ -327,18 +337,34 @@ let apply_rule paths (is_mwe,match_list,lemma,cat,interp) = |
327 | 337 | add_token paths token |
328 | 338 | with Not_found -> paths) |
329 | 339 | |
340 | +let count_path_size paths = | |
341 | + IntMap.fold paths 0 (fun n _ map2 -> | |
342 | + IntMap.fold map2 n (fun n _ set -> | |
343 | + TokenEnvSet.size set + n)) | |
344 | + | |
330 | 345 | let process (paths,last) = |
346 | + (* print_endline ("ENIAM_MWE.process 1 |paths|=" ^ string_of_int (Xlist.size paths)); *) | |
331 | 347 | let paths = Xlist.fold paths IntMap.empty add_token in |
348 | + (* print_endline ("ENIAM_MWE.process 2 |paths|=" ^ string_of_int (count_path_size paths)); *) | |
332 | 349 | let rules = select_rules paths !mwe_dict !mwe_dict2 in |
350 | + (* print_endline ("ENIAM_MWE.process 3 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
333 | 351 | let paths = Xlist.fold rules paths apply_rule in |
352 | + (* print_endline ("ENIAM_MWE.process 4 |paths|=" ^ string_of_int (count_path_size paths)); *) | |
334 | 353 | let rules = select_rules paths !mwe_dict !mwe_dict2 in |
354 | + (* print_endline ("ENIAM_MWE.process 5 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
335 | 355 | let paths = Xlist.fold rules paths apply_rule in |
356 | + (* print_endline ("ENIAM_MWE.process 6 |paths|=" ^ string_of_int (count_path_size paths)); *) | |
336 | 357 | let rules = select_rules paths !mwe_dict !mwe_dict2 in |
358 | + (* print_endline ("ENIAM_MWE.process 7 |rules|=" ^ string_of_int (Xlist.size rules)); *) | |
337 | 359 | let paths = Xlist.fold rules paths apply_rule in |
360 | + (* print_endline "ENIAM_MWE.process 8"; *) | |
338 | 361 | let rules = select_rules paths !mwe_dict !mwe_dict2 in |
362 | + (* print_endline "ENIAM_MWE.process 9"; *) | |
339 | 363 | let paths = Xlist.fold rules paths apply_rule in |
364 | + (* print_endline "ENIAM_MWE.process 10"; *) | |
340 | 365 | let paths = IntMap.fold paths [] (fun paths _ map -> |
341 | 366 | IntMap.fold map paths (fun paths _ l -> |
342 | 367 | TokenEnvSet.fold l paths (fun paths t -> |
343 | 368 | t :: paths))) in |
369 | + (* print_endline "ENIAM_MWE.process 11"; *) | |
344 | 370 | ENIAMpaths.sort (paths,last) |
... | ... |
subsyntax/ENIAMsentences.ml
... | ... | @@ -139,6 +139,13 @@ let find_query paragraph tokens chart last = |
139 | 139 | (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>") |
140 | 140 | (fun ids -> Tokens("query",ids)) |
141 | 141 | |
142 | +let find_query2 paragraph tokens chart last = | |
143 | + parse_bracket_rule paragraph tokens chart last | |
144 | + (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>") | |
145 | + (fun tokens id -> true) | |
146 | + (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>") | |
147 | + (fun ids -> Tokens("query",ids)) | |
148 | + | |
142 | 149 | let find_tokens_in_chart tokens chart lnode rnode cat = |
143 | 150 | let found = Xlist.fold chart.(lnode) [] (fun found (id,rnode2) -> |
144 | 151 | if rnode = rnode2 then |
... | ... | @@ -149,7 +156,8 @@ let find_tokens_in_chart tokens chart lnode rnode cat = |
149 | 156 | else found) in |
150 | 157 | match found with |
151 | 158 | [x] -> x |
152 | - | _ -> failwith "Unable to extract sentences. Check puntuation." | |
159 | + | [] -> failwith "Unable to extract sentences. Check puntuation." | |
160 | + | _ -> failwith "find_tokens_in_chart" | |
153 | 161 | |
154 | 162 | (*let find_tokens_in_chart_id tokens chart lnode rnode cat = |
155 | 163 | let found = Int.fold 0 last [] (fun ids lnode -> |
... | ... | @@ -215,6 +223,13 @@ let extract_sentences pid tokens chart last = |
215 | 223 | psentence=AltSentence[Raw,RawSentence paragraph; |
216 | 224 | ENIAM,StructSentence("",paths,last)]}]*) |
217 | 225 | |
226 | +let extract_sentences2 pid tokens chart last = | |
227 | + let ids = find_tokens_in_chart tokens chart 0 last "query" in | |
228 | + let paths,last = make_paths tokens ids in | |
229 | + let sentences = [{id="0"; beg=0; len=last; next=last; file_prefix=""; | |
230 | + sentence=AltSentence([ENIAM,StructSentence(paths,last)])}] in | |
231 | + add_struct_sentence_ids pid sentences | |
232 | + | |
218 | 233 | (* |
219 | 234 | let is_sentence = function |
220 | 235 | Sentence _ -> true |
... | ... | @@ -269,6 +284,7 @@ let make_chart paths last = |
269 | 284 | chart |
270 | 285 | |
271 | 286 | let split_into_sentences pid paragraph tokens paths = |
287 | + (* print_endline "split_into_sentences"; *) | |
272 | 288 | let paths = make_ids tokens paths in |
273 | 289 | let paths,last = prepare_indexes paths in |
274 | 290 | let chart = make_chart paths last in |
... | ... | @@ -280,3 +296,12 @@ let split_into_sentences pid paragraph tokens paths = |
280 | 296 | find_paren_sentences par tokens chart last; |
281 | 297 | find_query par tokens chart last; |
282 | 298 | extract_sentences pid tokens chart last |
299 | + | |
300 | +let no_split_into_sentences pid paragraph tokens paths = | |
301 | + (* print_endline "no_split_into_sentences"; *) | |
302 | + let paths = make_ids tokens paths in | |
303 | + let paths,last = prepare_indexes paths in | |
304 | + let chart = make_chart paths last in | |
305 | + let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in | |
306 | + find_query2 par tokens chart last; | |
307 | + extract_sentences2 pid tokens chart last | |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -233,7 +233,8 @@ let rec select_tokens2_rec last paths nodes map = |
233 | 233 | select_tokens2_rec last paths nodes map |
234 | 234 | |
235 | 235 | let rec calculate_quality q = function |
236 | - CS :: l -> calculate_quality (q-2) l | |
236 | + FC :: l -> calculate_quality (q-2) l | |
237 | + | CS :: l -> calculate_quality (q-2) l | |
237 | 238 | | MaybeCS :: l -> calculate_quality q l |
238 | 239 | | ReqValLemm :: l -> calculate_quality q l |
239 | 240 | | MWE :: l -> calculate_quality (q+6) l |
... | ... | @@ -313,7 +314,7 @@ let initialize () = |
313 | 314 | |
314 | 315 | let parse query = |
315 | 316 | let l = ENIAMtokenizer.parse query in |
316 | -(* print_endline "a6"; *) | |
317 | + (* print_endline "a6"; *) | |
317 | 318 | let paths = ENIAMpaths.translate_into_paths l in |
318 | 319 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a7"; *) |
319 | 320 | (* print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *) |
... | ... | @@ -324,21 +325,13 @@ let parse query = |
324 | 325 | let paths,_ = ENIAM_MWE.process paths in |
325 | 326 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a12"; *) |
326 | 327 | (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *) |
327 | -(* let paths = find_proper_names paths in*) | |
328 | 328 | let paths = List.rev (Xlist.rev_map paths find_proper_names) in |
329 | 329 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a13"; *) |
330 | 330 | (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *) |
331 | 331 | let paths = modify_weights paths in |
332 | 332 | let paths = translate_digs paths in |
333 | -(* let paths = assign_senses paths in | |
334 | -(* print_endline "a14"; *) | |
335 | - let paths = assign_valence paths in*) | |
336 | -(* print_endline "a15"; *) | |
333 | + (* print_endline "a14"; *) | |
337 | 334 | let paths = combine_interps paths in |
338 | -(* print_endline "a16"; *) | |
339 | -(* let paths = disambiguate_senses paths in | |
340 | - let paths = assign_simplified_valence paths in | |
341 | - let paths = PreSemantics.assign_semantics paths in*) | |
342 | 335 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a16"; *) |
343 | 336 | (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *) |
344 | 337 | let paths = select_tokens paths in |
... | ... | @@ -351,36 +344,39 @@ let parse query = |
351 | 344 | (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a19"; *) |
352 | 345 | paths(*, next_id*) |
353 | 346 | |
354 | -let parse_text_tokens tokens query = | |
347 | +let parse_text_tokens sentence_split_flag tokens query = | |
355 | 348 | (* print_endline ("parse_text_tokens: " ^ query); *) |
356 | 349 | let paragraphs = Xstring.split "\n\\|\r" query in |
357 | 350 | let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in |
358 | 351 | let n = if Xlist.size paragraphs = 1 then 0 else 1 in |
359 | 352 | let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> |
360 | 353 | try |
354 | + (* print_endline paragraph; *) | |
361 | 355 | let paths = parse paragraph in |
362 | 356 | (* print_endline "parse_text 1"; *) |
363 | 357 | let pid = if n = 0 then "" else string_of_int n ^ "_" in |
364 | - let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in | |
358 | + let sentences = | |
359 | + if sentence_split_flag then ENIAMsentences.split_into_sentences pid paragraph tokens paths | |
360 | + else ENIAMsentences.no_split_into_sentences pid paragraph tokens paths in | |
365 | 361 | (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1 |
366 | 362 | with e -> |
367 | 363 | (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in |
368 | 364 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens |
369 | 365 | |
370 | -let parse_text query = | |
366 | +let parse_text sentence_split_flag query = | |
371 | 367 | (* print_endline ("parse_text: " ^ query); *) |
372 | 368 | let tokens = ExtArray.make 100 empty_token_env in |
373 | 369 | let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) |
374 | - parse_text_tokens tokens query | |
370 | + parse_text_tokens sentence_split_flag tokens query | |
375 | 371 | |
376 | 372 | let catch_parse text = |
377 | 373 | try |
378 | 374 | let tokens = parse text in tokens,"" |
379 | 375 | with e -> [], Printexc.to_string e |
380 | 376 | |
381 | -let catch_parse_text text = | |
377 | +let catch_parse_text sentence_split_flag text = | |
382 | 378 | try |
383 | - let text,tokens = parse_text text in text,tokens,"" | |
379 | + let text,tokens = parse_text sentence_split_flag text in text,tokens,"" | |
384 | 380 | with e -> |
385 | 381 | RawText text, |
386 | 382 | ExtArray.make 0 empty_token_env, |
... | ... |
subsyntax/interface.ml
... | ... | @@ -18,24 +18,28 @@ |
18 | 18 | *) |
19 | 19 | |
20 | 20 | type output = Text | Xml | Html | Marsh | Graphviz |
21 | +type sentence_split = Full | Partial | None | |
21 | 22 | |
22 | 23 | let output = ref Text |
23 | 24 | let comm_stdio = ref true |
24 | -let sentence_split = ref true | |
25 | +let sentence_split = ref Full | |
25 | 26 | let port = ref 5439 |
26 | 27 | |
27 | 28 | let spec_list = [ |
28 | - "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
29 | - "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
29 | + "-s", Arg.Unit (fun () -> sentence_split:=Full), "Split input into sentences (default)"; | |
30 | + "-a", Arg.Unit (fun () -> sentence_split:=Partial), "Split input into paragraphs, do not split input into sentences"; | |
31 | + "-n", Arg.Unit (fun () -> sentence_split:=None), "Do not split input into sentences"; | |
30 | 32 | "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; |
31 | 33 | "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; |
32 | 34 | "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; |
33 | 35 | "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; |
34 | 36 | "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; |
35 | 37 | "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; |
36 | - "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
38 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=None), "Output as graphviz dot file; turns sentence split off"; | |
37 | 39 | "--strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=true), "Perform strong disambiguation"; |
38 | 40 | "--no-strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=false), "Do not perform strong disambiguation (default)"; |
41 | + "--internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=true), "Relaxed attitude towards interpunction"; | |
42 | + "--no-internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=false), "Strict attitude towards interpunction (default)"; | |
39 | 43 | ] |
40 | 44 | |
41 | 45 | let usage_msg = |
... | ... | @@ -62,8 +66,10 @@ let rec main_loop in_chan out_chan = |
62 | 66 | (* print_endline "input text begin"; |
63 | 67 | print_endline text; |
64 | 68 | print_endline "input text end"; *) |
65 | - (if !sentence_split then | |
66 | - let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in | |
69 | + (if !sentence_split = Full || !sentence_split = Partial then | |
70 | + let text,tokens,msg = | |
71 | + if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true text | |
72 | + else ENIAMsubsyntax.catch_parse_text false text in | |
67 | 73 | (match !output with |
68 | 74 | Text -> |
69 | 75 | if msg = "" then output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ |
... | ... |
subsyntax/resources/brev.tab
... | ... | @@ -209,6 +209,7 @@ J . Jezioro subst:_:_:n:ncol |
209 | 209 | j . jak adv:pos |
210 | 210 | j . język subst:_:_:m3 |
211 | 211 | J . jezioro subst:_:_:n:ncol |
212 | +j . jednostka subst:_:_:f | |
212 | 213 | Jdt Księga Judyty subst:sg:_:f |
213 | 214 | Jer . Księga Jeremiasza subst:sg:_:f |
214 | 215 | Jez . Jezioro subst:_:_:n:ncol |
... | ... | @@ -736,7 +737,9 @@ zob . zobaczyć impt:sg:sec:perf |
736 | 737 | Zw . związek subst:_:_:m3 |
737 | 738 | ż . żeński adj:_:_:_:pos |
738 | 739 | ż . żółty adj:_:_:_:pos |
739 | -μ m mikrometr subst:_:_:m3 | |
740 | +µ m mikrometr subst:_:_:m3 | |
741 | +µ mol mikromol subst:_:_:m3 | |
742 | +µ g mikrogram subst:_:_:m3 | |
740 | 743 | A . A. subst:_:_:m1.f |
741 | 744 | B . B. subst:_:_:m1.f |
742 | 745 | C . C. subst:_:_:m1.f |
... | ... |
tokenizer/ENIAMacronyms.ml
... | ... | @@ -419,12 +419,12 @@ let acronym_patterns = [ |
419 | 419 | [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m1" | _ -> failwith "acronym_patterns"); |
420 | 420 | [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m2" | _ -> failwith "acronym_patterns"); |
421 | 421 | [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns"); |
422 | - [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n2" | _ -> failwith "acronym_patterns"); | |
422 | + [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n:ncol" | _ -> failwith "acronym_patterns"); | |
423 | 423 | [CL; S "-"; CL; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:loc:f" | _ -> failwith "acronym_patterns"); |
424 | 424 | [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m1" | _ -> failwith "acronym_patterns"); |
425 | 425 | [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m2" | _ -> failwith "acronym_patterns"); |
426 | 426 | [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns"); |
427 | - [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n2" | _ -> failwith "acronym_patterns"); | |
427 | + [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n:ncol" | _ -> failwith "acronym_patterns"); | |
428 | 428 | [CL; S "-"; CL; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:inst:f" | _ -> failwith "acronym_patterns"); |
429 | 429 | [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns"); |
430 | 430 | [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns"); |
... | ... | @@ -448,7 +448,7 @@ let acronym_patterns = [ |
448 | 448 | [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m1" | _ -> failwith "acronym_patterns"); |
449 | 449 | [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m2" | _ -> failwith "acronym_patterns"); |
450 | 450 | [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns"); |
451 | - [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n2" | _ -> failwith "acronym_patterns"); | |
451 | + [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n:ncol" | _ -> failwith "acronym_patterns"); | |
452 | 452 | [L; S "-"; L; S "-"; O "etach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns"); |
453 | 453 | [L; S "-"; L; S "-"; O "etami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns"); |
454 | 454 | [L; S "-"; L; S "-"; O "etem"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns"); |
... | ... | @@ -488,7 +488,7 @@ let acronym_patterns = [ |
488 | 488 | [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m1" | _ -> failwith "acronym_patterns"); |
489 | 489 | [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m2" | _ -> failwith "acronym_patterns"); |
490 | 490 | [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m3" | _ -> failwith "acronym_patterns"); |
491 | - [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n2" | _ -> failwith "acronym_patterns"); | |
491 | + [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n:ncol" | _ -> failwith "acronym_patterns"); | |
492 | 492 | [CL; S "-"; CL; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:dat:f" | _ -> failwith "acronym_patterns"); |
493 | 493 | [L; S "-"; L; S "-"; O "otach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns"); |
494 | 494 | [L; S "-"; L; S "-"; O "otami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns"); |
... | ... | @@ -503,13 +503,13 @@ let acronym_patterns = [ |
503 | 503 | [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m1" | _ -> failwith "acronym_patterns"); |
504 | 504 | [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m2" | _ -> failwith "acronym_patterns"); |
505 | 505 | [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m3" | _ -> failwith "acronym_patterns"); |
506 | - [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n2" | _ -> failwith "acronym_patterns"); | |
506 | + [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n:ncol" | _ -> failwith "acronym_patterns"); | |
507 | 507 | [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m1" | _ -> failwith "acronym_patterns"); |
508 | 508 | [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m1" | _ -> failwith "acronym_patterns"); |
509 | 509 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:acc:m2" | _ -> failwith "acronym_patterns"); |
510 | 510 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m2" | _ -> failwith "acronym_patterns"); |
511 | 511 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m3" | _ -> failwith "acronym_patterns"); |
512 | - [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n2" | _ -> failwith "acronym_patterns"); | |
512 | + [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n:ncol" | _ -> failwith "acronym_patterns"); | |
513 | 513 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m1" | _ -> failwith "acronym_patterns"); |
514 | 514 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns"); |
515 | 515 | [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns"); |
... | ... | @@ -520,30 +520,30 @@ let acronym_patterns = [ |
520 | 520 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "depr:pl:voc:m2" | _ -> failwith "acronym_patterns"); |
521 | 521 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m2" | _ -> failwith "acronym_patterns"); |
522 | 522 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m3" | _ -> failwith "acronym_patterns"); |
523 | - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n2" | _ -> failwith "acronym_patterns"); | |
523 | + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n:ncol" | _ -> failwith "acronym_patterns"); | |
524 | 524 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m2" | _ -> failwith "acronym_patterns"); |
525 | 525 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m3" | _ -> failwith "acronym_patterns"); |
526 | - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n2" | _ -> failwith "acronym_patterns"); | |
526 | + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n:ncol" | _ -> failwith "acronym_patterns"); | |
527 | 527 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m2" | _ -> failwith "acronym_patterns"); |
528 | 528 | [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m3" | _ -> failwith "acronym_patterns"); |
529 | - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n2" | _ -> failwith "acronym_patterns"); | |
529 | + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n:ncol" | _ -> failwith "acronym_patterns"); | |
530 | 530 | [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:acc:f" | _ -> failwith "acronym_patterns"); |
531 | 531 | [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:nom:f" | _ -> failwith "acronym_patterns"); |
532 | 532 | [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:voc:f" | _ -> failwith "acronym_patterns"); |
533 | 533 | [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:gen:f" | _ -> failwith "acronym_patterns"); |
534 | 534 | [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns"); |
535 | 535 | [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns"); |
536 | - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n2" | _ -> failwith "acronym_patterns"); | |
536 | + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n:ncol" | _ -> failwith "acronym_patterns"); | |
537 | 537 | [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m2" | _ -> failwith "acronym_patterns"); |
538 | 538 | [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns"); |
539 | - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n2" | _ -> failwith "acronym_patterns"); | |
539 | + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n:ncol" | _ -> failwith "acronym_patterns"); | |
540 | 540 | [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns"); |
541 | 541 | [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns"); |
542 | 542 | [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m1" | _ -> failwith "acronym_patterns"); |
543 | 543 | [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m1" | _ -> failwith "acronym_patterns"); |
544 | 544 | [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m2" | _ -> failwith "acronym_patterns"); |
545 | 545 | [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m3" | _ -> failwith "acronym_patterns"); |
546 | - [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n2" | _ -> failwith "acronym_patterns"); | |
546 | + [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n:ncol" | _ -> failwith "acronym_patterns"); | |
547 | 547 | [CL; S "-"; CL; S "-"; O "ą"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:inst:f" | _ -> failwith "acronym_patterns"); |
548 | 548 | [CL; S "-"; CL; S "-"; O "ę"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:acc:f" | _ -> failwith "acronym_patterns"); |
549 | 549 | [L; S "-"; L; S "-"; O "ista"], (function [x;y;z;_;_] -> compose_lemma3 x y z "-ista" "subst:sg:nom:m1" | _ -> failwith "acronym_patterns"); |
... | ... | @@ -706,6 +706,7 @@ let abr_patterns = [ |
706 | 706 | [O "itd"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tak","adv:pos";1,"daleko","adv:com"] | _ -> failwith "abr_patterns"); |
707 | 707 | [O "itede"; S "."], (function [a;b] -> std a b [1,"i","conj";2,"tak","adv:pos";2,"daleko","adv:com"] | _ -> failwith "abr_patterns"); |
708 | 708 | [O "itp"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tym","adv";1,"podobny","adj:pl:nom:_:pos"] | _ -> failwith "abr_patterns"); |
709 | + [O "j"; S "."; O "m"; S "."], (function [a;b;c;d] -> [ct [a;b] "jednostka" "subst:_:_:f"; ct [c;d] "miary" "subst:sg:gen:f"] | _ -> failwith "abr_patterns"); | |
709 | 710 | [O "jw"; S "."], (function [a;b] -> std a b [1,"jak","adv:pos";1,"wysoko","adv:com"] | _ -> failwith "abr_patterns"); |
710 | 711 | [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:m1:pos";1,"pan","subst:_:$C:m1"] | _ -> failwith "abr_patterns"); |
711 | 712 | [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:f:pos";1,"pani","subst:_:$C:f"] | _ -> failwith "abr_patterns"); |
... | ... | @@ -717,35 +718,36 @@ let abr_patterns = [ |
717 | 718 | [O "m"; S "."; O "in"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); |
718 | 719 | [O "m"; S "."; O "in"], (function [a;b;c] -> [ct [a;b] "między" "prep:inst"; ct [c] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); |
719 | 720 | [O "m"; S "."; O "inn"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); |
720 | - [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n2"; ct [c;d] "stołeczny" "adj:_:$C:n2:pos"] | _ -> failwith "abr_patterns"); | |
721 | + [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n:ncol"; ct [c;d] "stołeczny" "adj:_:$C:n:pos"] | _ -> failwith "abr_patterns"); | |
721 | 722 | [O "m"; O "^"; O "2"], (function [a;b;c] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b;c] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns"); |
722 | 723 | [O "m"; O "2"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns"); |
723 | 724 | [O "m"; O "3"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "sześcienny" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns"); |
724 | 725 | (* [O "min"; S "."], (function [a;b] -> std a b [1,"między","prep:inst";2,"inny","adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); *) |
726 | + [O "mc"; S "."], (function [a;b] -> std a b [1,"masa","subst:sg:$C:f";1,"ciało","subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns"); | |
725 | 727 | [O "mkw"; S "."], (function [a;b] -> std a b [1,"metr","subst:_:$C:m3";2,"kwadratowy","adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns"); |
726 | 728 | [O "n"; S "."; O "e"; S "."], (function [a;b;c;d] -> [ct [a;b] "nasz" "adj:sg:gen:f:pos"; ct [c;d] "era" "subst:sg:gen:f"] | _ -> failwith "abr_patterns"); |
727 | - [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns"); | |
729 | + [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns"); | |
728 | 730 | [O "np"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"przykład","subst:sg:acc:m3"] | _ -> failwith "abr_patterns"); |
729 | 731 | [O "nt"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"temat","subst:sg:acc:m3"] | _ -> failwith "abr_patterns"); |
730 | 732 | [O "NTG"], (function [a] -> st a [1,"nie","qub";1,"ta","adj:sg:nom:f:pos";1,"grupa","subst:sg:nom:f"] | _ -> failwith "abr_patterns"); |
731 | 733 | [O "o"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "ograniczony" "adj:sg:$C:f:pos"; ct [c;d] "odpowiedzialność" "subst:sg:$C:f"] | _ -> failwith "abr_patterns"); |
732 | 734 | [O "p"; S "."; O "n"; S "."; O "e"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "przed" "prep:inst"; ct [c;d] "nasz" "adj:sg:inst:f:pos"; ct [e;f] "era" "subst:sg:inst:f"] | _ -> failwith "abr_patterns"); |
733 | 735 | [O "p"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "pełniący" "pact:_:_:m1.m2.m3:imperf:aff"; ct [c;d] "obowiązek" "subst:pl:acc:m3"] | _ -> failwith "abr_patterns"); |
734 | - [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns"); | |
736 | + [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns"); | |
735 | 737 | [O "p"; S "."; O "t"; S "."], (function [a;b;c;d] -> [ct [a;b] "pod" "prep:inst:nwokc"; ct [c;d] "tytuł" "subst:sg:inst:m3"] | _ -> failwith "abr_patterns"); |
736 | 738 | [O "pn"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"nazwa","subst:sg:inst:f"] | _ -> failwith "abr_patterns"); |
737 | 739 | [O "pne"; S "."], (function [a;b] -> std a b [1,"przed","prep:inst";1,"nasz","adj:sg:inst:f:pos";1,"era","subst:sg:inst:f"] | _ -> failwith "abr_patterns"); |
738 | 740 | [O "pt"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"tytuł","subst:sg:inst:m3"] | _ -> failwith "abr_patterns"); |
739 | 741 | [O "PW"], (function [a] -> st a [1,"prywatny","adj:_:$C:f:pos";1,"wiadomość","subst:_:$C:f"] | _ -> failwith "abr_patterns"); |
740 | - [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n2"] | _ -> failwith "abr_patterns"); | |
742 | + [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n:ncol"] | _ -> failwith "abr_patterns"); | |
741 | 743 | (* [O "S"; S "."; O "A"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "akcyjny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns"); |
742 | 744 | [O "s"; S "."; O "c"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "cywilny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns");*) |
743 | 745 | (* [O "SA"], (function [a] -> st a [1,"spółka","subst:sg:$C:f";1,"akcyjny","adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns"); *) |
744 | 746 | [O "ś"; S "."; O "p"; S "."], (function [a;b;c;d] -> [ct [a;b] "święty" "adj:sg:gen:f:pos"; ct [c;d] "pamięć" "subst:sg:gen:f"] | _ -> failwith "abr_patterns"); |
745 | 747 | [O "śp"; S "."], (function [a;b] -> std a b [1,"święty","adj:sg:gen:f:pos";1,"pamięć","subst:sg:gen:f"] | _ -> failwith "abr_patterns"); |
746 | 748 | [O "tgz"; S "."], (function [a;b] -> std a b [2,"tak","adv";1,"zwać","ppas:_:_:_:_:aff"] | _ -> failwith "abr_patterns"); |
747 | - [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns"); | |
748 | - [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns"); | |
749 | + [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns"); | |
750 | + [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns"); | |
749 | 751 | [O "tzw"; S "."], (function [a;b] -> std a b [1,"tak","adv:pos";2,"zwać","ppas:_:_:_:imperf:aff"] | _ -> failwith "abr_patterns"); |
750 | 752 | [O "ub"; S "."; O "r"; S "."], (function [a;b;c;d] -> [ct [a;b] "ubiegły" "adj:sg:$C:m3:pos"; ct [c;d] "rok" "subst:sg:$C:m3"] | _ -> failwith "abr_patterns"); |
751 | 753 | [O "w"; S "."; O "w"; S "."], (function [a;b;c;d] -> [ct [a;b] "wysoko" "adv:com"; ct [c;d] "wymienić" "ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns"); |
... | ... |
tokenizer/ENIAMpatterns.ml
... | ... | @@ -405,7 +405,7 @@ let digit_patterns4 = [ |
405 | 405 | [C "realnum-interval"; O "mld"], (function [x;_] -> make_tys 9 x | _ -> failwith "digit_patterns8"); |
406 | 406 | ] |
407 | 407 | |
408 | -let url_patterns1 = [ | |
408 | +(*let url_patterns1 = [ | |
409 | 409 | [L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
410 | 410 | [L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
411 | 411 | [L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
... | ... | @@ -461,9 +461,12 @@ let url_patterns1 = [ |
461 | 461 | let url_patterns2 = [ |
462 | 462 | [L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
463 | 463 | [L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
464 | + [L; S "_"; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
464 | 465 | [L; S "."; L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
466 | + [L; S "."; D "dig"; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
465 | 467 | [L; D "intnum"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
466 | 468 | [L; S "."; L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
469 | + [L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
467 | 470 | [O "http"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url")); |
468 | 471 | [O "https"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url")); |
469 | 472 | ] |
... | ... | @@ -472,7 +475,7 @@ let url_patterns3 = [ |
472 | 475 | [D "url"; S "/"], (function l -> Dig(concat_orths2 l,"url")); |
473 | 476 | [D "url"; S "/"; L], (function l -> Dig(concat_orths2 l,"url")); |
474 | 477 | [D "url"; S "/"; L; S "."; L], (function l -> Dig(concat_orths2 l,"url")); |
475 | -] | |
478 | +]*) | |
476 | 479 | |
477 | 480 | let html_patterns = [ |
478 | 481 | [S "<"; L; S ">"], (function l -> Dig(concat_orths2 l,"html-tag")); |
... | ... | @@ -701,7 +704,7 @@ let manage_query_boundaries tokens = |
701 | 704 | if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else |
702 | 705 | if find_beg_pattern [I "</query>";I "”s"] tokens then |
703 | 706 | replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else |
704 | - if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else | |
707 | + if find_beg_pattern [I "</query>";I ")s"(*;I "</sentence>"*)] tokens then tokens else | |
705 | 708 | replace_beg_pattern [I "</query>"] add_sentence_end tokens in |
706 | 709 | let tokens = Xlist.rev_map tokens revert_tokens in |
707 | 710 | tokens |
... | ... | @@ -724,12 +727,12 @@ let find_replacement_patterns tokens = |
724 | 727 | let tokens = find_patterns ENIAMacronyms.name_patterns tokens in |
725 | 728 | (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *) |
726 | 729 | let tokens = normalize_tokens [] tokens in |
727 | - let tokens = find_patterns url_patterns1 tokens in | |
730 | +(* let tokens = find_patterns url_patterns1 tokens in | |
728 | 731 | let tokens = normalize_tokens [] tokens in |
729 | 732 | let tokens = find_patterns url_patterns2 tokens in |
730 | 733 | let tokens = normalize_tokens [] tokens in |
731 | 734 | let tokens = find_patterns url_patterns3 tokens in |
732 | - let tokens = normalize_tokens [] tokens in | |
735 | + let tokens = normalize_tokens [] tokens in*) | |
733 | 736 | let tokens = find_patterns html_patterns tokens in |
734 | 737 | let tokens = normalize_tokens [] tokens in |
735 | 738 | (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *) |
... | ... |
tokenizer/ENIAMtokenizer.ml
... | ... | @@ -21,7 +21,8 @@ open Xstd |
21 | 21 | open ENIAMtokenizerTypes |
22 | 22 | |
23 | 23 | let initialize () = |
24 | - ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns () | |
24 | + ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns (); | |
25 | + ENIAMurl.top_level_domains := ENIAMurl.load_top_level_domains () | |
25 | 26 | |
26 | 27 | let string_of = |
27 | 28 | ENIAMtokens.string_of_tokens |
... | ... |
tokenizer/ENIAMtokenizerTypes.ml
... | ... | @@ -41,7 +41,7 @@ type token = |
41 | 41 | | Tokens of string * int list (*cat * token id list *) |
42 | 42 | |
43 | 43 | type attr = |
44 | - CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics | |
44 | + FC | CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics | |
45 | 45 | | SentBeg | SentEnd | SentBegEnd |
46 | 46 | | BrevLemma of string |
47 | 47 | | Disamb of string * string * string list list |
... | ... | @@ -71,6 +71,8 @@ type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD |
71 | 71 | let empty_token_env = { |
72 | 72 | orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} |
73 | 73 | |
74 | +let internet_mode = ref false | |
75 | + | |
74 | 76 | let resource_path = |
75 | 77 | try Sys.getenv "ENIAM_RESOURCE_PATH" |
76 | 78 | with Not_found -> |
... | ... | @@ -82,6 +84,8 @@ let resource_path = |
82 | 84 | let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab" |
83 | 85 | let mte_filename2 = resource_path ^ "/tokenizer/mte.tab" |
84 | 86 | |
87 | +let top_level_domains_filename = resource_path ^ "/tokenizer/top-level-domains.tab" | |
88 | + | |
85 | 89 | module OrderedTokenEnv = struct |
86 | 90 | |
87 | 91 | type t = token_env |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -87,7 +87,8 @@ let rec xml_of_token = function |
87 | 87 | | Tokens(cat,l) -> Xml.Element("Tokens",["pos",cat],Xlist.map l (fun x -> Xml.Element("id",[],[Xml.PCData (string_of_int x)]))) |
88 | 88 | |
89 | 89 | let string_of_attr = function |
90 | - CS -> "cs" | |
90 | + FC -> "first capital" | |
91 | + | CS -> "cs" | |
91 | 92 | | MaybeCS -> "maybe cs" |
92 | 93 | | ReqValLemm -> "required validated lemmatization" |
93 | 94 | | MWE -> "mwe" |
... | ... | @@ -212,9 +213,9 @@ let merge_digits poss_s_beg i digs = |
212 | 213 | (if Xlist.size digs <= 3 && List.hd digs <> "0" then [t (Dig(v,"pref3dig"));sc_t (Dig(v,"pref3dig"))] else []) in*) |
213 | 214 | Variant variants |
214 | 215 | |
215 | -let merge_url poss_s_beg i digs = | |
216 | +(* let merge_url poss_s_beg i digs = | |
216 | 217 | let orth = String.concat "" digs in |
217 | - Variant(dig_tokens orth poss_s_beg i digs orth "url") | |
218 | + Variant(dig_tokens orth poss_s_beg i digs orth "url") *) | |
218 | 219 | |
219 | 220 | let recognize_roman_I v = function |
220 | 221 | Capital("I",_) :: Capital("I",_) :: Capital("I",_) :: [] -> v+3,false |
... | ... | @@ -335,6 +336,7 @@ let get_first_lower = function |
335 | 336 | | _ -> failwith "get_first_lower" |
336 | 337 | |
337 | 338 | let cs_weight = -1. |
339 | +let fc_weight = -10. | |
338 | 340 | let sc_cap_weight = -0.3 |
339 | 341 | |
340 | 342 | let is_add_attr_token = function |
... | ... | @@ -361,13 +363,17 @@ let recognize_stem poss_s_beg has_sufix i letters = |
361 | 363 | Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter(merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}]; |
362 | 364 | Token{t with token=CapLetter(orth,merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}; |
363 | 365 | Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=CapLetter(orth,merge (lowercase_first letters)); weight=sc_cap_weight; attrs=MaybeCS :: t.attrs}]] |
366 | + else if !internet_mode then Variant[ | |
367 | + Token{t with token=SmallLetter orth}; | |
368 | + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter orth}]] | |
364 | 369 | else Token{t with token=SmallLetter orth} |
365 | 370 | else |
366 | 371 | if first_capital letters then |
367 | - if rest_small letters then Variant[ | |
372 | + if rest_small letters then Variant([ | |
368 | 373 | Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_first letters))}]; |
369 | 374 | Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}; |
370 | - Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]] | |
375 | + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]] @ | |
376 | + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else [])) | |
371 | 377 | else if rest_capital letters then Variant([ |
372 | 378 | Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs}; |
373 | 379 | Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs}]; |
... | ... | @@ -377,6 +383,13 @@ let recognize_stem poss_s_beg has_sufix i letters = |
377 | 383 | Token{t with token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs}; |
378 | 384 | Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs}]])) |
379 | 385 | else Token{t with token=SomeCap orth} |
386 | + else if !internet_mode then | |
387 | + if rest_small letters then Variant[ | |
388 | + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall orth}]; | |
389 | + Token{t with token=AllSmall orth}] | |
390 | + else Variant[ | |
391 | + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SomeCap orth}]; | |
392 | + Token{t with token=SomeCap orth}] | |
380 | 393 | else |
381 | 394 | if rest_small letters then Token{t with token=AllSmall orth} |
382 | 395 | else Token{t with token=SomeCap orth} |
... | ... | @@ -388,8 +401,9 @@ let recognize_stem poss_s_beg has_sufix i letters = |
388 | 401 | else Token{t with token=SmallLetter orth} |
389 | 402 | else |
390 | 403 | if first_capital letters then |
391 | - if rest_small letters then | |
392 | - Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)} | |
404 | + if rest_small letters then Variant([ | |
405 | + Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}] @ | |
406 | + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else [])) | |
393 | 407 | else if rest_capital letters then Variant([ |
394 | 408 | Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs}; |
395 | 409 | Token{t with token=FirstCap(merge (lowercase_rest letters),merge (lowercase_all letters),get_first_cap letters,get_first_lower letters); weight=cs_weight; attrs=CS :: t.attrs}] @ |
... | ... | @@ -547,24 +561,24 @@ let rec group_others rev = function |
547 | 561 | | x :: l -> List.rev rev, x :: l |
548 | 562 | |
549 | 563 | let create_sign_token poss_s_beg i signs l token = |
550 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
564 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | Small s -> s | _ -> failwith "create_sign_token")) in | |
551 | 565 | let len = Xlist.size signs * factor in |
552 | 566 | Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=token; attrs=[MaybeCS]},i+len,l,poss_s_beg |
553 | 567 | |
554 | 568 | let create_empty_sign_token i signs = |
555 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
569 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_empty_sign_token")) in | |
556 | 570 | let len = Xlist.size signs * factor in |
557 | 571 | {empty_token_env with orth=orth;beg=i;len=len;next=i+len; attrs=[MaybeCS]},i+len |
558 | 572 | |
559 | 573 | let create_sentence_seq i signs l lemma = |
560 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
574 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq")) in | |
561 | 575 | let len = Xlist.size signs * factor in |
562 | 576 | Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"}; |
563 | 577 | Token{empty_token_env with orth=orth;beg=i+20;len=len-30;next=i+len-10;token=make_lemma (lemma,"sinterj")}; |
564 | 578 | Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}] |
565 | 579 | |
566 | 580 | let create_sentence_seq_hapl i signs l lemma = |
567 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
581 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl")) in | |
568 | 582 | let len = Xlist.size signs * factor in |
569 | 583 | Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]}; |
570 | 584 | Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"}; |
... | ... | @@ -572,7 +586,7 @@ let create_sentence_seq_hapl i signs l lemma = |
572 | 586 | Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}] |
573 | 587 | |
574 | 588 | let create_sentence_seq_q i signs l lemma = |
575 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
589 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_q")) in | |
576 | 590 | let len = Xlist.size signs * factor in |
577 | 591 | Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "?"}; |
578 | 592 | Token{empty_token_env with beg=i+20;len=10;next=i+30;token=Interp "</clause>"}; |
... | ... | @@ -580,7 +594,7 @@ let create_sentence_seq_q i signs l lemma = |
580 | 594 | Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}] |
581 | 595 | |
582 | 596 | let create_sentence_seq_hapl_q i signs l lemma = |
583 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
597 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl_q")) in | |
584 | 598 | let len = Xlist.size signs * factor in |
585 | 599 | Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]}; |
586 | 600 | Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "?"}; |
... | ... | @@ -589,7 +603,7 @@ let create_sentence_seq_hapl_q i signs l lemma = |
589 | 603 | Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}] |
590 | 604 | |
591 | 605 | let create_or_beg i signs l poss_s_beg = |
592 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
606 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg")) in | |
593 | 607 | let len = Xlist.size signs * factor in |
594 | 608 | Variant[ |
595 | 609 | Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Symbol "-"; attrs=[MaybeCS]}; |
... | ... | @@ -606,7 +620,7 @@ let create_or_beg i signs l poss_s_beg = |
606 | 620 | ],i+len,l,poss_s_beg |
607 | 621 | |
608 | 622 | let create_or_beg2 i signs l poss_s_beg = |
609 | - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in | |
623 | + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg2")) in | |
610 | 624 | let len = Xlist.size signs * factor in |
611 | 625 | Variant[ |
612 | 626 | Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Interp "<or>"}; |
... | ... | @@ -631,18 +645,18 @@ let is_dot_sentence_end_marker = function |
631 | 645 | | _ -> false |
632 | 646 | |
633 | 647 | let not_dot_sentence_end_marker = function |
634 | - Sign " " :: Small _ :: _ -> true | |
635 | - | Sign "" :: Small _ :: _ -> true | |
636 | - | Sign " " :: Small _ :: _ -> true | |
648 | + Sign " " :: Small _ :: _ -> if !internet_mode then false else true | |
649 | + | Sign "" :: Small _ :: _ -> if !internet_mode then false else true | |
650 | + | Sign " " :: Small _ :: _ -> if !internet_mode then false else true | |
637 | 651 | | Sign "," :: _ -> true |
638 | 652 | | Sign ":" :: _ -> true |
639 | 653 | | Sign "?" :: _ -> true |
640 | 654 | | Sign "!" :: _ -> true |
641 | - | Small _ :: _ -> true | |
642 | - | ForeignSmall _ :: _ -> true | |
643 | - | Capital _ :: _ -> true | |
644 | - | ForeignCapital _ :: _ -> true | |
645 | - | Digit _ :: _ -> true | |
655 | + | Small _ :: _ -> if !internet_mode then false else true | |
656 | + | ForeignSmall _ :: _ -> if !internet_mode then false else true | |
657 | + | Capital _ :: _ -> if !internet_mode then false else true | |
658 | + | ForeignCapital _ :: _ -> if !internet_mode then false else true | |
659 | + | Digit _ :: _ -> if !internet_mode then false else true | |
646 | 660 | | _ -> false |
647 | 661 | |
648 | 662 | let is_comma_digit_marker = function |
... | ... | @@ -705,6 +719,7 @@ let rec recognize_sign_group poss_s_beg i = function |
705 | 719 | | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ") |
706 | 720 | | (Sign "") :: l -> create_sign_token poss_s_beg i [Sign ""] l (Symbol " ") |
707 | 721 | | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ") |
722 | + | (Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: l -> create_sign_token poss_s_beg i ((Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: []) l (Symbol " ") | |
708 | 723 | | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l |
709 | 724 | | (Sign "\"") :: l -> |
710 | 725 | let t,i2 = create_empty_sign_token i [Sign "\""] in |
... | ... | @@ -775,12 +790,16 @@ let rec recognize_sign_group poss_s_beg i = function |
775 | 790 | | (Sign ";") :: (Sign ")") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: (Sign ")") :: []) l (make_lemma (";))","sinterj")) *) |
776 | 791 | | (Sign ":") :: (Sign "|") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "|") :: []) l (make_lemma (":|","sinterj")) |
777 | 792 | | (Sign ":") :: (Sign "\\") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "\\") :: []) l (make_lemma (":\\","sinterj")) |
793 | + | (Sign ":") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "/") :: []) l (make_lemma (":/","sinterj")) | |
778 | 794 | | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj")) |
779 | 795 | (* | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj")) |
780 | 796 | | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj")) *) |
781 | 797 | | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*) |
782 | - let t,i = create_empty_sign_token i [Sign ")"] in | |
783 | - Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg | |
798 | + let t,i2 = create_empty_sign_token i [Sign ")"] in | |
799 | + Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"}; | |
800 | + Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Interp "</clause>"}; | |
801 | + Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</sentence>"}; | |
802 | + Token{empty_token_env with orth=":";beg=i+20;len=factor-20;next=i+factor;token=Interp ")s"}]],i2,l,true | |
784 | 803 | | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj")) |
785 | 804 | | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj")) |
786 | 805 | | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol")) |
... | ... | @@ -928,29 +947,32 @@ let rec recognize_sign_group poss_s_beg i = function |
928 | 947 | create_sentence_seq i ((Sign ".") :: (Sign ".") :: []) l "…"; |
929 | 948 | Token{empty_token_env with orth="..";beg=i;len=2*factor;next=i+2*factor;token=make_lemma ("…","sinterj"); attrs=[MaybeCS]}],i+2*factor,l,true |
930 | 949 | | (Sign ".") :: l -> |
931 | - if is_dot_sentence_end_marker l then | |
950 | + if is_dot_sentence_end_marker l then ((*Printf.printf "dot 1 i=%d\n%!" i;*) | |
932 | 951 | Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]}; |
933 | 952 | Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"}; |
934 | 953 | Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]; |
935 | 954 | Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"}; |
936 | - Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true | |
937 | - else if not_dot_sentence_end_marker l then | |
938 | - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false | |
939 | - else | |
955 | + Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true) | |
956 | + else if not_dot_sentence_end_marker l then ((*Printf.printf "dot 2 i=%d\n%!" i;*) | |
957 | + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false) | |
958 | + else ((*Printf.printf "dot 3 i=%d\n%!" i;*) | |
940 | 959 | Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]}; |
941 | 960 | Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"}; |
942 | 961 | Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]; |
943 | 962 | Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"}; |
944 | 963 | Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]; |
945 | - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true | |
964 | + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true) | |
946 | 965 | | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "*****") (* zastępniki liter *) |
947 | 966 | | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "****") |
948 | 967 | | (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*"] l (Interp "***") |
949 | 968 | | (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*"] l (Interp "**") |
950 | - | (Sign "*") :: l -> (* Interp zastępnik liter i cudzysłów, symbol listy *) | |
951 | - let t,i = create_empty_sign_token i [Sign "*"] in | |
952 | - Variant[Token{t with token=Interp "*"};Token{t with token=Symbol "*"}],i,l,poss_s_beg | |
953 | - | (Sign "+") :: l -> create_sign_token poss_s_beg i [Sign "+"] l (Symbol "+") | |
969 | + | (Sign "*") :: l -> (* Interp to zastępnik liter i cudzysłów, symbol listy *) | |
970 | + let t,i2 = create_empty_sign_token i [Sign "*"] in | |
971 | + Variant([Token{t with token=Interp "*"};Token{t with token=Symbol "*"}] @ | |
972 | + (if !internet_mode then [sc_dig_token "*" i [Sign "*"] (make_lemma ("*","symbol"))] else [])),i2,l,poss_s_beg | |
973 | + | (Sign "+") :: l -> (* Interp to spójnik *) | |
974 | + let t,i2 = create_empty_sign_token i [Sign "+"] in | |
975 | + Variant[Token{t with token=Interp "+"};Token{t with token=Symbol "+"}],i2,l,poss_s_beg | |
954 | 976 | | (Sign "«") :: l -> |
955 | 977 | let t,i = create_empty_sign_token i [Sign "«"] in |
956 | 978 | Variant[Token{t with token=Interp "«"};Token{t with token=Interp "«s"}],i,l,poss_s_beg |
... | ... | @@ -1005,6 +1027,7 @@ let rec recognize_sign_group poss_s_beg i = function |
1005 | 1027 | | (Sign "_") :: l -> create_sign_token poss_s_beg i [Sign "_"] l (Symbol "_") |
1006 | 1028 | | (Sign "@") :: l -> create_sign_token poss_s_beg i [Sign "@"] l (Symbol "@") |
1007 | 1029 | | (Sign "×") :: l -> create_sign_token poss_s_beg i [Sign "×"] l (Symbol "×") |
1030 | + | (Sign "±") :: l -> create_sign_token poss_s_beg i [Sign "±"] l (Symbol "±") | |
1008 | 1031 | | (Sign "%") :: l -> |
1009 | 1032 | let t,i = create_empty_sign_token i [Sign "%"] in |
1010 | 1033 | Variant[Token{t with token=Symbol "%"};Token{t with token=make_lemma ("procent","subst:_:_:m3")}],i,l,false |
... | ... | @@ -1018,12 +1041,15 @@ let rec recognize_sign_group poss_s_beg i = function |
1018 | 1041 | | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") |
1019 | 1042 | | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") |
1020 | 1043 | | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") |
1044 | + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®") | |
1045 | + | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | |
1046 | + | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ") | |
1021 | 1047 | | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s) |
1022 | 1048 | | l -> failwith "recognize_sign_group" |
1023 | 1049 | |
1024 | 1050 | (* FIXME: "„Szpak” frunie." trzeba przenieść <sentence> przed „, ale zostawić po „s. *)
1025 | 1051 | |
1026 | -let rec group_url rev = function | |
1052 | +(*let rec group_url rev = function | |
1027 | 1053 | Small s :: l -> group_url (s :: rev) l |
1028 | 1054 | | Capital(s,t) :: l -> group_url (s :: rev) l |
1029 | 1055 | | ForeignSmall s :: l -> group_url (s :: rev) l |
... | ... | @@ -1040,24 +1066,34 @@ let rec group_url rev = function |
1040 | 1066 | | Sign "," :: l -> group_url ("," :: rev) l |
1041 | 1067 | | Sign "~" :: l -> group_url ("~" :: rev) l |
1042 | 1068 | | Sign "_" :: l -> group_url ("_" :: rev) l |
1043 | - | l -> List.rev rev, l | |
1069 | + | l -> List.rev rev, l*) | |
1070 | + | |
1071 | +let merge_url poss_s_beg i len orth cat = | |
1072 | + if poss_s_beg then | |
1073 | + Variant[Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)}; | |
1074 | + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=len*factor-2;next=i+len*factor;token=Dig(orth,cat)}]] | |
1075 | + else | |
1076 | + Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)} | |
1044 | 1077 | |
1045 | 1078 | let rec group_chars poss_s_beg i rev = function |
1046 | 1079 | [] -> List.rev ((Token{empty_token_env with beg=i;len=factor;next=i+factor;token=Interp "</query>"}) :: rev) |
1047 | - | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l -> | |
1080 | + (* | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l -> | |
1048 | 1081 | let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l |
1049 | 1082 | | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Small "s") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l -> |
1050 | - let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l | |
1051 | - | (Digit s) :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l | |
1052 | - | (Sign s) :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l | |
1053 | - | (Capital(s,t)) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1054 | - | (ForeignCapital(s,t)) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1055 | - | (Small s) :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1056 | - | (ForeignSmall s) :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1057 | - | (Other(s,x)) :: l -> | |
1083 | + let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l *) | |
1084 | + | Digit s :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l | |
1085 | + | Sign s :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l | |
1086 | + | Capital(s,t) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1087 | + | ForeignCapital(s,t) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1088 | + | Small s :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1089 | + | ForeignSmall s :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l | |
1090 | + | Emoticon s :: l -> group_chars poss_s_beg (i + factor) ((Token{empty_token_env with orth=s;beg=i;len=factor;next=i+factor;token=make_lemma (s,"sinterj")}) :: rev) l | |
1091 | + | Other("url",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "url") :: rev) l | |
1092 | + | Other("email",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "email") :: rev) l | |
1093 | + | Other(s,x) :: l -> | |
1058 | 1094 | let x,l = group_others [] ((Other(s,x)) :: l) in |
1059 | 1095 | group_chars false (i + Xlist.size x * factor) |
1060 | - ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+factor;token=Other(String.concat "" x)}) :: rev) l | |
1096 | + ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+Xlist.size x * factor;token=Other(String.concat "" x)}) :: rev) l | |
1061 | 1097 | |
1062 | 1098 | let tokenize l = |
1063 | - (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] l) | |
1099 | + (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] (ENIAMurl.find l)) | |
... | ... |
tokenizer/makefile
... | ... | @@ -6,25 +6,26 @@ OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | -SOURCES= ENIAMtokenizerTypes.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml | |
9 | +SOURCES= ENIAMtokenizerTypes.ml ENIAMurl.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml | |
10 | 10 | |
11 | 11 | all: eniam-tokenizer.cma eniam-tokenizer.cmxa |
12 | 12 | |
13 | 13 | install: all |
14 | 14 | mkdir -p $(INSTALLDIR) |
15 | 15 | cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR) |
16 | - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR) | |
17 | - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR) | |
16 | + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR) | |
17 | + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR) | |
18 | 18 | mkdir -p /usr/share/eniam/tokenizer |
19 | 19 | cp resources/* /usr/share/eniam/tokenizer |
20 | 20 | |
21 | 21 | install-local: all |
22 | 22 | mkdir -p $(INSTALLDIR) |
23 | 23 | cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR) |
24 | - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR) | |
25 | - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR) | |
24 | + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR) | |
25 | + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR) | |
26 | 26 | mkdir -p /usr/local/share/eniam/tokenizer |
27 | 27 | cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab |
28 | + cp resources/top-level-domains.tab /usr/local/share/eniam/tokenizer/top-level-domains.tab | |
28 | 29 | cp resources/README /usr/local/share/eniam/tokenizer/README |
29 | 30 | # ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab |
30 | 31 | |
... | ... |