Commit 0c36e02e82975a65c9794c6e55eaa5530fdaa16a

Authored by Wojciech Jaworski
1 parent e0385166

Rozszerzenie funkcjonalności tokenizera

LCGlexicon/ENIAM_LCGlexicon.ml
... ... @@ -312,6 +312,7 @@ let create_entries rules id orth cats valence lex_entries =
312 312 (* variable_name_ref := []; *)
313 313 if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else
314 314 if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else
  315 + if (cats.pos2="noun" || cats.pos2="verb" || cats.pos2="adj" || cats.pos2="adv") && cats.cat="X" && not !default_category_flag then l else
315 316 let e = get_labels () in
316 317 (* print_endline "create_entries 1"; *)
317 318 let rules = find_rules rules cats in
... ...
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -79,6 +79,8 @@ let empty_cats = {lemma=""; pos=""; pos2=""; cat="X"; coerced=[];
79 79 nsyn=[]; nsem=[]; modes=[]; psem=[];
80 80 }
81 81  
  82 +let default_category_flag = ref true
  83 +
82 84 let resource_path =
83 85 try Sys.getenv "ENIAM_RESOURCE_PATH"
84 86 with Not_found ->
... ...
exec/ENIAMvisualization.ml
... ... @@ -24,7 +24,7 @@ open ENIAMtokenizerTypes
24 24 open ENIAMexecTypes
25 25  
26 26 let string_of_status = function
27   - Idle -> "Idle"
  27 + Idle -> "Idle"
28 28 | PreprocessingError -> "PreprocessingError"
29 29 | LexiconError -> "LexiconError"
30 30 | ParseError -> "ParseError"
... ... @@ -786,6 +786,80 @@ let create_latex_dep_chart path name dep_chart =
786 786 LatexMain.latex_compile_and_clean path name
787 787 *)
788 788  
  789 +let rec extract_pos_cat_internal vars = function
  790 + | Atom x -> x
  791 + | AVar x -> (try extract_pos_cat_internal vars (Xlist.assoc vars x) with Not_found -> failwith "extract_pos_cat_internal")
  792 + | With l -> String.concat "&" (Xlist.map l (extract_pos_cat_internal vars))
  793 + | Zero -> "0"
  794 + | Top -> "T"
  795 +
  796 +let rec extract_pos_cat vars = function
  797 + | Tensor [] -> failwith "extract_pos_cat: ni"
  798 + | Tensor [pos] -> extract_pos_cat_internal vars pos
  799 + | Tensor (Atom "num" :: _) -> "Number"
  800 + | Tensor (Atom "prepnp" :: _) -> "Prep"
  801 + | Tensor (pos :: cat :: _) -> (*extract_pos_cat_internal vars pos ^ "*" ^*) extract_pos_cat_internal vars cat
  802 + | Plus l -> failwith "extract_pos_cat: ni"
  803 + | Imp(s,d,t2) -> extract_pos_cat vars s
  804 + | One -> failwith "extract_pos_cat: ni"
  805 + | ImpSet(s,l) -> extract_pos_cat vars s
  806 + | WithVar(v,g,e,s) -> extract_pos_cat ((v,g) :: vars) s
  807 + | Star s -> failwith "extract_pos_cat: ni"
  808 + | Bracket(lf,rf,s) -> extract_pos_cat vars s
  809 + | BracketSet d -> "BracketSet"
  810 + | Maybe s -> failwith "extract_pos_cat: ni"
  811 +
  812 +let get_text_fragment text_fragments node1 node2 =
  813 + try IntMap.find text_fragments.(node1) node2
  814 + with Not_found -> "???"(*failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2)*)
  815 +
  816 +let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<siebie>";"<prep>";
  817 + "<num>";"<intnum>";"<realnum>";"<intnum-interval>";"<realnum-interval>";"<symbol>";"<ordnum>";
  818 + "<date>";"<date-interval>";"<hour-minute>";"<hour>";"<hour-minute-interval>";"<hour-interval>";
  819 + "<year>";"<year-interval>";"<day>";"<day-interval>";"<day-month>";"<day-month-interval>";
  820 + "<month-interval>";"<roman>";"<roman-interval>";"<roman-ordnum>";"<match-result>";"<url>";
  821 + "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>";
  822 + "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>";
  823 + "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>";
  824 + "<interp>";"<part>";"<unk>";"<building-number>"]
  825 +
  826 +let cat_tokens_sequence text_fragments g =
  827 + let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) ->
  828 + node1,node2,
  829 + (if m < node1 then
  830 + if n < node1 then [n, node1, get_text_fragment text_fragments n node1, "null"]
  831 + else if n = node1 then []
  832 + else [node1, n, get_text_fragment text_fragments node1 n, "overlap"]
  833 + else if m = node1 then
  834 + if n < node2 then [m, n, get_text_fragment text_fragments m n, "overlap"]
  835 + else if n = node2 then []
  836 + else [node1, node2, get_text_fragment text_fragments node1 node2, "overlap"]
  837 + else failwith "cat_tokens_sequence") @
  838 + [node1, node2, get_text_fragment text_fragments node1 node2, extract_pos_cat [] symbol] @ l) in
  839 + let map = Xlist.fold l IntMap.empty (fun map (m,n,text,symbol) ->
  840 + IntMap.add_inc map (1000000*m+n) [text,symbol] (fun l -> (text,symbol) :: l)) in
  841 + let map = IntMap.map map (fun l ->
  842 + let t,ov,set = Xlist.fold l ("",false,StringSet.empty) (fun (t,ov,set) (text,symbol) ->
  843 + if symbol = "null" then text,ov,set
  844 + else if symbol = "overlap" then t,true,set
  845 + else if StringSet.mem omited symbol then text,ov,set
  846 + else t,ov,StringSet.add set symbol) in
  847 + let l = if StringSet.is_empty set then [t] else StringSet.to_list set in
  848 + if ov then "OVERLAP{" ^ String.concat " " l ^ "}" else
  849 + match l with
  850 + [t] -> t
  851 + | _ -> "{" ^ String.concat " " l ^ "}") in
  852 + let l = List.sort compare (IntMap.fold map [] (fun l k texts -> (k,texts) :: l)) in
  853 +(* let l = Xlist.sort l (fun (m1,n1,text1,symbol1) (m2,n2,text2,symbol2) ->
  854 + if m1 <> m2 then compare m1 m2 else
  855 + if n1 <> n2 then compare n1 n2 else
  856 + compare symbol1 symbol2) in
  857 + let l = if l = [] then l else
  858 + Xlist.fold (List.tl l) [List.hd l] (fun l a ->
  859 + match l with
  860 + [] -> failwith "cat_tokens_sequence"
  861 + | b :: l -> if a = b then b :: l else a :: b :: l) in*)
  862 + String.concat " " (Xlist.map l (fun (n,texts) -> texts))
789 863  
790 864 (* verbosity:
791 865 0 -> jedynie informacja o statusie zdania
... ... @@ -796,13 +870,13 @@ let create_latex_dep_chart path name dep_chart =
796 870 let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam_parse_result) =
797 871 match result.status with
798 872 Idle -> "<font color=\"red\">idle</font>\n"
799   - | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" result.msg result.paths_size
  873 + | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size
800 874 | ParseError ->
801 875 if verbosity = 0 then () else (
802 876 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
803 877 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
804 878 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
805   - sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" result.msg result.paths_size ^
  879 + sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^
806 880 (if verbosity = 0 then "" else
807 881 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
808 882 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -814,7 +888,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
814 888 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
815 889 if verbosity = 0 then () else (
816 890 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2);
817   - sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" result.msg result.paths_size ^
  891 + sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^
818 892 (if verbosity < 2 then "" else
819 893 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
820 894 sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix) ^
... ... @@ -840,6 +914,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
840 914 sprintf "<BR><A HREF=\"%s_3_references.pdf\">References 3</A>\n" file_prefix ^
841 915 sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^
842 916 (if verbosity = 0 then "" else
  917 + sprintf "<BR>%s\n" (escape_html (cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1))) ^
843 918 sprintf "<BR><A HREF=\"%s_3_chart_selection.pdf\">Chart 3 Selection</A>\n" file_prefix) ^
844 919 ""
845 920 | ReductionError ->
... ... @@ -851,7 +926,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
851 926 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
852 927 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3);
853 928 (if verbosity < 2 then "" else
854   - sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" result.msg result.paths_size result.chart_size ^
  929 + sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size ^
855 930 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
856 931 sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix ^
857 932 sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^
... ... @@ -909,7 +984,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
909 984 Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 result.term4));
910 985 Xlatex.latex_compile_and_clean path (file_prefix ^ "_4_term");
911 986 ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_4_dependency_tree") "a0" result.dependency_tree4);
912   - sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  987 + sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
913 988 (if verbosity < 2 then "" else
914 989 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
915 990 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -939,7 +1014,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
939 1014 ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_6b_dependency_tree") result.dependency_tree6b;
940 1015 ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6a_simple_dependency_tree") result.dependency_tree6a;
941 1016 ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6b_simple_dependency_tree") result.dependency_tree6b);
942   - sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1017 + sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
943 1018 (if verbosity < 2 then "" else
944 1019 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
945 1020 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -1010,7 +1085,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1010 1085 if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8;
1011 1086 if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9;
1012 1087 if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9);
1013   - sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1088 + sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1014 1089 (if verbosity = 0 then "" else
1015 1090 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1016 1091 (if result.dependency_tree7 <> [| |] then sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix else "") ^
... ... @@ -1038,7 +1113,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1038 1113 if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8;
1039 1114 if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9;
1040 1115 if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9));
1041   - sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1116 + sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1042 1117 (if verbosity = 2 then
1043 1118 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1044 1119 (if result.semantic_graph10 <> [| |] then sprintf "<BR><A HREF=\"%s_10_semantic_graph.pdf\">Semantic Graph References 10</A>\n" file_prefix else "") ^
... ... @@ -1061,7 +1136,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1061 1136 | SemGraphError2 ->
1062 1137 if verbosity = 0 then () else (
1063 1138 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11);
1064   - sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1139 + sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1065 1140 (if verbosity = 0 then "" else
1066 1141 sprintf "<BR><IMG SRC=\"%s_11_semantic_graph.png\">\n" file_prefix) ^
1067 1142 ""
... ... @@ -1077,7 +1152,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1077 1152 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11);
1078 1153 if verbosity = 0 then () else (
1079 1154 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_12_semantic_graph") "" result.semantic_graph12);
1080   - sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1155 + sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1081 1156 (if verbosity < 2 then "" else
1082 1157 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1083 1158 sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix ^
... ... @@ -1386,3 +1461,28 @@ let rec print_main_result_first_page_text cg_bin_path path id tokens = function
1386 1461 (List.rev (Xlist.fold paragraphs [] find_prev_next_paragraph)) in
1387 1462 print_main_result_first_page_paragraph cg_bin_path path id tokens prev_next_map (List.hd paragraphs)
1388 1463 | AltText l -> Xlist.iter l (fun (mode,text) -> print_main_result_first_page_text cg_bin_path path id tokens text)
  1464 +
  1465 +let to_string_eniam_sentence verbosity tokens (result : eniam_parse_result) =
  1466 + let status_string = string_of_status result.status in
  1467 + if result.status = NotParsed then
  1468 + [status_string ^ ": " ^ cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1)]
  1469 + else [status_string]
  1470 +
  1471 +let rec to_string_sentence verbosity tokens = function
  1472 + RawSentence s -> []
  1473 + | StructSentence(paths,last) -> []
  1474 + | DepSentence paths -> []
  1475 + | ENIAMSentence result -> to_string_eniam_sentence verbosity tokens result
  1476 + | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence))
  1477 + | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> to_string_sentence verbosity tokens sentence))
  1478 +
  1479 +let rec to_string_paragraph verbosity tokens = function
  1480 + RawParagraph s -> []
  1481 + | StructParagraph sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence))
  1482 + | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string_paragraph verbosity tokens paragraph))
  1483 + | ErrorParagraph s -> ["SubsyntaxError"]
  1484 +
  1485 +let rec to_string_text verbosity tokens = function
  1486 + RawText s -> []
  1487 + | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens))
  1488 + | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text))
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -164,12 +164,13 @@ let get_single_letter_orths paths =
164 164 IntMap.fold map orths (fun orths _ l ->
165 165 TokenEnvSet.fold l orths (fun orths t ->
166 166 match t.token with
167   - SmallLetter lemma -> StringSet.add orths lemma
  167 + SmallLetter lemma -> (*if lemma <> "g" then*) StringSet.add orths lemma (*else orths*) (* FIXME: !!!! *)
168 168 | CapLetter(lemma,_) -> StringSet.add orths lemma
169 169 | _ -> orths)))
170 170  
171 171 let preselect orths lemmas rules l =
172 172 Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
  173 + (* print_endline ("preselect: " ^ lemma); *)
173 174 let b = Xlist.fold match_list true (fun b -> function
174 175 O s -> StringSet.mem orths s && b
175 176 | L(s,_,_) -> StringSet.mem lemmas s && b
... ... @@ -179,6 +180,7 @@ let preselect orths lemmas rules l =
179 180 let preselect_dict orths lemmas dict rules =
180 181 StringSet.fold orths rules (fun rules orth ->
181 182 try
  183 + (* print_endline ("preselect_dict: " ^ orth); *)
182 184 preselect orths lemmas rules (StringMap.find dict orth)
183 185 with Not_found -> rules)
184 186  
... ... @@ -195,7 +197,7 @@ let add_ordnum_rules orths rules =
195 197 (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
196 198  
197 199 let add_quot_rule rules =
198   - (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
  200 + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","Interp",[]) :: rules
199 201  
200 202 let add_building_number_rules dig_orths letter_orths rules =
201 203 StringSet.fold dig_orths rules (fun rules dig1 ->
... ... @@ -215,15 +217,22 @@ let add_building_number_rules dig_orths letter_orths rules =
215 217  
216 218 let select_rules paths mwe_dict mwe_dict2 =
217 219 let orths = get_orths paths in
  220 + (* print_endline ("ENIAM_MWE.select_rules 1 orths=[" ^ String.concat ";" (StringSet.to_list orths) ^ "]"); *)
218 221 let lemmas = get_lemmas paths in
219 222 let intnum_orths = get_intnum_orths paths in
220   - let year_orths = get_year_orths paths in
221   - let letter_orths = get_single_letter_orths paths in
  223 + (* let year_orths = get_year_orths paths in *)
  224 + (* let letter_orths = get_single_letter_orths paths in *)
222 225 let rules = preselect_dict orths lemmas mwe_dict [] in
  226 + (* print_endline ("ENIAM_MWE.select_rules 1 |rules|=" ^ string_of_int (Xlist.size rules)); *)
  227 + (* Xlist.iter rules (fun (is_mwe,match_list,lemma,cat,interp) -> print_endline lemma); *)
223 228 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
  229 + (* print_endline ("ENIAM_MWE.select_rules 2 |rules|=" ^ string_of_int (Xlist.size rules)); *)
224 230 let rules = add_ordnum_rules intnum_orths rules in
  231 + (* print_endline ("ENIAM_MWE.select_rules 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
225 232 let rules = add_quot_rule rules in
226   - let rules = add_building_number_rules year_orths letter_orths rules in
  233 + (* print_endline ("ENIAM_MWE.select_rules 4 |rules|=" ^ string_of_int (Xlist.size rules)); *)
  234 + (* let rules = add_building_number_rules year_orths letter_orths rules in *) (* FIXME !!!! *)
  235 + (* print_endline ("ENIAM_MWE.select_rules 5 |rules|=" ^ string_of_int (Xlist.size rules) ^ " |year_orths|=" ^ string_of_int (StringSet.size year_orths) ^ " |letter_orths|=" ^ string_of_int (StringSet.size letter_orths)); *)
227 236 rules
228 237  
229 238 let rec check_interp sels = function
... ... @@ -306,7 +315,8 @@ let create_token is_mwe (matching:token_env list) sels lemma cat interp = (* FIX
306 315 beg=beg;
307 316 len=len;
308 317 next=t.next;
309   - token=Lemma(lemma,cat,[Xlist.map interp (function
  318 + token=if cat = "Interp" then Interp lemma else
  319 + Lemma(lemma,cat,[Xlist.map interp (function
310 320 S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
311 321 | V s -> Xstring.split "\\." s
312 322 | G -> ["_"])]);
... ... @@ -327,18 +337,34 @@ let apply_rule paths (is_mwe,match_list,lemma,cat,interp) =
327 337 add_token paths token
328 338 with Not_found -> paths)
329 339  
  340 +let count_path_size paths =
  341 + IntMap.fold paths 0 (fun n _ map2 ->
  342 + IntMap.fold map2 n (fun n _ set ->
  343 + TokenEnvSet.size set + n))
  344 +
330 345 let process (paths,last) =
  346 + (* print_endline ("ENIAM_MWE.process 1 |paths|=" ^ string_of_int (Xlist.size paths)); *)
331 347 let paths = Xlist.fold paths IntMap.empty add_token in
  348 + (* print_endline ("ENIAM_MWE.process 2 |paths|=" ^ string_of_int (count_path_size paths)); *)
332 349 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  350 + (* print_endline ("ENIAM_MWE.process 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
333 351 let paths = Xlist.fold rules paths apply_rule in
  352 + (* print_endline ("ENIAM_MWE.process 4 |paths|=" ^ string_of_int (count_path_size paths)); *)
334 353 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  354 + (* print_endline ("ENIAM_MWE.process 5 |rules|=" ^ string_of_int (Xlist.size rules)); *)
335 355 let paths = Xlist.fold rules paths apply_rule in
  356 + (* print_endline ("ENIAM_MWE.process 6 |paths|=" ^ string_of_int (count_path_size paths)); *)
336 357 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  358 + (* print_endline ("ENIAM_MWE.process 7 |rules|=" ^ string_of_int (Xlist.size rules)); *)
337 359 let paths = Xlist.fold rules paths apply_rule in
  360 + (* print_endline "ENIAM_MWE.process 8"; *)
338 361 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  362 + (* print_endline "ENIAM_MWE.process 9"; *)
339 363 let paths = Xlist.fold rules paths apply_rule in
  364 + (* print_endline "ENIAM_MWE.process 10"; *)
340 365 let paths = IntMap.fold paths [] (fun paths _ map ->
341 366 IntMap.fold map paths (fun paths _ l ->
342 367 TokenEnvSet.fold l paths (fun paths t ->
343 368 t :: paths))) in
  369 + (* print_endline "ENIAM_MWE.process 11"; *)
344 370 ENIAMpaths.sort (paths,last)
... ...
subsyntax/ENIAMsentences.ml
... ... @@ -139,6 +139,13 @@ let find_query paragraph tokens chart last =
139 139 (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
140 140 (fun ids -> Tokens("query",ids))
141 141  
  142 +let find_query2 paragraph tokens chart last =
  143 + parse_bracket_rule paragraph tokens chart last
  144 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
  145 + (fun tokens id -> true)
  146 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
  147 + (fun ids -> Tokens("query",ids))
  148 +
142 149 let find_tokens_in_chart tokens chart lnode rnode cat =
143 150 let found = Xlist.fold chart.(lnode) [] (fun found (id,rnode2) ->
144 151 if rnode = rnode2 then
... ... @@ -149,7 +156,8 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
149 156 else found) in
150 157 match found with
151 158 [x] -> x
152   - | _ -> failwith "Unable to extract sentences. Check puntuation."
  159 + | [] -> failwith "Unable to extract sentences. Check puntuation."
  160 + | _ -> failwith "find_tokens_in_chart"
153 161  
154 162 (*let find_tokens_in_chart_id tokens chart lnode rnode cat =
155 163 let found = Int.fold 0 last [] (fun ids lnode ->
... ... @@ -215,6 +223,13 @@ let extract_sentences pid tokens chart last =
215 223 psentence=AltSentence[Raw,RawSentence paragraph;
216 224 ENIAM,StructSentence("",paths,last)]}]*)
217 225  
  226 +let extract_sentences2 pid tokens chart last =
  227 + let ids = find_tokens_in_chart tokens chart 0 last "query" in
  228 + let paths,last = make_paths tokens ids in
  229 + let sentences = [{id="0"; beg=0; len=last; next=last; file_prefix="";
  230 + sentence=AltSentence([ENIAM,StructSentence(paths,last)])}] in
  231 + add_struct_sentence_ids pid sentences
  232 +
218 233 (*
219 234 let is_sentence = function
220 235 Sentence _ -> true
... ... @@ -269,6 +284,7 @@ let make_chart paths last =
269 284 chart
270 285  
271 286 let split_into_sentences pid paragraph tokens paths =
  287 + (* print_endline "split_into_sentences"; *)
272 288 let paths = make_ids tokens paths in
273 289 let paths,last = prepare_indexes paths in
274 290 let chart = make_chart paths last in
... ... @@ -280,3 +296,12 @@ let split_into_sentences pid paragraph tokens paths =
280 296 find_paren_sentences par tokens chart last;
281 297 find_query par tokens chart last;
282 298 extract_sentences pid tokens chart last
  299 +
  300 +let no_split_into_sentences pid paragraph tokens paths =
  301 + (* print_endline "no_split_into_sentences"; *)
  302 + let paths = make_ids tokens paths in
  303 + let paths,last = prepare_indexes paths in
  304 + let chart = make_chart paths last in
  305 + let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in
  306 + find_query2 par tokens chart last;
  307 + extract_sentences2 pid tokens chart last
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -233,7 +233,8 @@ let rec select_tokens2_rec last paths nodes map =
233 233 select_tokens2_rec last paths nodes map
234 234  
235 235 let rec calculate_quality q = function
236   - CS :: l -> calculate_quality (q-2) l
  236 + FC :: l -> calculate_quality (q-2) l
  237 + | CS :: l -> calculate_quality (q-2) l
237 238 | MaybeCS :: l -> calculate_quality q l
238 239 | ReqValLemm :: l -> calculate_quality q l
239 240 | MWE :: l -> calculate_quality (q+6) l
... ... @@ -313,7 +314,7 @@ let initialize () =
313 314  
314 315 let parse query =
315 316 let l = ENIAMtokenizer.parse query in
316   -(* print_endline "a6"; *)
  317 + (* print_endline "a6"; *)
317 318 let paths = ENIAMpaths.translate_into_paths l in
318 319 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a7"; *)
319 320 (* print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
... ... @@ -324,21 +325,13 @@ let parse query =
324 325 let paths,_ = ENIAM_MWE.process paths in
325 326 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a12"; *)
326 327 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
327   -(* let paths = find_proper_names paths in*)
328 328 let paths = List.rev (Xlist.rev_map paths find_proper_names) in
329 329 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a13"; *)
330 330 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
331 331 let paths = modify_weights paths in
332 332 let paths = translate_digs paths in
333   -(* let paths = assign_senses paths in
334   -(* print_endline "a14"; *)
335   - let paths = assign_valence paths in*)
336   -(* print_endline "a15"; *)
  333 + (* print_endline "a14"; *)
337 334 let paths = combine_interps paths in
338   -(* print_endline "a16"; *)
339   -(* let paths = disambiguate_senses paths in
340   - let paths = assign_simplified_valence paths in
341   - let paths = PreSemantics.assign_semantics paths in*)
342 335 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a16"; *)
343 336 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
344 337 let paths = select_tokens paths in
... ... @@ -351,36 +344,39 @@ let parse query =
351 344 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a19"; *)
352 345 paths(*, next_id*)
353 346  
354   -let parse_text_tokens tokens query =
  347 +let parse_text_tokens sentence_split_flag tokens query =
355 348 (* print_endline ("parse_text_tokens: " ^ query); *)
356 349 let paragraphs = Xstring.split "\n\\|\r" query in
357 350 let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
358 351 let n = if Xlist.size paragraphs = 1 then 0 else 1 in
359 352 let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
360 353 try
  354 + (* print_endline paragraph; *)
361 355 let paths = parse paragraph in
362 356 (* print_endline "parse_text 1"; *)
363 357 let pid = if n = 0 then "" else string_of_int n ^ "_" in
364   - let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
  358 + let sentences =
  359 + if sentence_split_flag then ENIAMsentences.split_into_sentences pid paragraph tokens paths
  360 + else ENIAMsentences.no_split_into_sentences pid paragraph tokens paths in
365 361 (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1
366 362 with e ->
367 363 (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in
368 364 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
369 365  
370   -let parse_text query =
  366 +let parse_text sentence_split_flag query =
371 367 (* print_endline ("parse_text: " ^ query); *)
372 368 let tokens = ExtArray.make 100 empty_token_env in
373 369 let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
374   - parse_text_tokens tokens query
  370 + parse_text_tokens sentence_split_flag tokens query
375 371  
376 372 let catch_parse text =
377 373 try
378 374 let tokens = parse text in tokens,""
379 375 with e -> [], Printexc.to_string e
380 376  
381   -let catch_parse_text text =
  377 +let catch_parse_text sentence_split_flag text =
382 378 try
383   - let text,tokens = parse_text text in text,tokens,""
  379 + let text,tokens = parse_text sentence_split_flag text in text,tokens,""
384 380 with e ->
385 381 RawText text,
386 382 ExtArray.make 0 empty_token_env,
... ...
subsyntax/interface.ml
... ... @@ -18,24 +18,28 @@
18 18 *)
19 19  
20 20 type output = Text | Xml | Html | Marsh | Graphviz
  21 +type sentence_split = Full | Partial | None
21 22  
22 23 let output = ref Text
23 24 let comm_stdio = ref true
24   -let sentence_split = ref true
  25 +let sentence_split = ref Full
25 26 let port = ref 5439
26 27  
27 28 let spec_list = [
28   - "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
29   - "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  29 + "-s", Arg.Unit (fun () -> sentence_split:=Full), "Split input into sentences (default)";
  30 + "-a", Arg.Unit (fun () -> sentence_split:=Partial), "Split input into paragraphs, do not split input into sentences";
  31 + "-n", Arg.Unit (fun () -> sentence_split:=None), "Do not split input into sentences";
30 32 "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
31 33 "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
32 34 "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
33 35 "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
34 36 "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
35 37 "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
36   - "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
  38 + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=None), "Output as graphviz dot file; turns sentence split off";
37 39 "--strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=true), "Perform strong disambiguation";
38 40 "--no-strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=false), "Do not perform strong disambiguation (default)";
  41 + "--internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=true), "Relaxed attitude towards interpunction";
  42 + "--no-internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=false), "Strict attitude towards interpunction (default)";
39 43 ]
40 44  
41 45 let usage_msg =
... ... @@ -62,8 +66,10 @@ let rec main_loop in_chan out_chan =
62 66 (* print_endline "input text begin";
63 67 print_endline text;
64 68 print_endline "input text end"; *)
65   - (if !sentence_split then
66   - let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in
  69 + (if !sentence_split = Full || !sentence_split = Partial then
  70 + let text,tokens,msg =
  71 + if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true text
  72 + else ENIAMsubsyntax.catch_parse_text false text in
67 73 (match !output with
68 74 Text ->
69 75 if msg = "" then output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^
... ...
subsyntax/resources/brev.tab
... ... @@ -209,6 +209,7 @@ J . Jezioro subst:_:_:n:ncol
209 209 j . jak adv:pos
210 210 j . język subst:_:_:m3
211 211 J . jezioro subst:_:_:n:ncol
  212 +j . jednostka subst:_:_:f
212 213 Jdt Księga Judyty subst:sg:_:f
213 214 Jer . Księga Jeremiasza subst:sg:_:f
214 215 Jez . Jezioro subst:_:_:n:ncol
... ... @@ -736,7 +737,9 @@ zob . zobaczyć impt:sg:sec:perf
736 737 Zw . związek subst:_:_:m3
737 738 ż . żeński adj:_:_:_:pos
738 739 ż . żółty adj:_:_:_:pos
739   -μ m mikrometr subst:_:_:m3
  740 +µ m mikrometr subst:_:_:m3
  741 +µ mol mikromol subst:_:_:m3
  742 +µ g mikrogram subst:_:_:m3
740 743 A . A. subst:_:_:m1.f
741 744 B . B. subst:_:_:m1.f
742 745 C . C. subst:_:_:m1.f
... ...
tokenizer/ENIAMacronyms.ml
... ... @@ -419,12 +419,12 @@ let acronym_patterns = [
419 419 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m1" | _ -> failwith "acronym_patterns");
420 420 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m2" | _ -> failwith "acronym_patterns");
421 421 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
422   - [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n2" | _ -> failwith "acronym_patterns");
  422 + [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n:ncol" | _ -> failwith "acronym_patterns");
423 423 [CL; S "-"; CL; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:loc:f" | _ -> failwith "acronym_patterns");
424 424 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m1" | _ -> failwith "acronym_patterns");
425 425 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m2" | _ -> failwith "acronym_patterns");
426 426 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
427   - [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n2" | _ -> failwith "acronym_patterns");
  427 + [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n:ncol" | _ -> failwith "acronym_patterns");
428 428 [CL; S "-"; CL; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:inst:f" | _ -> failwith "acronym_patterns");
429 429 [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
430 430 [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
... ... @@ -448,7 +448,7 @@ let acronym_patterns = [
448 448 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m1" | _ -> failwith "acronym_patterns");
449 449 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m2" | _ -> failwith "acronym_patterns");
450 450 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns");
451   - [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n2" | _ -> failwith "acronym_patterns");
  451 + [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n:ncol" | _ -> failwith "acronym_patterns");
452 452 [L; S "-"; L; S "-"; O "etach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
453 453 [L; S "-"; L; S "-"; O "etami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
454 454 [L; S "-"; L; S "-"; O "etem"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns");
... ... @@ -488,7 +488,7 @@ let acronym_patterns = [
488 488 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m1" | _ -> failwith "acronym_patterns");
489 489 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m2" | _ -> failwith "acronym_patterns");
490 490 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m3" | _ -> failwith "acronym_patterns");
491   - [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n2" | _ -> failwith "acronym_patterns");
  491 + [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n:ncol" | _ -> failwith "acronym_patterns");
492 492 [CL; S "-"; CL; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:dat:f" | _ -> failwith "acronym_patterns");
493 493 [L; S "-"; L; S "-"; O "otach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
494 494 [L; S "-"; L; S "-"; O "otami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
... ... @@ -503,13 +503,13 @@ let acronym_patterns = [
503 503 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m1" | _ -> failwith "acronym_patterns");
504 504 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m2" | _ -> failwith "acronym_patterns");
505 505 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m3" | _ -> failwith "acronym_patterns");
506   - [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n2" | _ -> failwith "acronym_patterns");
  506 + [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n:ncol" | _ -> failwith "acronym_patterns");
507 507 [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m1" | _ -> failwith "acronym_patterns");
508 508 [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m1" | _ -> failwith "acronym_patterns");
509 509 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:acc:m2" | _ -> failwith "acronym_patterns");
510 510 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m2" | _ -> failwith "acronym_patterns");
511 511 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m3" | _ -> failwith "acronym_patterns");
512   - [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n2" | _ -> failwith "acronym_patterns");
  512 + [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n:ncol" | _ -> failwith "acronym_patterns");
513 513 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m1" | _ -> failwith "acronym_patterns");
514 514 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns");
515 515 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
... ... @@ -520,30 +520,30 @@ let acronym_patterns = [
520 520 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "depr:pl:voc:m2" | _ -> failwith "acronym_patterns");
521 521 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m2" | _ -> failwith "acronym_patterns");
522 522 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m3" | _ -> failwith "acronym_patterns");
523   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n2" | _ -> failwith "acronym_patterns");
  523 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n:ncol" | _ -> failwith "acronym_patterns");
524 524 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m2" | _ -> failwith "acronym_patterns");
525 525 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m3" | _ -> failwith "acronym_patterns");
526   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n2" | _ -> failwith "acronym_patterns");
  526 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n:ncol" | _ -> failwith "acronym_patterns");
527 527 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m2" | _ -> failwith "acronym_patterns");
528 528 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m3" | _ -> failwith "acronym_patterns");
529   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n2" | _ -> failwith "acronym_patterns");
  529 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n:ncol" | _ -> failwith "acronym_patterns");
530 530 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:acc:f" | _ -> failwith "acronym_patterns");
531 531 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:nom:f" | _ -> failwith "acronym_patterns");
532 532 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:voc:f" | _ -> failwith "acronym_patterns");
533 533 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:gen:f" | _ -> failwith "acronym_patterns");
534 534 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns");
535 535 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
536   - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n2" | _ -> failwith "acronym_patterns");
  536 + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n:ncol" | _ -> failwith "acronym_patterns");
537 537 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m2" | _ -> failwith "acronym_patterns");
538 538 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
539   - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n2" | _ -> failwith "acronym_patterns");
  539 + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n:ncol" | _ -> failwith "acronym_patterns");
540 540 [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
541 541 [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
542 542 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m1" | _ -> failwith "acronym_patterns");
543 543 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m1" | _ -> failwith "acronym_patterns");
544 544 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m2" | _ -> failwith "acronym_patterns");
545 545 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m3" | _ -> failwith "acronym_patterns");
546   - [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n2" | _ -> failwith "acronym_patterns");
  546 + [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n:ncol" | _ -> failwith "acronym_patterns");
547 547 [CL; S "-"; CL; S "-"; O "ą"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:inst:f" | _ -> failwith "acronym_patterns");
548 548 [CL; S "-"; CL; S "-"; O "ę"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:acc:f" | _ -> failwith "acronym_patterns");
549 549 [L; S "-"; L; S "-"; O "ista"], (function [x;y;z;_;_] -> compose_lemma3 x y z "-ista" "subst:sg:nom:m1" | _ -> failwith "acronym_patterns");
... ... @@ -706,6 +706,7 @@ let abr_patterns = [
706 706 [O "itd"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tak","adv:pos";1,"daleko","adv:com"] | _ -> failwith "abr_patterns");
707 707 [O "itede"; S "."], (function [a;b] -> std a b [1,"i","conj";2,"tak","adv:pos";2,"daleko","adv:com"] | _ -> failwith "abr_patterns");
708 708 [O "itp"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tym","adv";1,"podobny","adj:pl:nom:_:pos"] | _ -> failwith "abr_patterns");
  709 + [O "j"; S "."; O "m"; S "."], (function [a;b;c;d] -> [ct [a;b] "jednostka" "subst:_:_:f"; ct [c;d] "miary" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
709 710 [O "jw"; S "."], (function [a;b] -> std a b [1,"jak","adv:pos";1,"wysoko","adv:com"] | _ -> failwith "abr_patterns");
710 711 [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:m1:pos";1,"pan","subst:_:$C:m1"] | _ -> failwith "abr_patterns");
711 712 [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:f:pos";1,"pani","subst:_:$C:f"] | _ -> failwith "abr_patterns");
... ... @@ -717,35 +718,36 @@ let abr_patterns = [
717 718 [O "m"; S "."; O "in"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
718 719 [O "m"; S "."; O "in"], (function [a;b;c] -> [ct [a;b] "między" "prep:inst"; ct [c] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
719 720 [O "m"; S "."; O "inn"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
720   - [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n2"; ct [c;d] "stołeczny" "adj:_:$C:n2:pos"] | _ -> failwith "abr_patterns");
  721 + [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n:ncol"; ct [c;d] "stołeczny" "adj:_:$C:n:pos"] | _ -> failwith "abr_patterns");
721 722 [O "m"; O "^"; O "2"], (function [a;b;c] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b;c] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
722 723 [O "m"; O "2"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
723 724 [O "m"; O "3"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "sześcienny" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
724 725 (* [O "min"; S "."], (function [a;b] -> std a b [1,"między","prep:inst";2,"inny","adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); *)
  726 + [O "mc"; S "."], (function [a;b] -> std a b [1,"masa","subst:sg:$C:f";1,"ciało","subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
725 727 [O "mkw"; S "."], (function [a;b] -> std a b [1,"metr","subst:_:$C:m3";2,"kwadratowy","adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
726 728 [O "n"; S "."; O "e"; S "."], (function [a;b;c;d] -> [ct [a;b] "nasz" "adj:sg:gen:f:pos"; ct [c;d] "era" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
727   - [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns");
  729 + [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
728 730 [O "np"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"przykład","subst:sg:acc:m3"] | _ -> failwith "abr_patterns");
729 731 [O "nt"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"temat","subst:sg:acc:m3"] | _ -> failwith "abr_patterns");
730 732 [O "NTG"], (function [a] -> st a [1,"nie","qub";1,"ta","adj:sg:nom:f:pos";1,"grupa","subst:sg:nom:f"] | _ -> failwith "abr_patterns");
731 733 [O "o"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "ograniczony" "adj:sg:$C:f:pos"; ct [c;d] "odpowiedzialność" "subst:sg:$C:f"] | _ -> failwith "abr_patterns");
732 734 [O "p"; S "."; O "n"; S "."; O "e"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "przed" "prep:inst"; ct [c;d] "nasz" "adj:sg:inst:f:pos"; ct [e;f] "era" "subst:sg:inst:f"] | _ -> failwith "abr_patterns");
733 735 [O "p"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "pełniący" "pact:_:_:m1.m2.m3:imperf:aff"; ct [c;d] "obowiązek" "subst:pl:acc:m3"] | _ -> failwith "abr_patterns");
734   - [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns");
  736 + [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
735 737 [O "p"; S "."; O "t"; S "."], (function [a;b;c;d] -> [ct [a;b] "pod" "prep:inst:nwokc"; ct [c;d] "tytuł" "subst:sg:inst:m3"] | _ -> failwith "abr_patterns");
736 738 [O "pn"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"nazwa","subst:sg:inst:f"] | _ -> failwith "abr_patterns");
737 739 [O "pne"; S "."], (function [a;b] -> std a b [1,"przed","prep:inst";1,"nasz","adj:sg:inst:f:pos";1,"era","subst:sg:inst:f"] | _ -> failwith "abr_patterns");
738 740 [O "pt"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"tytuł","subst:sg:inst:m3"] | _ -> failwith "abr_patterns");
739 741 [O "PW"], (function [a] -> st a [1,"prywatny","adj:_:$C:f:pos";1,"wiadomość","subst:_:$C:f"] | _ -> failwith "abr_patterns");
740   - [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n2"] | _ -> failwith "abr_patterns");
  742 + [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n:ncol"] | _ -> failwith "abr_patterns");
741 743 (* [O "S"; S "."; O "A"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "akcyjny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns");
742 744 [O "s"; S "."; O "c"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "cywilny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns");*)
743 745 (* [O "SA"], (function [a] -> st a [1,"spółka","subst:sg:$C:f";1,"akcyjny","adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns"); *)
744 746 [O "ś"; S "."; O "p"; S "."], (function [a;b;c;d] -> [ct [a;b] "święty" "adj:sg:gen:f:pos"; ct [c;d] "pamięć" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
745 747 [O "śp"; S "."], (function [a;b] -> std a b [1,"święty","adj:sg:gen:f:pos";1,"pamięć","subst:sg:gen:f"] | _ -> failwith "abr_patterns");
746 748 [O "tgz"; S "."], (function [a;b] -> std a b [2,"tak","adv";1,"zwać","ppas:_:_:_:_:aff"] | _ -> failwith "abr_patterns");
747   - [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
748   - [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
  749 + [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
  750 + [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
749 751 [O "tzw"; S "."], (function [a;b] -> std a b [1,"tak","adv:pos";2,"zwać","ppas:_:_:_:imperf:aff"] | _ -> failwith "abr_patterns");
750 752 [O "ub"; S "."; O "r"; S "."], (function [a;b;c;d] -> [ct [a;b] "ubiegły" "adj:sg:$C:m3:pos"; ct [c;d] "rok" "subst:sg:$C:m3"] | _ -> failwith "abr_patterns");
751 753 [O "w"; S "."; O "w"; S "."], (function [a;b;c;d] -> [ct [a;b] "wysoko" "adv:com"; ct [c;d] "wymienić" "ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns");
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -405,7 +405,7 @@ let digit_patterns4 = [
405 405 [C "realnum-interval"; O "mld"], (function [x;_] -> make_tys 9 x | _ -> failwith "digit_patterns8");
406 406 ]
407 407  
408   -let url_patterns1 = [
  408 +(*let url_patterns1 = [
409 409 [L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
410 410 [L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
411 411 [L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
... ... @@ -461,9 +461,12 @@ let url_patterns1 = [
461 461 let url_patterns2 = [
462 462 [L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
463 463 [L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  464 + [L; S "_"; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
464 465 [L; S "."; L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  466 + [L; S "."; D "dig"; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
465 467 [L; D "intnum"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
466 468 [L; S "."; L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  469 + [L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
467 470 [O "http"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
468 471 [O "https"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
469 472 ]
... ... @@ -472,7 +475,7 @@ let url_patterns3 = [
472 475 [D "url"; S "/"], (function l -> Dig(concat_orths2 l,"url"));
473 476 [D "url"; S "/"; L], (function l -> Dig(concat_orths2 l,"url"));
474 477 [D "url"; S "/"; L; S "."; L], (function l -> Dig(concat_orths2 l,"url"));
475   -]
  478 +]*)
476 479  
477 480 let html_patterns = [
478 481 [S "<"; L; S ">"], (function l -> Dig(concat_orths2 l,"html-tag"));
... ... @@ -701,7 +704,7 @@ let manage_query_boundaries tokens =
701 704 if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
702 705 if find_beg_pattern [I "</query>";I "”s"] tokens then
703 706 replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
704   - if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else
  707 + if find_beg_pattern [I "</query>";I ")s"(*;I "</sentence>"*)] tokens then tokens else
705 708 replace_beg_pattern [I "</query>"] add_sentence_end tokens in
706 709 let tokens = Xlist.rev_map tokens revert_tokens in
707 710 tokens
... ... @@ -724,12 +727,12 @@ let find_replacement_patterns tokens =
724 727 let tokens = find_patterns ENIAMacronyms.name_patterns tokens in
725 728 (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
726 729 let tokens = normalize_tokens [] tokens in
727   - let tokens = find_patterns url_patterns1 tokens in
  730 +(* let tokens = find_patterns url_patterns1 tokens in
728 731 let tokens = normalize_tokens [] tokens in
729 732 let tokens = find_patterns url_patterns2 tokens in
730 733 let tokens = normalize_tokens [] tokens in
731 734 let tokens = find_patterns url_patterns3 tokens in
732   - let tokens = normalize_tokens [] tokens in
  735 + let tokens = normalize_tokens [] tokens in*)
733 736 let tokens = find_patterns html_patterns tokens in
734 737 let tokens = normalize_tokens [] tokens in
735 738 (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
... ...
tokenizer/ENIAMtokenizer.ml
... ... @@ -21,7 +21,8 @@ open Xstd
21 21 open ENIAMtokenizerTypes
22 22  
23 23 let initialize () =
24   - ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ()
  24 + ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ();
  25 + ENIAMurl.top_level_domains := ENIAMurl.load_top_level_domains ()
25 26  
26 27 let string_of =
27 28 ENIAMtokens.string_of_tokens
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -41,7 +41,7 @@ type token =
41 41 | Tokens of string * int list (*cat * token id list *)
42 42  
43 43 type attr =
44   - CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics
  44 + FC | CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics
45 45 | SentBeg | SentEnd | SentBegEnd
46 46 | BrevLemma of string
47 47 | Disamb of string * string * string list list
... ... @@ -71,6 +71,8 @@ type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD
71 71 let empty_token_env = {
72 72 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
73 73  
  74 +let internet_mode = ref false
  75 +
74 76 let resource_path =
75 77 try Sys.getenv "ENIAM_RESOURCE_PATH"
76 78 with Not_found ->
... ... @@ -82,6 +84,8 @@ let resource_path =
82 84 let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab"
83 85 let mte_filename2 = resource_path ^ "/tokenizer/mte.tab"
84 86  
  87 +let top_level_domains_filename = resource_path ^ "/tokenizer/top-level-domains.tab"
  88 +
85 89 module OrderedTokenEnv = struct
86 90  
87 91 type t = token_env
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -87,7 +87,8 @@ let rec xml_of_token = function
87 87 | Tokens(cat,l) -> Xml.Element("Tokens",["pos",cat],Xlist.map l (fun x -> Xml.Element("id",[],[Xml.PCData (string_of_int x)])))
88 88  
89 89 let string_of_attr = function
90   - CS -> "cs"
  90 + FC -> "first capital"
  91 + | CS -> "cs"
91 92 | MaybeCS -> "maybe cs"
92 93 | ReqValLemm -> "required validated lemmatization"
93 94 | MWE -> "mwe"
... ... @@ -212,9 +213,9 @@ let merge_digits poss_s_beg i digs =
212 213 (if Xlist.size digs <= 3 && List.hd digs <> "0" then [t (Dig(v,"pref3dig"));sc_t (Dig(v,"pref3dig"))] else []) in*)
213 214 Variant variants
214 215  
215   -let merge_url poss_s_beg i digs =
  216 +(* let merge_url poss_s_beg i digs =
216 217 let orth = String.concat "" digs in
217   - Variant(dig_tokens orth poss_s_beg i digs orth "url")
  218 + Variant(dig_tokens orth poss_s_beg i digs orth "url") *)
218 219  
219 220 let recognize_roman_I v = function
220 221 Capital("I",_) :: Capital("I",_) :: Capital("I",_) :: [] -> v+3,false
... ... @@ -335,6 +336,7 @@ let get_first_lower = function
335 336 | _ -> failwith "get_first_lower"
336 337  
337 338 let cs_weight = -1.
  339 +let fc_weight = -10.
338 340 let sc_cap_weight = -0.3
339 341  
340 342 let is_add_attr_token = function
... ... @@ -361,13 +363,17 @@ let recognize_stem poss_s_beg has_sufix i letters =
361 363 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter(merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}];
362 364 Token{t with token=CapLetter(orth,merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs};
363 365 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=CapLetter(orth,merge (lowercase_first letters)); weight=sc_cap_weight; attrs=MaybeCS :: t.attrs}]]
  366 + else if !internet_mode then Variant[
  367 + Token{t with token=SmallLetter orth};
  368 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter orth}]]
364 369 else Token{t with token=SmallLetter orth}
365 370 else
366 371 if first_capital letters then
367   - if rest_small letters then Variant[
  372 + if rest_small letters then Variant([
368 373 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_first letters))}];
369 374 Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)};
370   - Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]]
  375 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]] @
  376 + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else []))
371 377 else if rest_capital letters then Variant([
372 378 Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs};
373 379 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs}];
... ... @@ -377,6 +383,13 @@ let recognize_stem poss_s_beg has_sufix i letters =
377 383 Token{t with token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs};
378 384 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs}]]))
379 385 else Token{t with token=SomeCap orth}
  386 + else if !internet_mode then
  387 + if rest_small letters then Variant[
  388 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall orth}];
  389 + Token{t with token=AllSmall orth}]
  390 + else Variant[
  391 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SomeCap orth}];
  392 + Token{t with token=SomeCap orth}]
380 393 else
381 394 if rest_small letters then Token{t with token=AllSmall orth}
382 395 else Token{t with token=SomeCap orth}
... ... @@ -388,8 +401,9 @@ let recognize_stem poss_s_beg has_sufix i letters =
388 401 else Token{t with token=SmallLetter orth}
389 402 else
390 403 if first_capital letters then
391   - if rest_small letters then
392   - Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}
  404 + if rest_small letters then Variant([
  405 + Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}] @
  406 + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else []))
393 407 else if rest_capital letters then Variant([
394 408 Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs};
395 409 Token{t with token=FirstCap(merge (lowercase_rest letters),merge (lowercase_all letters),get_first_cap letters,get_first_lower letters); weight=cs_weight; attrs=CS :: t.attrs}] @
... ... @@ -547,24 +561,24 @@ let rec group_others rev = function
547 561 | x :: l -> List.rev rev, x :: l
548 562  
549 563 let create_sign_token poss_s_beg i signs l token =
550   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  564 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | Small s -> s | _ -> failwith "create_sign_token")) in
551 565 let len = Xlist.size signs * factor in
552 566 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=token; attrs=[MaybeCS]},i+len,l,poss_s_beg
553 567  
554 568 let create_empty_sign_token i signs =
555   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  569 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_empty_sign_token")) in
556 570 let len = Xlist.size signs * factor in
557 571 {empty_token_env with orth=orth;beg=i;len=len;next=i+len; attrs=[MaybeCS]},i+len
558 572  
559 573 let create_sentence_seq i signs l lemma =
560   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  574 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq")) in
561 575 let len = Xlist.size signs * factor in
562 576 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
563 577 Token{empty_token_env with orth=orth;beg=i+20;len=len-30;next=i+len-10;token=make_lemma (lemma,"sinterj")};
564 578 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
565 579  
566 580 let create_sentence_seq_hapl i signs l lemma =
567   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  581 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl")) in
568 582 let len = Xlist.size signs * factor in
569 583 Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
570 584 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
... ... @@ -572,7 +586,7 @@ let create_sentence_seq_hapl i signs l lemma =
572 586 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
573 587  
574 588 let create_sentence_seq_q i signs l lemma =
575   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  589 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_q")) in
576 590 let len = Xlist.size signs * factor in
577 591 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "?"};
578 592 Token{empty_token_env with beg=i+20;len=10;next=i+30;token=Interp "</clause>"};
... ... @@ -580,7 +594,7 @@ let create_sentence_seq_q i signs l lemma =
580 594 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
581 595  
582 596 let create_sentence_seq_hapl_q i signs l lemma =
583   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  597 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl_q")) in
584 598 let len = Xlist.size signs * factor in
585 599 Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
586 600 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "?"};
... ... @@ -589,7 +603,7 @@ let create_sentence_seq_hapl_q i signs l lemma =
589 603 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
590 604  
591 605 let create_or_beg i signs l poss_s_beg =
592   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  606 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg")) in
593 607 let len = Xlist.size signs * factor in
594 608 Variant[
595 609 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Symbol "-"; attrs=[MaybeCS]};
... ... @@ -606,7 +620,7 @@ let create_or_beg i signs l poss_s_beg =
606 620 ],i+len,l,poss_s_beg
607 621  
608 622 let create_or_beg2 i signs l poss_s_beg =
609   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  623 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg2")) in
610 624 let len = Xlist.size signs * factor in
611 625 Variant[
612 626 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Interp "<or>"};
... ... @@ -631,18 +645,18 @@ let is_dot_sentence_end_marker = function
631 645 | _ -> false
632 646  
633 647 let not_dot_sentence_end_marker = function
634   - Sign " " :: Small _ :: _ -> true
635   - | Sign "" :: Small _ :: _ -> true
636   - | Sign " " :: Small _ :: _ -> true
  648 + Sign " " :: Small _ :: _ -> if !internet_mode then false else true
  649 + | Sign "" :: Small _ :: _ -> if !internet_mode then false else true
  650 + | Sign " " :: Small _ :: _ -> if !internet_mode then false else true
637 651 | Sign "," :: _ -> true
638 652 | Sign ":" :: _ -> true
639 653 | Sign "?" :: _ -> true
640 654 | Sign "!" :: _ -> true
641   - | Small _ :: _ -> true
642   - | ForeignSmall _ :: _ -> true
643   - | Capital _ :: _ -> true
644   - | ForeignCapital _ :: _ -> true
645   - | Digit _ :: _ -> true
  655 + | Small _ :: _ -> if !internet_mode then false else true
  656 + | ForeignSmall _ :: _ -> if !internet_mode then false else true
  657 + | Capital _ :: _ -> if !internet_mode then false else true
  658 + | ForeignCapital _ :: _ -> if !internet_mode then false else true
  659 + | Digit _ :: _ -> if !internet_mode then false else true
646 660 | _ -> false
647 661  
648 662 let is_comma_digit_marker = function
... ... @@ -705,6 +719,7 @@ let rec recognize_sign_group poss_s_beg i = function
705 719 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
706 720 | (Sign "") :: l -> create_sign_token poss_s_beg i [Sign ""] l (Symbol " ")
707 721 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
  722 + | (Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: l -> create_sign_token poss_s_beg i ((Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: []) l (Symbol " ")
708 723 | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l
709 724 | (Sign "\"") :: l ->
710 725 let t,i2 = create_empty_sign_token i [Sign "\""] in
... ... @@ -775,12 +790,16 @@ let rec recognize_sign_group poss_s_beg i = function
775 790 | (Sign ";") :: (Sign ")") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: (Sign ")") :: []) l (make_lemma (";))","sinterj")) *)
776 791 | (Sign ":") :: (Sign "|") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "|") :: []) l (make_lemma (":|","sinterj"))
777 792 | (Sign ":") :: (Sign "\\") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "\\") :: []) l (make_lemma (":\\","sinterj"))
  793 + | (Sign ":") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "/") :: []) l (make_lemma (":/","sinterj"))
778 794 | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj"))
779 795 (* | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj"))
780 796 | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj")) *)
781 797 | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*)
782   - let t,i = create_empty_sign_token i [Sign ")"] in
783   - Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg
  798 + let t,i2 = create_empty_sign_token i [Sign ")"] in
  799 + Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"};
  800 + Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Interp "</clause>"};
  801 + Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</sentence>"};
  802 + Token{empty_token_env with orth=":";beg=i+20;len=factor-20;next=i+factor;token=Interp ")s"}]],i2,l,true
784 803 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
785 804 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
786 805 | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol"))
... ... @@ -928,29 +947,32 @@ let rec recognize_sign_group poss_s_beg i = function
928 947 create_sentence_seq i ((Sign ".") :: (Sign ".") :: []) l "…";
929 948 Token{empty_token_env with orth="..";beg=i;len=2*factor;next=i+2*factor;token=make_lemma ("…","sinterj"); attrs=[MaybeCS]}],i+2*factor,l,true
930 949 | (Sign ".") :: l ->
931   - if is_dot_sentence_end_marker l then
  950 + if is_dot_sentence_end_marker l then ((*Printf.printf "dot 1 i=%d\n%!" i;*)
932 951 Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
933 952 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
934 953 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
935 954 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
936   - Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true
937   - else if not_dot_sentence_end_marker l then
938   - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false
939   - else
  955 + Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true)
  956 + else if not_dot_sentence_end_marker l then ((*Printf.printf "dot 2 i=%d\n%!" i;*)
  957 + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false)
  958 + else ((*Printf.printf "dot 3 i=%d\n%!" i;*)
940 959 Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
941 960 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
942 961 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
943 962 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
944 963 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
945   - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true
  964 + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true)
946 965 | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "*****") (* zastępniki liter *)
947 966 | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "****")
948 967 | (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*"] l (Interp "***")
949 968 | (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*"] l (Interp "**")
950   - | (Sign "*") :: l -> (* Interp zastępnik liter i cudzysłów, symbol listy *)
951   - let t,i = create_empty_sign_token i [Sign "*"] in
952   - Variant[Token{t with token=Interp "*"};Token{t with token=Symbol "*"}],i,l,poss_s_beg
953   - | (Sign "+") :: l -> create_sign_token poss_s_beg i [Sign "+"] l (Symbol "+")
  969 + | (Sign "*") :: l -> (* Interp to zastępnik liter i cudzysłów, symbol listy *)
  970 + let t,i2 = create_empty_sign_token i [Sign "*"] in
  971 + Variant([Token{t with token=Interp "*"};Token{t with token=Symbol "*"}] @
  972 + (if !internet_mode then [sc_dig_token "*" i [Sign "*"] (make_lemma ("*","symbol"))] else [])),i2,l,poss_s_beg
  973 + | (Sign "+") :: l -> (* Interp to spójnik *)
  974 + let t,i2 = create_empty_sign_token i [Sign "+"] in
  975 + Variant[Token{t with token=Interp "+"};Token{t with token=Symbol "+"}],i2,l,poss_s_beg
954 976 | (Sign "«") :: l ->
955 977 let t,i = create_empty_sign_token i [Sign "«"] in
956 978 Variant[Token{t with token=Interp "«"};Token{t with token=Interp "«s"}],i,l,poss_s_beg
... ... @@ -1005,6 +1027,7 @@ let rec recognize_sign_group poss_s_beg i = function
1005 1027 | (Sign "_") :: l -> create_sign_token poss_s_beg i [Sign "_"] l (Symbol "_")
1006 1028 | (Sign "@") :: l -> create_sign_token poss_s_beg i [Sign "@"] l (Symbol "@")
1007 1029 | (Sign "×") :: l -> create_sign_token poss_s_beg i [Sign "×"] l (Symbol "×")
  1030 + | (Sign "±") :: l -> create_sign_token poss_s_beg i [Sign "±"] l (Symbol "±")
1008 1031 | (Sign "%") :: l ->
1009 1032 let t,i = create_empty_sign_token i [Sign "%"] in
1010 1033 Variant[Token{t with token=Symbol "%"};Token{t with token=make_lemma ("procent","subst:_:_:m3")}],i,l,false
... ... @@ -1018,12 +1041,15 @@ let rec recognize_sign_group poss_s_beg i = function
1018 1041 | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
1019 1042 | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
1020 1043 | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
  1044 + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®")
  1045 + | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
  1046 + | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
1021 1047 | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s)
1022 1048 | l -> failwith "recognize_sign_group"
1023 1049  
1024 1050 (* FIXME: "„Szpak” frunie." trzeba przenieść <sentence> przed „, ale zostawić po „s. *)
1025 1051  
1026   -let rec group_url rev = function
  1052 +(*let rec group_url rev = function
1027 1053 Small s :: l -> group_url (s :: rev) l
1028 1054 | Capital(s,t) :: l -> group_url (s :: rev) l
1029 1055 | ForeignSmall s :: l -> group_url (s :: rev) l
... ... @@ -1040,24 +1066,34 @@ let rec group_url rev = function
1040 1066 | Sign "," :: l -> group_url ("," :: rev) l
1041 1067 | Sign "~" :: l -> group_url ("~" :: rev) l
1042 1068 | Sign "_" :: l -> group_url ("_" :: rev) l
1043   - | l -> List.rev rev, l
  1069 + | l -> List.rev rev, l*)
  1070 +
  1071 +let merge_url poss_s_beg i len orth cat =
  1072 + if poss_s_beg then
  1073 + Variant[Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)};
  1074 + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=len*factor-2;next=i+len*factor;token=Dig(orth,cat)}]]
  1075 + else
  1076 + Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)}
1044 1077  
1045 1078 let rec group_chars poss_s_beg i rev = function
1046 1079 [] -> List.rev ((Token{empty_token_env with beg=i;len=factor;next=i+factor;token=Interp "</query>"}) :: rev)
1047   - | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
  1080 + (* | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
1048 1081 let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l
1049 1082 | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Small "s") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
1050   - let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l
1051   - | (Digit s) :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l
1052   - | (Sign s) :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l
1053   - | (Capital(s,t)) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1054   - | (ForeignCapital(s,t)) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1055   - | (Small s) :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1056   - | (ForeignSmall s) :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1057   - | (Other(s,x)) :: l ->
  1083 + let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l *)
  1084 + | Digit s :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l
  1085 + | Sign s :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l
  1086 + | Capital(s,t) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1087 + | ForeignCapital(s,t) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1088 + | Small s :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1089 + | ForeignSmall s :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1090 + | Emoticon s :: l -> group_chars poss_s_beg (i + factor) ((Token{empty_token_env with orth=s;beg=i;len=factor;next=i+factor;token=make_lemma (s,"sinterj")}) :: rev) l
  1091 + | Other("url",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "url") :: rev) l
  1092 + | Other("email",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "email") :: rev) l
  1093 + | Other(s,x) :: l ->
1058 1094 let x,l = group_others [] ((Other(s,x)) :: l) in
1059 1095 group_chars false (i + Xlist.size x * factor)
1060   - ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+factor;token=Other(String.concat "" x)}) :: rev) l
  1096 + ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+Xlist.size x * factor;token=Other(String.concat "" x)}) :: rev) l
1061 1097  
1062 1098 let tokenize l =
1063   - (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] l)
  1099 + (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] (ENIAMurl.find l))
... ...
tokenizer/makefile
... ... @@ -6,25 +6,26 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES= ENIAMtokenizerTypes.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml
  9 +SOURCES= ENIAMtokenizerTypes.ml ENIAMurl.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml
10 10  
11 11 all: eniam-tokenizer.cma eniam-tokenizer.cmxa
12 12  
13 13 install: all
14 14 mkdir -p $(INSTALLDIR)
15 15 cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR)
16   - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
17   - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
  16 + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
  17 + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
18 18 mkdir -p /usr/share/eniam/tokenizer
19 19 cp resources/* /usr/share/eniam/tokenizer
20 20  
21 21 install-local: all
22 22 mkdir -p $(INSTALLDIR)
23 23 cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR)
24   - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
25   - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
  24 + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
  25 + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
26 26 mkdir -p /usr/local/share/eniam/tokenizer
27 27 cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab
  28 + cp resources/top-level-domains.tab /usr/local/share/eniam/tokenizer/top-level-domains.tab
28 29 cp resources/README /usr/local/share/eniam/tokenizer/README
29 30 # ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
30 31  
... ...