Commit 0c36e02e82975a65c9794c6e55eaa5530fdaa16a

Authored by Wojciech Jaworski
1 parent e0385166

Rozszerzenie funkcjonalności tokenizera

LCGlexicon/ENIAM_LCGlexicon.ml
... ... @@ -312,6 +312,7 @@ let create_entries rules id orth cats valence lex_entries =
312 312 (* variable_name_ref := []; *)
313 313 if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else
314 314 if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else
  315 + if (cats.pos2="noun" || cats.pos2="verb" || cats.pos2="adj" || cats.pos2="adv") && cats.cat="X" && not !default_category_flag then l else
315 316 let e = get_labels () in
316 317 (* print_endline "create_entries 1"; *)
317 318 let rules = find_rules rules cats in
... ...
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -79,6 +79,8 @@ let empty_cats = {lemma=""; pos=""; pos2=""; cat="X"; coerced=[];
79 79 nsyn=[]; nsem=[]; modes=[]; psem=[];
80 80 }
81 81  
  82 +let default_category_flag = ref true
  83 +
82 84 let resource_path =
83 85 try Sys.getenv "ENIAM_RESOURCE_PATH"
84 86 with Not_found ->
... ...
exec/ENIAMvisualization.ml
... ... @@ -24,7 +24,7 @@ open ENIAMtokenizerTypes
24 24 open ENIAMexecTypes
25 25  
26 26 let string_of_status = function
27   - Idle -> "Idle"
  27 + Idle -> "Idle"
28 28 | PreprocessingError -> "PreprocessingError"
29 29 | LexiconError -> "LexiconError"
30 30 | ParseError -> "ParseError"
... ... @@ -786,6 +786,80 @@ let create_latex_dep_chart path name dep_chart =
786 786 LatexMain.latex_compile_and_clean path name
787 787 *)
788 788  
  789 +let rec extract_pos_cat_internal vars = function
  790 + | Atom x -> x
  791 + | AVar x -> (try extract_pos_cat_internal vars (Xlist.assoc vars x) with Not_found -> failwith "extract_pos_cat_internal")
  792 + | With l -> String.concat "&" (Xlist.map l (extract_pos_cat_internal vars))
  793 + | Zero -> "0"
  794 + | Top -> "T"
  795 +
  796 +let rec extract_pos_cat vars = function
  797 + | Tensor [] -> failwith "extract_pos_cat: ni"
  798 + | Tensor [pos] -> extract_pos_cat_internal vars pos
  799 + | Tensor (Atom "num" :: _) -> "Number"
  800 + | Tensor (Atom "prepnp" :: _) -> "Prep"
  801 + | Tensor (pos :: cat :: _) -> (*extract_pos_cat_internal vars pos ^ "*" ^*) extract_pos_cat_internal vars cat
  802 + | Plus l -> failwith "extract_pos_cat: ni"
  803 + | Imp(s,d,t2) -> extract_pos_cat vars s
  804 + | One -> failwith "extract_pos_cat: ni"
  805 + | ImpSet(s,l) -> extract_pos_cat vars s
  806 + | WithVar(v,g,e,s) -> extract_pos_cat ((v,g) :: vars) s
  807 + | Star s -> failwith "extract_pos_cat: ni"
  808 + | Bracket(lf,rf,s) -> extract_pos_cat vars s
  809 + | BracketSet d -> "BracketSet"
  810 + | Maybe s -> failwith "extract_pos_cat: ni"
  811 +
  812 +let get_text_fragment text_fragments node1 node2 =
  813 + try IntMap.find text_fragments.(node1) node2
  814 + with Not_found -> "???"(*failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2)*)
  815 +
  816 +let omited = StringSet.of_list ["<subst>";"<depr>";"<ppron12>";"<ppron3>";"<siebie>";"<prep>";
  817 + "<num>";"<intnum>";"<realnum>";"<intnum-interval>";"<realnum-interval>";"<symbol>";"<ordnum>";
  818 + "<date>";"<date-interval>";"<hour-minute>";"<hour>";"<hour-minute-interval>";"<hour-interval>";
  819 + "<year>";"<year-interval>";"<day>";"<day-interval>";"<day-month>";"<day-month-interval>";
  820 + "<month-interval>";"<roman>";"<roman-interval>";"<roman-ordnum>";"<match-result>";"<url>";
  821 + "<email>";"<obj-id>";"<adj>";"<apron>";"<adjc>";"<adjp>";"<adja>";"<adv>";"<ger>";"<pact>";
  822 + "<ppas>";"<fin>";"<bedzie>";"<praet>";"<winien>";"<impt>";"<imps>";"<pred>";"<aglt>";"<inf>";
  823 + "<pcon>";"<pant>";"<qub>";"<comp>";"<compar>";"<conj>";"<interj>";"<sinterj>";"<burk>";
  824 + "<interp>";"<part>";"<unk>";"<building-number>"]
  825 +
  826 +let cat_tokens_sequence text_fragments g =
  827 + let _,_,l = ENIAM_LCGchart.fold g (0,0,[]) (fun (m,n,l) (symbol,node1,node2,sem,layer) ->
  828 + node1,node2,
  829 + (if m < node1 then
  830 + if n < node1 then [n, node1, get_text_fragment text_fragments n node1, "null"]
  831 + else if n = node1 then []
  832 + else [node1, n, get_text_fragment text_fragments node1 n, "overlap"]
  833 + else if m = node1 then
  834 + if n < node2 then [m, n, get_text_fragment text_fragments m n, "overlap"]
  835 + else if n = node2 then []
  836 + else [node1, node2, get_text_fragment text_fragments node1 node2, "overlap"]
  837 + else failwith "cat_tokens_sequence") @
  838 + [node1, node2, get_text_fragment text_fragments node1 node2, extract_pos_cat [] symbol] @ l) in
  839 + let map = Xlist.fold l IntMap.empty (fun map (m,n,text,symbol) ->
  840 + IntMap.add_inc map (1000000*m+n) [text,symbol] (fun l -> (text,symbol) :: l)) in
  841 + let map = IntMap.map map (fun l ->
  842 + let t,ov,set = Xlist.fold l ("",false,StringSet.empty) (fun (t,ov,set) (text,symbol) ->
  843 + if symbol = "null" then text,ov,set
  844 + else if symbol = "overlap" then t,true,set
  845 + else if StringSet.mem omited symbol then text,ov,set
  846 + else t,ov,StringSet.add set symbol) in
  847 + let l = if StringSet.is_empty set then [t] else StringSet.to_list set in
  848 + if ov then "OVERLAP{" ^ String.concat " " l ^ "}" else
  849 + match l with
  850 + [t] -> t
  851 + | _ -> "{" ^ String.concat " " l ^ "}") in
  852 + let l = List.sort compare (IntMap.fold map [] (fun l k texts -> (k,texts) :: l)) in
  853 +(* let l = Xlist.sort l (fun (m1,n1,text1,symbol1) (m2,n2,text2,symbol2) ->
  854 + if m1 <> m2 then compare m1 m2 else
  855 + if n1 <> n2 then compare n1 n2 else
  856 + compare symbol1 symbol2) in
  857 + let l = if l = [] then l else
  858 + Xlist.fold (List.tl l) [List.hd l] (fun l a ->
  859 + match l with
  860 + [] -> failwith "cat_tokens_sequence"
  861 + | b :: l -> if a = b then b :: l else a :: b :: l) in*)
  862 + String.concat " " (Xlist.map l (fun (n,texts) -> texts))
789 863  
790 864 (* verbosity:
791 865 0 -> jedynie informacja o statusie zdania
... ... @@ -796,13 +870,13 @@ let create_latex_dep_chart path name dep_chart =
796 870 let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam_parse_result) =
797 871 match result.status with
798 872 Idle -> "<font color=\"red\">idle</font>\n"
799   - | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" result.msg result.paths_size
  873 + | LexiconError -> sprintf "<font color=\"red\">error_lex</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size
800 874 | ParseError ->
801 875 if verbosity = 0 then () else (
802 876 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
803 877 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
804 878 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
805   - sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" result.msg result.paths_size ^
  879 + sprintf "<font color=\"red\">error_parse</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^
806 880 (if verbosity = 0 then "" else
807 881 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
808 882 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -814,7 +888,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
814 888 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
815 889 if verbosity = 0 then () else (
816 890 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2);
817   - sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" result.msg result.paths_size ^
  891 + sprintf "<font color=\"red\">timeout</font>: %s paths_size=%d\n" (escape_html result.msg) result.paths_size ^
818 892 (if verbosity < 2 then "" else
819 893 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
820 894 sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix) ^
... ... @@ -840,6 +914,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
840 914 sprintf "<BR><A HREF=\"%s_3_references.pdf\">References 3</A>\n" file_prefix ^
841 915 sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^
842 916 (if verbosity = 0 then "" else
  917 + sprintf "<BR>%s\n" (escape_html (cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1))) ^
843 918 sprintf "<BR><A HREF=\"%s_3_chart_selection.pdf\">Chart 3 Selection</A>\n" file_prefix) ^
844 919 ""
845 920 | ReductionError ->
... ... @@ -851,7 +926,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
851 926 ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
852 927 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3);
853 928 (if verbosity < 2 then "" else
854   - sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" result.msg result.paths_size result.chart_size ^
  929 + sprintf "<font color=\"red\">error_reduction</font>: %s paths_size=%d chart_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size ^
855 930 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
856 931 sprintf "<BR><A HREF=\"%s_2_references.pdf\">References 2</A>\n" file_prefix ^
857 932 sprintf "<BR><A HREF=\"%s_3_chart.pdf\">Chart 3</A>\n" file_prefix) ^
... ... @@ -909,7 +984,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
909 984 Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 result.term4));
910 985 Xlatex.latex_compile_and_clean path (file_prefix ^ "_4_term");
911 986 ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_4_dependency_tree") "a0" result.dependency_tree4);
912   - sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  987 + sprintf "<font color=\"red\">error_reduction2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
913 988 (if verbosity < 2 then "" else
914 989 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
915 990 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -939,7 +1014,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
939 1014 ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_6b_dependency_tree") result.dependency_tree6b;
940 1015 ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6a_simple_dependency_tree") result.dependency_tree6a;
941 1016 ENIAM_LCGgraphOf.print_simplified_dependency_tree path (file_prefix ^ "_6b_simple_dependency_tree") result.dependency_tree6b);
942   - sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1017 + sprintf "<font color=\"red\">error_reduction3</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
943 1018 (if verbosity < 2 then "" else
944 1019 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
945 1020 sprintf "<BR><A HREF=\"%s_2_chart.pdf\">Chart 2</A>\n" file_prefix ^
... ... @@ -1010,7 +1085,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1010 1085 if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8;
1011 1086 if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9;
1012 1087 if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9);
1013   - sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1088 + sprintf "<font color=\"red\">error_sem_valence</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1014 1089 (if verbosity = 0 then "" else
1015 1090 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1016 1091 (if result.dependency_tree7 <> [| |] then sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix else "") ^
... ... @@ -1038,7 +1113,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1038 1113 if ExtArray.size result.dependency_tree8 <> 0 then ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_8_dependency_tree") "a3" result.dependency_tree8;
1039 1114 if result.dependency_tree9 <> [| |] then ENIAM_LCGlatexOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") "a3" result.dependency_tree9;
1040 1115 if result.dependency_tree9 <> [| |] then ENIAM_LCGgraphOf.print_dependency_tree path (file_prefix ^ "_9_dependency_tree") result.dependency_tree9));
1041   - sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1116 + sprintf "<font color=\"red\">error_sem_graph</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1042 1117 (if verbosity = 2 then
1043 1118 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1044 1119 (if result.semantic_graph10 <> [| |] then sprintf "<BR><A HREF=\"%s_10_semantic_graph.pdf\">Semantic Graph References 10</A>\n" file_prefix else "") ^
... ... @@ -1061,7 +1136,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1061 1136 | SemGraphError2 ->
1062 1137 if verbosity = 0 then () else (
1063 1138 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11);
1064   - sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1139 + sprintf "<font color=\"red\">error_sem_graph2</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1065 1140 (if verbosity = 0 then "" else
1066 1141 sprintf "<BR><IMG SRC=\"%s_11_semantic_graph.png\">\n" file_prefix) ^
1067 1142 ""
... ... @@ -1077,7 +1152,7 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
1077 1152 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_11_semantic_graph") "" result.semantic_graph11);
1078 1153 if verbosity = 0 then () else (
1079 1154 ENIAMsemGraphOf.print_semantic_graph2 path (file_prefix ^ "_12_semantic_graph") "" result.semantic_graph12);
1080   - sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.msg result.paths_size result.chart_size result.dependency_tree_size ^
  1155 + sprintf "<font color=\"red\">sem_not_validated</font>: %s paths_size=%d chart_size=%d dependency_tree_size=%d\n" (escape_html result.msg) result.paths_size result.chart_size result.dependency_tree_size ^
1081 1156 (if verbosity < 2 then "" else
1082 1157 sprintf "<BR><A HREF=\"%s_6b_dependency_tree.pdf\">Dependency Tree References 6b</A>\n" file_prefix ^
1083 1158 sprintf "<BR><A HREF=\"%s_7_dependency_tree.pdf\">Dependency Tree References 7</A>\n" file_prefix ^
... ... @@ -1386,3 +1461,28 @@ let rec print_main_result_first_page_text cg_bin_path path id tokens = function
1386 1461 (List.rev (Xlist.fold paragraphs [] find_prev_next_paragraph)) in
1387 1462 print_main_result_first_page_paragraph cg_bin_path path id tokens prev_next_map (List.hd paragraphs)
1388 1463 | AltText l -> Xlist.iter l (fun (mode,text) -> print_main_result_first_page_text cg_bin_path path id tokens text)
  1464 +
  1465 +let to_string_eniam_sentence verbosity tokens (result : eniam_parse_result) =
  1466 + let status_string = string_of_status result.status in
  1467 + if result.status = NotParsed then
  1468 + [status_string ^ ": " ^ cat_tokens_sequence result.text_fragments (ENIAM_LCGchart.select_maximal result.chart1)]
  1469 + else [status_string]
  1470 +
  1471 +let rec to_string_sentence verbosity tokens = function
  1472 + RawSentence s -> []
  1473 + | StructSentence(paths,last) -> []
  1474 + | DepSentence paths -> []
  1475 + | ENIAMSentence result -> to_string_eniam_sentence verbosity tokens result
  1476 + | QuotedSentences sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence))
  1477 + | AltSentence l -> List.flatten (Xlist.map l (fun (mode,sentence) -> to_string_sentence verbosity tokens sentence))
  1478 +
  1479 +let rec to_string_paragraph verbosity tokens = function
  1480 + RawParagraph s -> []
  1481 + | StructParagraph sentences -> List.flatten (Xlist.map sentences (fun p -> to_string_sentence verbosity tokens p.sentence))
  1482 + | AltParagraph l -> List.flatten (Xlist.map l (fun (mode,paragraph) -> to_string_paragraph verbosity tokens paragraph))
  1483 + | ErrorParagraph s -> ["SubsyntaxError"]
  1484 +
  1485 +let rec to_string_text verbosity tokens = function
  1486 + RawText s -> []
  1487 + | StructText paragraphs -> List.flatten (Xlist.map paragraphs (to_string_paragraph verbosity tokens))
  1488 + | AltText l -> List.flatten (Xlist.map l (fun (mode,text) -> to_string_text verbosity tokens text))
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -164,12 +164,13 @@ let get_single_letter_orths paths =
164 164 IntMap.fold map orths (fun orths _ l ->
165 165 TokenEnvSet.fold l orths (fun orths t ->
166 166 match t.token with
167   - SmallLetter lemma -> StringSet.add orths lemma
  167 + SmallLetter lemma -> (*if lemma <> "g" then*) StringSet.add orths lemma (*else orths*) (* FIXME: !!!! *)
168 168 | CapLetter(lemma,_) -> StringSet.add orths lemma
169 169 | _ -> orths)))
170 170  
171 171 let preselect orths lemmas rules l =
172 172 Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
  173 + (* print_endline ("preselect: " ^ lemma); *)
173 174 let b = Xlist.fold match_list true (fun b -> function
174 175 O s -> StringSet.mem orths s && b
175 176 | L(s,_,_) -> StringSet.mem lemmas s && b
... ... @@ -179,6 +180,7 @@ let preselect orths lemmas rules l =
179 180 let preselect_dict orths lemmas dict rules =
180 181 StringSet.fold orths rules (fun rules orth ->
181 182 try
  183 + (* print_endline ("preselect_dict: " ^ orth); *)
182 184 preselect orths lemmas rules (StringMap.find dict orth)
183 185 with Not_found -> rules)
184 186  
... ... @@ -195,7 +197,7 @@ let add_ordnum_rules orths rules =
195 197 (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
196 198  
197 199 let add_quot_rule rules =
198   - (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
  200 + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","Interp",[]) :: rules
199 201  
200 202 let add_building_number_rules dig_orths letter_orths rules =
201 203 StringSet.fold dig_orths rules (fun rules dig1 ->
... ... @@ -215,15 +217,22 @@ let add_building_number_rules dig_orths letter_orths rules =
215 217  
216 218 let select_rules paths mwe_dict mwe_dict2 =
217 219 let orths = get_orths paths in
  220 + (* print_endline ("ENIAM_MWE.select_rules 1 orths=[" ^ String.concat ";" (StringSet.to_list orths) ^ "]"); *)
218 221 let lemmas = get_lemmas paths in
219 222 let intnum_orths = get_intnum_orths paths in
220   - let year_orths = get_year_orths paths in
221   - let letter_orths = get_single_letter_orths paths in
  223 + (* let year_orths = get_year_orths paths in *)
  224 + (* let letter_orths = get_single_letter_orths paths in *)
222 225 let rules = preselect_dict orths lemmas mwe_dict [] in
  226 + (* print_endline ("ENIAM_MWE.select_rules 1 |rules|=" ^ string_of_int (Xlist.size rules)); *)
  227 + (* Xlist.iter rules (fun (is_mwe,match_list,lemma,cat,interp) -> print_endline lemma); *)
223 228 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
  229 + (* print_endline ("ENIAM_MWE.select_rules 2 |rules|=" ^ string_of_int (Xlist.size rules)); *)
224 230 let rules = add_ordnum_rules intnum_orths rules in
  231 + (* print_endline ("ENIAM_MWE.select_rules 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
225 232 let rules = add_quot_rule rules in
226   - let rules = add_building_number_rules year_orths letter_orths rules in
  233 + (* print_endline ("ENIAM_MWE.select_rules 4 |rules|=" ^ string_of_int (Xlist.size rules)); *)
  234 + (* let rules = add_building_number_rules year_orths letter_orths rules in *) (* FIXME !!!! *)
  235 + (* print_endline ("ENIAM_MWE.select_rules 5 |rules|=" ^ string_of_int (Xlist.size rules) ^ " |year_orths|=" ^ string_of_int (StringSet.size year_orths) ^ " |letter_orths|=" ^ string_of_int (StringSet.size letter_orths)); *)
227 236 rules
228 237  
229 238 let rec check_interp sels = function
... ... @@ -306,7 +315,8 @@ let create_token is_mwe (matching:token_env list) sels lemma cat interp = (* FIX
306 315 beg=beg;
307 316 len=len;
308 317 next=t.next;
309   - token=Lemma(lemma,cat,[Xlist.map interp (function
  318 + token=if cat = "Interp" then Interp lemma else
  319 + Lemma(lemma,cat,[Xlist.map interp (function
310 320 S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
311 321 | V s -> Xstring.split "\\." s
312 322 | G -> ["_"])]);
... ... @@ -327,18 +337,34 @@ let apply_rule paths (is_mwe,match_list,lemma,cat,interp) =
327 337 add_token paths token
328 338 with Not_found -> paths)
329 339  
  340 +let count_path_size paths =
  341 + IntMap.fold paths 0 (fun n _ map2 ->
  342 + IntMap.fold map2 n (fun n _ set ->
  343 + TokenEnvSet.size set + n))
  344 +
330 345 let process (paths,last) =
  346 + (* print_endline ("ENIAM_MWE.process 1 |paths|=" ^ string_of_int (Xlist.size paths)); *)
331 347 let paths = Xlist.fold paths IntMap.empty add_token in
  348 + (* print_endline ("ENIAM_MWE.process 2 |paths|=" ^ string_of_int (count_path_size paths)); *)
332 349 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  350 + (* print_endline ("ENIAM_MWE.process 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
333 351 let paths = Xlist.fold rules paths apply_rule in
  352 + (* print_endline ("ENIAM_MWE.process 4 |paths|=" ^ string_of_int (count_path_size paths)); *)
334 353 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  354 + (* print_endline ("ENIAM_MWE.process 5 |rules|=" ^ string_of_int (Xlist.size rules)); *)
335 355 let paths = Xlist.fold rules paths apply_rule in
  356 + (* print_endline ("ENIAM_MWE.process 6 |paths|=" ^ string_of_int (count_path_size paths)); *)
336 357 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  358 + (* print_endline ("ENIAM_MWE.process 7 |rules|=" ^ string_of_int (Xlist.size rules)); *)
337 359 let paths = Xlist.fold rules paths apply_rule in
  360 + (* print_endline "ENIAM_MWE.process 8"; *)
338 361 let rules = select_rules paths !mwe_dict !mwe_dict2 in
  362 + (* print_endline "ENIAM_MWE.process 9"; *)
339 363 let paths = Xlist.fold rules paths apply_rule in
  364 + (* print_endline "ENIAM_MWE.process 10"; *)
340 365 let paths = IntMap.fold paths [] (fun paths _ map ->
341 366 IntMap.fold map paths (fun paths _ l ->
342 367 TokenEnvSet.fold l paths (fun paths t ->
343 368 t :: paths))) in
  369 + (* print_endline "ENIAM_MWE.process 11"; *)
344 370 ENIAMpaths.sort (paths,last)
... ...
subsyntax/ENIAMsentences.ml
... ... @@ -139,6 +139,13 @@ let find_query paragraph tokens chart last =
139 139 (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
140 140 (fun ids -> Tokens("query",ids))
141 141  
  142 +let find_query2 paragraph tokens chart last =
  143 + parse_bracket_rule paragraph tokens chart last
  144 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
  145 + (fun tokens id -> true)
  146 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
  147 + (fun ids -> Tokens("query",ids))
  148 +
142 149 let find_tokens_in_chart tokens chart lnode rnode cat =
143 150 let found = Xlist.fold chart.(lnode) [] (fun found (id,rnode2) ->
144 151 if rnode = rnode2 then
... ... @@ -149,7 +156,8 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
149 156 else found) in
150 157 match found with
151 158 [x] -> x
152   - | _ -> failwith "Unable to extract sentences. Check puntuation."
  159 + | [] -> failwith "Unable to extract sentences. Check puntuation."
  160 + | _ -> failwith "find_tokens_in_chart"
153 161  
154 162 (*let find_tokens_in_chart_id tokens chart lnode rnode cat =
155 163 let found = Int.fold 0 last [] (fun ids lnode ->
... ... @@ -215,6 +223,13 @@ let extract_sentences pid tokens chart last =
215 223 psentence=AltSentence[Raw,RawSentence paragraph;
216 224 ENIAM,StructSentence("",paths,last)]}]*)
217 225  
  226 +let extract_sentences2 pid tokens chart last =
  227 + let ids = find_tokens_in_chart tokens chart 0 last "query" in
  228 + let paths,last = make_paths tokens ids in
  229 + let sentences = [{id="0"; beg=0; len=last; next=last; file_prefix="";
  230 + sentence=AltSentence([ENIAM,StructSentence(paths,last)])}] in
  231 + add_struct_sentence_ids pid sentences
  232 +
218 233 (*
219 234 let is_sentence = function
220 235 Sentence _ -> true
... ... @@ -269,6 +284,7 @@ let make_chart paths last =
269 284 chart
270 285  
271 286 let split_into_sentences pid paragraph tokens paths =
  287 + (* print_endline "split_into_sentences"; *)
272 288 let paths = make_ids tokens paths in
273 289 let paths,last = prepare_indexes paths in
274 290 let chart = make_chart paths last in
... ... @@ -280,3 +296,12 @@ let split_into_sentences pid paragraph tokens paths =
280 296 find_paren_sentences par tokens chart last;
281 297 find_query par tokens chart last;
282 298 extract_sentences pid tokens chart last
  299 +
  300 +let no_split_into_sentences pid paragraph tokens paths =
  301 + (* print_endline "no_split_into_sentences"; *)
  302 + let paths = make_ids tokens paths in
  303 + let paths,last = prepare_indexes paths in
  304 + let chart = make_chart paths last in
  305 + let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in
  306 + find_query2 par tokens chart last;
  307 + extract_sentences2 pid tokens chart last
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -233,7 +233,8 @@ let rec select_tokens2_rec last paths nodes map =
233 233 select_tokens2_rec last paths nodes map
234 234  
235 235 let rec calculate_quality q = function
236   - CS :: l -> calculate_quality (q-2) l
  236 + FC :: l -> calculate_quality (q-2) l
  237 + | CS :: l -> calculate_quality (q-2) l
237 238 | MaybeCS :: l -> calculate_quality q l
238 239 | ReqValLemm :: l -> calculate_quality q l
239 240 | MWE :: l -> calculate_quality (q+6) l
... ... @@ -313,7 +314,7 @@ let initialize () =
313 314  
314 315 let parse query =
315 316 let l = ENIAMtokenizer.parse query in
316   -(* print_endline "a6"; *)
  317 + (* print_endline "a6"; *)
317 318 let paths = ENIAMpaths.translate_into_paths l in
318 319 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a7"; *)
319 320 (* print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
... ... @@ -324,21 +325,13 @@ let parse query =
324 325 let paths,_ = ENIAM_MWE.process paths in
325 326 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a12"; *)
326 327 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
327   -(* let paths = find_proper_names paths in*)
328 328 let paths = List.rev (Xlist.rev_map paths find_proper_names) in
329 329 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a13"; *)
330 330 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
331 331 let paths = modify_weights paths in
332 332 let paths = translate_digs paths in
333   -(* let paths = assign_senses paths in
334   -(* print_endline "a14"; *)
335   - let paths = assign_valence paths in*)
336   -(* print_endline "a15"; *)
  333 + (* print_endline "a14"; *)
337 334 let paths = combine_interps paths in
338   -(* print_endline "a16"; *)
339   -(* let paths = disambiguate_senses paths in
340   - let paths = assign_simplified_valence paths in
341   - let paths = PreSemantics.assign_semantics paths in*)
342 335 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a16"; *)
343 336 (* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
344 337 let paths = select_tokens paths in
... ... @@ -351,36 +344,39 @@ let parse query =
351 344 (* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a19"; *)
352 345 paths(*, next_id*)
353 346  
354   -let parse_text_tokens tokens query =
  347 +let parse_text_tokens sentence_split_flag tokens query =
355 348 (* print_endline ("parse_text_tokens: " ^ query); *)
356 349 let paragraphs = Xstring.split "\n\\|\r" query in
357 350 let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
358 351 let n = if Xlist.size paragraphs = 1 then 0 else 1 in
359 352 let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
360 353 try
  354 + (* print_endline paragraph; *)
361 355 let paths = parse paragraph in
362 356 (* print_endline "parse_text 1"; *)
363 357 let pid = if n = 0 then "" else string_of_int n ^ "_" in
364   - let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
  358 + let sentences =
  359 + if sentence_split_flag then ENIAMsentences.split_into_sentences pid paragraph tokens paths
  360 + else ENIAMsentences.no_split_into_sentences pid paragraph tokens paths in
365 361 (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1
366 362 with e ->
367 363 (AltParagraph[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)]) :: paragraphs, n+1) in
368 364 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
369 365  
370   -let parse_text query =
  366 +let parse_text sentence_split_flag query =
371 367 (* print_endline ("parse_text: " ^ query); *)
372 368 let tokens = ExtArray.make 100 empty_token_env in
373 369 let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
374   - parse_text_tokens tokens query
  370 + parse_text_tokens sentence_split_flag tokens query
375 371  
376 372 let catch_parse text =
377 373 try
378 374 let tokens = parse text in tokens,""
379 375 with e -> [], Printexc.to_string e
380 376  
381   -let catch_parse_text text =
  377 +let catch_parse_text sentence_split_flag text =
382 378 try
383   - let text,tokens = parse_text text in text,tokens,""
  379 + let text,tokens = parse_text sentence_split_flag text in text,tokens,""
384 380 with e ->
385 381 RawText text,
386 382 ExtArray.make 0 empty_token_env,
... ...
subsyntax/interface.ml
... ... @@ -18,24 +18,28 @@
18 18 *)
19 19  
20 20 type output = Text | Xml | Html | Marsh | Graphviz
  21 +type sentence_split = Full | Partial | None
21 22  
22 23 let output = ref Text
23 24 let comm_stdio = ref true
24   -let sentence_split = ref true
  25 +let sentence_split = ref Full
25 26 let port = ref 5439
26 27  
27 28 let spec_list = [
28   - "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
29   - "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  29 + "-s", Arg.Unit (fun () -> sentence_split:=Full), "Split input into sentences (default)";
  30 + "-a", Arg.Unit (fun () -> sentence_split:=Partial), "Split input into paragraphs, do not split input into sentences";
  31 + "-n", Arg.Unit (fun () -> sentence_split:=None), "Do not split input into sentences";
30 32 "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
31 33 "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
32 34 "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
33 35 "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
34 36 "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
35 37 "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
36   - "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
  38 + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=None), "Output as graphviz dot file; turns sentence split off";
37 39 "--strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=true), "Perform strong disambiguation";
38 40 "--no-strong-disamb", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.strong_disambiguate_flag:=false), "Do not perform strong disambiguation (default)";
  41 + "--internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=true), "Relaxed attitude towards interpunction";
  42 + "--no-internet-mode", Arg.Unit (fun () -> ENIAMtokenizerTypes.internet_mode:=false), "Strict attitude towards interpunction (default)";
39 43 ]
40 44  
41 45 let usage_msg =
... ... @@ -62,8 +66,10 @@ let rec main_loop in_chan out_chan =
62 66 (* print_endline "input text begin";
63 67 print_endline text;
64 68 print_endline "input text end"; *)
65   - (if !sentence_split then
66   - let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in
  69 + (if !sentence_split = Full || !sentence_split = Partial then
  70 + let text,tokens,msg =
  71 + if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true text
  72 + else ENIAMsubsyntax.catch_parse_text false text in
67 73 (match !output with
68 74 Text ->
69 75 if msg = "" then output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^
... ...
subsyntax/resources/brev.tab
... ... @@ -209,6 +209,7 @@ J . Jezioro subst:_:_:n:ncol
209 209 j . jak adv:pos
210 210 j . język subst:_:_:m3
211 211 J . jezioro subst:_:_:n:ncol
  212 +j . jednostka subst:_:_:f
212 213 Jdt Księga Judyty subst:sg:_:f
213 214 Jer . Księga Jeremiasza subst:sg:_:f
214 215 Jez . Jezioro subst:_:_:n:ncol
... ... @@ -736,7 +737,9 @@ zob . zobaczyć impt:sg:sec:perf
736 737 Zw . związek subst:_:_:m3
737 738 ż . żeński adj:_:_:_:pos
738 739 ż . żółty adj:_:_:_:pos
739   -μ m mikrometr subst:_:_:m3
  740 +µ m mikrometr subst:_:_:m3
  741 +µ mol mikromol subst:_:_:m3
  742 +µ g mikrogram subst:_:_:m3
740 743 A . A. subst:_:_:m1.f
741 744 B . B. subst:_:_:m1.f
742 745 C . C. subst:_:_:m1.f
... ...
tokenizer/ENIAMacronyms.ml
... ... @@ -419,12 +419,12 @@ let acronym_patterns = [
419 419 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m1" | _ -> failwith "acronym_patterns");
420 420 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m2" | _ -> failwith "acronym_patterns");
421 421 [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
422   - [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n2" | _ -> failwith "acronym_patterns");
  422 + [L; S "-"; L; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:n:ncol" | _ -> failwith "acronym_patterns");
423 423 [CL; S "-"; CL; S "-"; O "ach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:loc:f" | _ -> failwith "acronym_patterns");
424 424 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m1" | _ -> failwith "acronym_patterns");
425 425 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m2" | _ -> failwith "acronym_patterns");
426 426 [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
427   - [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n2" | _ -> failwith "acronym_patterns");
  427 + [L; S "-"; L; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:n:ncol" | _ -> failwith "acronym_patterns");
428 428 [CL; S "-"; CL; S "-"; O "ami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:inst:f" | _ -> failwith "acronym_patterns");
429 429 [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
430 430 [CL; S "-"; CL; S "-"; O "cie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "T" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
... ... @@ -448,7 +448,7 @@ let acronym_patterns = [
448 448 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m1" | _ -> failwith "acronym_patterns");
449 449 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m2" | _ -> failwith "acronym_patterns");
450 450 [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns");
451   - [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n2" | _ -> failwith "acronym_patterns");
  451 + [L; S "-"; L; S "-"; O "em"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:n:ncol" | _ -> failwith "acronym_patterns");
452 452 [L; S "-"; L; S "-"; O "etach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
453 453 [L; S "-"; L; S "-"; O "etami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
454 454 [L; S "-"; L; S "-"; O "etem"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:inst:m3" | _ -> failwith "acronym_patterns");
... ... @@ -488,7 +488,7 @@ let acronym_patterns = [
488 488 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m1" | _ -> failwith "acronym_patterns");
489 489 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m2" | _ -> failwith "acronym_patterns");
490 490 [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:m3" | _ -> failwith "acronym_patterns");
491   - [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n2" | _ -> failwith "acronym_patterns");
  491 + [L; S "-"; L; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:dat:n:ncol" | _ -> failwith "acronym_patterns");
492 492 [CL; S "-"; CL; S "-"; O "om"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:dat:f" | _ -> failwith "acronym_patterns");
493 493 [L; S "-"; L; S "-"; O "otach"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:loc:m3" | _ -> failwith "acronym_patterns");
494 494 [L; S "-"; L; S "-"; O "otami"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:inst:m3" | _ -> failwith "acronym_patterns");
... ... @@ -503,13 +503,13 @@ let acronym_patterns = [
503 503 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m1" | _ -> failwith "acronym_patterns");
504 504 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m2" | _ -> failwith "acronym_patterns");
505 505 [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:m3" | _ -> failwith "acronym_patterns");
506   - [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n2" | _ -> failwith "acronym_patterns");
  506 + [L; S "-"; L; S "-"; O "owi"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:dat:n:ncol" | _ -> failwith "acronym_patterns");
507 507 [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m1" | _ -> failwith "acronym_patterns");
508 508 [L; S "-"; L; S "-"; O "owie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m1" | _ -> failwith "acronym_patterns");
509 509 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:acc:m2" | _ -> failwith "acronym_patterns");
510 510 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m2" | _ -> failwith "acronym_patterns");
511 511 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:m3" | _ -> failwith "acronym_patterns");
512   - [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n2" | _ -> failwith "acronym_patterns");
  512 + [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:gen:n:ncol" | _ -> failwith "acronym_patterns");
513 513 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m1" | _ -> failwith "acronym_patterns");
514 514 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns");
515 515 [L; S "-"; L; S "-"; O "u"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
... ... @@ -520,30 +520,30 @@ let acronym_patterns = [
520 520 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "depr:pl:voc:m2" | _ -> failwith "acronym_patterns");
521 521 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m2" | _ -> failwith "acronym_patterns");
522 522 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m3" | _ -> failwith "acronym_patterns");
523   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n2" | _ -> failwith "acronym_patterns");
  523 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:n:ncol" | _ -> failwith "acronym_patterns");
524 524 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m2" | _ -> failwith "acronym_patterns");
525 525 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:m3" | _ -> failwith "acronym_patterns");
526   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n2" | _ -> failwith "acronym_patterns");
  526 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:nom:n:ncol" | _ -> failwith "acronym_patterns");
527 527 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m2" | _ -> failwith "acronym_patterns");
528 528 [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:m3" | _ -> failwith "acronym_patterns");
529   - [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n2" | _ -> failwith "acronym_patterns");
  529 + [L; S "-"; L; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:voc:n:ncol" | _ -> failwith "acronym_patterns");
530 530 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:acc:f" | _ -> failwith "acronym_patterns");
531 531 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:nom:f" | _ -> failwith "acronym_patterns");
532 532 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:pl:voc:f" | _ -> failwith "acronym_patterns");
533 533 [CL; S "-"; CL; S "-"; O "y"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:gen:f" | _ -> failwith "acronym_patterns");
534 534 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m2" | _ -> failwith "acronym_patterns");
535 535 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
536   - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n2" | _ -> failwith "acronym_patterns");
  536 + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:n:ncol" | _ -> failwith "acronym_patterns");
537 537 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m2" | _ -> failwith "acronym_patterns");
538 538 [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
539   - [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n2" | _ -> failwith "acronym_patterns");
  539 + [L; S "-"; L; S "-"; O "ze"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:n:ncol" | _ -> failwith "acronym_patterns");
540 540 [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:loc:m3" | _ -> failwith "acronym_patterns");
541 541 [L; S "-"; L; S "-"; O "zie"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:sg:voc:m3" | _ -> failwith "acronym_patterns");
542 542 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:acc:m1" | _ -> failwith "acronym_patterns");
543 543 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m1" | _ -> failwith "acronym_patterns");
544 544 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m2" | _ -> failwith "acronym_patterns");
545 545 [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:m3" | _ -> failwith "acronym_patterns");
546   - [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n2" | _ -> failwith "acronym_patterns");
  546 + [L; S "-"; L; S "-"; O "ów"], (function [x;y;z;_;_] -> compose_lemma3 x y z "" "subst:pl:gen:n:ncol" | _ -> failwith "acronym_patterns");
547 547 [CL; S "-"; CL; S "-"; O "ą"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:inst:f" | _ -> failwith "acronym_patterns");
548 548 [CL; S "-"; CL; S "-"; O "ę"], (function [x;y;z;_;_] -> compose_lemma3 x y z "A" "subst:sg:acc:f" | _ -> failwith "acronym_patterns");
549 549 [L; S "-"; L; S "-"; O "ista"], (function [x;y;z;_;_] -> compose_lemma3 x y z "-ista" "subst:sg:nom:m1" | _ -> failwith "acronym_patterns");
... ... @@ -706,6 +706,7 @@ let abr_patterns = [
706 706 [O "itd"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tak","adv:pos";1,"daleko","adv:com"] | _ -> failwith "abr_patterns");
707 707 [O "itede"; S "."], (function [a;b] -> std a b [1,"i","conj";2,"tak","adv:pos";2,"daleko","adv:com"] | _ -> failwith "abr_patterns");
708 708 [O "itp"; S "."], (function [a;b] -> std a b [1,"i","conj";1,"tym","adv";1,"podobny","adj:pl:nom:_:pos"] | _ -> failwith "abr_patterns");
  709 + [O "j"; S "."; O "m"; S "."], (function [a;b;c;d] -> [ct [a;b] "jednostka" "subst:_:_:f"; ct [c;d] "miary" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
709 710 [O "jw"; S "."], (function [a;b] -> std a b [1,"jak","adv:pos";1,"wysoko","adv:com"] | _ -> failwith "abr_patterns");
710 711 [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:m1:pos";1,"pan","subst:_:$C:m1"] | _ -> failwith "abr_patterns");
711 712 [O "JWP"], (function [a] -> st a [1,"jaśnie","adv:pos";1,"wielmożny","adj:_:$C:f:pos";1,"pani","subst:_:$C:f"] | _ -> failwith "abr_patterns");
... ... @@ -717,35 +718,36 @@ let abr_patterns = [
717 718 [O "m"; S "."; O "in"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
718 719 [O "m"; S "."; O "in"], (function [a;b;c] -> [ct [a;b] "między" "prep:inst"; ct [c] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
719 720 [O "m"; S "."; O "inn"; S "."], (function [a;b;c;d] -> [ct [a;b] "między" "prep:inst"; ct [c;d] "inny" "adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns");
720   - [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n2"; ct [c;d] "stołeczny" "adj:_:$C:n2:pos"] | _ -> failwith "abr_patterns");
  721 + [O "m"; S "."; O "st"; S "."], (function [a;b;c;d] -> [ct [a;b] "miasto" "subst:_:$C:n:ncol"; ct [c;d] "stołeczny" "adj:_:$C:n:pos"] | _ -> failwith "abr_patterns");
721 722 [O "m"; O "^"; O "2"], (function [a;b;c] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b;c] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
722 723 [O "m"; O "2"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "kwadratowy" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
723 724 [O "m"; O "3"], (function [a;b] -> [ct [a] "metr" "subst:_:$C:m3"; ct [b] "sześcienny" "adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
724 725 (* [O "min"; S "."], (function [a;b] -> std a b [1,"między","prep:inst";2,"inny","adj:pl:inst:_:pos"] | _ -> failwith "abr_patterns"); *)
  726 + [O "mc"; S "."], (function [a;b] -> std a b [1,"masa","subst:sg:$C:f";1,"ciało","subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
725 727 [O "mkw"; S "."], (function [a;b] -> std a b [1,"metr","subst:_:$C:m3";2,"kwadratowy","adj:_:$C:m3:pos"] | _ -> failwith "abr_patterns");
726 728 [O "n"; S "."; O "e"; S "."], (function [a;b;c;d] -> [ct [a;b] "nasz" "adj:sg:gen:f:pos"; ct [c;d] "era" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
727   - [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns");
  729 + [O "n"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "nad" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
728 730 [O "np"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"przykład","subst:sg:acc:m3"] | _ -> failwith "abr_patterns");
729 731 [O "nt"; S "."], (function [a;b] -> std a b [1,"na","prep:acc";1,"temat","subst:sg:acc:m3"] | _ -> failwith "abr_patterns");
730 732 [O "NTG"], (function [a] -> st a [1,"nie","qub";1,"ta","adj:sg:nom:f:pos";1,"grupa","subst:sg:nom:f"] | _ -> failwith "abr_patterns");
731 733 [O "o"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "ograniczony" "adj:sg:$C:f:pos"; ct [c;d] "odpowiedzialność" "subst:sg:$C:f"] | _ -> failwith "abr_patterns");
732 734 [O "p"; S "."; O "n"; S "."; O "e"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "przed" "prep:inst"; ct [c;d] "nasz" "adj:sg:inst:f:pos"; ct [e;f] "era" "subst:sg:inst:f"] | _ -> failwith "abr_patterns");
733 735 [O "p"; S "."; O "o"; S "."], (function [a;b;c;d] -> [ct [a;b] "pełniący" "pact:_:_:m1.m2.m3:imperf:aff"; ct [c;d] "obowiązek" "subst:pl:acc:m3"] | _ -> failwith "abr_patterns");
734   - [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n2"] | _ -> failwith "abr_patterns");
  736 + [O "p"; S "."; O "p"; S "."; O "m"; S "."], (function [a;b;c;d;e;f] -> [ct [a;b] "pod" "prep:inst"; ct [c;d] "poziom" "subst:sg:inst:m3"; ct [e;f] "morze" "subst:sg:gen:n:ncol"] | _ -> failwith "abr_patterns");
735 737 [O "p"; S "."; O "t"; S "."], (function [a;b;c;d] -> [ct [a;b] "pod" "prep:inst:nwokc"; ct [c;d] "tytuł" "subst:sg:inst:m3"] | _ -> failwith "abr_patterns");
736 738 [O "pn"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"nazwa","subst:sg:inst:f"] | _ -> failwith "abr_patterns");
737 739 [O "pne"; S "."], (function [a;b] -> std a b [1,"przed","prep:inst";1,"nasz","adj:sg:inst:f:pos";1,"era","subst:sg:inst:f"] | _ -> failwith "abr_patterns");
738 740 [O "pt"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"tytuł","subst:sg:inst:m3"] | _ -> failwith "abr_patterns");
739 741 [O "PW"], (function [a] -> st a [1,"prywatny","adj:_:$C:f:pos";1,"wiadomość","subst:_:$C:f"] | _ -> failwith "abr_patterns");
740   - [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n2"] | _ -> failwith "abr_patterns");
  742 + [O "pw"; S "."], (function [a;b] -> std a b [1,"pod","prep:inst";1,"wezwanie","subst:sg:inst:n:ncol"] | _ -> failwith "abr_patterns");
741 743 (* [O "S"; S "."; O "A"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "akcyjny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns");
742 744 [O "s"; S "."; O "c"; S "."], (function [a;b;c;d] -> [ct [a;b] "spółka" "subst:sg:$C:f"; ct [c;d] "cywilny" "adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns");*)
743 745 (* [O "SA"], (function [a] -> st a [1,"spółka","subst:sg:$C:f";1,"akcyjny","adj:sg:$C:f:pos"] | _ -> failwith "abr_patterns"); *)
744 746 [O "ś"; S "."; O "p"; S "."], (function [a;b;c;d] -> [ct [a;b] "święty" "adj:sg:gen:f:pos"; ct [c;d] "pamięć" "subst:sg:gen:f"] | _ -> failwith "abr_patterns");
745 747 [O "śp"; S "."], (function [a;b] -> std a b [1,"święty","adj:sg:gen:f:pos";1,"pamięć","subst:sg:gen:f"] | _ -> failwith "abr_patterns");
746 748 [O "tgz"; S "."], (function [a;b] -> std a b [2,"tak","adv";1,"zwać","ppas:_:_:_:_:aff"] | _ -> failwith "abr_patterns");
747   - [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
748   - [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n2";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
  749 + [O "tj"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";1,"być","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
  750 + [O "tzn"; S "."], (function [a;b] -> std a b [1,"to","subst:sg:nom:n:ncol";2,"znaczyć","fin:sg:ter:imperf"] | _ -> failwith "abr_patterns");
749 751 [O "tzw"; S "."], (function [a;b] -> std a b [1,"tak","adv:pos";2,"zwać","ppas:_:_:_:imperf:aff"] | _ -> failwith "abr_patterns");
750 752 [O "ub"; S "."; O "r"; S "."], (function [a;b;c;d] -> [ct [a;b] "ubiegły" "adj:sg:$C:m3:pos"; ct [c;d] "rok" "subst:sg:$C:m3"] | _ -> failwith "abr_patterns");
751 753 [O "w"; S "."; O "w"; S "."], (function [a;b;c;d] -> [ct [a;b] "wysoko" "adv:com"; ct [c;d] "wymienić" "ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns");
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -405,7 +405,7 @@ let digit_patterns4 = [
405 405 [C "realnum-interval"; O "mld"], (function [x;_] -> make_tys 9 x | _ -> failwith "digit_patterns8");
406 406 ]
407 407  
408   -let url_patterns1 = [
  408 +(*let url_patterns1 = [
409 409 [L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
410 410 [L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
411 411 [L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url"));
... ... @@ -461,9 +461,12 @@ let url_patterns1 = [
461 461 let url_patterns2 = [
462 462 [L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
463 463 [L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  464 + [L; S "_"; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
464 465 [L; S "."; L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  466 + [L; S "."; D "dig"; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
465 467 [L; D "intnum"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
466 468 [L; S "."; L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
  469 + [L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email"));
467 470 [O "http"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
468 471 [O "https"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url"));
469 472 ]
... ... @@ -472,7 +475,7 @@ let url_patterns3 = [
472 475 [D "url"; S "/"], (function l -> Dig(concat_orths2 l,"url"));
473 476 [D "url"; S "/"; L], (function l -> Dig(concat_orths2 l,"url"));
474 477 [D "url"; S "/"; L; S "."; L], (function l -> Dig(concat_orths2 l,"url"));
475   -]
  478 +]*)
476 479  
477 480 let html_patterns = [
478 481 [S "<"; L; S ">"], (function l -> Dig(concat_orths2 l,"html-tag"));
... ... @@ -701,7 +704,7 @@ let manage_query_boundaries tokens =
701 704 if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
702 705 if find_beg_pattern [I "</query>";I "”s"] tokens then
703 706 replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
704   - if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else
  707 + if find_beg_pattern [I "</query>";I ")s"(*;I "</sentence>"*)] tokens then tokens else
705 708 replace_beg_pattern [I "</query>"] add_sentence_end tokens in
706 709 let tokens = Xlist.rev_map tokens revert_tokens in
707 710 tokens
... ... @@ -724,12 +727,12 @@ let find_replacement_patterns tokens =
724 727 let tokens = find_patterns ENIAMacronyms.name_patterns tokens in
725 728 (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
726 729 let tokens = normalize_tokens [] tokens in
727   - let tokens = find_patterns url_patterns1 tokens in
  730 +(* let tokens = find_patterns url_patterns1 tokens in
728 731 let tokens = normalize_tokens [] tokens in
729 732 let tokens = find_patterns url_patterns2 tokens in
730 733 let tokens = normalize_tokens [] tokens in
731 734 let tokens = find_patterns url_patterns3 tokens in
732   - let tokens = normalize_tokens [] tokens in
  735 + let tokens = normalize_tokens [] tokens in*)
733 736 let tokens = find_patterns html_patterns tokens in
734 737 let tokens = normalize_tokens [] tokens in
735 738 (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
... ...
tokenizer/ENIAMtokenizer.ml
... ... @@ -21,7 +21,8 @@ open Xstd
21 21 open ENIAMtokenizerTypes
22 22  
23 23 let initialize () =
24   - ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ()
  24 + ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ();
  25 + ENIAMurl.top_level_domains := ENIAMurl.load_top_level_domains ()
25 26  
26 27 let string_of =
27 28 ENIAMtokens.string_of_tokens
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -41,7 +41,7 @@ type token =
41 41 | Tokens of string * int list (*cat * token id list *)
42 42  
43 43 type attr =
44   - CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics
  44 + FC | CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman | Capitalics
45 45 | SentBeg | SentEnd | SentBegEnd
46 46 | BrevLemma of string
47 47 | Disamb of string * string * string list list
... ... @@ -71,6 +71,8 @@ type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD
71 71 let empty_token_env = {
72 72 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
73 73  
  74 +let internet_mode = ref false
  75 +
74 76 let resource_path =
75 77 try Sys.getenv "ENIAM_RESOURCE_PATH"
76 78 with Not_found ->
... ... @@ -82,6 +84,8 @@ let resource_path =
82 84 let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab"
83 85 let mte_filename2 = resource_path ^ "/tokenizer/mte.tab"
84 86  
  87 +let top_level_domains_filename = resource_path ^ "/tokenizer/top-level-domains.tab"
  88 +
85 89 module OrderedTokenEnv = struct
86 90  
87 91 type t = token_env
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -87,7 +87,8 @@ let rec xml_of_token = function
87 87 | Tokens(cat,l) -> Xml.Element("Tokens",["pos",cat],Xlist.map l (fun x -> Xml.Element("id",[],[Xml.PCData (string_of_int x)])))
88 88  
89 89 let string_of_attr = function
90   - CS -> "cs"
  90 + FC -> "first capital"
  91 + | CS -> "cs"
91 92 | MaybeCS -> "maybe cs"
92 93 | ReqValLemm -> "required validated lemmatization"
93 94 | MWE -> "mwe"
... ... @@ -212,9 +213,9 @@ let merge_digits poss_s_beg i digs =
212 213 (if Xlist.size digs <= 3 && List.hd digs <> "0" then [t (Dig(v,"pref3dig"));sc_t (Dig(v,"pref3dig"))] else []) in*)
213 214 Variant variants
214 215  
215   -let merge_url poss_s_beg i digs =
  216 +(* let merge_url poss_s_beg i digs =
216 217 let orth = String.concat "" digs in
217   - Variant(dig_tokens orth poss_s_beg i digs orth "url")
  218 + Variant(dig_tokens orth poss_s_beg i digs orth "url") *)
218 219  
219 220 let recognize_roman_I v = function
220 221 Capital("I",_) :: Capital("I",_) :: Capital("I",_) :: [] -> v+3,false
... ... @@ -335,6 +336,7 @@ let get_first_lower = function
335 336 | _ -> failwith "get_first_lower"
336 337  
337 338 let cs_weight = -1.
  339 +let fc_weight = -10.
338 340 let sc_cap_weight = -0.3
339 341  
340 342 let is_add_attr_token = function
... ... @@ -361,13 +363,17 @@ let recognize_stem poss_s_beg has_sufix i letters =
361 363 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter(merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}];
362 364 Token{t with token=CapLetter(orth,merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs};
363 365 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=CapLetter(orth,merge (lowercase_first letters)); weight=sc_cap_weight; attrs=MaybeCS :: t.attrs}]]
  366 + else if !internet_mode then Variant[
  367 + Token{t with token=SmallLetter orth};
  368 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SmallLetter orth}]]
364 369 else Token{t with token=SmallLetter orth}
365 370 else
366 371 if first_capital letters then
367   - if rest_small letters then Variant[
  372 + if rest_small letters then Variant([
368 373 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_first letters))}];
369 374 Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)};
370   - Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]]
  375 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters); weight=sc_cap_weight}]] @
  376 + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else []))
371 377 else if rest_capital letters then Variant([
372 378 Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs};
373 379 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs}];
... ... @@ -377,6 +383,13 @@ let recognize_stem poss_s_beg has_sufix i letters =
377 383 Token{t with token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs};
378 384 Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllCap(orth,merge (lowercase_rest letters),merge (lowercase_all letters)); attrs=MaybeCS :: t.attrs}]]))
379 385 else Token{t with token=SomeCap orth}
  386 + else if !internet_mode then
  387 + if rest_small letters then Variant[
  388 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=AllSmall orth}];
  389 + Token{t with token=AllSmall orth}]
  390 + else Variant[
  391 + Seq[s_beg i;c_beg (i+1);Token{t with beg=t.beg+2; len=t.len-2; token=SomeCap orth}];
  392 + Token{t with token=SomeCap orth}]
380 393 else
381 394 if rest_small letters then Token{t with token=AllSmall orth}
382 395 else Token{t with token=SomeCap orth}
... ... @@ -388,8 +401,9 @@ let recognize_stem poss_s_beg has_sufix i letters =
388 401 else Token{t with token=SmallLetter orth}
389 402 else
390 403 if first_capital letters then
391   - if rest_small letters then
392   - Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}
  404 + if rest_small letters then Variant([
  405 + Token{t with token=FirstCap(orth,merge (lowercase_first letters),get_first_cap letters,get_first_lower letters)}] @
  406 + (if !internet_mode then [Token{t with token=AllSmall(merge (lowercase_first letters)); weight=fc_weight; attrs=FC :: t.attrs}] else []))
393 407 else if rest_capital letters then Variant([
394 408 Token{t with token=AllSmall(merge (lowercase_all letters)); weight=cs_weight; attrs=CS :: t.attrs};
395 409 Token{t with token=FirstCap(merge (lowercase_rest letters),merge (lowercase_all letters),get_first_cap letters,get_first_lower letters); weight=cs_weight; attrs=CS :: t.attrs}] @
... ... @@ -547,24 +561,24 @@ let rec group_others rev = function
547 561 | x :: l -> List.rev rev, x :: l
548 562  
549 563 let create_sign_token poss_s_beg i signs l token =
550   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  564 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | Small s -> s | _ -> failwith "create_sign_token")) in
551 565 let len = Xlist.size signs * factor in
552 566 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=token; attrs=[MaybeCS]},i+len,l,poss_s_beg
553 567  
554 568 let create_empty_sign_token i signs =
555   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  569 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_empty_sign_token")) in
556 570 let len = Xlist.size signs * factor in
557 571 {empty_token_env with orth=orth;beg=i;len=len;next=i+len; attrs=[MaybeCS]},i+len
558 572  
559 573 let create_sentence_seq i signs l lemma =
560   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  574 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq")) in
561 575 let len = Xlist.size signs * factor in
562 576 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
563 577 Token{empty_token_env with orth=orth;beg=i+20;len=len-30;next=i+len-10;token=make_lemma (lemma,"sinterj")};
564 578 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
565 579  
566 580 let create_sentence_seq_hapl i signs l lemma =
567   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  581 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl")) in
568 582 let len = Xlist.size signs * factor in
569 583 Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
570 584 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
... ... @@ -572,7 +586,7 @@ let create_sentence_seq_hapl i signs l lemma =
572 586 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
573 587  
574 588 let create_sentence_seq_q i signs l lemma =
575   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  589 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_q")) in
576 590 let len = Xlist.size signs * factor in
577 591 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "?"};
578 592 Token{empty_token_env with beg=i+20;len=10;next=i+30;token=Interp "</clause>"};
... ... @@ -580,7 +594,7 @@ let create_sentence_seq_q i signs l lemma =
580 594 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
581 595  
582 596 let create_sentence_seq_hapl_q i signs l lemma =
583   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  597 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sentence_seq_hapl_q")) in
584 598 let len = Xlist.size signs * factor in
585 599 Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
586 600 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "?"};
... ... @@ -589,7 +603,7 @@ let create_sentence_seq_hapl_q i signs l lemma =
589 603 Token{empty_token_env with beg=i+len-10;len=10;next=i+len;token=Interp "</sentence>"}]
590 604  
591 605 let create_or_beg i signs l poss_s_beg =
592   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  606 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg")) in
593 607 let len = Xlist.size signs * factor in
594 608 Variant[
595 609 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Symbol "-"; attrs=[MaybeCS]};
... ... @@ -606,7 +620,7 @@ let create_or_beg i signs l poss_s_beg =
606 620 ],i+len,l,poss_s_beg
607 621  
608 622 let create_or_beg2 i signs l poss_s_beg =
609   - let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_sign_token")) in
  623 + let orth = String.concat "" (Xlist.map signs (function Sign s -> s | _ -> failwith "create_or_beg2")) in
610 624 let len = Xlist.size signs * factor in
611 625 Variant[
612 626 Token{empty_token_env with orth=orth;beg=i;len=len;next=i+len;token=Interp "<or>"};
... ... @@ -631,18 +645,18 @@ let is_dot_sentence_end_marker = function
631 645 | _ -> false
632 646  
633 647 let not_dot_sentence_end_marker = function
634   - Sign " " :: Small _ :: _ -> true
635   - | Sign "" :: Small _ :: _ -> true
636   - | Sign " " :: Small _ :: _ -> true
  648 + Sign " " :: Small _ :: _ -> if !internet_mode then false else true
  649 + | Sign "" :: Small _ :: _ -> if !internet_mode then false else true
  650 + | Sign " " :: Small _ :: _ -> if !internet_mode then false else true
637 651 | Sign "," :: _ -> true
638 652 | Sign ":" :: _ -> true
639 653 | Sign "?" :: _ -> true
640 654 | Sign "!" :: _ -> true
641   - | Small _ :: _ -> true
642   - | ForeignSmall _ :: _ -> true
643   - | Capital _ :: _ -> true
644   - | ForeignCapital _ :: _ -> true
645   - | Digit _ :: _ -> true
  655 + | Small _ :: _ -> if !internet_mode then false else true
  656 + | ForeignSmall _ :: _ -> if !internet_mode then false else true
  657 + | Capital _ :: _ -> if !internet_mode then false else true
  658 + | ForeignCapital _ :: _ -> if !internet_mode then false else true
  659 + | Digit _ :: _ -> if !internet_mode then false else true
646 660 | _ -> false
647 661  
648 662 let is_comma_digit_marker = function
... ... @@ -705,6 +719,7 @@ let rec recognize_sign_group poss_s_beg i = function
705 719 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
706 720 | (Sign "") :: l -> create_sign_token poss_s_beg i [Sign ""] l (Symbol " ")
707 721 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
  722 + | (Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: l -> create_sign_token poss_s_beg i ((Sign "&") :: (Small "n") :: (Small "b") :: (Small "s") :: (Small "p") :: (Sign ";") :: []) l (Symbol " ")
708 723 | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l
709 724 | (Sign "\"") :: l ->
710 725 let t,i2 = create_empty_sign_token i [Sign "\""] in
... ... @@ -775,12 +790,16 @@ let rec recognize_sign_group poss_s_beg i = function
775 790 | (Sign ";") :: (Sign ")") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: (Sign ")") :: []) l (make_lemma (";))","sinterj")) *)
776 791 | (Sign ":") :: (Sign "|") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "|") :: []) l (make_lemma (":|","sinterj"))
777 792 | (Sign ":") :: (Sign "\\") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "\\") :: []) l (make_lemma (":\\","sinterj"))
  793 + | (Sign ":") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "/") :: []) l (make_lemma (":/","sinterj"))
778 794 | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj"))
779 795 (* | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj"))
780 796 | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj")) *)
781 797 | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*)
782   - let t,i = create_empty_sign_token i [Sign ")"] in
783   - Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg
  798 + let t,i2 = create_empty_sign_token i [Sign ")"] in
  799 + Variant[Token{t with token=Symbol ")"};Token{t with token=Interp ")"};Token{t with token=Interp ")s"};
  800 + Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Interp "</clause>"};
  801 + Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</sentence>"};
  802 + Token{empty_token_env with orth=":";beg=i+20;len=factor-20;next=i+factor;token=Interp ")s"}]],i2,l,true
784 803 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
785 804 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
786 805 | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol"))
... ... @@ -928,29 +947,32 @@ let rec recognize_sign_group poss_s_beg i = function
928 947 create_sentence_seq i ((Sign ".") :: (Sign ".") :: []) l "…";
929 948 Token{empty_token_env with orth="..";beg=i;len=2*factor;next=i+2*factor;token=make_lemma ("…","sinterj"); attrs=[MaybeCS]}],i+2*factor,l,true
930 949 | (Sign ".") :: l ->
931   - if is_dot_sentence_end_marker l then
  950 + if is_dot_sentence_end_marker l then ((*Printf.printf "dot 1 i=%d\n%!" i;*)
932 951 Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
933 952 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
934 953 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
935 954 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
936   - Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true
937   - else if not_dot_sentence_end_marker l then
938   - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false
939   - else
  955 + Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}]],i+factor,l,true)
  956 + else if not_dot_sentence_end_marker l then ((*Printf.printf "dot 2 i=%d\n%!" i;*)
  957 + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]},i+factor,l,false)
  958 + else ((*Printf.printf "dot 3 i=%d\n%!" i;*)
940 959 Variant[Seq[Token{empty_token_env with beg=i;len=10;next=i+10;token=Symbol "."; attrs=[MaybeCS]};
941 960 Token{empty_token_env with beg=i+10;len=10;next=i+20;token=Interp "</clause>"};
942 961 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
943 962 Seq[Token{empty_token_env with beg=i;len=20;next=i+20;token=Interp "</clause>"};
944 963 Token{empty_token_env with orth=".";beg=i+20;len=factor-20;next=i+factor;token=Interp "</sentence>"}];
945   - Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true
  964 + Token{empty_token_env with orth=".";beg=i;len=factor;next=i+factor;token=Symbol "."; attrs=[MaybeCS]}],i+factor,l,true)
946 965 | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "*****") (* zastępniki liter *)
947 966 | (Sign "*") :: (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*";Sign "*"] l (Interp "****")
948 967 | (Sign "*") :: (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*";Sign "*"] l (Interp "***")
949 968 | (Sign "*") :: (Sign "*") :: l -> create_sign_token poss_s_beg i [Sign "*";Sign "*"] l (Interp "**")
950   - | (Sign "*") :: l -> (* Interp zastępnik liter i cudzysłów, symbol listy *)
951   - let t,i = create_empty_sign_token i [Sign "*"] in
952   - Variant[Token{t with token=Interp "*"};Token{t with token=Symbol "*"}],i,l,poss_s_beg
953   - | (Sign "+") :: l -> create_sign_token poss_s_beg i [Sign "+"] l (Symbol "+")
  969 + | (Sign "*") :: l -> (* Interp to zastępnik liter i cudzysłów, symbol listy *)
  970 + let t,i2 = create_empty_sign_token i [Sign "*"] in
  971 + Variant([Token{t with token=Interp "*"};Token{t with token=Symbol "*"}] @
  972 + (if !internet_mode then [sc_dig_token "*" i [Sign "*"] (make_lemma ("*","symbol"))] else [])),i2,l,poss_s_beg
  973 + | (Sign "+") :: l -> (* Interp to spójnik *)
  974 + let t,i2 = create_empty_sign_token i [Sign "+"] in
  975 + Variant[Token{t with token=Interp "+"};Token{t with token=Symbol "+"}],i2,l,poss_s_beg
954 976 | (Sign "«") :: l ->
955 977 let t,i = create_empty_sign_token i [Sign "«"] in
956 978 Variant[Token{t with token=Interp "«"};Token{t with token=Interp "«s"}],i,l,poss_s_beg
... ... @@ -1005,6 +1027,7 @@ let rec recognize_sign_group poss_s_beg i = function
1005 1027 | (Sign "_") :: l -> create_sign_token poss_s_beg i [Sign "_"] l (Symbol "_")
1006 1028 | (Sign "@") :: l -> create_sign_token poss_s_beg i [Sign "@"] l (Symbol "@")
1007 1029 | (Sign "×") :: l -> create_sign_token poss_s_beg i [Sign "×"] l (Symbol "×")
  1030 + | (Sign "±") :: l -> create_sign_token poss_s_beg i [Sign "±"] l (Symbol "±")
1008 1031 | (Sign "%") :: l ->
1009 1032 let t,i = create_empty_sign_token i [Sign "%"] in
1010 1033 Variant[Token{t with token=Symbol "%"};Token{t with token=make_lemma ("procent","subst:_:_:m3")}],i,l,false
... ... @@ -1018,12 +1041,15 @@ let rec recognize_sign_group poss_s_beg i = function
1018 1041 | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
1019 1042 | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
1020 1043 | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
  1044 + | (Sign "®") :: l -> create_sign_token poss_s_beg i [Sign "®"] l (Symbol "®")
  1045 + | (Sign "µ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
  1046 + | (Sign "μ") :: l -> create_sign_token poss_s_beg i [Sign "µ"] l (Symbol "µ")
1021 1047 | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s)
1022 1048 | l -> failwith "recognize_sign_group"
1023 1049  
1024 1050 (* FIXME: "„Szpak” frunie." trzeba przenieść <sentence> przed „, ale zostawić po „s. *)
1025 1051  
1026   -let rec group_url rev = function
  1052 +(*let rec group_url rev = function
1027 1053 Small s :: l -> group_url (s :: rev) l
1028 1054 | Capital(s,t) :: l -> group_url (s :: rev) l
1029 1055 | ForeignSmall s :: l -> group_url (s :: rev) l
... ... @@ -1040,24 +1066,34 @@ let rec group_url rev = function
1040 1066 | Sign "," :: l -> group_url ("," :: rev) l
1041 1067 | Sign "~" :: l -> group_url ("~" :: rev) l
1042 1068 | Sign "_" :: l -> group_url ("_" :: rev) l
1043   - | l -> List.rev rev, l
  1069 + | l -> List.rev rev, l*)
  1070 +
  1071 +let merge_url poss_s_beg i len orth cat =
  1072 + if poss_s_beg then
  1073 + Variant[Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)};
  1074 + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=len*factor-2;next=i+len*factor;token=Dig(orth,cat)}]]
  1075 + else
  1076 + Token{empty_token_env with orth=orth;beg=i;len=len*factor;next=i+len*factor;token=Dig(orth,cat)}
1044 1077  
1045 1078 let rec group_chars poss_s_beg i rev = function
1046 1079 [] -> List.rev ((Token{empty_token_env with beg=i;len=factor;next=i+factor;token=Interp "</query>"}) :: rev)
1047   - | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
  1080 + (* | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
1048 1081 let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l
1049 1082 | (Small "h") :: (Small "t") :: (Small "t") :: (Small "p") :: (Small "s") :: (Sign ":") :: (Sign "/") :: (Sign "/") :: _ as l ->
1050   - let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l
1051   - | (Digit s) :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l
1052   - | (Sign s) :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l
1053   - | (Capital(s,t)) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1054   - | (ForeignCapital(s,t)) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1055   - | (Small s) :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1056   - | (ForeignSmall s) :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
1057   - | (Other(s,x)) :: l ->
  1083 + let x,l = group_url [] l in group_chars false (i + Xlist.size x * factor) ((merge_url poss_s_beg i x) :: rev) l *)
  1084 + | Digit s :: l -> let x,l = group_digits [] ((Digit s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_digits poss_s_beg i x) :: rev) l
  1085 + | Sign s :: l -> let x,i,l,poss_s_beg = recognize_sign_group poss_s_beg i ((Sign s) :: l) in group_chars poss_s_beg i (x :: rev) l
  1086 + | Capital(s,t) :: l -> let x,l = group_letters [] ((Capital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1087 + | ForeignCapital(s,t) :: l -> let x,l = group_letters [] ((ForeignCapital(s,t)) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1088 + | Small s :: l -> let x,l = group_letters [] ((Small s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1089 + | ForeignSmall s :: l -> let x,l = group_letters [] ((ForeignSmall s) :: l) in group_chars false (i + Xlist.size x * factor) ((merge_letters poss_s_beg i x) :: rev) l
  1090 + | Emoticon s :: l -> group_chars poss_s_beg (i + factor) ((Token{empty_token_env with orth=s;beg=i;len=factor;next=i+factor;token=make_lemma (s,"sinterj")}) :: rev) l
  1091 + | Other("url",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "url") :: rev) l
  1092 + | Other("email",len) :: Sign s :: l -> group_chars false (i + len * factor) ((merge_url poss_s_beg i len s "email") :: rev) l
  1093 + | Other(s,x) :: l ->
1058 1094 let x,l = group_others [] ((Other(s,x)) :: l) in
1059 1095 group_chars false (i + Xlist.size x * factor)
1060   - ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+factor;token=Other(String.concat "" x)}) :: rev) l
  1096 + ((Token{empty_token_env with orth=String.concat "" x;beg=i;len=Xlist.size x * factor;next=i+Xlist.size x * factor;token=Other(String.concat "" x)}) :: rev) l
1061 1097  
1062 1098 let tokenize l =
1063   - (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] l)
  1099 + (Token{empty_token_env with beg=0;len=factor;next=factor;token=Interp "<query>"}) :: (group_chars true factor [] (ENIAMurl.find l))
... ...
tokenizer/makefile
... ... @@ -6,25 +6,26 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES= ENIAMtokenizerTypes.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml
  9 +SOURCES= ENIAMtokenizerTypes.ml ENIAMurl.ml ENIAMtokens.ml ENIAMacronyms.ml ENIAMpatterns.ml ENIAMtokenizer.ml
10 10  
11 11 all: eniam-tokenizer.cma eniam-tokenizer.cmxa
12 12  
13 13 install: all
14 14 mkdir -p $(INSTALLDIR)
15 15 cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR)
16   - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
17   - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
  16 + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
  17 + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
18 18 mkdir -p /usr/share/eniam/tokenizer
19 19 cp resources/* /usr/share/eniam/tokenizer
20 20  
21 21 install-local: all
22 22 mkdir -p $(INSTALLDIR)
23 23 cp eniam-tokenizer.cmxa eniam-tokenizer.a eniam-tokenizer.cma $(INSTALLDIR)
24   - cp ENIAMtokenizerTypes.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
25   - cp ENIAMtokenizerTypes.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
  24 + cp ENIAMtokenizerTypes.cmi ENIAMurl.cmi ENIAMtokens.cmi ENIAMacronyms.cmi ENIAMpatterns.cmi ENIAMtokenizer.cmi $(INSTALLDIR)
  25 + cp ENIAMtokenizerTypes.cmx ENIAMurl.cmx ENIAMtokens.cmx ENIAMacronyms.cmx ENIAMpatterns.cmx ENIAMtokenizer.cmx $(INSTALLDIR)
26 26 mkdir -p /usr/local/share/eniam/tokenizer
27 27 cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab
  28 + cp resources/top-level-domains.tab /usr/local/share/eniam/tokenizer/top-level-domains.tab
28 29 cp resources/README /usr/local/share/eniam/tokenizer/README
29 30 # ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
30 31  
... ...