Commit caeb305ab86cefe8f85c04835fae2a47f2715eae

Authored by Wojciech Jaworski
1 parent 584f9657

łączenie na poziomie haseł

Showing 1 changed file with 106 additions and 15 deletions
NKJP2/ENIAM_NKJP.ml
... ... @@ -19,6 +19,34 @@
19 19  
20 20 open Xstd
21 21  
  (** Structured form of an NKJP identifier or cross-reference, as produced by
      [parse_id]:
      - [corref]: the text before ['#'], or [""] when the id contains no ['#'];
      - [prefix]: the text before ['_'];
      - [numbers]: the dot-separated integers between ['_'] and ['-'];
      - [suffix]: the text after ['-']. *)
  22 +type id = {corref: string; prefix: string; suffix: string; numbers: int list}
  23 +
  (** An [id] whose three string fields are [""] and whose [numbers] list is
      empty.  NOTE(review): no live use is visible in this chunk. *)
  24 +let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []}
  25 +
  26 +let parse_id id =
  27 + (* if String.length s = 0 then empty_id else *)
  28 + if String.length id < 6 then failwith "parse_id: za krótkie id" else
  29 + let corref,id = match Xstring.split "#" id with
  30 + [corref;id] -> corref,id
  31 + | [id] -> "",id
  32 + | _ -> failwith ("parse_id 1: " ^ id) in
  33 + let prefix,id = match Xstring.split "_" id with
  34 + [prefix;id] -> prefix,id
  35 + | _ -> failwith ("parse_id 2: " ^ id) in
  36 + let suffix,id = match Xstring.split "-" id with
  37 + [id;suffix] -> suffix,id
  38 + | _ -> failwith ("parse_id 3: " ^ id) in
  39 + let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in
  40 + {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers}
  41 +
  (** [process_header_type typ] checks [typ] with
      [Xstring.check_prefix "#typ_"] and, when the check succeeds, returns
      [Xstring.cut_prefix "#typ_" typ] (presumably [typ] without that prefix;
      both helpers are defined outside this chunk -- TODO confirm).
      @raise Failure with message ["process_header_type: " ^ typ] when the
      prefix check fails. *)
  42 +let process_header_type typ =
  43 + if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ
  44 + else failwith ("process_header_type: " ^ typ)
  45 +
  (** [process_header_channel c] checks [c] with
      [Xstring.check_prefix "#kanal_"] and, when the check succeeds, returns
      [Xstring.cut_prefix "#kanal_" c] (presumably [c] without that prefix;
      both helpers are defined outside this chunk -- TODO confirm).
      @raise Failure with message ["process_header_channel: " ^ c] when the
      prefix check fails. *)
  46 +let process_header_channel c =
  47 + if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c
  48 + else failwith ("process_header_channel: " ^ c)
  49 +
22 50 let load_header path name =
23 51 match Xml.parse_file (path ^ name ^ "/header.xml") with
24 52 Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_);
... ... @@ -26,7 +54,7 @@ let load_header path name =
26 54 Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]);
27 55 Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]);
28 56 Xml.Element("revisionDesc",_,_)]) ->
29   - typ,channel
  57 + process_header_type typ,process_header_channel channel
30 58 | _ -> failwith "load_header"
31 59  
32 60 let get_folders path =
... ... @@ -35,12 +63,12 @@ let get_folders path =
35 63  
(** [load_paragraph xml] reads one [ab] element whose attributes are exactly
    [n] followed by [xml:id] and whose only child is character data.
    Returns the [xml:id] parsed by [parse_id], paired with the paragraph text.
    @raise Failure for any other XML shape; the message starts with
    ["load_paragraph: "] and includes the offending XML. *)
36 64 let load_paragraph = function
37 65 Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) ->
38   - id_ab,paragraph
  66 + parse_id id_ab,paragraph
39 67 | xml -> failwith ("load_paragraph: " ^ Xml.to_string_fmt xml)
40 68  
(** [load_text_entry xml] reads one [div] element whose attributes are exactly
    [xml:id] followed by [decls].  Returns the [xml:id] parsed by [parse_id],
    paired with the children converted by [load_paragraph].  The [rev_map]
    followed by [List.rev] keeps the children in their original order
    (assuming [Xlist.rev_map] reverses like [List.rev_map] -- TODO confirm).
    @raise Failure for any other XML shape, with the offending XML in the
    message. *)
41 69 let load_text_entry = function
42 70 Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) ->
43   - id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
  71 + parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
44 72 | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)
45 73  
46 74 let load_text path name =
... ... @@ -64,9 +92,9 @@ let remove_rejected rev = function
64 92  
65 93 let rec load_segm_token = function
66 94 Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) ->
67   - [corresp,false,id_seg]
  95 + [corresp,false,parse_id id_seg]
68 96 | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) ->
69   - [corresp,true,id_seg]
  97 + [corresp,true,parse_id id_seg]
70 98 | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token)
71 99 | Xml.Element("choice",[],alt) as xml ->
72 100 let alt = Xlist.fold alt [] remove_rejected in
... ... @@ -77,12 +105,12 @@ let rec load_segm_token = function
77 105  
(** [load_segm_sentence xml] reads one [s] element whose only attribute is
    [xml:id].  Returns the [xml:id] parsed by [parse_id], paired with the
    token lists produced by [load_segm_token] for each child, concatenated by
    [List.flatten] into a single list.
    @raise Failure for any other XML shape, with the offending XML in the
    message. *)
78 106 let load_segm_sentence = function
79 107 Xml.Element("s",["xml:id",id_s],tokens) ->
80   - id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
  108 + parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
81 109 | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml)
82 110  
(** [load_segm_entry xml] reads one [p] element whose attributes are exactly
    [corresp] followed by [xml:id].  Returns a triple: the [corresp] reference
    and the [xml:id], both parsed by [parse_id], and the children converted by
    [load_segm_sentence].
    @raise Failure for any other XML shape, with the offending XML in the
    message. *)
83 111 let load_segm_entry = function
84 112 Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
85   - corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
  113 + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
86 114 | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml)
87 115  
88 116 let load_segmentation path name =
... ... @@ -104,22 +132,22 @@ let load_morph_token = function
104 132 Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
105 133 [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
106 134 Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
107   - corresp,id_seg,orth,load_disamb disamb
  135 + parse_id corresp,parse_id id_seg,orth,load_disamb disamb
108 136 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
109 137 [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
110 138 Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]);
111 139 Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
112   - corresp,id_seg,orth,load_disamb disamb
  140 + parse_id corresp,parse_id id_seg,orth,load_disamb disamb
113 141 | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml)
114 142  
(** [load_morph_sentence xml] reads one [s] element whose attributes are
    exactly [corresp] followed by [xml:id].  Returns a triple: the [corresp]
    reference and the [xml:id], both parsed by [parse_id], and the children
    converted by [load_morph_token].
    @raise Failure for any other XML shape, with the offending XML in the
    message. *)
115 143 let load_morph_sentence = function
116 144 Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
117   - corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token)
  145 + parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token)
118 146 | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)
119 147  
(** [load_morph_entry xml] reads one [p] element whose attributes are exactly
    [corresp] followed by [xml:id].  Returns a triple: the [corresp] reference
    and the [xml:id], both parsed by [parse_id], and the children converted by
    [load_morph_sentence].
    @raise Failure for any other XML shape, with the offending XML in the
    message. *)
120 148 let load_morph_entry = function
121 149 Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
122   - corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
  150 + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
123 151 | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)
124 152  
125 153 let load_morphosyntax path name =
... ... @@ -130,6 +158,16 @@ let load_morphosyntax path name =
130 158 List.rev (Xlist.rev_map entries load_morph_entry)
131 159 | _ -> failwith "load_morphosyntax"
132 160  
  (** [merge_entries rev (text, segmentation, morphosyntax)] walks the three
      lists in lockstep and joins their heads into
      [(id_div, paragraphs, segm_sentences, morph_sentences)] tuples.
      A head triple is accepted only when
      - the text entry id has no corref, prefix ["txt"], suffix ["div"] and a
        single number;
      - the segmentation entry refers to ["text.xml"] with a [txt]/[div] id and
        is itself a corref-less [segm]/[p] id;
      - the morphosyntax entry refers to ["ann_segmentation.xml"] with a
        [segm]/[p] id and is itself a corref-less [morph]/[p] id;
      and all five numbers are equal.
      [rev] is the accumulator, newest tuple first; the result is
      [List.rev rev], returned once all three lists are empty together.
      @raise Failure ["merge_entries 2"] when the numbers disagree, and
      ["merge_entries"] for any other input (lists of different lengths or an
      id of an unexpected shape). *)
  161 +let rec merge_entries rev = function
  162 + ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
  163 + ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
  164 + {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
  165 + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
  166 + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
  (* The four pairwise checks chain all five numbers into a single equality. *)
  167 + if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
  168 + merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax)
  169 + | [],[],[] -> List.rev rev
  170 + | _ -> failwith "merge_entries"
133 171  
134 172 let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"
135 173  
... ... @@ -137,13 +175,66 @@ let _ =
137 175 let names = get_folders nkjp_path in
138 176 Xlist.iter names (fun name ->
139 177 print_endline name;
140   - let header = load_text nkjp_path name in
141   - (* let text = load_text nkjp_path name in *)
142   - (* let segmentation = load_segmentation nkjp_path name in *)
143   - (* let morphosyntax = load_morphosyntax nkjp_path name in *)
  178 + let typ,channel = load_header nkjp_path name in
  179 + (* print_endline typ; *)
  180 + (* print_endline channel; *)
  181 + (* print_endline (typ ^ "\t" ^ channel); *)
  182 + let text = load_text nkjp_path name in
  183 + let segmentation = load_segmentation nkjp_path name in
  184 + let morphosyntax = load_morphosyntax nkjp_path name in
  185 + let entries = merge_entries [] (text,segmentation,morphosyntax) in
144 186 ())
145 187  
146 188 (*
  189 +frekwencje typów (frequencies of text types):
  190 + 127 fakt
  191 + 56 inf-por
  192 + 283 konwers
  193 + 2 listy
  194 + 376 lit
  195 + 1 lit_poezja
  196 + 80 media
  197 + 175 nd
  198 + 161 net_interakt
  199 + 227 net_nieinterakt
  200 + 20 nklas
  201 + 1986 publ
  202 + 8 qmow
  203 + 387 urzed
  204 +
  205 +frekwencje kanałów (frequencies of channels):
  206 + 388 internet
  207 + 817 ksiazka
  208 + 363 mowiony
  209 + 146 prasa
  210 + 1744 prasa_dziennik
  211 + 398 prasa_inne
  212 + 5 prasa_miesiecznik
  213 + 28 prasa_tygodnik
  214 +
  215 +frekwencje łączne typów-kanałów (joint frequencies of type-channel pairs):
  216 + 127 fakt ksiazka
  217 + 56 inf-por ksiazka
  218 + 283 konwers mowiony
  219 + 2 listy ksiazka
  220 + 376 lit ksiazka
  221 + 1 lit_poezja ksiazka
  222 + 80 media mowiony
  223 + 175 nd ksiazka
  224 + 161 net_interakt internet
  225 + 227 net_nieinterakt internet
  226 + 20 nklas ksiazka
  227 + 60 publ ksiazka
  228 + 146 publ prasa
  229 + 1744 publ prasa_dziennik
  230 + 3 publ prasa_inne
  231 + 5 publ prasa_miesiecznik
  232 + 28 publ prasa_tygodnik
  233 + 8 qmow prasa_inne
  234 + 387 urzed prasa_inne
  235 +
  236 + *)
  237 +(*
147 238  
148 239 type id = {hash: bool; suffix: string; numbers: int list}
149 240  
... ...