Commit 584f96578ef06859f8178daa45954162be0229cf

Authored by Wojciech Jaworski
1 parent 9043cc39

wczytywanie header.xml

Showing 1 changed file with 25 additions and 7 deletions
NKJP2/ENIAM_NKJP.ml
... ... @@ -19,6 +19,16 @@
19 19  
20 20 open Xstd
21 21  
(** [load_header path name] parses the file [path ^ name ^ "/header.xml"]
    and returns the pair [(typ, channel)], i.e. the [target] attributes of
    the [catRef] elements whose [scheme] is [#taxonomy-NKJP-type] and
    [#taxonomy-NKJP-channel] respectively.

    The document must have exactly the layout spelled out in the pattern
    below (element order and attribute order included).

    @raise Failure if the parsed document does not have that layout; the
    message contains the path of the offending file so that a failure in a
    run over many folders can be traced back to its source.
    Exceptions raised by [Xml.parse_file] itself are not caught here. *)
let load_header path name =
  let filename = path ^ name ^ "/header.xml" in
  match Xml.parse_file filename with
  | Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_);
      Xml.Element("profileDesc",[],[Xml.Element("textClass",[],[
        Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]);
        Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]);
      Xml.Element("revisionDesc",_,_)]) ->
    typ, channel
  | _ -> failwith ("load_header: " ^ filename)
  31 +
(** [get_folders path] lists the entries of directory [path] that are
    themselves directories, sorted with the polymorphic [compare].

    Entry names are appended to [path] by plain string concatenation, so
    [path] has to end with a directory separator for the
    [Sys.is_directory] test to look at the right file. *)
let get_folders path =
  let entries = Array.to_list (Sys.readdir path) in
  (* Fold step: keep [entry] only when it names a sub-directory. *)
  let keep_directory acc entry =
    if Sys.is_directory (path ^ entry) then entry :: acc else acc in
  Xlist.sort (Xlist.fold entries [] keep_directory) compare
... ... @@ -83,17 +93,24 @@ let load_segmentation path name =
83 93 List.rev (Xlist.rev_map entries load_segm_entry)
84 94 | _ -> failwith "load_segmentation"
85 95  
(** [load_disamb node] extracts the text of the [interpretation] feature
    from a [tool_report] feature structure: an [fs] element whose children
    are a [choice] feature followed by an [interpretation] feature holding
    a single [string] element.

    @raise Failure with the pretty-printed XML when [node] has any other
    shape. *)
let load_disamb node =
  match node with
  | Xml.Element("fs",["feats",_;"type","tool_report"],
      [Xml.Element("f",["fVal",_;"name","choice"],_);
       Xml.Element("f",["name","interpretation"],
         [Xml.Element("string",[],[Xml.PCData interp])])]) ->
    interp
  | _ -> failwith ("load_disamb: " ^ Xml.to_string_fmt node)

(** [load_morph_token seg] reads one [seg] element of the morphosyntactic
    layer and returns [(corresp, id_seg, orth, interp)]: the [corresp] and
    [xml:id] attributes of the segment, the text of its [orth] feature and
    the interpretation obtained by applying [load_disamb] to the single
    child of its [disamb] feature.

    Two layouts of the inner [morph] feature structure are accepted: with
    and without an [nps] feature (binary value [true]) between [orth] and
    [interps]. The content of [interps] is ignored.

    @raise Failure with the pretty-printed XML when [seg] matches neither
    layout, or from [load_disamb] when the [disamb] child is malformed. *)
let load_morph_token = function
  | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],
      [Xml.Element("fs",["type","morph"],
        ( [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
           Xml.Element("f",["name","interps"],_);
           Xml.Element("f",["name","disamb"],[disamb])]
        | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
           Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]);
           Xml.Element("f",["name","interps"],_);
           Xml.Element("f",["name","disamb"],[disamb])] ))]) ->
    corresp, id_seg, orth, load_disamb disamb
  | token -> failwith ("load_morph_token: " ^ Xml.to_string_fmt token)
97 114  
98 115 let load_morph_sentence = function
99 116 Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
... ... @@ -120,9 +137,10 @@ let _ =
120 137 let names = get_folders nkjp_path in
121 138 Xlist.iter names (fun name ->
122 139 print_endline name;
  140 + let header = load_text nkjp_path name in
123 141 (* let text = load_text nkjp_path name in *)
124 142 (* let segmentation = load_segmentation nkjp_path name in *)
125   - let morphosyntax = load_morphosyntax nkjp_path name in
  143 + (* let morphosyntax = load_morphosyntax nkjp_path name in *)
126 144 ())
127 145  
128 146 (*
... ...