open Xstd open Printf let split_haslo s = match Xstring.split_delim "\xC2\xA0 " s with [lemma] -> lemma,"" | [lemma;"I"] -> lemma,"I" | [lemma;"II"] -> lemma,"II" | [lemma;"III"] -> lemma,"III" | [lemma;"IV"] -> lemma,"IV" | [lemma;"V"] -> lemma,"V" | [lemma;"VI"] -> lemma,"VI" | _ -> failwith ("split_haslo: " ^ s) let rec check_cut_prefix pats s msg = match pats with [] -> failwith (msg ^ ": " ^ s) | pat :: pats -> if Xstring.check_prefix pat s then Xstring.cut_prefix pat s else check_cut_prefix pats s msg let rec check_cut_sufix pats s msg = match pats with [] -> failwith (msg ^ ": " ^ s) | pat :: pats -> if Xstring.check_sufix pat s then Xstring.cut_sufix pat s else check_cut_sufix pats s msg let full_split pat s = Xlist.map (Str.full_split (Str.regexp pat) s) (function Str.Text s -> s | Str.Delim s -> s) type sense = {variant: string; qualifier: string; subsenses: (string * string * string list) list; see: string * string * string * string list} type entry = {id: string; head: string; head_variant: string; pos: string; titles: (string * string) list; senses: sense list; derivatives: string list; pron: string} let empty_sense = {variant=""; qualifier=""; subsenses=[]; see="","","",[]} let empty_entry = {id=""; head=""; head_variant=""; pos=""; titles=[]; senses=[]; derivatives=[]; pron=""} let parse_sense attrs l = let sense = Xlist.fold attrs empty_sense (fun sense -> function "variant",variant -> {sense with variant=variant} | "qualifier",qualifier -> {sense with qualifier=qualifier} | k,v -> failwith ("parse_sense 1: " ^ k ^ "=" ^ v)) in let sense = Xlist.fold l sense (fun sense -> function Xml.Element("subsense",attrs,Xml.PCData s :: l) -> let v = match attrs with [] -> "" | ["variant",v] -> v | _ -> failwith "parse_sense 6" in let l = Xlist.map l (function Xml.Element("example",[],[Xml.PCData s]) -> s | _ -> failwith "parse_sense 2") in {sense with subsenses=(v,s,l) :: sense.subsenses} | Xml.Element("see",attrs,l) -> let id,head,head_variant = Xlist.fold attrs ("","","") (fun (id,head,head_variant) -> function "id",id -> id,head,head_variant | "head",head -> id,head,head_variant | "head_variant",head_variant -> id,head,head_variant | k,v -> failwith ("parse_sense 3: " ^ k ^ "=" ^ v)) in let variants = Xlist.map l (function Xml.Element("sense",["variant",v],[]) -> v | _ -> failwith "parse_sense 4") in {sense with see=id,head,head_variant,variants} | _ -> failwith "parse_sense 5") in {sense with subsenses=List.rev sense.subsenses} let load_sjp filename = match Xml.parse_file filename with Xml.Element("sjp",[],l) -> Xlist.rev_map l (function Xml.Element("entry",attrs,l) -> let entry = Xlist.fold attrs empty_entry (fun entry -> function "id",id -> {entry with id=id} | "head",head -> {entry with head=head} | "head_variant",head_variant -> {entry with head_variant=head_variant} | "pos",pos -> {entry with pos=pos} | k,v -> failwith ("load_sjp 2: " ^ k ^ "=" ^ v)) in let entry = Xlist.fold l entry (fun entry -> function Xml.Element("title",[],[Xml.PCData s]) -> {entry with titles=(s,"") :: entry.titles} | Xml.Element("title",["title_variant",v],[Xml.PCData s]) -> {entry with titles=(s,v) :: entry.titles} | Xml.Element("sense",attrs,l) -> {entry with senses=(parse_sense attrs l) :: entry.senses} | Xml.Element("der",[],[Xml.PCData s]) -> {entry with derivatives=s :: entry.derivatives} | Xml.Element("pron",[],l) -> {entry with pron=String.concat "" (Xlist.map l Xml.to_string)} | xml -> failwith ("load_sjp 3: " ^ Xml.to_string xml)) in {entry with titles=List.rev entry.titles; senses=List.rev entry.senses; derivatives=List.rev entry.derivatives} | _ -> failwith "load_sjp 4") | _ -> failwith "load_sjp 1" let rec extract_words_rec = function "„" :: s :: "”" :: l -> s :: (extract_words_rec l) | "„" :: _ -> failwith "extract_words_rec" | "”" :: _ -> failwith "extract_words_rec" | s :: l -> extract_words_rec l | [] -> [] let extract_words s = extract_words_rec (full_split "„\\|”" s) let find_words sjp = Xlist.fold sjp (StringSet.empty,StringSet.empty) (fun (words,ids) entry -> (* if entry.derivatives <> [] then Printf.printf "%s: %s\n" entry.head (String.concat " " entry.derivatives); *) let words = Xlist.fold entry.titles words (fun words (s,_) -> StringSet.add words s) in Xlist.fold entry.senses (words,ids) (fun (words,ids) sense -> let id,head,_,_ = sense.see in let ids = if id = "" then ids else StringSet.add ids id in let words = if head = "" then words else StringSet.add words head in let words = Xlist.fold sense.subsenses words (fun words (_,s,_) -> let l = extract_words s in Xlist.fold l words StringSet.add) in words,ids)) let id_list dir = let l = Array.to_list (Sys.readdir dir) in Xlist.fold l [] (fun l name -> if name = "lista" then l else (check_cut_sufix [".html"] name "id_list") :: l) let html_header = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"> <html> <head> <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\"> <TITLE>Słownik</TITLE> <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\"> </head> <body> <center>" let html_trailer = "</center> </body> </html>" (* wygenerowanie listy adresów leksemów do których są odnośniki, a których nie ma lokalnie *) (* let _ = let sjp = load_sjp "data/sjp.xml" in let words,ids = find_words sjp in Printf.printf "|words|=%d |ids|=%d\n%!" (StringSet.size words) (StringSet.size ids); let l = File.load_tab "data/adresy_hasla.tab" (function [name;url] -> name,url | _ -> failwith "load adresy_hasla") in let l = Xlist.fold l [] (fun l (name,url) -> let word,_ = split_haslo name in let id = check_cut_sufix [".html"] (check_cut_prefix ["http://sjp.pwn.pl/sjp/"] url "make id") "make id" in if StringSet.mem words word || StringSet.mem ids id then id :: l else l) in Printf.printf "|l|=%d\n%!" (Xlist.size l); let known_ids = StringSet.of_list (id_list "data/sjp.pwn.pl/sjp/") in Printf.printf "|known_ids|=%d\n%!" (StringSet.size known_ids); Xlist.iter l (fun id -> if StringSet.mem known_ids id then () else print_endline ("http://sjp.pwn.pl/sjp/" ^ id ^ ".html")); () *) let html_of_subsenses sv l = String.concat "<br>" (List.flatten (Xlist.map l (fun (v,gloss,exs) -> let v = match v with "" -> "" | "a" -> "<b>a</b>" | _ -> (if sv = "" then "" else " ") ^ "<b>"^v^"</b>" in (v^" "^gloss) :: (Xlist.map exs (fun ex -> " <i>" ^ ex ^ "</i>"))))) let html_of_see (id,head,head_variant,variants) = let s = if variants = [] then "" else " w znaczeniu " ^ String.concat ", " variants in sprintf "<b>zobacz</b> „%s%s” %s" head (if head_variant = "" then "" else " " ^ head_variant) s let html_of_entry e = let lemmas = String.concat "<br>" (Xlist.map e.titles (fun (s,v) -> if v = "" then s else s ^ " " ^ v)) in let head = "<b>" ^ (if e.head_variant = "" then e.head else e.head ^ " " ^ e.head_variant) ^ "</b>" in let lemmas = if lemmas = "" then head else head ^ "<br>" ^ lemmas in let senses = String.concat "<br>" (Xlist.map e.senses (fun s -> sprintf "<b>%s</b> <i>%s</i> %s" s.variant s.qualifier (if s.subsenses <> [] && s.see = ("","","",[]) then html_of_subsenses s.variant s.subsenses else if s.subsenses = [] && s.see <> ("","","",[]) then html_of_see s.see else failwith "html_of_entry"))) in sprintf "<td>%s</td><td>%s</td><td>%s</td>" lemmas e.pos senses let html_of_entries filename l = File.file_out filename (fun file -> fprintf file "%s\n" html_header; fprintf file "<table border=1>\n"; Xlist.iter l (fun e -> fprintf file "<tr>%s</tr>\n" (html_of_entry e)); fprintf file "</table>\n"; fprintf file "%s\n" html_trailer) let clean_titles e = let titles = Xlist.fold e.titles [] (fun titles (t,v) -> if t = e.head && v = e.head_variant then titles else (t,v) :: titles) in {e with titles = List.rev titles} let sjp_compare e f = if e.head <> f.head then compare e.head f.head else compare e.head_variant f.head_variant let sjp_compare2 e f = let s = List.hd e.senses in let t = List.hd f.senses in if s.see <> t.see then compare s.see t.see else compare s.subsenses t.subsenses let select_entries set sjp = Xlist.fold sjp [] (fun sjp e -> let b = Xlist.fold (e.head :: (Xlist.rev_map e.titles fst)) false (fun b s -> StringSet.mem set s || b) in if b then e :: sjp else sjp) let load_lu_list filename = File.fold_tab filename StringMap.empty (fun map -> function [name;cat] -> StringMap.add_inc map name (StringSet.singleton cat) (fun set -> StringSet.add set cat) | l -> failwith ("load_lu_list: " ^ String.concat "\t" l)) let select_lu lu_map l = StringMap.fold lu_map StringSet.empty (fun set s cats -> let b = Xlist.fold l false (fun b cat -> StringSet.mem cats cat || b) in if b then StringSet.add set s else set) let html_of_selected_entries sjp lu_map selected_cats filename = let set = select_lu lu_map selected_cats in let sjp = select_entries set sjp in let sjp = List.sort sjp_compare (Xlist.rev_map sjp clean_titles) in html_of_entries filename sjp let _ = let lu_map = load_lu_list "../base_LU.tab" in let sjp = load_sjp "data/SJP.xml" in (* html_of_selected_entries sjp lu_map ["adv"] "adv.html"; html_of_selected_entries sjp lu_map ["adj"] "adj.html"; html_of_selected_entries sjp lu_map ["subst"] "subst.html"; html_of_selected_entries sjp lu_map ["comp";"comp-NKJP1"] "comp.html"; html_of_selected_entries sjp lu_map ["conj";"conj-NKJP1"] "conj.html"; html_of_selected_entries sjp lu_map ["interj";"interj-NKJP1"] "interj.html"; html_of_selected_entries sjp lu_map ["num";"num-NKJP1"] "num.html"; html_of_selected_entries sjp lu_map ["prep";"prep-NKJP1"] "prep.html"; html_of_selected_entries sjp lu_map ["qub";"qub-NKJP1"] "qub.html"; html_of_selected_entries sjp lu_map ["burk";"burk-NKJP1"] "burk.html";*) (* let sjp = List.sort sjp_compare2 (Xlist.rev_map sjp clean_titles) in html_of_entries "sjp2.html" sjp; *) let sjp = List.sort sjp_compare (Xlist.rev_map sjp clean_titles) in html_of_entries "results/SJP.html" sjp; () let resolve_titles sjp = let sjp_set = Xlist.fold sjp StringSet.empty (fun set e -> StringSet.add set (e.head ^ "#" ^ e.head_variant)) in let titles = Xlist.fold sjp [] (fun titles e -> let e = clean_titles e in Xlist.fold e.titles titles (fun titles (title,title_variant) -> if StringSet.mem sjp_set (title ^ "#" ^ title_variant) then titles else (title,title_variant,e.head,e.head_variant) :: titles)) in Xlist.fold titles sjp (fun sjp (title,title_variant,head,head_variant) -> {empty_entry with head=title; head_variant=title_variant; senses=[{empty_sense with see="",head,head_variant,[]}]} :: sjp) let assign_pos sjp = let sjp_map = Xlist.fold sjp StringMap.empty (fun map e -> StringMap.add map (e.head ^ "#" ^ e.head_variant) e.pos) in Xlist.rev_map sjp (fun e -> let pos_set = StringSet.singleton e.pos in let pos_set = Xlist.fold e.senses pos_set (fun pos_set s -> let _,head,head_variant,_ = s.see in try StringSet.add pos_set (StringMap.find sjp_map (head ^ "#" ^ head_variant)) with Not_found -> pos_set) in let pos = String.concat "_" (StringSet.to_list (StringSet.remove pos_set "")) in {e with pos=pos}) let print_entry_sense file e s = let _,sh,sv,sl = s.see in if sh = "" then Xlist.iter s.subsenses (fun (v,gloss,ex) -> fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" e.head e.pos e.head_variant s.variant v gloss (String.concat " " ex)) else let sl = if sl = [] then "" else " w znaczeniu " ^ String.concat ", " sl in let gloss = sprintf "zobacz „%s%s” %s" sh (if sv = "" then "" else " " ^ sv) sl in fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t\n" e.head e.pos e.head_variant s.variant "" gloss let tab_of_entries filename l = File.file_out filename (fun file -> Xlist.iter l (fun e -> Xlist.iter e.senses (fun s -> print_entry_sense file e s))) let _ = let sjp = load_sjp "data/SJP.xml" in let sjp = resolve_titles sjp in let sjp = assign_pos sjp in let sjp = List.sort sjp_compare (Xlist.rev_map sjp clean_titles) in tab_of_entries "results/SJP.tab" sjp; ()