open Xstd let strony_listy = [ "A", 49; "B", 51; "C", 49; "Ć", 1; "D", 60; "E", 25; "F", 30; "G", 38; "H", 23; "I", 20; "J", 17; "K", 96; "L", 27; "Ł", 7; "M", 60; "N", 69; "O", 73; "Ó", 1; "P", 199; "R", 62; "S", 111; "Ś", 10; "T", 46; "U", 31; "V", 2; "W", 87; "X", 1; "Y", 1; "Z", 78; "Ż", 6; ] let list_address_genenerator l = Xlist.iter l (fun (letter,count) -> Int.iter 1 count (fun i -> let s = if i = 1 then "" else ";" ^ string_of_int i in Printf.printf "http://sjp.pwn.pl/sjp/lista/%s%s.html\n" letter s)) (* generowanie adresy_lista.tab *) (* let _ = list_address_genenerator strony_listy; () *) let match_string = " <li><img src=\"/theme/Dictionary/img/dictionary-api-source/s.svg?" let match_string2 = " <li><img src=\"/theme/Dictionary/img/dictionary-api-source/s.svg?1444379457\" class=\"alfa-image svg-image\" title=\"Słownik języka polskiego\" onerror=\"this.onerror=null; this.src='http://sjp.pwn.pl/theme/Dictionary/img/dictionary-api-source/s.png?1417608895'\" alt=\"\" /> <a href=\"" let match_string3 = " <li><img src=\"/theme/Dictionary/img/dictionary-api-source/s.svg?1444379457\" class=\"alfa-image\" title=\"Słownik języka polskiego\" alt=\"\" /> <a href=\"" let rec check_cut_prefix pats s msg = match pats with [] -> failwith (msg ^ ": " ^ s) | pat :: pats -> if Xstring.check_prefix pat s then Xstring.cut_prefix pat s else check_cut_prefix pats s msg let rec check_cut_sufix pats s msg = match pats with [] -> failwith (msg ^ ": " ^ s) | pat :: pats -> if Xstring.check_sufix pat s then Xstring.cut_sufix pat s else check_cut_sufix pats s msg let check_cut_infix pat s msg = match Xstring.split_delim pat s with [a;b] -> a,b | _ -> failwith (msg ^ ": " ^ s) let full_split pat s = Xlist.map (Str.full_split (Str.regexp pat) s) (function Str.Text s -> s | Str.Delim s -> s) let extract_line s = let s = check_cut_prefix [match_string2; match_string3] s "extract_line 1: " in (* if Xstring.check_prefix match_string2 s then Xstring.cut_prefix match_string2 s else if Xstring.check_prefix match_string3 s then Xstring.cut_prefix match_string3 s else failwith ("extract_line 1: " ^ s) in *) let s = check_cut_sufix ["</a></li>"] s "extract_line 2" in (* if Xstring.check_sufix "</a></li>" s then Xstring.cut_sufix "</a></li>" s else failwith ("extract_line 2: " ^ s) in *) check_cut_infix "\">" s "extract_line 3" (* match Xstring.split_delim "\">" s with [url;name] -> url,name | _ -> failwith ("extract_line 3: " ^ s) *) let list_address_extractor dir = let l = List.sort compare (Array.to_list (Sys.readdir dir)) in Xlist.map l (fun name -> name, List.rev (Xlist.fold (File.load_lines (dir ^ name)) [] (fun l line -> if Xstring.check_prefix match_string line then (extract_line line) :: l else l))) (* generowanie adresy_hasla.tab *) (* let _ = let l = list_address_extractor "../../../NLP resources/SJP/sjp.pwn.pl/sjp/lista/" in (* Xlist.iter l (fun (n,l) -> Printf.printf "%s %d\n" n (Xlist.size l)); *) Xlist.iter l (fun (_,l) -> Xlist.iter l (fun (url,name) -> Printf.printf "%s\t%s\n" name url)); () *) let split_haslo s = match Xstring.split_delim "\xC2\xA0 " s with [lemma] -> lemma,"" | [lemma;"I"] -> lemma,"I" | [lemma;"II"] -> lemma,"II" | [lemma;"III"] -> lemma,"III" | [lemma;"IV"] -> lemma,"IV" | [lemma;"V"] -> lemma,"V" | [lemma;"VI"] -> lemma,"VI" | _ -> failwith ("split_haslo: " ^ s) let split_title s = match Xstring.split_delim "\xC2\xA0" s with [lemma] -> lemma,"" | [lemma;"I"] -> lemma,"I" | [lemma;"II"] -> lemma,"II" | [lemma;"III"] -> lemma,"III" | [lemma;"IV"] -> lemma,"IV" | [lemma;"V"] -> lemma,"V" | [lemma;"VI"] -> lemma,"VI" | _ -> failwith ("split_title: '" ^ s ^ "'") let split_myslnik s = match Xstring.split_delim "-" s with [lemma] -> lemma | _ -> print_endline s; s let split_spacja s = match Xstring.split_delim " " s with [lemma] -> lemma | _ -> print_endline s; s let split_comma s = match Xstring.split_delim ", " s with [lemma] -> s | _ -> (*print_endline s;*) "###" (* hasła o złożonych nazwach np. "szkatułkowa forma, kompozycja utworu, dzieła itp." *) let standard_chars = StringSet.of_list [ "a";"ą";"b";"c";"ć";"d";"e";"ę";"f";"g";"h";"i";"j";"k";"l";"ł";"m";"n";"ń";"o";"ó";"p";"r"; "s";"ś";"t";"u";"w";"y";"z";"ź";"ż";(*"-";",";" ";*)"v";"é";"à";"x";"ü";"q";"ê";"è";"ö";"ü";"ñ"; "A";"Ą";"B";"C";"Ć";"D";"E";"Ę";"F";"G";"H";"I";"J";"K";"L";"Ł";"M";"N";"Ń";"O";"Ó";"P";"R"; "S";"Ś";"T";"U";"W";"Y";"Z";"Ź";"Ż";"X";"V";"";"";"";"";"";"";"";"";"";"";"";"";"";"";] let check_nonstandart_chars lemma = Xlist.fold (Xunicode.utf8_chars_of_utf8_string lemma) false (fun b c -> if StringSet.mem standard_chars c then b else true) (* UWAGA: w słowniku występują hasła złożone zawierające ", ", wyrażenia wielosłownie, dopowiedzenia w nawiasach, wyrazy obce, skróty, prefiksy, sufiksy, wyrażenia wielotokenowe *) let make_lemma_map l = Xlist.fold l StringMap.empty (fun map (name,url) -> let lemma,variant = split_haslo name in (* let lemma = split_comma lemma in *) (* let _ = split_spacja lemma in *) (* let _ = split_myslnik lemma in *) if lemma = "###" then map else ( (* if check_nonstandart_chars lemma then print_endline name; *) StringMap.add_inc map lemma [variant,url] (fun l -> (variant,url) :: l))) let load_lu_list filename = File.fold_tab filename StringMap.empty (fun map -> function [name;cat] -> StringMap.add_inc map name (StringSet.singleton cat) (fun set -> StringSet.add set cat) | l -> failwith ("load_lu_list: " ^ String.concat "\t" l)) let find_url_for_lu lemma_map lu_map = StringMap.iter lu_map (fun lu _ -> try Xlist.iter (StringMap.find lemma_map lu) (fun (_,url) -> print_endline url) with Not_found -> print_endline (lu ^ " NOT FOUND")) (* generowanie wybrane_adresy_hasla.tab *) (* let _ = let l = File.load_tab "data/adresy_hasla.tab" (function [name;url] -> name,url | _ -> failwith "load adresy_hasla") in let lemma_map = make_lemma_map l in let lu_map = load_lu_list "../base_LU.tab" in find_url_for_lu lemma_map lu_map; () *) let head_string1 = " <div class=\"base-entry-head\"><h1>" let head_string2 = "</h1></div>" let rec select_haslo2 name = function [] -> failwith ("select_haslo2: " ^ name) | "</article>" :: _ -> [] | line :: l -> line :: (select_haslo2 name l) let rec select_haslo name = function [] -> failwith ("select_haslo: " ^ name) | line :: l -> if Xstring.check_prefix head_string1 line then select_haslo2 name (line :: l) else select_haslo name l let haslo_extractor dir = let l = Array.to_list (Sys.readdir dir) in Xlist.fold l [] (fun l name -> if name = "lista" then l else (name, select_haslo name (File.load_lines (dir ^ name))) :: l) let rec select_lines = function [] -> [] | " <div class=\"base-entry-line\"> " :: l -> select_lines l | " <div class=\"base-entry-hint\">Słownik języka polskiego</div>" :: l -> select_lines l | " </div>" :: l -> select_lines l | " <div class=\"base-entry-body\">" :: l -> select_lines l | " <article>" :: l -> select_lines l | " <div class=\"ribbon\">" :: l -> select_lines l | " <div class=\"ribbon-section\">" :: l -> select_lines l | " <div class=\"ribbon-element type-187126\">" :: l -> select_lines l | " " :: l -> select_lines l | "</div>" :: l -> select_lines l | " </div>" :: l -> select_lines l | " </div>" :: l -> select_lines l | line :: l -> line :: (select_lines l) type token = Head of string | Tytul of string | Variant of string | SenseText of string | Href of string * string | Zob of string * string | Der of string | Example of string | Kwal of string * string | Skrot of string | Wym of string * string * string * string * string | Sense of (string * string * string list) list (* variant * gloss * examples *) | Zob2 of string * string * string * string list | Word of string let string_of_token = function Head s -> Printf.sprintf "Head(%s)" s | Tytul s -> Printf.sprintf "Tytul(%s)" s | Variant s -> Printf.sprintf "Variant(%s)" s | SenseText s -> Printf.sprintf "SenseText(%s)" s | Href(s,t) -> Printf.sprintf "Href(%s,%s)" s t | Zob(s,t) -> Printf.sprintf "Zob(%s,%s)" s t | Der s -> Printf.sprintf "Der(%s)" s | Example s -> Printf.sprintf "Example(%s)" s | Kwal(s,t) -> Printf.sprintf "Kwal(%s,%s)" s t | Skrot s -> Printf.sprintf "Skrot(%s)" s | Wym(s,t,a,b,c) -> Printf.sprintf "Wym(%s,%s,%s,%s,%s)" s t a b c | Sense l -> Printf.sprintf "Sense" | Zob2(s,t,r,l) -> Printf.sprintf "Zob(%s,%s,%s,[%s])" s t r (String.concat ";" l) | Word s -> Printf.sprintf "Word(%s)" s let mark_words = function Example "a" -> Word "a" | Example "a, b, c..." -> Word "a, b, c..." | Example "a zatem" -> Word "a zatem" | Example "bez względu na to skąd" -> Word "bez względu na to skąd" | Example "buszel" -> Word "buszel" | Example "bym, byś, byśmy, byście" -> Word "bym, byś, byśmy, byście" | Example "c" -> Word "c" | Example "ch" -> Word "ch" | Example "chyba że" -> Word "chyba że" | Example "chyba żeby" -> Word "chyba żeby" | Example "e" -> Word "e" | Example "gdzie" -> Word "gdzie" | Example "gdzie indziej" -> Word "gdzie indziej" | Example "hektometr" -> Word "hektometr" | Example "hektar" -> Word "hektar" | Example "i" -> Word "i" | Example "igrek" -> Word "igrek" | Example "ile" -> Word "ile" | Example "ile razy" -> Word "ile razy" | Example "im" -> Word "im" | Example "ipsylon" -> Word "ipsylon" | Example "j" -> Word "j" | Example "jak i" -> Word "jak i" | Example "jak również" -> Word "jak również" | Example "jak też" -> Word "jak też" | Example "kędy" -> Word "kędy" | Example "kh" -> Word "kh" | Example "kiedy" -> Word "kiedy" | Example "kiedy indziej" -> Word "kiedy indziej" | Example "ks" -> Word "ks" | Example "kto" -> Word "kto" | Example "który" -> Word "który" | Example "m" -> Word "m" | Example "M" -> Word "M" | Example "najpierw" -> Word "najpierw" | Example "nie" -> Word "nie" | Example "o" -> Word "o" | Example "o ile" -> Word "o ile" | Example "o tyle" -> Word "o tyle" | Example "skąd" -> Word "skąd" | Example "tak" -> Word "tak" | Example "ten" -> Word "ten" | Example "to" -> Word "to" | Example "tu" -> Word "tu" | Example "ty" -> Word "ty" | Example "tyle" -> Word "tyle" | Example "tym" -> Word "tym" | Example "u" -> Word "u" | Example "w" -> Word "w" | Example "x" -> Word "x" | Example "X" -> Word "X" | Example "y" -> Word "y" | Example "z" -> Word "z" | Example "zarówno..., jak" -> Word "zarówno..., jak" | Example "z jakiegokolwiek miejsca" -> Word "z jakiegokolwiek miejsca" | Example "μ" -> Word "μ" | Example "π" -> Word "π" | Example "Π, π" -> Word "Π, π" | Example "Φ, ϕ" -> Word "Φ, ϕ" | Example "praktycznie biorąc" -> Word "praktycznie biorąc" | Example "praktycznie rzecz biorąc" -> Word "praktycznie rzecz biorąc" | Example "co prawda..., ale (jednak, lecz)..." -> Word "co prawda..., ale (jednak, lecz)..." | Example "wprawdzie..., ale (jednak, lecz)..." -> Word "wprawdzie..., ale (jednak, lecz)..." | Example "zarówno ..., jak (i, też) ..." -> Word "zarówno ..., jak (i, też) ..." | Example "tylekroć, tyle razy, zawsze" -> Word "tylekroć, tyle razy, zawsze" | Example s -> (*if Xstring.size s < 30 then print_endline s;*) Example s | t -> t let rec merge_sense_text rev = function SenseText s :: SenseText t :: l -> merge_sense_text rev (SenseText(s^t) :: l) | SenseText s :: Skrot t :: l -> merge_sense_text rev (SenseText(s^t) :: l) | SenseText s :: Word t :: l -> merge_sense_text rev (SenseText(s^"„"^t^"”") :: l) | SenseText s :: l -> merge_sense_text (SenseText s :: rev) l | Example s :: l -> merge_sense_text (Example s :: rev) l | [] -> List.rev rev | l -> failwith ("merge_sense_text: " ^ String.concat " " (Xlist.map l string_of_token)) let rec clean_sense_text rev = function SenseText "." :: l -> clean_sense_text rev l | SenseText " " :: l -> clean_sense_text rev l | SenseText ", " :: l -> clean_sense_text rev l | SenseText "; " :: l -> clean_sense_text rev l | SenseText s :: l -> let s = if Xstring.check_prefix ", " s then Xstring.cut_prefix ", " s else s in let s = if Xstring.check_sufix ", np. " s then Xstring.cut_sufix ", np. " s else s in let s = if Xstring.check_sufix ", np." s then Xstring.cut_sufix ", np." s else s in let s = if Xstring.check_sufix ". np. " s then Xstring.cut_sufix ". np. " s else s in let s = if Xstring.check_sufix ", np.: " s then Xstring.cut_sufix ", np.: " s else s in let s = if Xstring.check_sufix " np." s then Xstring.cut_sufix " np." s else s in clean_sense_text ((SenseText s) :: rev) l | t :: l -> clean_sense_text (t :: rev) l | [] -> List.rev rev let rec split_sense_text rev = function SenseText s :: Example t1 :: Example t2 :: Example t3 :: l -> split_sense_text (("",s,[t1;t2;t3]) :: rev) l | SenseText s :: Example t1 :: Example t2 :: l -> split_sense_text (("",s,[t1;t2]) :: rev) l | SenseText s :: Example t1 :: l -> split_sense_text (("",s,[t1]) :: rev) l | SenseText s :: l -> split_sense_text (("",s,[]) :: rev) l | [] -> List.rev rev | l -> failwith ("split_sense_text: " ^ String.concat " " (Xlist.map l string_of_token)) let process_sense l = let l = Xlist.map l mark_words in let l = merge_sense_text [] l in let l = clean_sense_text [] l in let l = split_sense_text [] l in Sense l let rec parse_line = function [" "; "<"; "div class=\"base-entry-head\""; ">"; "<"; "h1"; ">"; head; "<"; "/h1"; ">"; "<"; "/div"; ">"] -> [Head head] | "<" :: "div class=\"znacz\"" :: ">" :: line -> parse_line line | "<" :: "/div" :: ">" :: line -> parse_line line | "<" :: "br" :: ">" :: line -> parse_line line | ", " :: line -> parse_line line | "; " :: line -> parse_line line | ". " :: line -> parse_line line | "." :: line -> parse_line line | "• " :: line -> parse_line line | " • " :: line -> parse_line line | "<" :: "span class=\"tytul\"" :: ">" :: tytul :: "<" :: "/span" :: ">" :: line -> Tytul tytul :: (parse_line line) | " " :: line -> parse_line line | "<" :: "b" :: ">" :: variant :: "<" :: "/b" :: ">" :: "<" :: "b" :: ">" :: name :: "<" :: "/b" :: ">" :: line -> Variant variant :: Href("",name) :: (parse_line line) | "<" :: "b" :: ">" :: variant :: "<" :: "/b" :: ">" :: line -> Variant variant :: (parse_line line) | "«" :: line -> parse_sense [] line | "<" :: "i" :: ">" :: "zob." :: "<" :: "/i" :: ">" :: " " :: "<" :: href :: ">" :: name :: "<" :: "/a" :: ">" :: " w zn. 7, 8. " :: line -> Zob(href,name ^ " w zn. 7, 8. ") :: (parse_line line) | "<" :: "i" :: ">" :: "zob." :: "<" :: "/i" :: ">" :: " " :: "<" :: href :: ">" :: name :: "<" :: "/a" :: ">" :: line -> Zob(href,name) :: (parse_line line) | "<" :: "i" :: ">" :: "zob." :: "<" :: "/i" :: ">" :: name :: "<" :: "/div" :: ">" :: line -> Zob("",name) :: (parse_line line) | "<" :: "i" :: ">" :: "zob." :: "<" :: "/i" :: ">" :: " " :: "<" :: href :: ">" :: name :: "<" :: "span class=\"skrot-inny\"" :: ">" :: name2 :: "<" :: "/span" :: ">" :: name3 :: "<" :: "/a" :: ">" :: line -> Zob(href,name ^ name2 ^ name3) :: (parse_line line) | "<" :: "i" :: ">" :: "zob." :: "<" :: "/i" :: ">" :: " " :: "<" :: href :: ">" :: name :: "<" :: "span class=\"skrot-inny\"" :: ">" :: "<" :: "abbr title=\"znaczenie, znaczy\"" :: ">" :: name2 :: "<" :: "/abbr" :: ">" :: "<" :: "/span" :: ">" :: name3 :: "<" :: "/a" :: ">" :: line -> Zob(href,name ^ name2 ^ name3) :: (parse_line line) | "stopień najwyższy od wysoko." :: line -> process_sense [SenseText "stopień najwyższy od ";Word "wysoko"] :: (parse_line line) | "stopień wyższy od " :: "<" :: href :: ">" :: "dużo" :: "<" :: "/a" :: ">" :: " lub od " :: "<" :: href2 :: ">" :: "wiele" :: "<" :: "/a" :: ">" :: ", " :: "<" :: "span class=\"skrot-inny\"" :: ">" :: "np." :: "<" :: "/span" :: ">" :: " " :: "<" :: "i" :: ">" :: example :: "<" :: "/i" :: ">" :: line -> process_sense [SenseText "stopień wyższy od ";Word "dużo";SenseText " lub od ";Word "wiele";SenseText " np.";Example example] :: (parse_line line) | " stopień najwyższy od " :: "<" :: "i" :: ">" :: "mało" :: "<" :: "/i" :: ">" :: line -> process_sense [SenseText "stopień najwyższy od ";Word "mało"] :: (parse_line line) | " stopień wyższy od " :: "<" :: href :: ">" :: "mało" :: "<" :: "/a" :: ">" :: line -> process_sense [SenseText "stopień wyższy od ";Word "mało"] :: (parse_line line) | "<" :: "span class=\"tytul-der\"" :: ">" :: der :: "<" :: "/span" :: ">" :: line -> Der der :: (parse_line line) | "<" :: "span class=\"kwal\"" :: ">" :: "<" :: title :: ">" :: text :: "<" :: "/abbr" :: ">" :: "<" :: "/span" :: ">" :: line -> Kwal(title,text) :: (parse_line line) | "<" :: "span class=\"kwal\"" :: ">" :: text :: "<" :: "/span" :: ">" :: line -> Kwal("",text) :: (parse_line line) | "[" :: "<" :: "i" :: ">" :: "wym." :: "<" :: "/i" :: ">" :: wym :: "]" :: line -> Wym(wym,"","","","") :: (parse_line line) | "[" :: "<" :: "i" :: ">" :: "wym." :: "<" :: "/i" :: ">" :: wym :: "<" :: "span class=\"akcent\"" :: ">" :: akcent :: "<" :: "/span" :: ">" :: "]" :: line -> Wym(wym,akcent,"","","") :: (parse_line line) | "[" :: "<" :: "i" :: ">" :: "wym." :: "<" :: "/i" :: ">" :: wym :: "<" :: "span class=\"akcent\"" :: ">" :: akcent :: "<" :: "/span" :: ">" :: wym2 :: "]" :: line -> Wym(wym,akcent,wym2,"","") :: (parse_line line) | "[" :: "<" :: "i" :: ">" :: "wym." :: "<" :: "/i" :: ">" :: wym :: "<" :: "span class=\"akcent\"" :: ">" :: akcent :: "<" :: "/span" :: ">" :: wym2 :: "<" :: "span class=\"akcent\"" :: ">" :: akcent2 :: "<" :: "/span" :: ">" :: wym3 :: "]" :: line -> Wym(wym,akcent,wym2,akcent2,wym3) :: (parse_line line) | [] -> [] | l -> Printf.printf "\"%s\"\n" (String.concat "\" :: \"" l); [] and parse_sense rev = function "<" :: "i" :: ">" :: example :: "<" :: "/i" :: ">" :: line -> parse_sense (Example example :: rev) line | "<" :: "span class=\"skrot-inny\"" :: ">" :: abbr :: "<" :: "/span" :: ">" :: line -> parse_sense (Skrot abbr :: rev) line | "<" :: "span class=\"skrot-inny\"" :: ">" :: "<" :: "i" :: ">" :: s :: "<" :: "/i" :: ">" :: "<" :: "/span" :: ">" :: line -> parse_sense (SenseText("„"^s^"”") :: rev) line | "<" :: "span class=\"gram\"" :: ">" :: s :: "<" :: "/span" :: ">" :: line -> parse_sense (SenseText s :: rev) line | "<" :: "sup" :: ">" :: s :: "<" :: "/sup" :: ">" :: line -> parse_sense (SenseText("<sup>"^s^"</sup>") :: rev) line | "<" :: "sub" :: ">" :: s :: "<" :: "/sub" :: ">" :: line -> parse_sense (SenseText("<sub>"^s^"</sub>") :: rev) line | "»" :: line -> (process_sense (List.rev rev)) :: (parse_line line) | "<" :: l -> Printf.printf "XXX \"%s\"\n" (String.concat "\" :: \"" ("<" :: l)); [] | "«" :: _ -> failwith "parse_sense" | ">" :: _ -> failwith "parse_sense" | []-> failwith "parse_sense" | s :: line -> parse_sense (SenseText s :: rev) line let get_head = function Head h :: l -> h, l | l -> failwith ("get_head: " ^ String.concat " :: " (Xlist.map l string_of_token)) let rec get_titles titles wym = function Tytul t :: l -> get_titles (t :: titles) wym l | Wym(s,t,a,b,c) :: l -> get_titles titles ((s,t,a,b,c) :: wym) l | l -> List.rev titles, List.rev wym, l let rec get_ders = function Der s :: l -> s :: get_ders l | []-> [] | l -> failwith "get_ders" let rec split_variants found rev = function Variant s :: l -> let found = if rev = [] then found else (List.rev rev :: found) in split_variants found [Variant s] l | Der s :: l -> let found = if rev = [] then found else (List.rev rev :: found) in List.rev found, get_ders (Der s :: l) | t :: l -> split_variants found (t :: rev) l | [] -> let found = if rev = [] then found else (List.rev rev :: found) in List.rev found, [] let variant_names = StringSet.of_list ["1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "10"; "11"; "12"; "13"; ""; ] let kwal_names_a = StringSet.of_list [""] let kwal_names_b = StringSet.of_list ["daw."; "pot."; ""; ""; ""; ""; ""; ""; ] let process_zob s name = let id,head,head_variant = if s = "" then "","","" else let s = check_cut_prefix ["a href=\"http://sjp.pwn.pl/sjp/"] s "process_zob 1" in let s = check_cut_sufix ["\" class=\"anchor\""] s "process_zob 2" in let id,s = check_cut_infix ".html\" title=\"" s "process_zob 3" in let head,pwn_id = check_cut_infix "\" pwn-id=\"" s "process_zob 4" in let _,pwn_id2 = check_cut_infix ";" id "process_zob 5" in if pwn_id <> pwn_id2 then failwith "process_zob 6" else let head,head_variant = split_haslo head in id,head,head_variant in (* Printf.printf "%s %s %s\n" id head head_variant; *) let l = Str.split (Str.regexp " \\|\\.\\|,") name in let head_variant2,l = match l with "I" :: l -> "I",l | "II" :: l -> "II",l | "III" :: l -> "III",l | l -> "",l in let head2,l = match l with [s] -> s,[] | [s;t] -> s ^ " " ^ t,[] | s :: "w" :: "zn" :: l -> s,l | l -> "",l in let variants = List.flatten (Xlist.map l (function "" -> [] | "1" -> ["1"] | "2" -> ["2"] | "3" -> ["3"] | "4" -> ["4"] | "7" -> ["7"] | "8" -> ["8"] | "1–4" -> ["1";"2";"3";"4"] | s -> failwith ("process_zob 7: " ^ s))) in if head <> "" && head <> "związek koordynacyjny, kompleksowy" && head <> head2 then failwith ("process_zob 8: " ^ head ^ " " ^ head2) else if head_variant <> head_variant2 then failwith "process_zob 9" else Zob2(id,head2,head_variant2,variants) let add_word s = function (v,gloss,ex) :: l -> (v,"„" ^ s ^ "” " ^ gloss, ex) :: l | [] -> failwith "add_word" let pos_list = [ "liczebnik", "num"; "partykuła", "qub"; "przyimek", "prep"; "przysłówek", "adv"; "spójnik", "conj"; "wykrzyknik", "interj"; "zaimek", "pron"; ] let find_pos senses = let set = Xlist.fold senses StringSet.empty (fun set (_,_,s) -> match s with Zob2 _ -> set | Sense((_,gloss,_) :: _) -> Xlist.fold pos_list set (fun set (pat,pos) -> if Xstring.check_prefix pat gloss then StringSet.add set pos else set) | _ -> failwith "find_pos 1") in match StringSet.to_list set with [] -> "" | [pos] -> pos | l -> String.concat "_" l let rec add_subsense_variants_rec i = function [] -> [] | (v,gloss,ex) :: l -> let v = String.make 1 (Char.chr (Char.code 'a' + i)) in (v,gloss,ex) :: (add_subsense_variants_rec (i+1) l) let add_subsense_variants = function Sense[v,gloss,ex] -> Sense[v,gloss,ex] | Sense l -> Sense(add_subsense_variants_rec 0 l) | t -> t let rec process_haslo name l = let l = List.flatten (Xlist.map (select_lines l) (fun s -> parse_line (full_split "<\\|>\\|«\\|»\\|\\]\\|\\[" s))) in let head,l = get_head l in let titles,wym,l = get_titles [] [] l in let ll,der = split_variants [] [] l in let senses = Xlist.map ll (function | [Sense s] -> "","","",Sense s | [Zob(s,t)] -> "","","",process_zob s t | [Kwal(a,b); Sense s] -> "",a,b,Sense s | [Kwal(a,b); Zob(s,t)] -> "",a,b,process_zob s t | [Variant v; Sense s] -> v,"","",Sense s | [Variant v; Zob(s,t)] -> v,"","",process_zob s t | [Variant v; Href(s,t); Sense st] -> v,"","",Sense(add_word t st) | [Variant v; Kwal(a,b); Sense s] -> v,a,b,Sense s | [Variant v; Href(s,t); Kwal(a,b); Sense st] -> v,a,b,Sense(add_word t st) | [Variant v; Kwal(a,b); Zob(s,t)] -> v,a,b,process_zob s t | l -> failwith (String.concat " :: " (Xlist.map l string_of_token))) in let id = check_cut_sufix [".html"] name "process_haslo 1" in (* if Xstring.check_sufix ".html" name then Xstring.cut_sufix ".html" name else failwith "process_haslo" in *) let head,head_variant = split_haslo head in (* if head_variant <> "" then print_endline head_variant; *) (* if check_nonstandart_chars head then print_endline head; *) let titles = Xlist.map titles split_title in (* Xlist.iter titles (fun (title,title_variant) -> if check_nonstandart_chars title then print_endline title); *) (* Xlist.iter der print_endline; *) let senses = Xlist.map senses (fun (v,a,qual,s) -> let v = if v = "" then "" else check_cut_sufix [". "] v "process_haslo 2" in (* if Xstring.check_sufix ". " v then Xstring.cut_sufix ". " v else failwith "process_haslo" in *) let s = add_subsense_variants s in v,(*a,*)qual,s) in (* Xlist.iter senses (fun (v,(*a,*)b,s) -> print_endline (string_of_token s); (* if not (StringSet.mem variant_names v) then print_endline v; *) (* if a = "" && b <> "" then print_endline b; *) (*if not (StringSet.mem kwal_names_a a) then print_endline a; if not (StringSet.mem kwal_names_b b) then print_endline b*)); *) let pos = find_pos senses in id,head,head_variant,pos,titles,wym,senses,der let xml_of_sense = function Zob2(id,head,head_variant,variants) -> [Xml.Element("see",["id",id;"head",head] @ (if head_variant = "" then [] else ["head_variant",head_variant]), Xlist.map variants (fun v -> Xml.Element("sense",["variant",v],[])))] | Sense l -> Xlist.map l (fun (v,gloss,exs) -> Xml.Element("subsense",(if v = "" then [] else ["variant",v]),Xml.PCData gloss :: Xlist.map exs (fun ex -> Xml.Element("example",[],[Xml.PCData ex])))) | _ -> failwith "xml_of_sense" let xml_of_hasla l = Xml.Element("sjp",[],Xlist.rev_map l (fun (id,head,head_variant,pos,titles,wym,senses,der) -> Xml.Element("entry",["id",id;"head",head] @ (if head_variant = "" then [] else ["head_variant",head_variant]) @ (if pos = "" then [] else ["pos",pos]), Xlist.map titles (fun (title,title_variant) -> Xml.Element("title",(if title_variant = "" then [] else ["title_variant",title_variant]),[Xml.PCData title])) @ Xlist.map wym (fun (a,b,c,d,e) -> Xml.Element("pron",[],[Xml.PCData a] @ (if b = "" then [] else [Xml.Element("akcent",[],[Xml.PCData b])]) @ (if c = "" then [] else [Xml.PCData c]) @ (if d = "" then [] else [Xml.Element("akcent",[],[Xml.PCData d])]) @ (if e = "" then [] else [Xml.PCData e]))) @ Xlist.map senses (fun (v,qual,s) -> Xml.Element("sense", (if v = "" then [] else ["variant",v]) @ (if qual = "" then [] else ["qualifier",qual]), xml_of_sense s)) @ Xlist.map der (fun d -> Xml.Element("der",[],[Xml.PCData d]))))) let _ = let hasla = haslo_extractor "../../../NLP resources/SJP/sjp.pwn.pl/sjp/" in let hasla = Xlist.rev_map hasla (fun (name,l) -> (* print_endline name; *) process_haslo name l) in print_endline (Xml.to_string_fmt (xml_of_hasla hasla)); ()