Commit 91fb35c632c018a228d91c3518eac0a818d6a78b
1 parent
02015f19
Skrypt do importu Słowosieci
Showing
3 changed files
with
115 additions
and
0 deletions
meanings/management/__init__.py
0 → 100644
meanings/management/commands/__init__.py
0 → 100644
meanings/management/commands/import_plWordnet.py
0 → 100644
1 | +#! /usr/bin/python | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +from django.core.management.base import BaseCommand | |
5 | + | |
6 | +import sys, os, codecs | |
7 | +from xml.sax import saxutils, handler, make_parser | |
8 | +from meanings.models import Synset, LexicalUnit | |
9 | +from shellvalier.settings import BASE_DIR | |
10 | + | |
# Number of model instances handed to a single bulk_create() call by
# import_plWordnet().
BULK = 250
12 | + | |
class PlWNHandler(handler.ContentHandler):
    """SAX handler that collects plWordnet synsets and lexical units.

    ``lexical-unit`` elements are remembered by id (units whose ``pos``
    contains ``'pwn'`` are skipped).  Every ``synset`` element yields an
    unsaved ``Synset``; every ``unit-id`` found inside it that refers to a
    remembered unit yields an unsaved ``LexicalUnit`` attached to that
    synset.  Nothing is written to the database here: the objects are only
    accumulated in ``_synsets_to_base`` and ``_lexical_units_to_base``.
    """

    def __init__(self, out=sys.stdout):
        """Create an empty handler; *out* is stored but not written to here."""
        # ContentHandler is called explicitly (no super()) to stay compatible
        # with the old-style SAX classes of Python 2.
        handler.ContentHandler.__init__(self)
        self._out = out
        # luid -> (base, sense, pos, desc) for every accepted lexical unit.
        self._lexical_units = {}
        # The containers below are initialised but not touched by the
        # methods of this class.
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        # synset id -> unsaved Synset instance.
        self._synsets = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        # Id of the synset currently being parsed, -1 when outside a synset.
        self._defined_synset = -1
        # True while the parser is inside a <unit-id> element.
        self._unit = False
        # Text collected for the current <unit-id> element.
        self._content = ''
        self._other_synset_relations = set()
        self._lexical_relations = set()
        # Unsaved model instances, in document order, ready for bulk insert.
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        self._hypernymy_to_base = []
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record lexical units and synsets as their opening tags appear."""
        if name == 'lexical-unit':
            pos = attrs['pos']
            if 'pwn' in pos:
                return
            luid = int(attrs['id'])
            self._lexical_units[luid] = (
                attrs['name'],
                int(attrs['variant']),
                pos,
                # A unit without a description is stored with an empty one
                # instead of aborting the whole import with a KeyError.
                attrs.get('desc', ''),
            )
        elif name == 'synset':
            sid = int(attrs['id'])
            self._defined_synset = sid
            synset = Synset(id=sid)
            self._synsets_to_base.append(synset)
            self._synsets[sid] = synset
        elif name == 'unit-id':
            self._unit = True

    def endElement(self, name):
        """Close the current synset or turn a finished unit-id into a unit."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            text = self._content.strip()
            # No text is collected for an empty <unit-id> or for one outside
            # a <synset> (see characters()); skip it rather than fail on
            # int('').
            if text:
                luid = int(text)
                if luid in self._lexical_units:
                    synset = self._synsets[self._defined_synset]
                    base, sense, pos, desc = self._lexical_units[luid]
                    self._lexical_units_to_base.append(
                        LexicalUnit(luid=luid, base=base, sense=sense,
                                    pos=pos, synset=synset, glossa='',
                                    definition=desc)
                    )
            self._unit = False
            self._content = ''

    def characters(self, content):
        """Accumulate the text of a unit-id that sits inside a synset."""
        # The parser may deliver one text node in several chunks, hence the
        # concatenation; whitespace-only chunks are ignored.
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to finalise; results are read from the handler's lists."""
        pass
72 | + | |
73 | +#==========================================================# | |
class Command(BaseCommand):
    """Management command that runs the plWordnet import."""

    args = 'none'
    help = 'Import plWordnet synsets and lexical units from the XML dump into the database.'

    def handle(self, *args, **options):
        """Run the import; positional and keyword options are ignored."""
        # *args matches the handle(*args, **options) signature Django
        # documents, so forwarded positional arguments cannot raise TypeError.
        import_plWordnet()
80 | + | |
def _store_in_batches(manager, objects, batch_size):
    """Bulk-insert *objects* through *manager*, *batch_size* at a time.

    After every batch the running number of stored objects is printed.
    """
    total = len(objects)
    for start in range(0, total, batch_size):
        batch = objects[start:start + batch_size]
        manager.bulk_create(batch)
        # Report what was really stored, so the final (possibly short) batch
        # does not overshoot the total.
        print(str(start + len(batch)) + "...")


def import_plWordnet():
    """Parse the plWordnet XML dump and store its contents in the database.

    The file ``data/plwordnet/plwordnet_2_1.xml`` under ``BASE_DIR`` is parsed
    with ``PlWNHandler``; the collected synsets are inserted first, then the
    lexical units that reference them, both in batches of ``BULK``.
    """
    xml_file = os.path.join(BASE_DIR, 'data', 'plwordnet', 'plwordnet_2_1.xml')
    # os.path.join drops the directory of this module whenever xml_file is
    # already absolute, so this only matters for a relative BASE_DIR.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    content_handler = PlWNHandler()
    parser = make_parser()
    parser.setContentHandler(content_handler)
    print("Parsing Wordnet...")
    parser.parse(xml_path)
    print("...DONE")

    print("")

    print("Storing synsets...")
    _store_in_batches(Synset.objects, content_handler._synsets_to_base, BULK)
    print("...DONE")

    print("")

    print("Storing lexical units...")
    _store_in_batches(LexicalUnit.objects,
                      content_handler._lexical_units_to_base, BULK)
    print("...DONE")
... | ... |