import_plWordnet.py
3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#! /usr/bin/python
# -*- coding: utf-8 -*-
from django.core.management.base import BaseCommand
import sys, os, codecs
from xml.sax import saxutils, handler, make_parser
from meanings.models import Synset, LexicalUnit
from shellvalier.settings import BASE_DIR
# Number of model instances passed to each bulk_create() call when the
# parsed synsets and lexical units are written to the database.
BULK = 250
class PlWNHandler(handler.ContentHandler):
    """SAX handler that turns plWordnet XML into Synset/LexicalUnit objects.

    ``lexical-unit`` elements are remembered by id; every ``synset`` element
    yields a ``Synset`` instance, and each ``unit-id`` nested in it yields a
    ``LexicalUnit`` bound to that synset.  Nothing is written to the database
    here: instances are only queued in ``_synsets_to_base`` and
    ``_lexical_units_to_base`` for the caller to store.
    """

    def __init__(self, out=sys.stdout):
        handler.ContentHandler.__init__(self)
        self._out = out

        # id -> (base, sense, pos, desc) for every non-"pwn" lexical unit.
        self._lexical_units = {}
        # id -> Synset instance for every synset seen so far.
        self._synsets = {}

        # Parser state: id of the synset currently open (-1 when none),
        # whether we are inside a <unit-id>, and the text gathered there.
        self._defined_synset = -1
        self._unit = False
        self._content = ''

        # Instances waiting to be stored by the caller.
        self._synsets_to_base = []
        self._lexical_units_to_base = []

        # The containers below are initialised but never filled by the
        # callbacks defined in this class.
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        self._other_synset_relations = set()
        self._lexical_relations = set()
        self._hypernymy_to_base = []
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record lexical units, open synsets and flag ``unit-id`` content."""
        if name == 'unit-id':
            self._unit = True
            return

        if name == 'synset':
            synset_id = int(attrs['id'])
            self._defined_synset = synset_id
            synset = Synset(id=synset_id)
            self._synsets_to_base.append(synset)
            self._synsets[synset_id] = synset
            return

        if name != 'lexical-unit':
            return
        # Units whose part-of-speech label contains "pwn" are skipped.
        if 'pwn' in attrs['pos']:
            return

        unit_id = int(attrs['id'])
        record = (
            attrs['name'],
            int(attrs['variant']),
            attrs['pos'],
            attrs['desc'],
        )
        self._lexical_units[unit_id] = record

    def endElement(self, name):
        """Close the current synset or materialise the finished ``unit-id``."""
        if name == 'synset':
            self._defined_synset = -1
            return
        if name != 'unit-id':
            return

        unit_id = int(self._content)
        # Stored records are always tuples, so None reliably means "unknown
        # (or skipped) lexical unit".
        record = self._lexical_units.get(unit_id)
        if record is not None:
            owner = self._synsets[self._defined_synset]
            base, sense, pos, desc = record
            self._lexical_units_to_base.append(
                LexicalUnit(
                    luid=unit_id,
                    base=base,
                    sense=sense,
                    pos=pos,
                    synset=owner,
                    glossa='',
                    definition=desc,
                    text_rep='{}-{}'.format(base, sense),
                )
            )

        # Leave the <unit-id> context and drop the buffered text.
        self._unit = False
        self._content = ''

    def characters(self, content):
        """Buffer non-blank text found in a ``unit-id`` inside a synset."""
        if not self._unit or self._defined_synset < 0:
            return
        if content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to finalise; the caller reads the queued lists directly."""
#==========================================================#
class Command(BaseCommand):
    """Management command that runs the plWordnet import."""

    args = 'none'
    help = ('Imports plWordnet synsets and lexical units from '
            'data/plwordnet/plwordnet_2_1.xml into the database.')

    def handle(self, *args, **options):
        """Run the import.

        ``*args`` is accepted so the method matches the signature documented
        for ``BaseCommand.handle``; neither positional arguments nor options
        influence the import.
        """
        import_plWordnet()
def _store_in_chunks(manager, objects, report_every):
    """Insert ``objects`` via ``manager.bulk_create`` in slices of BULK items.

    After every ``report_every`` chunks a progress line with the number of
    objects stored so far is printed.
    """
    total = len(objects)
    for chunk_no, start in enumerate(range(0, total, BULK), 1):
        # Slicing clamps at the end of the list, so the last chunk may be
        # shorter than BULK without any explicit bound check.
        manager.bulk_create(objects[start:start + BULK])
        if chunk_no % report_every == 0:
            # Cap at ``total`` so a short final chunk is not over-reported.
            print(str(min(chunk_no * BULK, total)) + "...")


def import_plWordnet():
    """Parse the plWordnet 2.1 XML file and store its contents.

    The file is read with a SAX parser driven by ``PlWNHandler``; the synsets
    it collects are bulk-inserted first, followed by the lexical units (which
    are constructed with a reference to their synset).  Progress is reported
    on standard output.
    """
    xml_file = os.path.join(BASE_DIR, 'data', 'plwordnet', 'plwordnet_2_1.xml')
    # os.path.join discards the earlier component when xml_file is absolute,
    # so this module's directory only matters if BASE_DIR is a relative path.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    parser = make_parser()
    wordnet = PlWNHandler()
    parser.setContentHandler(wordnet)

    print("Parsing Wordnet...")
    parser.parse(xml_path)
    print("...DONE")
    print("")

    print("Storing synsets...")
    _store_in_chunks(Synset.objects, wordnet._synsets_to_base, 50)
    print("...DONE")
    print("")

    print("Storing lexical units...")
    _store_in_chunks(LexicalUnit.objects, wordnet._lexical_units_to_base, 20)
    print("...DONE")