Blame view

semantics/management/commands/import_desc_plWordnet.py 3.76 KB
Bartłomiej Nitoń authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#! /usr/bin/python
# -*- coding: utf-8 -*-

from django.core.management.base import BaseCommand

import sys, os, codecs
from xml.sax import saxutils, handler, make_parser
from wordnet.models import Synset, LexicalUnit, Hypernymy, Synonymy
from settings import PROJECT_PATH

# NOTE(review): not referenced by any code visible in this file; presumably a
# batch size for bulk database inserts -- confirm before relying on or removing it.
BULK = 250

class PlWNHandler(handler.ContentHandler):
    """SAX handler that copies plWordnet unit descriptions into the database.

    For every ``lexical-unit`` element whose ``pos`` attribute does not
    contain ``'pwn'`` the ``desc`` attribute is remembered under the unit's
    ``id``.  When that id is later met as the text of a ``unit-id`` element
    nested in a ``synset`` element, the ``LexicalUnit`` row with the same
    ``luid`` gets the description stored as its ``definition``.  A
    description is therefore only applied when the ``lexical-unit`` element
    was seen before the ``unit-id`` that refers to it.

    All other elements (``synsetrelations`` included) are ignored: this
    handler only updates definitions, so no synset or relation objects are
    built while parsing.

    out -- file-like object that receives a one-line warning for every unit
           that has a description but no ``LexicalUnit`` row.
    """

    # Placeholder description (Polish for "no data") that must not be
    # stored as a definition.
    _NO_DESCRIPTION = "brak danych"

    def __init__(self, out=sys.stdout):
        # ContentHandler is an old-style class under Python 2, so the base
        # initialiser is called explicitly instead of through super().
        handler.ContentHandler.__init__(self)
        self._out = out
        # luid -> description text taken from the 'lexical-unit' elements
        self._descriptions = {}
        # id of the synset currently being parsed, -1 outside any synset
        self._defined_synset = -1
        # True while the parser is inside a 'unit-id' element
        self._unit = False
        # text collected for the current 'unit-id' element
        self._content = ''

    def startElement(self, name, attrs):
        """Record unit descriptions and track synset / unit-id nesting."""
        if name == 'lexical-unit':
            if 'pwn' not in attrs['pos']:
                # A missing 'desc' attribute is treated like an empty one.
                self._descriptions[int(attrs['id'])] = attrs.get('desc', '')
        elif name == 'synset':
            self._defined_synset = int(attrs['id'])
        elif name == 'unit-id':
            self._unit = True

    def endElement(self, name):
        """Leave the current synset, or apply the description of a unit-id."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            text = self._content.strip()
            # Reset the parser state first so that it stays consistent even
            # if storing the definition raises.
            self._unit = False
            self._content = ''
            # An empty unit-id (or one met outside a synset, where no text
            # is collected) carries no id and is skipped.
            if text:
                self._store_definition(int(text))

    def characters(self, content):
        """Collect the text of a unit-id element nested in a synset."""
        # SAX may deliver the text of one element in several chunks, hence
        # the accumulation; whitespace-only chunks are dropped.
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to flush: every definition is saved as soon as it is met."""
        pass

    def _store_definition(self, luid):
        """Save the remembered description of unit ``luid`` as its definition."""
        desc = self._descriptions.get(luid)
        if not desc or desc == self._NO_DESCRIPTION:
            # Unknown or filtered-out unit, empty text or the placeholder.
            return
        try:
            lu = LexicalUnit.objects.get(luid=luid)
        except LexicalUnit.DoesNotExist:
            # One unit missing from the database should not abort the whole
            # import; report it and carry on with the remaining units.
            self._out.write(
                'Lexical unit %d not found in the database, skipping\n' % luid)
            return
        lu.definition = desc
        lu.save()

#==========================================================#
class Command(BaseCommand):
    """Management command that runs ``import_plWordnet``."""
    args = 'none'
    help = ('Update LexicalUnit definitions with the descriptions found in '
            'the plWordnet XML file.')

    def handle(self, *args, **options):
        """Run the import.

        Positional arguments and options are accepted, to match the
        ``BaseCommand.handle(*args, **options)`` signature, and ignored.
        """
        import_plWordnet()

def import_plWordnet(xml_path=None):
    """Parse a plWordnet XML file with a fresh ``PlWNHandler``.

    xml_path -- path of the XML file to parse; when omitted, the file
                ``data/semantics/plwordnet_2_1.xml`` under ``PROJECT_PATH``
                is used.

    Exceptions raised by the parser or by the handler are not caught here.
    """
    if xml_path is None:
        xml_path = os.path.join(
            PROJECT_PATH, 'data', 'semantics', 'plwordnet_2_1.xml')

    parser = make_parser()
    parser.setContentHandler(PlWNHandler())
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Parsing Wordnet...")
    parser.parse(xml_path)
    print("...DONE")