load_resources.py
7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# -*- coding:utf-8 -*-
import os
import re
import sys
import time

import jsonpickle
from lxml import etree
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport

from django.core.management.base import BaseCommand

from multiservice.facade import Multiservice
from multiservice.facade.ttypes import *
from multiservice.types.ttypes import *
from settings import PROJECT_PATH
from webapp.models import Category, Expression, Meaning, Segment, Source
SJP_PATH = os.path.join(PROJECT_PATH, 'data', 'resources', 'sjp_20161017.xml')
PLWN_PATH = os.path.join(PROJECT_PATH, 'data', 'resources', 'plwn-3_0_extr_20160830.xml')
PORT = 20000
HOST = 'multiservice.nlp.ipipan.waw.pl'
PROCESS_CHAIN = ['Concraft', 'Spejd', 'Nerf', 'MentionDetector']
class Command(BaseCommand):
    # Django management command: seed the Source table, then import the
    # plWordNet dump into the expression database.
    help = 'Load lemma frequency informations to system.'

    def handle(self, **options):
        """Entry point: register the known sources, then load plWordNet."""
        load_sources_data()
        load_resource('plwn', PLWN_PATH)
        # NOTE(review): loading of the SJP resource (SJP_PATH) is currently
        # disabled; only the plWordNet dump is imported.
def load_sources_data():
    """Ensure that the four predefined expression sources exist in the DB.

    Uses get_or_create, so repeated runs are idempotent.
    """
    predefined_sources = [
        ('sjp', 'SJP', 'http://sjp.pl/',
         u'Słownik języka polskiego, ortograficzny, wyrazów obcych i słownik do gier w jednym.'),
        ('szarada', 'szarada.net', 'http://szarada.net/',
         u'Internetowy świat krzyżówek'),
        ('plwn', u'Słowosieć', 'http://plwordnet.pwr.wroc.pl/',
         u'Słowosieć (z ang. wordnet) – to słownik semantyczny, który odzwierciedla system leksykalny języka polskiego.'),
        ('wikidata', u'Wikidane', 'https://www.wikidata.org/',
         u'Wikidane, w języku angielskim Wikidata – projekt internetowy mający na celu stworzenie wolnej, otwartej, wielojęzycznej bazy różnorodnych danych. Głównym zastosowaniem tej bazy danych jest używanie jej w projektach Wikimedia Foundation, przede wszystkim w Wikipedii.'),
    ]
    for key, name, url, description in predefined_sources:
        Source.objects.get_or_create(key=key, name=name, url=url,
                                     description=description)
def load_resource(name, path):
    """Stream-parse the XML dump at `path` and load every <entry> element.

    :param name: key of an existing Source row (e.g. 'plwn').
    :param path: filesystem path to the XML resource file.
    """
    source = Source.objects.get(key=name)
    for _, element in etree.iterparse(path):
        if element.tag == 'entry':
            load_entry(source, element)
            # Free the processed subtree; without this, iterparse keeps the
            # whole document in memory, defeating the point of streaming.
            element.clear()
def load_entry(source, entry):
    """Create a Meaning for one <entry> element and load its expressions.

    Scans the entry's children for 'wikilink'/'synset' attributes (the last
    occurrence wins, as in the original), then creates one expression per
    child's text.
    """
    wikilink = ''
    plwn_synset = 0
    for desc in entry.getchildren():
        print(desc.text)
        # .get() returns None for a missing attribute; the original indexed
        # attrib[...] directly, which raises KeyError when absent.
        if desc.attrib.get('wikilink'):
            wikilink = desc.attrib['wikilink']
        if desc.attrib.get('synset'):
            plwn_synset = int(desc.attrib['synset'])
    # Every meaning starts in the placeholder 'unk' category; it may be
    # replaced later by load_expression based on NER output.
    category, _ = Category.objects.get_or_create(key='unk', name='unk', description='niezdefiniowana')
    meaning = Meaning.objects.create(plWN_synset=plwn_synset, wikilink=wikilink, category=category)
    for desc in entry.getchildren():
        parse_and_load_expression(source, meaning, desc.text)
def parse_and_load_expression(source, meaning, expression):
    """Send `expression` through the remote NLP chain and store the result.

    Submits the text to the multiservice endpoint, polls until the request
    is DONE or FAILED, then either loads the annotated result or reports
    the remote exception on stderr.
    """
    transport, client = getThriftTransportAndClient(HOST, PORT)
    request = createRequest(expression, PROCESS_CHAIN)
    try:
        token = client.putObjectRequest(request)
        status = None
        # Busy-poll the service; 100 ms between status checks.
        while status not in [RequestStatus.DONE, RequestStatus.FAILED]:
            status = client.getRequestStatus(token)
            time.sleep(0.1)
        if status == RequestStatus.DONE:
            result = client.getResultObject(token)
            load_expression(source, expression, meaning, result)
        else:
            # Portable replacement for the Py2-only `print >> sys.stderr`.
            sys.stderr.write('%s\n' % client.getException(token))
    finally:
        transport.close()
def getThriftTransportAndClient(host, port):
    """Open a buffered Thrift connection and return (transport, client).

    The caller owns the transport and must close it when done.
    NOTE: the original module never imported TTransport/TBinaryProtocol,
    so this function raised NameError at runtime; the imports are now added
    at the top of the file.
    """
    transport = TSocket.TSocket(host, port)
    try:
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Multiservice.Client(protocol)
        transport.open()
        return (transport, client)
    except:
        # Bare except is deliberate: close the socket on ANY failure
        # (including KeyboardInterrupt) and re-raise unchanged.
        transport.close()
        raise
def createRequest(text, serviceNames):
    """Build an ObjectRequest for `text` processed by `serviceNames`.

    The text is split on blank lines (two or more newlines), one TParagraph
    per chunk, and the service names become the processing chain.
    """
    chunks = re.split(r'\n\n+', text)
    paragraphs = [TParagraph(text=chunk) for chunk in chunks]
    processing_chain = [RequestPart(serviceName=name) for name in serviceNames]
    return ObjectRequest(TText(paragraphs=paragraphs), processing_chain)
def load_expression(source, expression, meaning, result):
    """Store the annotated `expression` (and its largest mention) for `meaning`.

    Walks the multiservice result (via a jsonpickle round-trip to plain
    dicts), collects all tokens, tracks the mention with the most child
    tokens and its NE category, then persists the expression, its segments,
    and — when it differs from the full expression — the mention itself.

    NOTE(review): indentation of this module was lost; the nesting below is
    reconstructed from data flow (`expr_obj` is referenced by the mention
    branch, so that branch must sit inside the expression-creation branch).
    """
    biggest_mention = None
    biggest_mention_tokens = []
    main_category = ''
    expr_segs = []
    head = None
    # Round-trip through jsonpickle to turn Thrift objects into plain dicts.
    jsonStr = jsonpickle.encode(result, unpicklable=False)
    jsonObj = jsonpickle.decode(jsonStr)
    for para in jsonObj['paragraphs']:
        for sent in para['sentences']:
            expr_segs.extend(sent['tokens'])
            for mention in sent['mentions']:
                # Keep the mention spanning the most tokens. (`is None`
                # replaces the original `== None`; the duplicated if/elif
                # bodies are merged into one condition.)
                if biggest_mention is None or len(mention['childIds']) > len(biggest_mention['childIds']):
                    biggest_mention = mention
                    biggest_mention_tokens, head, category = parse_mention_info(sent, mention)
                    if category:
                        main_category = category
    # Upgrade the meaning's category only for multi-token expressions or
    # capitalized single tokens (likely proper names).
    if main_category and (len(expr_segs) > 1 or expression[0].isupper()):
        category_obj, _ = Category.objects.get_or_create(key=main_category, name=main_category)
        meaning.category = category_obj
        meaning.save()
    if not meaning.expressions.filter(text=expression).exists():
        expr_obj = Expression.objects.create(text=expression, meaning=meaning, score=0.0, NKJP_freq=0)
        expr_obj.sources.add(source)
        add_segments(expr_obj, expr_segs, head)
        # If the largest mention is a strict sub-span of the expression,
        # store it as a secondary expression pointing at the main one.
        if biggest_mention_tokens and len(expr_segs) != len(biggest_mention_tokens):
            mention_text = ' '.join([tok['orth'] for tok in biggest_mention_tokens])
            mention_obj = Expression.objects.create(text=mention_text, meaning=meaning, main_expression=expr_obj,
                                                    score=0.0, NKJP_freq=0)
            mention_obj.sources.add(source)
            add_segments(mention_obj, biggest_mention_tokens, head)
def parse_mention_info(sentence, mention):
    """Resolve a mention's tokens, its head token, and its NE category.

    :param sentence: dict with 'tokens' (each having an 'id') and 'names'.
    :param mention: dict with 'childIds' and 'headIds'.
    :returns: (tokens, head, category) — category is '' when no name covers
        the head.
    :raises StopIteration: if an id has no matching token — assumes the
        service always returns consistent ids; TODO confirm.
    """
    tokens = []
    for token_id in mention['childIds']:
        # Builtin next() replaces the Py2-only generator .next() method,
        # keeping first-match semantics.
        tokens.append(next(token for token in sentence['tokens'] if token["id"] == token_id))
    head = next(token for token in sentence['tokens'] if token["id"] == mention['headIds'][0])
    category = get_category(sentence, head)
    return tokens, head, category
def get_category(sentence, mention_head):
    """Return the NE type of the first name covering the mention head.

    Scans `sentence['names']` for a name whose childIds include the head's
    id; returns its 'type', or '' when none matches.
    """
    head_id = mention_head['id']
    covering_types = (name['type'] for name in sentence['names']
                      if head_id in name['childIds'])
    return next(covering_types, '')
def add_segments(expr_obj, tokens, head):
    """Persist one Segment row per token of `expr_obj`, in order.

    Positions are 1-based; the token equal to `head` is flagged is_head.
    """
    for position, seg in enumerate(tokens, start=1):
        interp = seg['chosenInterpretation']
        Segment.objects.create(position_in_expr=position,
                               expression=expr_obj,
                               orth=seg['orth'],
                               base=interp['base'],
                               ctag=interp['ctag'],
                               msd=interp['msd'],
                               is_head=(seg == head))
# def position_promotion(sentence, curr_mention):
#     if sentence['tokens'][0]['id'] == curr_mention['childIds'][0]:
#         return 1
#     return 0
# TODO notes (translated from Polish):
# - promote mentions closer to the start of the expression: score by the
#   mention's distance from the beginning of the sentence
# - possibly merge mentions so that e.g. "the first man on the moon" matches
#   as a whole, i.e. include whatever attaches as an adjunct
# should work