normalize.py
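"""Normalize a multi-word expression by swapping out its head.

Candidate heads are single-word synonyms (and direct hypernyms) of the
original head, taken from the wordnet models; each candidate is
re-inflected with Morfeusz 2 to match the original head's tags, and a
rewritten expression is kept only if the head lemma has a single sense
or the rewritten text occurs in the NKJP corpus.
"""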
import settings
import nkjp
from webapp.models import Segment
from wordnet.models import LexicalUnit, Relation


def expression(expression):
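    """Return normalized variants of a multi-segment expression.

    Single-segment expressions are not normalized (empty result).
    """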
normalized_expressions = []
if expression.segments.count() > 1:
# normalize_verbs(expression)
normalized_expressions.extend(normalize_head(expression))
return normalized_expressions


def normalize_head(expression):
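    """Replace the head segment with wordnet equivalents.

    Gathers single-word synonyms and direct hypernyms of the head
    lemma, re-inflects each one to match the head's morphological
    tags, and keeps a candidate when the head lemma has exactly one
    lexical unit or the candidate text occurs in NKJP.
    """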
normalized_expressions = []
try:
head = expression.segments.get(is_head=True)
all_equivalents = []
head_lus = get_lus(head.base)
all_equivalents.extend(get_synonyms(head.base))
# all_equivalents.extend(get_by_relation(head.base, 1,
# Relation.objects.get(parent=None, name='hiponimia')))
all_equivalents.extend(get_by_relation(head.base, 1,
Relation.objects.get(parent=None, name='hiperonimia')))
for synonym in all_equivalents:
synonym_forms = segment(head, synonym)
for form in synonym_forms:
normalized_expression = get_normalized_expr_text(expression, form)
if head_lus.count() == 1 or nkjp.exists(normalized_expression):
normalized_expressions.append(normalized_expression)
except Segment.DoesNotExist:
pass
return set(normalized_expressions)


def get_synonyms(base):
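    """Return single-word lemmas that share a synset with `base`."""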
synonyms = []
synsets = get_synsets(base)
for synset in synsets:
for lu in synset.lus.all():
if (base != lu.base and len(lu.base.split()) == 1
and lu.base not in synonyms):
synonyms.append(lu.base)
return synonyms


# A single sense is fine on its own; otherwise check whether the
# candidate occurs in NKJP (this filter is applied in normalize_head).
def get_synsets(base):
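    """Return the synsets of all lexical units with lemma `base`."""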
synsets = []
lus = get_lus(base)
    # if lus.count() == 1:  # only unambiguous units are of interest
# return synsets
for lu in lus:
if lu.synset not in synsets:
synsets.append(lu.synset)
return synsets


def get_lus(base):
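    """Return all lexical units whose lemma equals `base`."""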
return LexicalUnit.objects.filter(base=base)


def get_by_relation(base, max_depth, relation):
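    """Return single-word lemmas reachable from `base` via `relation`.

    Synsets are followed up to `max_depth` links away; lemmas from the
    source synsets themselves are skipped.
    """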
related_bases = []
source_synsets = get_synsets(base)
related_synsets = get_related_synsets(source_synsets, relation, max_depth)
for synset in related_synsets:
if synset not in source_synsets:
for lu in synset.lus.all():
if (base != lu.base and len(lu.base.split()) == 1
and lu.base not in related_bases):
related_bases.append(lu.base)
return related_bases


def get_related_synsets(sources, relation, max_depth):
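    """Collect synsets related to any of `sources` (duplicates possible)."""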
related_synsets = []
for source in sources:
related_synsets.extend(get_related(source, relation, max_depth, 0))
return related_synsets


def get_related(source, relation, max_depth, depth):
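    """Recursively follow `relation` links from `source`, up to `max_depth`."""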
depth += 1
visited = [source]
if depth > max_depth:
return visited
links = source.targets.filter(relation=relation)
    for link in links:
visited.extend(get_related(link.parent, relation, max_depth, depth))
return visited


def get_normalized_expr_text(expression, new_head):
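    """Rebuild the expression text with `new_head` substituted for the head.

    Segments flagged `has_nps` (no preceding space) are attached
    directly to the previous token.
    """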
expr = ''
for seg in expression.segments.order_by('position_in_expr'):
orth = seg.orth
if seg.is_head:
orth = new_head
if seg.has_nps:
expr += orth
else:
expr += ' %s' % orth
return expr.lstrip()


def segment(orig_seg, synonym):
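    """Inflect `synonym` to match every Morfeusz 2 analysis of the head's form."""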
orig_morf2_interps = settings.MORFEUSZ2.analyse(orig_seg.orth.encode('utf8'))
return generate_inflected_forms(synonym, orig_morf2_interps)


def generate_inflected_forms(synonym, morf2_interps):
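    """Return the set of forms of `synonym` generated for each interpretation's tag."""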
inflected = []
for interp in morf2_interps:
inflected.extend(inflect(synonym, interp.tagId))
return set(inflected)


def inflect(base, tag_id):
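    """Return the surface forms Morfeusz 2 generates for `base` with `tag_id`."""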
possible_forms = settings.MORFEUSZ2.generate(base.encode('utf8'), tag_id)
return [form.orth for form in possible_forms]
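

# A minimal usage sketch, assuming a configured Django project and an
# expression object with related Segment rows (the variable name below
# is hypothetical):
#
#     import normalize
#     variants = normalize.expression(expr)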