Detector.java
5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package pl.waw.ipipan.zil.core.md.detection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.*;
/**
 * Static entry point for mention detection: populates the sentences of a
 * {@link Text} with {@link Mention} objects derived from token tags,
 * syntactic words, named entities, syntactic groups and a zero-subject model,
 * then prunes the result.
 */
public class Detector {

    private static final Logger logger = LoggerFactory.getLogger(Detector.class);

    // The ctag regexes are constant, so they are compiled once here instead of
    // on every String.matches(...) call inside the per-token loops.
    private static final Pattern MORPHO_CTAGS_PATTERN = Pattern.compile(Constants.MORPHO_CTAGS);
    private static final Pattern MORPHO_NOUN_CTAGS_PATTERN = Pattern.compile(Constants.MORPHO_NOUN_CTAGS);
    private static final Pattern WORDS_CTAGS_PATTERN = Pattern.compile(Constants.WORDS_CTAGS);

    /** Utility class; not meant to be instantiated. */
    private Detector() {
    }

    /**
     * Clears all mentions currently stored in the text and then runs mention
     * detection on every sentence of every paragraph.
     *
     * @param text             text to annotate; its existing mentions are discarded
     * @param zeroSubjectModel detector used to add zero-subject mentions to each sentence
     */
    public static void findMentionsInText(Text text,
            ZeroSubjectDetector zeroSubjectModel) {
        text.clearMentions();
        logger.debug("Detecting mentions in text {}", text.getId());
        for (Paragraph p : text) {
            for (Sentence s : p) {
                detectMentionsInSentence(s, zeroSubjectModel);
            }
        }
    }

    /**
     * Runs the full detection pipeline on a single sentence: candidate
     * generation, zero-subject detection, pruning, and finally head repair.
     *
     * @param sentence         sentence to annotate
     * @param zeroSubjectModel detector used to add zero-subject mentions
     */
    private static void detectMentionsInSentence(Sentence sentence,
            ZeroSubjectDetector zeroSubjectModel) {
        // adding mentions
        addMentionsByTokenCtag(sentence);
        addMentionsBySyntacticWordsCtag(sentence);
        addMentionsByNamedEntities(sentence);
        addMentionsByGroups(sentence);
        addSpeakerMentionsInSpoken(sentence);

        // zero subject detection
        zeroSubjectModel.addZeroSubjectMentions(sentence);

        // removing mentions
        removeTo(sentence);
        Cleaner.cleanUnnecessarySentenceMentions(sentence);

        // updating mention heads
        updateMentionHeads(sentence);
    }

    /**
     * Heuristic: when a mention has no head segments, its first segment is
     * used as the head.
     *
     * @param sentence sentence whose mentions are updated
     */
    private static void updateMentionHeads(Sentence sentence) {
        for (Mention m : sentence.getMentions()) {
            if (m.getHeadSegments().isEmpty()) {
                m.addHeadSegment(m.getFirstSegment());
            }
        }
    }

    /**
     * Heuristic for "to" in a sentence containing "jeśli" / "jeżeli" / "skoro":
     * in such a sentence every single-segment mention whose segment has the
     * base form "to" is removed.
     *
     * @param sentence sentence whose mentions are pruned
     */
    private static void removeTo(Sentence sentence) {
        Set<String> orths = new HashSet<>();
        for (Token morph : sentence) {
            orths.add(morph.getOrth());
        }

        // NOTE(review): the comparison is case-sensitive, so a sentence-initial
        // "Jeśli" does not trigger the heuristic - confirm this is intended.
        boolean hasTrigger = orths.contains("jeśli") || orths.contains("jeżeli")
                || orths.contains("skoro");
        if (!hasTrigger) {
            return;
        }

        // Collect first, remove afterwards: removing from the sentence while
        // iterating its mention collection could invalidate the iterator.
        List<Mention> toRemove = new ArrayList<>();
        for (Mention mention : sentence.getMentions()) {
            List<Token> mentSegs = mention.getSegments();
            if (mentSegs.size() == 1
                    && "to".equals(mentSegs.get(0).getBase())) {
                toRemove.add(mention);
            }
        }
        for (Mention mention : toRemove) {
            sentence.removeMention(mention);
        }
    }

    /**
     * Heuristic for speaker labels in spoken text (e.g. "sp1:", "sp2:"): when
     * the sentence has more than two tokens and the second token is ":", the
     * first token is added as a mention.
     *
     * @param sentence sentence to annotate
     */
    private static void addSpeakerMentionsInSpoken(Sentence sentence) {
        if (sentence.size() > 2) {
            Token first = sentence.get(0);
            Token second = sentence.get(1);
            if (":".equals(second.getOrth())) {
                sentence.addMention(new Mention(first));
            }
        }
    }

    /**
     * Adds a mention for every syntactic group whose type starts with "NG",
     * using the group's tokens as segments and its semantic head tokens as heads.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByGroups(Sentence sentence) {
        for (SyntacticGroup group : sentence.getGroups()) {
            if (group.getType().startsWith("NG")) {
                List<Token> segments = group.getTokens();
                List<Token> heads = group.getSemanticHeadTokens();
                sentence.addMention(new Mention(segments, heads));
            }
        }
    }

    /**
     * Adds a mention for every named entity that contains at least one token
     * whose ctag matches {@code Constants.MORPHO_NOUN_CTAGS}. The mention is
     * created with an empty head list.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByNamedEntities(Sentence sentence) {
        for (NamedEntity ne : sentence.getNamedEntities()) {
            List<Token> tokens = ne.getTokens();
            if (!containsNoun(tokens)) {
                continue;
            }
            List<Token> headTokens = new ArrayList<>();
            sentence.addMention(new Mention(tokens, headTokens));
        }
    }

    /** Returns true when any token's ctag matches the noun ctag pattern. */
    private static boolean containsNoun(List<Token> tokens) {
        for (Token seg : tokens) {
            if (MORPHO_NOUN_CTAGS_PATTERN.matcher(seg.getCtag()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds a mention for every syntactic word whose ctag matches
     * {@code Constants.WORDS_CTAGS}. A one-token word becomes a single-token
     * mention; a longer word becomes a multi-token mention with an empty head list.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
        for (SyntacticWord w : sentence.getSyntacticWords()) {
            if (!WORDS_CTAGS_PATTERN.matcher(w.getCtag()).matches()) {
                continue;
            }
            List<Token> tokens = w.getTokens();
            if (tokens.size() == 1) {
                sentence.addMention(new Mention(tokens.get(0)));
            } else {
                List<Token> heads = new ArrayList<>();
                sentence.addMention(new Mention(tokens, heads));
            }
        }
    }

    /**
     * Adds a single-token mention for every token whose ctag matches
     * {@code Constants.MORPHO_CTAGS}. This method looks only at token-level
     * tags; the syntactic-word layer is processed separately by
     * {@link #addMentionsBySyntacticWordsCtag(Sentence)}.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByTokenCtag(Sentence sentence) {
        for (Token token : sentence) {
            if (MORPHO_CTAGS_PATTERN.matcher(token.getCtag()).matches()) {
                sentence.addMention(new Mention(token));
            }
        }
    }
}