Commit 1dc4f9471ae6d1929a5d443457f85a77bd7f6ad4
1 parent
3682bbf2
Added new mention detection rules based on Walenty dictionary.
Showing
9 changed files
with
1166 additions
and
13 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... | ... | @@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md; |
2 | 2 | |
3 | 3 | import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
5 | + | |
5 | 6 | import pl.waw.ipipan.zil.core.md.detection.Detector; |
6 | 7 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
7 | 8 | import pl.waw.ipipan.zil.core.md.entities.Text; |
... | ... | @@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; |
15 | 16 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
16 | 17 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
17 | 18 | |
19 | +import java.io.BufferedReader; | |
18 | 20 | import java.io.File; |
19 | 21 | import java.io.FileInputStream; |
20 | 22 | import java.io.IOException; |
21 | 23 | import java.io.InputStream; |
24 | +import java.io.InputStreamReader; | |
25 | +import java.util.ArrayList; | |
26 | +import java.util.EnumMap; | |
27 | +import java.util.HashMap; | |
28 | +import java.util.Map; | |
22 | 29 | |
23 | 30 | public class Main { |
24 | 31 | |
... | ... | @@ -26,12 +33,125 @@ public class Main { |
26 | 33 | |
27 | 34 | private static final boolean GZIP_OUTPUT = true; |
28 | 35 | private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; |
36 | + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt"; | |
37 | + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt"; | |
38 | + private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt"; | |
29 | 39 | |
30 | 40 | private static ZeroSubjectDetector zeroSubjectModel; |
41 | + | |
42 | + public static enum ValenceDicts { | |
43 | + VerbsValence, | |
44 | + NounsValence | |
45 | + } | |
46 | + | |
47 | + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = | |
48 | + new EnumMap(ValenceDicts.class); | |
49 | + | |
50 | + private static final ArrayList<String> complexPreps; | |
31 | 51 | |
// Loads every bundled classpath resource once, when the class is initialised.
static {
    zeroSubjectModel = new ZeroSubjectDetector(
            Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL));

    // Walenty valence schemata: one dictionary for verbs, one for nouns.
    valence.put(ValenceDicts.VerbsValence,
            readWalenty(Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE)));
    valence.put(ValenceDicts.NounsValence,
            readWalenty(Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE)));

    // Complex prepositions, read as a plain list of non-empty lines.
    complexPreps = readValues(Main.class.getResourceAsStream(COMPLEX_PREPS));
}
65 | + | |
66 | + | |
67 | + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) | |
68 | + { | |
69 | + Map<String,ArrayList<String>> map; | |
70 | + try { | |
71 | + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); | |
72 | + map = new HashMap<String,ArrayList<String>>(); | |
73 | + String line; | |
74 | + boolean firstLine = true; | |
75 | + while((line = br.readLine()) != null) { | |
76 | + if (firstLine) { | |
77 | + line = line.replace("\uFEFF", ""); // remove BOM character | |
78 | + firstLine = false; | |
79 | + } | |
80 | + | |
81 | + if (!line.startsWith("%")) { | |
82 | + String[] lineParts = line.split(":"); | |
83 | + String lemma = lineParts[0].trim(); | |
84 | + String schema = lineParts[5].trim(); | |
85 | + | |
86 | + if (schema.trim().isEmpty()) { | |
87 | + continue; | |
88 | + } | |
89 | + | |
90 | + String[] lemmaParts = lemma.split(" "); | |
91 | + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { | |
92 | + lemma = lemma + " się"; | |
93 | + } | |
94 | + | |
95 | + ArrayList<String> schemata; | |
96 | + if (!map.containsKey(lemma)) { | |
97 | + schemata = new ArrayList<String>(); | |
98 | + schemata.add(schema); | |
99 | + map.put(lemma, schemata); | |
100 | + } else { | |
101 | + schemata = map.get(lemma); | |
102 | + schemata.add(schema); | |
103 | + map.put(lemma, schemata); | |
104 | + } | |
105 | + } | |
106 | + } | |
107 | + br.close(); | |
108 | + } catch (IOException ex) { | |
109 | + ex.printStackTrace(); | |
110 | + throw new RuntimeException(ex); | |
111 | + } | |
112 | + return map; | |
113 | + } | |
114 | + | |
115 | + private static boolean schemaContainsSie(String schema) { | |
116 | + for (String position : schema.split("\\s\\+\\s")) { | |
117 | + position = position.trim(); | |
118 | + position = position.substring(1, position.length()-1); | |
119 | + for (String phrT : position.split(";")) { | |
120 | + if (phrT.equals("refl") || phrT.equals("recip")) { | |
121 | + return true; | |
122 | + } | |
123 | + } | |
124 | + } | |
125 | + | |
126 | + return false; | |
127 | + } | |
128 | + | |
129 | + public static ArrayList<String> readValues(InputStream stream) { | |
130 | + ArrayList<String> values; | |
131 | + try { | |
132 | + BufferedReader br=new BufferedReader(new InputStreamReader(stream)); | |
133 | + values = new ArrayList<String>(); | |
134 | + String line; | |
135 | + boolean firstLine = true; | |
136 | + while((line = br.readLine()) != null) { | |
137 | + if (firstLine) { | |
138 | + line = line.replace("\uFEFF", ""); // remove BOM character | |
139 | + firstLine = false; | |
140 | + } | |
141 | + | |
142 | + if (!line.startsWith("%")) { | |
143 | + String value = line.trim(); | |
144 | + if (!value.isEmpty()) { | |
145 | + values.add(value); | |
146 | + } | |
147 | + } | |
148 | + } | |
149 | + br.close(); | |
150 | + } catch (IOException ex) { | |
151 | + ex.printStackTrace(); | |
152 | + throw new RuntimeException(ex); | |
153 | + } | |
154 | + return values; | |
35 | 155 | } |
36 | 156 | |
37 | 157 | private Main() { |
... | ... | @@ -71,6 +191,8 @@ public class Main { |
71 | 191 | return; |
72 | 192 | } |
73 | 193 | } |
194 | + | |
195 | + | |
74 | 196 | |
75 | 197 | int all = 0; |
76 | 198 | int errors = 0; |
... | ... | @@ -122,7 +244,7 @@ public class Main { |
122 | 244 | */ |
123 | 245 | public static void annotateThriftText(TText thriftText) throws MultiserviceException { |
124 | 246 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
125 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | |
247 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); | |
126 | 248 | ThriftSaver.updateThriftText(responseText, thriftText); |
127 | 249 | } |
128 | 250 | |
... | ... | @@ -135,7 +257,7 @@ public class Main { |
135 | 257 | */ |
136 | 258 | public static void annotateTeiText(TEICorpusText teiText) throws TEIException { |
137 | 259 | Text responseText = TeiLoader.loadTextFromTei(teiText); |
138 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | |
260 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); | |
139 | 261 | TeiSaver.updateTeiText(responseText, teiText); |
140 | 262 | } |
141 | 263 | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
3 | 4 | import pl.waw.ipipan.zil.core.md.entities.Mention; |
4 | 5 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
6 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
7 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
5 | 8 | import pl.waw.ipipan.zil.core.md.entities.Token; |
6 | 9 | |
10 | +import java.util.ArrayList; | |
7 | 11 | import java.util.Collection; |
8 | 12 | import java.util.HashSet; |
9 | 13 | import java.util.List; |
14 | +import java.util.Map; | |
10 | 15 | import java.util.Set; |
11 | 16 | |
12 | 17 | public class Cleaner { |
... | ... | @@ -125,4 +130,157 @@ public class Cleaner { |
125 | 130 | else |
126 | 131 | return m1; |
127 | 132 | } |
133 | + | |
134 | + public static void cleanWalentyFramedMentions(Sentence sentence, | |
135 | + Map<String,ArrayList<String>> verbsValence) { | |
136 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
137 | + for (Mention mention : sentence.getMentions()) { | |
138 | + int mentionStart = mention.getFirstSegment().getSentencePosition(); | |
139 | + int mentionEnd = mention.getLastSegment().getSentencePosition(); | |
140 | + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); | |
141 | + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); | |
142 | + | |
143 | + if (startGroup != null && endGroup != null | |
144 | + && startGroup.compareTo(endGroup) != 0) { | |
145 | + | |
146 | + SyntacticWord verb = startGroup.getPrecedingVerb(); | |
147 | + if (verb != null && !verb.getBase().equals("mieć") | |
148 | + && verbsValence.containsKey(verb.getBase())) { | |
149 | + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); | |
150 | + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); | |
151 | + | |
152 | + for (String schema : verbsValence.get(verb.getBase())) { | |
153 | + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { | |
154 | + mentionsToRemove.add(mention); | |
155 | + break; | |
156 | + } | |
157 | + } | |
158 | + } | |
159 | + } | |
160 | + } | |
161 | + | |
162 | + for (Mention mentionToRemove : mentionsToRemove) { | |
163 | + sentence.removeMention(mentionToRemove); | |
164 | + } | |
165 | + } | |
166 | + | |
167 | + /*private static void removeWalentyFramedMentions(Sentence sentence, | |
168 | + ArrayList<Mention> mentions, | |
169 | + ArrayList<String> schemata) { | |
170 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
171 | + for (Mention mention : mentions) { | |
172 | + int mentionStart = mention.getFirstSegment().getSentencePosition(); | |
173 | + int mentionEnd = mention.getLastSegment().getSentencePosition(); | |
174 | + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); | |
175 | + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); | |
176 | + if (startGroup != null && endGroup != null | |
177 | + && startGroup.compareTo(endGroup) != 0) { | |
178 | + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); | |
179 | + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); | |
180 | + for (String schema : schemata) { | |
181 | + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { | |
182 | + mentionsToRemove.add(mention); | |
183 | + break; | |
184 | + } | |
185 | + } | |
186 | + } | |
187 | + } | |
188 | + | |
189 | + for (Mention mentionToRemove : mentionsToRemove) { | |
190 | + sentence.removeMention(mentionToRemove); | |
191 | + } | |
192 | + }*/ | |
193 | + | |
194 | + private static boolean isProperSchema(String schema, ArrayList<String> group1Types, | |
195 | + ArrayList<String> group2Types) { | |
196 | + for (String group1Type : group1Types) { | |
197 | + for (String group2Type : group2Types) { | |
198 | + if (schemaContains(schema, group1Type, group2Type)) { | |
199 | + return true; | |
200 | + } | |
201 | + } | |
202 | + } | |
203 | + return false; | |
204 | + } | |
205 | + | |
206 | + private static boolean schemaContains(String schema, String phraseType1, | |
207 | + String phraseType2) { | |
208 | + boolean phrType1Found = false; | |
209 | + boolean phrType2Found = false; | |
210 | + for (String position : schema.split("\\+")) { | |
211 | + position = position.trim(); | |
212 | + position = position.substring(1, position.length()-1); | |
213 | + for (String phrT : position.split(";")) { | |
214 | + if (phrT.equals(phraseType1)) { | |
215 | + phrType1Found = true; | |
216 | + break; | |
217 | + } else if (phrT.equals(phraseType2)) { | |
218 | + phrType2Found = true; | |
219 | + break; | |
220 | + } | |
221 | + } | |
222 | + if (phrType1Found && phrType2Found) { | |
223 | + return true; | |
224 | + } | |
225 | + } | |
226 | + return false; | |
227 | + } | |
228 | + | |
229 | + | |
230 | + // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub) | |
231 | + public static void cleanQubs(Sentence sentence) { | |
232 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
233 | + for (Mention mention : sentence.getMentions()) { | |
234 | + if (mention.isPartOfQub()) { | |
235 | + mentionsToRemove.add(mention); | |
236 | + } | |
237 | + } | |
238 | + | |
239 | + for (Mention mentionToRemove : mentionsToRemove) { | |
240 | + sentence.removeMention(mentionToRemove); | |
241 | + } | |
242 | + } | |
243 | + | |
244 | + public static void cleanPreps(Sentence sentence) { | |
245 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
246 | + for (Mention mention : sentence.getMentions()) { | |
247 | + if (mention.isPartOfPrep()) { | |
248 | + mentionsToRemove.add(mention); | |
249 | + } | |
250 | + } | |
251 | + | |
252 | + for (Mention mentionToRemove : mentionsToRemove) { | |
253 | + sentence.removeMention(mentionToRemove); | |
254 | + } | |
255 | + } | |
256 | + | |
257 | + public static void cleanFrazeos(Sentence sentence) { | |
258 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
259 | + for (Mention mention : sentence.getMentions()) { | |
260 | + if (mention.isPartOfFrazeo()) { | |
261 | + mentionsToRemove.add(mention); | |
262 | + } | |
263 | + } | |
264 | + | |
265 | + for (Mention mentionToRemove : mentionsToRemove) { | |
266 | + sentence.removeMention(mentionToRemove); | |
267 | + } | |
268 | + } | |
269 | + | |
270 | + // wyrzuca wzmianki bedace czescia przyimkow zlozonych | |
271 | + public static void cleanComplexPreps(Sentence sentence, | |
272 | + ArrayList<String> complexPreps) { | |
273 | + | |
274 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
275 | + for (Mention mention : sentence.getMentions()) { | |
276 | + if (mention.isPartOfComplexPrep(complexPreps)) { | |
277 | + mentionsToRemove.add(mention); | |
278 | + } | |
279 | + } | |
280 | + | |
281 | + for (Mention mentionToRemove : mentionsToRemove) { | |
282 | + sentence.removeMention(mentionToRemove); | |
283 | + } | |
284 | + } | |
285 | + | |
128 | 286 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... | ... | @@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | 3 | import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
5 | + | |
6 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
5 | 7 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
6 | 8 | import pl.waw.ipipan.zil.core.md.entities.*; |
7 | 9 | |
8 | 10 | import java.util.ArrayList; |
9 | 11 | import java.util.HashSet; |
10 | 12 | import java.util.List; |
13 | +import java.util.Map; | |
11 | 14 | import java.util.Set; |
12 | 15 | |
13 | 16 | public class Detector { |
... | ... | @@ -18,21 +21,25 @@ public class Detector { |
18 | 21 | } |
19 | 22 | |
20 | 23 | public static void findMentionsInText(Text text, |
21 | - ZeroSubjectDetector zeroSubjectModel) { | |
24 | + ZeroSubjectDetector zeroSubjectModel, | |
25 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
26 | + ArrayList<String> complexPreps) { | |
22 | 27 | text.clearMentions(); |
23 | 28 | logger.debug("Detecting mentions in text " + text.getId()); |
24 | 29 | for (Paragraph p : text) |
25 | 30 | for (Sentence s : p) |
26 | - detectMentionsInSentence(s, zeroSubjectModel); | |
31 | + detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps); | |
27 | 32 | } |
28 | 33 | |
29 | 34 | private static void detectMentionsInSentence(Sentence sentence, |
30 | - ZeroSubjectDetector zeroSubjectModel) { | |
35 | + ZeroSubjectDetector zeroSubjectModel, | |
36 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
37 | + ArrayList<String> complexPreps) { | |
31 | 38 | // adding mentions |
32 | 39 | addMentionsByTokenCtag(sentence); |
33 | 40 | addMentionsBySyntacticWordsCtag(sentence); |
34 | 41 | addMentionsByNamedEntities(sentence); |
35 | - addMentionsByGroups(sentence); | |
42 | + addMentionsByGroups(sentence, valence, complexPreps); | |
36 | 43 | addSpeakerMentionsInSpoken(sentence); |
37 | 44 | |
38 | 45 | // zero subject detection |
... | ... | @@ -41,6 +48,11 @@ public class Detector { |
41 | 48 | // removing mentions |
42 | 49 | removeTo(sentence); |
43 | 50 | Cleaner.cleanUnnecessarySentenceMentions(sentence); |
51 | + //Cleaner.cleanQubs(sentence); | |
52 | + //Cleaner.cleanPreps(sentence); | |
53 | + //Cleaner.cleanComplexPreps(sentence, complexPreps); | |
54 | + Cleaner.cleanFrazeos(sentence); | |
55 | + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
44 | 56 | |
45 | 57 | // updating mention heads |
46 | 58 | updateMentionHeads(sentence); |
... | ... | @@ -95,16 +107,468 @@ public class Detector { |
95 | 107 | * |
96 | 108 | * @param sentence |
97 | 109 | */ |
98 | - private static void addMentionsByGroups(Sentence sentence) { | |
99 | - for (SyntacticGroup group : sentence.getGroups()) { | |
110 | + private static void addMentionsByGroups(Sentence sentence, | |
111 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
112 | + ArrayList<String> complexPreps) { | |
113 | + List<SyntacticGroup> groups = sentence.getGroups(); | |
114 | + for (int i = 0; i < groups.size(); i++) { | |
115 | + SyntacticGroup thisGroup = groups.get(i); | |
116 | + | |
117 | + /*SyntacticGroup nearPrepNG = null; | |
118 | + SyntacticGroup nextNG = null;*/ | |
119 | + | |
120 | + SyntacticGroup nextGroup = thisGroup.getFollowingGroup(); | |
121 | + | |
122 | + /*if (thisGroup.getType().startsWith("NG")) { | |
123 | + nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(), | |
124 | + sentence); | |
125 | + nextNG = thisGroup.getNextNG(); | |
126 | + }*/ | |
127 | + | |
128 | + /*if (nextNG != null) { | |
129 | + int prepStart = thisGroup.getSentencePositionEnd() + 1; | |
130 | + int prepEnd = nextNG.getSentencePositionStart() - 1; | |
131 | + String prep = sentence.getTextInsideSpan(prepStart, prepEnd); | |
132 | + if (complexPreps.contains(prep)) { | |
133 | + String cos = ""; | |
134 | + } | |
135 | + }*/ | |
136 | + | |
137 | + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | |
138 | + //!isPartOfPrepNG(thisGroup, sentence) && | |
139 | + //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | |
140 | + precedingWordIsVerb(thisGroup, sentence) && | |
141 | + //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
142 | + !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
143 | + !sameSemanticHeads(thisGroup, nearPrepNG)) { | |
144 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
145 | + List<Token> segments = thisGroup.getTokens(); | |
146 | + segments.addAll(nearPrepNG.getTokens()); | |
147 | + | |
148 | + sentence.addMention(new Mention(segments, heads)); | |
149 | + }*/ | |
150 | + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | |
151 | + // !precedingWordIsVerb(thisGroup, sentence) && | |
152 | + !isPartOfPrepNG(thisGroup, sentence) && | |
153 | + getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | |
154 | + //!precedingWordIsVerb(thisGroup, sentence) && | |
155 | + !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
156 | + //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
157 | + !sameSemanticHeads(thisGroup, nearPrepNG)) { | |
158 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
159 | + List<Token> segments = thisGroup.getTokens(); | |
160 | + segments.addAll(nearPrepNG.getTokens()); | |
161 | + | |
162 | + sentence.addMention(new Mention(segments, heads)); | |
163 | + }*/ | |
164 | + if (thisGroup.getType().startsWith("NG") && | |
165 | + nextGroup != null && nextGroup.getType().startsWith("PrepNG") && | |
166 | + NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) { | |
167 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
168 | + List<Token> segments = new ArrayList<Token>(); | |
169 | + segments.addAll(thisGroup.getTokens()); | |
170 | + segments.addAll(nextGroup.getTokens()); | |
171 | + | |
172 | + sentence.addMention(new Mention(segments, heads)); | |
173 | + } else if (thisGroup.getType().startsWith("NG") && nextGroup != null && | |
174 | + nextGroup.getType().startsWith("NG") && | |
175 | + NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) | |
176 | + ) { | |
177 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
178 | + List<Token> segments = new ArrayList<Token>(); | |
179 | + segments.addAll(thisGroup.getTokens()); | |
180 | + segments.addAll(nextGroup.getTokens()); | |
181 | + | |
182 | + sentence.addMention(new Mention(segments, heads)); | |
183 | + } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null && | |
184 | + NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) { | |
185 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
186 | + | |
187 | + List<Token> segments = new ArrayList<Token>(); | |
188 | + segments.addAll(thisGroup.getTokens()); | |
189 | + | |
190 | + int prepStart = thisGroup.getSentencePositionEnd() + 1; | |
191 | + int prepEnd = nextNG.getSentencePositionStart() - 1; | |
192 | + ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd); | |
193 | + segments.addAll(prepSegments); | |
194 | + | |
195 | + segments.addAll(nextNG.getTokens()); | |
196 | + | |
197 | + sentence.addMention(new Mention(segments, heads)); | |
198 | + }*/ | |
199 | + //else if // NG + im./pt. NG | |
200 | + // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka | |
201 | + // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName) | |
202 | + else if (thisGroup.getType().startsWith("NG")) { | |
203 | + List<Token> segments = thisGroup.getTokens(); | |
204 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
205 | + | |
206 | + sentence.addMention(new Mention(segments, heads)); | |
207 | + } | |
208 | + } | |
209 | + | |
210 | + // oryginalna wersja | |
211 | + /*for (SyntacticGroup group : sentence.getGroups()) { | |
100 | 212 | if (group.getType().startsWith("NG")) { |
101 | 213 | List<Token> segments = group.getTokens(); |
102 | 214 | List<Token> heads = group.getSemanticHeadTokens(); |
103 | 215 | |
104 | 216 | sentence.addMention(new Mention(segments, heads)); |
105 | 217 | } |
106 | - } | |
218 | + }*/ | |
219 | + } | |
220 | + | |
221 | + private static boolean followingWordIsInf(SyntacticGroup group, | |
222 | + Sentence sentence) { | |
223 | + int followingTokenPosition = group.getSentencePositionEnd() + 1; | |
224 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
225 | + int firstWordPosition = word.getSentencePositionStart(); | |
226 | + if (followingTokenPosition == firstWordPosition && | |
227 | + (word.getCtag().equals("Inf"))) { | |
228 | + return true; | |
229 | + } | |
230 | + } | |
231 | + | |
232 | + return false; | |
233 | + } | |
234 | + | |
235 | + private static SyntacticGroup getFollowingPrepNGs(int sentencePosition, | |
236 | + Sentence sentence) { | |
237 | + SyntacticGroup largestGroup = null; | |
238 | + int nextTokenPosition = sentencePosition + 1; | |
239 | + for (SyntacticGroup group : sentence.getGroups()) { | |
240 | + if (group.getType().startsWith("PrepNG") && | |
241 | + group.getSentencePositionStart() == nextTokenPosition) { | |
242 | + if (largestGroup == null || | |
243 | + largestGroup.getTokens().size() < group.getTokens().size()) { | |
244 | + largestGroup = group; | |
245 | + } | |
246 | + } | |
247 | + } | |
248 | + return largestGroup; | |
249 | + } | |
250 | + | |
251 | + private static boolean isPartOfPrepNG(SyntacticGroup NGGroup, | |
252 | + Sentence sentence) { | |
253 | + int NGGroupStart = NGGroup.getSentencePositionStart(); | |
254 | + int NGGroupEnd = NGGroup.getSentencePositionEnd(); | |
255 | + for (SyntacticGroup group : sentence.getGroups()) { | |
256 | + if (group.getType().startsWith("PrepNG") && | |
257 | + group.getSentencePositionStart() <= NGGroupStart && | |
258 | + group.getSentencePositionEnd() >= NGGroupEnd) { | |
259 | + return true; | |
260 | + } | |
261 | + } | |
262 | + return false; | |
263 | + } | |
264 | + | |
265 | + private static boolean precedingWordIsVerb(SyntacticGroup group, | |
266 | + Sentence sentence) { | |
267 | + int precedingTokenPosition = group.getSentencePositionStart() - 1; | |
268 | + if(isPartOfPrepNG(group, sentence)) { | |
269 | + SyntacticGroup parentGroup = getParentPrepNG(group, sentence); | |
270 | + precedingTokenPosition = parentGroup.getSentencePositionStart() - 1; | |
271 | + } | |
272 | + | |
273 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
274 | + int lastWordPosition = word.getSentencePositionEnd(); | |
275 | + if (precedingTokenPosition == lastWordPosition && | |
276 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
277 | + return true; | |
278 | + } | |
279 | + } | |
280 | + return false; | |
281 | + } | |
282 | + | |
283 | + // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem? | |
284 | + // czy prep moze sie skladac z wiecej niz jednego segmentu? | |
285 | + // dopasowywac refla i recip do sie spejdowego | |
286 | + private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup, | |
287 | + SyntacticGroup PrepNGGroup, Sentence sentence, | |
288 | + Map<String,ArrayList<String>> walentyMapping) { | |
289 | + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | |
290 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
291 | + int lastWordPosition = word.getSentencePositionEnd(); | |
292 | + if (precedingTokenPosition == lastWordPosition && | |
293 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
294 | + String verb = word.getBase(); | |
295 | + if (!walentyMapping.containsKey(verb)) { | |
296 | + return true; | |
297 | + } else { | |
298 | + SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | |
299 | + | |
300 | + if (prepWord.getTokens().size() == 1) { | |
301 | + Token prep = prepWord.getTokens().get(0); | |
302 | + String prepBase = prep.getBase(); | |
303 | + // sprawdzic czy glowa moze miec wiele tokenow | |
304 | + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | |
305 | + ArrayList<String> prepnps = getPrepnps(prepBase, prepCase); | |
306 | + | |
307 | + ArrayList<String> schemata = walentyMapping.get(verb); | |
308 | + for (String schema : schemata) { | |
309 | + for (String prepnp : prepnps) { | |
310 | + if (schema.contains(prepnp)) { | |
311 | + return true; | |
312 | + } | |
313 | + } | |
314 | + } | |
315 | + } else if (prepWord.getTokens().size() > 1) { | |
316 | + String prepOrth = prepWord.getOrth().toLowerCase(); | |
317 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | |
318 | + ArrayList<String> schemata = walentyMapping.get(verb); | |
319 | + for (String schema : schemata) { | |
320 | + if (schema.contains(comprepnp)) { | |
321 | + return true; | |
322 | + } | |
323 | + } | |
324 | + | |
325 | + } | |
326 | + | |
327 | + | |
328 | + } | |
329 | + } | |
330 | + } | |
331 | + return false; | |
332 | + } | |
333 | + | |
334 | + private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup, | |
335 | + SyntacticGroup PrepNGGroup, Sentence sentence, | |
336 | + Map<String,ArrayList<String>> walentyMapping) { | |
337 | + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | |
338 | + if(isPartOfPrepNG(NGGroup, sentence)) { | |
339 | + SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence); | |
340 | + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | |
341 | + } | |
342 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
343 | + int lastWordPosition = word.getSentencePositionEnd(); | |
344 | + if (precedingTokenPosition == lastWordPosition && | |
345 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
346 | + if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) { | |
347 | + return true; | |
348 | + } | |
349 | + if (!walentyMapping.containsKey(word.getBase())) { | |
350 | + return true; | |
351 | + } | |
352 | + | |
353 | + } | |
354 | + } | |
355 | + return false; | |
107 | 356 | } |
357 | + | |
358 | + private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup, | |
359 | + SyntacticGroup PrepNGGroup, Sentence sentence, | |
360 | + Map<String,ArrayList<String>> walentyMapping) { | |
361 | + String verbBase = verb.getBase(); | |
362 | + if (!walentyMapping.containsKey(verbBase)) { | |
363 | + return true; | |
364 | + } else { | |
365 | + ArrayList<String> schemata = walentyMapping.get(verbBase); | |
366 | + | |
367 | + // PrepNG + PrepNG | |
368 | + if (isPartOfPrepNG(NGGroup, sentence)) { | |
369 | + SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence); | |
370 | + ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations(); | |
371 | + ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations(); | |
372 | + for (String schema : schemata) { | |
373 | + if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) { | |
374 | + return true; | |
375 | + } | |
376 | + } | |
377 | + } | |
378 | + | |
379 | + // NG + PrepNG | |
380 | + else { | |
381 | + ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations(); | |
382 | + ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations(); | |
383 | + for (String schema : schemata) { | |
384 | + if (isProperSchema(schema, NGRealizations, prepNGRealizations)) { | |
385 | + return true; | |
386 | + } | |
387 | + } | |
388 | + } | |
389 | + } | |
390 | + return false; | |
391 | + } | |
392 | + | |
393 | + private static boolean isProperSchema(String schema, ArrayList<String> group1Types, | |
394 | + ArrayList<String> group2Types) { | |
395 | + for (String group1Type : group1Types) { | |
396 | + if (schema.contains(group1Type)) { | |
397 | + for (String group2Type : group2Types) { | |
398 | + if (schema.contains(group2Type)) { | |
399 | + return true; | |
400 | + } | |
401 | + } | |
402 | + } | |
403 | + } | |
404 | + return false; | |
405 | + } | |
406 | + | |
407 | + private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup, | |
408 | + Sentence sentence) { | |
409 | + SyntacticGroup parentPrepNG = null; | |
410 | + int NGGroupStart = NGGroup.getSentencePositionStart(); | |
411 | + int NGGroupEnd = NGGroup.getSentencePositionEnd(); | |
412 | + for (SyntacticGroup group : sentence.getGroups()) { | |
413 | + if (group.getType().startsWith("PrepNG") && | |
414 | + group.getSentencePositionStart() <= NGGroupStart && | |
415 | + group.getSentencePositionEnd() >= NGGroupEnd) { | |
416 | + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { | |
417 | + parentPrepNG = group; | |
418 | + } | |
419 | + } | |
420 | + } | |
421 | + return parentPrepNG; | |
422 | + } | |
423 | + | |
424 | + private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup, | |
425 | + SyntacticGroup PrepNGGroup, Sentence sentence, | |
426 | + Map<String,ArrayList<String>> walentyMapping) { | |
427 | + Token NGHead = NGGroup.getSemanticHeadTokens().get(0); | |
428 | + | |
429 | + String NGHeadBase = NGHead.getBase(); | |
430 | + | |
431 | + if (!walentyMapping.containsKey(NGHeadBase)) { | |
432 | + return false; | |
433 | + } else { | |
434 | + SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | |
435 | + | |
436 | + if (prepWord.getTokens().size() == 1) { | |
437 | + Token prep = prepWord.getTokens().get(0); | |
438 | + String prepBase = prep.getBase(); | |
439 | + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | |
440 | + String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase); | |
441 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
442 | + for (String schema : schemata) { | |
443 | + if (schemaContains(schema, prepnp)) { | |
444 | + return true; | |
445 | + } | |
446 | + } | |
447 | + } else if (prepWord.getTokens().size() > 1) { | |
448 | + String prepOrth = prepWord.getOrth().toLowerCase(); | |
449 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | |
450 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
451 | + for (String schema : schemata) { | |
452 | + if (schemaContains(schema, comprepnp)) { | |
453 | + return true; | |
454 | + } | |
455 | + } | |
456 | + | |
457 | + } | |
458 | + | |
459 | + } | |
460 | + return false; | |
461 | + } | |
462 | + | |
463 | + private static boolean NGNGValenceCompatibility(SyntacticGroup NG1, | |
464 | + SyntacticGroup NG2, Sentence sentence, | |
465 | + Map<String,ArrayList<String>> walentyMapping) { | |
466 | + Token NG1Head = NG1.getSemanticHeadTokens().get(0); | |
467 | + | |
468 | + String NGHeadBase = NG1Head.getBase(); | |
469 | + | |
470 | + if (!walentyMapping.containsKey(NGHeadBase)) { | |
471 | + return false; | |
472 | + } else { | |
473 | + ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); | |
474 | + | |
475 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
476 | + for (String real : NG2realizations) { | |
477 | + for (String schema : schemata) { | |
478 | + if (schemaContains(schema, real)) { | |
479 | + return true; | |
480 | + } | |
481 | + } | |
482 | + } | |
483 | + } | |
484 | + return false; | |
485 | + } | |
486 | + | |
487 | + private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1, | |
488 | + SyntacticGroup NGGroup2, Sentence sentence, | |
489 | + Map<String,ArrayList<String>> walentyMapping) { | |
490 | + | |
491 | + Token NGHead = NGGroup1.getSemanticHeadTokens().get(0); | |
492 | + String NGHeadBase = NGHead.getBase(); | |
493 | + | |
494 | + if (!walentyMapping.containsKey(NGHeadBase)) { | |
495 | + return false; | |
496 | + } else { | |
497 | + int prepStart = NGGroup1.getSentencePositionEnd() + 1; | |
498 | + int prepEnd = NGGroup2.getSentencePositionStart() - 1; | |
499 | + String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd); | |
500 | + String comprepnp = String.format("comprepnp(%s)", complexPrep); | |
501 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
502 | + for (String schema : schemata) { | |
503 | + if (schemaContains(schema, comprepnp)) { | |
504 | + return true; | |
505 | + } | |
506 | + } | |
507 | + } | |
508 | + return false; | |
509 | + } | |
510 | + | |
511 | + private static boolean schemaContains(String schema, String phraseType) { | |
512 | + for (String position : schema.split("\\s\\+\\s")) { | |
513 | + position = position.trim(); | |
514 | + position = position.substring(1, position.length()-1); | |
515 | + for (String phrT : position.split(";")) { | |
516 | + if (phrT.equals(phraseType)) { | |
517 | + return true; | |
518 | + } | |
519 | + } | |
520 | + } | |
521 | + return false; | |
522 | + } | |
523 | + | |
524 | + private static boolean schemaContainsType(String schema, String type) { | |
525 | + // to lepiej dziala dla rzeczownikow | |
526 | + for (String position : schema.split("\\s\\+\\s")) { | |
527 | + position = position.trim(); | |
528 | + position = position.substring(1, position.length()-1); | |
529 | + for (String phrT : position.split(";")) { | |
530 | + | |
531 | + if (phrT.startsWith(type+"(")) { | |
532 | + return true; | |
533 | + } | |
534 | + } | |
535 | + } | |
536 | + return false; | |
537 | + } | |
538 | + | |
539 | + | |
540 | + // compar ?? | |
541 | + private static ArrayList<String> getPrepnps(String prepBase, String prepCase) { | |
542 | + ArrayList<String> prepnps = new ArrayList<String>(); | |
543 | + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); | |
544 | + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { | |
545 | + prepnps.add(String.format("prepnp(%s,str)", prepBase)); | |
546 | + } | |
547 | + if (prepCase.equals("gen") || prepCase.equals("acc")) { | |
548 | + prepnps.add(String.format("prepnp(%s,part)", prepBase)); | |
549 | + } | |
550 | + return prepnps; | |
551 | + } | |
552 | + | |
553 | + // eliminuje "od wsi do wsi" | |
554 | + private static boolean sameSemanticHeads(SyntacticGroup group1, | |
555 | + SyntacticGroup group2) { | |
556 | + | |
557 | + List<Token> group1HeadTokens = group1.getSemanticHeadTokens(); | |
558 | + List<Token> group2HeadTokens = group2.getSemanticHeadTokens(); | |
559 | + if (group1HeadTokens.size() != group2HeadTokens.size()) { | |
560 | + return false; | |
561 | + } | |
562 | + | |
563 | + for (int i=0; i < group1HeadTokens.size(); i++) { | |
564 | + if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) { | |
565 | + return false; | |
566 | + } | |
567 | + } | |
568 | + | |
569 | + return true; | |
570 | + } | |
571 | + | |
108 | 572 | |
109 | 573 | /** |
110 | 574 | * Wyszukuję i oznaczam wszystkie NER |
... | ... | @@ -151,8 +615,9 @@ public class Detector { |
151 | 615 | * @param sentence |
152 | 616 | */ |
153 | 617 | private static void addMentionsByTokenCtag(Sentence sentence) { |
154 | - for (Token token : sentence) | |
618 | + for (Token token : sentence) { | |
155 | 619 | if (token.getCtag().matches(Constants.MORPHO_CTAGS)) |
156 | 620 | sentence.addMention(new Mention(token)); |
621 | + } | |
157 | 622 | } |
158 | 623 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | |
3 | 3 | import java.util.ArrayList; |
4 | +import java.util.Arrays; | |
4 | 5 | import java.util.List; |
5 | 6 | |
6 | 7 | /** |
... | ... | @@ -203,4 +204,83 @@ public class Mention implements Comparable<Mention> { |
203 | 204 | public boolean isZeroSubject() { |
204 | 205 | return isZeroSubject; |
205 | 206 | } |
207 | + | |
208 | + public int getSentencePositionStart() { | |
209 | + Token startToken = this.getFirstSegment(); | |
210 | + return startToken.getSentencePosition(); | |
211 | + } | |
212 | + | |
213 | + public int getSentencePositionEnd() { | |
214 | + Token endToken = this.getLastSegment(); | |
215 | + return endToken.getSentencePosition(); | |
216 | + } | |
217 | + | |
218 | + public boolean isPartOfQub() { | |
219 | + if (this.segments.size() == 1) { | |
220 | + Sentence sentence = this.segments.get(0).getSentence(); | |
221 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
222 | + if (word.getTokens().contains(this.segments.get(0)) && | |
223 | + word.getCtag().equals("Qub")) { | |
224 | + return true; | |
225 | + } | |
226 | + } | |
227 | + } | |
228 | + return false; | |
229 | + } | |
230 | + | |
231 | + public boolean isPartOfPrep() { | |
232 | + if (this.segments.size() == 1) { | |
233 | + Sentence sentence = this.segments.get(0).getSentence(); | |
234 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
235 | + if (word.getTokens().contains(this.segments.get(0)) && | |
236 | + word.getCtag().equals("Prep")) { | |
237 | + return true; | |
238 | + } | |
239 | + } | |
240 | + } | |
241 | + return false; | |
242 | + } | |
243 | + | |
244 | + private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj", | |
245 | + "Adj", "Conj", "Comp"); | |
246 | + | |
247 | + public boolean isPartOfFrazeo() { | |
248 | + if (this.segments.size() == 1) { | |
249 | + Sentence sentence = this.segments.get(0).getSentence(); | |
250 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
251 | + if (word.getTokens().contains(this.segments.get(0)) && | |
252 | + FRAZEOS.contains(word.getCtag())) { | |
253 | + return true; | |
254 | + } | |
255 | + } | |
256 | + } | |
257 | + return false; | |
258 | + } | |
259 | + | |
260 | + public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) { | |
261 | + if (this.segments.size() == 1) { | |
262 | + Sentence sentence = this.segments.get(0).getSentence(); | |
263 | + if (this.getSentencePositionStart() - 1 >= 0) { | |
264 | + String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | |
265 | + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | |
266 | + String possiblePrep = String.format("%s %s", prep, noun); | |
267 | + if (complexPreps.contains(possiblePrep)) { | |
268 | + return true; | |
269 | + } | |
270 | + } | |
271 | + | |
272 | + if (this.getSentencePositionStart() - 1 >= 0 && | |
273 | + this.getSentencePositionStart() + 1 < sentence.size()) { | |
274 | + String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | |
275 | + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | |
276 | + String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth(); | |
277 | + String possiblePrep = String.format("%s %s %s", prep1, noun, prep2); | |
278 | + if (complexPreps.contains(possiblePrep)) { | |
279 | + return true; | |
280 | + } | |
281 | + } | |
282 | + } | |
283 | + return false; | |
284 | + } | |
285 | + | |
206 | 286 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
... | ... | @@ -109,4 +109,118 @@ public class Sentence extends ArrayList<Token> { |
109 | 109 | public void addNamedEntity(NamedEntity namedEntity) { |
110 | 110 | namedEntities.add(namedEntity); |
111 | 111 | } |
112 | + | |
113 | + public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) { | |
114 | + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | |
115 | + for (SyntacticGroup group : this.syntacticGroups) { | |
116 | + if (group.getSentencePositionStart() >= start && | |
117 | + group.getSentencePositionEnd() <= end) { | |
118 | + if (!(group.getSentencePositionStart() == start && | |
119 | + group.getSentencePositionEnd() == end)) { | |
120 | + groupsAtSpan.add(group); | |
121 | + } | |
122 | + } | |
123 | + } | |
124 | + return groupsAtSpan; | |
125 | + } | |
126 | + | |
127 | + public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) { | |
128 | + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | |
129 | + for (SyntacticGroup group : this.syntacticGroups) { | |
130 | + | |
131 | + if (group.getSentencePositionStart() >= start && | |
132 | + group.getSentencePositionEnd() <= end) { | |
133 | + if (!(group.getSentencePositionStart() == start && | |
134 | + group.getSentencePositionEnd() == end)) { | |
135 | + groupsAtSpan.add(group); | |
136 | + } | |
137 | + } | |
138 | + } | |
139 | + return groupsAtSpan; | |
140 | + } | |
141 | + | |
142 | + public SyntacticGroup getFirstGroup(int start, int end) { | |
143 | + SyntacticGroup largestGroup = null; | |
144 | + int step = start; | |
145 | + while (step <= end && largestGroup == null) { | |
146 | + largestGroup = getLargestGroupOnStartPoint(step, end); | |
147 | + step++; | |
148 | + } | |
149 | + return largestGroup; | |
150 | + } | |
151 | + | |
152 | + private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { | |
153 | + SyntacticGroup largestGroup = null; | |
154 | + for (SyntacticGroup group : this.getGroups()) { | |
155 | + int groupStart = group.getSentencePositionStart(); | |
156 | + int groupEnd = group.getSentencePositionEnd(); | |
157 | + if (groupStart == start && groupEnd <= end && | |
158 | + !(groupStart == start && groupEnd == end) && | |
159 | + (largestGroup == null || | |
160 | + largestGroup.getTokens().size() < group.getTokens().size())) { | |
161 | + largestGroup = group; | |
162 | + } | |
163 | + } | |
164 | + return largestGroup; | |
165 | + } | |
166 | + | |
167 | + public SyntacticGroup getLastGroup(int start, int end) { | |
168 | + SyntacticGroup largestGroup = null; | |
169 | + int step = end; | |
170 | + while (step != start && largestGroup == null) { | |
171 | + largestGroup = getLargestGroupOnEndPoint(start, step); | |
172 | + step--; | |
173 | + } | |
174 | + return largestGroup; | |
175 | + } | |
176 | + | |
177 | + private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { | |
178 | + SyntacticGroup largestGroup = null; | |
179 | + for (SyntacticGroup group : this.getGroups()) { | |
180 | + int groupStart = group.getSentencePositionStart(); | |
181 | + int groupEnd = group.getSentencePositionEnd(); | |
182 | + if (groupEnd == end && groupStart >= start && | |
183 | + !(groupStart == start && groupEnd == end) && | |
184 | + (largestGroup == null || | |
185 | + largestGroup.getTokens().size() < group.getTokens().size())) { | |
186 | + largestGroup = group; | |
187 | + } | |
188 | + } | |
189 | + return largestGroup; | |
190 | + } | |
191 | + | |
192 | + public ArrayList<Mention> getMentionsInsideSpan(int start, int end) { | |
193 | + ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>(); | |
194 | + for (Mention mention : this.mentions) { | |
195 | + if (mention.getSentencePositionStart() >= start && | |
196 | + mention.getSentencePositionEnd() <= end) { | |
197 | + mentionsAtSpan.add(mention); | |
198 | + } | |
199 | + } | |
200 | + return mentionsAtSpan; | |
201 | + } | |
202 | + | |
203 | + public String getTextInsideSpan(int start, int end) { | |
204 | + String text = ""; | |
205 | + int step = start; | |
206 | + while (step <= end) { | |
207 | + if (step != start) { | |
208 | + text += " "; | |
209 | + } | |
210 | + text += this.get(step).getOrth(); | |
211 | + step++; | |
212 | + } | |
213 | + return text; | |
214 | + } | |
215 | + | |
216 | + public ArrayList<Token> getSegmentsInsideSpan(int start, int end) { | |
217 | + ArrayList<Token> tokensAtSpan = new ArrayList<Token>(); | |
218 | + int step = start; | |
219 | + while (step <= end) { | |
220 | + tokensAtSpan.add(this.get(step)); | |
221 | + step++; | |
222 | + } | |
223 | + return tokensAtSpan; | |
224 | + } | |
225 | + | |
112 | 226 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | |
3 | +import java.util.ArrayList; | |
3 | 4 | import java.util.Iterator; |
4 | 5 | import java.util.List; |
5 | 6 | |
... | ... | @@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
53 | 54 | |
54 | 55 | return getType().compareTo(o.getType()); |
55 | 56 | } |
57 | + | |
58 | + public int getSentencePositionStart() { | |
59 | + Token startToken = tokens.get(0); | |
60 | + return startToken.getSentencePosition(); | |
61 | + } | |
62 | + | |
63 | + public int getSentencePositionEnd() { | |
64 | + Token endToken = tokens.get(tokens.size()-1); | |
65 | + return endToken.getSentencePosition(); | |
66 | + } | |
67 | + | |
68 | + | |
69 | + public SyntacticWord getFirstWord() { | |
70 | + SyntacticWord firstWord = null; | |
71 | + Token startToken = tokens.get(0); | |
72 | + Sentence sentence = startToken.getSentence(); | |
73 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
74 | + if(startToken.compareTo(word.getTokens().get(0)) == 0 && | |
75 | + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { | |
76 | + firstWord = word; | |
77 | + } | |
78 | + } | |
79 | + return firstWord; | |
80 | + } | |
81 | + | |
82 | + // NG and PrepNG only now | |
83 | + public ArrayList<String> getWalentyRealizations() { | |
84 | + ArrayList<String> realizations = new ArrayList<String>(); | |
85 | + if (this.type.startsWith("PrepNG")) { | |
86 | + SyntacticWord prepWord = this.getFirstWord(); | |
87 | + if (prepWord.getTokens().size() == 1) { | |
88 | + | |
89 | + Token prep = prepWord.getTokens().get(0); | |
90 | + String prepBase = prep.getBase(); | |
91 | + String prepCase = this.getSemanticHeadTokens().get(0).getCase(); | |
92 | + realizations.addAll(getPrepnps(prepBase, prepCase)); | |
93 | + | |
94 | + } else if (prepWord.getTokens().size() > 1) { | |
95 | + | |
96 | + String prepOrth = prepWord.getOrth().toLowerCase(); | |
97 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | |
98 | + realizations.add(comprepnp); | |
99 | + | |
100 | + } | |
101 | + } else if (this.type.startsWith("NG")) { | |
102 | + String npCase = this.getSemanticHeadTokens().get(0).getCase(); | |
103 | + realizations.addAll(getNps(npCase)); | |
104 | + } | |
105 | + return realizations; | |
106 | + } | |
107 | + | |
108 | + // compar ?? | |
109 | + private ArrayList<String> getPrepnps(String prepBase, String prepCase) { | |
110 | + ArrayList<String> prepnps = new ArrayList<String>(); | |
111 | + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); | |
112 | + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { | |
113 | + prepnps.add(String.format("prepnp(%s,str)", prepBase)); | |
114 | + } | |
115 | + if (prepCase.equals("gen") || prepCase.equals("acc")) { | |
116 | + prepnps.add(String.format("prepnp(%s,part)", prepBase)); | |
117 | + } | |
118 | + return prepnps; | |
119 | + } | |
120 | + | |
121 | + private ArrayList<String> getNps(String npCase) { | |
122 | + ArrayList<String> nps = new ArrayList<String>(); | |
123 | + nps.add(String.format("np(%s)", npCase)); | |
124 | + if (npCase.equals("nom") || npCase.equals("gen") || npCase.equals("acc")) { | |
125 | + nps.add(String.format("np(str)")); | |
126 | + } | |
127 | + if (npCase.equals("gen") || npCase.equals("acc")) { | |
128 | + nps.add(String.format("np(part)")); | |
129 | + } | |
130 | + return nps; | |
131 | + } | |
132 | + | |
133 | + public boolean precedingWordIsVerb() { | |
134 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
135 | + int precedingTokenPosition = this.getSentencePositionStart() - 1; | |
136 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
137 | + int lastWordPosition = word.getSentencePositionEnd(); | |
138 | + if (precedingTokenPosition == lastWordPosition && | |
139 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
140 | + return true; | |
141 | + } | |
142 | + } | |
143 | + return false; | |
144 | + } | |
145 | + | |
146 | + public SyntacticGroup getNextNG() { | |
147 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
148 | + int thisGroupEnd = this.getSentencePositionEnd(); | |
149 | + int sentenceLength = sentence.size(); | |
150 | + | |
151 | + SyntacticGroup nextNG = null; | |
152 | + for (int step = thisGroupEnd; step < sentenceLength; step++) { | |
153 | + nextNG = sentence.getFirstGroup(step, sentenceLength); | |
154 | + if (nextNG != null && nextNG.type.startsWith("NG") && | |
155 | + this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) { | |
156 | + break; | |
157 | + } else { | |
158 | + nextNG = null; | |
159 | + } | |
160 | + } | |
161 | + return nextNG; | |
162 | + } | |
163 | + | |
164 | + public SyntacticGroup getFollowingGroup() { | |
165 | + SyntacticGroup largestGroup = null; | |
166 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
167 | + int nextTokenPosition = this.getSentencePositionEnd() + 1; | |
168 | + for (SyntacticGroup group : sentence.getGroups()) { | |
169 | + if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) && | |
170 | + group.getSentencePositionStart() == nextTokenPosition) { | |
171 | + if (largestGroup == null || | |
172 | + largestGroup.getTokens().size() < group.getTokens().size()) { | |
173 | + largestGroup = group; | |
174 | + } | |
175 | + } | |
176 | + } | |
177 | + return largestGroup; | |
178 | + } | |
179 | + | |
180 | + public SyntacticWord getPrecedingVerb() { | |
181 | + int precedingTokenPosition = this.getSentencePositionStart() - 1; | |
182 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
183 | + if(this.isPartOfPrepNG()) { | |
184 | + SyntacticGroup parentNGGroup = this.getParentPrepNG(); | |
185 | + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | |
186 | + } | |
187 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
188 | + int lastWordPosition = word.getSentencePositionEnd(); | |
189 | + if (precedingTokenPosition == lastWordPosition && | |
190 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
191 | + return word; | |
192 | + } | |
193 | + } | |
194 | + return null; | |
195 | + } | |
196 | + | |
197 | + private boolean isPartOfPrepNG() { | |
198 | + int NGGroupStart = this.getSentencePositionStart(); | |
199 | + int NGGroupEnd = this.getSentencePositionEnd(); | |
200 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
201 | + for (SyntacticGroup group : sentence.getGroups()) { | |
202 | + if (group.getType().startsWith("PrepNG") && | |
203 | + group.getSentencePositionStart() <= NGGroupStart && | |
204 | + group.getSentencePositionEnd() >= NGGroupEnd) { | |
205 | + return true; | |
206 | + } | |
207 | + } | |
208 | + return false; | |
209 | + } | |
210 | + | |
211 | + private SyntacticGroup getParentPrepNG() { | |
212 | + SyntacticGroup parentPrepNG = null; | |
213 | + int NGGroupStart = this.getSentencePositionStart(); | |
214 | + int NGGroupEnd = this.getSentencePositionEnd(); | |
215 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
216 | + for (SyntacticGroup group : sentence.getGroups()) { | |
217 | + if (group.getType().startsWith("PrepNG") && | |
218 | + group.getSentencePositionStart() <= NGGroupStart && | |
219 | + group.getSentencePositionEnd() >= NGGroupEnd) { | |
220 | + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { | |
221 | + parentPrepNG = group; | |
222 | + } | |
223 | + } | |
224 | + } | |
225 | + return parentPrepNG; | |
226 | + } | |
227 | + | |
56 | 228 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
... | ... | @@ -6,11 +6,16 @@ import java.util.List; |
6 | 6 | |
7 | 7 | public class SyntacticWord implements Comparable<SyntacticWord> { |
8 | 8 | |
9 | + private String base; | |
9 | 10 | private String ctag; |
11 | + private String orth; | |
10 | 12 | private List<Token> tokens = new ArrayList<>(); |
11 | 13 | |
12 | - public SyntacticWord(String ctag, List<Token> tokens) { | |
/**
 * Creates a syntactic word.
 *
 * @param ctag   tag of the word
 * @param tokens tokens the word consists of; the list is stored as given, not copied
 * @param base   base form of the word
 * @param orth   orthographic form of the word
 */
public SyntacticWord(String ctag, List<Token> tokens,
        String base, String orth) {
    this.ctag = ctag;
    this.tokens = tokens;
    this.base = base;
    this.orth = orth;
}
16 | 21 | |
... | ... | @@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable<SyntacticWord> { |
39 | 44 | |
40 | 45 | return getCtag().compareTo(o.getCtag()); |
41 | 46 | } |
47 | + | |
48 | + public int getSentencePositionStart() { | |
49 | + Token startToken = tokens.get(0); | |
50 | + return startToken.getSentencePosition(); | |
51 | + } | |
52 | + | |
53 | + public int getSentencePositionEnd() { | |
54 | + Token endToken = tokens.get(tokens.size()-1); | |
55 | + return endToken.getSentencePosition(); | |
56 | + } | |
57 | + | |
58 | + public String getBase() { | |
59 | + return this.base; | |
60 | + } | |
61 | + | |
62 | + public String getOrth() { | |
63 | + return this.orth; | |
64 | + } | |
65 | + | |
66 | + public boolean isVerb() { | |
67 | + if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) { | |
68 | + return true; | |
69 | + } | |
70 | + return false; | |
71 | + } | |
72 | + | |
73 | + public boolean isInterp() { | |
74 | + if (this.ctag.equals("Interp")) { | |
75 | + return true; | |
76 | + } | |
77 | + return false; | |
78 | + } | |
42 | 79 | |
43 | 80 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... | ... | @@ -70,6 +70,7 @@ public class TeiLoader { |
70 | 70 | for (TEIMorph mo : m.getHeadMorphs()) |
71 | 71 | headTokens.add(teiMorph2Segment.get(mo)); |
72 | 72 | s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); |
73 | + System.out.println(tokens.toString()); | |
73 | 74 | } |
74 | 75 | |
75 | 76 | private static void loadSyntacticGroup(Sentence s, TEIGroup g, |
... | ... | @@ -94,10 +95,12 @@ public class TeiLoader { |
94 | 95 | private static void loadSyntacticWord(Sentence s, TEIWord w, |
95 | 96 | Map<TEIMorph, Token> teiMorph2Segment) { |
96 | 97 | String ctag = w.getInterpretation().getCtag(); |
98 | + String base = w.getInterpretation().getBase(); | |
99 | + String orth = w.getOrth(); | |
97 | 100 | List<Token> tokens = new ArrayList<>(); |
98 | 101 | for (TEIMorph m : w.getAllMorphs()) |
99 | 102 | tokens.add(teiMorph2Segment.get(m)); |
100 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
103 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); | |
101 | 104 | } |
102 | 105 | |
103 | 106 | private static void loadNE(Sentence s, TEINamedEntity ne, |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
... | ... | @@ -73,10 +73,12 @@ public class ThriftLoader { |
73 | 73 | private static void loadSyntacticWord(Sentence s, TSyntacticWord w, |
74 | 74 | Map<String, Object> thirftId2Entity, |
75 | 75 | Map<String, Token> thiftTokenId2Token) { |
76 | + String base = w.getChosenInterpretation().getBase(); | |
76 | 77 | String ctag = w.getChosenInterpretation().getCtag(); |
78 | + String orth = w.getOrth(); | |
77 | 79 | List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, |
78 | 80 | thiftTokenId2Token, false); |
79 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
81 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); | |
80 | 82 | } |
81 | 83 | |
82 | 84 | private static void loadNE(Sentence s, TNamedEntity ne, |
... | ... |