Commit 1dc4f9471ae6d1929a5d443457f85a77bd7f6ad4
1 parent
3682bbf2
Added new mention detection rules based on Walenty dictionary.
Showing
9 changed files
with
1166 additions
and
13 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
@@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md; | @@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md; | ||
2 | 2 | ||
3 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
4 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
5 | + | ||
5 | import pl.waw.ipipan.zil.core.md.detection.Detector; | 6 | import pl.waw.ipipan.zil.core.md.detection.Detector; |
6 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | 7 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
7 | import pl.waw.ipipan.zil.core.md.entities.Text; | 8 | import pl.waw.ipipan.zil.core.md.entities.Text; |
@@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | @@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
15 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | 16 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
16 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | 17 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
17 | 18 | ||
19 | +import java.io.BufferedReader; | ||
18 | import java.io.File; | 20 | import java.io.File; |
19 | import java.io.FileInputStream; | 21 | import java.io.FileInputStream; |
20 | import java.io.IOException; | 22 | import java.io.IOException; |
21 | import java.io.InputStream; | 23 | import java.io.InputStream; |
24 | +import java.io.InputStreamReader; | ||
25 | +import java.util.ArrayList; | ||
26 | +import java.util.EnumMap; | ||
27 | +import java.util.HashMap; | ||
28 | +import java.util.Map; | ||
22 | 29 | ||
23 | public class Main { | 30 | public class Main { |
24 | 31 | ||
@@ -26,12 +33,125 @@ public class Main { | @@ -26,12 +33,125 @@ public class Main { | ||
26 | 33 | ||
27 | private static final boolean GZIP_OUTPUT = true; | 34 | private static final boolean GZIP_OUTPUT = true; |
28 | private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | 35 | private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; |
36 | + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt"; | ||
37 | + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt"; | ||
38 | + private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt"; | ||
29 | 39 | ||
30 | private static ZeroSubjectDetector zeroSubjectModel; | 40 | private static ZeroSubjectDetector zeroSubjectModel; |
41 | + | ||
42 | + public static enum ValenceDicts { | ||
43 | + VerbsValence, | ||
44 | + NounsValence | ||
45 | + } | ||
46 | + | ||
47 | + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = | ||
48 | + new EnumMap(ValenceDicts.class); | ||
49 | + | ||
50 | + private static final ArrayList<String> complexPreps; | ||
31 | 51 | ||
static {
    // Load every bundled resource once, at class initialisation time.
    // The statements run in the same order as before: zero-subject model,
    // verb valence, noun valence, complex prepositions.
    zeroSubjectModel = new ZeroSubjectDetector(
            Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL));

    valence.put(ValenceDicts.VerbsValence,
            readWalenty(Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE)));
    valence.put(ValenceDicts.NounsValence,
            readWalenty(Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE)));

    complexPreps = readValues(Main.class.getResourceAsStream(COMPLEX_PREPS));
}
65 | + | ||
66 | + | ||
67 | + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) | ||
68 | + { | ||
69 | + Map<String,ArrayList<String>> map; | ||
70 | + try { | ||
71 | + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); | ||
72 | + map = new HashMap<String,ArrayList<String>>(); | ||
73 | + String line; | ||
74 | + boolean firstLine = true; | ||
75 | + while((line = br.readLine()) != null) { | ||
76 | + if (firstLine) { | ||
77 | + line = line.replace("\uFEFF", ""); // remove BOM character | ||
78 | + firstLine = false; | ||
79 | + } | ||
80 | + | ||
81 | + if (!line.startsWith("%")) { | ||
82 | + String[] lineParts = line.split(":"); | ||
83 | + String lemma = lineParts[0].trim(); | ||
84 | + String schema = lineParts[5].trim(); | ||
85 | + | ||
86 | + if (schema.trim().isEmpty()) { | ||
87 | + continue; | ||
88 | + } | ||
89 | + | ||
90 | + String[] lemmaParts = lemma.split(" "); | ||
91 | + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { | ||
92 | + lemma = lemma + " się"; | ||
93 | + } | ||
94 | + | ||
95 | + ArrayList<String> schemata; | ||
96 | + if (!map.containsKey(lemma)) { | ||
97 | + schemata = new ArrayList<String>(); | ||
98 | + schemata.add(schema); | ||
99 | + map.put(lemma, schemata); | ||
100 | + } else { | ||
101 | + schemata = map.get(lemma); | ||
102 | + schemata.add(schema); | ||
103 | + map.put(lemma, schemata); | ||
104 | + } | ||
105 | + } | ||
106 | + } | ||
107 | + br.close(); | ||
108 | + } catch (IOException ex) { | ||
109 | + ex.printStackTrace(); | ||
110 | + throw new RuntimeException(ex); | ||
111 | + } | ||
112 | + return map; | ||
113 | + } | ||
114 | + | ||
115 | + private static boolean schemaContainsSie(String schema) { | ||
116 | + for (String position : schema.split("\\s\\+\\s")) { | ||
117 | + position = position.trim(); | ||
118 | + position = position.substring(1, position.length()-1); | ||
119 | + for (String phrT : position.split(";")) { | ||
120 | + if (phrT.equals("refl") || phrT.equals("recip")) { | ||
121 | + return true; | ||
122 | + } | ||
123 | + } | ||
124 | + } | ||
125 | + | ||
126 | + return false; | ||
127 | + } | ||
128 | + | ||
129 | + public static ArrayList<String> readValues(InputStream stream) { | ||
130 | + ArrayList<String> values; | ||
131 | + try { | ||
132 | + BufferedReader br=new BufferedReader(new InputStreamReader(stream)); | ||
133 | + values = new ArrayList<String>(); | ||
134 | + String line; | ||
135 | + boolean firstLine = true; | ||
136 | + while((line = br.readLine()) != null) { | ||
137 | + if (firstLine) { | ||
138 | + line = line.replace("\uFEFF", ""); // remove BOM character | ||
139 | + firstLine = false; | ||
140 | + } | ||
141 | + | ||
142 | + if (!line.startsWith("%")) { | ||
143 | + String value = line.trim(); | ||
144 | + if (!value.isEmpty()) { | ||
145 | + values.add(value); | ||
146 | + } | ||
147 | + } | ||
148 | + } | ||
149 | + br.close(); | ||
150 | + } catch (IOException ex) { | ||
151 | + ex.printStackTrace(); | ||
152 | + throw new RuntimeException(ex); | ||
153 | + } | ||
154 | + return values; | ||
35 | } | 155 | } |
36 | 156 | ||
37 | private Main() { | 157 | private Main() { |
@@ -71,6 +191,8 @@ public class Main { | @@ -71,6 +191,8 @@ public class Main { | ||
71 | return; | 191 | return; |
72 | } | 192 | } |
73 | } | 193 | } |
194 | + | ||
195 | + | ||
74 | 196 | ||
75 | int all = 0; | 197 | int all = 0; |
76 | int errors = 0; | 198 | int errors = 0; |
@@ -122,7 +244,7 @@ public class Main { | @@ -122,7 +244,7 @@ public class Main { | ||
122 | */ | 244 | */ |
123 | public static void annotateThriftText(TText thriftText) throws MultiserviceException { | 245 | public static void annotateThriftText(TText thriftText) throws MultiserviceException { |
124 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | 246 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
125 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | 247 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); |
126 | ThriftSaver.updateThriftText(responseText, thriftText); | 248 | ThriftSaver.updateThriftText(responseText, thriftText); |
127 | } | 249 | } |
128 | 250 | ||
@@ -135,7 +257,7 @@ public class Main { | @@ -135,7 +257,7 @@ public class Main { | ||
135 | */ | 257 | */ |
136 | public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | 258 | public static void annotateTeiText(TEICorpusText teiText) throws TEIException { |
137 | Text responseText = TeiLoader.loadTextFromTei(teiText); | 259 | Text responseText = TeiLoader.loadTextFromTei(teiText); |
138 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | 260 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); |
139 | TeiSaver.updateTeiText(responseText, teiText); | 261 | TeiSaver.updateTeiText(responseText, teiText); |
140 | } | 262 | } |
141 | 263 |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 | package pl.waw.ipipan.zil.core.md.detection; | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | ||
3 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
3 | import pl.waw.ipipan.zil.core.md.entities.Mention; | 4 | import pl.waw.ipipan.zil.core.md.entities.Mention; |
4 | import pl.waw.ipipan.zil.core.md.entities.Sentence; | 5 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
6 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | ||
7 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | ||
5 | import pl.waw.ipipan.zil.core.md.entities.Token; | 8 | import pl.waw.ipipan.zil.core.md.entities.Token; |
6 | 9 | ||
10 | +import java.util.ArrayList; | ||
7 | import java.util.Collection; | 11 | import java.util.Collection; |
8 | import java.util.HashSet; | 12 | import java.util.HashSet; |
9 | import java.util.List; | 13 | import java.util.List; |
14 | +import java.util.Map; | ||
10 | import java.util.Set; | 15 | import java.util.Set; |
11 | 16 | ||
12 | public class Cleaner { | 17 | public class Cleaner { |
@@ -125,4 +130,157 @@ public class Cleaner { | @@ -125,4 +130,157 @@ public class Cleaner { | ||
125 | else | 130 | else |
126 | return m1; | 131 | return m1; |
127 | } | 132 | } |
133 | + | ||
134 | + public static void cleanWalentyFramedMentions(Sentence sentence, | ||
135 | + Map<String,ArrayList<String>> verbsValence) { | ||
136 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
137 | + for (Mention mention : sentence.getMentions()) { | ||
138 | + int mentionStart = mention.getFirstSegment().getSentencePosition(); | ||
139 | + int mentionEnd = mention.getLastSegment().getSentencePosition(); | ||
140 | + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); | ||
141 | + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); | ||
142 | + | ||
143 | + if (startGroup != null && endGroup != null | ||
144 | + && startGroup.compareTo(endGroup) != 0) { | ||
145 | + | ||
146 | + SyntacticWord verb = startGroup.getPrecedingVerb(); | ||
147 | + if (verb != null && !verb.getBase().equals("mieć") | ||
148 | + && verbsValence.containsKey(verb.getBase())) { | ||
149 | + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); | ||
150 | + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); | ||
151 | + | ||
152 | + for (String schema : verbsValence.get(verb.getBase())) { | ||
153 | + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { | ||
154 | + mentionsToRemove.add(mention); | ||
155 | + break; | ||
156 | + } | ||
157 | + } | ||
158 | + } | ||
159 | + } | ||
160 | + } | ||
161 | + | ||
162 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
163 | + sentence.removeMention(mentionToRemove); | ||
164 | + } | ||
165 | + } | ||
166 | + | ||
167 | + /*private static void removeWalentyFramedMentions(Sentence sentence, | ||
168 | + ArrayList<Mention> mentions, | ||
169 | + ArrayList<String> schemata) { | ||
170 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
171 | + for (Mention mention : mentions) { | ||
172 | + int mentionStart = mention.getFirstSegment().getSentencePosition(); | ||
173 | + int mentionEnd = mention.getLastSegment().getSentencePosition(); | ||
174 | + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); | ||
175 | + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); | ||
176 | + if (startGroup != null && endGroup != null | ||
177 | + && startGroup.compareTo(endGroup) != 0) { | ||
178 | + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); | ||
179 | + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); | ||
180 | + for (String schema : schemata) { | ||
181 | + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { | ||
182 | + mentionsToRemove.add(mention); | ||
183 | + break; | ||
184 | + } | ||
185 | + } | ||
186 | + } | ||
187 | + } | ||
188 | + | ||
189 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
190 | + sentence.removeMention(mentionToRemove); | ||
191 | + } | ||
192 | + }*/ | ||
193 | + | ||
194 | + private static boolean isProperSchema(String schema, ArrayList<String> group1Types, | ||
195 | + ArrayList<String> group2Types) { | ||
196 | + for (String group1Type : group1Types) { | ||
197 | + for (String group2Type : group2Types) { | ||
198 | + if (schemaContains(schema, group1Type, group2Type)) { | ||
199 | + return true; | ||
200 | + } | ||
201 | + } | ||
202 | + } | ||
203 | + return false; | ||
204 | + } | ||
205 | + | ||
206 | + private static boolean schemaContains(String schema, String phraseType1, | ||
207 | + String phraseType2) { | ||
208 | + boolean phrType1Found = false; | ||
209 | + boolean phrType2Found = false; | ||
210 | + for (String position : schema.split("\\+")) { | ||
211 | + position = position.trim(); | ||
212 | + position = position.substring(1, position.length()-1); | ||
213 | + for (String phrT : position.split(";")) { | ||
214 | + if (phrT.equals(phraseType1)) { | ||
215 | + phrType1Found = true; | ||
216 | + break; | ||
217 | + } else if (phrT.equals(phraseType2)) { | ||
218 | + phrType2Found = true; | ||
219 | + break; | ||
220 | + } | ||
221 | + } | ||
222 | + if (phrType1Found && phrType2Found) { | ||
223 | + return true; | ||
224 | + } | ||
225 | + } | ||
226 | + return false; | ||
227 | + } | ||
228 | + | ||
229 | + | ||
230 | + // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub) | ||
231 | + public static void cleanQubs(Sentence sentence) { | ||
232 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
233 | + for (Mention mention : sentence.getMentions()) { | ||
234 | + if (mention.isPartOfQub()) { | ||
235 | + mentionsToRemove.add(mention); | ||
236 | + } | ||
237 | + } | ||
238 | + | ||
239 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
240 | + sentence.removeMention(mentionToRemove); | ||
241 | + } | ||
242 | + } | ||
243 | + | ||
244 | + public static void cleanPreps(Sentence sentence) { | ||
245 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
246 | + for (Mention mention : sentence.getMentions()) { | ||
247 | + if (mention.isPartOfPrep()) { | ||
248 | + mentionsToRemove.add(mention); | ||
249 | + } | ||
250 | + } | ||
251 | + | ||
252 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
253 | + sentence.removeMention(mentionToRemove); | ||
254 | + } | ||
255 | + } | ||
256 | + | ||
257 | + public static void cleanFrazeos(Sentence sentence) { | ||
258 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
259 | + for (Mention mention : sentence.getMentions()) { | ||
260 | + if (mention.isPartOfFrazeo()) { | ||
261 | + mentionsToRemove.add(mention); | ||
262 | + } | ||
263 | + } | ||
264 | + | ||
265 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
266 | + sentence.removeMention(mentionToRemove); | ||
267 | + } | ||
268 | + } | ||
269 | + | ||
270 | + // wyrzuca wzmianki bedace czescia przyimkow zlozonych | ||
271 | + public static void cleanComplexPreps(Sentence sentence, | ||
272 | + ArrayList<String> complexPreps) { | ||
273 | + | ||
274 | + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | ||
275 | + for (Mention mention : sentence.getMentions()) { | ||
276 | + if (mention.isPartOfComplexPrep(complexPreps)) { | ||
277 | + mentionsToRemove.add(mention); | ||
278 | + } | ||
279 | + } | ||
280 | + | ||
281 | + for (Mention mentionToRemove : mentionsToRemove) { | ||
282 | + sentence.removeMention(mentionToRemove); | ||
283 | + } | ||
284 | + } | ||
285 | + | ||
128 | } | 286 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
@@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection; | @@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection; | ||
2 | 2 | ||
3 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
4 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
5 | + | ||
6 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
5 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | 7 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
6 | import pl.waw.ipipan.zil.core.md.entities.*; | 8 | import pl.waw.ipipan.zil.core.md.entities.*; |
7 | 9 | ||
8 | import java.util.ArrayList; | 10 | import java.util.ArrayList; |
9 | import java.util.HashSet; | 11 | import java.util.HashSet; |
10 | import java.util.List; | 12 | import java.util.List; |
13 | +import java.util.Map; | ||
11 | import java.util.Set; | 14 | import java.util.Set; |
12 | 15 | ||
13 | public class Detector { | 16 | public class Detector { |
@@ -18,21 +21,25 @@ public class Detector { | @@ -18,21 +21,25 @@ public class Detector { | ||
18 | } | 21 | } |
19 | 22 | ||
20 | public static void findMentionsInText(Text text, | 23 | public static void findMentionsInText(Text text, |
21 | - ZeroSubjectDetector zeroSubjectModel) { | 24 | + ZeroSubjectDetector zeroSubjectModel, |
25 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
26 | + ArrayList<String> complexPreps) { | ||
22 | text.clearMentions(); | 27 | text.clearMentions(); |
23 | logger.debug("Detecting mentions in text " + text.getId()); | 28 | logger.debug("Detecting mentions in text " + text.getId()); |
24 | for (Paragraph p : text) | 29 | for (Paragraph p : text) |
25 | for (Sentence s : p) | 30 | for (Sentence s : p) |
26 | - detectMentionsInSentence(s, zeroSubjectModel); | 31 | + detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps); |
27 | } | 32 | } |
28 | 33 | ||
29 | private static void detectMentionsInSentence(Sentence sentence, | 34 | private static void detectMentionsInSentence(Sentence sentence, |
30 | - ZeroSubjectDetector zeroSubjectModel) { | 35 | + ZeroSubjectDetector zeroSubjectModel, |
36 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
37 | + ArrayList<String> complexPreps) { | ||
31 | // adding mentions | 38 | // adding mentions |
32 | addMentionsByTokenCtag(sentence); | 39 | addMentionsByTokenCtag(sentence); |
33 | addMentionsBySyntacticWordsCtag(sentence); | 40 | addMentionsBySyntacticWordsCtag(sentence); |
34 | addMentionsByNamedEntities(sentence); | 41 | addMentionsByNamedEntities(sentence); |
35 | - addMentionsByGroups(sentence); | 42 | + addMentionsByGroups(sentence, valence, complexPreps); |
36 | addSpeakerMentionsInSpoken(sentence); | 43 | addSpeakerMentionsInSpoken(sentence); |
37 | 44 | ||
38 | // zero subject detection | 45 | // zero subject detection |
@@ -41,6 +48,11 @@ public class Detector { | @@ -41,6 +48,11 @@ public class Detector { | ||
41 | // removing mentions | 48 | // removing mentions |
42 | removeTo(sentence); | 49 | removeTo(sentence); |
43 | Cleaner.cleanUnnecessarySentenceMentions(sentence); | 50 | Cleaner.cleanUnnecessarySentenceMentions(sentence); |
51 | + //Cleaner.cleanQubs(sentence); | ||
52 | + //Cleaner.cleanPreps(sentence); | ||
53 | + //Cleaner.cleanComplexPreps(sentence, complexPreps); | ||
54 | + Cleaner.cleanFrazeos(sentence); | ||
55 | + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | ||
44 | 56 | ||
45 | // updating mention heads | 57 | // updating mention heads |
46 | updateMentionHeads(sentence); | 58 | updateMentionHeads(sentence); |
@@ -95,16 +107,468 @@ public class Detector { | @@ -95,16 +107,468 @@ public class Detector { | ||
95 | * | 107 | * |
96 | * @param sentence | 108 | * @param sentence |
97 | */ | 109 | */ |
98 | - private static void addMentionsByGroups(Sentence sentence) { | ||
99 | - for (SyntacticGroup group : sentence.getGroups()) { | 110 | + private static void addMentionsByGroups(Sentence sentence, |
111 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
112 | + ArrayList<String> complexPreps) { | ||
113 | + List<SyntacticGroup> groups = sentence.getGroups(); | ||
114 | + for (int i = 0; i < groups.size(); i++) { | ||
115 | + SyntacticGroup thisGroup = groups.get(i); | ||
116 | + | ||
117 | + /*SyntacticGroup nearPrepNG = null; | ||
118 | + SyntacticGroup nextNG = null;*/ | ||
119 | + | ||
120 | + SyntacticGroup nextGroup = thisGroup.getFollowingGroup(); | ||
121 | + | ||
122 | + /*if (thisGroup.getType().startsWith("NG")) { | ||
123 | + nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(), | ||
124 | + sentence); | ||
125 | + nextNG = thisGroup.getNextNG(); | ||
126 | + }*/ | ||
127 | + | ||
128 | + /*if (nextNG != null) { | ||
129 | + int prepStart = thisGroup.getSentencePositionEnd() + 1; | ||
130 | + int prepEnd = nextNG.getSentencePositionStart() - 1; | ||
131 | + String prep = sentence.getTextInsideSpan(prepStart, prepEnd); | ||
132 | + if (complexPreps.contains(prep)) { | ||
133 | + String cos = ""; | ||
134 | + } | ||
135 | + }*/ | ||
136 | + | ||
137 | + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | ||
138 | + //!isPartOfPrepNG(thisGroup, sentence) && | ||
139 | + //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | ||
140 | + precedingWordIsVerb(thisGroup, sentence) && | ||
141 | + //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | ||
142 | + !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | ||
143 | + !sameSemanticHeads(thisGroup, nearPrepNG)) { | ||
144 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
145 | + List<Token> segments = thisGroup.getTokens(); | ||
146 | + segments.addAll(nearPrepNG.getTokens()); | ||
147 | + | ||
148 | + sentence.addMention(new Mention(segments, heads)); | ||
149 | + }*/ | ||
150 | + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | ||
151 | + // !precedingWordIsVerb(thisGroup, sentence) && | ||
152 | + !isPartOfPrepNG(thisGroup, sentence) && | ||
153 | + getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | ||
154 | + //!precedingWordIsVerb(thisGroup, sentence) && | ||
155 | + !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | ||
156 | + //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | ||
157 | + !sameSemanticHeads(thisGroup, nearPrepNG)) { | ||
158 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
159 | + List<Token> segments = thisGroup.getTokens(); | ||
160 | + segments.addAll(nearPrepNG.getTokens()); | ||
161 | + | ||
162 | + sentence.addMention(new Mention(segments, heads)); | ||
163 | + }*/ | ||
164 | + if (thisGroup.getType().startsWith("NG") && | ||
165 | + nextGroup != null && nextGroup.getType().startsWith("PrepNG") && | ||
166 | + NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) { | ||
167 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
168 | + List<Token> segments = new ArrayList<Token>(); | ||
169 | + segments.addAll(thisGroup.getTokens()); | ||
170 | + segments.addAll(nextGroup.getTokens()); | ||
171 | + | ||
172 | + sentence.addMention(new Mention(segments, heads)); | ||
173 | + } else if (thisGroup.getType().startsWith("NG") && nextGroup != null && | ||
174 | + nextGroup.getType().startsWith("NG") && | ||
175 | + NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) | ||
176 | + ) { | ||
177 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
178 | + List<Token> segments = new ArrayList<Token>(); | ||
179 | + segments.addAll(thisGroup.getTokens()); | ||
180 | + segments.addAll(nextGroup.getTokens()); | ||
181 | + | ||
182 | + sentence.addMention(new Mention(segments, heads)); | ||
183 | + } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null && | ||
184 | + NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) { | ||
185 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
186 | + | ||
187 | + List<Token> segments = new ArrayList<Token>(); | ||
188 | + segments.addAll(thisGroup.getTokens()); | ||
189 | + | ||
190 | + int prepStart = thisGroup.getSentencePositionEnd() + 1; | ||
191 | + int prepEnd = nextNG.getSentencePositionStart() - 1; | ||
192 | + ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd); | ||
193 | + segments.addAll(prepSegments); | ||
194 | + | ||
195 | + segments.addAll(nextNG.getTokens()); | ||
196 | + | ||
197 | + sentence.addMention(new Mention(segments, heads)); | ||
198 | + }*/ | ||
199 | + //else if // NG + im./pt. NG | ||
200 | + // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka | ||
201 | + // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName) | ||
202 | + else if (thisGroup.getType().startsWith("NG")) { | ||
203 | + List<Token> segments = thisGroup.getTokens(); | ||
204 | + List<Token> heads = thisGroup.getSemanticHeadTokens(); | ||
205 | + | ||
206 | + sentence.addMention(new Mention(segments, heads)); | ||
207 | + } | ||
208 | + } | ||
209 | + | ||
210 | + // oryginalna wersja | ||
211 | + /*for (SyntacticGroup group : sentence.getGroups()) { | ||
100 | if (group.getType().startsWith("NG")) { | 212 | if (group.getType().startsWith("NG")) { |
101 | List<Token> segments = group.getTokens(); | 213 | List<Token> segments = group.getTokens(); |
102 | List<Token> heads = group.getSemanticHeadTokens(); | 214 | List<Token> heads = group.getSemanticHeadTokens(); |
103 | 215 | ||
104 | sentence.addMention(new Mention(segments, heads)); | 216 | sentence.addMention(new Mention(segments, heads)); |
105 | } | 217 | } |
106 | - } | 218 | + }*/ |
219 | + } | ||
220 | + | ||
221 | + private static boolean followingWordIsInf(SyntacticGroup group, | ||
222 | + Sentence sentence) { | ||
223 | + int followingTokenPosition = group.getSentencePositionEnd() + 1; | ||
224 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
225 | + int firstWordPosition = word.getSentencePositionStart(); | ||
226 | + if (followingTokenPosition == firstWordPosition && | ||
227 | + (word.getCtag().equals("Inf"))) { | ||
228 | + return true; | ||
229 | + } | ||
230 | + } | ||
231 | + | ||
232 | + return false; | ||
233 | + } | ||
234 | + | ||
235 | + private static SyntacticGroup getFollowingPrepNGs(int sentencePosition, | ||
236 | + Sentence sentence) { | ||
237 | + SyntacticGroup largestGroup = null; | ||
238 | + int nextTokenPosition = sentencePosition + 1; | ||
239 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
240 | + if (group.getType().startsWith("PrepNG") && | ||
241 | + group.getSentencePositionStart() == nextTokenPosition) { | ||
242 | + if (largestGroup == null || | ||
243 | + largestGroup.getTokens().size() < group.getTokens().size()) { | ||
244 | + largestGroup = group; | ||
245 | + } | ||
246 | + } | ||
247 | + } | ||
248 | + return largestGroup; | ||
249 | + } | ||
250 | + | ||
251 | + private static boolean isPartOfPrepNG(SyntacticGroup NGGroup, | ||
252 | + Sentence sentence) { | ||
253 | + int NGGroupStart = NGGroup.getSentencePositionStart(); | ||
254 | + int NGGroupEnd = NGGroup.getSentencePositionEnd(); | ||
255 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
256 | + if (group.getType().startsWith("PrepNG") && | ||
257 | + group.getSentencePositionStart() <= NGGroupStart && | ||
258 | + group.getSentencePositionEnd() >= NGGroupEnd) { | ||
259 | + return true; | ||
260 | + } | ||
261 | + } | ||
262 | + return false; | ||
263 | + } | ||
264 | + | ||
265 | + private static boolean precedingWordIsVerb(SyntacticGroup group, | ||
266 | + Sentence sentence) { | ||
267 | + int precedingTokenPosition = group.getSentencePositionStart() - 1; | ||
268 | + if(isPartOfPrepNG(group, sentence)) { | ||
269 | + SyntacticGroup parentGroup = getParentPrepNG(group, sentence); | ||
270 | + precedingTokenPosition = parentGroup.getSentencePositionStart() - 1; | ||
271 | + } | ||
272 | + | ||
273 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
274 | + int lastWordPosition = word.getSentencePositionEnd(); | ||
275 | + if (precedingTokenPosition == lastWordPosition && | ||
276 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | ||
277 | + return true; | ||
278 | + } | ||
279 | + } | ||
280 | + return false; | ||
281 | + } | ||
282 | + | ||
283 | + // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem? | ||
284 | + // czy prep moze sie skladac z wiecej niz jednego segmentu? | ||
285 | + // dopasowywac refla i recip do sie spejdowego | ||
286 | + private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup, | ||
287 | + SyntacticGroup PrepNGGroup, Sentence sentence, | ||
288 | + Map<String,ArrayList<String>> walentyMapping) { | ||
289 | + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | ||
290 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
291 | + int lastWordPosition = word.getSentencePositionEnd(); | ||
292 | + if (precedingTokenPosition == lastWordPosition && | ||
293 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | ||
294 | + String verb = word.getBase(); | ||
295 | + if (!walentyMapping.containsKey(verb)) { | ||
296 | + return true; | ||
297 | + } else { | ||
298 | + SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | ||
299 | + | ||
300 | + if (prepWord.getTokens().size() == 1) { | ||
301 | + Token prep = prepWord.getTokens().get(0); | ||
302 | + String prepBase = prep.getBase(); | ||
303 | + // sprawdzic czy glowa moze miec wiele tokenow | ||
304 | + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | ||
305 | + ArrayList<String> prepnps = getPrepnps(prepBase, prepCase); | ||
306 | + | ||
307 | + ArrayList<String> schemata = walentyMapping.get(verb); | ||
308 | + for (String schema : schemata) { | ||
309 | + for (String prepnp : prepnps) { | ||
310 | + if (schema.contains(prepnp)) { | ||
311 | + return true; | ||
312 | + } | ||
313 | + } | ||
314 | + } | ||
315 | + } else if (prepWord.getTokens().size() > 1) { | ||
316 | + String prepOrth = prepWord.getOrth().toLowerCase(); | ||
317 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | ||
318 | + ArrayList<String> schemata = walentyMapping.get(verb); | ||
319 | + for (String schema : schemata) { | ||
320 | + if (schema.contains(comprepnp)) { | ||
321 | + return true; | ||
322 | + } | ||
323 | + } | ||
324 | + | ||
325 | + } | ||
326 | + | ||
327 | + | ||
328 | + } | ||
329 | + } | ||
330 | + } | ||
331 | + return false; | ||
332 | + } | ||
333 | + | ||
334 | + private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup, | ||
335 | + SyntacticGroup PrepNGGroup, Sentence sentence, | ||
336 | + Map<String,ArrayList<String>> walentyMapping) { | ||
337 | + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | ||
338 | + if(isPartOfPrepNG(NGGroup, sentence)) { | ||
339 | + SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence); | ||
340 | + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | ||
341 | + } | ||
342 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
343 | + int lastWordPosition = word.getSentencePositionEnd(); | ||
344 | + if (precedingTokenPosition == lastWordPosition && | ||
345 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | ||
346 | + if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) { | ||
347 | + return true; | ||
348 | + } | ||
349 | + if (!walentyMapping.containsKey(word.getBase())) { | ||
350 | + return true; | ||
351 | + } | ||
352 | + | ||
353 | + } | ||
354 | + } | ||
355 | + return false; | ||
107 | } | 356 | } |
357 | + | ||
358 | + private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup, | ||
359 | + SyntacticGroup PrepNGGroup, Sentence sentence, | ||
360 | + Map<String,ArrayList<String>> walentyMapping) { | ||
361 | + String verbBase = verb.getBase(); | ||
362 | + if (!walentyMapping.containsKey(verbBase)) { | ||
363 | + return true; | ||
364 | + } else { | ||
365 | + ArrayList<String> schemata = walentyMapping.get(verbBase); | ||
366 | + | ||
367 | + // PrepNG + PrepNG | ||
368 | + if (isPartOfPrepNG(NGGroup, sentence)) { | ||
369 | + SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence); | ||
370 | + ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations(); | ||
371 | + ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations(); | ||
372 | + for (String schema : schemata) { | ||
373 | + if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) { | ||
374 | + return true; | ||
375 | + } | ||
376 | + } | ||
377 | + } | ||
378 | + | ||
379 | + // NG + PrepNG | ||
380 | + else { | ||
381 | + ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations(); | ||
382 | + ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations(); | ||
383 | + for (String schema : schemata) { | ||
384 | + if (isProperSchema(schema, NGRealizations, prepNGRealizations)) { | ||
385 | + return true; | ||
386 | + } | ||
387 | + } | ||
388 | + } | ||
389 | + } | ||
390 | + return false; | ||
391 | + } | ||
392 | + | ||
393 | + private static boolean isProperSchema(String schema, ArrayList<String> group1Types, | ||
394 | + ArrayList<String> group2Types) { | ||
395 | + for (String group1Type : group1Types) { | ||
396 | + if (schema.contains(group1Type)) { | ||
397 | + for (String group2Type : group2Types) { | ||
398 | + if (schema.contains(group2Type)) { | ||
399 | + return true; | ||
400 | + } | ||
401 | + } | ||
402 | + } | ||
403 | + } | ||
404 | + return false; | ||
405 | + } | ||
406 | + | ||
407 | + private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup, | ||
408 | + Sentence sentence) { | ||
409 | + SyntacticGroup parentPrepNG = null; | ||
410 | + int NGGroupStart = NGGroup.getSentencePositionStart(); | ||
411 | + int NGGroupEnd = NGGroup.getSentencePositionEnd(); | ||
412 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
413 | + if (group.getType().startsWith("PrepNG") && | ||
414 | + group.getSentencePositionStart() <= NGGroupStart && | ||
415 | + group.getSentencePositionEnd() >= NGGroupEnd) { | ||
416 | + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { | ||
417 | + parentPrepNG = group; | ||
418 | + } | ||
419 | + } | ||
420 | + } | ||
421 | + return parentPrepNG; | ||
422 | + } | ||
423 | + | ||
424 | + private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup, | ||
425 | + SyntacticGroup PrepNGGroup, Sentence sentence, | ||
426 | + Map<String,ArrayList<String>> walentyMapping) { | ||
427 | + Token NGHead = NGGroup.getSemanticHeadTokens().get(0); | ||
428 | + | ||
429 | + String NGHeadBase = NGHead.getBase(); | ||
430 | + | ||
431 | + if (!walentyMapping.containsKey(NGHeadBase)) { | ||
432 | + return false; | ||
433 | + } else { | ||
434 | + SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | ||
435 | + | ||
436 | + if (prepWord.getTokens().size() == 1) { | ||
437 | + Token prep = prepWord.getTokens().get(0); | ||
438 | + String prepBase = prep.getBase(); | ||
439 | + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | ||
440 | + String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase); | ||
441 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | ||
442 | + for (String schema : schemata) { | ||
443 | + if (schemaContains(schema, prepnp)) { | ||
444 | + return true; | ||
445 | + } | ||
446 | + } | ||
447 | + } else if (prepWord.getTokens().size() > 1) { | ||
448 | + String prepOrth = prepWord.getOrth().toLowerCase(); | ||
449 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | ||
450 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | ||
451 | + for (String schema : schemata) { | ||
452 | + if (schemaContains(schema, comprepnp)) { | ||
453 | + return true; | ||
454 | + } | ||
455 | + } | ||
456 | + | ||
457 | + } | ||
458 | + | ||
459 | + } | ||
460 | + return false; | ||
461 | + } | ||
462 | + | ||
463 | + private static boolean NGNGValenceCompatibility(SyntacticGroup NG1, | ||
464 | + SyntacticGroup NG2, Sentence sentence, | ||
465 | + Map<String,ArrayList<String>> walentyMapping) { | ||
466 | + Token NG1Head = NG1.getSemanticHeadTokens().get(0); | ||
467 | + | ||
468 | + String NGHeadBase = NG1Head.getBase(); | ||
469 | + | ||
470 | + if (!walentyMapping.containsKey(NGHeadBase)) { | ||
471 | + return false; | ||
472 | + } else { | ||
473 | + ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); | ||
474 | + | ||
475 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | ||
476 | + for (String real : NG2realizations) { | ||
477 | + for (String schema : schemata) { | ||
478 | + if (schemaContains(schema, real)) { | ||
479 | + return true; | ||
480 | + } | ||
481 | + } | ||
482 | + } | ||
483 | + } | ||
484 | + return false; | ||
485 | + } | ||
486 | + | ||
487 | + private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1, | ||
488 | + SyntacticGroup NGGroup2, Sentence sentence, | ||
489 | + Map<String,ArrayList<String>> walentyMapping) { | ||
490 | + | ||
491 | + Token NGHead = NGGroup1.getSemanticHeadTokens().get(0); | ||
492 | + String NGHeadBase = NGHead.getBase(); | ||
493 | + | ||
494 | + if (!walentyMapping.containsKey(NGHeadBase)) { | ||
495 | + return false; | ||
496 | + } else { | ||
497 | + int prepStart = NGGroup1.getSentencePositionEnd() + 1; | ||
498 | + int prepEnd = NGGroup2.getSentencePositionStart() - 1; | ||
499 | + String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd); | ||
500 | + String comprepnp = String.format("comprepnp(%s)", complexPrep); | ||
501 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | ||
502 | + for (String schema : schemata) { | ||
503 | + if (schemaContains(schema, comprepnp)) { | ||
504 | + return true; | ||
505 | + } | ||
506 | + } | ||
507 | + } | ||
508 | + return false; | ||
509 | + } | ||
510 | + | ||
511 | + private static boolean schemaContains(String schema, String phraseType) { | ||
512 | + for (String position : schema.split("\\s\\+\\s")) { | ||
513 | + position = position.trim(); | ||
514 | + position = position.substring(1, position.length()-1); | ||
515 | + for (String phrT : position.split(";")) { | ||
516 | + if (phrT.equals(phraseType)) { | ||
517 | + return true; | ||
518 | + } | ||
519 | + } | ||
520 | + } | ||
521 | + return false; | ||
522 | + } | ||
523 | + | ||
524 | + private static boolean schemaContainsType(String schema, String type) { | ||
525 | + // to lepiej dziala dla rzeczownikow | ||
526 | + for (String position : schema.split("\\s\\+\\s")) { | ||
527 | + position = position.trim(); | ||
528 | + position = position.substring(1, position.length()-1); | ||
529 | + for (String phrT : position.split(";")) { | ||
530 | + | ||
531 | + if (phrT.startsWith(type+"(")) { | ||
532 | + return true; | ||
533 | + } | ||
534 | + } | ||
535 | + } | ||
536 | + return false; | ||
537 | + } | ||
538 | + | ||
539 | + | ||
540 | + // compar ?? | ||
541 | + private static ArrayList<String> getPrepnps(String prepBase, String prepCase) { | ||
542 | + ArrayList<String> prepnps = new ArrayList<String>(); | ||
543 | + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); | ||
544 | + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { | ||
545 | + prepnps.add(String.format("prepnp(%s,str)", prepBase)); | ||
546 | + } | ||
547 | + if (prepCase.equals("gen") || prepCase.equals("acc")) { | ||
548 | + prepnps.add(String.format("prepnp(%s,part)", prepBase)); | ||
549 | + } | ||
550 | + return prepnps; | ||
551 | + } | ||
552 | + | ||
553 | + // eliminuje "od wsi do wsi" | ||
554 | + private static boolean sameSemanticHeads(SyntacticGroup group1, | ||
555 | + SyntacticGroup group2) { | ||
556 | + | ||
557 | + List<Token> group1HeadTokens = group1.getSemanticHeadTokens(); | ||
558 | + List<Token> group2HeadTokens = group2.getSemanticHeadTokens(); | ||
559 | + if (group1HeadTokens.size() != group2HeadTokens.size()) { | ||
560 | + return false; | ||
561 | + } | ||
562 | + | ||
563 | + for (int i=0; i < group1HeadTokens.size(); i++) { | ||
564 | + if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) { | ||
565 | + return false; | ||
566 | + } | ||
567 | + } | ||
568 | + | ||
569 | + return true; | ||
570 | + } | ||
571 | + | ||
108 | 572 | ||
109 | /** | 573 | /** |
110 | * Wyszukuję i oznaczam wszystkie NER | 574 | * Wyszukuję i oznaczam wszystkie NER |
@@ -151,8 +615,9 @@ public class Detector { | @@ -151,8 +615,9 @@ public class Detector { | ||
151 | * @param sentence | 615 | * @param sentence |
152 | */ | 616 | */ |
153 | private static void addMentionsByTokenCtag(Sentence sentence) { | 617 | private static void addMentionsByTokenCtag(Sentence sentence) { |
154 | - for (Token token : sentence) | 618 | + for (Token token : sentence) { |
155 | if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | 619 | if (token.getCtag().matches(Constants.MORPHO_CTAGS)) |
156 | sentence.addMention(new Mention(token)); | 620 | sentence.addMention(new Mention(token)); |
621 | + } | ||
157 | } | 622 | } |
158 | } | 623 | } |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 | package pl.waw.ipipan.zil.core.md.entities; | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | ||
3 | import java.util.ArrayList; | 3 | import java.util.ArrayList; |
4 | +import java.util.Arrays; | ||
4 | import java.util.List; | 5 | import java.util.List; |
5 | 6 | ||
6 | /** | 7 | /** |
@@ -203,4 +204,83 @@ public class Mention implements Comparable<Mention> { | @@ -203,4 +204,83 @@ public class Mention implements Comparable<Mention> { | ||
203 | public boolean isZeroSubject() { | 204 | public boolean isZeroSubject() { |
204 | return isZeroSubject; | 205 | return isZeroSubject; |
205 | } | 206 | } |
207 | + | ||
208 | + public int getSentencePositionStart() { | ||
209 | + Token startToken = this.getFirstSegment(); | ||
210 | + return startToken.getSentencePosition(); | ||
211 | + } | ||
212 | + | ||
213 | + public int getSentencePositionEnd() { | ||
214 | + Token endToken = this.getLastSegment(); | ||
215 | + return endToken.getSentencePosition(); | ||
216 | + } | ||
217 | + | ||
218 | + public boolean isPartOfQub() { | ||
219 | + if (this.segments.size() == 1) { | ||
220 | + Sentence sentence = this.segments.get(0).getSentence(); | ||
221 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
222 | + if (word.getTokens().contains(this.segments.get(0)) && | ||
223 | + word.getCtag().equals("Qub")) { | ||
224 | + return true; | ||
225 | + } | ||
226 | + } | ||
227 | + } | ||
228 | + return false; | ||
229 | + } | ||
230 | + | ||
231 | + public boolean isPartOfPrep() { | ||
232 | + if (this.segments.size() == 1) { | ||
233 | + Sentence sentence = this.segments.get(0).getSentence(); | ||
234 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
235 | + if (word.getTokens().contains(this.segments.get(0)) && | ||
236 | + word.getCtag().equals("Prep")) { | ||
237 | + return true; | ||
238 | + } | ||
239 | + } | ||
240 | + } | ||
241 | + return false; | ||
242 | + } | ||
243 | + | ||
244 | + private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj", | ||
245 | + "Adj", "Conj", "Comp"); | ||
246 | + | ||
247 | + public boolean isPartOfFrazeo() { | ||
248 | + if (this.segments.size() == 1) { | ||
249 | + Sentence sentence = this.segments.get(0).getSentence(); | ||
250 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
251 | + if (word.getTokens().contains(this.segments.get(0)) && | ||
252 | + FRAZEOS.contains(word.getCtag())) { | ||
253 | + return true; | ||
254 | + } | ||
255 | + } | ||
256 | + } | ||
257 | + return false; | ||
258 | + } | ||
259 | + | ||
260 | + public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) { | ||
261 | + if (this.segments.size() == 1) { | ||
262 | + Sentence sentence = this.segments.get(0).getSentence(); | ||
263 | + if (this.getSentencePositionStart() - 1 >= 0) { | ||
264 | + String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | ||
265 | + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | ||
266 | + String possiblePrep = String.format("%s %s", prep, noun); | ||
267 | + if (complexPreps.contains(possiblePrep)) { | ||
268 | + return true; | ||
269 | + } | ||
270 | + } | ||
271 | + | ||
272 | + if (this.getSentencePositionStart() - 1 >= 0 && | ||
273 | + this.getSentencePositionStart() + 1 < sentence.size()) { | ||
274 | + String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | ||
275 | + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | ||
276 | + String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth(); | ||
277 | + String possiblePrep = String.format("%s %s %s", prep1, noun, prep2); | ||
278 | + if (complexPreps.contains(possiblePrep)) { | ||
279 | + return true; | ||
280 | + } | ||
281 | + } | ||
282 | + } | ||
283 | + return false; | ||
284 | + } | ||
285 | + | ||
206 | } | 286 | } |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
@@ -109,4 +109,118 @@ public class Sentence extends ArrayList<Token> { | @@ -109,4 +109,118 @@ public class Sentence extends ArrayList<Token> { | ||
109 | public void addNamedEntity(NamedEntity namedEntity) { | 109 | public void addNamedEntity(NamedEntity namedEntity) { |
110 | namedEntities.add(namedEntity); | 110 | namedEntities.add(namedEntity); |
111 | } | 111 | } |
112 | + | ||
113 | + public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) { | ||
114 | + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | ||
115 | + for (SyntacticGroup group : this.syntacticGroups) { | ||
116 | + if (group.getSentencePositionStart() >= start && | ||
117 | + group.getSentencePositionEnd() <= end) { | ||
118 | + if (!(group.getSentencePositionStart() == start && | ||
119 | + group.getSentencePositionEnd() == end)) { | ||
120 | + groupsAtSpan.add(group); | ||
121 | + } | ||
122 | + } | ||
123 | + } | ||
124 | + return groupsAtSpan; | ||
125 | + } | ||
126 | + | ||
127 | + public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) { | ||
128 | + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | ||
129 | + for (SyntacticGroup group : this.syntacticGroups) { | ||
130 | + | ||
131 | + if (group.getSentencePositionStart() >= start && | ||
132 | + group.getSentencePositionEnd() <= end) { | ||
133 | + if (!(group.getSentencePositionStart() == start && | ||
134 | + group.getSentencePositionEnd() == end)) { | ||
135 | + groupsAtSpan.add(group); | ||
136 | + } | ||
137 | + } | ||
138 | + } | ||
139 | + return groupsAtSpan; | ||
140 | + } | ||
141 | + | ||
142 | + public SyntacticGroup getFirstGroup(int start, int end) { | ||
143 | + SyntacticGroup largestGroup = null; | ||
144 | + int step = start; | ||
145 | + while (step <= end && largestGroup == null) { | ||
146 | + largestGroup = getLargestGroupOnStartPoint(step, end); | ||
147 | + step++; | ||
148 | + } | ||
149 | + return largestGroup; | ||
150 | + } | ||
151 | + | ||
152 | + private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { | ||
153 | + SyntacticGroup largestGroup = null; | ||
154 | + for (SyntacticGroup group : this.getGroups()) { | ||
155 | + int groupStart = group.getSentencePositionStart(); | ||
156 | + int groupEnd = group.getSentencePositionEnd(); | ||
157 | + if (groupStart == start && groupEnd <= end && | ||
158 | + !(groupStart == start && groupEnd == end) && | ||
159 | + (largestGroup == null || | ||
160 | + largestGroup.getTokens().size() < group.getTokens().size())) { | ||
161 | + largestGroup = group; | ||
162 | + } | ||
163 | + } | ||
164 | + return largestGroup; | ||
165 | + } | ||
166 | + | ||
167 | + public SyntacticGroup getLastGroup(int start, int end) { | ||
168 | + SyntacticGroup largestGroup = null; | ||
169 | + int step = end; | ||
170 | + while (step != start && largestGroup == null) { | ||
171 | + largestGroup = getLargestGroupOnEndPoint(start, step); | ||
172 | + step--; | ||
173 | + } | ||
174 | + return largestGroup; | ||
175 | + } | ||
176 | + | ||
177 | + private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { | ||
178 | + SyntacticGroup largestGroup = null; | ||
179 | + for (SyntacticGroup group : this.getGroups()) { | ||
180 | + int groupStart = group.getSentencePositionStart(); | ||
181 | + int groupEnd = group.getSentencePositionEnd(); | ||
182 | + if (groupEnd == end && groupStart >= start && | ||
183 | + !(groupStart == start && groupEnd == end) && | ||
184 | + (largestGroup == null || | ||
185 | + largestGroup.getTokens().size() < group.getTokens().size())) { | ||
186 | + largestGroup = group; | ||
187 | + } | ||
188 | + } | ||
189 | + return largestGroup; | ||
190 | + } | ||
191 | + | ||
192 | + public ArrayList<Mention> getMentionsInsideSpan(int start, int end) { | ||
193 | + ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>(); | ||
194 | + for (Mention mention : this.mentions) { | ||
195 | + if (mention.getSentencePositionStart() >= start && | ||
196 | + mention.getSentencePositionEnd() <= end) { | ||
197 | + mentionsAtSpan.add(mention); | ||
198 | + } | ||
199 | + } | ||
200 | + return mentionsAtSpan; | ||
201 | + } | ||
202 | + | ||
203 | + public String getTextInsideSpan(int start, int end) { | ||
204 | + String text = ""; | ||
205 | + int step = start; | ||
206 | + while (step <= end) { | ||
207 | + if (step != start) { | ||
208 | + text += " "; | ||
209 | + } | ||
210 | + text += this.get(step).getOrth(); | ||
211 | + step++; | ||
212 | + } | ||
213 | + return text; | ||
214 | + } | ||
215 | + | ||
216 | + public ArrayList<Token> getSegmentsInsideSpan(int start, int end) { | ||
217 | + ArrayList<Token> tokensAtSpan = new ArrayList<Token>(); | ||
218 | + int step = start; | ||
219 | + while (step <= end) { | ||
220 | + tokensAtSpan.add(this.get(step)); | ||
221 | + step++; | ||
222 | + } | ||
223 | + return tokensAtSpan; | ||
224 | + } | ||
225 | + | ||
112 | } | 226 | } |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 | package pl.waw.ipipan.zil.core.md.entities; | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | ||
3 | +import java.util.ArrayList; | ||
3 | import java.util.Iterator; | 4 | import java.util.Iterator; |
4 | import java.util.List; | 5 | import java.util.List; |
5 | 6 | ||
@@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { | @@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { | ||
53 | 54 | ||
54 | return getType().compareTo(o.getType()); | 55 | return getType().compareTo(o.getType()); |
55 | } | 56 | } |
57 | + | ||
58 | + public int getSentencePositionStart() { | ||
59 | + Token startToken = tokens.get(0); | ||
60 | + return startToken.getSentencePosition(); | ||
61 | + } | ||
62 | + | ||
63 | + public int getSentencePositionEnd() { | ||
64 | + Token endToken = tokens.get(tokens.size()-1); | ||
65 | + return endToken.getSentencePosition(); | ||
66 | + } | ||
67 | + | ||
68 | + | ||
69 | + public SyntacticWord getFirstWord() { | ||
70 | + SyntacticWord firstWord = null; | ||
71 | + Token startToken = tokens.get(0); | ||
72 | + Sentence sentence = startToken.getSentence(); | ||
73 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
74 | + if(startToken.compareTo(word.getTokens().get(0)) == 0 && | ||
75 | + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { | ||
76 | + firstWord = word; | ||
77 | + } | ||
78 | + } | ||
79 | + return firstWord; | ||
80 | + } | ||
81 | + | ||
82 | + // NG and PrepNG only now | ||
83 | + public ArrayList<String> getWalentyRealizations() { | ||
84 | + ArrayList<String> realizations = new ArrayList<String>(); | ||
85 | + if (this.type.startsWith("PrepNG")) { | ||
86 | + SyntacticWord prepWord = this.getFirstWord(); | ||
87 | + if (prepWord.getTokens().size() == 1) { | ||
88 | + | ||
89 | + Token prep = prepWord.getTokens().get(0); | ||
90 | + String prepBase = prep.getBase(); | ||
91 | + String prepCase = this.getSemanticHeadTokens().get(0).getCase(); | ||
92 | + realizations.addAll(getPrepnps(prepBase, prepCase)); | ||
93 | + | ||
94 | + } else if (prepWord.getTokens().size() > 1) { | ||
95 | + | ||
96 | + String prepOrth = prepWord.getOrth().toLowerCase(); | ||
97 | + String comprepnp = String.format("comprepnp(%s)", prepOrth); | ||
98 | + realizations.add(comprepnp); | ||
99 | + | ||
100 | + } | ||
101 | + } else if (this.type.startsWith("NG")) { | ||
102 | + String npCase = this.getSemanticHeadTokens().get(0).getCase(); | ||
103 | + realizations.addAll(getNps(npCase)); | ||
104 | + } | ||
105 | + return realizations; | ||
106 | + } | ||
107 | + | ||
108 | + // compar ?? | ||
109 | + private ArrayList<String> getPrepnps(String prepBase, String prepCase) { | ||
110 | + ArrayList<String> prepnps = new ArrayList<String>(); | ||
111 | + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); | ||
112 | + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { | ||
113 | + prepnps.add(String.format("prepnp(%s,str)", prepBase)); | ||
114 | + } | ||
115 | + if (prepCase.equals("gen") || prepCase.equals("acc")) { | ||
116 | + prepnps.add(String.format("prepnp(%s,part)", prepBase)); | ||
117 | + } | ||
118 | + return prepnps; | ||
119 | + } | ||
120 | + | ||
121 | + private ArrayList<String> getNps(String npCase) { | ||
122 | + ArrayList<String> nps = new ArrayList<String>(); | ||
123 | + nps.add(String.format("np(%s)", npCase)); | ||
124 | + if (npCase.equals("nom") || npCase.equals("gen") || npCase.equals("acc")) { | ||
125 | + nps.add(String.format("np(str)")); | ||
126 | + } | ||
127 | + if (npCase.equals("gen") || npCase.equals("acc")) { | ||
128 | + nps.add(String.format("np(part)")); | ||
129 | + } | ||
130 | + return nps; | ||
131 | + } | ||
132 | + | ||
133 | + public boolean precedingWordIsVerb() { | ||
134 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
135 | + int precedingTokenPosition = this.getSentencePositionStart() - 1; | ||
136 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
137 | + int lastWordPosition = word.getSentencePositionEnd(); | ||
138 | + if (precedingTokenPosition == lastWordPosition && | ||
139 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | ||
140 | + return true; | ||
141 | + } | ||
142 | + } | ||
143 | + return false; | ||
144 | + } | ||
145 | + | ||
146 | + public SyntacticGroup getNextNG() { | ||
147 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
148 | + int thisGroupEnd = this.getSentencePositionEnd(); | ||
149 | + int sentenceLength = sentence.size(); | ||
150 | + | ||
151 | + SyntacticGroup nextNG = null; | ||
152 | + for (int step = thisGroupEnd; step < sentenceLength; step++) { | ||
153 | + nextNG = sentence.getFirstGroup(step, sentenceLength); | ||
154 | + if (nextNG != null && nextNG.type.startsWith("NG") && | ||
155 | + this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) { | ||
156 | + break; | ||
157 | + } else { | ||
158 | + nextNG = null; | ||
159 | + } | ||
160 | + } | ||
161 | + return nextNG; | ||
162 | + } | ||
163 | + | ||
164 | + public SyntacticGroup getFollowingGroup() { | ||
165 | + SyntacticGroup largestGroup = null; | ||
166 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
167 | + int nextTokenPosition = this.getSentencePositionEnd() + 1; | ||
168 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
169 | + if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) && | ||
170 | + group.getSentencePositionStart() == nextTokenPosition) { | ||
171 | + if (largestGroup == null || | ||
172 | + largestGroup.getTokens().size() < group.getTokens().size()) { | ||
173 | + largestGroup = group; | ||
174 | + } | ||
175 | + } | ||
176 | + } | ||
177 | + return largestGroup; | ||
178 | + } | ||
179 | + | ||
180 | + public SyntacticWord getPrecedingVerb() { | ||
181 | + int precedingTokenPosition = this.getSentencePositionStart() - 1; | ||
182 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
183 | + if(this.isPartOfPrepNG()) { | ||
184 | + SyntacticGroup parentNGGroup = this.getParentPrepNG(); | ||
185 | + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | ||
186 | + } | ||
187 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | ||
188 | + int lastWordPosition = word.getSentencePositionEnd(); | ||
189 | + if (precedingTokenPosition == lastWordPosition && | ||
190 | + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | ||
191 | + return word; | ||
192 | + } | ||
193 | + } | ||
194 | + return null; | ||
195 | + } | ||
196 | + | ||
197 | + private boolean isPartOfPrepNG() { | ||
198 | + int NGGroupStart = this.getSentencePositionStart(); | ||
199 | + int NGGroupEnd = this.getSentencePositionEnd(); | ||
200 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
201 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
202 | + if (group.getType().startsWith("PrepNG") && | ||
203 | + group.getSentencePositionStart() <= NGGroupStart && | ||
204 | + group.getSentencePositionEnd() >= NGGroupEnd) { | ||
205 | + return true; | ||
206 | + } | ||
207 | + } | ||
208 | + return false; | ||
209 | + } | ||
210 | + | ||
211 | + private SyntacticGroup getParentPrepNG() { | ||
212 | + SyntacticGroup parentPrepNG = null; | ||
213 | + int NGGroupStart = this.getSentencePositionStart(); | ||
214 | + int NGGroupEnd = this.getSentencePositionEnd(); | ||
215 | + Sentence sentence = this.tokens.get(0).getSentence(); | ||
216 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
217 | + if (group.getType().startsWith("PrepNG") && | ||
218 | + group.getSentencePositionStart() <= NGGroupStart && | ||
219 | + group.getSentencePositionEnd() >= NGGroupEnd) { | ||
220 | + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { | ||
221 | + parentPrepNG = group; | ||
222 | + } | ||
223 | + } | ||
224 | + } | ||
225 | + return parentPrepNG; | ||
226 | + } | ||
227 | + | ||
56 | } | 228 | } |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
@@ -6,11 +6,16 @@ import java.util.List; | @@ -6,11 +6,16 @@ import java.util.List; | ||
6 | 6 | ||
7 | public class SyntacticWord implements Comparable<SyntacticWord> { | 7 | public class SyntacticWord implements Comparable<SyntacticWord> { |
8 | 8 | ||
9 | + private String base; | ||
9 | private String ctag; | 10 | private String ctag; |
11 | + private String orth; | ||
10 | private List<Token> tokens = new ArrayList<>(); | 12 | private List<Token> tokens = new ArrayList<>(); |
11 | 13 | ||
/**
 * Creates a syntactic word.
 *
 * @param ctag   tag of the word
 * @param tokens tokens making up the word; the list is stored as given, not copied
 * @param base   base form of the word
 * @param orth   orthographic form of the word
 */
public SyntacticWord(String ctag, List<Token> tokens,
        String base, String orth) {
    this.ctag = ctag;
    this.tokens = tokens;
    this.base = base;
    this.orth = orth;
}
16 | 21 | ||
@@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable<SyntacticWord> { | @@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable<SyntacticWord> { | ||
39 | 44 | ||
40 | return getCtag().compareTo(o.getCtag()); | 45 | return getCtag().compareTo(o.getCtag()); |
41 | } | 46 | } |
47 | + | ||
48 | + public int getSentencePositionStart() { | ||
49 | + Token startToken = tokens.get(0); | ||
50 | + return startToken.getSentencePosition(); | ||
51 | + } | ||
52 | + | ||
53 | + public int getSentencePositionEnd() { | ||
54 | + Token endToken = tokens.get(tokens.size()-1); | ||
55 | + return endToken.getSentencePosition(); | ||
56 | + } | ||
57 | + | ||
58 | + public String getBase() { | ||
59 | + return this.base; | ||
60 | + } | ||
61 | + | ||
62 | + public String getOrth() { | ||
63 | + return this.orth; | ||
64 | + } | ||
65 | + | ||
66 | + public boolean isVerb() { | ||
67 | + if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) { | ||
68 | + return true; | ||
69 | + } | ||
70 | + return false; | ||
71 | + } | ||
72 | + | ||
73 | + public boolean isInterp() { | ||
74 | + if (this.ctag.equals("Interp")) { | ||
75 | + return true; | ||
76 | + } | ||
77 | + return false; | ||
78 | + } | ||
42 | 79 | ||
43 | } | 80 | } |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
@@ -70,6 +70,7 @@ public class TeiLoader { | @@ -70,6 +70,7 @@ public class TeiLoader { | ||
70 | for (TEIMorph mo : m.getHeadMorphs()) | 70 | for (TEIMorph mo : m.getHeadMorphs()) |
71 | headTokens.add(teiMorph2Segment.get(mo)); | 71 | headTokens.add(teiMorph2Segment.get(mo)); |
72 | s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | 72 | s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); |
73 | + System.out.println(tokens.toString()); | ||
73 | } | 74 | } |
74 | 75 | ||
75 | private static void loadSyntacticGroup(Sentence s, TEIGroup g, | 76 | private static void loadSyntacticGroup(Sentence s, TEIGroup g, |
@@ -94,10 +95,12 @@ public class TeiLoader { | @@ -94,10 +95,12 @@ public class TeiLoader { | ||
94 | private static void loadSyntacticWord(Sentence s, TEIWord w, | 95 | private static void loadSyntacticWord(Sentence s, TEIWord w, |
95 | Map<TEIMorph, Token> teiMorph2Segment) { | 96 | Map<TEIMorph, Token> teiMorph2Segment) { |
96 | String ctag = w.getInterpretation().getCtag(); | 97 | String ctag = w.getInterpretation().getCtag(); |
98 | + String base = w.getInterpretation().getBase(); | ||
99 | + String orth = w.getOrth(); | ||
97 | List<Token> tokens = new ArrayList<>(); | 100 | List<Token> tokens = new ArrayList<>(); |
98 | for (TEIMorph m : w.getAllMorphs()) | 101 | for (TEIMorph m : w.getAllMorphs()) |
99 | tokens.add(teiMorph2Segment.get(m)); | 102 | tokens.add(teiMorph2Segment.get(m)); |
100 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | 103 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); |
101 | } | 104 | } |
102 | 105 | ||
103 | private static void loadNE(Sentence s, TEINamedEntity ne, | 106 | private static void loadNE(Sentence s, TEINamedEntity ne, |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
@@ -73,10 +73,12 @@ public class ThriftLoader { | @@ -73,10 +73,12 @@ public class ThriftLoader { | ||
73 | private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | 73 | private static void loadSyntacticWord(Sentence s, TSyntacticWord w, |
74 | Map<String, Object> thirftId2Entity, | 74 | Map<String, Object> thirftId2Entity, |
75 | Map<String, Token> thiftTokenId2Token) { | 75 | Map<String, Token> thiftTokenId2Token) { |
76 | + String base = w.getChosenInterpretation().getBase(); | ||
76 | String ctag = w.getChosenInterpretation().getCtag(); | 77 | String ctag = w.getChosenInterpretation().getCtag(); |
78 | + String orth = w.getOrth(); | ||
77 | List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, | 79 | List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, |
78 | thiftTokenId2Token, false); | 80 | thiftTokenId2Token, false); |
79 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | 81 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); |
80 | } | 82 | } |
81 | 83 | ||
82 | private static void loadNE(Sentence s, TNamedEntity ne, | 84 | private static void loadNE(Sentence s, TNamedEntity ne, |