Commit 8f86545e14f99bbf47ab83bf202e26af7a2716c4
1 parent: 1dc4f947
Cleaning unused experimental code.
Showing 9 changed files with 233 additions and 702 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... | ... | @@ -33,9 +33,8 @@ public class Main { |
33 | 33 | |
34 | 34 | private static final boolean GZIP_OUTPUT = true; |
35 | 35 | private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; |
36 | - private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt"; | |
37 | - private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt"; | |
38 | - private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt"; | |
36 | + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all_with_realizations.txt"; | |
37 | + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all_with_realizations.txt"; | |
39 | 38 | |
40 | 39 | private static ZeroSubjectDetector zeroSubjectModel; |
41 | 40 | |
... | ... | @@ -46,8 +45,6 @@ public class Main { |
46 | 45 | |
47 | 46 | private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = |
48 | 47 | new EnumMap(ValenceDicts.class); |
49 | - | |
50 | - private static final ArrayList<String> complexPreps; | |
51 | 48 | |
52 | 49 | static { |
53 | 50 | InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); |
... | ... | @@ -58,9 +55,6 @@ public class Main { |
58 | 55 | |
59 | 56 | InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); |
60 | 57 | valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); |
61 | - | |
62 | - InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS); | |
63 | - complexPreps = readValues(complexPrepositionsStream); | |
64 | 58 | } |
65 | 59 | |
66 | 60 | |
... | ... | @@ -125,34 +119,6 @@ public class Main { |
125 | 119 | |
126 | 120 | return false; |
127 | 121 | } |
128 | - | |
129 | - public static ArrayList<String> readValues(InputStream stream) { | |
130 | - ArrayList<String> values; | |
131 | - try { | |
132 | - BufferedReader br=new BufferedReader(new InputStreamReader(stream)); | |
133 | - values = new ArrayList<String>(); | |
134 | - String line; | |
135 | - boolean firstLine = true; | |
136 | - while((line = br.readLine()) != null) { | |
137 | - if (firstLine) { | |
138 | - line = line.replace("\uFEFF", ""); // remove BOM character | |
139 | - firstLine = false; | |
140 | - } | |
141 | - | |
142 | - if (!line.startsWith("%")) { | |
143 | - String value = line.trim(); | |
144 | - if (!value.isEmpty()) { | |
145 | - values.add(value); | |
146 | - } | |
147 | - } | |
148 | - } | |
149 | - br.close(); | |
150 | - } catch (IOException ex) { | |
151 | - ex.printStackTrace(); | |
152 | - throw new RuntimeException(ex); | |
153 | - } | |
154 | - return values; | |
155 | - } | |
156 | 122 | |
157 | 123 | private Main() { |
158 | 124 | } |
... | ... | @@ -244,7 +210,7 @@ public class Main { |
244 | 210 | */ |
245 | 211 | public static void annotateThriftText(TText thriftText) throws MultiserviceException { |
246 | 212 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
247 | - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); | |
213 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence); | |
248 | 214 | ThriftSaver.updateThriftText(responseText, thriftText); |
249 | 215 | } |
250 | 216 | |
... | ... | @@ -257,7 +223,7 @@ public class Main { |
257 | 223 | */ |
258 | 224 | public static void annotateTeiText(TEICorpusText teiText) throws TEIException { |
259 | 225 | Text responseText = TeiLoader.loadTextFromTei(teiText); |
260 | - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); | |
226 | + Detector.findMentionsInText(responseText, zeroSubjectModel, valence); | |
261 | 227 | TeiSaver.updateTeiText(responseText, teiText); |
262 | 228 | } |
263 | 229 | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | -import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
4 | 3 | import pl.waw.ipipan.zil.core.md.entities.Mention; |
5 | 4 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
6 | 5 | import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; |
... | ... | @@ -164,33 +163,6 @@ public class Cleaner { |
164 | 163 | } |
165 | 164 | } |
166 | 165 | |
167 | - /*private static void removeWalentyFramedMentions(Sentence sentence, | |
168 | - ArrayList<Mention> mentions, | |
169 | - ArrayList<String> schemata) { | |
170 | - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
171 | - for (Mention mention : mentions) { | |
172 | - int mentionStart = mention.getFirstSegment().getSentencePosition(); | |
173 | - int mentionEnd = mention.getLastSegment().getSentencePosition(); | |
174 | - SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); | |
175 | - SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); | |
176 | - if (startGroup != null && endGroup != null | |
177 | - && startGroup.compareTo(endGroup) != 0) { | |
178 | - ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); | |
179 | - ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); | |
180 | - for (String schema : schemata) { | |
181 | - if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { | |
182 | - mentionsToRemove.add(mention); | |
183 | - break; | |
184 | - } | |
185 | - } | |
186 | - } | |
187 | - } | |
188 | - | |
189 | - for (Mention mentionToRemove : mentionsToRemove) { | |
190 | - sentence.removeMention(mentionToRemove); | |
191 | - } | |
192 | - }*/ | |
193 | - | |
194 | 166 | private static boolean isProperSchema(String schema, ArrayList<String> group1Types, |
195 | 167 | ArrayList<String> group2Types) { |
196 | 168 | for (String group1Type : group1Types) { |
... | ... | @@ -207,7 +179,7 @@ public class Cleaner { |
207 | 179 | String phraseType2) { |
208 | 180 | boolean phrType1Found = false; |
209 | 181 | boolean phrType2Found = false; |
210 | - for (String position : schema.split("\\+")) { | |
182 | + for (String position : schema.split("\\s\\+\\s")) { | |
211 | 183 | position = position.trim(); |
212 | 184 | position = position.substring(1, position.length()-1); |
213 | 185 | for (String phrT : position.split(";")) { |
... | ... | @@ -226,34 +198,6 @@ public class Cleaner { |
226 | 198 | return false; |
227 | 199 | } |
228 | 200 | |
229 | - | |
230 | - // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub) | |
231 | - public static void cleanQubs(Sentence sentence) { | |
232 | - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
233 | - for (Mention mention : sentence.getMentions()) { | |
234 | - if (mention.isPartOfQub()) { | |
235 | - mentionsToRemove.add(mention); | |
236 | - } | |
237 | - } | |
238 | - | |
239 | - for (Mention mentionToRemove : mentionsToRemove) { | |
240 | - sentence.removeMention(mentionToRemove); | |
241 | - } | |
242 | - } | |
243 | - | |
244 | - public static void cleanPreps(Sentence sentence) { | |
245 | - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
246 | - for (Mention mention : sentence.getMentions()) { | |
247 | - if (mention.isPartOfPrep()) { | |
248 | - mentionsToRemove.add(mention); | |
249 | - } | |
250 | - } | |
251 | - | |
252 | - for (Mention mentionToRemove : mentionsToRemove) { | |
253 | - sentence.removeMention(mentionToRemove); | |
254 | - } | |
255 | - } | |
256 | - | |
257 | 201 | public static void cleanFrazeos(Sentence sentence) { |
258 | 202 | ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); |
259 | 203 | for (Mention mention : sentence.getMentions()) { |
... | ... | @@ -267,20 +211,4 @@ public class Cleaner { |
267 | 211 | } |
268 | 212 | } |
269 | 213 | |
270 | - // wyrzuca wzmianki bedace czescia przyimkow zlozonych | |
271 | - public static void cleanComplexPreps(Sentence sentence, | |
272 | - ArrayList<String> complexPreps) { | |
273 | - | |
274 | - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); | |
275 | - for (Mention mention : sentence.getMentions()) { | |
276 | - if (mention.isPartOfComplexPrep(complexPreps)) { | |
277 | - mentionsToRemove.add(mention); | |
278 | - } | |
279 | - } | |
280 | - | |
281 | - for (Mention mentionToRemove : mentionsToRemove) { | |
282 | - sentence.removeMention(mentionToRemove); | |
283 | - } | |
284 | - } | |
285 | - | |
286 | 214 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | +import java.util.Arrays; | |
4 | +import java.util.List; | |
5 | + | |
3 | 6 | public class Constants { |
4 | 7 | public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger"; |
5 | 8 | public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt"; |
... | ... | @@ -7,6 +10,11 @@ public class Constants { |
7 | 10 | public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" |
8 | 11 | + MORPHO_PRONOUN_CTAGS; |
9 | 12 | public static final String WORDS_CTAGS = "Noun|Ppron.*"; |
13 | + | |
14 | + public static final List<String> FRAZEO_CTAGS = Arrays.asList("Prep", "Qub", "Adv", "Interj", | |
15 | + "Adj", "Conj", "Comp"); | |
16 | + | |
17 | + public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin"); | |
10 | 18 | |
11 | 19 | private Constants() { |
12 | 20 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... | ... | @@ -22,24 +22,22 @@ public class Detector { |
22 | 22 | |
23 | 23 | public static void findMentionsInText(Text text, |
24 | 24 | ZeroSubjectDetector zeroSubjectModel, |
25 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
26 | - ArrayList<String> complexPreps) { | |
25 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
27 | 26 | text.clearMentions(); |
28 | 27 | logger.debug("Detecting mentions in text " + text.getId()); |
29 | 28 | for (Paragraph p : text) |
30 | 29 | for (Sentence s : p) |
31 | - detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps); | |
30 | + detectMentionsInSentence(s, zeroSubjectModel, valence); | |
32 | 31 | } |
33 | 32 | |
34 | 33 | private static void detectMentionsInSentence(Sentence sentence, |
35 | 34 | ZeroSubjectDetector zeroSubjectModel, |
36 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
37 | - ArrayList<String> complexPreps) { | |
35 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
38 | 36 | // adding mentions |
39 | 37 | addMentionsByTokenCtag(sentence); |
40 | 38 | addMentionsBySyntacticWordsCtag(sentence); |
41 | 39 | addMentionsByNamedEntities(sentence); |
42 | - addMentionsByGroups(sentence, valence, complexPreps); | |
40 | + addMentionsByGroups(sentence, valence); | |
43 | 41 | addSpeakerMentionsInSpoken(sentence); |
44 | 42 | |
45 | 43 | // zero subject detection |
... | ... | @@ -47,12 +45,9 @@ public class Detector { |
47 | 45 | |
48 | 46 | // removing mentions |
49 | 47 | removeTo(sentence); |
48 | + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
50 | 49 | Cleaner.cleanUnnecessarySentenceMentions(sentence); |
51 | - //Cleaner.cleanQubs(sentence); | |
52 | - //Cleaner.cleanPreps(sentence); | |
53 | - //Cleaner.cleanComplexPreps(sentence, complexPreps); | |
54 | 50 | Cleaner.cleanFrazeos(sentence); |
55 | - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
56 | 51 | |
57 | 52 | // updating mention heads |
58 | 53 | updateMentionHeads(sentence); |
... | ... | @@ -108,294 +103,64 @@ public class Detector { |
108 | 103 | * @param sentence |
109 | 104 | */ |
110 | 105 | private static void addMentionsByGroups(Sentence sentence, |
111 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
112 | - ArrayList<String> complexPreps) { | |
113 | - List<SyntacticGroup> groups = sentence.getGroups(); | |
114 | - for (int i = 0; i < groups.size(); i++) { | |
115 | - SyntacticGroup thisGroup = groups.get(i); | |
116 | - | |
117 | - /*SyntacticGroup nearPrepNG = null; | |
118 | - SyntacticGroup nextNG = null;*/ | |
119 | - | |
120 | - SyntacticGroup nextGroup = thisGroup.getFollowingGroup(); | |
121 | - | |
122 | - /*if (thisGroup.getType().startsWith("NG")) { | |
123 | - nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(), | |
124 | - sentence); | |
125 | - nextNG = thisGroup.getNextNG(); | |
126 | - }*/ | |
127 | - | |
128 | - /*if (nextNG != null) { | |
129 | - int prepStart = thisGroup.getSentencePositionEnd() + 1; | |
130 | - int prepEnd = nextNG.getSentencePositionStart() - 1; | |
131 | - String prep = sentence.getTextInsideSpan(prepStart, prepEnd); | |
132 | - if (complexPreps.contains(prep)) { | |
133 | - String cos = ""; | |
134 | - } | |
135 | - }*/ | |
106 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
107 | + | |
108 | + for (SyntacticGroup group : sentence.getGroups()) { | |
109 | + SyntacticGroup nextGroup = group.getFollowingGroup(); | |
110 | + SyntacticGroup nextnextGroup = null; | |
111 | + SyntacticGroup nextnextnextGroup = null; | |
112 | + if (nextGroup != null) { | |
113 | + nextnextGroup = nextGroup.getFollowingGroup(); | |
114 | + if (nextnextGroup != null) { | |
115 | + nextnextnextGroup = nextnextGroup.getFollowingGroup(); | |
116 | + } | |
117 | + } | |
136 | 118 | |
137 | - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | |
138 | - //!isPartOfPrepNG(thisGroup, sentence) && | |
139 | - //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | |
140 | - precedingWordIsVerb(thisGroup, sentence) && | |
141 | - //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
142 | - !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
143 | - !sameSemanticHeads(thisGroup, nearPrepNG)) { | |
144 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
145 | - List<Token> segments = thisGroup.getTokens(); | |
146 | - segments.addAll(nearPrepNG.getTokens()); | |
147 | - | |
148 | - sentence.addMention(new Mention(segments, heads)); | |
149 | - }*/ | |
150 | - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && | |
151 | - // !precedingWordIsVerb(thisGroup, sentence) && | |
152 | - !isPartOfPrepNG(thisGroup, sentence) && | |
153 | - getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && | |
154 | - //!precedingWordIsVerb(thisGroup, sentence) && | |
155 | - !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
156 | - //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && | |
157 | - !sameSemanticHeads(thisGroup, nearPrepNG)) { | |
158 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
159 | - List<Token> segments = thisGroup.getTokens(); | |
160 | - segments.addAll(nearPrepNG.getTokens()); | |
161 | - | |
162 | - sentence.addMention(new Mention(segments, heads)); | |
163 | - }*/ | |
164 | - if (thisGroup.getType().startsWith("NG") && | |
165 | - nextGroup != null && nextGroup.getType().startsWith("PrepNG") && | |
166 | - NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) { | |
167 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
119 | + if (group.getType().startsWith("NG") && nextGroup != null && | |
120 | + nextnextGroup != null && nextnextnextGroup != null && | |
121 | + quatroCompatibility(group, nextGroup, nextnextGroup, | |
122 | + nextnextnextGroup, valence.get(ValenceDicts.NounsValence))) { | |
123 | + List<Token> heads = group.getSemanticHeadTokens(); | |
168 | 124 | List<Token> segments = new ArrayList<Token>(); |
169 | - segments.addAll(thisGroup.getTokens()); | |
125 | + segments.addAll(group.getTokens()); | |
170 | 126 | segments.addAll(nextGroup.getTokens()); |
127 | + segments.addAll(nextnextGroup.getTokens()); | |
128 | + segments.addAll(nextnextnextGroup.getTokens()); | |
171 | 129 | |
172 | 130 | sentence.addMention(new Mention(segments, heads)); |
173 | - } else if (thisGroup.getType().startsWith("NG") && nextGroup != null && | |
174 | - nextGroup.getType().startsWith("NG") && | |
175 | - NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) | |
176 | - ) { | |
177 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
131 | + } else if (group.getType().startsWith("NG") && nextGroup != null && | |
132 | + nextnextGroup != null && tripleCompatibility(group, nextGroup, nextnextGroup, valence.get(ValenceDicts.NounsValence))) { | |
133 | + List<Token> heads = group.getSemanticHeadTokens(); | |
178 | 134 | List<Token> segments = new ArrayList<Token>(); |
179 | - segments.addAll(thisGroup.getTokens()); | |
135 | + segments.addAll(group.getTokens()); | |
180 | 136 | segments.addAll(nextGroup.getTokens()); |
137 | + segments.addAll(nextnextGroup.getTokens()); | |
181 | 138 | |
182 | 139 | sentence.addMention(new Mention(segments, heads)); |
183 | - } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null && | |
184 | - NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) { | |
185 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
186 | - | |
140 | + } else if (group.getType().startsWith("NG") && nextGroup != null && | |
141 | + groupsValenceCompatibility(group, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) | |
142 | + ) { | |
143 | + List<Token> heads = group.getSemanticHeadTokens(); | |
187 | 144 | List<Token> segments = new ArrayList<Token>(); |
188 | - segments.addAll(thisGroup.getTokens()); | |
189 | - | |
190 | - int prepStart = thisGroup.getSentencePositionEnd() + 1; | |
191 | - int prepEnd = nextNG.getSentencePositionStart() - 1; | |
192 | - ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd); | |
193 | - segments.addAll(prepSegments); | |
194 | - | |
195 | - segments.addAll(nextNG.getTokens()); | |
145 | + segments.addAll(group.getTokens()); | |
146 | + segments.addAll(nextGroup.getTokens()); | |
196 | 147 | |
197 | 148 | sentence.addMention(new Mention(segments, heads)); |
198 | - }*/ | |
199 | - //else if // NG + im./pt. NG | |
200 | - // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka | |
201 | - // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName) | |
202 | - else if (thisGroup.getType().startsWith("NG")) { | |
203 | - List<Token> segments = thisGroup.getTokens(); | |
204 | - List<Token> heads = thisGroup.getSemanticHeadTokens(); | |
205 | - | |
206 | - sentence.addMention(new Mention(segments, heads)); | |
207 | - } | |
208 | - } | |
209 | - | |
210 | - // oryginalna wersja | |
211 | - /*for (SyntacticGroup group : sentence.getGroups()) { | |
212 | - if (group.getType().startsWith("NG")) { | |
149 | + } else if (group.getType().startsWith("NG")) { | |
213 | 150 | List<Token> segments = group.getTokens(); |
214 | 151 | List<Token> heads = group.getSemanticHeadTokens(); |
215 | 152 | |
216 | 153 | sentence.addMention(new Mention(segments, heads)); |
217 | 154 | } |
218 | - }*/ | |
219 | - } | |
220 | - | |
221 | - private static boolean followingWordIsInf(SyntacticGroup group, | |
222 | - Sentence sentence) { | |
223 | - int followingTokenPosition = group.getSentencePositionEnd() + 1; | |
224 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
225 | - int firstWordPosition = word.getSentencePositionStart(); | |
226 | - if (followingTokenPosition == firstWordPosition && | |
227 | - (word.getCtag().equals("Inf"))) { | |
228 | - return true; | |
229 | - } | |
230 | - } | |
231 | - | |
232 | - return false; | |
233 | - } | |
234 | - | |
235 | - private static SyntacticGroup getFollowingPrepNGs(int sentencePosition, | |
236 | - Sentence sentence) { | |
237 | - SyntacticGroup largestGroup = null; | |
238 | - int nextTokenPosition = sentencePosition + 1; | |
239 | - for (SyntacticGroup group : sentence.getGroups()) { | |
240 | - if (group.getType().startsWith("PrepNG") && | |
241 | - group.getSentencePositionStart() == nextTokenPosition) { | |
242 | - if (largestGroup == null || | |
243 | - largestGroup.getTokens().size() < group.getTokens().size()) { | |
244 | - largestGroup = group; | |
245 | - } | |
246 | - } | |
247 | - } | |
248 | - return largestGroup; | |
249 | - } | |
250 | - | |
251 | - private static boolean isPartOfPrepNG(SyntacticGroup NGGroup, | |
252 | - Sentence sentence) { | |
253 | - int NGGroupStart = NGGroup.getSentencePositionStart(); | |
254 | - int NGGroupEnd = NGGroup.getSentencePositionEnd(); | |
255 | - for (SyntacticGroup group : sentence.getGroups()) { | |
256 | - if (group.getType().startsWith("PrepNG") && | |
257 | - group.getSentencePositionStart() <= NGGroupStart && | |
258 | - group.getSentencePositionEnd() >= NGGroupEnd) { | |
259 | - return true; | |
260 | - } | |
261 | - } | |
262 | - return false; | |
263 | - } | |
264 | - | |
265 | - private static boolean precedingWordIsVerb(SyntacticGroup group, | |
266 | - Sentence sentence) { | |
267 | - int precedingTokenPosition = group.getSentencePositionStart() - 1; | |
268 | - if(isPartOfPrepNG(group, sentence)) { | |
269 | - SyntacticGroup parentGroup = getParentPrepNG(group, sentence); | |
270 | - precedingTokenPosition = parentGroup.getSentencePositionStart() - 1; | |
271 | - } | |
272 | - | |
273 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
274 | - int lastWordPosition = word.getSentencePositionEnd(); | |
275 | - if (precedingTokenPosition == lastWordPosition && | |
276 | - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
277 | - return true; | |
278 | - } | |
279 | - } | |
280 | - return false; | |
281 | - } | |
282 | - | |
283 | - // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem? | |
284 | - // czy prep moze sie skladac z wiecej niz jednego segmentu? | |
285 | - // dopasowywac refla i recip do sie spejdowego | |
286 | - private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup, | |
287 | - SyntacticGroup PrepNGGroup, Sentence sentence, | |
288 | - Map<String,ArrayList<String>> walentyMapping) { | |
289 | - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | |
290 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
291 | - int lastWordPosition = word.getSentencePositionEnd(); | |
292 | - if (precedingTokenPosition == lastWordPosition && | |
293 | - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
294 | - String verb = word.getBase(); | |
295 | - if (!walentyMapping.containsKey(verb)) { | |
296 | - return true; | |
297 | - } else { | |
298 | - SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | |
299 | - | |
300 | - if (prepWord.getTokens().size() == 1) { | |
301 | - Token prep = prepWord.getTokens().get(0); | |
302 | - String prepBase = prep.getBase(); | |
303 | - // sprawdzic czy glowa moze miec wiele tokenow | |
304 | - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | |
305 | - ArrayList<String> prepnps = getPrepnps(prepBase, prepCase); | |
306 | - | |
307 | - ArrayList<String> schemata = walentyMapping.get(verb); | |
308 | - for (String schema : schemata) { | |
309 | - for (String prepnp : prepnps) { | |
310 | - if (schema.contains(prepnp)) { | |
311 | - return true; | |
312 | - } | |
313 | - } | |
314 | - } | |
315 | - } else if (prepWord.getTokens().size() > 1) { | |
316 | - String prepOrth = prepWord.getOrth().toLowerCase(); | |
317 | - String comprepnp = String.format("comprepnp(%s)", prepOrth); | |
318 | - ArrayList<String> schemata = walentyMapping.get(verb); | |
319 | - for (String schema : schemata) { | |
320 | - if (schema.contains(comprepnp)) { | |
321 | - return true; | |
322 | - } | |
323 | - } | |
324 | - | |
325 | - } | |
326 | - | |
327 | - | |
328 | - } | |
329 | - } | |
330 | 155 | } |
331 | - return false; | |
332 | - } | |
333 | - | |
334 | - private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup, | |
335 | - SyntacticGroup PrepNGGroup, Sentence sentence, | |
336 | - Map<String,ArrayList<String>> walentyMapping) { | |
337 | - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; | |
338 | - if(isPartOfPrepNG(NGGroup, sentence)) { | |
339 | - SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence); | |
340 | - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | |
341 | - } | |
342 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
343 | - int lastWordPosition = word.getSentencePositionEnd(); | |
344 | - if (precedingTokenPosition == lastWordPosition && | |
345 | - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
346 | - if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) { | |
347 | - return true; | |
348 | - } | |
349 | - if (!walentyMapping.containsKey(word.getBase())) { | |
350 | - return true; | |
351 | - } | |
352 | - | |
353 | - } | |
354 | - } | |
355 | - return false; | |
356 | - } | |
357 | - | |
358 | - private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup, | |
359 | - SyntacticGroup PrepNGGroup, Sentence sentence, | |
360 | - Map<String,ArrayList<String>> walentyMapping) { | |
361 | - String verbBase = verb.getBase(); | |
362 | - if (!walentyMapping.containsKey(verbBase)) { | |
363 | - return true; | |
364 | - } else { | |
365 | - ArrayList<String> schemata = walentyMapping.get(verbBase); | |
366 | - | |
367 | - // PrepNG + PrepNG | |
368 | - if (isPartOfPrepNG(NGGroup, sentence)) { | |
369 | - SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence); | |
370 | - ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations(); | |
371 | - ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations(); | |
372 | - for (String schema : schemata) { | |
373 | - if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) { | |
374 | - return true; | |
375 | - } | |
376 | - } | |
377 | - } | |
378 | - | |
379 | - // NG + PrepNG | |
380 | - else { | |
381 | - ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations(); | |
382 | - ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations(); | |
383 | - for (String schema : schemata) { | |
384 | - if (isProperSchema(schema, NGRealizations, prepNGRealizations)) { | |
385 | - return true; | |
386 | - } | |
387 | - } | |
388 | - } | |
389 | - } | |
390 | - return false; | |
391 | 156 | } |
392 | 157 | |
393 | 158 | private static boolean isProperSchema(String schema, ArrayList<String> group1Types, |
394 | 159 | ArrayList<String> group2Types) { |
395 | 160 | for (String group1Type : group1Types) { |
396 | - if (schema.contains(group1Type)) { | |
161 | + if (schemaContains(schema, group1Type)) { | |
397 | 162 | for (String group2Type : group2Types) { |
398 | - if (schema.contains(group2Type)) { | |
163 | + if (schemaContains(schema, group2Type)) { | |
399 | 164 | return true; |
400 | 165 | } |
401 | 166 | } |
... | ... | @@ -404,103 +169,71 @@ public class Detector { |
404 | 169 | return false; |
405 | 170 | } |
406 | 171 | |
407 | - private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup, | |
408 | - Sentence sentence) { | |
409 | - SyntacticGroup parentPrepNG = null; | |
410 | - int NGGroupStart = NGGroup.getSentencePositionStart(); | |
411 | - int NGGroupEnd = NGGroup.getSentencePositionEnd(); | |
412 | - for (SyntacticGroup group : sentence.getGroups()) { | |
413 | - if (group.getType().startsWith("PrepNG") && | |
414 | - group.getSentencePositionStart() <= NGGroupStart && | |
415 | - group.getSentencePositionEnd() >= NGGroupEnd) { | |
416 | - if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { | |
417 | - parentPrepNG = group; | |
418 | - } | |
419 | - } | |
420 | - } | |
421 | - return parentPrepNG; | |
422 | - } | |
423 | - | |
424 | - private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup, | |
425 | - SyntacticGroup PrepNGGroup, Sentence sentence, | |
172 | + private static boolean groupsValenceCompatibility(SyntacticGroup NG1, | |
173 | + SyntacticGroup NG2, Sentence sentence, | |
426 | 174 | Map<String,ArrayList<String>> walentyMapping) { |
427 | - Token NGHead = NGGroup.getSemanticHeadTokens().get(0); | |
175 | + Token NG1Head = NG1.getSemanticHeadTokens().get(0); | |
428 | 176 | |
429 | - String NGHeadBase = NGHead.getBase(); | |
177 | + String NGHeadBase = NG1Head.getBase(); | |
430 | 178 | |
431 | 179 | if (!walentyMapping.containsKey(NGHeadBase)) { |
432 | 180 | return false; |
433 | 181 | } else { |
434 | - SyntacticWord prepWord = PrepNGGroup.getFirstWord(); | |
182 | + ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); | |
435 | 183 | |
436 | - if (prepWord.getTokens().size() == 1) { | |
437 | - Token prep = prepWord.getTokens().get(0); | |
438 | - String prepBase = prep.getBase(); | |
439 | - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); | |
440 | - String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase); | |
441 | - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
442 | - for (String schema : schemata) { | |
443 | - if (schemaContains(schema, prepnp)) { | |
444 | - return true; | |
445 | - } | |
446 | - } | |
447 | - } else if (prepWord.getTokens().size() > 1) { | |
448 | - String prepOrth = prepWord.getOrth().toLowerCase(); | |
449 | - String comprepnp = String.format("comprepnp(%s)", prepOrth); | |
450 | - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
184 | + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
185 | + for (String real : NG2realizations) { | |
451 | 186 | for (String schema : schemata) { |
452 | - if (schemaContains(schema, comprepnp)) { | |
187 | + if (schemaContains(schema, real)) { | |
453 | 188 | return true; |
454 | 189 | } |
455 | 190 | } |
456 | - | |
457 | 191 | } |
458 | - | |
459 | 192 | } |
460 | 193 | return false; |
461 | 194 | } |
462 | 195 | |
463 | - private static boolean NGNGValenceCompatibility(SyntacticGroup NG1, | |
464 | - SyntacticGroup NG2, Sentence sentence, | |
196 | + private static boolean tripleCompatibility(SyntacticGroup group1, | |
197 | + SyntacticGroup group2, SyntacticGroup group3, | |
465 | 198 | Map<String,ArrayList<String>> walentyMapping) { |
466 | - Token NG1Head = NG1.getSemanticHeadTokens().get(0); | |
199 | + Token group1Head = group1.getSemanticHeadTokens().get(0); | |
467 | 200 | |
468 | - String NGHeadBase = NG1Head.getBase(); | |
201 | + String group1HeadBase = group1Head.getBase(); | |
469 | 202 | |
470 | - if (!walentyMapping.containsKey(NGHeadBase)) { | |
203 | + if (!walentyMapping.containsKey(group1HeadBase)) { | |
471 | 204 | return false; |
472 | 205 | } else { |
473 | - ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); | |
206 | + ArrayList<String> group2realizations = group2.getWalentyRealizations(); | |
207 | + ArrayList<String> group3realizations = group3.getWalentyRealizations(); | |
474 | 208 | |
475 | - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
476 | - for (String real : NG2realizations) { | |
477 | - for (String schema : schemata) { | |
478 | - if (schemaContains(schema, real)) { | |
479 | - return true; | |
480 | - } | |
209 | + ArrayList<String> schemata = walentyMapping.get(group1HeadBase); | |
210 | + for (String schema : schemata) { | |
211 | + if (isProperSchema(schema, group2realizations, group3realizations)) { | |
212 | + return true; | |
481 | 213 | } |
482 | 214 | } |
483 | 215 | } |
484 | 216 | return false; |
485 | 217 | } |
486 | 218 | |
487 | - private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1, | |
488 | - SyntacticGroup NGGroup2, Sentence sentence, | |
219 | + private static boolean quatroCompatibility(SyntacticGroup group1, | |
220 | + SyntacticGroup group2, SyntacticGroup group3, SyntacticGroup group4, | |
489 | 221 | Map<String,ArrayList<String>> walentyMapping) { |
490 | - | |
491 | - Token NGHead = NGGroup1.getSemanticHeadTokens().get(0); | |
492 | - String NGHeadBase = NGHead.getBase(); | |
222 | + Token group1Head = group1.getSemanticHeadTokens().get(0); | |
223 | + | |
224 | + String group1HeadBase = group1Head.getBase(); | |
493 | 225 | |
494 | - if (!walentyMapping.containsKey(NGHeadBase)) { | |
226 | + if (!walentyMapping.containsKey(group1HeadBase)) { | |
495 | 227 | return false; |
496 | 228 | } else { |
497 | - int prepStart = NGGroup1.getSentencePositionEnd() + 1; | |
498 | - int prepEnd = NGGroup2.getSentencePositionStart() - 1; | |
499 | - String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd); | |
500 | - String comprepnp = String.format("comprepnp(%s)", complexPrep); | |
501 | - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); | |
229 | + ArrayList<String> group2realizations = group2.getWalentyRealizations(); | |
230 | + ArrayList<String> group3realizations = group3.getWalentyRealizations(); | |
231 | + ArrayList<String> group4realizations = group4.getWalentyRealizations(); | |
232 | + | |
233 | + ArrayList<String> schemata = walentyMapping.get(group1HeadBase); | |
502 | 234 | for (String schema : schemata) { |
503 | - if (schemaContains(schema, comprepnp)) { | |
235 | + if (isTripleProperSchema(schema, group2realizations, group3realizations, | |
236 | + group4realizations)) { | |
504 | 237 | return true; |
505 | 238 | } |
506 | 239 | } |
... | ... | @@ -508,67 +241,119 @@ public class Detector { |
508 | 241 | return false; |
509 | 242 | } |
510 | 243 | |
511 | - private static boolean schemaContains(String schema, String phraseType) { | |
512 | - for (String position : schema.split("\\s\\+\\s")) { | |
513 | - position = position.trim(); | |
514 | - position = position.substring(1, position.length()-1); | |
515 | - for (String phrT : position.split(";")) { | |
516 | - if (phrT.equals(phraseType)) { | |
517 | - return true; | |
244 | + private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types, | |
245 | + ArrayList<String> group2Types, ArrayList<String> group3Types) { | |
246 | + for (String group1Type : group1Types) { | |
247 | + if (schemaContains(schema, group1Type)) { | |
248 | + for (String group2Type : group2Types) { | |
249 | + if (schemaContains(schema, group2Type)) { | |
250 | + for (String group3Type : group3Types) { | |
251 | + if (schemaContains(schema, group3Type)) { | |
252 | + return true; | |
253 | + } | |
254 | + } | |
255 | + } | |
518 | 256 | } |
519 | 257 | } |
520 | 258 | } |
521 | 259 | return false; |
522 | 260 | } |
523 | 261 | |
524 | - private static boolean schemaContainsType(String schema, String type) { | |
525 | - // to lepiej dziala dla rzeczownikow | |
526 | - for (String position : schema.split("\\s\\+\\s")) { | |
527 | - position = position.trim(); | |
528 | - position = position.substring(1, position.length()-1); | |
529 | - for (String phrT : position.split(";")) { | |
530 | - | |
531 | - if (phrT.startsWith(type+"(")) { | |
532 | - return true; | |
262 | + /*private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types, | |
263 | + ArrayList<String> group2Types, ArrayList<String> group3Types) { | |
264 | + | |
265 | + ArrayList<String> group1MPositions = getMatchingPositions(schema, group1Types); | |
266 | + ArrayList<String> group2MPositions = getMatchingPositions(schema, group2Types); | |
267 | + ArrayList<String> group3MPositions = getMatchingPositions(schema, group3Types); | |
268 | + | |
269 | + | |
270 | + | |
271 | + ArrayList<String> group1MPositionsCopy = new ArrayList<String>(); | |
272 | + ArrayList<String> group2MPositionsCopy = getMatchingPositions(schema, group2Types); | |
273 | + ArrayList<String> group3MPositionsCopy = getMatchingPositions(schema, group3Types); | |
274 | + | |
275 | + | |
276 | + if (group1MPositions.isEmpty() || group2MPositions.isEmpty() || group3MPositions.isEmpty()) { | |
277 | + return false; | |
278 | + } | |
279 | + | |
280 | + boolean group1ok = false; | |
281 | + boolean group2ok = false; | |
282 | + boolean group3ok = false; | |
283 | + | |
284 | + for (String pos : group1MPositions) { | |
285 | + | |
286 | + } | |
287 | + | |
288 | + ArrayList<String> | |
289 | + | |
290 | + if (union(group1MPositions, group2MPositions).size() > group1MPositions.size() && | |
291 | + ) | |
292 | + | |
293 | + | |
294 | + for (String group1Type : group1Types) { | |
295 | + if (schemaContains(schema, group1Type)) { | |
296 | + for (String group2Type : group2Types) { | |
297 | + if (schemaContains(schema, group2Type)) { | |
298 | + for (String group3Type : group3Types) { | |
299 | + if (schemaContains(schema, group3Type)) { | |
300 | + return true; | |
301 | + } | |
302 | + } | |
303 | + } | |
533 | 304 | } |
534 | 305 | } |
535 | 306 | } |
536 | 307 | return false; |
308 | + }*/ | |
309 | + | |
310 | + public static List<String> union(List<String> list1, List<String> list2) { | |
311 | + HashSet<String> set = new HashSet<String>(); | |
312 | + | |
313 | + set.addAll(list1); | |
314 | + set.addAll(list2); | |
315 | + | |
316 | + return new ArrayList<String>(set); | |
537 | 317 | } |
538 | 318 | |
319 | + public static List<String> tripleUnion(List<String> list1, List<String> list2, | |
320 | + List<String> list3) { | |
321 | + HashSet<String> set = new HashSet<String>(); | |
322 | + | |
323 | + set.addAll(list1); | |
324 | + set.addAll(list2); | |
325 | + set.addAll(list3); | |
326 | + | |
327 | + return new ArrayList<String>(set); | |
328 | + } | |
539 | 329 | |
540 | - // compar ?? | |
541 | - private static ArrayList<String> getPrepnps(String prepBase, String prepCase) { | |
542 | - ArrayList<String> prepnps = new ArrayList<String>(); | |
543 | - prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); | |
544 | - if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { | |
545 | - prepnps.add(String.format("prepnp(%s,str)", prepBase)); | |
546 | - } | |
547 | - if (prepCase.equals("gen") || prepCase.equals("acc")) { | |
548 | - prepnps.add(String.format("prepnp(%s,part)", prepBase)); | |
330 | + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { | |
331 | + ArrayList<String> positions = new ArrayList<String>(); | |
332 | + for (String position : schema.split("\\s\\+\\s")) { | |
333 | + position = position.trim(); | |
334 | + position = position.substring(1, position.length()-1); | |
335 | + for (String phrT : position.split(";")) { | |
336 | + if (phraseRealizations.contains(phrT.trim())) { | |
337 | + positions.add(position); | |
338 | + break; | |
339 | + } | |
340 | + } | |
549 | 341 | } |
550 | - return prepnps; | |
342 | + return positions; | |
551 | 343 | } |
552 | 344 | |
553 | - // eliminuje "od wsi do wsi" | |
554 | - private static boolean sameSemanticHeads(SyntacticGroup group1, | |
555 | - SyntacticGroup group2) { | |
556 | - | |
557 | - List<Token> group1HeadTokens = group1.getSemanticHeadTokens(); | |
558 | - List<Token> group2HeadTokens = group2.getSemanticHeadTokens(); | |
559 | - if (group1HeadTokens.size() != group2HeadTokens.size()) { | |
560 | - return false; | |
561 | - } | |
562 | - | |
563 | - for (int i=0; i < group1HeadTokens.size(); i++) { | |
564 | - if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) { | |
565 | - return false; | |
345 | + private static boolean schemaContains(String schema, String phraseType) { | |
346 | + for (String position : schema.split("\\s\\+\\s")) { | |
347 | + position = position.trim(); | |
348 | + position = position.substring(1, position.length()-1); | |
349 | + for (String phrT : position.split(";")) { | |
350 | + if (phrT.equals(phraseType)) { | |
351 | + return true; | |
352 | + } | |
566 | 353 | } |
567 | 354 | } |
568 | - | |
569 | - return true; | |
355 | + return false; | |
570 | 356 | } |
571 | - | |
572 | 357 | |
573 | 358 | /** |
574 | 359 | * Wyszukuję i oznaczam wszystkie NER |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | |
3 | 3 | import java.util.ArrayList; |
4 | -import java.util.Arrays; | |
5 | 4 | import java.util.List; |
6 | 5 | |
6 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | |
7 | + | |
7 | 8 | /** |
8 | 9 | * @author Mateusz Kopec |
10 | + * Modified 2017 by Bartlomiej Niton | |
9 | 11 | * |
10 | 12 | */ |
11 | 13 | public class Mention implements Comparable<Mention> { |
... | ... | @@ -205,77 +207,22 @@ public class Mention implements Comparable<Mention> { |
205 | 207 | return isZeroSubject; |
206 | 208 | } |
207 | 209 | |
208 | - public int getSentencePositionStart() { | |
210 | + public int getSentenceStartPosition() { | |
209 | 211 | Token startToken = this.getFirstSegment(); |
210 | 212 | return startToken.getSentencePosition(); |
211 | 213 | } |
212 | 214 | |
213 | - public int getSentencePositionEnd() { | |
215 | + public int getSentenceEndPosition() { | |
214 | 216 | Token endToken = this.getLastSegment(); |
215 | 217 | return endToken.getSentencePosition(); |
216 | 218 | } |
217 | - | |
218 | - public boolean isPartOfQub() { | |
219 | - if (this.segments.size() == 1) { | |
220 | - Sentence sentence = this.segments.get(0).getSentence(); | |
221 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
222 | - if (word.getTokens().contains(this.segments.get(0)) && | |
223 | - word.getCtag().equals("Qub")) { | |
224 | - return true; | |
225 | - } | |
226 | - } | |
227 | - } | |
228 | - return false; | |
229 | - } | |
230 | - | |
231 | - public boolean isPartOfPrep() { | |
232 | - if (this.segments.size() == 1) { | |
233 | - Sentence sentence = this.segments.get(0).getSentence(); | |
234 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
235 | - if (word.getTokens().contains(this.segments.get(0)) && | |
236 | - word.getCtag().equals("Prep")) { | |
237 | - return true; | |
238 | - } | |
239 | - } | |
240 | - } | |
241 | - return false; | |
242 | - } | |
243 | - | |
244 | - private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj", | |
245 | - "Adj", "Conj", "Comp"); | |
246 | 219 | |
247 | 220 | public boolean isPartOfFrazeo() { |
248 | 221 | if (this.segments.size() == 1) { |
249 | 222 | Sentence sentence = this.segments.get(0).getSentence(); |
250 | 223 | for (SyntacticWord word : sentence.getSyntacticWords()) { |
251 | 224 | if (word.getTokens().contains(this.segments.get(0)) && |
252 | - FRAZEOS.contains(word.getCtag())) { | |
253 | - return true; | |
254 | - } | |
255 | - } | |
256 | - } | |
257 | - return false; | |
258 | - } | |
259 | - | |
260 | - public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) { | |
261 | - if (this.segments.size() == 1) { | |
262 | - Sentence sentence = this.segments.get(0).getSentence(); | |
263 | - if (this.getSentencePositionStart() - 1 >= 0) { | |
264 | - String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | |
265 | - String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | |
266 | - String possiblePrep = String.format("%s %s", prep, noun); | |
267 | - if (complexPreps.contains(possiblePrep)) { | |
268 | - return true; | |
269 | - } | |
270 | - } | |
271 | - | |
272 | - if (this.getSentencePositionStart() - 1 >= 0 && | |
273 | - this.getSentencePositionStart() + 1 < sentence.size()) { | |
274 | - String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth(); | |
275 | - String noun = sentence.get(this.getSentencePositionStart()).getOrth(); | |
276 | - String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth(); | |
277 | - String possiblePrep = String.format("%s %s %s", prep1, noun, prep2); | |
278 | - if (complexPreps.contains(possiblePrep)) { | |
225 | + Constants.FRAZEO_CTAGS.contains(word.getCtag())) { | |
279 | 226 | return true; |
280 | 227 | } |
281 | 228 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
... | ... | @@ -110,35 +110,6 @@ public class Sentence extends ArrayList<Token> { |
110 | 110 | namedEntities.add(namedEntity); |
111 | 111 | } |
112 | 112 | |
113 | - public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) { | |
114 | - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | |
115 | - for (SyntacticGroup group : this.syntacticGroups) { | |
116 | - if (group.getSentencePositionStart() >= start && | |
117 | - group.getSentencePositionEnd() <= end) { | |
118 | - if (!(group.getSentencePositionStart() == start && | |
119 | - group.getSentencePositionEnd() == end)) { | |
120 | - groupsAtSpan.add(group); | |
121 | - } | |
122 | - } | |
123 | - } | |
124 | - return groupsAtSpan; | |
125 | - } | |
126 | - | |
127 | - public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) { | |
128 | - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); | |
129 | - for (SyntacticGroup group : this.syntacticGroups) { | |
130 | - | |
131 | - if (group.getSentencePositionStart() >= start && | |
132 | - group.getSentencePositionEnd() <= end) { | |
133 | - if (!(group.getSentencePositionStart() == start && | |
134 | - group.getSentencePositionEnd() == end)) { | |
135 | - groupsAtSpan.add(group); | |
136 | - } | |
137 | - } | |
138 | - } | |
139 | - return groupsAtSpan; | |
140 | - } | |
141 | - | |
142 | 113 | public SyntacticGroup getFirstGroup(int start, int end) { |
143 | 114 | SyntacticGroup largestGroup = null; |
144 | 115 | int step = start; |
... | ... | @@ -152,8 +123,8 @@ public class Sentence extends ArrayList<Token> { |
152 | 123 | private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { |
153 | 124 | SyntacticGroup largestGroup = null; |
154 | 125 | for (SyntacticGroup group : this.getGroups()) { |
155 | - int groupStart = group.getSentencePositionStart(); | |
156 | - int groupEnd = group.getSentencePositionEnd(); | |
126 | + int groupStart = group.getSentenceStartPosition(); | |
127 | + int groupEnd = group.getSentenceEndPosition(); | |
157 | 128 | if (groupStart == start && groupEnd <= end && |
158 | 129 | !(groupStart == start && groupEnd == end) && |
159 | 130 | (largestGroup == null || |
... | ... | @@ -177,8 +148,8 @@ public class Sentence extends ArrayList<Token> { |
177 | 148 | private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { |
178 | 149 | SyntacticGroup largestGroup = null; |
179 | 150 | for (SyntacticGroup group : this.getGroups()) { |
180 | - int groupStart = group.getSentencePositionStart(); | |
181 | - int groupEnd = group.getSentencePositionEnd(); | |
151 | + int groupStart = group.getSentenceStartPosition(); | |
152 | + int groupEnd = group.getSentenceEndPosition(); | |
182 | 153 | if (groupEnd == end && groupStart >= start && |
183 | 154 | !(groupStart == start && groupEnd == end) && |
184 | 155 | (largestGroup == null || |
... | ... | @@ -189,38 +160,4 @@ public class Sentence extends ArrayList<Token> { |
189 | 160 | return largestGroup; |
190 | 161 | } |
191 | 162 | |
192 | - public ArrayList<Mention> getMentionsInsideSpan(int start, int end) { | |
193 | - ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>(); | |
194 | - for (Mention mention : this.mentions) { | |
195 | - if (mention.getSentencePositionStart() >= start && | |
196 | - mention.getSentencePositionEnd() <= end) { | |
197 | - mentionsAtSpan.add(mention); | |
198 | - } | |
199 | - } | |
200 | - return mentionsAtSpan; | |
201 | - } | |
202 | - | |
203 | - public String getTextInsideSpan(int start, int end) { | |
204 | - String text = ""; | |
205 | - int step = start; | |
206 | - while (step <= end) { | |
207 | - if (step != start) { | |
208 | - text += " "; | |
209 | - } | |
210 | - text += this.get(step).getOrth(); | |
211 | - step++; | |
212 | - } | |
213 | - return text; | |
214 | - } | |
215 | - | |
216 | - public ArrayList<Token> getSegmentsInsideSpan(int start, int end) { | |
217 | - ArrayList<Token> tokensAtSpan = new ArrayList<Token>(); | |
218 | - int step = start; | |
219 | - while (step <= end) { | |
220 | - tokensAtSpan.add(this.get(step)); | |
221 | - step++; | |
222 | - } | |
223 | - return tokensAtSpan; | |
224 | - } | |
225 | - | |
226 | 163 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
... | ... | @@ -55,34 +55,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
55 | 55 | return getType().compareTo(o.getType()); |
56 | 56 | } |
57 | 57 | |
58 | - public int getSentencePositionStart() { | |
58 | + public int getSentenceStartPosition() { | |
59 | 59 | Token startToken = tokens.get(0); |
60 | 60 | return startToken.getSentencePosition(); |
61 | 61 | } |
62 | 62 | |
63 | - public int getSentencePositionEnd() { | |
63 | + public int getSentenceEndPosition() { | |
64 | 64 | Token endToken = tokens.get(tokens.size()-1); |
65 | 65 | return endToken.getSentencePosition(); |
66 | 66 | } |
67 | 67 | |
68 | - | |
69 | - public SyntacticWord getFirstWord() { | |
70 | - SyntacticWord firstWord = null; | |
71 | - Token startToken = tokens.get(0); | |
72 | - Sentence sentence = startToken.getSentence(); | |
73 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
74 | - if(startToken.compareTo(word.getTokens().get(0)) == 0 && | |
75 | - (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { | |
76 | - firstWord = word; | |
77 | - } | |
78 | - } | |
79 | - return firstWord; | |
80 | - } | |
81 | - | |
82 | - // NG and PrepNG only now | |
83 | 68 | public ArrayList<String> getWalentyRealizations() { |
84 | 69 | ArrayList<String> realizations = new ArrayList<String>(); |
85 | - if (this.type.startsWith("PrepNG")) { | |
70 | + if (this.type.equals("PrepNG")) { | |
86 | 71 | SyntacticWord prepWord = this.getFirstWord(); |
87 | 72 | if (prepWord.getTokens().size() == 1) { |
88 | 73 | |
... | ... | @@ -105,7 +90,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
105 | 90 | return realizations; |
106 | 91 | } |
107 | 92 | |
108 | - // compar ?? | |
93 | + public SyntacticWord getFirstWord() { | |
94 | + SyntacticWord firstWord = null; | |
95 | + Token startToken = tokens.get(0); | |
96 | + Sentence sentence = startToken.getSentence(); | |
97 | + for (SyntacticWord word : sentence.getSyntacticWords()) { | |
98 | + if(startToken.compareTo(word.getTokens().get(0)) == 0 && | |
99 | + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { | |
100 | + firstWord = word; | |
101 | + } | |
102 | + } | |
103 | + return firstWord; | |
104 | + } | |
105 | + | |
109 | 106 | private ArrayList<String> getPrepnps(String prepBase, String prepCase) { |
110 | 107 | ArrayList<String> prepnps = new ArrayList<String>(); |
111 | 108 | prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); |
... | ... | @@ -130,44 +127,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
130 | 127 | return nps; |
131 | 128 | } |
132 | 129 | |
133 | - public boolean precedingWordIsVerb() { | |
134 | - Sentence sentence = this.tokens.get(0).getSentence(); | |
135 | - int precedingTokenPosition = this.getSentencePositionStart() - 1; | |
136 | - for (SyntacticWord word : sentence.getSyntacticWords()) { | |
137 | - int lastWordPosition = word.getSentencePositionEnd(); | |
138 | - if (precedingTokenPosition == lastWordPosition && | |
139 | - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { | |
140 | - return true; | |
141 | - } | |
142 | - } | |
143 | - return false; | |
144 | - } | |
145 | - | |
146 | - public SyntacticGroup getNextNG() { | |
147 | - Sentence sentence = this.tokens.get(0).getSentence(); | |
148 | - int thisGroupEnd = this.getSentencePositionEnd(); | |
149 | - int sentenceLength = sentence.size(); | |
150 | - | |
151 | - SyntacticGroup nextNG = null; | |
152 | - for (int step = thisGroupEnd; step < sentenceLength; step++) { | |
153 | - nextNG = sentence.getFirstGroup(step, sentenceLength); | |
154 | - if (nextNG != null && nextNG.type.startsWith("NG") && | |
155 | - this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) { | |
156 | - break; | |
157 | - } else { | |
158 | - nextNG = null; | |
159 | - } | |
160 | - } | |
161 | - return nextNG; | |
162 | - } | |
163 | - | |
164 | 130 | public SyntacticGroup getFollowingGroup() { |
165 | 131 | SyntacticGroup largestGroup = null; |
166 | 132 | Sentence sentence = this.tokens.get(0).getSentence(); |
167 | - int nextTokenPosition = this.getSentencePositionEnd() + 1; | |
133 | + int nextTokenPosition = this.getSentenceEndPosition() + 1; | |
168 | 134 | for (SyntacticGroup group : sentence.getGroups()) { |
169 | - if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) && | |
170 | - group.getSentencePositionStart() == nextTokenPosition) { | |
135 | + if ((group.getType().equals("PrepNG") || group.getType().startsWith("NG")) && | |
136 | + group.getSentenceStartPosition() == nextTokenPosition) { | |
171 | 137 | if (largestGroup == null || |
172 | 138 | largestGroup.getTokens().size() < group.getTokens().size()) { |
173 | 139 | largestGroup = group; |
... | ... | @@ -178,14 +144,14 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
178 | 144 | } |
179 | 145 | |
180 | 146 | public SyntacticWord getPrecedingVerb() { |
181 | - int precedingTokenPosition = this.getSentencePositionStart() - 1; | |
147 | + int precedingTokenPosition = this.getSentenceStartPosition() - 1; | |
182 | 148 | Sentence sentence = this.tokens.get(0).getSentence(); |
183 | 149 | if(this.isPartOfPrepNG()) { |
184 | 150 | SyntacticGroup parentNGGroup = this.getParentPrepNG(); |
185 | - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; | |
151 | + precedingTokenPosition = parentNGGroup.getSentenceStartPosition() - 1; | |
186 | 152 | } |
187 | 153 | for (SyntacticWord word : sentence.getSyntacticWords()) { |
188 | - int lastWordPosition = word.getSentencePositionEnd(); | |
154 | + int lastWordPosition = word.getSentenceEndPosition(); | |
189 | 155 | if (precedingTokenPosition == lastWordPosition && |
190 | 156 | (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { |
191 | 157 | return word; |
... | ... | @@ -195,13 +161,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
195 | 161 | } |
196 | 162 | |
197 | 163 | private boolean isPartOfPrepNG() { |
198 | - int NGGroupStart = this.getSentencePositionStart(); | |
199 | - int NGGroupEnd = this.getSentencePositionEnd(); | |
164 | + int NGGroupStart = this.getSentenceStartPosition(); | |
165 | + int NGGroupEnd = this.getSentenceEndPosition(); | |
200 | 166 | Sentence sentence = this.tokens.get(0).getSentence(); |
201 | 167 | for (SyntacticGroup group : sentence.getGroups()) { |
202 | - if (group.getType().startsWith("PrepNG") && | |
203 | - group.getSentencePositionStart() <= NGGroupStart && | |
204 | - group.getSentencePositionEnd() >= NGGroupEnd) { | |
168 | + if (group.getType().equals("PrepNG") && | |
169 | + group.getSentenceStartPosition() <= NGGroupStart && | |
170 | + group.getSentenceEndPosition() >= NGGroupEnd) { | |
205 | 171 | return true; |
206 | 172 | } |
207 | 173 | } |
... | ... | @@ -210,13 +176,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
210 | 176 | |
211 | 177 | private SyntacticGroup getParentPrepNG() { |
212 | 178 | SyntacticGroup parentPrepNG = null; |
213 | - int NGGroupStart = this.getSentencePositionStart(); | |
214 | - int NGGroupEnd = this.getSentencePositionEnd(); | |
179 | + int NGGroupStart = this.getSentenceStartPosition(); | |
180 | + int NGGroupEnd = this.getSentenceEndPosition(); | |
215 | 181 | Sentence sentence = this.tokens.get(0).getSentence(); |
216 | 182 | for (SyntacticGroup group : sentence.getGroups()) { |
217 | - if (group.getType().startsWith("PrepNG") && | |
218 | - group.getSentencePositionStart() <= NGGroupStart && | |
219 | - group.getSentencePositionEnd() >= NGGroupEnd) { | |
183 | + if (group.getType().equals("PrepNG") && | |
184 | + group.getSentenceStartPosition() <= NGGroupStart && | |
185 | + group.getSentenceEndPosition() >= NGGroupEnd) { | |
220 | 186 | if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { |
221 | 187 | parentPrepNG = group; |
222 | 188 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
... | ... | @@ -4,6 +4,8 @@ import java.util.ArrayList; |
4 | 4 | import java.util.Iterator; |
5 | 5 | import java.util.List; |
6 | 6 | |
7 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | |
8 | + | |
7 | 9 | public class SyntacticWord implements Comparable<SyntacticWord> { |
8 | 10 | |
9 | 11 | private String base; |
... | ... | @@ -22,6 +24,14 @@ public class SyntacticWord implements Comparable<SyntacticWord> { |
22 | 24 | public String getCtag() { |
23 | 25 | return ctag; |
24 | 26 | } |
27 | + | |
28 | + public String getBase() { | |
29 | + return base; | |
30 | + } | |
31 | + | |
32 | + public String getOrth() { | |
33 | + return orth; | |
34 | + } | |
25 | 35 | |
26 | 36 | public List<Token> getTokens() { |
27 | 37 | return tokens; |
... | ... | @@ -45,33 +55,18 @@ public class SyntacticWord implements Comparable<SyntacticWord> { |
45 | 55 | return getCtag().compareTo(o.getCtag()); |
46 | 56 | } |
47 | 57 | |
48 | - public int getSentencePositionStart() { | |
58 | + public int getSentenceStartPosition() { | |
49 | 59 | Token startToken = tokens.get(0); |
50 | 60 | return startToken.getSentencePosition(); |
51 | 61 | } |
52 | 62 | |
53 | - public int getSentencePositionEnd() { | |
63 | + public int getSentenceEndPosition() { | |
54 | 64 | Token endToken = tokens.get(tokens.size()-1); |
55 | 65 | return endToken.getSentencePosition(); |
56 | 66 | } |
57 | 67 | |
58 | - public String getBase() { | |
59 | - return this.base; | |
60 | - } | |
61 | - | |
62 | - public String getOrth() { | |
63 | - return this.orth; | |
64 | - } | |
65 | - | |
66 | 68 | public boolean isVerb() { |
67 | - if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) { | |
68 | - return true; | |
69 | - } | |
70 | - return false; | |
71 | - } | |
72 | - | |
73 | - public boolean isInterp() { | |
74 | - if (this.ctag.equals("Interp")) { | |
69 | + if (Constants.VERB_CTAGS.contains(this.ctag)) { | |
75 | 70 | return true; |
76 | 71 | } |
77 | 72 | return false; |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... | ... | @@ -70,7 +70,6 @@ public class TeiLoader { |
70 | 70 | for (TEIMorph mo : m.getHeadMorphs()) |
71 | 71 | headTokens.add(teiMorph2Segment.get(mo)); |
72 | 72 | s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); |
73 | - System.out.println(tokens.toString()); | |
74 | 73 | } |
75 | 74 | |
76 | 75 | private static void loadSyntacticGroup(Sentence s, TEIGroup g, |
... | ... |