Commit 8f86545e14f99bbf47ab83bf202e26af7a2716c4

Authored by Bartłomiej Nitoń
1 parent 1dc4f947

Cleaning unused experimental code.

src/main/java/pl/waw/ipipan/zil/core/md/Main.java
@@ -33,9 +33,8 @@ public class Main { @@ -33,9 +33,8 @@ public class Main {
33 33
34 private static final boolean GZIP_OUTPUT = true; 34 private static final boolean GZIP_OUTPUT = true;
35 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; 35 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
36 - private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt";  
37 - private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt";  
38 - private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt"; 36 + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all_with_realizations.txt";
  37 + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all_with_realizations.txt";
39 38
40 private static ZeroSubjectDetector zeroSubjectModel; 39 private static ZeroSubjectDetector zeroSubjectModel;
41 40
@@ -46,8 +45,6 @@ public class Main { @@ -46,8 +45,6 @@ public class Main {
46 45
47 private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = 46 private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
48 new EnumMap(ValenceDicts.class); 47 new EnumMap(ValenceDicts.class);
49 -  
50 - private static final ArrayList<String> complexPreps;  
51 48
52 static { 49 static {
53 InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); 50 InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
@@ -58,9 +55,6 @@ public class Main { @@ -58,9 +55,6 @@ public class Main {
58 55
59 InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); 56 InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
60 valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); 57 valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
61 -  
62 - InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS);  
63 - complexPreps = readValues(complexPrepositionsStream);  
64 } 58 }
65 59
66 60
@@ -125,34 +119,6 @@ public class Main { @@ -125,34 +119,6 @@ public class Main {
125 119
126 return false; 120 return false;
127 } 121 }
128 -  
129 - public static ArrayList<String> readValues(InputStream stream) {  
130 - ArrayList<String> values;  
131 - try {  
132 - BufferedReader br=new BufferedReader(new InputStreamReader(stream));  
133 - values = new ArrayList<String>();  
134 - String line;  
135 - boolean firstLine = true;  
136 - while((line = br.readLine()) != null) {  
137 - if (firstLine) {  
138 - line = line.replace("\uFEFF", ""); // remove BOM character  
139 - firstLine = false;  
140 - }  
141 -  
142 - if (!line.startsWith("%")) {  
143 - String value = line.trim();  
144 - if (!value.isEmpty()) {  
145 - values.add(value);  
146 - }  
147 - }  
148 - }  
149 - br.close();  
150 - } catch (IOException ex) {  
151 - ex.printStackTrace();  
152 - throw new RuntimeException(ex);  
153 - }  
154 - return values;  
155 - }  
156 122
157 private Main() { 123 private Main() {
158 } 124 }
@@ -244,7 +210,7 @@ public class Main { @@ -244,7 +210,7 @@ public class Main {
244 */ 210 */
245 public static void annotateThriftText(TText thriftText) throws MultiserviceException { 211 public static void annotateThriftText(TText thriftText) throws MultiserviceException {
246 Text responseText = ThriftLoader.loadTextFromThrift(thriftText); 212 Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
247 - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); 213 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence);
248 ThriftSaver.updateThriftText(responseText, thriftText); 214 ThriftSaver.updateThriftText(responseText, thriftText);
249 } 215 }
250 216
@@ -257,7 +223,7 @@ public class Main { @@ -257,7 +223,7 @@ public class Main {
257 */ 223 */
258 public static void annotateTeiText(TEICorpusText teiText) throws TEIException { 224 public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
259 Text responseText = TeiLoader.loadTextFromTei(teiText); 225 Text responseText = TeiLoader.loadTextFromTei(teiText);
260 - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); 226 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence);
261 TeiSaver.updateTeiText(responseText, teiText); 227 TeiSaver.updateTeiText(responseText, teiText);
262 } 228 }
263 229
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 package pl.waw.ipipan.zil.core.md.detection; 1 package pl.waw.ipipan.zil.core.md.detection;
2 2
3 -import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;  
4 import pl.waw.ipipan.zil.core.md.entities.Mention; 3 import pl.waw.ipipan.zil.core.md.entities.Mention;
5 import pl.waw.ipipan.zil.core.md.entities.Sentence; 4 import pl.waw.ipipan.zil.core.md.entities.Sentence;
6 import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; 5 import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
@@ -164,33 +163,6 @@ public class Cleaner { @@ -164,33 +163,6 @@ public class Cleaner {
164 } 163 }
165 } 164 }
166 165
167 - /*private static void removeWalentyFramedMentions(Sentence sentence,  
168 - ArrayList<Mention> mentions,  
169 - ArrayList<String> schemata) {  
170 - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();  
171 - for (Mention mention : mentions) {  
172 - int mentionStart = mention.getFirstSegment().getSentencePosition();  
173 - int mentionEnd = mention.getLastSegment().getSentencePosition();  
174 - SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);  
175 - SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);  
176 - if (startGroup != null && endGroup != null  
177 - && startGroup.compareTo(endGroup) != 0) {  
178 - ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();  
179 - ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();  
180 - for (String schema : schemata) {  
181 - if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {  
182 - mentionsToRemove.add(mention);  
183 - break;  
184 - }  
185 - }  
186 - }  
187 - }  
188 -  
189 - for (Mention mentionToRemove : mentionsToRemove) {  
190 - sentence.removeMention(mentionToRemove);  
191 - }  
192 - }*/  
193 -  
194 private static boolean isProperSchema(String schema, ArrayList<String> group1Types, 166 private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
195 ArrayList<String> group2Types) { 167 ArrayList<String> group2Types) {
196 for (String group1Type : group1Types) { 168 for (String group1Type : group1Types) {
@@ -207,7 +179,7 @@ public class Cleaner { @@ -207,7 +179,7 @@ public class Cleaner {
207 String phraseType2) { 179 String phraseType2) {
208 boolean phrType1Found = false; 180 boolean phrType1Found = false;
209 boolean phrType2Found = false; 181 boolean phrType2Found = false;
210 - for (String position : schema.split("\\+")) { 182 + for (String position : schema.split("\\s\\+\\s")) {
211 position = position.trim(); 183 position = position.trim();
212 position = position.substring(1, position.length()-1); 184 position = position.substring(1, position.length()-1);
213 for (String phrT : position.split(";")) { 185 for (String phrT : position.split(";")) {
@@ -226,34 +198,6 @@ public class Cleaner { @@ -226,34 +198,6 @@ public class Cleaner {
226 return false; 198 return false;
227 } 199 }
228 200
229 -  
230 - // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub)  
231 - public static void cleanQubs(Sentence sentence) {  
232 - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();  
233 - for (Mention mention : sentence.getMentions()) {  
234 - if (mention.isPartOfQub()) {  
235 - mentionsToRemove.add(mention);  
236 - }  
237 - }  
238 -  
239 - for (Mention mentionToRemove : mentionsToRemove) {  
240 - sentence.removeMention(mentionToRemove);  
241 - }  
242 - }  
243 -  
244 - public static void cleanPreps(Sentence sentence) {  
245 - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();  
246 - for (Mention mention : sentence.getMentions()) {  
247 - if (mention.isPartOfPrep()) {  
248 - mentionsToRemove.add(mention);  
249 - }  
250 - }  
251 -  
252 - for (Mention mentionToRemove : mentionsToRemove) {  
253 - sentence.removeMention(mentionToRemove);  
254 - }  
255 - }  
256 -  
257 public static void cleanFrazeos(Sentence sentence) { 201 public static void cleanFrazeos(Sentence sentence) {
258 ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); 202 ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
259 for (Mention mention : sentence.getMentions()) { 203 for (Mention mention : sentence.getMentions()) {
@@ -267,20 +211,4 @@ public class Cleaner { @@ -267,20 +211,4 @@ public class Cleaner {
267 } 211 }
268 } 212 }
269 213
270 - // wyrzuca wzmianki bedace czescia przyimkow zlozonych  
271 - public static void cleanComplexPreps(Sentence sentence,  
272 - ArrayList<String> complexPreps) {  
273 -  
274 - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();  
275 - for (Mention mention : sentence.getMentions()) {  
276 - if (mention.isPartOfComplexPrep(complexPreps)) {  
277 - mentionsToRemove.add(mention);  
278 - }  
279 - }  
280 -  
281 - for (Mention mentionToRemove : mentionsToRemove) {  
282 - sentence.removeMention(mentionToRemove);  
283 - }  
284 - }  
285 -  
286 } 214 }
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 package pl.waw.ipipan.zil.core.md.detection; 1 package pl.waw.ipipan.zil.core.md.detection;
2 2
  3 +import java.util.Arrays;
  4 +import java.util.List;
  5 +
3 public class Constants { 6 public class Constants {
4 public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger"; 7 public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";
5 public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt"; 8 public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";
@@ -7,6 +10,11 @@ public class Constants { @@ -7,6 +10,11 @@ public class Constants {
7 public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" 10 public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|"
8 + MORPHO_PRONOUN_CTAGS; 11 + MORPHO_PRONOUN_CTAGS;
9 public static final String WORDS_CTAGS = "Noun|Ppron.*"; 12 public static final String WORDS_CTAGS = "Noun|Ppron.*";
  13 +
  14 + public static final List<String> FRAZEO_CTAGS = Arrays.asList("Prep", "Qub", "Adv", "Interj",
  15 + "Adj", "Conj", "Comp");
  16 +
  17 + public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin");
10 18
11 private Constants() { 19 private Constants() {
12 } 20 }
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
@@ -22,24 +22,22 @@ public class Detector { @@ -22,24 +22,22 @@ public class Detector {
22 22
23 public static void findMentionsInText(Text text, 23 public static void findMentionsInText(Text text,
24 ZeroSubjectDetector zeroSubjectModel, 24 ZeroSubjectDetector zeroSubjectModel,
25 - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,  
26 - ArrayList<String> complexPreps) { 25 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
27 text.clearMentions(); 26 text.clearMentions();
28 logger.debug("Detecting mentions in text " + text.getId()); 27 logger.debug("Detecting mentions in text " + text.getId());
29 for (Paragraph p : text) 28 for (Paragraph p : text)
30 for (Sentence s : p) 29 for (Sentence s : p)
31 - detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps); 30 + detectMentionsInSentence(s, zeroSubjectModel, valence);
32 } 31 }
33 32
34 private static void detectMentionsInSentence(Sentence sentence, 33 private static void detectMentionsInSentence(Sentence sentence,
35 ZeroSubjectDetector zeroSubjectModel, 34 ZeroSubjectDetector zeroSubjectModel,
36 - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,  
37 - ArrayList<String> complexPreps) { 35 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
38 // adding mentions 36 // adding mentions
39 addMentionsByTokenCtag(sentence); 37 addMentionsByTokenCtag(sentence);
40 addMentionsBySyntacticWordsCtag(sentence); 38 addMentionsBySyntacticWordsCtag(sentence);
41 addMentionsByNamedEntities(sentence); 39 addMentionsByNamedEntities(sentence);
42 - addMentionsByGroups(sentence, valence, complexPreps); 40 + addMentionsByGroups(sentence, valence);
43 addSpeakerMentionsInSpoken(sentence); 41 addSpeakerMentionsInSpoken(sentence);
44 42
45 // zero subject detection 43 // zero subject detection
@@ -47,12 +45,9 @@ public class Detector { @@ -47,12 +45,9 @@ public class Detector {
47 45
48 // removing mentions 46 // removing mentions
49 removeTo(sentence); 47 removeTo(sentence);
  48 + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
50 Cleaner.cleanUnnecessarySentenceMentions(sentence); 49 Cleaner.cleanUnnecessarySentenceMentions(sentence);
51 - //Cleaner.cleanQubs(sentence);  
52 - //Cleaner.cleanPreps(sentence);  
53 - //Cleaner.cleanComplexPreps(sentence, complexPreps);  
54 Cleaner.cleanFrazeos(sentence); 50 Cleaner.cleanFrazeos(sentence);
55 - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));  
56 51
57 // updating mention heads 52 // updating mention heads
58 updateMentionHeads(sentence); 53 updateMentionHeads(sentence);
@@ -108,294 +103,64 @@ public class Detector { @@ -108,294 +103,64 @@ public class Detector {
108 * @param sentence 103 * @param sentence
109 */ 104 */
110 private static void addMentionsByGroups(Sentence sentence, 105 private static void addMentionsByGroups(Sentence sentence,
111 - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,  
112 - ArrayList<String> complexPreps) {  
113 - List<SyntacticGroup> groups = sentence.getGroups();  
114 - for (int i = 0; i < groups.size(); i++) {  
115 - SyntacticGroup thisGroup = groups.get(i);  
116 -  
117 - /*SyntacticGroup nearPrepNG = null;  
118 - SyntacticGroup nextNG = null;*/  
119 -  
120 - SyntacticGroup nextGroup = thisGroup.getFollowingGroup();  
121 -  
122 - /*if (thisGroup.getType().startsWith("NG")) {  
123 - nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(),  
124 - sentence);  
125 - nextNG = thisGroup.getNextNG();  
126 - }*/  
127 -  
128 - /*if (nextNG != null) {  
129 - int prepStart = thisGroup.getSentencePositionEnd() + 1;  
130 - int prepEnd = nextNG.getSentencePositionStart() - 1;  
131 - String prep = sentence.getTextInsideSpan(prepStart, prepEnd);  
132 - if (complexPreps.contains(prep)) {  
133 - String cos = "";  
134 - }  
135 - }*/ 106 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
  107 +
  108 + for (SyntacticGroup group : sentence.getGroups()) {
  109 + SyntacticGroup nextGroup = group.getFollowingGroup();
  110 + SyntacticGroup nextnextGroup = null;
  111 + SyntacticGroup nextnextnextGroup = null;
  112 + if (nextGroup != null) {
  113 + nextnextGroup = nextGroup.getFollowingGroup();
  114 + if (nextnextGroup != null) {
  115 + nextnextnextGroup = nextnextGroup.getFollowingGroup();
  116 + }
  117 + }
136 118
137 - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&  
138 - //!isPartOfPrepNG(thisGroup, sentence) &&  
139 - //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&  
140 - precedingWordIsVerb(thisGroup, sentence) &&  
141 - //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&  
142 - !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&  
143 - !sameSemanticHeads(thisGroup, nearPrepNG)) {  
144 - List<Token> heads = thisGroup.getSemanticHeadTokens();  
145 - List<Token> segments = thisGroup.getTokens();  
146 - segments.addAll(nearPrepNG.getTokens());  
147 -  
148 - sentence.addMention(new Mention(segments, heads));  
149 - }*/  
150 - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&  
151 - // !precedingWordIsVerb(thisGroup, sentence) &&  
152 - !isPartOfPrepNG(thisGroup, sentence) &&  
153 - getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&  
154 - //!precedingWordIsVerb(thisGroup, sentence) &&  
155 - !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&  
156 - //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&  
157 - !sameSemanticHeads(thisGroup, nearPrepNG)) {  
158 - List<Token> heads = thisGroup.getSemanticHeadTokens();  
159 - List<Token> segments = thisGroup.getTokens();  
160 - segments.addAll(nearPrepNG.getTokens());  
161 -  
162 - sentence.addMention(new Mention(segments, heads));  
163 - }*/  
164 - if (thisGroup.getType().startsWith("NG") &&  
165 - nextGroup != null && nextGroup.getType().startsWith("PrepNG") &&  
166 - NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) {  
167 - List<Token> heads = thisGroup.getSemanticHeadTokens(); 119 + if (group.getType().startsWith("NG") && nextGroup != null &&
  120 + nextnextGroup != null && nextnextnextGroup != null &&
  121 + quatroCompatibility(group, nextGroup, nextnextGroup,
  122 + nextnextnextGroup, valence.get(ValenceDicts.NounsValence))) {
  123 + List<Token> heads = group.getSemanticHeadTokens();
168 List<Token> segments = new ArrayList<Token>(); 124 List<Token> segments = new ArrayList<Token>();
169 - segments.addAll(thisGroup.getTokens()); 125 + segments.addAll(group.getTokens());
170 segments.addAll(nextGroup.getTokens()); 126 segments.addAll(nextGroup.getTokens());
  127 + segments.addAll(nextnextGroup.getTokens());
  128 + segments.addAll(nextnextnextGroup.getTokens());
171 129
172 sentence.addMention(new Mention(segments, heads)); 130 sentence.addMention(new Mention(segments, heads));
173 - } else if (thisGroup.getType().startsWith("NG") && nextGroup != null &&  
174 - nextGroup.getType().startsWith("NG") &&  
175 - NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))  
176 - ) {  
177 - List<Token> heads = thisGroup.getSemanticHeadTokens(); 131 + } else if (group.getType().startsWith("NG") && nextGroup != null &&
  132 + nextnextGroup != null && tripleCompatibility(group, nextGroup, nextnextGroup, valence.get(ValenceDicts.NounsValence))) {
  133 + List<Token> heads = group.getSemanticHeadTokens();
178 List<Token> segments = new ArrayList<Token>(); 134 List<Token> segments = new ArrayList<Token>();
179 - segments.addAll(thisGroup.getTokens()); 135 + segments.addAll(group.getTokens());
180 segments.addAll(nextGroup.getTokens()); 136 segments.addAll(nextGroup.getTokens());
  137 + segments.addAll(nextnextGroup.getTokens());
181 138
182 sentence.addMention(new Mention(segments, heads)); 139 sentence.addMention(new Mention(segments, heads));
183 - } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null &&  
184 - NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) {  
185 - List<Token> heads = thisGroup.getSemanticHeadTokens();  
186 - 140 + } else if (group.getType().startsWith("NG") && nextGroup != null &&
  141 + groupsValenceCompatibility(group, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))
  142 + ) {
  143 + List<Token> heads = group.getSemanticHeadTokens();
187 List<Token> segments = new ArrayList<Token>(); 144 List<Token> segments = new ArrayList<Token>();
188 - segments.addAll(thisGroup.getTokens());  
189 -  
190 - int prepStart = thisGroup.getSentencePositionEnd() + 1;  
191 - int prepEnd = nextNG.getSentencePositionStart() - 1;  
192 - ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd);  
193 - segments.addAll(prepSegments);  
194 -  
195 - segments.addAll(nextNG.getTokens()); 145 + segments.addAll(group.getTokens());
  146 + segments.addAll(nextGroup.getTokens());
196 147
197 sentence.addMention(new Mention(segments, heads)); 148 sentence.addMention(new Mention(segments, heads));
198 - }*/  
199 - //else if // NG + im./pt. NG  
200 - // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka  
201 - // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName)  
202 - else if (thisGroup.getType().startsWith("NG")) {  
203 - List<Token> segments = thisGroup.getTokens();  
204 - List<Token> heads = thisGroup.getSemanticHeadTokens();  
205 -  
206 - sentence.addMention(new Mention(segments, heads));  
207 - }  
208 - }  
209 -  
210 - // oryginalna wersja  
211 - /*for (SyntacticGroup group : sentence.getGroups()) {  
212 - if (group.getType().startsWith("NG")) { 149 + } else if (group.getType().startsWith("NG")) {
213 List<Token> segments = group.getTokens(); 150 List<Token> segments = group.getTokens();
214 List<Token> heads = group.getSemanticHeadTokens(); 151 List<Token> heads = group.getSemanticHeadTokens();
215 152
216 sentence.addMention(new Mention(segments, heads)); 153 sentence.addMention(new Mention(segments, heads));
217 } 154 }
218 - }*/  
219 - }  
220 -  
221 - private static boolean followingWordIsInf(SyntacticGroup group,  
222 - Sentence sentence) {  
223 - int followingTokenPosition = group.getSentencePositionEnd() + 1;  
224 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
225 - int firstWordPosition = word.getSentencePositionStart();  
226 - if (followingTokenPosition == firstWordPosition &&  
227 - (word.getCtag().equals("Inf"))) {  
228 - return true;  
229 - }  
230 - }  
231 -  
232 - return false;  
233 - }  
234 -  
235 - private static SyntacticGroup getFollowingPrepNGs(int sentencePosition,  
236 - Sentence sentence) {  
237 - SyntacticGroup largestGroup = null;  
238 - int nextTokenPosition = sentencePosition + 1;  
239 - for (SyntacticGroup group : sentence.getGroups()) {  
240 - if (group.getType().startsWith("PrepNG") &&  
241 - group.getSentencePositionStart() == nextTokenPosition) {  
242 - if (largestGroup == null ||  
243 - largestGroup.getTokens().size() < group.getTokens().size()) {  
244 - largestGroup = group;  
245 - }  
246 - }  
247 - }  
248 - return largestGroup;  
249 - }  
250 -  
251 - private static boolean isPartOfPrepNG(SyntacticGroup NGGroup,  
252 - Sentence sentence) {  
253 - int NGGroupStart = NGGroup.getSentencePositionStart();  
254 - int NGGroupEnd = NGGroup.getSentencePositionEnd();  
255 - for (SyntacticGroup group : sentence.getGroups()) {  
256 - if (group.getType().startsWith("PrepNG") &&  
257 - group.getSentencePositionStart() <= NGGroupStart &&  
258 - group.getSentencePositionEnd() >= NGGroupEnd) {  
259 - return true;  
260 - }  
261 - }  
262 - return false;  
263 - }  
264 -  
265 - private static boolean precedingWordIsVerb(SyntacticGroup group,  
266 - Sentence sentence) {  
267 - int precedingTokenPosition = group.getSentencePositionStart() - 1;  
268 - if(isPartOfPrepNG(group, sentence)) {  
269 - SyntacticGroup parentGroup = getParentPrepNG(group, sentence);  
270 - precedingTokenPosition = parentGroup.getSentencePositionStart() - 1;  
271 - }  
272 -  
273 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
274 - int lastWordPosition = word.getSentencePositionEnd();  
275 - if (precedingTokenPosition == lastWordPosition &&  
276 - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {  
277 - return true;  
278 - }  
279 - }  
280 - return false;  
281 - }  
282 -  
283 - // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem?  
284 - // czy prep moze sie skladac z wiecej niz jednego segmentu?  
285 - // dopasowywac refla i recip do sie spejdowego  
286 - private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup,  
287 - SyntacticGroup PrepNGGroup, Sentence sentence,  
288 - Map<String,ArrayList<String>> walentyMapping) {  
289 - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;  
290 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
291 - int lastWordPosition = word.getSentencePositionEnd();  
292 - if (precedingTokenPosition == lastWordPosition &&  
293 - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {  
294 - String verb = word.getBase();  
295 - if (!walentyMapping.containsKey(verb)) {  
296 - return true;  
297 - } else {  
298 - SyntacticWord prepWord = PrepNGGroup.getFirstWord();  
299 -  
300 - if (prepWord.getTokens().size() == 1) {  
301 - Token prep = prepWord.getTokens().get(0);  
302 - String prepBase = prep.getBase();  
303 - // sprawdzic czy glowa moze miec wiele tokenow  
304 - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();  
305 - ArrayList<String> prepnps = getPrepnps(prepBase, prepCase);  
306 -  
307 - ArrayList<String> schemata = walentyMapping.get(verb);  
308 - for (String schema : schemata) {  
309 - for (String prepnp : prepnps) {  
310 - if (schema.contains(prepnp)) {  
311 - return true;  
312 - }  
313 - }  
314 - }  
315 - } else if (prepWord.getTokens().size() > 1) {  
316 - String prepOrth = prepWord.getOrth().toLowerCase();  
317 - String comprepnp = String.format("comprepnp(%s)", prepOrth);  
318 - ArrayList<String> schemata = walentyMapping.get(verb);  
319 - for (String schema : schemata) {  
320 - if (schema.contains(comprepnp)) {  
321 - return true;  
322 - }  
323 - }  
324 -  
325 - }  
326 -  
327 -  
328 - }  
329 - }  
330 } 155 }
331 - return false;  
332 - }  
333 -  
334 - private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup,  
335 - SyntacticGroup PrepNGGroup, Sentence sentence,  
336 - Map<String,ArrayList<String>> walentyMapping) {  
337 - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;  
338 - if(isPartOfPrepNG(NGGroup, sentence)) {  
339 - SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence);  
340 - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1;  
341 - }  
342 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
343 - int lastWordPosition = word.getSentencePositionEnd();  
344 - if (precedingTokenPosition == lastWordPosition &&  
345 - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {  
346 - if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) {  
347 - return true;  
348 - }  
349 - if (!walentyMapping.containsKey(word.getBase())) {  
350 - return true;  
351 - }  
352 -  
353 - }  
354 - }  
355 - return false;  
356 - }  
357 -  
358 - private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup,  
359 - SyntacticGroup PrepNGGroup, Sentence sentence,  
360 - Map<String,ArrayList<String>> walentyMapping) {  
361 - String verbBase = verb.getBase();  
362 - if (!walentyMapping.containsKey(verbBase)) {  
363 - return true;  
364 - } else {  
365 - ArrayList<String> schemata = walentyMapping.get(verbBase);  
366 -  
367 - // PrepNG + PrepNG  
368 - if (isPartOfPrepNG(NGGroup, sentence)) {  
369 - SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence);  
370 - ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations();  
371 - ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations();  
372 - for (String schema : schemata) {  
373 - if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) {  
374 - return true;  
375 - }  
376 - }  
377 - }  
378 -  
379 - // NG + PrepNG  
380 - else {  
381 - ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations();  
382 - ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations();  
383 - for (String schema : schemata) {  
384 - if (isProperSchema(schema, NGRealizations, prepNGRealizations)) {  
385 - return true;  
386 - }  
387 - }  
388 - }  
389 - }  
390 - return false;  
391 } 156 }
392 157
393 private static boolean isProperSchema(String schema, ArrayList<String> group1Types, 158 private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
394 ArrayList<String> group2Types) { 159 ArrayList<String> group2Types) {
395 for (String group1Type : group1Types) { 160 for (String group1Type : group1Types) {
396 - if (schema.contains(group1Type)) { 161 + if (schemaContains(schema, group1Type)) {
397 for (String group2Type : group2Types) { 162 for (String group2Type : group2Types) {
398 - if (schema.contains(group2Type)) { 163 + if (schemaContains(schema, group2Type)) {
399 return true; 164 return true;
400 } 165 }
401 } 166 }
@@ -404,103 +169,71 @@ public class Detector { @@ -404,103 +169,71 @@ public class Detector {
404 return false; 169 return false;
405 } 170 }
406 171
407 - private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup,  
408 - Sentence sentence) {  
409 - SyntacticGroup parentPrepNG = null;  
410 - int NGGroupStart = NGGroup.getSentencePositionStart();  
411 - int NGGroupEnd = NGGroup.getSentencePositionEnd();  
412 - for (SyntacticGroup group : sentence.getGroups()) {  
413 - if (group.getType().startsWith("PrepNG") &&  
414 - group.getSentencePositionStart() <= NGGroupStart &&  
415 - group.getSentencePositionEnd() >= NGGroupEnd) {  
416 - if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {  
417 - parentPrepNG = group;  
418 - }  
419 - }  
420 - }  
421 - return parentPrepNG;  
422 - }  
423 -  
424 - private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup,  
425 - SyntacticGroup PrepNGGroup, Sentence sentence, 172 + private static boolean groupsValenceCompatibility(SyntacticGroup NG1,
  173 + SyntacticGroup NG2, Sentence sentence,
426 Map<String,ArrayList<String>> walentyMapping) { 174 Map<String,ArrayList<String>> walentyMapping) {
427 - Token NGHead = NGGroup.getSemanticHeadTokens().get(0); 175 + Token NG1Head = NG1.getSemanticHeadTokens().get(0);
428 176
429 - String NGHeadBase = NGHead.getBase(); 177 + String NGHeadBase = NG1Head.getBase();
430 178
431 if (!walentyMapping.containsKey(NGHeadBase)) { 179 if (!walentyMapping.containsKey(NGHeadBase)) {
432 return false; 180 return false;
433 } else { 181 } else {
434 - SyntacticWord prepWord = PrepNGGroup.getFirstWord(); 182 + ArrayList<String> NG2realizations = NG2.getWalentyRealizations();
435 183
436 - if (prepWord.getTokens().size() == 1) {  
437 - Token prep = prepWord.getTokens().get(0);  
438 - String prepBase = prep.getBase();  
439 - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();  
440 - String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase);  
441 - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);  
442 - for (String schema : schemata) {  
443 - if (schemaContains(schema, prepnp)) {  
444 - return true;  
445 - }  
446 - }  
447 - } else if (prepWord.getTokens().size() > 1) {  
448 - String prepOrth = prepWord.getOrth().toLowerCase();  
449 - String comprepnp = String.format("comprepnp(%s)", prepOrth);  
450 - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); 184 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  185 + for (String real : NG2realizations) {
451 for (String schema : schemata) { 186 for (String schema : schemata) {
452 - if (schemaContains(schema, comprepnp)) { 187 + if (schemaContains(schema, real)) {
453 return true; 188 return true;
454 } 189 }
455 } 190 }
456 -  
457 } 191 }
458 -  
459 } 192 }
460 return false; 193 return false;
461 } 194 }
462 195
463 - private static boolean NGNGValenceCompatibility(SyntacticGroup NG1,  
464 - SyntacticGroup NG2, Sentence sentence, 196 + private static boolean tripleCompatibility(SyntacticGroup group1,
  197 + SyntacticGroup group2, SyntacticGroup group3,
465 Map<String,ArrayList<String>> walentyMapping) { 198 Map<String,ArrayList<String>> walentyMapping) {
466 - Token NG1Head = NG1.getSemanticHeadTokens().get(0); 199 + Token group1Head = group1.getSemanticHeadTokens().get(0);
467 200
468 - String NGHeadBase = NG1Head.getBase(); 201 + String group1HeadBase = group1Head.getBase();
469 202
470 - if (!walentyMapping.containsKey(NGHeadBase)) { 203 + if (!walentyMapping.containsKey(group1HeadBase)) {
471 return false; 204 return false;
472 } else { 205 } else {
473 - ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); 206 + ArrayList<String> group2realizations = group2.getWalentyRealizations();
  207 + ArrayList<String> group3realizations = group3.getWalentyRealizations();
474 208
475 - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);  
476 - for (String real : NG2realizations) {  
477 - for (String schema : schemata) {  
478 - if (schemaContains(schema, real)) {  
479 - return true;  
480 - } 209 + ArrayList<String> schemata = walentyMapping.get(group1HeadBase);
  210 + for (String schema : schemata) {
  211 + if (isProperSchema(schema, group2realizations, group3realizations)) {
  212 + return true;
481 } 213 }
482 } 214 }
483 } 215 }
484 return false; 216 return false;
485 } 217 }
486 218
487 - private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1,  
488 - SyntacticGroup NGGroup2, Sentence sentence, 219 + private static boolean quatroCompatibility(SyntacticGroup group1,
  220 + SyntacticGroup group2, SyntacticGroup group3, SyntacticGroup group4,
489 Map<String,ArrayList<String>> walentyMapping) { 221 Map<String,ArrayList<String>> walentyMapping) {
490 -  
491 - Token NGHead = NGGroup1.getSemanticHeadTokens().get(0);  
492 - String NGHeadBase = NGHead.getBase(); 222 + Token group1Head = group1.getSemanticHeadTokens().get(0);
  223 +
  224 + String group1HeadBase = group1Head.getBase();
493 225
494 - if (!walentyMapping.containsKey(NGHeadBase)) { 226 + if (!walentyMapping.containsKey(group1HeadBase)) {
495 return false; 227 return false;
496 } else { 228 } else {
497 - int prepStart = NGGroup1.getSentencePositionEnd() + 1;  
498 - int prepEnd = NGGroup2.getSentencePositionStart() - 1;  
499 - String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd);  
500 - String comprepnp = String.format("comprepnp(%s)", complexPrep);  
501 - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); 229 + ArrayList<String> group2realizations = group2.getWalentyRealizations();
  230 + ArrayList<String> group3realizations = group3.getWalentyRealizations();
  231 + ArrayList<String> group4realizations = group4.getWalentyRealizations();
  232 +
  233 + ArrayList<String> schemata = walentyMapping.get(group1HeadBase);
502 for (String schema : schemata) { 234 for (String schema : schemata) {
503 - if (schemaContains(schema, comprepnp)) { 235 + if (isTripleProperSchema(schema, group2realizations, group3realizations,
  236 + group4realizations)) {
504 return true; 237 return true;
505 } 238 }
506 } 239 }
@@ -508,67 +241,119 @@ public class Detector { @@ -508,67 +241,119 @@ public class Detector {
508 return false; 241 return false;
509 } 242 }
510 243
511 - private static boolean schemaContains(String schema, String phraseType) {  
512 - for (String position : schema.split("\\s\\+\\s")) {  
513 - position = position.trim();  
514 - position = position.substring(1, position.length()-1);  
515 - for (String phrT : position.split(";")) {  
516 - if (phrT.equals(phraseType)) {  
517 - return true; 244 + private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types,
  245 + ArrayList<String> group2Types, ArrayList<String> group3Types) {
  246 + for (String group1Type : group1Types) {
  247 + if (schemaContains(schema, group1Type)) {
  248 + for (String group2Type : group2Types) {
  249 + if (schemaContains(schema, group2Type)) {
  250 + for (String group3Type : group3Types) {
  251 + if (schemaContains(schema, group3Type)) {
  252 + return true;
  253 + }
  254 + }
  255 + }
518 } 256 }
519 } 257 }
520 } 258 }
521 return false; 259 return false;
522 } 260 }
523 261
524 - private static boolean schemaContainsType(String schema, String type) {  
525 - // to lepiej dziala dla rzeczownikow  
526 - for (String position : schema.split("\\s\\+\\s")) {  
527 - position = position.trim();  
528 - position = position.substring(1, position.length()-1);  
529 - for (String phrT : position.split(";")) {  
530 -  
531 - if (phrT.startsWith(type+"(")) {  
532 - return true; 262 + /*private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types,
  263 + ArrayList<String> group2Types, ArrayList<String> group3Types) {
  264 +
  265 + ArrayList<String> group1MPositions = getMatchingPositions(schema, group1Types);
  266 + ArrayList<String> group2MPositions = getMatchingPositions(schema, group2Types);
  267 + ArrayList<String> group3MPositions = getMatchingPositions(schema, group3Types);
  268 +
  269 +
  270 +
  271 + ArrayList<String> group1MPositionsCopy = new ArrayList<String>();
  272 + ArrayList<String> group2MPositionsCopy = getMatchingPositions(schema, group2Types);
  273 + ArrayList<String> group3MPositionsCopy = getMatchingPositions(schema, group3Types);
  274 +
  275 +
  276 + if (group1MPositions.isEmpty() || group2MPositions.isEmpty() || group3MPositions.isEmpty()) {
  277 + return false;
  278 + }
  279 +
  280 + boolean group1ok = false;
  281 + boolean group2ok = false;
  282 + boolean group3ok = false;
  283 +
  284 + for (String pos : group1MPositions) {
  285 +
  286 + }
  287 +
  288 + ArrayList<String>
  289 +
  290 + if (union(group1MPositions, group2MPositions).size() > group1MPositions.size() &&
  291 + )
  292 +
  293 +
  294 + for (String group1Type : group1Types) {
  295 + if (schemaContains(schema, group1Type)) {
  296 + for (String group2Type : group2Types) {
  297 + if (schemaContains(schema, group2Type)) {
  298 + for (String group3Type : group3Types) {
  299 + if (schemaContains(schema, group3Type)) {
  300 + return true;
  301 + }
  302 + }
  303 + }
533 } 304 }
534 } 305 }
535 } 306 }
536 return false; 307 return false;
  308 + }*/
  309 +
  310 + public static List<String> union(List<String> list1, List<String> list2) {
  311 + HashSet<String> set = new HashSet<String>();
  312 +
  313 + set.addAll(list1);
  314 + set.addAll(list2);
  315 +
  316 + return new ArrayList<String>(set);
537 } 317 }
538 318
  319 + public static List<String> tripleUnion(List<String> list1, List<String> list2,
  320 + List<String> list3) {
  321 + HashSet<String> set = new HashSet<String>();
  322 +
  323 + set.addAll(list1);
  324 + set.addAll(list2);
  325 + set.addAll(list3);
  326 +
  327 + return new ArrayList<String>(set);
  328 + }
539 329
540 - // compar ??  
541 - private static ArrayList<String> getPrepnps(String prepBase, String prepCase) {  
542 - ArrayList<String> prepnps = new ArrayList<String>();  
543 - prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));  
544 - if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) {  
545 - prepnps.add(String.format("prepnp(%s,str)", prepBase));  
546 - }  
547 - if (prepCase.equals("gen") || prepCase.equals("acc")) {  
548 - prepnps.add(String.format("prepnp(%s,part)", prepBase)); 330 + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) {
  331 + ArrayList<String> positions = new ArrayList<String>();
  332 + for (String position : schema.split("\\s\\+\\s")) {
  333 + position = position.trim();
  334 + position = position.substring(1, position.length()-1);
  335 + for (String phrT : position.split(";")) {
  336 + if (phraseRealizations.contains(phrT.trim())) {
  337 + positions.add(position);
  338 + break;
  339 + }
  340 + }
549 } 341 }
550 - return prepnps; 342 + return positions;
551 } 343 }
552 344
553 - // eliminuje "od wsi do wsi"  
554 - private static boolean sameSemanticHeads(SyntacticGroup group1,  
555 - SyntacticGroup group2) {  
556 -  
557 - List<Token> group1HeadTokens = group1.getSemanticHeadTokens();  
558 - List<Token> group2HeadTokens = group2.getSemanticHeadTokens();  
559 - if (group1HeadTokens.size() != group2HeadTokens.size()) {  
560 - return false;  
561 - }  
562 -  
563 - for (int i=0; i < group1HeadTokens.size(); i++) {  
564 - if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) {  
565 - return false; 345 + private static boolean schemaContains(String schema, String phraseType) {
  346 + for (String position : schema.split("\\s\\+\\s")) {
  347 + position = position.trim();
  348 + position = position.substring(1, position.length()-1);
  349 + for (String phrT : position.split(";")) {
  350 + if (phrT.equals(phraseType)) {
  351 + return true;
  352 + }
566 } 353 }
567 } 354 }
568 -  
569 - return true; 355 + return false;
570 } 356 }
571 -  
572 357
573 /** 358 /**
574 * Wyszukuję i oznaczam wszystkie NER 359 * Wyszukuję i oznaczam wszystkie NER
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 package pl.waw.ipipan.zil.core.md.entities; 1 package pl.waw.ipipan.zil.core.md.entities;
2 2
3 import java.util.ArrayList; 3 import java.util.ArrayList;
4 -import java.util.Arrays;  
5 import java.util.List; 4 import java.util.List;
6 5
  6 +import pl.waw.ipipan.zil.core.md.detection.Constants;
  7 +
7 /** 8 /**
8 * @author Mateusz Kopec 9 * @author Mateusz Kopec
  10 + * Modified 2017 by Bartlomiej Niton
9 * 11 *
10 */ 12 */
11 public class Mention implements Comparable<Mention> { 13 public class Mention implements Comparable<Mention> {
@@ -205,77 +207,22 @@ public class Mention implements Comparable<Mention> { @@ -205,77 +207,22 @@ public class Mention implements Comparable<Mention> {
205 return isZeroSubject; 207 return isZeroSubject;
206 } 208 }
207 209
208 - public int getSentencePositionStart() { 210 + public int getSentenceStartPosition() {
209 Token startToken = this.getFirstSegment(); 211 Token startToken = this.getFirstSegment();
210 return startToken.getSentencePosition(); 212 return startToken.getSentencePosition();
211 } 213 }
212 214
213 - public int getSentencePositionEnd() { 215 + public int getSentenceEndPosition() {
214 Token endToken = this.getLastSegment(); 216 Token endToken = this.getLastSegment();
215 return endToken.getSentencePosition(); 217 return endToken.getSentencePosition();
216 } 218 }
217 -  
218 - public boolean isPartOfQub() {  
219 - if (this.segments.size() == 1) {  
220 - Sentence sentence = this.segments.get(0).getSentence();  
221 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
222 - if (word.getTokens().contains(this.segments.get(0)) &&  
223 - word.getCtag().equals("Qub")) {  
224 - return true;  
225 - }  
226 - }  
227 - }  
228 - return false;  
229 - }  
230 -  
231 - public boolean isPartOfPrep() {  
232 - if (this.segments.size() == 1) {  
233 - Sentence sentence = this.segments.get(0).getSentence();  
234 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
235 - if (word.getTokens().contains(this.segments.get(0)) &&  
236 - word.getCtag().equals("Prep")) {  
237 - return true;  
238 - }  
239 - }  
240 - }  
241 - return false;  
242 - }  
243 -  
244 - private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj",  
245 - "Adj", "Conj", "Comp");  
246 219
247 public boolean isPartOfFrazeo() { 220 public boolean isPartOfFrazeo() {
248 if (this.segments.size() == 1) { 221 if (this.segments.size() == 1) {
249 Sentence sentence = this.segments.get(0).getSentence(); 222 Sentence sentence = this.segments.get(0).getSentence();
250 for (SyntacticWord word : sentence.getSyntacticWords()) { 223 for (SyntacticWord word : sentence.getSyntacticWords()) {
251 if (word.getTokens().contains(this.segments.get(0)) && 224 if (word.getTokens().contains(this.segments.get(0)) &&
252 - FRAZEOS.contains(word.getCtag())) {  
253 - return true;  
254 - }  
255 - }  
256 - }  
257 - return false;  
258 - }  
259 -  
260 - public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) {  
261 - if (this.segments.size() == 1) {  
262 - Sentence sentence = this.segments.get(0).getSentence();  
263 - if (this.getSentencePositionStart() - 1 >= 0) {  
264 - String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth();  
265 - String noun = sentence.get(this.getSentencePositionStart()).getOrth();  
266 - String possiblePrep = String.format("%s %s", prep, noun);  
267 - if (complexPreps.contains(possiblePrep)) {  
268 - return true;  
269 - }  
270 - }  
271 -  
272 - if (this.getSentencePositionStart() - 1 >= 0 &&  
273 - this.getSentencePositionStart() + 1 < sentence.size()) {  
274 - String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth();  
275 - String noun = sentence.get(this.getSentencePositionStart()).getOrth();  
276 - String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth();  
277 - String possiblePrep = String.format("%s %s %s", prep1, noun, prep2);  
278 - if (complexPreps.contains(possiblePrep)) { 225 + Constants.FRAZEO_CTAGS.contains(word.getCtag())) {
279 return true; 226 return true;
280 } 227 }
281 } 228 }
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
@@ -110,35 +110,6 @@ public class Sentence extends ArrayList<Token> { @@ -110,35 +110,6 @@ public class Sentence extends ArrayList<Token> {
110 namedEntities.add(namedEntity); 110 namedEntities.add(namedEntity);
111 } 111 }
112 112
113 - public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) {  
114 - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();  
115 - for (SyntacticGroup group : this.syntacticGroups) {  
116 - if (group.getSentencePositionStart() >= start &&  
117 - group.getSentencePositionEnd() <= end) {  
118 - if (!(group.getSentencePositionStart() == start &&  
119 - group.getSentencePositionEnd() == end)) {  
120 - groupsAtSpan.add(group);  
121 - }  
122 - }  
123 - }  
124 - return groupsAtSpan;  
125 - }  
126 -  
127 - public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) {  
128 - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();  
129 - for (SyntacticGroup group : this.syntacticGroups) {  
130 -  
131 - if (group.getSentencePositionStart() >= start &&  
132 - group.getSentencePositionEnd() <= end) {  
133 - if (!(group.getSentencePositionStart() == start &&  
134 - group.getSentencePositionEnd() == end)) {  
135 - groupsAtSpan.add(group);  
136 - }  
137 - }  
138 - }  
139 - return groupsAtSpan;  
140 - }  
141 -  
142 public SyntacticGroup getFirstGroup(int start, int end) { 113 public SyntacticGroup getFirstGroup(int start, int end) {
143 SyntacticGroup largestGroup = null; 114 SyntacticGroup largestGroup = null;
144 int step = start; 115 int step = start;
@@ -152,8 +123,8 @@ public class Sentence extends ArrayList<Token> { @@ -152,8 +123,8 @@ public class Sentence extends ArrayList<Token> {
152 private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { 123 private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) {
153 SyntacticGroup largestGroup = null; 124 SyntacticGroup largestGroup = null;
154 for (SyntacticGroup group : this.getGroups()) { 125 for (SyntacticGroup group : this.getGroups()) {
155 - int groupStart = group.getSentencePositionStart();  
156 - int groupEnd = group.getSentencePositionEnd(); 126 + int groupStart = group.getSentenceStartPosition();
  127 + int groupEnd = group.getSentenceEndPosition();
157 if (groupStart == start && groupEnd <= end && 128 if (groupStart == start && groupEnd <= end &&
158 !(groupStart == start && groupEnd == end) && 129 !(groupStart == start && groupEnd == end) &&
159 (largestGroup == null || 130 (largestGroup == null ||
@@ -177,8 +148,8 @@ public class Sentence extends ArrayList<Token> { @@ -177,8 +148,8 @@ public class Sentence extends ArrayList<Token> {
177 private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { 148 private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) {
178 SyntacticGroup largestGroup = null; 149 SyntacticGroup largestGroup = null;
179 for (SyntacticGroup group : this.getGroups()) { 150 for (SyntacticGroup group : this.getGroups()) {
180 - int groupStart = group.getSentencePositionStart();  
181 - int groupEnd = group.getSentencePositionEnd(); 151 + int groupStart = group.getSentenceStartPosition();
  152 + int groupEnd = group.getSentenceEndPosition();
182 if (groupEnd == end && groupStart >= start && 153 if (groupEnd == end && groupStart >= start &&
183 !(groupStart == start && groupEnd == end) && 154 !(groupStart == start && groupEnd == end) &&
184 (largestGroup == null || 155 (largestGroup == null ||
@@ -189,38 +160,4 @@ public class Sentence extends ArrayList<Token> { @@ -189,38 +160,4 @@ public class Sentence extends ArrayList<Token> {
189 return largestGroup; 160 return largestGroup;
190 } 161 }
191 162
192 - public ArrayList<Mention> getMentionsInsideSpan(int start, int end) {  
193 - ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>();  
194 - for (Mention mention : this.mentions) {  
195 - if (mention.getSentencePositionStart() >= start &&  
196 - mention.getSentencePositionEnd() <= end) {  
197 - mentionsAtSpan.add(mention);  
198 - }  
199 - }  
200 - return mentionsAtSpan;  
201 - }  
202 -  
203 - public String getTextInsideSpan(int start, int end) {  
204 - String text = "";  
205 - int step = start;  
206 - while (step <= end) {  
207 - if (step != start) {  
208 - text += " ";  
209 - }  
210 - text += this.get(step).getOrth();  
211 - step++;  
212 - }  
213 - return text;  
214 - }  
215 -  
216 - public ArrayList<Token> getSegmentsInsideSpan(int start, int end) {  
217 - ArrayList<Token> tokensAtSpan = new ArrayList<Token>();  
218 - int step = start;  
219 - while (step <= end) {  
220 - tokensAtSpan.add(this.get(step));  
221 - step++;  
222 - }  
223 - return tokensAtSpan;  
224 - }  
225 -  
226 } 163 }
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
@@ -55,34 +55,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -55,34 +55,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
55 return getType().compareTo(o.getType()); 55 return getType().compareTo(o.getType());
56 } 56 }
57 57
58 - public int getSentencePositionStart() { 58 + public int getSentenceStartPosition() {
59 Token startToken = tokens.get(0); 59 Token startToken = tokens.get(0);
60 return startToken.getSentencePosition(); 60 return startToken.getSentencePosition();
61 } 61 }
62 62
63 - public int getSentencePositionEnd() { 63 + public int getSentenceEndPosition() {
64 Token endToken = tokens.get(tokens.size()-1); 64 Token endToken = tokens.get(tokens.size()-1);
65 return endToken.getSentencePosition(); 65 return endToken.getSentencePosition();
66 } 66 }
67 67
68 -  
69 - public SyntacticWord getFirstWord() {  
70 - SyntacticWord firstWord = null;  
71 - Token startToken = tokens.get(0);  
72 - Sentence sentence = startToken.getSentence();  
73 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
74 - if(startToken.compareTo(word.getTokens().get(0)) == 0 &&  
75 - (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) {  
76 - firstWord = word;  
77 - }  
78 - }  
79 - return firstWord;  
80 - }  
81 -  
82 - // NG and PrepNG only now  
83 public ArrayList<String> getWalentyRealizations() { 68 public ArrayList<String> getWalentyRealizations() {
84 ArrayList<String> realizations = new ArrayList<String>(); 69 ArrayList<String> realizations = new ArrayList<String>();
85 - if (this.type.startsWith("PrepNG")) { 70 + if (this.type.equals("PrepNG")) {
86 SyntacticWord prepWord = this.getFirstWord(); 71 SyntacticWord prepWord = this.getFirstWord();
87 if (prepWord.getTokens().size() == 1) { 72 if (prepWord.getTokens().size() == 1) {
88 73
@@ -105,7 +90,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -105,7 +90,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
105 return realizations; 90 return realizations;
106 } 91 }
107 92
108 - // compar ?? 93 + public SyntacticWord getFirstWord() {
  94 + SyntacticWord firstWord = null;
  95 + Token startToken = tokens.get(0);
  96 + Sentence sentence = startToken.getSentence();
  97 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  98 + if(startToken.compareTo(word.getTokens().get(0)) == 0 &&
  99 + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) {
  100 + firstWord = word;
  101 + }
  102 + }
  103 + return firstWord;
  104 + }
  105 +
109 private ArrayList<String> getPrepnps(String prepBase, String prepCase) { 106 private ArrayList<String> getPrepnps(String prepBase, String prepCase) {
110 ArrayList<String> prepnps = new ArrayList<String>(); 107 ArrayList<String> prepnps = new ArrayList<String>();
111 prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); 108 prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
@@ -130,44 +127,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -130,44 +127,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
130 return nps; 127 return nps;
131 } 128 }
132 129
133 - public boolean precedingWordIsVerb() {  
134 - Sentence sentence = this.tokens.get(0).getSentence();  
135 - int precedingTokenPosition = this.getSentencePositionStart() - 1;  
136 - for (SyntacticWord word : sentence.getSyntacticWords()) {  
137 - int lastWordPosition = word.getSentencePositionEnd();  
138 - if (precedingTokenPosition == lastWordPosition &&  
139 - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {  
140 - return true;  
141 - }  
142 - }  
143 - return false;  
144 - }  
145 -  
146 - public SyntacticGroup getNextNG() {  
147 - Sentence sentence = this.tokens.get(0).getSentence();  
148 - int thisGroupEnd = this.getSentencePositionEnd();  
149 - int sentenceLength = sentence.size();  
150 -  
151 - SyntacticGroup nextNG = null;  
152 - for (int step = thisGroupEnd; step < sentenceLength; step++) {  
153 - nextNG = sentence.getFirstGroup(step, sentenceLength);  
154 - if (nextNG != null && nextNG.type.startsWith("NG") &&  
155 - this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) {  
156 - break;  
157 - } else {  
158 - nextNG = null;  
159 - }  
160 - }  
161 - return nextNG;  
162 - }  
163 -  
164 public SyntacticGroup getFollowingGroup() { 130 public SyntacticGroup getFollowingGroup() {
165 SyntacticGroup largestGroup = null; 131 SyntacticGroup largestGroup = null;
166 Sentence sentence = this.tokens.get(0).getSentence(); 132 Sentence sentence = this.tokens.get(0).getSentence();
167 - int nextTokenPosition = this.getSentencePositionEnd() + 1; 133 + int nextTokenPosition = this.getSentenceEndPosition() + 1;
168 for (SyntacticGroup group : sentence.getGroups()) { 134 for (SyntacticGroup group : sentence.getGroups()) {
169 - if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) &&  
170 - group.getSentencePositionStart() == nextTokenPosition) { 135 + if ((group.getType().equals("PrepNG") || group.getType().startsWith("NG")) &&
  136 + group.getSentenceStartPosition() == nextTokenPosition) {
171 if (largestGroup == null || 137 if (largestGroup == null ||
172 largestGroup.getTokens().size() < group.getTokens().size()) { 138 largestGroup.getTokens().size() < group.getTokens().size()) {
173 largestGroup = group; 139 largestGroup = group;
@@ -178,14 +144,14 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -178,14 +144,14 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
178 } 144 }
179 145
180 public SyntacticWord getPrecedingVerb() { 146 public SyntacticWord getPrecedingVerb() {
181 - int precedingTokenPosition = this.getSentencePositionStart() - 1; 147 + int precedingTokenPosition = this.getSentenceStartPosition() - 1;
182 Sentence sentence = this.tokens.get(0).getSentence(); 148 Sentence sentence = this.tokens.get(0).getSentence();
183 if(this.isPartOfPrepNG()) { 149 if(this.isPartOfPrepNG()) {
184 SyntacticGroup parentNGGroup = this.getParentPrepNG(); 150 SyntacticGroup parentNGGroup = this.getParentPrepNG();
185 - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; 151 + precedingTokenPosition = parentNGGroup.getSentenceStartPosition() - 1;
186 } 152 }
187 for (SyntacticWord word : sentence.getSyntacticWords()) { 153 for (SyntacticWord word : sentence.getSyntacticWords()) {
188 - int lastWordPosition = word.getSentencePositionEnd(); 154 + int lastWordPosition = word.getSentenceEndPosition();
189 if (precedingTokenPosition == lastWordPosition && 155 if (precedingTokenPosition == lastWordPosition &&
190 (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { 156 (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
191 return word; 157 return word;
@@ -195,13 +161,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -195,13 +161,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
195 } 161 }
196 162
197 private boolean isPartOfPrepNG() { 163 private boolean isPartOfPrepNG() {
198 - int NGGroupStart = this.getSentencePositionStart();  
199 - int NGGroupEnd = this.getSentencePositionEnd(); 164 + int NGGroupStart = this.getSentenceStartPosition();
  165 + int NGGroupEnd = this.getSentenceEndPosition();
200 Sentence sentence = this.tokens.get(0).getSentence(); 166 Sentence sentence = this.tokens.get(0).getSentence();
201 for (SyntacticGroup group : sentence.getGroups()) { 167 for (SyntacticGroup group : sentence.getGroups()) {
202 - if (group.getType().startsWith("PrepNG") &&  
203 - group.getSentencePositionStart() <= NGGroupStart &&  
204 - group.getSentencePositionEnd() >= NGGroupEnd) { 168 + if (group.getType().equals("PrepNG") &&
  169 + group.getSentenceStartPosition() <= NGGroupStart &&
  170 + group.getSentenceEndPosition() >= NGGroupEnd) {
205 return true; 171 return true;
206 } 172 }
207 } 173 }
@@ -210,13 +176,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { @@ -210,13 +176,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
210 176
211 private SyntacticGroup getParentPrepNG() { 177 private SyntacticGroup getParentPrepNG() {
212 SyntacticGroup parentPrepNG = null; 178 SyntacticGroup parentPrepNG = null;
213 - int NGGroupStart = this.getSentencePositionStart();  
214 - int NGGroupEnd = this.getSentencePositionEnd(); 179 + int NGGroupStart = this.getSentenceStartPosition();
  180 + int NGGroupEnd = this.getSentenceEndPosition();
215 Sentence sentence = this.tokens.get(0).getSentence(); 181 Sentence sentence = this.tokens.get(0).getSentence();
216 for (SyntacticGroup group : sentence.getGroups()) { 182 for (SyntacticGroup group : sentence.getGroups()) {
217 - if (group.getType().startsWith("PrepNG") &&  
218 - group.getSentencePositionStart() <= NGGroupStart &&  
219 - group.getSentencePositionEnd() >= NGGroupEnd) { 183 + if (group.getType().equals("PrepNG") &&
  184 + group.getSentenceStartPosition() <= NGGroupStart &&
  185 + group.getSentenceEndPosition() >= NGGroupEnd) {
220 if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { 186 if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {
221 parentPrepNG = group; 187 parentPrepNG = group;
222 } 188 }
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
@@ -4,6 +4,8 @@ import java.util.ArrayList; @@ -4,6 +4,8 @@ import java.util.ArrayList;
4 import java.util.Iterator; 4 import java.util.Iterator;
5 import java.util.List; 5 import java.util.List;
6 6
  7 +import pl.waw.ipipan.zil.core.md.detection.Constants;
  8 +
7 public class SyntacticWord implements Comparable<SyntacticWord> { 9 public class SyntacticWord implements Comparable<SyntacticWord> {
8 10
9 private String base; 11 private String base;
@@ -22,6 +24,14 @@ public class SyntacticWord implements Comparable<SyntacticWord> { @@ -22,6 +24,14 @@ public class SyntacticWord implements Comparable<SyntacticWord> {
22 public String getCtag() { 24 public String getCtag() {
23 return ctag; 25 return ctag;
24 } 26 }
  27 +
  28 + public String getBase() {
  29 + return base;
  30 + }
  31 +
  32 + public String getOrth() {
  33 + return orth;
  34 + }
25 35
26 public List<Token> getTokens() { 36 public List<Token> getTokens() {
27 return tokens; 37 return tokens;
@@ -45,33 +55,18 @@ public class SyntacticWord implements Comparable<SyntacticWord> { @@ -45,33 +55,18 @@ public class SyntacticWord implements Comparable<SyntacticWord> {
45 return getCtag().compareTo(o.getCtag()); 55 return getCtag().compareTo(o.getCtag());
46 } 56 }
47 57
48 - public int getSentencePositionStart() { 58 + public int getSentenceStartPosition() {
49 Token startToken = tokens.get(0); 59 Token startToken = tokens.get(0);
50 return startToken.getSentencePosition(); 60 return startToken.getSentencePosition();
51 } 61 }
52 62
53 - public int getSentencePositionEnd() { 63 + public int getSentenceEndPosition() {
54 Token endToken = tokens.get(tokens.size()-1); 64 Token endToken = tokens.get(tokens.size()-1);
55 return endToken.getSentencePosition(); 65 return endToken.getSentencePosition();
56 } 66 }
57 67
58 - public String getBase() {  
59 - return this.base;  
60 - }  
61 -  
62 - public String getOrth() {  
63 - return this.orth;  
64 - }  
65 -  
66 public boolean isVerb() { 68 public boolean isVerb() {
67 - if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) {  
68 - return true;  
69 - }  
70 - return false;  
71 - }  
72 -  
73 - public boolean isInterp() {  
74 - if (this.ctag.equals("Interp")) { 69 + if (Constants.VERB_CTAGS.contains(this.ctag)) {
75 return true; 70 return true;
76 } 71 }
77 return false; 72 return false;
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
@@ -70,7 +70,6 @@ public class TeiLoader { @@ -70,7 +70,6 @@ public class TeiLoader {
70 for (TEIMorph mo : m.getHeadMorphs()) 70 for (TEIMorph mo : m.getHeadMorphs())
71 headTokens.add(teiMorph2Segment.get(mo)); 71 headTokens.add(teiMorph2Segment.get(mo));
72 s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); 72 s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
73 - System.out.println(tokens.toString());  
74 } 73 }
75 74
76 private static void loadSyntacticGroup(Sentence s, TEIGroup g, 75 private static void loadSyntacticGroup(Sentence s, TEIGroup g,