Commit 8f86545e14f99bbf47ab83bf202e26af7a2716c4

Authored by Bartłomiej Nitoń
1 parent 1dc4f947

Cleaning unused experimental code.

src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... ... @@ -33,9 +33,8 @@ public class Main {
33 33  
34 34 private static final boolean GZIP_OUTPUT = true;
35 35 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
36   - private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt";
37   - private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt";
38   - private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt";
  36 + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all_with_realizations.txt";
  37 + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all_with_realizations.txt";
39 38  
40 39 private static ZeroSubjectDetector zeroSubjectModel;
41 40  
... ... @@ -46,8 +45,6 @@ public class Main {
46 45  
47 46 private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
48 47 new EnumMap(ValenceDicts.class);
49   -
50   - private static final ArrayList<String> complexPreps;
51 48  
52 49 static {
53 50 InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
... ... @@ -58,9 +55,6 @@ public class Main {
58 55  
59 56 InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
60 57 valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
61   -
62   - InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS);
63   - complexPreps = readValues(complexPrepositionsStream);
64 58 }
65 59  
66 60  
... ... @@ -125,34 +119,6 @@ public class Main {
125 119  
126 120 return false;
127 121 }
128   -
129   - public static ArrayList<String> readValues(InputStream stream) {
130   - ArrayList<String> values;
131   - try {
132   - BufferedReader br=new BufferedReader(new InputStreamReader(stream));
133   - values = new ArrayList<String>();
134   - String line;
135   - boolean firstLine = true;
136   - while((line = br.readLine()) != null) {
137   - if (firstLine) {
138   - line = line.replace("\uFEFF", ""); // remove BOM character
139   - firstLine = false;
140   - }
141   -
142   - if (!line.startsWith("%")) {
143   - String value = line.trim();
144   - if (!value.isEmpty()) {
145   - values.add(value);
146   - }
147   - }
148   - }
149   - br.close();
150   - } catch (IOException ex) {
151   - ex.printStackTrace();
152   - throw new RuntimeException(ex);
153   - }
154   - return values;
155   - }
156 122  
157 123 private Main() {
158 124 }
... ... @@ -244,7 +210,7 @@ public class Main {
244 210 */
245 211 public static void annotateThriftText(TText thriftText) throws MultiserviceException {
246 212 Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
247   - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
  213 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence);
248 214 ThriftSaver.updateThriftText(responseText, thriftText);
249 215 }
250 216  
... ... @@ -257,7 +223,7 @@ public class Main {
257 223 */
258 224 public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
259 225 Text responseText = TeiLoader.loadTextFromTei(teiText);
260   - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
  226 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence);
261 227 TeiSaver.updateTeiText(responseText, teiText);
262 228 }
263 229  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
3   -import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
4 3 import pl.waw.ipipan.zil.core.md.entities.Mention;
5 4 import pl.waw.ipipan.zil.core.md.entities.Sentence;
6 5 import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
... ... @@ -164,33 +163,6 @@ public class Cleaner {
164 163 }
165 164 }
166 165  
167   - /*private static void removeWalentyFramedMentions(Sentence sentence,
168   - ArrayList<Mention> mentions,
169   - ArrayList<String> schemata) {
170   - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
171   - for (Mention mention : mentions) {
172   - int mentionStart = mention.getFirstSegment().getSentencePosition();
173   - int mentionEnd = mention.getLastSegment().getSentencePosition();
174   - SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);
175   - SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);
176   - if (startGroup != null && endGroup != null
177   - && startGroup.compareTo(endGroup) != 0) {
178   - ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();
179   - ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();
180   - for (String schema : schemata) {
181   - if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {
182   - mentionsToRemove.add(mention);
183   - break;
184   - }
185   - }
186   - }
187   - }
188   -
189   - for (Mention mentionToRemove : mentionsToRemove) {
190   - sentence.removeMention(mentionToRemove);
191   - }
192   - }*/
193   -
194 166 private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
195 167 ArrayList<String> group2Types) {
196 168 for (String group1Type : group1Types) {
... ... @@ -207,7 +179,7 @@ public class Cleaner {
207 179 String phraseType2) {
208 180 boolean phrType1Found = false;
209 181 boolean phrType2Found = false;
210   - for (String position : schema.split("\\+")) {
  182 + for (String position : schema.split("\\s\\+\\s")) {
211 183 position = position.trim();
212 184 position = position.substring(1, position.length()-1);
213 185 for (String phrT : position.split(";")) {
... ... @@ -226,34 +198,6 @@ public class Cleaner {
226 198 return false;
227 199 }
228 200  
229   -
230   - // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub)
231   - public static void cleanQubs(Sentence sentence) {
232   - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
233   - for (Mention mention : sentence.getMentions()) {
234   - if (mention.isPartOfQub()) {
235   - mentionsToRemove.add(mention);
236   - }
237   - }
238   -
239   - for (Mention mentionToRemove : mentionsToRemove) {
240   - sentence.removeMention(mentionToRemove);
241   - }
242   - }
243   -
244   - public static void cleanPreps(Sentence sentence) {
245   - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
246   - for (Mention mention : sentence.getMentions()) {
247   - if (mention.isPartOfPrep()) {
248   - mentionsToRemove.add(mention);
249   - }
250   - }
251   -
252   - for (Mention mentionToRemove : mentionsToRemove) {
253   - sentence.removeMention(mentionToRemove);
254   - }
255   - }
256   -
257 201 public static void cleanFrazeos(Sentence sentence) {
258 202 ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
259 203 for (Mention mention : sentence.getMentions()) {
... ... @@ -267,20 +211,4 @@ public class Cleaner {
267 211 }
268 212 }
269 213  
270   - // wyrzuca wzmianki bedace czescia przyimkow zlozonych
271   - public static void cleanComplexPreps(Sentence sentence,
272   - ArrayList<String> complexPreps) {
273   -
274   - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
275   - for (Mention mention : sentence.getMentions()) {
276   - if (mention.isPartOfComplexPrep(complexPreps)) {
277   - mentionsToRemove.add(mention);
278   - }
279   - }
280   -
281   - for (Mention mentionToRemove : mentionsToRemove) {
282   - sentence.removeMention(mentionToRemove);
283   - }
284   - }
285   -
286 214 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
  3 +import java.util.Arrays;
  4 +import java.util.List;
  5 +
3 6 public class Constants {
4 7 public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";
5 8 public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";
... ... @@ -7,6 +10,11 @@ public class Constants {
7 10 public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|"
8 11 + MORPHO_PRONOUN_CTAGS;
9 12 public static final String WORDS_CTAGS = "Noun|Ppron.*";
  13 +
  14 + public static final List<String> FRAZEO_CTAGS = Arrays.asList("Prep", "Qub", "Adv", "Interj",
  15 + "Adj", "Conj", "Comp");
  16 +
  17 + public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin");
10 18  
11 19 private Constants() {
12 20 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... ... @@ -22,24 +22,22 @@ public class Detector {
22 22  
23 23 public static void findMentionsInText(Text text,
24 24 ZeroSubjectDetector zeroSubjectModel,
25   - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
26   - ArrayList<String> complexPreps) {
  25 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
27 26 text.clearMentions();
28 27 logger.debug("Detecting mentions in text " + text.getId());
29 28 for (Paragraph p : text)
30 29 for (Sentence s : p)
31   - detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps);
  30 + detectMentionsInSentence(s, zeroSubjectModel, valence);
32 31 }
33 32  
34 33 private static void detectMentionsInSentence(Sentence sentence,
35 34 ZeroSubjectDetector zeroSubjectModel,
36   - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
37   - ArrayList<String> complexPreps) {
  35 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
38 36 // adding mentions
39 37 addMentionsByTokenCtag(sentence);
40 38 addMentionsBySyntacticWordsCtag(sentence);
41 39 addMentionsByNamedEntities(sentence);
42   - addMentionsByGroups(sentence, valence, complexPreps);
  40 + addMentionsByGroups(sentence, valence);
43 41 addSpeakerMentionsInSpoken(sentence);
44 42  
45 43 // zero subject detection
... ... @@ -47,12 +45,9 @@ public class Detector {
47 45  
48 46 // removing mentions
49 47 removeTo(sentence);
  48 + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
50 49 Cleaner.cleanUnnecessarySentenceMentions(sentence);
51   - //Cleaner.cleanQubs(sentence);
52   - //Cleaner.cleanPreps(sentence);
53   - //Cleaner.cleanComplexPreps(sentence, complexPreps);
54 50 Cleaner.cleanFrazeos(sentence);
55   - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
56 51  
57 52 // updating mention heads
58 53 updateMentionHeads(sentence);
... ... @@ -108,294 +103,64 @@ public class Detector {
108 103 * @param sentence
109 104 */
110 105 private static void addMentionsByGroups(Sentence sentence,
111   - Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
112   - ArrayList<String> complexPreps) {
113   - List<SyntacticGroup> groups = sentence.getGroups();
114   - for (int i = 0; i < groups.size(); i++) {
115   - SyntacticGroup thisGroup = groups.get(i);
116   -
117   - /*SyntacticGroup nearPrepNG = null;
118   - SyntacticGroup nextNG = null;*/
119   -
120   - SyntacticGroup nextGroup = thisGroup.getFollowingGroup();
121   -
122   - /*if (thisGroup.getType().startsWith("NG")) {
123   - nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(),
124   - sentence);
125   - nextNG = thisGroup.getNextNG();
126   - }*/
127   -
128   - /*if (nextNG != null) {
129   - int prepStart = thisGroup.getSentencePositionEnd() + 1;
130   - int prepEnd = nextNG.getSentencePositionStart() - 1;
131   - String prep = sentence.getTextInsideSpan(prepStart, prepEnd);
132   - if (complexPreps.contains(prep)) {
133   - String cos = "";
134   - }
135   - }*/
  106 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
  107 +
  108 + for (SyntacticGroup group : sentence.getGroups()) {
  109 + SyntacticGroup nextGroup = group.getFollowingGroup();
  110 + SyntacticGroup nextnextGroup = null;
  111 + SyntacticGroup nextnextnextGroup = null;
  112 + if (nextGroup != null) {
  113 + nextnextGroup = nextGroup.getFollowingGroup();
  114 + if (nextnextGroup != null) {
  115 + nextnextnextGroup = nextnextGroup.getFollowingGroup();
  116 + }
  117 + }
136 118  
137   - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&
138   - //!isPartOfPrepNG(thisGroup, sentence) &&
139   - //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
140   - precedingWordIsVerb(thisGroup, sentence) &&
141   - //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
142   - !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
143   - !sameSemanticHeads(thisGroup, nearPrepNG)) {
144   - List<Token> heads = thisGroup.getSemanticHeadTokens();
145   - List<Token> segments = thisGroup.getTokens();
146   - segments.addAll(nearPrepNG.getTokens());
147   -
148   - sentence.addMention(new Mention(segments, heads));
149   - }*/
150   - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&
151   - // !precedingWordIsVerb(thisGroup, sentence) &&
152   - !isPartOfPrepNG(thisGroup, sentence) &&
153   - getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
154   - //!precedingWordIsVerb(thisGroup, sentence) &&
155   - !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
156   - //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
157   - !sameSemanticHeads(thisGroup, nearPrepNG)) {
158   - List<Token> heads = thisGroup.getSemanticHeadTokens();
159   - List<Token> segments = thisGroup.getTokens();
160   - segments.addAll(nearPrepNG.getTokens());
161   -
162   - sentence.addMention(new Mention(segments, heads));
163   - }*/
164   - if (thisGroup.getType().startsWith("NG") &&
165   - nextGroup != null && nextGroup.getType().startsWith("PrepNG") &&
166   - NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) {
167   - List<Token> heads = thisGroup.getSemanticHeadTokens();
  119 + if (group.getType().startsWith("NG") && nextGroup != null &&
  120 + nextnextGroup != null && nextnextnextGroup != null &&
  121 + quatroCompatibility(group, nextGroup, nextnextGroup,
  122 + nextnextnextGroup, valence.get(ValenceDicts.NounsValence))) {
  123 + List<Token> heads = group.getSemanticHeadTokens();
168 124 List<Token> segments = new ArrayList<Token>();
169   - segments.addAll(thisGroup.getTokens());
  125 + segments.addAll(group.getTokens());
170 126 segments.addAll(nextGroup.getTokens());
  127 + segments.addAll(nextnextGroup.getTokens());
  128 + segments.addAll(nextnextnextGroup.getTokens());
171 129  
172 130 sentence.addMention(new Mention(segments, heads));
173   - } else if (thisGroup.getType().startsWith("NG") && nextGroup != null &&
174   - nextGroup.getType().startsWith("NG") &&
175   - NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))
176   - ) {
177   - List<Token> heads = thisGroup.getSemanticHeadTokens();
  131 + } else if (group.getType().startsWith("NG") && nextGroup != null &&
  132 + nextnextGroup != null && tripleCompatibility(group, nextGroup, nextnextGroup, valence.get(ValenceDicts.NounsValence))) {
  133 + List<Token> heads = group.getSemanticHeadTokens();
178 134 List<Token> segments = new ArrayList<Token>();
179   - segments.addAll(thisGroup.getTokens());
  135 + segments.addAll(group.getTokens());
180 136 segments.addAll(nextGroup.getTokens());
  137 + segments.addAll(nextnextGroup.getTokens());
181 138  
182 139 sentence.addMention(new Mention(segments, heads));
183   - } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null &&
184   - NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) {
185   - List<Token> heads = thisGroup.getSemanticHeadTokens();
186   -
  140 + } else if (group.getType().startsWith("NG") && nextGroup != null &&
  141 + groupsValenceCompatibility(group, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))
  142 + ) {
  143 + List<Token> heads = group.getSemanticHeadTokens();
187 144 List<Token> segments = new ArrayList<Token>();
188   - segments.addAll(thisGroup.getTokens());
189   -
190   - int prepStart = thisGroup.getSentencePositionEnd() + 1;
191   - int prepEnd = nextNG.getSentencePositionStart() - 1;
192   - ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd);
193   - segments.addAll(prepSegments);
194   -
195   - segments.addAll(nextNG.getTokens());
  145 + segments.addAll(group.getTokens());
  146 + segments.addAll(nextGroup.getTokens());
196 147  
197 148 sentence.addMention(new Mention(segments, heads));
198   - }*/
199   - //else if // NG + im./pt. NG
200   - // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka
201   - // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName)
202   - else if (thisGroup.getType().startsWith("NG")) {
203   - List<Token> segments = thisGroup.getTokens();
204   - List<Token> heads = thisGroup.getSemanticHeadTokens();
205   -
206   - sentence.addMention(new Mention(segments, heads));
207   - }
208   - }
209   -
210   - // oryginalna wersja
211   - /*for (SyntacticGroup group : sentence.getGroups()) {
212   - if (group.getType().startsWith("NG")) {
  149 + } else if (group.getType().startsWith("NG")) {
213 150 List<Token> segments = group.getTokens();
214 151 List<Token> heads = group.getSemanticHeadTokens();
215 152  
216 153 sentence.addMention(new Mention(segments, heads));
217 154 }
218   - }*/
219   - }
220   -
221   - private static boolean followingWordIsInf(SyntacticGroup group,
222   - Sentence sentence) {
223   - int followingTokenPosition = group.getSentencePositionEnd() + 1;
224   - for (SyntacticWord word : sentence.getSyntacticWords()) {
225   - int firstWordPosition = word.getSentencePositionStart();
226   - if (followingTokenPosition == firstWordPosition &&
227   - (word.getCtag().equals("Inf"))) {
228   - return true;
229   - }
230   - }
231   -
232   - return false;
233   - }
234   -
235   - private static SyntacticGroup getFollowingPrepNGs(int sentencePosition,
236   - Sentence sentence) {
237   - SyntacticGroup largestGroup = null;
238   - int nextTokenPosition = sentencePosition + 1;
239   - for (SyntacticGroup group : sentence.getGroups()) {
240   - if (group.getType().startsWith("PrepNG") &&
241   - group.getSentencePositionStart() == nextTokenPosition) {
242   - if (largestGroup == null ||
243   - largestGroup.getTokens().size() < group.getTokens().size()) {
244   - largestGroup = group;
245   - }
246   - }
247   - }
248   - return largestGroup;
249   - }
250   -
251   - private static boolean isPartOfPrepNG(SyntacticGroup NGGroup,
252   - Sentence sentence) {
253   - int NGGroupStart = NGGroup.getSentencePositionStart();
254   - int NGGroupEnd = NGGroup.getSentencePositionEnd();
255   - for (SyntacticGroup group : sentence.getGroups()) {
256   - if (group.getType().startsWith("PrepNG") &&
257   - group.getSentencePositionStart() <= NGGroupStart &&
258   - group.getSentencePositionEnd() >= NGGroupEnd) {
259   - return true;
260   - }
261   - }
262   - return false;
263   - }
264   -
265   - private static boolean precedingWordIsVerb(SyntacticGroup group,
266   - Sentence sentence) {
267   - int precedingTokenPosition = group.getSentencePositionStart() - 1;
268   - if(isPartOfPrepNG(group, sentence)) {
269   - SyntacticGroup parentGroup = getParentPrepNG(group, sentence);
270   - precedingTokenPosition = parentGroup.getSentencePositionStart() - 1;
271   - }
272   -
273   - for (SyntacticWord word : sentence.getSyntacticWords()) {
274   - int lastWordPosition = word.getSentencePositionEnd();
275   - if (precedingTokenPosition == lastWordPosition &&
276   - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
277   - return true;
278   - }
279   - }
280   - return false;
281   - }
282   -
283   - // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem?
284   - // czy prep moze sie skladac z wiecej niz jednego segmentu?
285   - // dopasowywac refla i recip do sie spejdowego
286   - private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup,
287   - SyntacticGroup PrepNGGroup, Sentence sentence,
288   - Map<String,ArrayList<String>> walentyMapping) {
289   - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;
290   - for (SyntacticWord word : sentence.getSyntacticWords()) {
291   - int lastWordPosition = word.getSentencePositionEnd();
292   - if (precedingTokenPosition == lastWordPosition &&
293   - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
294   - String verb = word.getBase();
295   - if (!walentyMapping.containsKey(verb)) {
296   - return true;
297   - } else {
298   - SyntacticWord prepWord = PrepNGGroup.getFirstWord();
299   -
300   - if (prepWord.getTokens().size() == 1) {
301   - Token prep = prepWord.getTokens().get(0);
302   - String prepBase = prep.getBase();
303   - // sprawdzic czy glowa moze miec wiele tokenow
304   - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
305   - ArrayList<String> prepnps = getPrepnps(prepBase, prepCase);
306   -
307   - ArrayList<String> schemata = walentyMapping.get(verb);
308   - for (String schema : schemata) {
309   - for (String prepnp : prepnps) {
310   - if (schema.contains(prepnp)) {
311   - return true;
312   - }
313   - }
314   - }
315   - } else if (prepWord.getTokens().size() > 1) {
316   - String prepOrth = prepWord.getOrth().toLowerCase();
317   - String comprepnp = String.format("comprepnp(%s)", prepOrth);
318   - ArrayList<String> schemata = walentyMapping.get(verb);
319   - for (String schema : schemata) {
320   - if (schema.contains(comprepnp)) {
321   - return true;
322   - }
323   - }
324   -
325   - }
326   -
327   -
328   - }
329   - }
330 155 }
331   - return false;
332   - }
333   -
334   - private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup,
335   - SyntacticGroup PrepNGGroup, Sentence sentence,
336   - Map<String,ArrayList<String>> walentyMapping) {
337   - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;
338   - if(isPartOfPrepNG(NGGroup, sentence)) {
339   - SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence);
340   - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1;
341   - }
342   - for (SyntacticWord word : sentence.getSyntacticWords()) {
343   - int lastWordPosition = word.getSentencePositionEnd();
344   - if (precedingTokenPosition == lastWordPosition &&
345   - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
346   - if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) {
347   - return true;
348   - }
349   - if (!walentyMapping.containsKey(word.getBase())) {
350   - return true;
351   - }
352   -
353   - }
354   - }
355   - return false;
356   - }
357   -
358   - private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup,
359   - SyntacticGroup PrepNGGroup, Sentence sentence,
360   - Map<String,ArrayList<String>> walentyMapping) {
361   - String verbBase = verb.getBase();
362   - if (!walentyMapping.containsKey(verbBase)) {
363   - return true;
364   - } else {
365   - ArrayList<String> schemata = walentyMapping.get(verbBase);
366   -
367   - // PrepNG + PrepNG
368   - if (isPartOfPrepNG(NGGroup, sentence)) {
369   - SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence);
370   - ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations();
371   - ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations();
372   - for (String schema : schemata) {
373   - if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) {
374   - return true;
375   - }
376   - }
377   - }
378   -
379   - // NG + PrepNG
380   - else {
381   - ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations();
382   - ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations();
383   - for (String schema : schemata) {
384   - if (isProperSchema(schema, NGRealizations, prepNGRealizations)) {
385   - return true;
386   - }
387   - }
388   - }
389   - }
390   - return false;
391 156 }
392 157  
393 158 private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
394 159 ArrayList<String> group2Types) {
395 160 for (String group1Type : group1Types) {
396   - if (schema.contains(group1Type)) {
  161 + if (schemaContains(schema, group1Type)) {
397 162 for (String group2Type : group2Types) {
398   - if (schema.contains(group2Type)) {
  163 + if (schemaContains(schema, group2Type)) {
399 164 return true;
400 165 }
401 166 }
... ... @@ -404,103 +169,71 @@ public class Detector {
404 169 return false;
405 170 }
406 171  
407   - private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup,
408   - Sentence sentence) {
409   - SyntacticGroup parentPrepNG = null;
410   - int NGGroupStart = NGGroup.getSentencePositionStart();
411   - int NGGroupEnd = NGGroup.getSentencePositionEnd();
412   - for (SyntacticGroup group : sentence.getGroups()) {
413   - if (group.getType().startsWith("PrepNG") &&
414   - group.getSentencePositionStart() <= NGGroupStart &&
415   - group.getSentencePositionEnd() >= NGGroupEnd) {
416   - if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {
417   - parentPrepNG = group;
418   - }
419   - }
420   - }
421   - return parentPrepNG;
422   - }
423   -
424   - private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup,
425   - SyntacticGroup PrepNGGroup, Sentence sentence,
  172 + private static boolean groupsValenceCompatibility(SyntacticGroup NG1,
  173 + SyntacticGroup NG2, Sentence sentence,
426 174 Map<String,ArrayList<String>> walentyMapping) {
427   - Token NGHead = NGGroup.getSemanticHeadTokens().get(0);
  175 + Token NG1Head = NG1.getSemanticHeadTokens().get(0);
428 176  
429   - String NGHeadBase = NGHead.getBase();
  177 + String NGHeadBase = NG1Head.getBase();
430 178  
431 179 if (!walentyMapping.containsKey(NGHeadBase)) {
432 180 return false;
433 181 } else {
434   - SyntacticWord prepWord = PrepNGGroup.getFirstWord();
  182 + ArrayList<String> NG2realizations = NG2.getWalentyRealizations();
435 183  
436   - if (prepWord.getTokens().size() == 1) {
437   - Token prep = prepWord.getTokens().get(0);
438   - String prepBase = prep.getBase();
439   - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
440   - String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase);
441   - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
442   - for (String schema : schemata) {
443   - if (schemaContains(schema, prepnp)) {
444   - return true;
445   - }
446   - }
447   - } else if (prepWord.getTokens().size() > 1) {
448   - String prepOrth = prepWord.getOrth().toLowerCase();
449   - String comprepnp = String.format("comprepnp(%s)", prepOrth);
450   - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  184 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  185 + for (String real : NG2realizations) {
451 186 for (String schema : schemata) {
452   - if (schemaContains(schema, comprepnp)) {
  187 + if (schemaContains(schema, real)) {
453 188 return true;
454 189 }
455 190 }
456   -
457 191 }
458   -
459 192 }
460 193 return false;
461 194 }
462 195  
463   - private static boolean NGNGValenceCompatibility(SyntacticGroup NG1,
464   - SyntacticGroup NG2, Sentence sentence,
  196 + private static boolean tripleCompatibility(SyntacticGroup group1,
  197 + SyntacticGroup group2, SyntacticGroup group3,
465 198 Map<String,ArrayList<String>> walentyMapping) {
466   - Token NG1Head = NG1.getSemanticHeadTokens().get(0);
  199 + Token group1Head = group1.getSemanticHeadTokens().get(0);
467 200  
468   - String NGHeadBase = NG1Head.getBase();
  201 + String group1HeadBase = group1Head.getBase();
469 202  
470   - if (!walentyMapping.containsKey(NGHeadBase)) {
  203 + if (!walentyMapping.containsKey(group1HeadBase)) {
471 204 return false;
472 205 } else {
473   - ArrayList<String> NG2realizations = NG2.getWalentyRealizations();
  206 + ArrayList<String> group2realizations = group2.getWalentyRealizations();
  207 + ArrayList<String> group3realizations = group3.getWalentyRealizations();
474 208  
475   - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
476   - for (String real : NG2realizations) {
477   - for (String schema : schemata) {
478   - if (schemaContains(schema, real)) {
479   - return true;
480   - }
  209 + ArrayList<String> schemata = walentyMapping.get(group1HeadBase);
  210 + for (String schema : schemata) {
  211 + if (isProperSchema(schema, group2realizations, group3realizations)) {
  212 + return true;
481 213 }
482 214 }
483 215 }
484 216 return false;
485 217 }
486 218  
487   - private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1,
488   - SyntacticGroup NGGroup2, Sentence sentence,
  219 + private static boolean quatroCompatibility(SyntacticGroup group1,
  220 + SyntacticGroup group2, SyntacticGroup group3, SyntacticGroup group4,
489 221 Map<String,ArrayList<String>> walentyMapping) {
490   -
491   - Token NGHead = NGGroup1.getSemanticHeadTokens().get(0);
492   - String NGHeadBase = NGHead.getBase();
  222 + Token group1Head = group1.getSemanticHeadTokens().get(0);
  223 +
  224 + String group1HeadBase = group1Head.getBase();
493 225  
494   - if (!walentyMapping.containsKey(NGHeadBase)) {
  226 + if (!walentyMapping.containsKey(group1HeadBase)) {
495 227 return false;
496 228 } else {
497   - int prepStart = NGGroup1.getSentencePositionEnd() + 1;
498   - int prepEnd = NGGroup2.getSentencePositionStart() - 1;
499   - String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd);
500   - String comprepnp = String.format("comprepnp(%s)", complexPrep);
501   - ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  229 + ArrayList<String> group2realizations = group2.getWalentyRealizations();
  230 + ArrayList<String> group3realizations = group3.getWalentyRealizations();
  231 + ArrayList<String> group4realizations = group4.getWalentyRealizations();
  232 +
  233 + ArrayList<String> schemata = walentyMapping.get(group1HeadBase);
502 234 for (String schema : schemata) {
503   - if (schemaContains(schema, comprepnp)) {
  235 + if (isTripleProperSchema(schema, group2realizations, group3realizations,
  236 + group4realizations)) {
504 237 return true;
505 238 }
506 239 }
... ... @@ -508,67 +241,119 @@ public class Detector {
508 241 return false;
509 242 }
510 243  
511   - private static boolean schemaContains(String schema, String phraseType) {
512   - for (String position : schema.split("\\s\\+\\s")) {
513   - position = position.trim();
514   - position = position.substring(1, position.length()-1);
515   - for (String phrT : position.split(";")) {
516   - if (phrT.equals(phraseType)) {
517   - return true;
  244 + private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types,
  245 + ArrayList<String> group2Types, ArrayList<String> group3Types) {
  246 + for (String group1Type : group1Types) {
  247 + if (schemaContains(schema, group1Type)) {
  248 + for (String group2Type : group2Types) {
  249 + if (schemaContains(schema, group2Type)) {
  250 + for (String group3Type : group3Types) {
  251 + if (schemaContains(schema, group3Type)) {
  252 + return true;
  253 + }
  254 + }
  255 + }
518 256 }
519 257 }
520 258 }
521 259 return false;
522 260 }
523 261  
524   - private static boolean schemaContainsType(String schema, String type) {
525   - // to lepiej dziala dla rzeczownikow
526   - for (String position : schema.split("\\s\\+\\s")) {
527   - position = position.trim();
528   - position = position.substring(1, position.length()-1);
529   - for (String phrT : position.split(";")) {
530   -
531   - if (phrT.startsWith(type+"(")) {
532   - return true;
  262 + /*private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types,
  263 + ArrayList<String> group2Types, ArrayList<String> group3Types) {
  264 +
  265 + ArrayList<String> group1MPositions = getMatchingPositions(schema, group1Types);
  266 + ArrayList<String> group2MPositions = getMatchingPositions(schema, group2Types);
  267 + ArrayList<String> group3MPositions = getMatchingPositions(schema, group3Types);
  268 +
  269 +
  270 +
  271 + ArrayList<String> group1MPositionsCopy = new ArrayList<String>();
  272 + ArrayList<String> group2MPositionsCopy = getMatchingPositions(schema, group2Types);
  273 + ArrayList<String> group3MPositionsCopy = getMatchingPositions(schema, group3Types);
  274 +
  275 +
  276 + if (group1MPositions.isEmpty() || group2MPositions.isEmpty() || group3MPositions.isEmpty()) {
  277 + return false;
  278 + }
  279 +
  280 + boolean group1ok = false;
  281 + boolean group2ok = false;
  282 + boolean group3ok = false;
  283 +
  284 + for (String pos : group1MPositions) {
  285 +
  286 + }
  287 +
  288 + ArrayList<String>
  289 +
  290 + if (union(group1MPositions, group2MPositions).size() > group1MPositions.size() &&
  291 + )
  292 +
  293 +
  294 + for (String group1Type : group1Types) {
  295 + if (schemaContains(schema, group1Type)) {
  296 + for (String group2Type : group2Types) {
  297 + if (schemaContains(schema, group2Type)) {
  298 + for (String group3Type : group3Types) {
  299 + if (schemaContains(schema, group3Type)) {
  300 + return true;
  301 + }
  302 + }
  303 + }
533 304 }
534 305 }
535 306 }
536 307 return false;
  308 + }*/
  309 +
  310 + public static List<String> union(List<String> list1, List<String> list2) {
  311 + HashSet<String> set = new HashSet<String>();
  312 +
  313 + set.addAll(list1);
  314 + set.addAll(list2);
  315 +
  316 + return new ArrayList<String>(set);
537 317 }
538 318  
  319 + public static List<String> tripleUnion(List<String> list1, List<String> list2,
  320 + List<String> list3) {
  321 + HashSet<String> set = new HashSet<String>();
  322 +
  323 + set.addAll(list1);
  324 + set.addAll(list2);
  325 + set.addAll(list3);
  326 +
  327 + return new ArrayList<String>(set);
  328 + }
539 329  
540   - // compar ??
541   - private static ArrayList<String> getPrepnps(String prepBase, String prepCase) {
542   - ArrayList<String> prepnps = new ArrayList<String>();
543   - prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
544   - if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) {
545   - prepnps.add(String.format("prepnp(%s,str)", prepBase));
546   - }
547   - if (prepCase.equals("gen") || prepCase.equals("acc")) {
548   - prepnps.add(String.format("prepnp(%s,part)", prepBase));
  330 + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) {
  331 + ArrayList<String> positions = new ArrayList<String>();
  332 + for (String position : schema.split("\\s\\+\\s")) {
  333 + position = position.trim();
  334 + position = position.substring(1, position.length()-1);
  335 + for (String phrT : position.split(";")) {
  336 + if (phraseRealizations.contains(phrT.trim())) {
  337 + positions.add(position);
  338 + break;
  339 + }
  340 + }
549 341 }
550   - return prepnps;
  342 + return positions;
551 343 }
552 344  
553   - // eliminuje "od wsi do wsi"
554   - private static boolean sameSemanticHeads(SyntacticGroup group1,
555   - SyntacticGroup group2) {
556   -
557   - List<Token> group1HeadTokens = group1.getSemanticHeadTokens();
558   - List<Token> group2HeadTokens = group2.getSemanticHeadTokens();
559   - if (group1HeadTokens.size() != group2HeadTokens.size()) {
560   - return false;
561   - }
562   -
563   - for (int i=0; i < group1HeadTokens.size(); i++) {
564   - if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) {
565   - return false;
  345 + private static boolean schemaContains(String schema, String phraseType) {
  346 + for (String position : schema.split("\\s\\+\\s")) {
  347 + position = position.trim();
  348 + position = position.substring(1, position.length()-1);
  349 + for (String phrT : position.split(";")) {
  350 + if (phrT.equals(phraseType)) {
  351 + return true;
  352 + }
566 353 }
567 354 }
568   -
569   - return true;
  355 + return false;
570 356 }
571   -
572 357  
573 358 /**
574 359 * Wyszukuję i oznaczam wszystkie NER
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
3 3 import java.util.ArrayList;
4   -import java.util.Arrays;
5 4 import java.util.List;
6 5  
  6 +import pl.waw.ipipan.zil.core.md.detection.Constants;
  7 +
7 8 /**
8 9 * @author Mateusz Kopec
  10 + * Modified 2017 by Bartlomiej Niton
9 11 *
10 12 */
11 13 public class Mention implements Comparable<Mention> {
... ... @@ -205,77 +207,22 @@ public class Mention implements Comparable<Mention> {
205 207 return isZeroSubject;
206 208 }
207 209  
208   - public int getSentencePositionStart() {
  210 + public int getSentenceStartPosition() {
209 211 Token startToken = this.getFirstSegment();
210 212 return startToken.getSentencePosition();
211 213 }
212 214  
213   - public int getSentencePositionEnd() {
  215 + public int getSentenceEndPosition() {
214 216 Token endToken = this.getLastSegment();
215 217 return endToken.getSentencePosition();
216 218 }
217   -
218   - public boolean isPartOfQub() {
219   - if (this.segments.size() == 1) {
220   - Sentence sentence = this.segments.get(0).getSentence();
221   - for (SyntacticWord word : sentence.getSyntacticWords()) {
222   - if (word.getTokens().contains(this.segments.get(0)) &&
223   - word.getCtag().equals("Qub")) {
224   - return true;
225   - }
226   - }
227   - }
228   - return false;
229   - }
230   -
231   - public boolean isPartOfPrep() {
232   - if (this.segments.size() == 1) {
233   - Sentence sentence = this.segments.get(0).getSentence();
234   - for (SyntacticWord word : sentence.getSyntacticWords()) {
235   - if (word.getTokens().contains(this.segments.get(0)) &&
236   - word.getCtag().equals("Prep")) {
237   - return true;
238   - }
239   - }
240   - }
241   - return false;
242   - }
243   -
244   - private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj",
245   - "Adj", "Conj", "Comp");
246 219  
247 220 public boolean isPartOfFrazeo() {
248 221 if (this.segments.size() == 1) {
249 222 Sentence sentence = this.segments.get(0).getSentence();
250 223 for (SyntacticWord word : sentence.getSyntacticWords()) {
251 224 if (word.getTokens().contains(this.segments.get(0)) &&
252   - FRAZEOS.contains(word.getCtag())) {
253   - return true;
254   - }
255   - }
256   - }
257   - return false;
258   - }
259   -
260   - public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) {
261   - if (this.segments.size() == 1) {
262   - Sentence sentence = this.segments.get(0).getSentence();
263   - if (this.getSentencePositionStart() - 1 >= 0) {
264   - String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth();
265   - String noun = sentence.get(this.getSentencePositionStart()).getOrth();
266   - String possiblePrep = String.format("%s %s", prep, noun);
267   - if (complexPreps.contains(possiblePrep)) {
268   - return true;
269   - }
270   - }
271   -
272   - if (this.getSentencePositionStart() - 1 >= 0 &&
273   - this.getSentencePositionStart() + 1 < sentence.size()) {
274   - String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth();
275   - String noun = sentence.get(this.getSentencePositionStart()).getOrth();
276   - String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth();
277   - String possiblePrep = String.format("%s %s %s", prep1, noun, prep2);
278   - if (complexPreps.contains(possiblePrep)) {
  225 + Constants.FRAZEO_CTAGS.contains(word.getCtag())) {
279 226 return true;
280 227 }
281 228 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
... ... @@ -110,35 +110,6 @@ public class Sentence extends ArrayList<Token> {
110 110 namedEntities.add(namedEntity);
111 111 }
112 112  
113   - public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) {
114   - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();
115   - for (SyntacticGroup group : this.syntacticGroups) {
116   - if (group.getSentencePositionStart() >= start &&
117   - group.getSentencePositionEnd() <= end) {
118   - if (!(group.getSentencePositionStart() == start &&
119   - group.getSentencePositionEnd() == end)) {
120   - groupsAtSpan.add(group);
121   - }
122   - }
123   - }
124   - return groupsAtSpan;
125   - }
126   -
127   - public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) {
128   - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();
129   - for (SyntacticGroup group : this.syntacticGroups) {
130   -
131   - if (group.getSentencePositionStart() >= start &&
132   - group.getSentencePositionEnd() <= end) {
133   - if (!(group.getSentencePositionStart() == start &&
134   - group.getSentencePositionEnd() == end)) {
135   - groupsAtSpan.add(group);
136   - }
137   - }
138   - }
139   - return groupsAtSpan;
140   - }
141   -
142 113 public SyntacticGroup getFirstGroup(int start, int end) {
143 114 SyntacticGroup largestGroup = null;
144 115 int step = start;
... ... @@ -152,8 +123,8 @@ public class Sentence extends ArrayList<Token> {
152 123 private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) {
153 124 SyntacticGroup largestGroup = null;
154 125 for (SyntacticGroup group : this.getGroups()) {
155   - int groupStart = group.getSentencePositionStart();
156   - int groupEnd = group.getSentencePositionEnd();
  126 + int groupStart = group.getSentenceStartPosition();
  127 + int groupEnd = group.getSentenceEndPosition();
157 128 if (groupStart == start && groupEnd <= end &&
158 129 !(groupStart == start && groupEnd == end) &&
159 130 (largestGroup == null ||
... ... @@ -177,8 +148,8 @@ public class Sentence extends ArrayList<Token> {
177 148 private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) {
178 149 SyntacticGroup largestGroup = null;
179 150 for (SyntacticGroup group : this.getGroups()) {
180   - int groupStart = group.getSentencePositionStart();
181   - int groupEnd = group.getSentencePositionEnd();
  151 + int groupStart = group.getSentenceStartPosition();
  152 + int groupEnd = group.getSentenceEndPosition();
182 153 if (groupEnd == end && groupStart >= start &&
183 154 !(groupStart == start && groupEnd == end) &&
184 155 (largestGroup == null ||
... ... @@ -189,38 +160,4 @@ public class Sentence extends ArrayList<Token> {
189 160 return largestGroup;
190 161 }
191 162  
192   - public ArrayList<Mention> getMentionsInsideSpan(int start, int end) {
193   - ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>();
194   - for (Mention mention : this.mentions) {
195   - if (mention.getSentencePositionStart() >= start &&
196   - mention.getSentencePositionEnd() <= end) {
197   - mentionsAtSpan.add(mention);
198   - }
199   - }
200   - return mentionsAtSpan;
201   - }
202   -
203   - public String getTextInsideSpan(int start, int end) {
204   - String text = "";
205   - int step = start;
206   - while (step <= end) {
207   - if (step != start) {
208   - text += " ";
209   - }
210   - text += this.get(step).getOrth();
211   - step++;
212   - }
213   - return text;
214   - }
215   -
216   - public ArrayList<Token> getSegmentsInsideSpan(int start, int end) {
217   - ArrayList<Token> tokensAtSpan = new ArrayList<Token>();
218   - int step = start;
219   - while (step <= end) {
220   - tokensAtSpan.add(this.get(step));
221   - step++;
222   - }
223   - return tokensAtSpan;
224   - }
225   -
226 163 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
... ... @@ -55,34 +55,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
55 55 return getType().compareTo(o.getType());
56 56 }
57 57  
58   - public int getSentencePositionStart() {
  58 + public int getSentenceStartPosition() {
59 59 Token startToken = tokens.get(0);
60 60 return startToken.getSentencePosition();
61 61 }
62 62  
63   - public int getSentencePositionEnd() {
  63 + public int getSentenceEndPosition() {
64 64 Token endToken = tokens.get(tokens.size()-1);
65 65 return endToken.getSentencePosition();
66 66 }
67 67  
68   -
69   - public SyntacticWord getFirstWord() {
70   - SyntacticWord firstWord = null;
71   - Token startToken = tokens.get(0);
72   - Sentence sentence = startToken.getSentence();
73   - for (SyntacticWord word : sentence.getSyntacticWords()) {
74   - if(startToken.compareTo(word.getTokens().get(0)) == 0 &&
75   - (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) {
76   - firstWord = word;
77   - }
78   - }
79   - return firstWord;
80   - }
81   -
82   - // NG and PrepNG only now
83 68 public ArrayList<String> getWalentyRealizations() {
84 69 ArrayList<String> realizations = new ArrayList<String>();
85   - if (this.type.startsWith("PrepNG")) {
  70 + if (this.type.equals("PrepNG")) {
86 71 SyntacticWord prepWord = this.getFirstWord();
87 72 if (prepWord.getTokens().size() == 1) {
88 73  
... ... @@ -105,7 +90,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
105 90 return realizations;
106 91 }
107 92  
108   - // compar ??
  93 + public SyntacticWord getFirstWord() {
  94 + SyntacticWord firstWord = null;
  95 + Token startToken = tokens.get(0);
  96 + Sentence sentence = startToken.getSentence();
  97 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  98 + if(startToken.compareTo(word.getTokens().get(0)) == 0 &&
  99 + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) {
  100 + firstWord = word;
  101 + }
  102 + }
  103 + return firstWord;
  104 + }
  105 +
109 106 private ArrayList<String> getPrepnps(String prepBase, String prepCase) {
110 107 ArrayList<String> prepnps = new ArrayList<String>();
111 108 prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
... ... @@ -130,44 +127,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
130 127 return nps;
131 128 }
132 129  
133   - public boolean precedingWordIsVerb() {
134   - Sentence sentence = this.tokens.get(0).getSentence();
135   - int precedingTokenPosition = this.getSentencePositionStart() - 1;
136   - for (SyntacticWord word : sentence.getSyntacticWords()) {
137   - int lastWordPosition = word.getSentencePositionEnd();
138   - if (precedingTokenPosition == lastWordPosition &&
139   - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
140   - return true;
141   - }
142   - }
143   - return false;
144   - }
145   -
146   - public SyntacticGroup getNextNG() {
147   - Sentence sentence = this.tokens.get(0).getSentence();
148   - int thisGroupEnd = this.getSentencePositionEnd();
149   - int sentenceLength = sentence.size();
150   -
151   - SyntacticGroup nextNG = null;
152   - for (int step = thisGroupEnd; step < sentenceLength; step++) {
153   - nextNG = sentence.getFirstGroup(step, sentenceLength);
154   - if (nextNG != null && nextNG.type.startsWith("NG") &&
155   - this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) {
156   - break;
157   - } else {
158   - nextNG = null;
159   - }
160   - }
161   - return nextNG;
162   - }
163   -
164 130 public SyntacticGroup getFollowingGroup() {
165 131 SyntacticGroup largestGroup = null;
166 132 Sentence sentence = this.tokens.get(0).getSentence();
167   - int nextTokenPosition = this.getSentencePositionEnd() + 1;
  133 + int nextTokenPosition = this.getSentenceEndPosition() + 1;
168 134 for (SyntacticGroup group : sentence.getGroups()) {
169   - if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) &&
170   - group.getSentencePositionStart() == nextTokenPosition) {
  135 + if ((group.getType().equals("PrepNG") || group.getType().startsWith("NG")) &&
  136 + group.getSentenceStartPosition() == nextTokenPosition) {
171 137 if (largestGroup == null ||
172 138 largestGroup.getTokens().size() < group.getTokens().size()) {
173 139 largestGroup = group;
... ... @@ -178,14 +144,14 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
178 144 }
179 145  
180 146 public SyntacticWord getPrecedingVerb() {
181   - int precedingTokenPosition = this.getSentencePositionStart() - 1;
  147 + int precedingTokenPosition = this.getSentenceStartPosition() - 1;
182 148 Sentence sentence = this.tokens.get(0).getSentence();
183 149 if(this.isPartOfPrepNG()) {
184 150 SyntacticGroup parentNGGroup = this.getParentPrepNG();
185   - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1;
  151 + precedingTokenPosition = parentNGGroup.getSentenceStartPosition() - 1;
186 152 }
187 153 for (SyntacticWord word : sentence.getSyntacticWords()) {
188   - int lastWordPosition = word.getSentencePositionEnd();
  154 + int lastWordPosition = word.getSentenceEndPosition();
189 155 if (precedingTokenPosition == lastWordPosition &&
190 156 (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
191 157 return word;
... ... @@ -195,13 +161,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
195 161 }
196 162  
197 163 private boolean isPartOfPrepNG() {
198   - int NGGroupStart = this.getSentencePositionStart();
199   - int NGGroupEnd = this.getSentencePositionEnd();
  164 + int NGGroupStart = this.getSentenceStartPosition();
  165 + int NGGroupEnd = this.getSentenceEndPosition();
200 166 Sentence sentence = this.tokens.get(0).getSentence();
201 167 for (SyntacticGroup group : sentence.getGroups()) {
202   - if (group.getType().startsWith("PrepNG") &&
203   - group.getSentencePositionStart() <= NGGroupStart &&
204   - group.getSentencePositionEnd() >= NGGroupEnd) {
  168 + if (group.getType().equals("PrepNG") &&
  169 + group.getSentenceStartPosition() <= NGGroupStart &&
  170 + group.getSentenceEndPosition() >= NGGroupEnd) {
205 171 return true;
206 172 }
207 173 }
... ... @@ -210,13 +176,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> {
210 176  
211 177 private SyntacticGroup getParentPrepNG() {
212 178 SyntacticGroup parentPrepNG = null;
213   - int NGGroupStart = this.getSentencePositionStart();
214   - int NGGroupEnd = this.getSentencePositionEnd();
  179 + int NGGroupStart = this.getSentenceStartPosition();
  180 + int NGGroupEnd = this.getSentenceEndPosition();
215 181 Sentence sentence = this.tokens.get(0).getSentence();
216 182 for (SyntacticGroup group : sentence.getGroups()) {
217   - if (group.getType().startsWith("PrepNG") &&
218   - group.getSentencePositionStart() <= NGGroupStart &&
219   - group.getSentencePositionEnd() >= NGGroupEnd) {
  183 + if (group.getType().equals("PrepNG") &&
  184 + group.getSentenceStartPosition() <= NGGroupStart &&
  185 + group.getSentenceEndPosition() >= NGGroupEnd) {
220 186 if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {
221 187 parentPrepNG = group;
222 188 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
... ... @@ -4,6 +4,8 @@ import java.util.ArrayList;
4 4 import java.util.Iterator;
5 5 import java.util.List;
6 6  
  7 +import pl.waw.ipipan.zil.core.md.detection.Constants;
  8 +
7 9 public class SyntacticWord implements Comparable<SyntacticWord> {
8 10  
9 11 private String base;
... ... @@ -22,6 +24,14 @@ public class SyntacticWord implements Comparable<SyntacticWord> {
22 24 public String getCtag() {
23 25 return ctag;
24 26 }
  27 +
  28 + public String getBase() {
  29 + return base;
  30 + }
  31 +
  32 + public String getOrth() {
  33 + return orth;
  34 + }
25 35  
26 36 public List<Token> getTokens() {
27 37 return tokens;
... ... @@ -45,33 +55,18 @@ public class SyntacticWord implements Comparable<SyntacticWord> {
45 55 return getCtag().compareTo(o.getCtag());
46 56 }
47 57  
48   - public int getSentencePositionStart() {
  58 + public int getSentenceStartPosition() {
49 59 Token startToken = tokens.get(0);
50 60 return startToken.getSentencePosition();
51 61 }
52 62  
53   - public int getSentencePositionEnd() {
  63 + public int getSentenceEndPosition() {
54 64 Token endToken = tokens.get(tokens.size()-1);
55 65 return endToken.getSentencePosition();
56 66 }
57 67  
58   - public String getBase() {
59   - return this.base;
60   - }
61   -
62   - public String getOrth() {
63   - return this.orth;
64   - }
65   -
66 68 public boolean isVerb() {
67   - if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) {
68   - return true;
69   - }
70   - return false;
71   - }
72   -
73   - public boolean isInterp() {
74   - if (this.ctag.equals("Interp")) {
  69 + if (Constants.VERB_CTAGS.contains(this.ctag)) {
75 70 return true;
76 71 }
77 72 return false;
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... ... @@ -70,7 +70,6 @@ public class TeiLoader {
70 70 for (TEIMorph mo : m.getHeadMorphs())
71 71 headTokens.add(teiMorph2Segment.get(mo));
72 72 s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
73   - System.out.println(tokens.toString());
74 73 }
75 74  
76 75 private static void loadSyntacticGroup(Sentence s, TEIGroup g,
... ...