Commit 1dc4f9471ae6d1929a5d443457f85a77bd7f6ad4

Authored by Bartłomiej Nitoń
1 parent 3682bbf2

Added new mention detection rules based on Walenty dictionary.

src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... ... @@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md;
2 2  
3 3 import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
  5 +
5 6 import pl.waw.ipipan.zil.core.md.detection.Detector;
6 7 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
7 8 import pl.waw.ipipan.zil.core.md.entities.Text;
... ... @@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
15 16 import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
16 17 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
17 18  
  19 +import java.io.BufferedReader;
18 20 import java.io.File;
19 21 import java.io.FileInputStream;
20 22 import java.io.IOException;
21 23 import java.io.InputStream;
  24 +import java.io.InputStreamReader;
  25 +import java.util.ArrayList;
  26 +import java.util.EnumMap;
  27 +import java.util.HashMap;
  28 +import java.util.Map;
22 29  
23 30 public class Main {
24 31  
... ... @@ -26,12 +33,125 @@ public class Main {
26 33  
27 34 private static final boolean GZIP_OUTPUT = true;
28 35 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
  36 + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt";
  37 + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt";
  38 + private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt";
29 39  
30 40 private static ZeroSubjectDetector zeroSubjectModel;
  41 +
  42 + public static enum ValenceDicts {
  43 + VerbsValence,
  44 + NounsValence
  45 + }
  46 +
  47 + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
  48 + new EnumMap(ValenceDicts.class);
  49 +
  50 + private static final ArrayList<String> complexPreps;
31 51  
static {
  // Load every bundled resource once, when the class is initialized.
  zeroSubjectModel =
      new ZeroSubjectDetector(Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL));

  valence.put(
      ValenceDicts.VerbsValence,
      readWalenty(Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE)));
  valence.put(
      ValenceDicts.NounsValence,
      readWalenty(Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE)));

  complexPreps = readValues(Main.class.getResourceAsStream(COMPLEX_PREPS));
}
  65 +
  66 +
  67 + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream)
  68 + {
  69 + Map<String,ArrayList<String>> map;
  70 + try {
  71 + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream));
  72 + map = new HashMap<String,ArrayList<String>>();
  73 + String line;
  74 + boolean firstLine = true;
  75 + while((line = br.readLine()) != null) {
  76 + if (firstLine) {
  77 + line = line.replace("\uFEFF", ""); // remove BOM character
  78 + firstLine = false;
  79 + }
  80 +
  81 + if (!line.startsWith("%")) {
  82 + String[] lineParts = line.split(":");
  83 + String lemma = lineParts[0].trim();
  84 + String schema = lineParts[5].trim();
  85 +
  86 + if (schema.trim().isEmpty()) {
  87 + continue;
  88 + }
  89 +
  90 + String[] lemmaParts = lemma.split(" ");
  91 + if(lemmaParts.length == 1 && schemaContainsSie(schema)) {
  92 + lemma = lemma + " się";
  93 + }
  94 +
  95 + ArrayList<String> schemata;
  96 + if (!map.containsKey(lemma)) {
  97 + schemata = new ArrayList<String>();
  98 + schemata.add(schema);
  99 + map.put(lemma, schemata);
  100 + } else {
  101 + schemata = map.get(lemma);
  102 + schemata.add(schema);
  103 + map.put(lemma, schemata);
  104 + }
  105 + }
  106 + }
  107 + br.close();
  108 + } catch (IOException ex) {
  109 + ex.printStackTrace();
  110 + throw new RuntimeException(ex);
  111 + }
  112 + return map;
  113 + }
  114 +
  115 + private static boolean schemaContainsSie(String schema) {
  116 + for (String position : schema.split("\\s\\+\\s")) {
  117 + position = position.trim();
  118 + position = position.substring(1, position.length()-1);
  119 + for (String phrT : position.split(";")) {
  120 + if (phrT.equals("refl") || phrT.equals("recip")) {
  121 + return true;
  122 + }
  123 + }
  124 + }
  125 +
  126 + return false;
  127 + }
  128 +
  129 + public static ArrayList<String> readValues(InputStream stream) {
  130 + ArrayList<String> values;
  131 + try {
  132 + BufferedReader br=new BufferedReader(new InputStreamReader(stream));
  133 + values = new ArrayList<String>();
  134 + String line;
  135 + boolean firstLine = true;
  136 + while((line = br.readLine()) != null) {
  137 + if (firstLine) {
  138 + line = line.replace("\uFEFF", ""); // remove BOM character
  139 + firstLine = false;
  140 + }
  141 +
  142 + if (!line.startsWith("%")) {
  143 + String value = line.trim();
  144 + if (!value.isEmpty()) {
  145 + values.add(value);
  146 + }
  147 + }
  148 + }
  149 + br.close();
  150 + } catch (IOException ex) {
  151 + ex.printStackTrace();
  152 + throw new RuntimeException(ex);
  153 + }
  154 + return values;
35 155 }
36 156  
37 157 private Main() {
... ... @@ -71,6 +191,8 @@ public class Main {
71 191 return;
72 192 }
73 193 }
  194 +
  195 +
74 196  
75 197 int all = 0;
76 198 int errors = 0;
... ... @@ -122,7 +244,7 @@ public class Main {
122 244 */
123 245 public static void annotateThriftText(TText thriftText) throws MultiserviceException {
124 246 Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
125   - Detector.findMentionsInText(responseText, zeroSubjectModel);
  247 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
126 248 ThriftSaver.updateThriftText(responseText, thriftText);
127 249 }
128 250  
... ... @@ -135,7 +257,7 @@ public class Main {
135 257 */
136 258 public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
137 259 Text responseText = TeiLoader.loadTextFromTei(teiText);
138   - Detector.findMentionsInText(responseText, zeroSubjectModel);
  260 + Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
139 261 TeiSaver.updateTeiText(responseText, teiText);
140 262 }
141 263  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
  3 +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
3 4 import pl.waw.ipipan.zil.core.md.entities.Mention;
4 5 import pl.waw.ipipan.zil.core.md.entities.Sentence;
  6 +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
  7 +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
5 8 import pl.waw.ipipan.zil.core.md.entities.Token;
6 9  
  10 +import java.util.ArrayList;
7 11 import java.util.Collection;
8 12 import java.util.HashSet;
9 13 import java.util.List;
  14 +import java.util.Map;
10 15 import java.util.Set;
11 16  
12 17 public class Cleaner {
... ... @@ -125,4 +130,157 @@ public class Cleaner {
125 130 else
126 131 return m1;
127 132 }
  133 +
  134 + public static void cleanWalentyFramedMentions(Sentence sentence,
  135 + Map<String,ArrayList<String>> verbsValence) {
  136 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  137 + for (Mention mention : sentence.getMentions()) {
  138 + int mentionStart = mention.getFirstSegment().getSentencePosition();
  139 + int mentionEnd = mention.getLastSegment().getSentencePosition();
  140 + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);
  141 + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);
  142 +
  143 + if (startGroup != null && endGroup != null
  144 + && startGroup.compareTo(endGroup) != 0) {
  145 +
  146 + SyntacticWord verb = startGroup.getPrecedingVerb();
  147 + if (verb != null && !verb.getBase().equals("mieć")
  148 + && verbsValence.containsKey(verb.getBase())) {
  149 + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();
  150 + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();
  151 +
  152 + for (String schema : verbsValence.get(verb.getBase())) {
  153 + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {
  154 + mentionsToRemove.add(mention);
  155 + break;
  156 + }
  157 + }
  158 + }
  159 + }
  160 + }
  161 +
  162 + for (Mention mentionToRemove : mentionsToRemove) {
  163 + sentence.removeMention(mentionToRemove);
  164 + }
  165 + }
  166 +
  167 + /*private static void removeWalentyFramedMentions(Sentence sentence,
  168 + ArrayList<Mention> mentions,
  169 + ArrayList<String> schemata) {
  170 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  171 + for (Mention mention : mentions) {
  172 + int mentionStart = mention.getFirstSegment().getSentencePosition();
  173 + int mentionEnd = mention.getLastSegment().getSentencePosition();
  174 + SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);
  175 + SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);
  176 + if (startGroup != null && endGroup != null
  177 + && startGroup.compareTo(endGroup) != 0) {
  178 + ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();
  179 + ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();
  180 + for (String schema : schemata) {
  181 + if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {
  182 + mentionsToRemove.add(mention);
  183 + break;
  184 + }
  185 + }
  186 + }
  187 + }
  188 +
  189 + for (Mention mentionToRemove : mentionsToRemove) {
  190 + sentence.removeMention(mentionToRemove);
  191 + }
  192 + }*/
  193 +
  194 + private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
  195 + ArrayList<String> group2Types) {
  196 + for (String group1Type : group1Types) {
  197 + for (String group2Type : group2Types) {
  198 + if (schemaContains(schema, group1Type, group2Type)) {
  199 + return true;
  200 + }
  201 + }
  202 + }
  203 + return false;
  204 + }
  205 +
  206 + private static boolean schemaContains(String schema, String phraseType1,
  207 + String phraseType2) {
  208 + boolean phrType1Found = false;
  209 + boolean phrType2Found = false;
  210 + for (String position : schema.split("\\+")) {
  211 + position = position.trim();
  212 + position = position.substring(1, position.length()-1);
  213 + for (String phrT : position.split(";")) {
  214 + if (phrT.equals(phraseType1)) {
  215 + phrType1Found = true;
  216 + break;
  217 + } else if (phrT.equals(phraseType2)) {
  218 + phrType2Found = true;
  219 + break;
  220 + }
  221 + }
  222 + if (phrType1Found && phrType2Found) {
  223 + return true;
  224 + }
  225 + }
  226 + return false;
  227 + }
  228 +
  229 +
  230 + // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub)
  231 + public static void cleanQubs(Sentence sentence) {
  232 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  233 + for (Mention mention : sentence.getMentions()) {
  234 + if (mention.isPartOfQub()) {
  235 + mentionsToRemove.add(mention);
  236 + }
  237 + }
  238 +
  239 + for (Mention mentionToRemove : mentionsToRemove) {
  240 + sentence.removeMention(mentionToRemove);
  241 + }
  242 + }
  243 +
  244 + public static void cleanPreps(Sentence sentence) {
  245 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  246 + for (Mention mention : sentence.getMentions()) {
  247 + if (mention.isPartOfPrep()) {
  248 + mentionsToRemove.add(mention);
  249 + }
  250 + }
  251 +
  252 + for (Mention mentionToRemove : mentionsToRemove) {
  253 + sentence.removeMention(mentionToRemove);
  254 + }
  255 + }
  256 +
  257 + public static void cleanFrazeos(Sentence sentence) {
  258 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  259 + for (Mention mention : sentence.getMentions()) {
  260 + if (mention.isPartOfFrazeo()) {
  261 + mentionsToRemove.add(mention);
  262 + }
  263 + }
  264 +
  265 + for (Mention mentionToRemove : mentionsToRemove) {
  266 + sentence.removeMention(mentionToRemove);
  267 + }
  268 + }
  269 +
  270 + // wyrzuca wzmianki bedace czescia przyimkow zlozonych
  271 + public static void cleanComplexPreps(Sentence sentence,
  272 + ArrayList<String> complexPreps) {
  273 +
  274 + ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
  275 + for (Mention mention : sentence.getMentions()) {
  276 + if (mention.isPartOfComplexPrep(complexPreps)) {
  277 + mentionsToRemove.add(mention);
  278 + }
  279 + }
  280 +
  281 + for (Mention mentionToRemove : mentionsToRemove) {
  282 + sentence.removeMention(mentionToRemove);
  283 + }
  284 + }
  285 +
128 286 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... ... @@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection;
2 2  
3 3 import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
  5 +
  6 +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
5 7 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
6 8 import pl.waw.ipipan.zil.core.md.entities.*;
7 9  
8 10 import java.util.ArrayList;
9 11 import java.util.HashSet;
10 12 import java.util.List;
  13 +import java.util.Map;
11 14 import java.util.Set;
12 15  
13 16 public class Detector {
... ... @@ -18,21 +21,25 @@ public class Detector {
18 21 }
19 22  
20 23 public static void findMentionsInText(Text text,
21   - ZeroSubjectDetector zeroSubjectModel) {
  24 + ZeroSubjectDetector zeroSubjectModel,
  25 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
  26 + ArrayList<String> complexPreps) {
22 27 text.clearMentions();
23 28 logger.debug("Detecting mentions in text " + text.getId());
24 29 for (Paragraph p : text)
25 30 for (Sentence s : p)
26   - detectMentionsInSentence(s, zeroSubjectModel);
  31 + detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps);
27 32 }
28 33  
29 34 private static void detectMentionsInSentence(Sentence sentence,
30   - ZeroSubjectDetector zeroSubjectModel) {
  35 + ZeroSubjectDetector zeroSubjectModel,
  36 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
  37 + ArrayList<String> complexPreps) {
31 38 // adding mentions
32 39 addMentionsByTokenCtag(sentence);
33 40 addMentionsBySyntacticWordsCtag(sentence);
34 41 addMentionsByNamedEntities(sentence);
35   - addMentionsByGroups(sentence);
  42 + addMentionsByGroups(sentence, valence, complexPreps);
36 43 addSpeakerMentionsInSpoken(sentence);
37 44  
38 45 // zero subject detection
... ... @@ -41,6 +48,11 @@ public class Detector {
41 48 // removing mentions
42 49 removeTo(sentence);
43 50 Cleaner.cleanUnnecessarySentenceMentions(sentence);
  51 + //Cleaner.cleanQubs(sentence);
  52 + //Cleaner.cleanPreps(sentence);
  53 + //Cleaner.cleanComplexPreps(sentence, complexPreps);
  54 + Cleaner.cleanFrazeos(sentence);
  55 + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
44 56  
45 57 // updating mention heads
46 58 updateMentionHeads(sentence);
... ... @@ -95,16 +107,468 @@ public class Detector {
95 107 *
96 108 * @param sentence
97 109 */
98   - private static void addMentionsByGroups(Sentence sentence) {
99   - for (SyntacticGroup group : sentence.getGroups()) {
  110 + private static void addMentionsByGroups(Sentence sentence,
  111 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
  112 + ArrayList<String> complexPreps) {
  113 + List<SyntacticGroup> groups = sentence.getGroups();
  114 + for (int i = 0; i < groups.size(); i++) {
  115 + SyntacticGroup thisGroup = groups.get(i);
  116 +
  117 + /*SyntacticGroup nearPrepNG = null;
  118 + SyntacticGroup nextNG = null;*/
  119 +
  120 + SyntacticGroup nextGroup = thisGroup.getFollowingGroup();
  121 +
  122 + /*if (thisGroup.getType().startsWith("NG")) {
  123 + nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(),
  124 + sentence);
  125 + nextNG = thisGroup.getNextNG();
  126 + }*/
  127 +
  128 + /*if (nextNG != null) {
  129 + int prepStart = thisGroup.getSentencePositionEnd() + 1;
  130 + int prepEnd = nextNG.getSentencePositionStart() - 1;
  131 + String prep = sentence.getTextInsideSpan(prepStart, prepEnd);
  132 + if (complexPreps.contains(prep)) {
  133 + String cos = "";
  134 + }
  135 + }*/
  136 +
  137 + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&
  138 + //!isPartOfPrepNG(thisGroup, sentence) &&
  139 + //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
  140 + precedingWordIsVerb(thisGroup, sentence) &&
  141 + //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
  142 + !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
  143 + !sameSemanticHeads(thisGroup, nearPrepNG)) {
  144 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  145 + List<Token> segments = thisGroup.getTokens();
  146 + segments.addAll(nearPrepNG.getTokens());
  147 +
  148 + sentence.addMention(new Mention(segments, heads));
  149 + }*/
  150 + /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&
  151 + // !precedingWordIsVerb(thisGroup, sentence) &&
  152 + !isPartOfPrepNG(thisGroup, sentence) &&
  153 + getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
  154 + //!precedingWordIsVerb(thisGroup, sentence) &&
  155 + !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
  156 + //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
  157 + !sameSemanticHeads(thisGroup, nearPrepNG)) {
  158 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  159 + List<Token> segments = thisGroup.getTokens();
  160 + segments.addAll(nearPrepNG.getTokens());
  161 +
  162 + sentence.addMention(new Mention(segments, heads));
  163 + }*/
  164 + if (thisGroup.getType().startsWith("NG") &&
  165 + nextGroup != null && nextGroup.getType().startsWith("PrepNG") &&
  166 + NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) {
  167 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  168 + List<Token> segments = new ArrayList<Token>();
  169 + segments.addAll(thisGroup.getTokens());
  170 + segments.addAll(nextGroup.getTokens());
  171 +
  172 + sentence.addMention(new Mention(segments, heads));
  173 + } else if (thisGroup.getType().startsWith("NG") && nextGroup != null &&
  174 + nextGroup.getType().startsWith("NG") &&
  175 + NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))
  176 + ) {
  177 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  178 + List<Token> segments = new ArrayList<Token>();
  179 + segments.addAll(thisGroup.getTokens());
  180 + segments.addAll(nextGroup.getTokens());
  181 +
  182 + sentence.addMention(new Mention(segments, heads));
  183 + } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null &&
  184 + NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) {
  185 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  186 +
  187 + List<Token> segments = new ArrayList<Token>();
  188 + segments.addAll(thisGroup.getTokens());
  189 +
  190 + int prepStart = thisGroup.getSentencePositionEnd() + 1;
  191 + int prepEnd = nextNG.getSentencePositionStart() - 1;
  192 + ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd);
  193 + segments.addAll(prepSegments);
  194 +
  195 + segments.addAll(nextNG.getTokens());
  196 +
  197 + sentence.addMention(new Mention(segments, heads));
  198 + }*/
  199 + //else if // NG + im./pt. NG
  200 + // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka
  201 + // "instytut naukowy w montrealu" czemu sie nie laczy? ==> NE(orgName) + w(prep) + NE(placeName)
  202 + else if (thisGroup.getType().startsWith("NG")) {
  203 + List<Token> segments = thisGroup.getTokens();
  204 + List<Token> heads = thisGroup.getSemanticHeadTokens();
  205 +
  206 + sentence.addMention(new Mention(segments, heads));
  207 + }
  208 + }
  209 +
  210 + // oryginalna wersja
  211 + /*for (SyntacticGroup group : sentence.getGroups()) {
100 212 if (group.getType().startsWith("NG")) {
101 213 List<Token> segments = group.getTokens();
102 214 List<Token> heads = group.getSemanticHeadTokens();
103 215  
104 216 sentence.addMention(new Mention(segments, heads));
105 217 }
106   - }
  218 + }*/
  219 + }
  220 +
  221 + private static boolean followingWordIsInf(SyntacticGroup group,
  222 + Sentence sentence) {
  223 + int followingTokenPosition = group.getSentencePositionEnd() + 1;
  224 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  225 + int firstWordPosition = word.getSentencePositionStart();
  226 + if (followingTokenPosition == firstWordPosition &&
  227 + (word.getCtag().equals("Inf"))) {
  228 + return true;
  229 + }
  230 + }
  231 +
  232 + return false;
  233 + }
  234 +
  235 + private static SyntacticGroup getFollowingPrepNGs(int sentencePosition,
  236 + Sentence sentence) {
  237 + SyntacticGroup largestGroup = null;
  238 + int nextTokenPosition = sentencePosition + 1;
  239 + for (SyntacticGroup group : sentence.getGroups()) {
  240 + if (group.getType().startsWith("PrepNG") &&
  241 + group.getSentencePositionStart() == nextTokenPosition) {
  242 + if (largestGroup == null ||
  243 + largestGroup.getTokens().size() < group.getTokens().size()) {
  244 + largestGroup = group;
  245 + }
  246 + }
  247 + }
  248 + return largestGroup;
  249 + }
  250 +
  251 + private static boolean isPartOfPrepNG(SyntacticGroup NGGroup,
  252 + Sentence sentence) {
  253 + int NGGroupStart = NGGroup.getSentencePositionStart();
  254 + int NGGroupEnd = NGGroup.getSentencePositionEnd();
  255 + for (SyntacticGroup group : sentence.getGroups()) {
  256 + if (group.getType().startsWith("PrepNG") &&
  257 + group.getSentencePositionStart() <= NGGroupStart &&
  258 + group.getSentencePositionEnd() >= NGGroupEnd) {
  259 + return true;
  260 + }
  261 + }
  262 + return false;
  263 + }
  264 +
  265 + private static boolean precedingWordIsVerb(SyntacticGroup group,
  266 + Sentence sentence) {
  267 + int precedingTokenPosition = group.getSentencePositionStart() - 1;
  268 + if(isPartOfPrepNG(group, sentence)) {
  269 + SyntacticGroup parentGroup = getParentPrepNG(group, sentence);
  270 + precedingTokenPosition = parentGroup.getSentencePositionStart() - 1;
  271 + }
  272 +
  273 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  274 + int lastWordPosition = word.getSentencePositionEnd();
  275 + if (precedingTokenPosition == lastWordPosition &&
  276 + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
  277 + return true;
  278 + }
  279 + }
  280 + return false;
  281 + }
  282 +
  283 + // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem?
  284 + // czy prep moze sie skladac z wiecej niz jednego segmentu?
  285 + // dopasowywac refla i recip do sie spejdowego
  286 + private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup,
  287 + SyntacticGroup PrepNGGroup, Sentence sentence,
  288 + Map<String,ArrayList<String>> walentyMapping) {
  289 + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;
  290 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  291 + int lastWordPosition = word.getSentencePositionEnd();
  292 + if (precedingTokenPosition == lastWordPosition &&
  293 + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
  294 + String verb = word.getBase();
  295 + if (!walentyMapping.containsKey(verb)) {
  296 + return true;
  297 + } else {
  298 + SyntacticWord prepWord = PrepNGGroup.getFirstWord();
  299 +
  300 + if (prepWord.getTokens().size() == 1) {
  301 + Token prep = prepWord.getTokens().get(0);
  302 + String prepBase = prep.getBase();
  303 + // sprawdzic czy glowa moze miec wiele tokenow
  304 + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
  305 + ArrayList<String> prepnps = getPrepnps(prepBase, prepCase);
  306 +
  307 + ArrayList<String> schemata = walentyMapping.get(verb);
  308 + for (String schema : schemata) {
  309 + for (String prepnp : prepnps) {
  310 + if (schema.contains(prepnp)) {
  311 + return true;
  312 + }
  313 + }
  314 + }
  315 + } else if (prepWord.getTokens().size() > 1) {
  316 + String prepOrth = prepWord.getOrth().toLowerCase();
  317 + String comprepnp = String.format("comprepnp(%s)", prepOrth);
  318 + ArrayList<String> schemata = walentyMapping.get(verb);
  319 + for (String schema : schemata) {
  320 + if (schema.contains(comprepnp)) {
  321 + return true;
  322 + }
  323 + }
  324 +
  325 + }
  326 +
  327 +
  328 + }
  329 + }
  330 + }
  331 + return false;
  332 + }
  333 +
  334 + private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup,
  335 + SyntacticGroup PrepNGGroup, Sentence sentence,
  336 + Map<String,ArrayList<String>> walentyMapping) {
  337 + int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1;
  338 + if(isPartOfPrepNG(NGGroup, sentence)) {
  339 + SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence);
  340 + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1;
  341 + }
  342 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  343 + int lastWordPosition = word.getSentencePositionEnd();
  344 + if (precedingTokenPosition == lastWordPosition &&
  345 + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
  346 + if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) {
  347 + return true;
  348 + }
  349 + if (!walentyMapping.containsKey(word.getBase())) {
  350 + return true;
  351 + }
  352 +
  353 + }
  354 + }
  355 + return false;
107 356 }
  357 +
  358 + private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup,
  359 + SyntacticGroup PrepNGGroup, Sentence sentence,
  360 + Map<String,ArrayList<String>> walentyMapping) {
  361 + String verbBase = verb.getBase();
  362 + if (!walentyMapping.containsKey(verbBase)) {
  363 + return true;
  364 + } else {
  365 + ArrayList<String> schemata = walentyMapping.get(verbBase);
  366 +
  367 + // PrepNG + PrepNG
  368 + if (isPartOfPrepNG(NGGroup, sentence)) {
  369 + SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence);
  370 + ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations();
  371 + ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations();
  372 + for (String schema : schemata) {
  373 + if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) {
  374 + return true;
  375 + }
  376 + }
  377 + }
  378 +
  379 + // NG + PrepNG
  380 + else {
  381 + ArrayList<String> NGRealizations = NGGroup.getWalentyRealizations();
  382 + ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations();
  383 + for (String schema : schemata) {
  384 + if (isProperSchema(schema, NGRealizations, prepNGRealizations)) {
  385 + return true;
  386 + }
  387 + }
  388 + }
  389 + }
  390 + return false;
  391 + }
  392 +
  393 + private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
  394 + ArrayList<String> group2Types) {
  395 + for (String group1Type : group1Types) {
  396 + if (schema.contains(group1Type)) {
  397 + for (String group2Type : group2Types) {
  398 + if (schema.contains(group2Type)) {
  399 + return true;
  400 + }
  401 + }
  402 + }
  403 + }
  404 + return false;
  405 + }
  406 +
  407 + private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup,
  408 + Sentence sentence) {
  409 + SyntacticGroup parentPrepNG = null;
  410 + int NGGroupStart = NGGroup.getSentencePositionStart();
  411 + int NGGroupEnd = NGGroup.getSentencePositionEnd();
  412 + for (SyntacticGroup group : sentence.getGroups()) {
  413 + if (group.getType().startsWith("PrepNG") &&
  414 + group.getSentencePositionStart() <= NGGroupStart &&
  415 + group.getSentencePositionEnd() >= NGGroupEnd) {
  416 + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {
  417 + parentPrepNG = group;
  418 + }
  419 + }
  420 + }
  421 + return parentPrepNG;
  422 + }
  423 +
  424 + private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup,
  425 + SyntacticGroup PrepNGGroup, Sentence sentence,
  426 + Map<String,ArrayList<String>> walentyMapping) {
  427 + Token NGHead = NGGroup.getSemanticHeadTokens().get(0);
  428 +
  429 + String NGHeadBase = NGHead.getBase();
  430 +
  431 + if (!walentyMapping.containsKey(NGHeadBase)) {
  432 + return false;
  433 + } else {
  434 + SyntacticWord prepWord = PrepNGGroup.getFirstWord();
  435 +
  436 + if (prepWord.getTokens().size() == 1) {
  437 + Token prep = prepWord.getTokens().get(0);
  438 + String prepBase = prep.getBase();
  439 + String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
  440 + String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase);
  441 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  442 + for (String schema : schemata) {
  443 + if (schemaContains(schema, prepnp)) {
  444 + return true;
  445 + }
  446 + }
  447 + } else if (prepWord.getTokens().size() > 1) {
  448 + String prepOrth = prepWord.getOrth().toLowerCase();
  449 + String comprepnp = String.format("comprepnp(%s)", prepOrth);
  450 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  451 + for (String schema : schemata) {
  452 + if (schemaContains(schema, comprepnp)) {
  453 + return true;
  454 + }
  455 + }
  456 +
  457 + }
  458 +
  459 + }
  460 + return false;
  461 + }
  462 +
  463 + private static boolean NGNGValenceCompatibility(SyntacticGroup NG1,
  464 + SyntacticGroup NG2, Sentence sentence,
  465 + Map<String,ArrayList<String>> walentyMapping) {
  466 + Token NG1Head = NG1.getSemanticHeadTokens().get(0);
  467 +
  468 + String NGHeadBase = NG1Head.getBase();
  469 +
  470 + if (!walentyMapping.containsKey(NGHeadBase)) {
  471 + return false;
  472 + } else {
  473 + ArrayList<String> NG2realizations = NG2.getWalentyRealizations();
  474 +
  475 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  476 + for (String real : NG2realizations) {
  477 + for (String schema : schemata) {
  478 + if (schemaContains(schema, real)) {
  479 + return true;
  480 + }
  481 + }
  482 + }
  483 + }
  484 + return false;
  485 + }
  486 +
  487 + private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1,
  488 + SyntacticGroup NGGroup2, Sentence sentence,
  489 + Map<String,ArrayList<String>> walentyMapping) {
  490 +
  491 + Token NGHead = NGGroup1.getSemanticHeadTokens().get(0);
  492 + String NGHeadBase = NGHead.getBase();
  493 +
  494 + if (!walentyMapping.containsKey(NGHeadBase)) {
  495 + return false;
  496 + } else {
  497 + int prepStart = NGGroup1.getSentencePositionEnd() + 1;
  498 + int prepEnd = NGGroup2.getSentencePositionStart() - 1;
  499 + String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd);
  500 + String comprepnp = String.format("comprepnp(%s)", complexPrep);
  501 + ArrayList<String> schemata = walentyMapping.get(NGHeadBase);
  502 + for (String schema : schemata) {
  503 + if (schemaContains(schema, comprepnp)) {
  504 + return true;
  505 + }
  506 + }
  507 + }
  508 + return false;
  509 + }
  510 +
  511 + private static boolean schemaContains(String schema, String phraseType) {
  512 + for (String position : schema.split("\\s\\+\\s")) {
  513 + position = position.trim();
  514 + position = position.substring(1, position.length()-1);
  515 + for (String phrT : position.split(";")) {
  516 + if (phrT.equals(phraseType)) {
  517 + return true;
  518 + }
  519 + }
  520 + }
  521 + return false;
  522 + }
  523 +
  524 + private static boolean schemaContainsType(String schema, String type) {
  525 + // to lepiej dziala dla rzeczownikow
  526 + for (String position : schema.split("\\s\\+\\s")) {
  527 + position = position.trim();
  528 + position = position.substring(1, position.length()-1);
  529 + for (String phrT : position.split(";")) {
  530 +
  531 + if (phrT.startsWith(type+"(")) {
  532 + return true;
  533 + }
  534 + }
  535 + }
  536 + return false;
  537 + }
  538 +
  539 +
  540 + // compar ??
  541 + private static ArrayList<String> getPrepnps(String prepBase, String prepCase) {
  542 + ArrayList<String> prepnps = new ArrayList<String>();
  543 + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
  544 + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) {
  545 + prepnps.add(String.format("prepnp(%s,str)", prepBase));
  546 + }
  547 + if (prepCase.equals("gen") || prepCase.equals("acc")) {
  548 + prepnps.add(String.format("prepnp(%s,part)", prepBase));
  549 + }
  550 + return prepnps;
  551 + }
  552 +
  553 + // eliminuje "od wsi do wsi"
  554 + private static boolean sameSemanticHeads(SyntacticGroup group1,
  555 + SyntacticGroup group2) {
  556 +
  557 + List<Token> group1HeadTokens = group1.getSemanticHeadTokens();
  558 + List<Token> group2HeadTokens = group2.getSemanticHeadTokens();
  559 + if (group1HeadTokens.size() != group2HeadTokens.size()) {
  560 + return false;
  561 + }
  562 +
  563 + for (int i=0; i < group1HeadTokens.size(); i++) {
  564 + if (!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) {
  565 + return false;
  566 + }
  567 + }
  568 +
  569 + return true;
  570 + }
  571 +
108 572  
109 573 /**
110 574 * Wyszukuję i oznaczam wszystkie NER
... ... @@ -151,8 +615,9 @@ public class Detector {
151 615 * @param sentence
152 616 */
153 617 private static void addMentionsByTokenCtag(Sentence sentence) {
154   - for (Token token : sentence)
  618 + for (Token token : sentence) {
155 619 if (token.getCtag().matches(Constants.MORPHO_CTAGS))
156 620 sentence.addMention(new Mention(token));
  621 + }
157 622 }
158 623 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
3 3 import java.util.ArrayList;
  4 +import java.util.Arrays;
4 5 import java.util.List;
5 6  
6 7 /**
... ... @@ -203,4 +204,83 @@ public class Mention implements Comparable&lt;Mention&gt; {
203 204 public boolean isZeroSubject() {
204 205 return isZeroSubject;
205 206 }
  207 +
  208 + public int getSentencePositionStart() {
  209 + Token startToken = this.getFirstSegment();
  210 + return startToken.getSentencePosition();
  211 + }
  212 +
  213 + public int getSentencePositionEnd() {
  214 + Token endToken = this.getLastSegment();
  215 + return endToken.getSentencePosition();
  216 + }
  217 +
  218 + public boolean isPartOfQub() {
  219 + if (this.segments.size() == 1) {
  220 + Sentence sentence = this.segments.get(0).getSentence();
  221 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  222 + if (word.getTokens().contains(this.segments.get(0)) &&
  223 + word.getCtag().equals("Qub")) {
  224 + return true;
  225 + }
  226 + }
  227 + }
  228 + return false;
  229 + }
  230 +
  231 + public boolean isPartOfPrep() {
  232 + if (this.segments.size() == 1) {
  233 + Sentence sentence = this.segments.get(0).getSentence();
  234 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  235 + if (word.getTokens().contains(this.segments.get(0)) &&
  236 + word.getCtag().equals("Prep")) {
  237 + return true;
  238 + }
  239 + }
  240 + }
  241 + return false;
  242 + }
  243 +
  244 + private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj",
  245 + "Adj", "Conj", "Comp");
  246 +
  247 + public boolean isPartOfFrazeo() {
  248 + if (this.segments.size() == 1) {
  249 + Sentence sentence = this.segments.get(0).getSentence();
  250 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  251 + if (word.getTokens().contains(this.segments.get(0)) &&
  252 + FRAZEOS.contains(word.getCtag())) {
  253 + return true;
  254 + }
  255 + }
  256 + }
  257 + return false;
  258 + }
  259 +
  260 + public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) {
  261 + if (this.segments.size() == 1) {
  262 + Sentence sentence = this.segments.get(0).getSentence();
  263 + if (this.getSentencePositionStart() - 1 >= 0) {
  264 + String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth();
  265 + String noun = sentence.get(this.getSentencePositionStart()).getOrth();
  266 + String possiblePrep = String.format("%s %s", prep, noun);
  267 + if (complexPreps.contains(possiblePrep)) {
  268 + return true;
  269 + }
  270 + }
  271 +
  272 + if (this.getSentencePositionStart() - 1 >= 0 &&
  273 + this.getSentencePositionStart() + 1 < sentence.size()) {
  274 + String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth();
  275 + String noun = sentence.get(this.getSentencePositionStart()).getOrth();
  276 + String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth();
  277 + String possiblePrep = String.format("%s %s %s", prep1, noun, prep2);
  278 + if (complexPreps.contains(possiblePrep)) {
  279 + return true;
  280 + }
  281 + }
  282 + }
  283 + return false;
  284 + }
  285 +
206 286 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
... ... @@ -109,4 +109,118 @@ public class Sentence extends ArrayList&lt;Token&gt; {
109 109 public void addNamedEntity(NamedEntity namedEntity) {
110 110 namedEntities.add(namedEntity);
111 111 }
  112 +
  113 + public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) {
  114 + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();
  115 + for (SyntacticGroup group : this.syntacticGroups) {
  116 + if (group.getSentencePositionStart() >= start &&
  117 + group.getSentencePositionEnd() <= end) {
  118 + if (!(group.getSentencePositionStart() == start &&
  119 + group.getSentencePositionEnd() == end)) {
  120 + groupsAtSpan.add(group);
  121 + }
  122 + }
  123 + }
  124 + return groupsAtSpan;
  125 + }
  126 +
  127 + public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) {
  128 + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();
  129 + for (SyntacticGroup group : this.syntacticGroups) {
  130 +
  131 + if (group.getSentencePositionStart() >= start &&
  132 + group.getSentencePositionEnd() <= end) {
  133 + if (!(group.getSentencePositionStart() == start &&
  134 + group.getSentencePositionEnd() == end)) {
  135 + groupsAtSpan.add(group);
  136 + }
  137 + }
  138 + }
  139 + return groupsAtSpan;
  140 + }
  141 +
  142 + public SyntacticGroup getFirstGroup(int start, int end) {
  143 + SyntacticGroup largestGroup = null;
  144 + int step = start;
  145 + while (step <= end && largestGroup == null) {
  146 + largestGroup = getLargestGroupOnStartPoint(step, end);
  147 + step++;
  148 + }
  149 + return largestGroup;
  150 + }
  151 +
  152 + private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) {
  153 + SyntacticGroup largestGroup = null;
  154 + for (SyntacticGroup group : this.getGroups()) {
  155 + int groupStart = group.getSentencePositionStart();
  156 + int groupEnd = group.getSentencePositionEnd();
  157 + if (groupStart == start && groupEnd <= end &&
  158 + !(groupStart == start && groupEnd == end) &&
  159 + (largestGroup == null ||
  160 + largestGroup.getTokens().size() < group.getTokens().size())) {
  161 + largestGroup = group;
  162 + }
  163 + }
  164 + return largestGroup;
  165 + }
  166 +
  167 + public SyntacticGroup getLastGroup(int start, int end) {
  168 + SyntacticGroup largestGroup = null;
  169 + int step = end;
  170 + while (step != start && largestGroup == null) {
  171 + largestGroup = getLargestGroupOnEndPoint(start, step);
  172 + step--;
  173 + }
  174 + return largestGroup;
  175 + }
  176 +
  177 + private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) {
  178 + SyntacticGroup largestGroup = null;
  179 + for (SyntacticGroup group : this.getGroups()) {
  180 + int groupStart = group.getSentencePositionStart();
  181 + int groupEnd = group.getSentencePositionEnd();
  182 + if (groupEnd == end && groupStart >= start &&
  183 + !(groupStart == start && groupEnd == end) &&
  184 + (largestGroup == null ||
  185 + largestGroup.getTokens().size() < group.getTokens().size())) {
  186 + largestGroup = group;
  187 + }
  188 + }
  189 + return largestGroup;
  190 + }
  191 +
  192 + public ArrayList<Mention> getMentionsInsideSpan(int start, int end) {
  193 + ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>();
  194 + for (Mention mention : this.mentions) {
  195 + if (mention.getSentencePositionStart() >= start &&
  196 + mention.getSentencePositionEnd() <= end) {
  197 + mentionsAtSpan.add(mention);
  198 + }
  199 + }
  200 + return mentionsAtSpan;
  201 + }
  202 +
  203 + public String getTextInsideSpan(int start, int end) {
  204 + String text = "";
  205 + int step = start;
  206 + while (step <= end) {
  207 + if (step != start) {
  208 + text += " ";
  209 + }
  210 + text += this.get(step).getOrth();
  211 + step++;
  212 + }
  213 + return text;
  214 + }
  215 +
  216 + public ArrayList<Token> getSegmentsInsideSpan(int start, int end) {
  217 + ArrayList<Token> tokensAtSpan = new ArrayList<Token>();
  218 + int step = start;
  219 + while (step <= end) {
  220 + tokensAtSpan.add(this.get(step));
  221 + step++;
  222 + }
  223 + return tokensAtSpan;
  224 + }
  225 +
112 226 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
  3 +import java.util.ArrayList;
3 4 import java.util.Iterator;
4 5 import java.util.List;
5 6  
... ... @@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
53 54  
54 55 return getType().compareTo(o.getType());
55 56 }
  57 +
  58 + public int getSentencePositionStart() {
  59 + Token startToken = tokens.get(0);
  60 + return startToken.getSentencePosition();
  61 + }
  62 +
  63 + public int getSentencePositionEnd() {
  64 + Token endToken = tokens.get(tokens.size()-1);
  65 + return endToken.getSentencePosition();
  66 + }
  67 +
  68 +
  69 + public SyntacticWord getFirstWord() {
  70 + SyntacticWord firstWord = null;
  71 + Token startToken = tokens.get(0);
  72 + Sentence sentence = startToken.getSentence();
  73 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  74 + if(startToken.compareTo(word.getTokens().get(0)) == 0 &&
  75 + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) {
  76 + firstWord = word;
  77 + }
  78 + }
  79 + return firstWord;
  80 + }
  81 +
  82 + // NG and PrepNG only now
  83 + public ArrayList<String> getWalentyRealizations() {
  84 + ArrayList<String> realizations = new ArrayList<String>();
  85 + if (this.type.startsWith("PrepNG")) {
  86 + SyntacticWord prepWord = this.getFirstWord();
  87 + if (prepWord.getTokens().size() == 1) {
  88 +
  89 + Token prep = prepWord.getTokens().get(0);
  90 + String prepBase = prep.getBase();
  91 + String prepCase = this.getSemanticHeadTokens().get(0).getCase();
  92 + realizations.addAll(getPrepnps(prepBase, prepCase));
  93 +
  94 + } else if (prepWord.getTokens().size() > 1) {
  95 +
  96 + String prepOrth = prepWord.getOrth().toLowerCase();
  97 + String comprepnp = String.format("comprepnp(%s)", prepOrth);
  98 + realizations.add(comprepnp);
  99 +
  100 + }
  101 + } else if (this.type.startsWith("NG")) {
  102 + String npCase = this.getSemanticHeadTokens().get(0).getCase();
  103 + realizations.addAll(getNps(npCase));
  104 + }
  105 + return realizations;
  106 + }
  107 +
  108 + // compar ??
  109 + private ArrayList<String> getPrepnps(String prepBase, String prepCase) {
  110 + ArrayList<String> prepnps = new ArrayList<String>();
  111 + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
  112 + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) {
  113 + prepnps.add(String.format("prepnp(%s,str)", prepBase));
  114 + }
  115 + if (prepCase.equals("gen") || prepCase.equals("acc")) {
  116 + prepnps.add(String.format("prepnp(%s,part)", prepBase));
  117 + }
  118 + return prepnps;
  119 + }
  120 +
  121 + private ArrayList<String> getNps(String npCase) {
  122 + ArrayList<String> nps = new ArrayList<String>();
  123 + nps.add(String.format("np(%s)", npCase));
  124 + if (npCase.equals("nom") || npCase.equals("gen") || npCase.equals("acc")) {
  125 + nps.add(String.format("np(str)"));
  126 + }
  127 + if (npCase.equals("gen") || npCase.equals("acc")) {
  128 + nps.add(String.format("np(part)"));
  129 + }
  130 + return nps;
  131 + }
  132 +
  133 + public boolean precedingWordIsVerb() {
  134 + Sentence sentence = this.tokens.get(0).getSentence();
  135 + int precedingTokenPosition = this.getSentencePositionStart() - 1;
  136 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  137 + int lastWordPosition = word.getSentencePositionEnd();
  138 + if (precedingTokenPosition == lastWordPosition &&
  139 + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
  140 + return true;
  141 + }
  142 + }
  143 + return false;
  144 + }
  145 +
  146 + public SyntacticGroup getNextNG() {
  147 + Sentence sentence = this.tokens.get(0).getSentence();
  148 + int thisGroupEnd = this.getSentencePositionEnd();
  149 + int sentenceLength = sentence.size();
  150 +
  151 + SyntacticGroup nextNG = null;
  152 + for (int step = thisGroupEnd; step < sentenceLength; step++) {
  153 + nextNG = sentence.getFirstGroup(step, sentenceLength);
  154 + if (nextNG != null && nextNG.type.startsWith("NG") &&
  155 + this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) {
  156 + break;
  157 + } else {
  158 + nextNG = null;
  159 + }
  160 + }
  161 + return nextNG;
  162 + }
  163 +
  164 + public SyntacticGroup getFollowingGroup() {
  165 + SyntacticGroup largestGroup = null;
  166 + Sentence sentence = this.tokens.get(0).getSentence();
  167 + int nextTokenPosition = this.getSentencePositionEnd() + 1;
  168 + for (SyntacticGroup group : sentence.getGroups()) {
  169 + if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) &&
  170 + group.getSentencePositionStart() == nextTokenPosition) {
  171 + if (largestGroup == null ||
  172 + largestGroup.getTokens().size() < group.getTokens().size()) {
  173 + largestGroup = group;
  174 + }
  175 + }
  176 + }
  177 + return largestGroup;
  178 + }
  179 +
  180 + public SyntacticWord getPrecedingVerb() {
  181 + int precedingTokenPosition = this.getSentencePositionStart() - 1;
  182 + Sentence sentence = this.tokens.get(0).getSentence();
  183 + if(this.isPartOfPrepNG()) {
  184 + SyntacticGroup parentNGGroup = this.getParentPrepNG();
  185 + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1;
  186 + }
  187 + for (SyntacticWord word : sentence.getSyntacticWords()) {
  188 + int lastWordPosition = word.getSentencePositionEnd();
  189 + if (precedingTokenPosition == lastWordPosition &&
  190 + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) {
  191 + return word;
  192 + }
  193 + }
  194 + return null;
  195 + }
  196 +
  197 + private boolean isPartOfPrepNG() {
  198 + int NGGroupStart = this.getSentencePositionStart();
  199 + int NGGroupEnd = this.getSentencePositionEnd();
  200 + Sentence sentence = this.tokens.get(0).getSentence();
  201 + for (SyntacticGroup group : sentence.getGroups()) {
  202 + if (group.getType().startsWith("PrepNG") &&
  203 + group.getSentencePositionStart() <= NGGroupStart &&
  204 + group.getSentencePositionEnd() >= NGGroupEnd) {
  205 + return true;
  206 + }
  207 + }
  208 + return false;
  209 + }
  210 +
  211 + private SyntacticGroup getParentPrepNG() {
  212 + SyntacticGroup parentPrepNG = null;
  213 + int NGGroupStart = this.getSentencePositionStart();
  214 + int NGGroupEnd = this.getSentencePositionEnd();
  215 + Sentence sentence = this.tokens.get(0).getSentence();
  216 + for (SyntacticGroup group : sentence.getGroups()) {
  217 + if (group.getType().startsWith("PrepNG") &&
  218 + group.getSentencePositionStart() <= NGGroupStart &&
  219 + group.getSentencePositionEnd() >= NGGroupEnd) {
  220 + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) {
  221 + parentPrepNG = group;
  222 + }
  223 + }
  224 + }
  225 + return parentPrepNG;
  226 + }
  227 +
56 228 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
... ... @@ -6,11 +6,16 @@ import java.util.List;
6 6  
7 7 public class SyntacticWord implements Comparable<SyntacticWord> {
8 8  
  9 + private String base;
9 10 private String ctag;
  11 + private String orth;
10 12 private List<Token> tokens = new ArrayList<>();
11 13  
12   - public SyntacticWord(String ctag, List<Token> tokens) {
  14 + public SyntacticWord(String ctag, List<Token> tokens,
  15 + String base, String orth) {
  16 + this.base = base;
13 17 this.ctag = ctag;
  18 + this.orth = orth;
14 19 this.tokens = tokens;
15 20 }
16 21  
... ... @@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable&lt;SyntacticWord&gt; {
39 44  
40 45 return getCtag().compareTo(o.getCtag());
41 46 }
  47 +
  48 + public int getSentencePositionStart() {
  49 + Token startToken = tokens.get(0);
  50 + return startToken.getSentencePosition();
  51 + }
  52 +
  53 + public int getSentencePositionEnd() {
  54 + Token endToken = tokens.get(tokens.size()-1);
  55 + return endToken.getSentencePosition();
  56 + }
  57 +
  58 + public String getBase() {
  59 + return this.base;
  60 + }
  61 +
  62 + public String getOrth() {
  63 + return this.orth;
  64 + }
  65 +
  66 + public boolean isVerb() {
  67 + if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) {
  68 + return true;
  69 + }
  70 + return false;
  71 + }
  72 +
  73 + public boolean isInterp() {
  74 + if (this.ctag.equals("Interp")) {
  75 + return true;
  76 + }
  77 + return false;
  78 + }
42 79  
43 80 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... ... @@ -70,6 +70,7 @@ public class TeiLoader {
70 70 for (TEIMorph mo : m.getHeadMorphs())
71 71 headTokens.add(teiMorph2Segment.get(mo));
72 72 s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
  73 + System.out.println(tokens.toString());
73 74 }
74 75  
75 76 private static void loadSyntacticGroup(Sentence s, TEIGroup g,
... ... @@ -94,10 +95,12 @@ public class TeiLoader {
94 95 private static void loadSyntacticWord(Sentence s, TEIWord w,
95 96 Map<TEIMorph, Token> teiMorph2Segment) {
96 97 String ctag = w.getInterpretation().getCtag();
  98 + String base = w.getInterpretation().getBase();
  99 + String orth = w.getOrth();
97 100 List<Token> tokens = new ArrayList<>();
98 101 for (TEIMorph m : w.getAllMorphs())
99 102 tokens.add(teiMorph2Segment.get(m));
100   - s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  103 + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth));
101 104 }
102 105  
103 106 private static void loadNE(Sentence s, TEINamedEntity ne,
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
... ... @@ -73,10 +73,12 @@ public class ThriftLoader {
73 73 private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
74 74 Map<String, Object> thirftId2Entity,
75 75 Map<String, Token> thiftTokenId2Token) {
  76 + String base = w.getChosenInterpretation().getBase();
76 77 String ctag = w.getChosenInterpretation().getCtag();
  78 + String orth = w.getOrth();
77 79 List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
78 80 thiftTokenId2Token, false);
79   - s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  81 + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth));
80 82 }
81 83  
82 84 private static void loadNE(Sentence s, TNamedEntity ne,
... ...