Commit c9b1021d986bf2636bfd978885170dedd7d45546
1 parent
d3b1a80b
fix sonar issues
Showing
15 changed files
with
177 additions
and
142 deletions
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java
... | ... | @@ -92,7 +92,10 @@ class Cli { |
92 | 92 | public void validate(String name, String value) { |
93 | 93 | File file = new File(value); |
94 | 94 | try { |
95 | - file.createNewFile(); | |
95 | + boolean newFile = file.createNewFile(); | |
96 | + if (!newFile) { | |
97 | + LOG.warn("Output file exists and will be overridden."); | |
98 | + } | |
96 | 99 | } catch (IOException ex) { |
97 | 100 | throw new ParameterException("Parameter " + name |
98 | 101 | + " should be a valid file path (found " + value + ")", ex); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
... | ... | @@ -67,11 +67,14 @@ public class FeatureExtractor { |
67 | 67 | } |
68 | 68 | } |
69 | 69 | for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { |
70 | - for (Attribute attribute : attribute2max.keySet()) { | |
70 | + for (Map.Entry<Attribute, Double> entry : attribute2max.entrySet()) { | |
71 | + Attribute attribute = entry.getKey(); | |
72 | + Double max = entry.getValue(); | |
73 | + | |
71 | 74 | Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); |
72 | 75 | entityAttributes.put(normalizedAttribute, |
73 | 76 | (entityAttributes.get(attribute) - attribute2min.get(attribute)) |
74 | - / (attribute2max.get(attribute) - attribute2min.get(attribute))); | |
77 | + / (max - attribute2min.get(attribute))); | |
75 | 78 | } |
76 | 79 | } |
77 | 80 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... | ... | @@ -54,49 +54,56 @@ public class FeatureHelper { |
54 | 54 | coref2mentions.put(coref, ments); |
55 | 55 | } |
56 | 56 | |
57 | - int parIdx = 0; | |
58 | - int sentIdx = 0; | |
59 | - int mentionIdx = 0; | |
57 | + Counters counters = new Counters(); | |
60 | 58 | for (TParagraph par : preprocessedText.getParagraphs()) { |
61 | - Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); | |
62 | - mention2Orth.putAll(m2o); | |
63 | - Map<TMention, String> m2b = loadMention2Base(par.getSentences()); | |
64 | - mention2Base.putAll(m2b); | |
65 | - | |
66 | - int sentIdxInPar = 0; | |
67 | - int mentionIdxInPar = 0; | |
68 | - for (TSentence sent : par.getSentences()) { | |
69 | - | |
70 | - Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
71 | - | |
72 | - Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
73 | - for (TNamedEntity namedEntity : sent.getNames()) { | |
74 | - for (String childId : namedEntity.getChildIds()) { | |
75 | - tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
76 | - tokenId2namedEntities.get(childId).add(namedEntity); | |
77 | - } | |
78 | - } | |
59 | + processParagraph(counters, par); | |
60 | + } | |
61 | + } | |
79 | 62 | |
80 | - int mentionIdxInSent = 0; | |
81 | - for (TMention mention : sent.getMentions()) { | |
82 | - mention2sent.put(mention, sent); | |
83 | - mention2par.put(mention, par); | |
84 | - mention2index.put(mention, mentionIdx++); | |
85 | - mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); | |
86 | - mention2indexInSent.put(mention, mentionIdxInSent++); | |
87 | - mention2indexInPar.put(mention, mentionIdxInPar++); | |
88 | - | |
89 | - String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
90 | - mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
91 | - if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
92 | - mentionsInNamedEntities.add(mention); | |
93 | - } | |
94 | - sent2Index.put(sent, sentIdx++); | |
95 | - sent2IndexInPar.put(sent, sentIdxInPar++); | |
63 | + private void processParagraph(Counters counters, TParagraph par) { | |
64 | + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); | |
65 | + mention2Orth.putAll(m2o); | |
66 | + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); | |
67 | + mention2Base.putAll(m2b); | |
68 | + | |
69 | + int sentIdxInPar = 0; | |
70 | + int mentionIdxInPar = 0; | |
71 | + for (TSentence sent : par.getSentences()) { | |
72 | + | |
73 | + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
74 | + | |
75 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = getTokenId2NamedEntities(sent); | |
76 | + | |
77 | + int mentionIdxInSent = 0; | |
78 | + for (TMention mention : sent.getMentions()) { | |
79 | + mention2sent.put(mention, sent); | |
80 | + mention2par.put(mention, par); | |
81 | + mention2index.put(mention, counters.mentionIdx++); | |
82 | + mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); | |
83 | + mention2indexInSent.put(mention, mentionIdxInSent++); | |
84 | + mention2indexInPar.put(mention, mentionIdxInPar++); | |
85 | + | |
86 | + String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
87 | + mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
88 | + if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
89 | + mentionsInNamedEntities.add(mention); | |
96 | 90 | } |
91 | + sent2Index.put(sent, counters.sentIdx++); | |
92 | + sent2IndexInPar.put(sent, sentIdxInPar++); | |
93 | + } | |
97 | 94 | |
98 | - par2Index.put(par, parIdx++); | |
95 | + par2Index.put(par, counters.parIdx++); | |
96 | + } | |
97 | + | |
98 | + private static Map<String, Set<TNamedEntity>> getTokenId2NamedEntities(TSentence sent) { | |
99 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
100 | + for (TNamedEntity namedEntity : sent.getNames()) { | |
101 | + for (String childId : namedEntity.getChildIds()) { | |
102 | + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
103 | + tokenId2namedEntities.get(childId).add(namedEntity); | |
104 | + } | |
99 | 105 | } |
106 | + return tokenId2namedEntities; | |
100 | 107 | } |
101 | 108 | |
102 | 109 | public List<TMention> getMentions() { |
... | ... | @@ -220,31 +227,35 @@ public class FeatureHelper { |
220 | 227 | return mention2sent.get(mention).getTokens().get(idx - 1); |
221 | 228 | } |
222 | 229 | |
223 | - private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
230 | + private static Map<TMention, String> loadMention2Orth(List<TSentence> sentences) { | |
224 | 231 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
225 | - for (TSentence s : sents) { | |
226 | - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
232 | + for (TSentence sentence : sentences) { | |
233 | + Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
227 | 234 | |
228 | - for (TMention m : s.getMentions()) { | |
229 | - StringBuilder mentionOrth = new StringBuilder(); | |
230 | - for (String tokId : m.getChildIds()) { | |
231 | - TToken token = tokId2tok.get(tokId); | |
232 | - if (!token.isNoPrecedingSpace()) | |
233 | - mentionOrth.append(" "); | |
234 | - mentionOrth.append(token.getOrth()); | |
235 | - } | |
236 | - mention2orth.put(m, mentionOrth.toString().trim()); | |
235 | + for (TMention mention : sentence.getMentions()) { | |
236 | + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); | |
237 | 237 | } |
238 | 238 | } |
239 | 239 | return mention2orth; |
240 | 240 | } |
241 | 241 | |
242 | - private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
242 | + private static String getMentionOrth(Map<String, TToken> tokId2tok, TMention m) { | |
243 | + StringBuilder mentionOrth = new StringBuilder(); | |
244 | + for (String tokId : m.getChildIds()) { | |
245 | + TToken token = tokId2tok.get(tokId); | |
246 | + if (!token.isNoPrecedingSpace()) | |
247 | + mentionOrth.append(" "); | |
248 | + mentionOrth.append(token.getOrth()); | |
249 | + } | |
250 | + return mentionOrth.toString().trim(); | |
251 | + } | |
252 | + | |
253 | + private static Map<TMention, String> loadMention2Base(List<TSentence> sentences) { | |
243 | 254 | Map<TMention, String> mention2base = Maps.newHashMap(); |
244 | - for (TSentence s : sents) { | |
245 | - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | |
255 | + for (TSentence sentence : sentences) { | |
256 | + Map<String, String> tokId2base = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | |
246 | 257 | |
247 | - for (TMention m : s.getMentions()) { | |
258 | + for (TMention m : sentence.getMentions()) { | |
248 | 259 | StringBuilder mentionBase = new StringBuilder(); |
249 | 260 | for (String tokId : m.getChildIds()) { |
250 | 261 | mentionBase.append(" "); |
... | ... | @@ -255,4 +266,10 @@ public class FeatureHelper { |
255 | 266 | } |
256 | 267 | return mention2base; |
257 | 268 | } |
269 | + | |
270 | + private static class Counters { | |
271 | + int parIdx = 0; | |
272 | + int sentIdx = 0; | |
273 | + int mentionIdx = 0; | |
274 | + } | |
258 | 275 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... | ... | @@ -18,6 +18,10 @@ import java.util.stream.Collectors; |
18 | 18 | |
19 | 19 | public class MentionFeatureExtractor extends FeatureExtractor { |
20 | 20 | |
21 | + private static final String SCORE_ATTRIBUTE_NAME = "score"; | |
22 | + private static final String OTHER_VALUE = "other"; | |
23 | + private static final String NULL_VALUE = "null"; | |
24 | + | |
21 | 25 | private final List<String> frequentBases; |
22 | 26 | |
23 | 27 | public MentionFeatureExtractor() throws IOException { |
... | ... | @@ -48,10 +52,10 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
48 | 52 | addBinaryAttribute(prefix + "_is_named"); |
49 | 53 | addBinaryAttribute(prefix + "_is_pronoun"); |
50 | 54 | addNominalAttribute(prefix + "_ctag", Constants.POS_TAGS); |
51 | - addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); | |
52 | - addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
53 | - addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); | |
54 | - addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n")); | |
55 | + addNominalAttribute(prefix + "_person", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "pri", "sec", "ter")); | |
56 | + addNominalAttribute(prefix + "_case", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
57 | + addNominalAttribute(prefix + "_number", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "sg", "pl")); | |
58 | + addNominalAttribute(prefix + "_gender", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "f", "m1", "m2", "m3", "n")); | |
55 | 59 | |
56 | 60 | // relation to other |
57 | 61 | addBinaryAttribute(prefix + "_is_nested"); |
... | ... | @@ -76,8 +80,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
76 | 80 | } |
77 | 81 | } |
78 | 82 | |
79 | - addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
80 | - fillSortedAttributes("score"); | |
83 | + addNominalAttribute(SCORE_ATTRIBUTE_NAME, Lists.newArrayList("bad", "good")); | |
84 | + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); | |
81 | 85 | } |
82 | 86 | |
83 | 87 | private String encodeBase(String base) { |
... | ... | @@ -143,8 +147,11 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
143 | 147 | attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); |
144 | 148 | |
145 | 149 | String orth = helper.getMentionOrth(mention); |
146 | - attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1)))); | |
147 | - attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth))); | |
150 | + String firstLetter = orth.isEmpty() ? "" : orth.substring(0, 1); | |
151 | + String firstLetterUpperCased = firstLetter.toUpperCase(); | |
152 | + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(!orth.isEmpty() && firstLetterUpperCased.equals(firstLetter))); | |
153 | + String upperCased = orth.toUpperCase(); | |
154 | + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(upperCased.equals(orth))); | |
148 | 155 | attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length()); |
149 | 156 | |
150 | 157 | // par characteristics |
... | ... | @@ -159,8 +166,8 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
159 | 166 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size()); |
160 | 167 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence)); |
161 | 168 | attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence)); |
162 | - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("."))); | |
163 | - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?"))); | |
169 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); | |
170 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(mentionSentence)))); | |
164 | 171 | |
165 | 172 | // frequent bases |
166 | 173 | String mentionBase = helper.getMentionBase(mention); |
... | ... | @@ -174,14 +181,14 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
174 | 181 | int index = att.indexOfValue(value); |
175 | 182 | if (index == -1) |
176 | 183 | LOG.warn("{} not found for attribute {}", value, attributeName); |
177 | - attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | |
184 | + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue(OTHER_VALUE) : index)); | |
178 | 185 | } |
179 | 186 | |
180 | 187 | |
181 | 188 | private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) { |
182 | 189 | for (TMention m : mentions) { |
183 | 190 | Map<Attribute, Double> map = Maps.newHashMap(); |
184 | - map.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
191 | + map.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); | |
185 | 192 | result.put(m, map); |
186 | 193 | } |
187 | 194 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... | ... | @@ -13,6 +13,8 @@ import java.util.stream.Collectors; |
13 | 13 | |
14 | 14 | public class SentenceFeatureExtractor extends FeatureExtractor { |
15 | 15 | |
16 | + private static final String SCORE_ATTRIBUTE_NAME = "score"; | |
17 | + | |
16 | 18 | public SentenceFeatureExtractor() { |
17 | 19 | |
18 | 20 | addNumericAttributeNormalized("sent_mention_cluster_count"); |
... | ... | @@ -39,8 +41,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
39 | 41 | addNumericAttribute("text_mention_count"); |
40 | 42 | addNumericAttribute("text_cluster_count"); |
41 | 43 | |
42 | - addNumericAttribute("score"); | |
43 | - fillSortedAttributes("score"); | |
44 | + addNumericAttribute(SCORE_ATTRIBUTE_NAME); | |
45 | + fillSortedAttributes(SCORE_ATTRIBUTE_NAME); | |
44 | 46 | } |
45 | 47 | |
46 | 48 | public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { |
... | ... | @@ -70,8 +72,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
70 | 72 | feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); |
71 | 73 | feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); |
72 | 74 | feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); |
73 | - feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | |
74 | - feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | |
75 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(sentence)))); | |
76 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(sentence)))); | |
75 | 77 | |
76 | 78 | feature2value.put(getAttributeByName("par_idx"), (double) parIdx); |
77 | 79 | feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); |
... | ... | @@ -84,7 +86,7 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
84 | 86 | feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); |
85 | 87 | feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); |
86 | 88 | |
87 | - feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
89 | + feature2value.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue()); | |
88 | 90 | |
89 | 91 | feature2value.remove(null); |
90 | 92 | |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java
... | ... | @@ -42,12 +42,13 @@ public class ResourceUtils { |
42 | 42 | } |
43 | 43 | |
44 | 44 | private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { |
45 | + Predicate<String> stringIsNonempty = (String s) -> !s.isEmpty(); | |
45 | 46 | try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { |
46 | 47 | return IOUtils.readLines(stream, Constants.ENCODING) |
47 | 48 | .stream() |
48 | 49 | .map(String::trim) |
49 | 50 | .map(String::toLowerCase) |
50 | - .filter(((Predicate<String>) String::isEmpty).negate()) | |
51 | + .filter(stringIsNonempty) | |
51 | 52 | .sorted() |
52 | 53 | .distinct() |
53 | 54 | .collect(Collectors.toList()); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... | ... | @@ -37,15 +37,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
37 | 37 | private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count"; |
38 | 38 | private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length"; |
39 | 39 | private static final String IS_PAN_OR_PANI = "_is_pan_or_pani"; |
40 | - | |
41 | - // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet( | |
42 | -// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ","); | |
43 | 40 | private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy"); |
44 | - | |
45 | 41 | private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(); |
46 | -// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet( | |
47 | -// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ","); | |
48 | - | |
49 | 42 | private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_"; |
50 | 43 | private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_"; |
51 | 44 | |
... | ... | @@ -105,7 +98,10 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
105 | 98 | candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue()); |
106 | 99 | |
107 | 100 | TMention mention = candidate.getZeroCandidateMention(); |
108 | - TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); | |
101 | + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().orElse(null); | |
102 | + if (antecedent == null) { | |
103 | + throw new IllegalArgumentException("Mention pair without first element!"); | |
104 | + } | |
109 | 105 | |
110 | 106 | addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX); |
111 | 107 | addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX); |
... | ... | @@ -165,14 +161,14 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
165 | 161 | TSentence mentionSentence = helper.getMentionSentence(mention); |
166 | 162 | candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size()); |
167 | 163 | candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size()); |
168 | - candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?"))); | |
164 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary("?".equals(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth()))); | |
169 | 165 | } |
170 | 166 | |
171 | 167 | private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { |
172 | 168 | Attribute att = getAttributeByName(attributeName); |
173 | 169 | int index = att.indexOfValue(value); |
174 | 170 | if (index == -1) |
175 | - LOG.warn(value + "not found for attribute " + attributeName); | |
171 | + LOG.warn("{} not found for attribute {}", value, attributeName); | |
176 | 172 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
177 | 173 | } |
178 | 174 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
... | ... | @@ -32,7 +32,7 @@ public class CorpusHelper { |
32 | 32 | } |
33 | 33 | |
34 | 34 | public static List<Summary> getAbstractSummaries(Text text) { |
35 | - return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
35 | + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
36 | 36 | } |
37 | 37 | |
38 | 38 | public static Set<String> loadTrainTextIds() throws IOException { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
... | ... | @@ -67,15 +67,19 @@ public class PathConstants { |
67 | 67 | } |
68 | 68 | } |
69 | 69 | |
70 | - public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException { | |
70 | + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException { | |
71 | 71 | downloadFile(url, targetZipFile); |
72 | 72 | extractZipFile(targetZipFile, targetDir); |
73 | 73 | } |
74 | 74 | |
75 | - private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { | |
75 | + private static void extractZipFile(File targetZipFile, File targetDir) throws IOException { | |
76 | 76 | createFolder(targetDir); |
77 | - ZipFile zipFile = new ZipFile(targetZipFile); | |
78 | - zipFile.extractAll(targetDir.getPath()); | |
77 | + try { | |
78 | + ZipFile zipFile = new ZipFile(targetZipFile); | |
79 | + zipFile.extractAll(targetDir.getPath()); | |
80 | + } catch (ZipException e) { | |
81 | + throw new IOException(e); | |
82 | + } | |
79 | 83 | LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); |
80 | 84 | } |
81 | 85 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
... | ... | @@ -106,6 +106,10 @@ class Crossvalidate { |
106 | 106 | return Pair.of(acc, name); |
107 | 107 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
108 | 108 | |
109 | + printBestResult(watch, max); | |
110 | + } | |
111 | + | |
112 | + private static void printBestResult(StopWatch watch, Optional<Pair<Double, String>> max) { | |
109 | 113 | LOG.info("#########"); |
110 | 114 | if (max.isPresent()) { |
111 | 115 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
... | ... | @@ -142,13 +146,6 @@ class Crossvalidate { |
142 | 146 | return Pair.of(acc, name); |
143 | 147 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
144 | 148 | |
145 | - LOG.info("#########"); | |
146 | - if (max.isPresent()) { | |
147 | - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | |
148 | - } else { | |
149 | - LOG.info("Empty algorithms list"); | |
150 | - } | |
151 | - watch.stop(); | |
152 | - LOG.info("Elapsed time: {}", watch); | |
149 | + printBestResult(watch, max); | |
153 | 150 | } |
154 | 151 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
... | ... | @@ -22,33 +22,37 @@ public class MentionScorer { |
22 | 22 | Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); |
23 | 23 | |
24 | 24 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
25 | - Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); | |
25 | + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStoptags(sentences); | |
26 | 26 | |
27 | 27 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
28 | 28 | } |
29 | 29 | |
30 | - private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sentences) { | |
30 | + private Map<TMention, String> loadMention2OrthExcludingStoptags(List<TSentence> sentences) { | |
31 | 31 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
32 | 32 | for (TSentence sentence : sentences) { |
33 | 33 | Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); |
34 | 34 | |
35 | 35 | for (TMention mention : sentence.getMentions()) { |
36 | - StringBuilder mentionOrth = new StringBuilder(); | |
37 | - for (String tokId : mention.getChildIds()) { | |
38 | - TToken token = tokId2tok.get(tokId); | |
39 | - if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) | |
40 | - continue; | |
41 | - | |
42 | - if (!token.isNoPrecedingSpace()) | |
43 | - mentionOrth.append(" "); | |
44 | - mentionOrth.append(token.getOrth()); | |
45 | - } | |
46 | - mention2orth.put(mention, mentionOrth.toString().trim()); | |
36 | + mention2orth.put(mention, getMentionOrth(tokId2tok, mention)); | |
47 | 37 | } |
48 | 38 | } |
49 | 39 | return mention2orth; |
50 | 40 | } |
51 | 41 | |
42 | + private static String getMentionOrth(Map<String, TToken> tokId2tok, TMention mention) { | |
43 | + StringBuilder mentionOrth = new StringBuilder(); | |
44 | + for (String tokId : mention.getChildIds()) { | |
45 | + TToken token = tokId2tok.get(tokId); | |
46 | + if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag())) | |
47 | + continue; | |
48 | + | |
49 | + if (!token.isNoPrecedingSpace()) | |
50 | + mentionOrth.append(" "); | |
51 | + mentionOrth.append(token.getOrth()); | |
52 | + } | |
53 | + return mentionOrth.toString().trim(); | |
54 | + } | |
55 | + | |
52 | 56 | private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { |
53 | 57 | Map<TMention, Double> mention2score = Maps.newHashMap(); |
54 | 58 | for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
... | ... | @@ -17,7 +17,6 @@ import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
17 | 17 | |
18 | 18 | public class ExtractGoldSummaries { |
19 | 19 | |
20 | - | |
21 | 20 | private ExtractGoldSummaries() { |
22 | 21 | } |
23 | 22 | |
... | ... | @@ -28,25 +27,30 @@ public class ExtractGoldSummaries { |
28 | 27 | File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles(); |
29 | 28 | if (files != null) { |
30 | 29 | for (File file : files) { |
31 | - Text text = PSC_IO.readText(file); | |
30 | + extractGoldSummariesFromFile(file); | |
31 | + } | |
32 | + } | |
33 | + } | |
32 | 34 | |
33 | - List<Summary> goldSummaries; | |
35 | + private static void extractGoldSummariesFromFile(File file) throws IOException, JAXBException { | |
36 | + Text text = PSC_IO.readText(file); | |
34 | 37 | |
35 | - boolean isTest = CorpusHelper.isTest(text); | |
36 | - if (isTest) { | |
37 | - goldSummaries = CorpusHelper.getAbstractSummaries(text); | |
38 | - } else { | |
39 | - goldSummaries = CorpusHelper.getExtractSummaries(text); | |
40 | - } | |
38 | + List<Summary> goldSummaries; | |
39 | + File targetDir; | |
41 | 40 | |
42 | - for (Summary summary : goldSummaries) { | |
43 | - File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; | |
44 | - File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | |
41 | + boolean isTest = CorpusHelper.isTest(text); | |
42 | + if (isTest) { | |
43 | + goldSummaries = CorpusHelper.getAbstractSummaries(text); | |
44 | + targetDir = GOLD_TEST_SUMMARIES_DIR; | |
45 | + } else { | |
46 | + goldSummaries = CorpusHelper.getExtractSummaries(text); | |
47 | + targetDir = GOLD_TRAIN_SUMMARIES_DIR; | |
48 | + } | |
45 | 49 | |
46 | - try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | |
47 | - writer.append(summary.getBody()); | |
48 | - } | |
49 | - } | |
50 | + for (Summary summary : goldSummaries) { | |
51 | + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | |
52 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | |
53 | + writer.append(summary.getBody()); | |
50 | 54 | } |
51 | 55 | } |
52 | 56 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java
... | ... | @@ -9,7 +9,6 @@ import pl.waw.ipipan.zil.summ.nicolas.PathConstants; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
11 | 11 | |
12 | -import javax.xml.bind.JAXBException; | |
13 | 12 | import java.io.BufferedWriter; |
14 | 13 | import java.io.FileWriter; |
15 | 14 | import java.io.IOException; |
... | ... | @@ -26,7 +25,7 @@ public class ExtractMostFrequentMentions { |
26 | 25 | private ExtractMostFrequentMentions() { |
27 | 26 | } |
28 | 27 | |
29 | - public static void main(String[] args) throws IOException, JAXBException { | |
28 | + public static void main(String[] args) throws IOException { | |
30 | 29 | List<String> mostFrequentMentionBases = getMostFrequentMentionBases(); |
31 | 30 | try (BufferedWriter bw = new BufferedWriter(new FileWriter(PathConstants.TARGET_MODEL_DIR + Constants.FREQUENT_BASES_RESOURCE_PATH))) { |
32 | 31 | for (String base : mostFrequentMentionBases) { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... | ... | @@ -89,7 +89,7 @@ public class PrepareTrainingData { |
89 | 89 | } |
90 | 90 | } |
91 | 91 | |
92 | - private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { | |
92 | + private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { | |
93 | 93 | |
94 | 94 | SentenceScorer sentenceScorer = new SentenceScorer(); |
95 | 95 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
... | ... | @@ -97,10 +97,12 @@ public class PrepareTrainingData { |
97 | 97 | Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
98 | 98 | |
99 | 99 | int i = 1; |
100 | - for (String textId : id2preprocessedText.keySet()) { | |
100 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
101 | 101 | logProgress(id2preprocessedText, i++); |
102 | 102 | |
103 | - TText preprocessedText = id2preprocessedText.get(textId); | |
103 | + String textId = entry.getKey(); | |
104 | + TText preprocessedText = entry.getValue(); | |
105 | + | |
104 | 106 | String optimalSummary = id2optimalSummary.get(textId); |
105 | 107 | if (optimalSummary == null) |
106 | 108 | continue; |
... | ... | @@ -110,9 +112,9 @@ public class PrepareTrainingData { |
110 | 112 | = loadGoldGoodMentions(textId, preprocessedText, id2optimalSummary); |
111 | 113 | |
112 | 114 | Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); |
113 | - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
114 | - TSentence sentence = entry.getKey(); | |
115 | - Instance instance = entry.getValue(); | |
115 | + for (Map.Entry<TSentence, Instance> sentenceInstance : sentence2instance.entrySet()) { | |
116 | + TSentence sentence = sentenceInstance.getKey(); | |
117 | + Instance instance = sentenceInstance.getValue(); | |
116 | 118 | instance.setDataset(instances); |
117 | 119 | instance.setClassValue(sentence2score.get(sentence)); |
118 | 120 | instances.add(instance); |
... | ... | @@ -121,7 +123,7 @@ public class PrepareTrainingData { |
121 | 123 | saveInstancesToFile(instances, SENTENCE_ARFF); |
122 | 124 | } |
123 | 125 | |
124 | - private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) throws IOException { | |
126 | + private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) { | |
125 | 127 | String optimalSummary = id2optimalSummary.get(id); |
126 | 128 | |
127 | 129 | MentionScorer scorer = new MentionScorer(); |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
... | ... | @@ -7,11 +7,10 @@ import pl.waw.ipipan.zil.summ.nicolas.Constants; |
7 | 7 | import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; |
8 | 8 | import weka.classifiers.Classifier; |
9 | 9 | import weka.core.Instances; |
10 | +import weka.core.SerializationHelper; | |
10 | 11 | import weka.core.converters.ArffLoader; |
11 | 12 | |
12 | 13 | import java.io.File; |
13 | -import java.io.FileOutputStream; | |
14 | -import java.io.ObjectOutputStream; | |
15 | 14 | import java.util.logging.LogManager; |
16 | 15 | |
17 | 16 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
... | ... | @@ -48,10 +47,7 @@ public class TrainAllModels { |
48 | 47 | |
49 | 48 | String target = TARGET_MODEL_DIR + targetPath; |
50 | 49 | LOG.info("Saving classifier at: {}", target); |
51 | - try (ObjectOutputStream oos = new ObjectOutputStream( | |
52 | - new FileOutputStream(target))) { | |
53 | - oos.writeObject(classifier); | |
54 | - } | |
50 | + SerializationHelper.write(target, classifier); | |
55 | 51 | |
56 | 52 | watch.stop(); |
57 | 53 | LOG.info("Elapsed time: {}", watch); |
... | ... |