Commit c9b1021d986bf2636bfd978885170dedd7d45546

Authored by Mateusz Kopeć
1 parent d3b1a80b

fix sonar issues

nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java
... ... @@ -92,7 +92,10 @@ class Cli {
92 92 public void validate(String name, String value) {
93 93 File file = new File(value);
94 94 try {
95   - file.createNewFile();
  95 + boolean newFile = file.createNewFile();
  96 + if (!newFile) {
  97 + LOG.warn("Output file exists and will be overwritten.");
  98 + }
96 99 } catch (IOException ex) {
97 100 throw new ParameterException("Parameter " + name
98 101 + " should be a valid file path (found " + value + ")", ex);
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
... ... @@ -67,11 +67,14 @@ public class FeatureExtractor {
67 67 }
68 68 }
69 69 for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) {
70   - for (Attribute attribute : attribute2max.keySet()) {
  70 + for (Map.Entry<Attribute, Double> entry : attribute2max.entrySet()) {
  71 + Attribute attribute = entry.getKey();
  72 + Double max = entry.getValue();
  73 +
71 74 Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
72 75 entityAttributes.put(normalizedAttribute,
73 76 (entityAttributes.get(attribute) - attribute2min.get(attribute))
74   - / (attribute2max.get(attribute) - attribute2min.get(attribute)));
  77 + / (max - attribute2min.get(attribute)));
75 78 }
76 79 }
77 80 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... ... @@ -54,49 +54,56 @@ public class FeatureHelper {
54 54 coref2mentions.put(coref, ments);
55 55 }
56 56  
57   - int parIdx = 0;
58   - int sentIdx = 0;
59   - int mentionIdx = 0;
  57 + Counters counters = new Counters();
60 58 for (TParagraph par : preprocessedText.getParagraphs()) {
61   - Map<TMention, String> m2o = loadMention2Orth(par.getSentences());
62   - mention2Orth.putAll(m2o);
63   - Map<TMention, String> m2b = loadMention2Base(par.getSentences());
64   - mention2Base.putAll(m2b);
65   -
66   - int sentIdxInPar = 0;
67   - int mentionIdxInPar = 0;
68   - for (TSentence sent : par.getSentences()) {
69   -
70   - Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity()));
71   -
72   - Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap();
73   - for (TNamedEntity namedEntity : sent.getNames()) {
74   - for (String childId : namedEntity.getChildIds()) {
75   - tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet());
76   - tokenId2namedEntities.get(childId).add(namedEntity);
77   - }
78   - }
  59 + processParagraph(counters, par);
  60 + }
  61 + }
79 62  
80   - int mentionIdxInSent = 0;
81   - for (TMention mention : sent.getMentions()) {
82   - mention2sent.put(mention, sent);
83   - mention2par.put(mention, par);
84   - mention2index.put(mention, mentionIdx++);
85   - mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next())));
86   - mention2indexInSent.put(mention, mentionIdxInSent++);
87   - mention2indexInPar.put(mention, mentionIdxInPar++);
88   -
89   - String firstHeadTokenId = mention.getHeadIds().iterator().next();
90   - mention2head.put(mention, tokenId2token.get(firstHeadTokenId));
91   - if (tokenId2namedEntities.containsKey(firstHeadTokenId))
92   - mentionsInNamedEntities.add(mention);
93   - }
94   - sent2Index.put(sent, sentIdx++);
95   - sent2IndexInPar.put(sent, sentIdxInPar++);
  63 + private void processParagraph(Counters counters, TParagraph par) {
  64 + Map<TMention, String> m2o = loadMention2Orth(par.getSentences());
  65 + mention2Orth.putAll(m2o);
  66 + Map<TMention, String> m2b = loadMention2Base(par.getSentences());
  67 + mention2Base.putAll(m2b);
  68 +
  69 + int sentIdxInPar = 0;
  70 + int mentionIdxInPar = 0;
  71 + for (TSentence sent : par.getSentences()) {
  72 +
  73 + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity()));
  74 +
  75 + Map<String, Set<TNamedEntity>> tokenId2namedEntities = getTokenId2NamedEntities(sent);
  76 +
  77 + int mentionIdxInSent = 0;
  78 + for (TMention mention : sent.getMentions()) {
  79 + mention2sent.put(mention, sent);
  80 + mention2par.put(mention, par);
  81 + mention2index.put(mention, counters.mentionIdx++);
  82 + mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next())));
  83 + mention2indexInSent.put(mention, mentionIdxInSent++);
  84 + mention2indexInPar.put(mention, mentionIdxInPar++);
  85 +
  86 + String firstHeadTokenId = mention.getHeadIds().iterator().next();
  87 + mention2head.put(mention, tokenId2token.get(firstHeadTokenId));
  88 + if (tokenId2namedEntities.containsKey(firstHeadTokenId))
  89 + mentionsInNamedEntities.add(mention);
96 90 }
  91 + sent2Index.put(sent, counters.sentIdx++);
  92 + sent2IndexInPar.put(sent, sentIdxInPar++);
  93 + }
97 94  
98   - par2Index.put(par, parIdx++);
  95 + par2Index.put(par, counters.parIdx++);
  96 + }
  97 +
  98 + private Map<String, Set<TNamedEntity>> getTokenId2NamedEntities(TSentence sent) {
  99 + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap();
  100 + for (TNamedEntity namedEntity : sent.getNames()) {
  101 + for (String childId : namedEntity.getChildIds()) {
  102 + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet());
  103 + tokenId2namedEntities.get(childId).add(namedEntity);
  104 + }
99 105 }
  106 + return tokenId2namedEntities;
100 107 }
101 108  
102 109 public List<TMention> getMentions() {
... ... @@ -220,31 +227,35 @@ public class FeatureHelper {
220 227 return mention2sent.get(mention).getTokens().get(idx - 1);
221 228 }
222 229  
223   - private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
  230 + private static Map<TMention, String> loadMention2Orth(List<TSentence> sentences) {
224 231 Map<TMention, String> mention2orth = Maps.newHashMap();
225   - for (TSentence s : sents) {
226   - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  232 + for (TSentence sentence : sentences) {
  233 + Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
227 234  
228   - for (TMention m : s.getMentions()) {
229   - StringBuilder mentionOrth = new StringBuilder();
230   - for (String tokId : m.getChildIds()) {
231   - TToken token = tokId2tok.get(tokId);
232   - if (!token.isNoPrecedingSpace())
233   - mentionOrth.append(" ");
234   - mentionOrth.append(token.getOrth());
235   - }
236   - mention2orth.put(m, mentionOrth.toString().trim());
  235 + for (TMention mention : sentence.getMentions()) {
  236 + mention2orth.put(mention, getMentionOrth(tokId2tok, mention));
237 237 }
238 238 }
239 239 return mention2orth;
240 240 }
241 241  
242   - private static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
  242 + private static String getMentionOrth(Map<String, TToken> tokId2tok, TMention m) {
  243 + StringBuilder mentionOrth = new StringBuilder();
  244 + for (String tokId : m.getChildIds()) {
  245 + TToken token = tokId2tok.get(tokId);
  246 + if (!token.isNoPrecedingSpace())
  247 + mentionOrth.append(" ");
  248 + mentionOrth.append(token.getOrth());
  249 + }
  250 + return mentionOrth.toString().trim();
  251 + }
  252 +
  253 + private static Map<TMention, String> loadMention2Base(List<TSentence> sentences) {
243 254 Map<TMention, String> mention2base = Maps.newHashMap();
244   - for (TSentence s : sents) {
245   - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
  255 + for (TSentence sentence : sentences) {
  256 + Map<String, String> tokId2base = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
246 257  
247   - for (TMention m : s.getMentions()) {
  258 + for (TMention m : sentence.getMentions()) {
248 259 StringBuilder mentionBase = new StringBuilder();
249 260 for (String tokId : m.getChildIds()) {
250 261 mentionBase.append(" ");
... ... @@ -255,4 +266,10 @@ public class FeatureHelper {
255 266 }
256 267 return mention2base;
257 268 }
  269 +
  270 + private static class Counters {
  271 + int parIdx = 0;
  272 + int sentIdx = 0;
  273 + int mentionIdx = 0;
  274 + }
258 275 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... ... @@ -18,6 +18,10 @@ import java.util.stream.Collectors;
18 18  
19 19 public class MentionFeatureExtractor extends FeatureExtractor {
20 20  
  21 + private static final String SCORE_ATTRIBUTE_NAME = "score";
  22 + private static final String OTHER_VALUE = "other";
  23 + private static final String NULL_VALUE = "null";
  24 +
21 25 private final List<String> frequentBases;
22 26  
23 27 public MentionFeatureExtractor() throws IOException {
... ... @@ -48,10 +52,10 @@ public class MentionFeatureExtractor extends FeatureExtractor {
48 52 addBinaryAttribute(prefix + "_is_named");
49 53 addBinaryAttribute(prefix + "_is_pronoun");
50 54 addNominalAttribute(prefix + "_ctag", Constants.POS_TAGS);
51   - addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter"));
52   - addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
53   - addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl"));
54   - addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n"));
  55 + addNominalAttribute(prefix + "_person", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "pri", "sec", "ter"));
  56 + addNominalAttribute(prefix + "_case", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
  57 + addNominalAttribute(prefix + "_number", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "sg", "pl"));
  58 + addNominalAttribute(prefix + "_gender", Lists.newArrayList(OTHER_VALUE, NULL_VALUE, "f", "m1", "m2", "m3", "n"));
55 59  
56 60 // relation to other
57 61 addBinaryAttribute(prefix + "_is_nested");
... ... @@ -76,8 +80,8 @@ public class MentionFeatureExtractor extends FeatureExtractor {
76 80 }
77 81 }
78 82  
79   - addNominalAttribute("score", Lists.newArrayList("bad", "good"));
80   - fillSortedAttributes("score");
  83 + addNominalAttribute(SCORE_ATTRIBUTE_NAME, Lists.newArrayList("bad", "good"));
  84 + fillSortedAttributes(SCORE_ATTRIBUTE_NAME);
81 85 }
82 86  
83 87 private String encodeBase(String base) {
... ... @@ -143,8 +147,11 @@ public class MentionFeatureExtractor extends FeatureExtractor {
143 147 attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention)));
144 148  
145 149 String orth = helper.getMentionOrth(mention);
146   - attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1))));
147   - attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth)));
  150 + String firstLetter = orth.isEmpty() ? "" : orth.substring(0, 1);
  151 + String firstLetterUpperCased = firstLetter.toUpperCase();
  152 + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && firstLetterUpperCased.equals(firstLetter)));
  153 + String upperCased = orth.toUpperCase();
  154 + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(upperCased.equals(orth)));
148 155 attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length());
149 156  
150 157 // par characteristics
... ... @@ -159,8 +166,8 @@ public class MentionFeatureExtractor extends FeatureExtractor {
159 166 attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size());
160 167 attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence));
161 168 attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence));
162   - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals(".")));
163   - attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?")));
  169 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(mentionSentence))));
  170 + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(mentionSentence))));
164 171  
165 172 // frequent bases
166 173 String mentionBase = helper.getMentionBase(mention);
... ... @@ -174,14 +181,14 @@ public class MentionFeatureExtractor extends FeatureExtractor {
174 181 int index = att.indexOfValue(value);
175 182 if (index == -1)
176 183 LOG.warn("{} not found for attribute {}", value, attributeName);
177   - attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
  184 + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue(OTHER_VALUE) : index));
178 185 }
179 186  
180 187  
181 188 private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) {
182 189 for (TMention m : mentions) {
183 190 Map<Attribute, Double> map = Maps.newHashMap();
184   - map.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  191 + map.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue());
185 192 result.put(m, map);
186 193 }
187 194 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... ... @@ -13,6 +13,8 @@ import java.util.stream.Collectors;
13 13  
14 14 public class SentenceFeatureExtractor extends FeatureExtractor {
15 15  
  16 + private static final String SCORE_ATTRIBUTE_NAME = "score";
  17 +
16 18 public SentenceFeatureExtractor() {
17 19  
18 20 addNumericAttributeNormalized("sent_mention_cluster_count");
... ... @@ -39,8 +41,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor {
39 41 addNumericAttribute("text_mention_count");
40 42 addNumericAttribute("text_cluster_count");
41 43  
42   - addNumericAttribute("score");
43   - fillSortedAttributes("score");
  44 + addNumericAttribute(SCORE_ATTRIBUTE_NAME);
  45 + fillSortedAttributes(SCORE_ATTRIBUTE_NAME);
44 46 }
45 47  
46 48 public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) {
... ... @@ -70,8 +72,8 @@ public class SentenceFeatureExtractor extends FeatureExtractor {
70 72 feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size());
71 73 feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar);
72 74 feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx);
73   - feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals(".")));
74   - feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?")));
  75 + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(".".equals(helper.getSentenceLastTokenOrth(sentence))));
  76 + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary("?".equals(helper.getSentenceLastTokenOrth(sentence))));
75 77  
76 78 feature2value.put(getAttributeByName("par_idx"), (double) parIdx);
77 79 feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum());
... ... @@ -84,7 +86,7 @@ public class SentenceFeatureExtractor extends FeatureExtractor {
84 86 feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
85 87 feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
86 88  
87   - feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  89 + feature2value.put(getAttributeByName(SCORE_ATTRIBUTE_NAME), weka.core.Utils.missingValue());
88 90  
89 91 feature2value.remove(null);
90 92  
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java
... ... @@ -42,12 +42,13 @@ public class ResourceUtils {
42 42 }
43 43  
44 44 private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException {
  45 + Predicate<String> stringIsNonempty = (String s) -> !s.isEmpty();
45 46 try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) {
46 47 return IOUtils.readLines(stream, Constants.ENCODING)
47 48 .stream()
48 49 .map(String::trim)
49 50 .map(String::toLowerCase)
50   - .filter(((Predicate<String>) String::isEmpty).negate())
  51 + .filter(stringIsNonempty)
51 52 .sorted()
52 53 .distinct()
53 54 .collect(Collectors.toList());
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... ... @@ -37,15 +37,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
37 37 private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count";
38 38 private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length";
39 39 private static final String IS_PAN_OR_PANI = "_is_pan_or_pani";
40   -
41   - // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet(
42   -// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ",");
43 40 private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy");
44   -
45 41 private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet();
46   -// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(
47   -// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ",");
48   -
49 42 private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_";
50 43 private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_";
51 44  
... ... @@ -105,7 +98,10 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
105 98 candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue());
106 99  
107 100 TMention mention = candidate.getZeroCandidateMention();
108   - TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get();
  101 + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().orElse(null);
  102 + if (antecedent == null) {
  103 + throw new IllegalArgumentException("Mention pair without first element!");
  104 + }
109 105  
110 106 addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX);
111 107 addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX);
... ... @@ -165,14 +161,14 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
165 161 TSentence mentionSentence = helper.getMentionSentence(mention);
166 162 candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size());
167 163 candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size());
168   - candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?")));
  164 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary("?".equals(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth())));
169 165 }
170 166  
171 167 private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
172 168 Attribute att = getAttributeByName(attributeName);
173 169 int index = att.indexOfValue(value);
174 170 if (index == -1)
175   - LOG.warn(value + "not found for attribute " + attributeName);
  171 + LOG.warn("{} not found for attribute {}", value, attributeName);
176 172 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
177 173 }
178 174 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
... ... @@ -32,7 +32,7 @@ public class CorpusHelper {
32 32 }
33 33  
34 34 public static List<Summary> getAbstractSummaries(Text text) {
35   - return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  35 + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
36 36 }
37 37  
38 38 public static Set<String> loadTrainTextIds() throws IOException {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
... ... @@ -67,15 +67,19 @@ public class PathConstants {
67 67 }
68 68 }
69 69  
70   - public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException {
  70 + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException {
71 71 downloadFile(url, targetZipFile);
72 72 extractZipFile(targetZipFile, targetDir);
73 73 }
74 74  
75   - private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException {
  75 + private static void extractZipFile(File targetZipFile, File targetDir) throws IOException {
76 76 createFolder(targetDir);
77   - ZipFile zipFile = new ZipFile(targetZipFile);
78   - zipFile.extractAll(targetDir.getPath());
  77 + try {
  78 + ZipFile zipFile = new ZipFile(targetZipFile);
  79 + zipFile.extractAll(targetDir.getPath());
  80 + } catch (ZipException e) {
  81 + throw new IOException(e);
  82 + }
79 83 LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);
80 84 }
81 85 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
... ... @@ -106,6 +106,10 @@ class Crossvalidate {
106 106 return Pair.of(acc, name);
107 107 }).max(Comparator.comparingDouble(Pair::getLeft));
108 108  
  109 + printBestResult(watch, max);
  110 + }
  111 +
  112 + private static void printBestResult(StopWatch watch, Optional<Pair<Double, String>> max) {
109 113 LOG.info("#########");
110 114 if (max.isPresent()) {
111 115 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
... ... @@ -142,13 +146,6 @@ class Crossvalidate {
142 146 return Pair.of(acc, name);
143 147 }).max(Comparator.comparingDouble(Pair::getLeft));
144 148  
145   - LOG.info("#########");
146   - if (max.isPresent()) {
147   - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
148   - } else {
149   - LOG.info("Empty algorithms list");
150   - }
151   - watch.stop();
152   - LOG.info("Elapsed time: {}", watch);
  149 + printBestResult(watch, max);
153 150 }
154 151 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
... ... @@ -22,33 +22,37 @@ public class MentionScorer {
22 22 Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase()));
23 23  
24 24 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
25   - Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences);
  25 + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStoptags(sentences);
26 26  
27 27 return booleanTokenIntersection(mention2Orth, tokenCounts);
28 28 }
29 29  
30   - private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sentences) {
  30 + private Map<TMention, String> loadMention2OrthExcludingStoptags(List<TSentence> sentences) {
31 31 Map<TMention, String> mention2orth = Maps.newHashMap();
32 32 for (TSentence sentence : sentences) {
33 33 Map<String, TToken> tokId2tok = sentence.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
34 34  
35 35 for (TMention mention : sentence.getMentions()) {
36   - StringBuilder mentionOrth = new StringBuilder();
37   - for (String tokId : mention.getChildIds()) {
38   - TToken token = tokId2tok.get(tokId);
39   - if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag()))
40   - continue;
41   -
42   - if (!token.isNoPrecedingSpace())
43   - mentionOrth.append(" ");
44   - mentionOrth.append(token.getOrth());
45   - }
46   - mention2orth.put(mention, mentionOrth.toString().trim());
  36 + mention2orth.put(mention, getMentionOrth(tokId2tok, mention));
47 37 }
48 38 }
49 39 return mention2orth;
50 40 }
51 41  
  42 + private String getMentionOrth(Map<String, TToken> tokId2tok, TMention mention) {
  43 + StringBuilder mentionOrth = new StringBuilder();
  44 + for (String tokId : mention.getChildIds()) {
  45 + TToken token = tokId2tok.get(tokId);
  46 + if (STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag()))
  47 + continue;
  48 +
  49 + if (!token.isNoPrecedingSpace())
  50 + mentionOrth.append(" ");
  51 + mentionOrth.append(token.getOrth());
  52 + }
  53 + return mentionOrth.toString().trim();
  54 + }
  55 +
52 56 private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
53 57 Map<TMention, Double> mention2score = Maps.newHashMap();
54 58 for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
... ... @@ -17,7 +17,6 @@ import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
17 17  
18 18 public class ExtractGoldSummaries {
19 19  
20   -
21 20 private ExtractGoldSummaries() {
22 21 }
23 22  
... ... @@ -28,25 +27,30 @@ public class ExtractGoldSummaries {
28 27 File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles();
29 28 if (files != null) {
30 29 for (File file : files) {
31   - Text text = PSC_IO.readText(file);
  30 + extractGoldSummariesFromFile(file);
  31 + }
  32 + }
  33 + }
32 34  
33   - List<Summary> goldSummaries;
  35 + private static void extractGoldSummariesFromFile(File file) throws IOException, JAXBException {
  36 + Text text = PSC_IO.readText(file);
34 37  
35   - boolean isTest = CorpusHelper.isTest(text);
36   - if (isTest) {
37   - goldSummaries = CorpusHelper.getAbstractSummaries(text);
38   - } else {
39   - goldSummaries = CorpusHelper.getExtractSummaries(text);
40   - }
  38 + List<Summary> goldSummaries;
  39 + File targetDir;
41 40  
42   - for (Summary summary : goldSummaries) {
43   - File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR;
44   - File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
  41 + boolean isTest = CorpusHelper.isTest(text);
  42 + if (isTest) {
  43 + goldSummaries = CorpusHelper.getAbstractSummaries(text);
  44 + targetDir = GOLD_TEST_SUMMARIES_DIR;
  45 + } else {
  46 + goldSummaries = CorpusHelper.getExtractSummaries(text);
  47 + targetDir = GOLD_TRAIN_SUMMARIES_DIR;
  48 + }
45 49  
46   - try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) {
47   - writer.append(summary.getBody());
48   - }
49   - }
  50 + for (Summary summary : goldSummaries) {
  51 + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
  52 + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) {
  53 + writer.append(summary.getBody());
50 54 }
51 55 }
52 56 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractMostFrequentMentions.java
... ... @@ -9,7 +9,6 @@ import pl.waw.ipipan.zil.summ.nicolas.PathConstants;
9 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 10 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
11 11  
12   -import javax.xml.bind.JAXBException;
13 12 import java.io.BufferedWriter;
14 13 import java.io.FileWriter;
15 14 import java.io.IOException;
... ... @@ -26,7 +25,7 @@ public class ExtractMostFrequentMentions {
26 25 private ExtractMostFrequentMentions() {
27 26 }
28 27  
29   - public static void main(String[] args) throws IOException, JAXBException {
  28 + public static void main(String[] args) throws IOException {
30 29 List<String> mostFrequentMentionBases = getMostFrequentMentionBases();
31 30 try (BufferedWriter bw = new BufferedWriter(new FileWriter(PathConstants.TARGET_MODEL_DIR + Constants.FREQUENT_BASES_RESOURCE_PATH))) {
32 31 for (String base : mostFrequentMentionBases) {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... ... @@ -89,7 +89,7 @@ public class PrepareTrainingData {
89 89 }
90 90 }
91 91  
92   - private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
  92 + private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException {
93 93  
94 94 SentenceScorer sentenceScorer = new SentenceScorer();
95 95 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
... ... @@ -97,10 +97,12 @@ public class PrepareTrainingData {
97 97 Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
98 98  
99 99 int i = 1;
100   - for (String textId : id2preprocessedText.keySet()) {
  100 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
101 101 logProgress(id2preprocessedText, i++);
102 102  
103   - TText preprocessedText = id2preprocessedText.get(textId);
  103 + String textId = entry.getKey();
  104 + TText preprocessedText = entry.getValue();
  105 +
104 106 String optimalSummary = id2optimalSummary.get(textId);
105 107 if (optimalSummary == null)
106 108 continue;
... ... @@ -110,9 +112,9 @@ public class PrepareTrainingData {
110 112 = loadGoldGoodMentions(textId, preprocessedText, id2optimalSummary);
111 113  
112 114 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
113   - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
114   - TSentence sentence = entry.getKey();
115   - Instance instance = entry.getValue();
  115 + for (Map.Entry<TSentence, Instance> sentenceInstance : sentence2instance.entrySet()) {
  116 + TSentence sentence = sentenceInstance.getKey();
  117 + Instance instance = sentenceInstance.getValue();
116 118 instance.setDataset(instances);
117 119 instance.setClassValue(sentence2score.get(sentence));
118 120 instances.add(instance);
... ... @@ -121,7 +123,7 @@ public class PrepareTrainingData {
121 123 saveInstancesToFile(instances, SENTENCE_ARFF);
122 124 }
123 125  
124   - private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) throws IOException {
  126 + private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) {
125 127 String optimalSummary = id2optimalSummary.get(id);
126 128  
127 129 MentionScorer scorer = new MentionScorer();
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
... ... @@ -7,11 +7,10 @@ import pl.waw.ipipan.zil.summ.nicolas.Constants;
7 7 import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings;
8 8 import weka.classifiers.Classifier;
9 9 import weka.core.Instances;
  10 +import weka.core.SerializationHelper;
10 11 import weka.core.converters.ArffLoader;
11 12  
12 13 import java.io.File;
13   -import java.io.FileOutputStream;
14   -import java.io.ObjectOutputStream;
15 14 import java.util.logging.LogManager;
16 15  
17 16 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
... ... @@ -48,10 +47,7 @@ public class TrainAllModels {
48 47  
49 48 String target = TARGET_MODEL_DIR + targetPath;
50 49 LOG.info("Saving classifier at: {}", target);
51   - try (ObjectOutputStream oos = new ObjectOutputStream(
52   - new FileOutputStream(target))) {
53   - oos.writeObject(classifier);
54   - }
  50 + SerializationHelper.write(target, classifier);
55 51  
56 52 watch.stop();
57 53 LOG.info("Elapsed time: {}", watch);
... ...