Commit e1126cdba70bd5287871ebbe89e9ae6635bb5a01
0 parents
rough draft
Showing 28 changed files with 2105 additions and 0 deletions
.gitignore
0 → 100644
1 | +++ a/.gitignore | |
1 | +# Created by .ignore support plugin (hsz.mobi) | |
2 | +### Java template | |
3 | +*.class | |
4 | +target/ | |
5 | + | |
6 | +# Mobile Tools for Java (J2ME) | |
7 | +.mtj.tmp/ | |
8 | + | |
9 | +# Package Files # | |
10 | +*.jar | |
11 | +*.war | |
12 | +*.ear | |
13 | + | |
14 | +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml | |
15 | +hs_err_pid* | |
16 | + | |
17 | +.idea | |
18 | +*.iml | |
0 | 19 | \ No newline at end of file |
... | ... |
nicolas-cli/pom.xml
0 → 100644
1 | +++ a/nicolas-cli/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-cli</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-core/pom.xml
0 → 100644
1 | +++ a/nicolas-core/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas</artifactId> | |
13 | + | |
14 | + <dependencies> | |
15 | + <dependency> | |
16 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
17 | + <artifactId>nicolas-model</artifactId> | |
18 | + <version>${project.version}</version> | |
19 | + <scope>runtime</scope> | |
20 | + </dependency> | |
21 | + </dependencies> | |
22 | +</project> | |
0 | 23 | \ No newline at end of file |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import weka.classifiers.Classifier; | |
4 | +import weka.classifiers.trees.RandomForest; | |
5 | + | |
6 | + | |
7 | +public class Constants { | |
8 | + | |
9 | + public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; | |
10 | + public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; | |
11 | + public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; | |
12 | + public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; | |
13 | + | |
14 | + private Constants() { | |
15 | + } | |
16 | + | |
17 | + public static Classifier getClassifier() { | |
18 | + RandomForest classifier = new RandomForest(); | |
19 | + classifier.setNumIterations(250); | |
20 | + classifier.setSeed(0); | |
21 | + classifier.setNumExecutionSlots(8); | |
22 | + return classifier; | |
23 | + } | |
24 | + | |
25 | + | |
26 | + public static Classifier getSentencesClassifier() { | |
27 | + RandomForest classifier = new RandomForest(); | |
28 | + classifier.setNumIterations(250); | |
29 | + classifier.setSeed(0); | |
30 | + classifier.setNumExecutionSlots(8); | |
31 | + return classifier; | |
32 | + } | |
33 | +} | |
... | ... |
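Both factory methods above return an identically configured RandomForest (250 trees, fixed seed, 8 execution slots). A minimal sketch of training and querying that classifier on a hand-built two-attribute dataset; it assumes Weka 3.8 on the classpath, and the attribute names here are illustrative, not taken from the commit:

```java
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

import java.util.ArrayList;
import java.util.Arrays;

public class ConstantsDemo {
    public static void main(String[] args) throws Exception {
        // Class attribute first, mirroring Utils.createNewInstances, which sets class index 0.
        ArrayList<Attribute> attrs = new ArrayList<>(Arrays.asList(
                new Attribute("score", Arrays.asList("bad", "good")),
                new Attribute("feature")));
        Instances data = new Instances("Demo", attrs, 0);
        data.setClassIndex(0);
        for (int i = 0; i < 10; i++) {
            DenseInstance inst = new DenseInstance(2);
            inst.setDataset(data);
            inst.setValue(attrs.get(1), i);
            inst.setClassValue(i < 5 ? "bad" : "good");
            data.add(inst);
        }
        Classifier classifier = Constants.getClassifier();
        classifier.buildClassifier(data);
        DenseInstance query = new DenseInstance(2);
        query.setDataset(data);
        query.setValue(attrs.get(1), 7);
        // classifyInstance returns the index of the predicted nominal class value.
        System.out.println(data.classAttribute().value((int) classifier.classifyInstance(query)));
    }
}
```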
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
4 | + | |
5 | +public class Nicolas { | |
6 | + | |
7 | + public String summarizeThrift(TText text, int targetTokenCount) { | |
8 | + return "test nicolas"; | |
9 | + } | |
10 | + | |
11 | +} | |
... | ... |
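Nicolas is the intended public entry point, currently stubbed out. A hedged usage sketch; the TText would in practice come from the multiservice preprocessing pipeline, and the no-arg TText constructor is the standard Thrift-generated one:

```java
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;

public class NicolasDemo {
    public static void main(String[] args) {
        TText preprocessed = new TText(); // in practice: a fully annotated text from the pipeline
        String summary = new Nicolas().summarizeThrift(preprocessed, 200); // target of 200 tokens
        System.out.println(summary); // prints the "test nicolas" stub for now
    }
}
```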
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Lists; | |
5 | +import com.google.common.collect.Maps; | |
6 | +import com.google.common.collect.Sets; | |
7 | +import com.google.common.io.Files; | |
8 | +import org.slf4j.Logger; | |
9 | +import org.slf4j.LoggerFactory; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
11 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
12 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
13 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
15 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | |
16 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
17 | +import weka.classifiers.Classifier; | |
18 | +import weka.core.Attribute; | |
19 | +import weka.core.DenseInstance; | |
20 | +import weka.core.Instance; | |
21 | +import weka.core.Instances; | |
22 | + | |
23 | +import java.io.File; | |
24 | +import java.io.FileInputStream; | |
25 | +import java.io.IOException; | |
26 | +import java.io.ObjectInputStream; | |
27 | +import java.util.*; | |
28 | +import java.util.function.Function; | |
29 | +import java.util.stream.Collectors; | |
30 | + | |
31 | +import static java.util.stream.Collectors.toList; | |
32 | + | |
33 | +public class Utils { | |
34 | + | |
35 | + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); | |
36 | + | |
37 | + private static final String DATASET_NAME = "Dataset"; | |
38 | + | |
39 | + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | |
40 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
41 | + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | |
42 | + | |
43 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | |
44 | + Map<TMention, Instance> mention2instance = Maps.newHashMap(); | |
45 | + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | |
46 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
47 | + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | |
48 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
49 | + instance.setValue(attribute, mentionFeatures.get(attribute)); | |
50 | + } | |
51 | + mention2instance.put(tMention, instance); | |
52 | + } | |
53 | + return mention2instance; | |
54 | + } | |
55 | + | |
56 | + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | |
57 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
58 | + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | |
59 | + | |
60 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | |
61 | + Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | |
62 | + for (TSentence sentence : sentences) { | |
63 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
64 | + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | |
65 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
66 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
67 | + } | |
68 | + sentence2instance.put(sentence, instance); | |
69 | + } | |
70 | + return sentence2instance; | |
71 | + } | |
72 | + | |
73 | + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | |
74 | + Instances instances = new Instances(DATASET_NAME, attributesList, 0); | |
75 | + instances.setClassIndex(0); | |
76 | + return instances; | |
77 | + } | |
78 | + | |
79 | + public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException { | |
80 | + LOG.info("Loading classifier..."); | |
81 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) { | |
82 | + Classifier classifier = (Classifier) ois.readObject(); | |
83 | + LOG.info("Done. " + classifier.toString()); | |
84 | + return classifier; | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + public static Map<String, TText> loadPreprocessedTexts(String path) { | |
89 | + Map<String, TText> id2text = Maps.newHashMap(); | |
90 | + for (File processedFullTextFile : new File(path).listFiles()) { | |
91 | + TText processedFullText = loadThrifted(processedFullTextFile); | |
92 | + id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText); | |
93 | + } | |
94 | + LOG.info(id2text.size() + " preprocessed texts found."); | |
95 | + return id2text; | |
96 | + } | |
97 | + | |
98 | + | |
99 | + public static TText loadThrifted(File originalFile) { | |
100 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) { | |
101 | + return (TText) ois.readObject(); | |
102 | + } catch (ClassNotFoundException | IOException e) { | |
103 | + LOG.error("Error reading serialized file: " + e); | |
104 | + return null; | |
105 | + } | |
106 | + } | |
107 | + | |
108 | + public static List<String> tokenize(String text) { | |
109 | + return Arrays.asList(text.split("[^\\p{L}0-9]+")); | |
110 | + } | |
111 | + | |
112 | + public static List<String> tokenizeOnWhitespace(String text) { | |
113 | + return Arrays.asList(text.split(" +")); | |
114 | + } | |
115 | + | |
116 | + public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { | |
117 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
118 | + for (TSentence s : sents) { | |
119 | + Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); | |
120 | + Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); | |
121 | + | |
122 | + for (TMention m : s.getMentions()) { | |
123 | + StringBuilder mentionOrth = new StringBuilder(); | |
124 | + for (String tokId : m.getHeadIds()) { | |
125 | + if (!tokId2nps.get(tokId)) | |
126 | + mentionOrth.append(" "); | |
127 | + mentionOrth.append(tokId2orth.get(tokId)); | |
128 | + } | |
129 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
130 | + } | |
131 | + } | |
132 | + return mention2orth; | |
133 | + } | |
134 | + | |
135 | + private static final Collection<String> STOPWORDS = Sets.newHashSet(); | |
136 | + | |
137 | + static { | |
138 | + STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); | |
139 | + } | |
140 | + | |
141 | + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
142 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
143 | + for (TSentence s : sents) { | |
144 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
145 | + | |
146 | + for (TMention m : s.getMentions()) { | |
147 | + StringBuilder mentionOrth = new StringBuilder(); | |
148 | + for (String tokId : m.getChildIds()) { | |
149 | + TToken token = tokId2tok.get(tokId); | |
150 | + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
151 | + continue; | |
152 | + } | |
153 | + | |
154 | + if (!token.isNoPrecedingSpace()) | |
155 | + mentionOrth.append(" "); | |
156 | + mentionOrth.append(token.getOrth()); | |
157 | + } | |
158 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
159 | + } | |
160 | + } | |
161 | + return mention2orth; | |
162 | + } | |
163 | + | |
164 | + public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
165 | + Map<TMention, String> mention2base = Maps.newHashMap(); | |
166 | + for (TSentence s : sents) { | |
167 | + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); | |
168 | + | |
169 | + for (TMention m : s.getMentions()) { | |
170 | + StringBuilder mentionBase = new StringBuilder(); | |
171 | + for (String tokId : m.getChildIds()) { | |
172 | + mentionBase.append(" "); | |
173 | + mentionBase.append(tokId2base.get(tokId)); | |
174 | + } | |
175 | + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | |
176 | + } | |
177 | + } | |
178 | + return mention2base; | |
179 | + } | |
180 | + | |
181 | + public static String loadSentence2Orth(TSentence sentence) { | |
182 | + StringBuilder sb = new StringBuilder(); | |
183 | + for (TToken token : sentence.getTokens()) { | |
184 | + if (!token.isNoPrecedingSpace()) | |
185 | + sb.append(" "); | |
186 | + sb.append(token.getOrth()); | |
187 | + } | |
188 | + return sb.toString().trim(); | |
189 | + } | |
190 | + | |
191 | + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | |
192 | + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | |
193 | + | |
194 | + MentionScorer scorer = new MentionScorer(); | |
195 | + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | |
196 | + | |
197 | + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | |
198 | + return mention2score.keySet(); | |
199 | + } | |
200 | +} | |
0 | 201 | \ No newline at end of file |
... | ... |
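The two tokenizers above differ in how they treat punctuation: tokenize splits on any run of non-letter, non-digit characters, while tokenizeOnWhitespace only splits on spaces. A self-contained sketch using the same regexes, with the expected output in comments:

```java
import java.util.Arrays;

public class TokenizeDemo {
    public static void main(String[] args) {
        String text = "Ala ma kota, psa i 2 rybki.";
        // Utils.tokenize: split on any run of characters that is neither a letter nor a digit.
        System.out.println(Arrays.asList(text.split("[^\\p{L}0-9]+")));
        // -> [Ala, ma, kota, psa, i, 2, rybki]
        // Utils.tokenizeOnWhitespace: split on runs of spaces only; punctuation stays attached.
        System.out.println(Arrays.asList(text.split(" +")));
        // -> [Ala, ma, kota,, psa, i, 2, rybki.]
    }
}
```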
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.apply; | |
2 | + | |
3 | +import com.google.common.collect.Lists; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Sets; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | |
15 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
16 | +import weka.classifiers.Classifier; | |
17 | +import weka.core.Instance; | |
18 | +import weka.core.Instances; | |
19 | + | |
20 | +import java.io.BufferedWriter; | |
21 | +import java.io.File; | |
22 | +import java.io.FileWriter; | |
23 | +import java.util.*; | |
24 | + | |
25 | +import static java.util.stream.Collectors.toList; | |
26 | + | |
27 | +public class ApplyModel2 { | |
28 | + | |
29 | + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); | |
30 | + | |
31 | + private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test"; | |
32 | + private static final String TARGET_DIR = "summaries"; | |
33 | + | |
34 | + public static void main(String[] args) throws Exception { | |
35 | + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
36 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
37 | + | |
38 | + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | |
39 | + SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | |
40 | + | |
41 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); | |
42 | + int i = 1; | |
43 | + double avgSize = 0; | |
44 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
45 | + TText text = entry.getValue(); | |
46 | + | |
47 | + Set<TMention> goodMentions | |
48 | + = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | |
49 | + | |
50 | + int targetSize = calculateTargetSize(text); | |
51 | + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
52 | + int size = Utils.tokenize(summary).size(); | |
53 | + avgSize += size; | |
54 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) { | |
55 | + bw.append(summary); | |
56 | + } | |
57 | + | |
58 | + LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); | |
59 | + } | |
60 | + | |
61 | + LOG.info("Avg size: " + avgSize / id2preprocessedText.size()); | |
62 | + } | |
63 | + | |
64 | + private static int calculateTargetSize(TText text) { | |
65 | + List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
66 | + StringBuilder body = new StringBuilder(); | |
67 | + for (TSentence sent : sents) | |
68 | + body.append(Utils.loadSentence2Orth(sent)).append(' '); | |
69 | + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
70 | + return (int) (0.2 * tokenCount); | |
71 | + } | |
72 | + | |
73 | + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
74 | + List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
75 | + | |
76 | + StringBuilder sb = new StringBuilder(); | |
77 | + for (TSentence sent : selectedSentences) { | |
78 | + sb.append(' ').append(Utils.loadSentence2Orth(sent)); | |
79 | + } | |
80 | + return sb.toString().trim(); | |
81 | + } | |
82 | + | |
83 | + private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
84 | + | |
85 | + List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
86 | + | |
87 | + Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | |
88 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
89 | + | |
90 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | |
91 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
92 | + Instance instance = entry.getValue(); | |
93 | + instance.setDataset(instances); | |
94 | + double score = sentenceClassifier.classifyInstance(instance); | |
95 | + sentence2score.put(entry.getKey(), score); | |
96 | + } | |
97 | + | |
98 | + List<TSentence> sortedSents = Lists.newArrayList(sents); | |
99 | + Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | |
100 | + | |
101 | + int size = 0; | |
102 | + Random r = new Random(1); | |
103 | + Set<TSentence> summary = Sets.newHashSet(); | |
104 | + for (TSentence sent : sortedSents) { | |
105 | + size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | |
106 | + if (r.nextDouble() > 0.4 && size > targetSize) | |
107 | + break; | |
108 | + summary.add(sent); | |
109 | + if (size > targetSize) | |
110 | + break; | |
111 | + } | |
112 | + List<TSentence> selectedSentences = Lists.newArrayList(); | |
113 | + for (TSentence sent : sents) { | |
114 | + if (summary.contains(sent)) | |
115 | + selectedSentences.add(sent); | |
116 | + } | |
117 | + return selectedSentences; | |
118 | + } | |
119 | + | |
120 | +} | |
... | ... |
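selectSummarySentences ranks sentences by the classifier's score, greedily takes the best ones (with a small random chance of stopping just before the budget is exceeded), and then re-emits the chosen sentences in document order. A simplified, self-contained sketch of the same greedy loop, without the randomness; the sentences and scores are illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GreedySelectionDemo {
    public static void main(String[] args) {
        // Sentences in document order, with illustrative scores.
        Map<String, Double> sentence2score = new LinkedHashMap<>();
        sentence2score.put("First sentence of the text.", 0.2);
        sentence2score.put("The key finding is reported here.", 0.9);
        sentence2score.put("Some filler follows.", 0.1);
        sentence2score.put("A second important point.", 0.7);

        int targetTokens = 10;
        List<String> byScore = new ArrayList<>(sentence2score.keySet());
        byScore.sort(Comparator.comparing(sentence2score::get).reversed());

        List<String> selected = new ArrayList<>();
        int size = 0;
        for (String sent : byScore) {
            selected.add(sent);
            size += sent.split(" +").length;
            if (size > targetTokens) break; // token budget reached
        }
        // Restore document order, as selectSummarySentences does.
        List<String> summary = new ArrayList<>();
        for (String sent : sentence2score.keySet())
            if (selected.contains(sent)) summary.add(sent);
        System.out.println(String.join(" ", summary));
    }
}
```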
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import com.google.common.collect.*; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import weka.core.Attribute; | |
7 | + | |
8 | +import java.util.*; | |
9 | + | |
10 | +public class FeatureExtractor { | |
11 | + | |
12 | + protected static final Logger LOG = LoggerFactory.getLogger(FeatureExtractor.class); | |
13 | + | |
14 | + private final List<Attribute> sortedAttributes = Lists.newArrayList(); | |
15 | + | |
16 | + private final BiMap<String, Attribute> name2attribute = HashBiMap.create(); | |
17 | + | |
18 | + private final Set<String> normalizedAttributes = Sets.newHashSet(); | |
19 | + | |
20 | + public ArrayList<Attribute> getAttributesList() { | |
21 | + return Lists.newArrayList(sortedAttributes); | |
22 | + } | |
23 | + | |
24 | + protected Attribute getAttributeByName(String name) { | |
25 | + return name2attribute.get(name); | |
26 | + } | |
27 | + | |
28 | + protected void addNumericAttribute(String attributeName) { | |
29 | + name2attribute.put(attributeName, new Attribute(attributeName)); | |
30 | + } | |
31 | + | |
32 | + protected void addBinaryAttribute(String attributeName) { | |
33 | + name2attribute.put(attributeName, new Attribute(attributeName, Lists.newArrayList("f", "t"))); | |
34 | + } | |
35 | + | |
36 | + protected void addNominalAttribute(String attributeName, List<String> values) { | |
37 | + name2attribute.put(attributeName, new Attribute(attributeName, values)); | |
38 | + } | |
39 | + | |
40 | + protected void addNumericAttributeNormalized(String attributeName) { | |
41 | + addNumericAttribute(attributeName); | |
42 | + addNumericAttribute(attributeName + "_normalized"); | |
43 | + normalizedAttributes.add(attributeName); | |
44 | + } | |
45 | + | |
46 | + protected void fillSortedAttributes(String scoreAttName) { | |
47 | + sortedAttributes.addAll(name2attribute.values()); | |
48 | + sortedAttributes.remove(getAttributeByName(scoreAttName)); | |
49 | + Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2))); | |
50 | + sortedAttributes.add(0, getAttributeByName(scoreAttName)); | |
51 | + } | |
52 | + | |
53 | + protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { | |
54 | + Map<Attribute, Double> attribute2max = Maps.newHashMap(); | |
55 | + Map<Attribute, Double> attribute2min = Maps.newHashMap(); | |
56 | + for (T entity : entity2attributes.keySet()) { | |
57 | + Map<Attribute, Double> entityAttributes = entity2attributes.get(entity); | |
58 | + for (String attributeName : normalizedAttributes) { | |
59 | + Attribute attribute = getAttributeByName(attributeName); | |
60 | + Double value = entityAttributes.get(attribute); | |
61 | + | |
62 | + attribute2max.putIfAbsent(attribute, Double.NEGATIVE_INFINITY); | |
63 | + attribute2max.compute(attribute, (k, v) -> Math.max(v, value)); | |
64 | + | |
65 | + attribute2min.putIfAbsent(attribute, Double.MAX_VALUE); | |
66 | + attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); | |
67 | + } | |
68 | + } | |
69 | + for (T mention : entity2attributes.keySet()) { | |
70 | + Map<Attribute, Double> entityAttributes = entity2attributes.get(mention); | |
71 | + for (Attribute attribute : attribute2max.keySet()) { | |
72 | + Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); | |
73 | + entityAttributes.put(normalizedAttribute, | |
74 | + (entityAttributes.get(attribute) - attribute2min.get(attribute)) | |
75 | + / (attribute2max.get(attribute) - attribute2min.get(attribute))); | |
76 | + } | |
77 | + } | |
78 | + } | |
79 | + | |
80 | + protected double toBinary(boolean bool) { | |
81 | + return bool ? 1.0 : 0.0; | |
82 | + } | |
83 | +} | |
... | ... |
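addNormalizedAttributeValues is a per-text min-max rescaling: each normalized attribute value becomes (x - min) / (max - min) over all entities in the text. A tiny sketch of the arithmetic; note that, like the method above, it yields NaN when all values are equal (max == min):

```java
import java.util.Arrays;

public class MinMaxDemo {
    public static void main(String[] args) {
        double[] values = {3.0, 7.0, 5.0};
        double min = Arrays.stream(values).min().getAsDouble();
        double max = Arrays.stream(values).max().getAsDouble();
        for (double v : values) {
            // Rescale each value into [0, 1] relative to the observed range.
            System.out.printf("%.1f -> %.2f%n", v, (v - min) / (max - min));
        }
        // 3.0 -> 0.00, 7.0 -> 1.00, 5.0 -> 0.50
    }
}
```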
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import com.google.common.collect.Sets; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
7 | + | |
8 | +import java.util.List; | |
9 | +import java.util.Map; | |
10 | +import java.util.Set; | |
11 | +import java.util.function.Function; | |
12 | +import java.util.stream.Collectors; | |
13 | + | |
14 | +import static java.util.stream.Collectors.toList; | |
15 | +import static java.util.stream.Collectors.toMap; | |
16 | + | |
17 | +/** | |
18 | + * Helper exposing mention, sentence and paragraph indices and lookups used during feature extraction. | |
19 | + */ | |
20 | +public class FeatureHelper { | |
21 | + | |
22 | + private final List<TMention> mentions; | |
23 | + private final Map<String, TMention> mentionId2mention; | |
24 | + private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); | |
25 | + private final Map<TMention, TCoreference> mention2coref = Maps.newHashMap(); | |
26 | + private final Map<TMention, TSentence> mention2sent = Maps.newHashMap(); | |
27 | + private final Map<TMention, TParagraph> mention2par = Maps.newHashMap(); | |
28 | + private final Map<TMention, String> mention2Orth = Maps.newHashMap(); | |
29 | + private final Map<TMention, String> mention2Base = Maps.newHashMap(); | |
30 | + private final Map<TMention, TToken> mention2head = Maps.newHashMap(); | |
31 | + private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet(); | |
32 | + | |
33 | + private final Map<TMention, Integer> mention2Index = Maps.newHashMap(); | |
34 | + private final Map<TSentence, Integer> sent2Index = Maps.newHashMap(); | |
35 | + private final Map<TParagraph, Integer> par2Index = Maps.newHashMap(); | |
36 | + private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap(); | |
37 | + private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap(); | |
38 | + private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); | |
39 | + | |
40 | + | |
41 | + public FeatureHelper(TText preprocessedText) { | |
42 | + mentions = preprocessedText.getParagraphs().stream() | |
43 | + .flatMap(p -> p.getSentences().stream()) | |
44 | + .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); | |
45 | + | |
46 | + mentionId2mention = mentions.stream().collect(Collectors.toMap(TMention::getId, Function.identity())); | |
47 | + | |
48 | + for (TCoreference coref : preprocessedText.getCoreferences()) { | |
49 | + List<TMention> ments = coref.getMentionIds().stream().map(mentionId2mention::get).collect(toList()); | |
50 | + for (TMention m : ments) { | |
51 | + mention2coref.put(m, coref); | |
52 | + } | |
53 | + coref2mentions.put(coref, ments); | |
54 | + } | |
55 | + | |
56 | + int parIdx = 0; | |
57 | + int sentIdx = 0; | |
58 | + int mentionIdx = 0; | |
59 | + for (TParagraph par : preprocessedText.getParagraphs()) { | |
60 | + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences()); | |
61 | + mention2Orth.putAll(m2o); | |
62 | + Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); | |
63 | + mention2Base.putAll(m2b); | |
64 | + | |
65 | + int sentIdxInPar = 0; | |
66 | + int mentionIdxInPar = 0; | |
67 | + for (TSentence sent : par.getSentences()) { | |
68 | + | |
69 | + Map<String, TToken> tokenId2token = sent.getTokens().stream().collect(toMap(TToken::getId, Function.identity())); | |
70 | + | |
71 | + Map<String, Set<TNamedEntity>> tokenId2namedEntities = Maps.newHashMap(); | |
72 | + for (TNamedEntity namedEntity : sent.getNames()) { | |
73 | + for (String childId : namedEntity.getChildIds()) { | |
74 | + tokenId2namedEntities.putIfAbsent(childId, Sets.newHashSet()); | |
75 | + tokenId2namedEntities.get(childId).add(namedEntity); | |
76 | + } | |
77 | + } | |
78 | + | |
79 | + int mentionIdxInSent = 0; | |
80 | + for (TMention mention : sent.getMentions()) { | |
81 | + mention2sent.put(mention, sent); | |
82 | + mention2par.put(mention, par); | |
83 | + mention2Index.put(mention, mentionIdx++); | |
84 | + mention2indexInSent.put(mention, mentionIdxInSent++); | |
85 | + mention2indexInPar.put(mention, mentionIdxInPar++); | |
86 | + | |
87 | + String firstHeadTokenId = mention.getHeadIds().iterator().next(); | |
88 | + mention2head.put(mention, tokenId2token.get(firstHeadTokenId)); | |
89 | + if (tokenId2namedEntities.containsKey(firstHeadTokenId)) | |
90 | + mentionsInNamedEntities.add(mention); | |
91 | + } | |
92 | + sent2Index.put(sent, sentIdx++); | |
93 | + sent2IndexInPar.put(sent, sentIdxInPar++); | |
94 | + } | |
95 | + | |
96 | + par2Index.put(par, parIdx++); | |
97 | + } | |
98 | + } | |
99 | + | |
100 | + public List<TMention> getMentions() { | |
101 | + return mentions; | |
102 | + } | |
103 | + | |
104 | + public int getMentionIndexInChain(TMention mention) { | |
105 | + return coref2mentions.get(mention2coref.get(mention)).indexOf(mention); | |
106 | + } | |
107 | + | |
108 | + public int getChainLength(TMention mention) { | |
109 | + return coref2mentions.get(mention2coref.get(mention)).size(); | |
110 | + } | |
111 | + | |
112 | + public String getSentenceLastTokenOrth(TSentence sent) { | |
113 | + return sent.getTokens().get(sent.getTokensSize() - 1).getOrth(); | |
114 | + } | |
115 | + | |
116 | + public String getMentionOrth(TMention mention) { | |
117 | + return mention2Orth.get(mention); | |
118 | + } | |
119 | + | |
120 | + public String getMentionBase(TMention mention) { | |
121 | + return mention2Base.get(mention); | |
122 | + } | |
123 | + | |
124 | + public int getMentionIndex(TMention mention) { | |
125 | + return mention2Index.get(mention); | |
126 | + } | |
127 | + | |
128 | + public int getMentionIndexInSent(TMention mention) { | |
129 | + return mention2indexInSent.get(mention); | |
130 | + } | |
131 | + | |
132 | + public int getMentionIndexInPar(TMention mention) { | |
133 | + return mention2indexInPar.get(mention); | |
134 | + } | |
135 | + | |
136 | + public int getParIndex(TParagraph paragraph) { | |
137 | + return par2Index.get(paragraph); | |
138 | + } | |
139 | + | |
140 | + public int getSentIndex(TSentence sent) { | |
141 | + return sent2Index.get(sent); | |
142 | + } | |
143 | + | |
144 | + public int getSentIndexInPar(TSentence sent) { | |
145 | + return sent2IndexInPar.get(sent); | |
146 | + } | |
147 | + | |
148 | + public TParagraph getMentionParagraph(TMention mention) { | |
149 | + return mention2par.get(mention); | |
150 | + } | |
151 | + | |
152 | + public TSentence getMentionSentence(TMention mention) { | |
153 | + return mention2sent.get(mention); | |
154 | + } | |
155 | + | |
156 | + public TMention getFirstChainMention(TMention mention) { | |
157 | + return mentionId2mention.get(mention2coref.get(mention).getMentionIdsIterator().next()); | |
158 | + } | |
159 | + | |
160 | + public TToken getMentionHeadToken(TMention mention) { | |
161 | + return mention2head.get(mention); | |
162 | + } | |
163 | + | |
164 | + public boolean isMentionNamedEntity(TMention mention) { | |
165 | + return mentionsInNamedEntities.contains(mention); | |
166 | + } | |
167 | + | |
168 | + public boolean isNested(TMention mention) { | |
169 | + return mentions.stream().anyMatch(m -> m != mention && m.getChildIds().containsAll(mention.getChildIds())); | |
170 | + } | |
171 | + | |
172 | + public boolean isNesting(TMention mention) { | |
173 | + return mentions.stream().anyMatch(m -> m != mention && mention.getChildIds().containsAll(m.getChildIds())); | |
174 | + } | |
175 | + | |
176 | + public Set<TCoreference> getClusters() { | |
177 | + return coref2mentions.keySet(); | |
178 | + } | |
179 | + | |
180 | + public Set<TMention> getCoreferentMentions(TMention tMention) { | |
181 | + return getMentionCluster(tMention).getMentionIds().stream().map(this.mentionId2mention::get).collect(Collectors.toSet()); | |
182 | + } | |
183 | + | |
184 | + public TCoreference getMentionCluster(TMention tMention) { | |
185 | + return this.mention2coref.get(tMention); | |
186 | + } | |
187 | +} | |
... | ... |
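isNested and isNesting reduce to subset tests over token-ID sets: a mention is nested if some other mention's child IDs contain all of its own. A self-contained sketch of that test with illustrative token IDs (the self-comparison is excluded, as in the methods above):

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class NestingDemo {
    public static void main(String[] args) {
        // Illustrative mentions as sets of token IDs.
        Set<String> outer = new HashSet<>(Arrays.asList("t1", "t2", "t3"));
        Set<String> inner = new HashSet<>(Arrays.asList("t3"));
        List<Set<String>> mentions = Arrays.asList(outer, inner);
        for (Set<String> mention : mentions) {
            boolean nested = mentions.stream()
                    .anyMatch(m -> m != mention && m.containsAll(mention));
            System.out.println(mention + " nested: " + nested);
        }
        // [t1, t2, t3] nested: false; [t3] nested: true
    }
}
```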
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.features; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; | |
4 | + | |
5 | + | |
6 | +public class Interpretation { | |
7 | + private String ctag = "null"; | |
8 | + private String casee = "null"; | |
9 | + private String gender = "null"; | |
10 | + private String number = "null"; | |
11 | + private String person = "null"; | |
12 | + | |
13 | + public Interpretation(TInterpretation chosenInterpretation) { | |
14 | + ctag = chosenInterpretation.getCtag(); | |
15 | + String[] split = chosenInterpretation.getMsd().split(":"); | |
16 | + switch (ctag) { | |
17 | + case "ger": | |
18 | + case "subst": | |
19 | + case "pact": | |
20 | + case "ppas": | |
21 | + case "num": | |
22 | + case "numcol": | |
23 | + case "adj": | |
24 | + number = split[0]; | |
25 | + casee = split[1]; | |
26 | + gender = split[2]; | |
27 | + break; | |
28 | + case "ppron12": | |
29 | + case "ppron3": | |
30 | + number = split[0]; | |
31 | + casee = split[1]; | |
32 | + gender = split[2]; | |
33 | + person = split[3]; | |
34 | + break; | |
35 | + case "siebie": | |
36 | + casee = split[0]; | |
37 | + break; | |
38 | + case "fin": | |
39 | + case "bedzie": | |
40 | + case "aglt": | |
41 | + case "impt": | |
42 | + number = split[0]; | |
43 | + person = split[1]; | |
44 | + break; | |
45 | + case "praet": | |
46 | + case "winien": | |
47 | + number = split[0]; | |
48 | + gender = split[1]; | |
49 | + break; | |
50 | + case "prep": | |
51 | + casee = split[0]; | |
52 | + break; | |
53 | + default: | |
54 | + break; | |
55 | + } | |
56 | + } | |
57 | + | |
58 | + public String getCase() { | |
59 | + return casee; | |
60 | + } | |
61 | + | |
62 | + public String getGender() { | |
63 | + return gender; | |
64 | + } | |
65 | + | |
66 | + public String getNumber() { | |
67 | + return number; | |
68 | + } | |
69 | + | |
70 | + public String getPerson() { | |
71 | + return person; | |
72 | + } | |
73 | + | |
74 | + public String getCtag() { | |
75 | + return ctag; | |
76 | + } | |
77 | +} | |
... | ... |
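The constructor dispatches on the ctag to decide which colon-separated positions of the MSD string carry number, case, gender and person. A sketch of the same positional parse for the noun branch; the tag values are illustrative NKJP-style morphology, not taken from the commit:

```java
public class MsdParseDemo {
    public static void main(String[] args) {
        String ctag = "subst";
        String msd = "sg:nom:m1"; // for nouns the positions are number:case:gender
        String[] split = msd.split(":");
        // Mirrors the "subst" branch of the switch above.
        System.out.println(ctag + ": number=" + split[0] + " case=" + split[1] + " gender=" + split[2]);
        // -> subst: number=sg case=nom gender=m1
    }
}
```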
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.*; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | |
8 | +import weka.core.Attribute; | |
9 | + | |
10 | +import java.io.File; | |
11 | +import java.io.IOException; | |
12 | +import java.nio.file.Files; | |
13 | +import java.util.*; | |
14 | +import java.util.stream.Collectors; | |
15 | +import java.util.stream.Stream; | |
16 | + | |
17 | + | |
18 | +public class MentionFeatureExtractor extends FeatureExtractor { | |
19 | + | |
20 | + private final List<String> frequentBases = Lists.newArrayList(); | |
21 | + | |
22 | + public MentionFeatureExtractor() { | |
23 | + | |
24 | + //coref | |
25 | + addNumericAttributeNormalized("chain_length"); | |
26 | + | |
27 | + // text characteristics | |
28 | + addNumericAttribute("text_char_count"); addNumericAttribute("text_token_count"); | |
29 | + addNumericAttribute("text_sent_count"); | |
30 | + addNumericAttribute("text_par_count"); | |
31 | + addNumericAttribute("text_mention_count"); | |
32 | + addNumericAttribute("text_cluster_count"); | |
33 | + | |
34 | + //mention characteristics | |
35 | + for (String prefix : Lists.newArrayList("mention", "chain_first_mention")) { | |
36 | + // mention characteristics | |
37 | + addNumericAttributeNormalized(prefix + "_index"); | |
38 | + addNumericAttributeNormalized(prefix + "_index_in_sent"); | |
39 | + addNumericAttributeNormalized(prefix + "_index_in_par"); | |
40 | + addNumericAttributeNormalized(prefix + "_index_in_chain"); | |
41 | + addBinaryAttribute(prefix + "_capitalized"); | |
42 | + addBinaryAttribute(prefix + "_all_caps"); | |
43 | + addNumericAttributeNormalized(prefix + "_char_count"); | |
44 | + addNumericAttributeNormalized(prefix + "_token_count"); | |
45 | + addBinaryAttribute(prefix + "_is_zero"); | |
46 | + addBinaryAttribute(prefix + "_is_named"); | |
47 | + addBinaryAttribute(prefix + "_is_pronoun"); | |
48 | + addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact")); | |
49 | + addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); | |
50 | + addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | |
51 | + addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); | |
52 | + addNominalAttribute(prefix + "_gender", Lists.newArrayList("other", "null", "f", "m1", "m2", "m3", "n")); | |
53 | + | |
54 | + // relation to other | |
55 | + addBinaryAttribute(prefix + "_is_nested"); | |
56 | + addBinaryAttribute(prefix + "_is_nesting"); | |
57 | + | |
58 | + // par characteristics | |
59 | + addNumericAttributeNormalized(prefix + "_par_idx"); | |
60 | + addNumericAttributeNormalized(prefix + "_par_token_count"); | |
61 | + addNumericAttributeNormalized(prefix + "_par_sent_count"); | |
62 | + | |
63 | + // sent characteristics | |
64 | + addNumericAttributeNormalized(prefix + "_sent_token_count"); | |
65 | + addNumericAttributeNormalized(prefix + "_sent_mention_count"); | |
66 | + addNumericAttributeNormalized(prefix + "_sent_idx"); | |
67 | + addNumericAttributeNormalized(prefix + "_sent_idx_in_par"); | |
68 | + addBinaryAttribute(prefix + "_sent_ends_with_dot"); | |
69 | + addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); | |
70 | + | |
71 | + // frequent bases | |
72 | + if (frequentBases.isEmpty()) loadFrequentBases(); // load once, not once per prefix | |
73 | + for (String base : frequentBases) { | |
74 | + addBinaryAttribute(prefix + "_" + encodeBase(base)); | |
75 | + } | |
76 | + } | |
77 | + | |
78 | + addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
79 | + fillSortedAttributes("score"); | |
80 | + } | |
81 | + | |
82 | + private String encodeBase(String base) { | |
83 | + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | |
84 | + } | |
85 | + | |
86 | + private void loadFrequentBases() { | |
87 | + try (Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath())) { | |
88 | + // try-with-resources closes the underlying file handle | |
89 | + this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList())); | |
90 | + } catch (IOException e) { | |
91 | + LOG.error("Error reading frequent bases file.", e); | |
92 | + } | |
93 | + } | |
94 | + | |
95 | + public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { | |
96 | + Map<TMention, Map<Attribute, Double>> result = Maps.newHashMap(); | |
97 | + | |
98 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | |
99 | + | |
100 | + addScoreFeature(result, helper.getMentions()); | |
101 | + | |
102 | + for (TMention mention : helper.getMentions()) { | |
103 | + Map<Attribute, Double> attribute2value = result.get(mention); | |
104 | + | |
105 | + //mention | |
106 | + addMentionAttributes(helper, mention, attribute2value, "mention"); | |
107 | + | |
108 | + //first chain mention | |
109 | + TMention firstChainMention = helper.getFirstChainMention(mention); | |
110 | + addMentionAttributes(helper, firstChainMention, attribute2value, "chain_first_mention"); | |
111 | + | |
112 | + //coref | |
113 | + attribute2value.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention)); | |
114 | + | |
115 | + //text | |
116 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | |
117 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
118 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | |
119 | + attribute2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | |
120 | + attribute2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | |
121 | + attribute2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | |
122 | + attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | |
123 | + attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | |
124 | + attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | |
125 | + | |
126 | + assert attribute2value.size() <= getAttributesList().size(); // normalized values are filled in below | |
127 | + } | |
128 | + addNormalizedAttributeValues(result); | |
129 | + | |
130 | + return result; | |
131 | + } | |
132 | + | |
133 | + private void addMentionAttributes(FeatureHelper helper, TMention mention, Map<Attribute, Double> attribute2value, String attributePrefix) { | |
134 | + // mention characteristics | |
135 | + attribute2value.put(getAttributeByName(attributePrefix + "_index"), (double) helper.getMentionIndex(mention)); | |
136 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | |
137 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_par"), (double) helper.getMentionIndexInPar(mention)); | |
138 | + attribute2value.put(getAttributeByName(attributePrefix + "_index_in_chain"), (double) helper.getMentionIndexInChain(mention)); | |
139 | + attribute2value.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | |
140 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject())); | |
141 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*"))); | |
142 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | |
143 | + | |
144 | + Interpretation interp = new Interpretation(helper.getMentionHeadToken(mention).getChosenInterpretation()); | |
145 | + addNominalAttributeValue(interp.getCtag(), attribute2value, attributePrefix + "_ctag"); | |
146 | + addNominalAttributeValue(interp.getPerson(), attribute2value, attributePrefix + "_person"); | |
147 | + addNominalAttributeValue(interp.getNumber(), attribute2value, attributePrefix + "_number"); | |
148 | + addNominalAttributeValue(interp.getGender(), attribute2value, attributePrefix + "_gender"); | |
149 | + addNominalAttributeValue(interp.getCase(), attribute2value, attributePrefix + "_case"); | |
150 | + | |
151 | + // relation to other mentions | |
152 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention))); | |
153 | + attribute2value.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); | |
154 | + | |
155 | + String orth = helper.getMentionOrth(mention); | |
156 | + attribute2value.put(getAttributeByName(attributePrefix + "_capitalized"), toBinary(orth.length() != 0 && orth.substring(0, 1).toUpperCase().equals(orth.substring(0, 1)))); | |
157 | + attribute2value.put(getAttributeByName(attributePrefix + "_all_caps"), toBinary(orth.toUpperCase().equals(orth))); | |
158 | + attribute2value.put(getAttributeByName(attributePrefix + "_char_count"), (double) orth.length()); | |
159 | + | |
160 | + // par characteristics | |
161 | + TParagraph mentionParagraph = helper.getMentionParagraph(mention); | |
162 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_idx"), (double) helper.getParIndex(mentionParagraph)); | |
163 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_token_count"), mentionParagraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | |
164 | + attribute2value.put(getAttributeByName(attributePrefix + "_par_sent_count"), (double) mentionParagraph.getSentences().size()); | |
165 | + | |
166 | + // sent characteristics | |
167 | + TSentence mentionSentence = helper.getMentionSentence(mention); | |
168 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_token_count"), (double) mentionSentence.getTokensSize()); | |
169 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_mention_count"), (double) mentionSentence.getMentions().size()); | |
170 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx"), (double) helper.getSentIndex(mentionSentence)); | |
171 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_idx_in_par"), (double) helper.getSentIndexInPar(mentionSentence)); | |
172 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("."))); | |
173 | + attribute2value.put(getAttributeByName(attributePrefix + "_sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(mentionSentence).equals("?"))); | |
174 | + | |
175 | + // frequent bases | |
176 | + String mentionBase = helper.getMentionBase(mention); | |
177 | + for (String base : frequentBases) { | |
178 | + attribute2value.put(getAttributeByName(attributePrefix + "_" + encodeBase(base)), toBinary(mentionBase.equals(base))); | |
179 | + } | |
180 | + } | |
181 | + | |
182 | + private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { | |
183 | + Attribute att = getAttributeByName(attributeName); | |
184 | + int index = att.indexOfValue(value); | |
185 | + if (index == -1) | |
186 | + LOG.warn(value + " not found for attribute " + attributeName); | |
187 | + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | |
188 | + } | |
189 | + | |
190 | + | |
191 | + private void addScoreFeature(Map<TMention, Map<Attribute, Double>> result, List<TMention> mentions) { | |
192 | + for (TMention m : mentions) { | |
193 | + Map<Attribute, Double> map = Maps.newHashMap(); | |
194 | + map.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
195 | + result.put(m, map); | |
196 | + } | |
197 | + } | |
198 | + | |
199 | +} | |
... | ... |
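addNominalAttributeValue encodes a nominal value as the double index of that value within the Weka Attribute, mapping unseen values to the index of "other". A minimal sketch of that encoding (assuming Weka on the classpath):

```java
import weka.core.Attribute;

import java.util.Arrays;

public class NominalEncodingDemo {
    public static void main(String[] args) {
        Attribute caseAtt = new Attribute("mention_case",
                Arrays.asList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc"));
        System.out.println(caseAtt.indexOfValue("gen")); // 5
        // An unseen value returns -1 and would be mapped to indexOfValue("other") == 0.
        System.out.println(caseAtt.indexOfValue("abl")); // -1
    }
}
```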
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.Sets; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
9 | +import weka.classifiers.Classifier; | |
10 | +import weka.core.Instance; | |
11 | +import weka.core.Instances; | |
12 | + | |
13 | +import java.util.Map; | |
14 | +import java.util.Set; | |
15 | + | |
16 | +public class MentionModel { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(MentionModel.class); | |
19 | + | |
20 | + public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { | |
21 | + Set<TMention> goodMentions = Sets.newHashSet(); | |
22 | + | |
23 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
24 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor); | |
25 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | |
26 | + Instance instance = entry.getValue(); | |
27 | + instance.setDataset(instances); | |
28 | + instance.setClassMissing(); | |
29 | + boolean good = classifier.classifyInstance(instance) > 0.5; | |
30 | + if (good) | |
31 | + goodMentions.add(entry.getKey()); | |
32 | + } | |
33 | + LOG.info("Good mentions: " + goodMentions.size() + " of " + mention2instance.size()); | |
34 | + return goodMentions; | |
35 | + } | |
36 | + | |
37 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.collect.HashMultiset; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Multiset; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
10 | + | |
11 | +import java.util.Collection; | |
12 | +import java.util.List; | |
13 | +import java.util.Map; | |
14 | +import java.util.stream.Collectors; | |
15 | + | |
16 | +public class MentionScorer { | |
17 | + | |
18 | + | |
19 | + public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { | |
20 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
21 | + | |
22 | + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
23 | + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences); | |
24 | + | |
25 | + return booleanTokenIntersection(mention2Orth, tokenCounts); | |
26 | + } | |
27 | + | |
28 | + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | |
29 | + Map<TMention, Double> mention2score = Maps.newHashMap(); | |
30 | + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | |
31 | + TMention mention = entry.getKey(); | |
32 | + String mentionOrth = mention2Orth.get(mention); | |
33 | + for (String token : Utils.tokenize(mentionOrth)) { | |
34 | + if (tokenCounts.contains(token.toLowerCase())) { | |
35 | + mention2score.put(mention, 1.0); | |
36 | + break; | |
37 | + } | |
38 | + } | |
39 | + mention2score.putIfAbsent(mention, 0.0); | |
40 | + } | |
41 | + return mention2score; | |
42 | + } | |
43 | + | |
44 | + private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | |
45 | + Map<TMention, Double> mention2score = Maps.newHashMap(); | |
46 | + for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | |
47 | + TMention mention = entry.getKey(); | |
48 | + String mentionOrth = mention2Orth.get(mention); | |
49 | + int present = 0; | |
50 | + for (String token : Utils.tokenize(mentionOrth)) { | |
51 | + if (tokenCounts.contains(token.toLowerCase())) { | |
52 | + present++; | |
53 | + } | |
54 | + } | |
55 | + mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0); | |
56 | + } | |
57 | + return mention2score; | |
58 | + } | |
59 | +} | |
... | ... |
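booleanTokenIntersection assigns a mention the score 1.0 as soon as any one of its tokens appears in the optimal summary, and 0.0 otherwise. A self-contained sketch with Guava's HashMultiset, as in the class above; the Polish tokens are illustrative:

```java
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

import java.util.Arrays;

public class IntersectionDemo {
    public static void main(String[] args) {
        // Lowercased tokens of a (hypothetical) optimal summary.
        Multiset<String> summaryTokens = HashMultiset.create(
                Arrays.asList("rada", "miasta", "przyjęła", "budżet"));
        String mentionOrth = "budżet gminy";
        boolean hit = Arrays.stream(mentionOrth.split("[^\\p{L}0-9]+"))
                .anyMatch(t -> summaryTokens.contains(t.toLowerCase()));
        System.out.println(hit ? 1.0 : 0.0); // 1.0 -- "budżet" occurs in the summary
    }
}
```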
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.io.Files; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
12 | +import weka.core.Instance; | |
13 | +import weka.core.Instances; | |
14 | +import weka.core.converters.ArffSaver; | |
15 | + | |
16 | +import java.io.File; | |
17 | +import java.io.IOException; | |
18 | +import java.util.Map; | |
19 | + | |
20 | + | |
21 | +public class PrepareTrainingData { | |
22 | + | |
23 | + private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | |
24 | + | |
25 | + public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
26 | + public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
27 | + | |
28 | + public static void main(String[] args) throws IOException { | |
29 | + | |
30 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
31 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
32 | + | |
33 | + MentionScorer mentionScorer = new MentionScorer(); | |
34 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
35 | + | |
36 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
37 | + | |
38 | + int i = 1; | |
39 | + for (String textId : id2preprocessedText.keySet()) { | |
40 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | |
41 | + | |
42 | + TText preprocessedText = id2preprocessedText.get(textId); | |
43 | + String optimalSummary = id2optimalSummary.get(textId); | |
44 | + if (optimalSummary == null) | |
45 | + continue; | |
46 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | |
47 | + | |
48 | + Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
49 | + for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | |
50 | + TMention mention = entry.getKey(); | |
51 | + Instance instance = entry.getValue(); | |
52 | + instance.setDataset(instances); | |
53 | + instance.setClassValue(mention2score.get(mention)); | |
54 | + instances.add(instance); | |
55 | + } | |
56 | + } | |
57 | + saveInstancesToFile(instances); | |
58 | + } | |
59 | + | |
60 | + private static void saveInstancesToFile(Instances instances) throws IOException { | |
61 | + ArffSaver saver = new ArffSaver(); | |
62 | + saver.setInstances(instances); | |
63 | + saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
64 | + saver.writeBatch(); | |
65 | + } | |
66 | + | |
67 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | |
68 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | |
69 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
70 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
71 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
72 | + } | |
73 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
74 | + return id2optimalSummary; | |
75 | + } | |
76 | + | |
77 | + | |
78 | +} | |
... | ... |
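A subtlety worth noting: the class attribute is nominal ("bad", "good"), and instance.setClassValue(mention2score.get(mention)) works because Weka interprets a double class value on a nominal attribute as a value index, so 0.0 selects "bad" and 1.0 selects "good". A minimal sketch (assuming Weka on the classpath):

```java
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

import java.util.ArrayList;
import java.util.Arrays;

public class ClassValueDemo {
    public static void main(String[] args) {
        ArrayList<Attribute> attrs = new ArrayList<>(Arrays.asList(
                new Attribute("score", Arrays.asList("bad", "good")),
                new Attribute("feature")));
        Instances data = new Instances("Demo", attrs, 0);
        data.setClassIndex(0);
        DenseInstance inst = new DenseInstance(2);
        inst.setDataset(data);
        inst.setClassValue(1.0); // double value 1.0 is interpreted as index 1 -> "good"
        System.out.println(inst.stringValue(data.classAttribute())); // good
    }
}
```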
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.core.Instances; | |
9 | +import weka.core.converters.ArffLoader; | |
10 | + | |
11 | +import java.io.File; | |
12 | +import java.io.FileOutputStream; | |
13 | +import java.io.ObjectOutputStream; | |
14 | + | |
15 | + | |
16 | +public class TrainModel { | |
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | + | |
19 | + public static void main(String[] args) throws Exception { | |
20 | + | |
21 | + ArffLoader loader = new ArffLoader(); | |
22 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
23 | + Instances instances = loader.getDataSet(); | |
24 | + instances.setClassIndex(0); | |
25 | + LOG.info(instances.size() + " instances loaded."); | |
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | + | |
28 | + StopWatch watch = new StopWatch(); | |
29 | + watch.start(); | |
30 | + | |
31 | + Classifier classifier = Constants.getClassifier(); | |
32 | + | |
33 | + LOG.info("Building classifier..."); | |
34 | + classifier.buildClassifier(instances); | |
35 | + LOG.info("...done."); | |
36 | + | |
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | + new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | |
39 | + oos.writeObject(classifier); | |
40 | + } | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + | |
45 | + LOG.info(classifier.toString()); | |
46 | + } | |
47 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.util.Random; | |
14 | + | |
15 | + | |
16 | +public class Crossvalidate { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
19 | + | |
20 | + public static void main(String[] args) throws Exception { | |
21 | + | |
22 | + ArffLoader loader = new ArffLoader(); | |
23 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
24 | + Instances instances = loader.getDataSet(); | |
25 | + instances.setClassIndex(0); | |
26 | + LOG.info(instances.size() + " instances loaded."); | |
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | + | |
32 | + StopWatch watch = new StopWatch(); | |
33 | + watch.start(); | |
34 | + | |
35 | + Classifier classifier = Constants.getClassifier(); | |
36 | + | |
37 | + Evaluation eval = new Evaluation(instances); | |
38 | + eval.crossValidateModel(classifier, instances, 10, new Random(1)); | |
39 | + LOG.info(eval.toSummaryString()); | |
40 | + | |
41 | + watch.stop(); | |
42 | + LOG.info("Elapsed time: " + watch); | |
43 | + } | |
44 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.io.FileInputStream; | |
14 | +import java.io.IOException; | |
15 | +import java.io.ObjectInputStream; | |
16 | + | |
17 | +/** | |
18 | + * Evaluates the trained mention model on the mention training dataset. | |
19 | + */ | |
20 | +public class Validate { | |
21 | + private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | |
22 | + | |
23 | + public static void main(String[] args) throws Exception { | |
24 | + | |
25 | + ArffLoader loader = new ArffLoader(); | |
26 | + loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
27 | + Instances instances = loader.getDataSet(); | |
28 | + instances.setClassIndex(0); | |
29 | + LOG.info(instances.size() + " instances loaded."); | |
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
31 | + | |
32 | + Classifier classifier = loadClassifier(); | |
33 | + | |
34 | + StopWatch watch = new StopWatch(); | |
35 | + watch.start(); | |
36 | + | |
37 | + Evaluation eval = new Evaluation(instances); | |
38 | + eval.evaluateModel(classifier, instances); // note: evaluates on the training data itself, so results are optimistic | |
39 | + | |
40 | + LOG.info(eval.toSummaryString()); | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + } | |
45 | + | |
46 | + private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | |
47 | + LOG.info("Loading classifier..."); | |
48 | + try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | |
49 | + Classifier classifier = (Classifier) ois.readObject(); | |
50 | + LOG.info("Done. " + classifier.toString()); | |
51 | + return classifier; | |
52 | + } | |
53 | + } | |
54 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.io.Files; | |
6 | +import org.apache.logging.log4j.LogManager; | |
7 | +import org.apache.logging.log4j.Logger; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | |
15 | +import weka.classifiers.Classifier; | |
16 | +import weka.core.Instance; | |
17 | +import weka.core.Instances; | |
18 | +import weka.core.converters.ArffSaver; | |
19 | + | |
20 | +import java.io.File; | |
21 | +import java.io.IOException; | |
22 | +import java.util.Map; | |
23 | +import java.util.Set; | |
24 | + | |
25 | + | |
26 | +public class PrepareTrainingData { | |
27 | + | |
28 | + private static final Logger LOG = LogManager.getLogger(PrepareTrainingData.class); | |
29 | + | |
30 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
31 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
32 | + | |
33 | + public static void main(String[] args) throws Exception { | |
34 | + | |
35 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
36 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
37 | + | |
38 | + SentenceScorer sentenceScorer = new SentenceScorer(); | |
39 | + SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | |
40 | + | |
41 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
42 | + | |
43 | + Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
44 | + MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | |
45 | + | |
46 | + int i = 1; | |
47 | + for (String textId : id2preprocessedText.keySet()) { | |
48 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | |
49 | + | |
50 | + TText preprocessedText = id2preprocessedText.get(textId); | |
51 | + String optimalSummary = id2optimalSummary.get(textId); | |
52 | + if (optimalSummary == null) | |
53 | + continue; | |
54 | + Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText); | |
55 | + | |
56 | + Set<TMention> goodMentions | |
57 | + = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText); | |
58 | + // alternatively, train on gold-standard mentions instead of model predictions: | |
59 | + // Set<TMention> goodMentions = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | |
60 | + | |
61 | + Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
62 | + for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
63 | + TSentence sentence = entry.getKey(); | |
64 | + Instance instance = entry.getValue(); | |
65 | + instance.setDataset(instances); | |
66 | + instance.setClassValue(sentence2score.get(sentence)); | |
67 | + instances.add(instance); | |
68 | + } | |
69 | + } | |
70 | + saveInstancesToFile(instances); | |
71 | + } | |
72 | + | |
73 | + private static void saveInstancesToFile(Instances instances) throws IOException { | |
74 | + ArffSaver saver = new ArffSaver(); | |
75 | + saver.setInstances(instances); | |
76 | + saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
77 | + saver.writeBatch(); | |
78 | + } | |
79 | + | |
80 | + private static Map<String, String> loadOptimalSummaries() throws IOException { | |
81 | + Map<String, String> id2optimalSummary = Maps.newHashMap(); | |
82 | + for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
83 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
84 | + id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
85 | + } | |
86 | + LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
87 | + return id2optimalSummary; | |
88 | + } | |
89 | + | |
90 | + | |
91 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | +import weka.core.Attribute; | |
8 | + | |
9 | +import java.util.List; | |
10 | +import java.util.Map; | |
11 | +import java.util.Set; | |
12 | +import java.util.stream.Collectors; | |
13 | + | |
14 | +public class SentenceFeatureExtractor extends FeatureExtractor { | |
15 | + | |
16 | + public SentenceFeatureExtractor() { | |
17 | + | |
18 | + addNumericAttributeNormalized("sent_mention_cluster_count"); | |
19 | + addNumericAttributeNormalized("sent_good_mention_cluster_count"); | |
20 | + addNumericAttributeNormalized("sent_good_mention_cluster_good_count"); | |
21 | + addNumericAttributeNormalized("sent_cluster_count"); | |
22 | + addNumericAttributeNormalized("sent_good_cluster_count"); | |
23 | + addNumericAttributeNormalized("sent_mention_count"); | |
24 | + addNumericAttributeNormalized("sent_good_mention_count"); | |
25 | + | |
26 | + addNumericAttributeNormalized("sent_token_length"); | |
27 | + addNumericAttributeNormalized("sent_idx"); | |
28 | + addNumericAttributeNormalized("sent_idx_in_par"); | |
29 | + addBinaryAttribute("sent_ends_with_dot"); | |
30 | + addBinaryAttribute("sent_ends_with_questionmark"); | |
31 | + | |
32 | + addNumericAttributeNormalized("par_idx"); | |
33 | + addNumericAttributeNormalized("par_token_count"); | |
34 | + addNumericAttributeNormalized("par_sent_count"); | |
35 | + | |
36 | + addNumericAttribute("text_token_count"); | |
37 | + addNumericAttribute("text_sent_count"); | |
38 | + addNumericAttribute("text_par_count"); | |
39 | + addNumericAttribute("text_mention_count"); | |
40 | + addNumericAttribute("text_cluster_count"); | |
41 | + | |
42 | + addNumericAttribute("score"); | |
43 | + fillSortedAttributes("score"); | |
44 | + } | |
45 | + | |
46 | + public Map<TSentence, Map<Attribute, Double>> calculateFeatures(TText preprocessedText, Set<TMention> goodMentions) { | |
47 | + | |
48 | + int sentenceIdx = 0; | |
49 | + int parIdx = 0; | |
50 | + | |
51 | + FeatureHelper helper = new FeatureHelper(preprocessedText); | |
52 | + List<TParagraph> pars = preprocessedText.getParagraphs(); | |
53 | + List<TSentence> sents = pars.stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
54 | + List<TToken> tokens = sents.stream().flatMap(s -> s.getTokens().stream()).collect(Collectors.toList()); | |
55 | + | |
56 | + Map<TSentence, Map<Attribute, Double>> sentence2features = Maps.newLinkedHashMap(); | |
57 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | |
58 | + int sentenceIdxInPar = 0; | |
59 | + for (TSentence sentence : paragraph.getSentences()) { | |
60 | + Map<Attribute, Double> feature2value = Maps.newHashMap(); | |
61 | + | |
62 | + feature2value.put(getAttributeByName("sent_mention_cluster_count"), sentence.getMentions().stream().mapToDouble(helper::getChainLength).sum()); | |
63 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_count"), sentence.getMentions().stream().filter(goodMentions::contains).mapToDouble(helper::getChainLength).sum()); | |
64 | + feature2value.put(getAttributeByName("sent_good_mention_cluster_good_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).flatMap(m -> helper.getCoreferentMentions(m).stream()).filter(goodMentions::contains).count()); | |
65 | + feature2value.put(getAttributeByName("sent_cluster_count"), (double) sentence.getMentions().stream().map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | |
66 | + feature2value.put(getAttributeByName("sent_good_cluster_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).map(helper::getMentionCluster).collect(Collectors.toSet()).size()); | |
67 | + feature2value.put(getAttributeByName("sent_mention_count"), (double) sentence.getMentions().size()); | |
68 | + feature2value.put(getAttributeByName("sent_good_mention_count"), (double) sentence.getMentions().stream().filter(goodMentions::contains).count()); | |
69 | + | |
70 | + feature2value.put(getAttributeByName("sent_token_length"), (double) sentence.getTokens().size()); | |
71 | + feature2value.put(getAttributeByName("sent_idx_in_par"), (double) sentenceIdxInPar); | |
72 | + feature2value.put(getAttributeByName("sent_idx"), (double) sentenceIdx); | |
73 | + feature2value.put(getAttributeByName("sent_ends_with_dot"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("."))); | |
74 | + feature2value.put(getAttributeByName("sent_ends_with_questionmark"), toBinary(helper.getSentenceLastTokenOrth(sentence).equals("?"))); | |
75 | + | |
76 | + feature2value.put(getAttributeByName("par_idx"), (double) parIdx); | |
77 | + feature2value.put(getAttributeByName("par_token_count"), paragraph.getSentences().stream().map(s -> s.getTokens().size()).mapToDouble(s -> s).sum()); | |
78 | + feature2value.put(getAttributeByName("par_sent_count"), (double) paragraph.getSentences().size()); | |
79 | + | |
80 | + feature2value.put(getAttributeByName("text_char_count"), tokens.stream().mapToDouble(t -> t.getOrth().length()).sum()); | |
81 | + feature2value.put(getAttributeByName("text_token_count"), (double) tokens.size()); | |
82 | + feature2value.put(getAttributeByName("text_sent_count"), (double) sents.size()); | |
83 | + feature2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | |
84 | + feature2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | |
85 | + feature2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | |
86 | + | |
87 | + feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
88 | + | |
90 | + assert (feature2value.size() == getAttributesList().size()); | |
91 | + | |
92 | + sentence2features.put(sentence, feature2value); | |
93 | + | |
94 | + sentenceIdx++; | |
95 | + sentenceIdxInPar++; | |
96 | + } | |
97 | + parIdx++; | |
98 | + } | |
99 | + addNormalizedAttributeValues(sentence2features); | |
100 | + | |
101 | + return sentence2features; | |
102 | + } | |
103 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import com.google.common.collect.HashMultiset; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Multiset; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
10 | + | |
11 | +import java.util.List; | |
12 | +import java.util.Map; | |
13 | + | |
14 | +public class SentenceScorer { | |
15 | + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { | |
16 | + Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
17 | + | |
18 | + Map<TSentence, Double> sentence2score = Maps.newHashMap(); | |
19 | + for (TParagraph paragraph : preprocessedText.getParagraphs()) { | |
20 | + for (TSentence sentence : paragraph.getSentences()) { | |
21 | + double score = 0.0; | |
22 | + | |
23 | + String orth = Utils.loadSentence2Orth(sentence); | |
24 | + List<String> tokens = Utils.tokenize(orth); | |
25 | + for (String token : tokens) { | |
26 | + score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; | |
27 | + } | |
28 | + sentence2score.put(sentence, tokens.isEmpty() ? 0.0 : score / tokens.size()); | |
29 | + } | |
30 | + } | |
30 | + return sentence2score; | |
31 | + } | |
32 | +} | |
... | ... |
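Editor's note: the scorer above gives each sentence the fraction of its tokens that also occur in the optimal summary (case-insensitive token overlap). A minimal sketch of the expected behaviour, using hypothetical tokens and only the Guava classes the scorer itself imports (this example is not part of the commit):

    import com.google.common.collect.HashMultiset;
    import com.google.common.collect.Multiset;

    import java.util.Arrays;
    import java.util.List;

    public class SentenceScorerExample {
        public static void main(String[] args) {
            // tokens of a hypothetical optimal summary, lower-cased
            Multiset<String> summaryTokens = HashMultiset.create(Arrays.asList("rząd", "przyjął", "ustawę"));
            // tokens of one candidate sentence
            List<String> sentenceTokens = Arrays.asList("Rząd", "odrzucił", "ustawę");
            long hits = sentenceTokens.stream()
                    .filter(t -> summaryTokens.contains(t.toLowerCase()))
                    .count();
            System.out.println((double) hits / sentenceTokens.size()); // 2 of 3 tokens overlap: ~0.67
        }
    }

Note that the multiset counts are not consulted by contains(), so repeating a token in the summary does not raise a sentence's score.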
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.core.Instances; | |
9 | +import weka.core.converters.ArffLoader; | |
10 | + | |
11 | +import java.io.File; | |
12 | +import java.io.FileOutputStream; | |
13 | +import java.io.ObjectOutputStream; | |
14 | + | |
15 | + | |
16 | +public class TrainModel { | |
17 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | + | |
19 | + public static void main(String[] args) throws Exception { | |
20 | + | |
21 | + ArffLoader loader = new ArffLoader(); | |
22 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
23 | + Instances instances = loader.getDataSet(); | |
24 | + instances.setClassIndex(0); | |
25 | + LOG.info(instances.size() + " instances loaded."); | |
26 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | + | |
28 | + StopWatch watch = new StopWatch(); | |
29 | + watch.start(); | |
30 | + | |
31 | + Classifier classifier = Constants.getSentencesClassifier(); | |
32 | + | |
33 | + LOG.info("Building classifier..."); | |
34 | + classifier.buildClassifier(instances); | |
35 | + LOG.info("...done."); | |
36 | + | |
37 | + try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | + new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | |
39 | + oos.writeObject(classifier); | |
40 | + } | |
41 | + | |
42 | + watch.stop(); | |
43 | + LOG.info("Elapsed time: " + watch); | |
44 | + | |
45 | + LOG.info(classifier.toString()); | |
46 | + } | |
47 | +} | |
... | ... |
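Editor's note: there is an ordering dependency between the two training stages — sentence/PrepareTrainingData loads the serialized mention model from Constants.MENTIONS_MODEL_PATH, so the mention stage must run to completion first. A hedged sketch of the overall pipeline; the TrainAll wrapper below is illustrative only and not part of this commit:

    // Illustrative wrapper; the commit itself runs each main() separately.
    public final class TrainAll {
        public static void main(String[] args) throws Exception {
            // 1. score mentions against optimal summaries -> mentions_train.arff
            pl.waw.ipipan.zil.summ.nicolas.mention.PrepareTrainingData.main(args);
            // 2. train the mention classifier -> mentions_model.bin
            pl.waw.ipipan.zil.summ.nicolas.mention.TrainModel.main(args);
            // 3. score sentences, using the mention model trained above -> sentences_train.arff
            pl.waw.ipipan.zil.summ.nicolas.sentence.PrepareTrainingData.main(args);
            // 4. train the sentence classifier -> sentences_model.bin
            pl.waw.ipipan.zil.summ.nicolas.sentence.TrainModel.main(args);
        }
    }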
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.classifiers.evaluation.Evaluation; | |
9 | +import weka.core.Instances; | |
10 | +import weka.core.converters.ArffLoader; | |
11 | + | |
12 | +import java.io.File; | |
13 | +import java.util.Random; | |
14 | + | |
15 | + | |
16 | +public class Crossvalidate { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
19 | + | |
20 | + public static void main(String[] args) throws Exception { | |
21 | + | |
22 | + ArffLoader loader = new ArffLoader(); | |
23 | + loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
24 | + Instances instances = loader.getDataSet(); | |
25 | + instances.setClassIndex(0); | |
26 | + LOG.info(instances.size() + " instances loaded."); | |
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | + | |
29 | + StopWatch watch = new StopWatch(); | |
30 | + watch.start(); | |
31 | + | |
32 | + Classifier classifier = Constants.getSentencesClassifier(); | |
33 | + | |
34 | + Evaluation eval = new Evaluation(instances); | |
35 | + eval.crossValidateModel(classifier, instances, 10, new Random(1)); | |
36 | + LOG.info(eval.toSummaryString()); | |
37 | + | |
38 | + watch.stop(); | |
39 | + LOG.info("Elapsed time: " + watch); | |
40 | + } | |
41 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +++ a/nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Lists; | |
5 | +import com.google.common.collect.Maps; | |
6 | +import com.google.common.collect.Sets; | |
7 | +import com.google.common.io.Files; | |
8 | +import org.apache.commons.csv.CSVFormat; | |
9 | +import org.apache.commons.csv.CSVPrinter; | |
10 | +import org.apache.commons.csv.QuoteMode; | |
11 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
13 | + | |
14 | +import java.io.File; | |
15 | +import java.io.FileWriter; | |
16 | +import java.io.IOException; | |
17 | +import java.util.Arrays; | |
18 | +import java.util.List; | |
19 | +import java.util.Map; | |
20 | +import java.util.Set; | |
21 | + | |
22 | +/** | |
23 | + * Finds nominative mentions whose coreference cluster also contains a nominative mention in the preceding sentence; such candidate pairs are printed and saved to zeros.tsv. | |
24 | + */ | |
25 | +public class Zero { | |
26 | + | |
27 | + private static final String IDS_PATH = "summaries_dev"; | |
28 | + private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | |
29 | + | |
30 | + public static void main(String[] args) throws IOException { | |
31 | + | |
32 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | |
33 | + Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | |
34 | + | |
35 | + int mentionCount = 0; | |
36 | + int mentionInNom = 0; | |
37 | + int mentionInNomSequential = 0; | |
38 | + | |
39 | + List<List<Object>> rows = Lists.newArrayList(); | |
40 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
41 | + String textId = entry.getKey(); | |
43 | + | |
44 | + TText text = entry.getValue(); | |
45 | + List<String> sentenceIds = id2sentIds.get(textId); | |
46 | + if (sentenceIds == null) | |
47 | + continue; | |
47 | + | |
48 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
49 | + for (TCoreference coreference : text.getCoreferences()) { | |
50 | + for (String mentionId : coreference.getMentionIds()) { | |
51 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
52 | + } | |
53 | + } | |
54 | + | |
55 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | |
56 | + TSentence prevSentence = null; | |
57 | + for (TParagraph p : text.getParagraphs()) { | |
58 | + Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | |
59 | + | |
60 | + for (TSentence sentence : p.getSentences()) { | |
61 | + if (!sentenceIds.contains(sentence.getId())) | |
62 | + continue; | |
63 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
64 | + | |
65 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
66 | + for (TToken t : sentence.getTokens()) | |
67 | + tokenId2Token.put(t.getId(), t); | |
68 | + | |
69 | + for (TMention mention : sentence.getMentions()) { | |
70 | + mentionCount++; | |
71 | + | |
72 | + for (String tokenId : mention.getHeadIds()) { | |
73 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
74 | + if (isInNominative(interp)) { | |
75 | + mentionInNom++; | |
76 | + | |
77 | + currentSentenceNominativeMentionIds.add(mention.getId()); | |
78 | + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | |
79 | + mentionInNomSequential++; | |
80 | + System.out.println(tMentionStringMap.get(mention) | |
81 | + + "\n\t" + Utils.loadSentence2Orth(prevSentence) | |
82 | + + "\n\t" + Utils.loadSentence2Orth(sentence)); | |
83 | + | |
84 | + List<Object> row = Lists.newArrayList(); | |
85 | + row.add("C"); | |
86 | + row.add(textId); | |
87 | + row.add(tMentionStringMap.get(mention)); | |
88 | + row.add(Utils.loadSentence2Orth(prevSentence)); | |
89 | + row.add(Utils.loadSentence2Orth(sentence)); | |
90 | + rows.add(row); | |
91 | + } | |
92 | + break; | |
93 | + } | |
94 | + } | |
95 | + } | |
96 | + | |
97 | + prevSentence = sentence; | |
98 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | |
99 | + } | |
100 | + } | |
101 | + } | |
102 | + | |
103 | + System.out.println(mentionCount + " mentions"); | |
104 | + System.out.println(mentionInNom + " mention in nom"); | |
105 | + System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | |
106 | + | |
107 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | |
108 | + for (List<Object> row : rows) { | |
109 | + csvPrinter.printRecord(row); | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + } | |
114 | + | |
115 | + private static boolean isInNominative(TInterpretation interp) { | |
116 | + return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | |
117 | + } | |
118 | + | |
119 | + private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | |
120 | + Map<String, List<String>> result = Maps.newHashMap(); | |
121 | + for (File f : new File(idsPath).listFiles()) { | |
122 | + String id = f.getName().split("_")[0]; | |
123 | + List<String> sentenceIds = Files.readLines(f, Charsets.UTF_8); | |
124 | + result.put(id, sentenceIds); | |
125 | + } | |
126 | + return result; | |
127 | + } | |
128 | +} | |
... | ... |
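Editor's note: the isInNominative check above assumes the NKJP-style tagset convention — a noun's chosen interpretation has ctag "subst" and a colon-separated msd in which one segment is "nom". A small sketch of the same logic in isolation, with made-up sample tags (not part of the commit):

    import java.util.Arrays;

    public class NominativeCheckExample {
        // same test as Zero.isInNominative, inlined on raw tag strings
        static boolean isNominative(String ctag, String msd) {
            return ctag.equals("subst") && Arrays.stream(msd.split(":")).anyMatch("nom"::equals);
        }

        public static void main(String[] args) {
            System.out.println(isNominative("subst", "sg:nom:f"));   // true
            System.out.println(isNominative("subst", "sg:gen:f"));   // false: genitive
            System.out.println(isNominative("adj", "sg:nom:f:pos")); // false: not a noun
        }
    }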
nicolas-model/pom.xml
0 → 100644
1 | +++ a/nicolas-model/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-model</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt
0 → 100644
1 | +++ a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt | |
1 | +on | |
2 | +to | |
3 | +co | |
4 | +rok | |
5 | +być | |
6 | +wszystko | |
7 | +polska | |
8 | +człowiek | |
9 | +sobie | |
10 | +raz | |
11 | +my | |
12 | +mieć | |
13 | +czas | |
14 | +państwo | |
15 | +praca | |
16 | +osoba | |
17 | +sprawa | |
18 | +ja | |
19 | +kraj | |
20 | +pieniądz | |
21 | +nikt | |
22 | +kto | |
23 | +przykład | |
24 | +nic | |
25 | +koniec | |
26 | +rząd | |
27 | +prawo | |
28 | +życie | |
29 | +miejsce | |
30 | +móc | |
31 | +fot | |
32 | +problem | |
33 | +władza | |
34 | +miesiąc | |
35 | +rzecz | |
36 | +stan | |
37 | +świat | |
38 | +wszyscy | |
39 | +mówić | |
40 | +rozmowa | |
41 | +coś | |
42 | +sytuacja | |
43 | +powód | |
44 | +początek | |
45 | +wiedzieć | |
46 | +dzień | |
47 | +uwaga | |
48 | +strona | |
49 | +udział | |
50 | +in | |
51 | +musieć | |
52 | +polityk | |
53 | +ktoś | |
54 | +ogół | |
55 | +polityka | |
56 | +chcieć | |
57 | +walka | |
58 | +zmiana | |
59 | +decyzja | |
60 | +ciąg | |
61 | +m . | |
62 | +pan | |
63 | +szansa | |
64 | +polak | |
65 | +przypadek | |
66 | +większość | |
67 | +pytanie | |
68 | +wzgląd | |
69 | +warszawa | |
70 | +proca | |
71 | +pomoc | |
72 | +prezydent | |
73 | +społeczeństwo | |
74 | +wynik | |
75 | +dziecko | |
76 | +prawda | |
77 | +związek | |
78 | +gospodarka | |
79 | +część | |
80 | +wojna | |
81 | +tydzień | |
82 | +granica | |
83 | +głos | |
84 | +przyszłość | |
85 | +autor | |
86 | +wybory | |
87 | +rynek | |
88 | +cel | |
89 | +ustawa | |
90 | +uważać | |
91 | +ten rok | |
92 | +droga | |
93 | +dom | |
94 | +rys | |
95 | +myśleć | |
96 | +firma | |
97 | +zasada | |
98 | +fakt | |
99 | +kolej | |
100 | +nadzieja | |
101 | +dolar | |
102 | +wraz | |
103 | +miasto | |
104 | +rozwój | |
105 | +ten sposób | |
106 | +europa | |
107 | +temat | |
108 | +siła | |
109 | +rodzina | |
110 | +minister | |
111 | +historia | |
112 | +wpływ | |
113 | +współpraca | |
114 | +środek | |
115 | +informacja | |
116 | +procent | |
117 | +wniosek | |
118 | +unia europejski | |
119 | +niemcy | |
120 | +podstawa | |
121 | +reforma | |
122 | +partia | |
123 | +interes | |
124 | +ten sprawa | |
125 | +kandydat | |
126 | +sukces | |
127 | +sposób | |
128 | +wątpliwość | |
129 | +złoty | |
130 | +sld | |
131 | +pracownik | |
132 | +stanowisko | |
133 | +dyskusja | |
134 | +telewizja | |
135 | +pewność | |
136 | +odpowiedź | |
137 | +rzeczywistość | |
138 | +program | |
139 | +cena | |
140 | +działanie | |
141 | +system | |
142 | +unia | |
143 | +ręka | |
144 | +odpowiedzialność | |
145 | +środowisko | |
146 | +solidarność | |
147 | +demokracja | |
148 | +maić | |
149 | +ramy | |
150 | +badanie | |
151 | +media | |
152 | +wartość | |
153 | +wybór | |
154 | +głowa | |
155 | +zostać | |
156 | +usa | |
157 | +pracować | |
158 | +porozumienie | |
159 | +widzieć | |
160 | +zdanie | |
161 | +akcja | |
162 | +wolność | |
163 | +spotkanie | |
164 | +przeszłość | |
165 | +stosunek | |
166 | +okazja | |
167 | +prowadzić | |
168 | +zachód | |
169 | +kobieta | |
170 | +obywatel | |
171 | +sąd | |
172 | +ubiegły rok | |
173 | +dziennikarz | |
174 | +kultura | |
175 | +grupa | |
176 | +opinia publiczny | |
177 | +obrona | |
178 | +bezpieczeństwo | |
179 | +opinia | |
180 | +rzeczpospolita | |
181 | +dokument | |
182 | +racja | |
183 | +szkoła | |
184 | +góra | |
185 | +warunek | |
186 | +organizacja | |
187 | +oko | |
188 | +godzina | |
189 | +tysiąc | |
190 | +ten czas | |
191 | +możliwość | |
192 | +błąd | |
193 | +ziemia | |
194 | +parlament | |
195 | +ten pora | |
196 | +chwila | |
197 | +naród | |
198 | +konflikt | |
199 | +działalność | |
200 | +sejm | |
201 | +powrót | |
202 | +premier | |
203 | +działać | |
204 | +rada | |
205 | +zdrowie | |
206 | +wiek | |
207 | +dodatek | |
208 | +poziom | |
209 | +widzenie | |
210 | +żyć | |
211 | +powiedzieć | |
212 | +inwestycja | |
213 | +rosja | |
214 | +niemiec | |
215 | +samochód | |
216 | +skutek | |
217 | +punkt | |
218 | +rola | |
219 | +mieszkaniec | |
220 | +wyborca | |
221 | +koszt | |
222 | +budżet | |
223 | +szef | |
224 | +styczeń | |
225 | +instytucja | |
226 | +pełnia | |
227 | +ulica | |
228 | +aws | |
229 | +ochrona | |
230 | +dostęp | |
231 | +zagrożenie | |
232 | +zgoda | |
233 | +ue | |
234 | +" rzeczpospolita " | |
235 | +liczba | |
236 | +wieś | |
237 | +połowa | |
0 | 238 | \ No newline at end of file |
... | ... |
nicolas-train/pom.xml
0 → 100644
1 | +++ a/nicolas-train/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-train</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
nicolas-zero/pom.xml
0 → 100644
1 | +++ a/nicolas-zero/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-zero</artifactId> | |
13 | + | |
14 | +</project> | |
0 | 15 | \ No newline at end of file |
... | ... |
pom.xml
0 → 100644
1 | +++ a/pom.xml | |
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + | |
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
8 | + <artifactId>nicolas-container</artifactId> | |
9 | + <packaging>pom</packaging> | |
10 | + <version>1.0-SNAPSHOT</version> | |
11 | + | |
12 | + <modules> | |
13 | + <module>nicolas-core</module> | |
14 | + <module>nicolas-cli</module> | |
15 | + <module>nicolas-model</module> | |
16 | + <module>nicolas-train</module> | |
17 | + <module>nicolas-zero</module> | |
18 | + </modules> | |
19 | + | |
20 | + <properties> | |
21 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |
22 | + <java.version.build>1.8</java.version.build> | |
23 | + </properties> | |
24 | + | |
25 | + <prerequisites> | |
26 | + <maven>3.0.5</maven> | |
27 | + </prerequisites> | |
28 | + | |
29 | + <developers> | |
30 | + <developer> | |
31 | + <name>Mateusz Kopeć</name> | |
32 | + <organization>ICS PAS</organization> | |
33 | + <email>m.kopec@ipipan.waw.pl</email> | |
34 | + </developer> | |
35 | + </developers> | |
36 | + | |
37 | + <dependencies> | |
38 | + <dependency> | |
39 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
40 | + <artifactId>pscapi</artifactId> | |
41 | + <version>1.0-SNAPSHOT</version> | |
42 | + </dependency> | |
43 | + <dependency> | |
44 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
45 | + <artifactId>utils</artifactId> | |
46 | + <version>1.0-SNAPSHOT</version> | |
47 | + </dependency> | |
48 | + | |
49 | + <dependency> | |
50 | + <groupId>org.apache.commons</groupId> | |
51 | + <artifactId>commons-csv</artifactId> | |
52 | + <version>1.3</version> | |
53 | + </dependency> | |
54 | + <dependency> | |
55 | + <groupId>com.google.guava</groupId> | |
56 | + <artifactId>guava</artifactId> | |
57 | + <version>19.0</version> | |
58 | + </dependency> | |
59 | + <dependency> | |
60 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
61 | + <artifactId>weka-dev</artifactId> | |
62 | + <version>3.9.0</version> | |
63 | + </dependency> | |
64 | + <dependency> | |
65 | + <groupId>org.apache.commons</groupId> | |
66 | + <artifactId>commons-lang3</artifactId> | |
67 | + <version>3.4</version> | |
68 | + </dependency> | |
69 | + <dependency> | |
70 | + <groupId>commons-io</groupId> | |
71 | + <artifactId>commons-io</artifactId> | |
72 | + <version>2.5</version> | |
73 | + </dependency> | |
74 | + </dependencies> | |
75 | + | |
76 | + | |
77 | + <build> | |
78 | + <plugins> | |
79 | + <plugin> | |
80 | + <groupId>org.apache.maven.plugins</groupId> | |
81 | + <artifactId>maven-compiler-plugin</artifactId> | |
82 | + <version>3.1</version> | |
83 | + <configuration> | |
84 | + <source>${java.version.build}</source> | |
85 | + <target>${java.version.build}</target> | |
86 | + </configuration> | |
87 | + </plugin> | |
88 | + </plugins> | |
89 | + </build> | |
90 | + | |
91 | + <distributionManagement> | |
92 | + <repository> | |
93 | + <id>deployment</id> | |
94 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | |
95 | + </repository> | |
96 | + <snapshotRepository> | |
97 | + <id>deployment</id> | |
98 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | |
99 | + </snapshotRepository> | |
100 | + </distributionManagement> | |
101 | +</project> | |
0 | 102 | \ No newline at end of file |
... | ... |